add cviruntime

commit 3f4938648950a7f3bf9a19c320ca9fae7c52de20
Author: sophgo-forum-service <forum_service@sophgo.com>
Date:   Mon May 13 13:44:23 2024 +0800

    [feat] cviruntime opensource for cv18xx soc.

    - a4b6a3, add cumsum and gatherelements_pt.
This commit is contained in:
carbon
2024-05-31 11:51:34 +08:00
parent 88a2fed916
commit e25f20f7a3
904 changed files with 260029 additions and 0 deletions


@@ -18,3 +18,4 @@
| Lab-Project-FreeRTOS-POSIX | freertos/Source/FreeRTOS-Plus-POSIX | https://github.com/sophgo/Lab-Project-FreeRTOS-POSIX.git | sg200x-dev | 5042bfd |
| cvibuilder | cvibuilder | https://github.com/sophgo/cvibuilder.git | sg200x-dev | 4309f2a |
| cvikernel | cvikernel | https://github.com/sophgo/cvikernel.git | sg200x-dev | 9f1f57a |
| cviruntime | cviruntime | https://github.com/sophgo/cviruntime.git | sg200x-dev | 3f49386 |

cviruntime/.gitignore vendored Normal file

@@ -0,0 +1,3 @@
build/
build_sdk/
**/__pycache__

cviruntime/CMakeLists.txt Normal file

@@ -0,0 +1,196 @@
cmake_minimum_required(VERSION 3.1.0)
project(cviruntime C CXX)
execute_process(
COMMAND git describe --always --tags --dirty
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
RESULT_VARIABLE GIT_EXEC_RESULT
OUTPUT_VARIABLE GIT_SHORT_HASH)
string(STRIP ${GIT_SHORT_HASH} GIT_SHORT_HASH)
string(TIMESTAMP BUILD_TIME "%Y%m%d")
set(RUNTIME_VERSION "${GIT_SHORT_HASH}@${BUILD_TIME}")
message(STATUS "runtime version: ${RUNTIME_VERSION}")
add_definitions(-DRUNTIME_VERSION="${RUNTIME_VERSION}")
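# For illustration (an assumed example, not generated output): a checkout at commit
# 3f49386 built on 2024-05-13 would end up defining roughly
# RUNTIME_VERSION="3f49386@20240513".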
set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
set(CMAKE_INSTALL_RPATH "\${ORIGIN}/../lib;\${ORIGIN}/")
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON)
option(ENABLE_COMPRESS_CMDBUF "enable compressed cmdbuf" ON)
option(ENABLE_CPU_FUNC "enable cpu functions" ON)
option(ENABLE_PMU "enable tpu PMU" ON)
set(SAFETY_FLAGS "-Werror -Wall -Wextra -fno-strict-aliasing -Wno-missing-field-initializers")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAFETY_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAFETY_FLAGS}")
if(CMAKE_CROSSCOMPILING)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ftree-vectorize -Wno-unused-parameter")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ftree-vectorize -Wno-unused-parameter")
endif()
if (NOT DEFINED CHIP)
message(FATAL_ERROR "no CHIP specified")
endif()
if (NOT DEFINED RUNTIME)
message(FATAL_ERROR "no RUNTIME specified")
endif()
if(NOT DEFINED CVIKERNEL_PATH)
message(FATAL_ERROR "Please set CVIKERNEL_PATH to point to the cvikernel installation")
endif()
if (NOT DEFINED FLATBUFFERS_PATH)
message(FATAL_ERROR "Please set FLATBUF_PATH")
endif()
if(NOT DEFINED CVIBUILDER_PATH)
message(FATAL_ERROR "Please set CVIBUILDER_PATH to point to the CVIBUILDER installation")
endif()
if(RUNTIME STREQUAL CMODEL)
if(NOT DEFINED CMODEL_PATH)
message(FATAL_ERROR "Please set CMODEL_PATH to point to the cmodel source installation")
endif()
endif()
message(STATUS "CHIP: ${CHIP}")
message(STATUS "RUNTIME: ${RUNTIME}")
message(STATUS "CMODEL_PATH: ${CMODEL_PATH}")
message(STATUS "CVIKERNEL_PATH: ${CVIKERNEL_PATH}")
message(STATUS "FLATBUFFERS_PATH: ${FLATBUFFERS_PATH}")
message(STATUS "CVIBUILDER_PATH: ${CVIBUILDER_PATH}")
message(STATUS "CHIP: ${CHIP}")
if (CHIP STREQUAL cv183x)
add_definitions(-DCHIPID=0x1)
elseif (CHIP STREQUAL cv182x)
add_definitions(-DCHIPID=0x2)
elseif (CHIP STREQUAL cv181x)
add_definitions(-DCHIPID=0x3)
set(ENABLE_COMPRESS_CMDBUF OFF CACHE BOOL "" FORCE)
elseif (CHIP STREQUAL cv180x)
add_definitions(-DCHIPID=0x4)
set(ENABLE_COMPRESS_CMDBUF OFF CACHE BOOL "" FORCE)
set(ENABLE_CPU_FUNC OFF CACHE BOOL "" FORCE)
endif()
add_definitions(-DCHIP=${CHIP})
if (${ENABLE_COMPRESS_CMDBUF})
add_definitions(-DENABLE_COMPRESS_CMDBUF)
endif()
if (${ENABLE_CPU_FUNC})
add_definitions(-DENABLE_CPU_FUNC)
endif()
if (${ENABLE_PMU})
add_definitions(-DENABLE_PMU)
endif()
include_directories(
${PROJECT_SOURCE_DIR}/include
${PROJECT_SOURCE_DIR}/src/common
${CVIBUILDER_PATH}/include
${FLATBUFFERS_PATH}/include
${CVIKERNEL_PATH}/include
${CMAKE_CURRENT_BINARY_DIR})
if (${ENABLE_COMPRESS_CMDBUF})
include_directories(${PROJECT_SOURCE_DIR}/include/lz4)
endif()
link_directories(${CVIKERNEL_PATH}/lib)
set(CVI_LIBS ${CVI_LIBS} cvikernel)
if(RUNTIME STREQUAL CMODEL)
include_directories(${CMODEL_PATH}/include)
link_directories(${CMODEL_PATH}/lib)
set(CVI_LIBS ${CVI_LIBS} cvicmodel)
endif()
add_subdirectory(src)
add_subdirectory(tool)
if (ENABLE_PYRUNTIME STREQUAL "ON")
add_subdirectory(python)
endif()
if (ENABLE_TEST STREQUAL "ON")
add_subdirectory(test)
endif()
if (NOT CMAKE_CROSSCOMPILING)
if (ENABLE_TEST STREQUAL "ON")
enable_testing()
endif()
endif()
file(GLOB HEADERS
include/cviruntime.h
include/bmruntime.h
include/bmruntime_bmkernel.h
include/cviruntime_context.h
include/cviruntime_extra.h
include/cvitpu_debug.h)
install(FILES ${HEADERS} DESTINATION include)
file(GLOB RUNTIME_HEADERS
include/runtime/cpu_function.hpp
include/runtime/neuron.hpp
include/runtime/op_param.hpp)
install(FILES ${RUNTIME_HEADERS} DESTINATION include/runtime)
if(NOT CMAKE_CROSSCOMPILING)
# install the whole sample dir as source code
install(DIRECTORY samples DESTINATION .)
endif()
install(FILES scripts/envs_tpu_sdk.sh
PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
DESTINATION .)
if (CHIP STREQUAL cv183x)
install(FILES scripts/regression_new_models_cv183x.sh
PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
DESTINATION . RENAME regression_models.sh)
install(FILES scripts/regression_models_e2e_cv183x.sh
PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
DESTINATION . RENAME regression_models_e2e.sh)
install(FILES scripts/regression_samples_cv183x.sh
PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
DESTINATION . RENAME regression_samples.sh)
elseif (CHIP STREQUAL cv182x)
install(FILES scripts/regression_new_models_cv182x.sh
PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
DESTINATION . RENAME regression_models.sh)
install(FILES scripts/regression_models_e2e_cv182x.sh
PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
DESTINATION . RENAME regression_models_e2e.sh)
install(FILES scripts/regression_samples_cv182x.sh
PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
DESTINATION . RENAME regression_samples.sh)
elseif (CHIP STREQUAL cv181x)
install(FILES scripts/regression_new_models_cv181x.sh
PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
DESTINATION . RENAME regression_models.sh)
install(FILES scripts/regression_models_e2e_cv181x.sh
PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
DESTINATION . RENAME regression_models_e2e.sh)
install(FILES scripts/regression_samples_cv181x.sh
PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
DESTINATION . RENAME regression_samples.sh)
elseif (CHIP STREQUAL cv180x)
install(FILES scripts/regression_new_models_cv180x.sh
PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
DESTINATION . RENAME regression_models.sh)
install(FILES scripts/regression_models_e2e_cv180x.sh
PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
DESTINATION . RENAME regression_models_e2e.sh)
install(FILES scripts/regression_samples_cv180x.sh
PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
DESTINATION . RENAME regression_samples.sh)
endif()

cviruntime/README.md Normal file

@@ -0,0 +1,65 @@
# runtime
## overview
runtime is a library released as part of the SDK for developing TPU applications, along with a few tools for testing, benchmarking, profiling, etc.
tools
* test_cvimodel
## dependency
* cvibuilder (for cvimodel_generated.h)
* bmkernel (if running bmkernel directly)
* cmodel (if RUNTIME=CMODEL)
## build
assuming support, cvibuilder, bmkernel, and cmodel are all installed to ../install
```
$ cd runtime
$ mkdir build
$ cd build
$ cmake -G Ninja -DCHIP=BM1880v2 -DRUNTIME=CMODEL -DSUPPORT_PATH=../install -DCVIBUILDER_PATH=../install -DCVIKERNEL_PATH=../install -DCMODEL_PATH=../install -DCMAKE_INSTALL_PREFIX=../../install ..
# Build
$ cmake --build .
$ cmake --build . -- -v
# Install
$ cmake --build . --target install
$ cmake --build . --target install -- -v
# Test
$ cmake --build . --target test -- -v
# Uninstall
$ xargs rm < install_manifest.txt
```
## output
## test
```
$ cd runtime/build
# cp bmnet/tests/regression/build/bm1880v2/caffe/resnet50/BM1880v2_resnet50_1.bmodel
$ ./test/test_bmnet_bmodel \
/data/release/bmnet_models/resnet50/int8/resnet50_input_1_3_224_224.bin \
BM1880v2_resnet50_1.bmodel \
BM1880v2_resnet50_1_output.bin \
1 3 224 224
```
## TODO
* add SAFETY_FLAGS back
* for bm1880v2 only, need refactor for all chips
* add cpu layer back (commented out for now, search for SKIP_CPU_LAYER)

cviruntime/build_tpu_sdk.sh Executable file

@@ -0,0 +1,160 @@
#!/bin/bash
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
echo "TPU_SDK_BUILD_PATH=$TPU_SDK_BUILD_PATH"
echo "TPU_SDK_INSTALL_PATH=$TPU_SDK_INSTALL_PATH"
echo "TOP_DIR=$TOP_DIR"
TOOLCHAIN_FILE_PATH=$DIR/scripts/toolchain.cmake
echo "TOOLCHAIN_FILE_PATH=$TOOLCHAIN_FILE_PATH"
TOOLCHAIN_AARCH64=$DIR/scripts/toolchain-aarch64-linux.cmake
TOOLCHAIN_ARM=$DIR/scripts/toolchain-linux-gnueabihf.cmake
TOOLCHAIN_UCLIBC=$DIR/scripts/toolchain-linux-uclibc.cmake
TOOLCHAIN_RISCV64=$DIR/scripts/toolchain-riscv64-linux-x86_64.cmake
TOOLCHAIN_RISCV64_MUSL=$DIR/scripts/toolchain-riscv64-linux-musl-x86_64.cmake
if [ ! -e "$OSS_TARBALL_PATH" ]; then
echo "${OSS_TARBALL_PATH} not present, run build_3rd_party first"
exit 1
fi
mkdir -p "$TPU_SDK_BUILD_PATH"/build_sdk
mkdir -p "$TPU_SDK_INSTALL_PATH"
"$OSS_PATH"/run_build.sh -n zlib -e -t "$OSS_TARBALL_PATH" -i "$TPU_SDK_INSTALL_PATH"
"$OSS_PATH"/run_build.sh -n flatbuffers -e -t "$OSS_TARBALL_PATH" -i "$TPU_SDK_INSTALL_PATH"/flatbuffers
"$OSS_PATH"/run_build.sh -n opencv -e -t "$OSS_TARBALL_PATH" -i "$TPU_SDK_INSTALL_PATH"/opencv
#
# build
#
BUILD_TYPE="RELEASE"
if [ "$BUILD_TYPE" == "RELEASE" ]; then
BUILD_FLAG="-DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_C_FLAGS_RELEASE=-O3 -DCMAKE_CXX_FLAGS_RELEASE=-O3"
else
BUILD_FLAG="-DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS=-ggdb"
fi
BUILD_PATH=$TPU_SDK_BUILD_PATH
CHIP_ID="${CHIP_ARCH,,}"
echo "CHIP_ID=$CHIP_ID"
# build host flatbuffers
FLATBUFFERS_HOST_PATH=$BUILD_PATH/install_flatbuffers_host
mkdir -p $FLATBUFFERS_HOST_PATH
if [ ! -e $BUILD_PATH/build_flatbuffers_host ]; then
mkdir -p $BUILD_PATH/build_flatbuffers_host
fi
pushd $BUILD_PATH/build_flatbuffers_host
cmake -G Ninja -DCMAKE_INSTALL_PREFIX=$FLATBUFFERS_HOST_PATH \
$TOP_DIR/flatbuffers
cmake --build . --target install
test $? -ne 0 && echo "build flatbuffers failed !!" && popd && exit 1
popd
# build target flat buffer
# move to build_oss
# generate target-independent flatbuffer schema
CVIMODEL_HOST_PATH=$BUILD_PATH/install_cvimodel_host
if [ ! -e $BUILD_PATH/build_cvimodel ]; then
mkdir -p $BUILD_PATH/build_cvimodel
fi
pushd $BUILD_PATH/build_cvimodel
cmake -G Ninja -DFLATBUFFERS_PATH=$FLATBUFFERS_HOST_PATH \
-DCMAKE_INSTALL_PREFIX=$CVIMODEL_HOST_PATH \
$TOP_DIR/cvibuilder
cmake --build . --target install
test $? -ne 0 && echo "build cvibuilder failed !!" && popd && exit 1
popd
# build cvikernel
if [ ! -e $BUILD_PATH/build_cvikernel ]; then
mkdir -p $BUILD_PATH/build_cvikernel
fi
pushd $BUILD_PATH/build_cvikernel
cmake -G Ninja $BUILD_FLAG \
-DCHIP=$CHIP_ID \
-DCMAKE_TOOLCHAIN_FILE=$TOOLCHAIN_FILE_PATH \
-DCMAKE_INSTALL_PREFIX=$TPU_SDK_INSTALL_PATH \
$TOP_DIR/cvikernel
cmake --build . --target install -- -v
test $? -ne 0 && echo "build cvikernel failed !!" && popd && exit 1
popd
# build cnpy
if [ ! -e $BUILD_PATH/build_cnpy ]; then
mkdir -p $BUILD_PATH/build_cnpy
fi
pushd $BUILD_PATH/build_cnpy
cmake -G Ninja $BUILD_FLAG \
-DCMAKE_TOOLCHAIN_FILE=$TOOLCHAIN_FILE_PATH \
-DCMAKE_INSTALL_PREFIX=$TPU_SDK_INSTALL_PATH \
$TOP_DIR/cnpy
cmake --build . --target install
test $? -ne 0 && echo "build cnpy failed !!" && popd && exit 1
popd
# build runtime
if [ ! -e $BUILD_PATH/build_cviruntime ]; then
mkdir $BUILD_PATH/build_cviruntime
fi
pushd $BUILD_PATH/build_cviruntime
cmake -G Ninja -DCHIP=$CHIP_ID -DRUNTIME=SOC $BUILD_FLAG \
-DCMAKE_TOOLCHAIN_FILE=$TOOLCHAIN_FILE_PATH \
-DCVIKERNEL_PATH=$TPU_SDK_INSTALL_PATH \
-DCNPY_PATH=$TPU_SDK_INSTALL_PATH/lib \
-DFLATBUFFERS_PATH=$TPU_SDK_INSTALL_PATH/flatbuffers \
-DCVIBUILDER_PATH=$CVIMODEL_HOST_PATH \
-DCMAKE_INSTALL_PREFIX=$TPU_SDK_INSTALL_PATH \
-DENABLE_TEST=OFF \
$TOP_DIR/cviruntime
cmake --build . --target install -- -v
test $? -ne 0 && echo "build cviruntime failed !!" && popd && exit 1
popd
# build cvimath
if [ ! -e $BUILD_PATH/build_cvimath ]; then
mkdir $BUILD_PATH/build_cvimath
fi
pushd $BUILD_PATH/build_cvimath
cmake -G Ninja \
-DTOOLCHAIN_ROOT_DIR=$TOOLCHAIN_GCC_PATH \
-DCMAKE_TOOLCHAIN_FILE=$TOOLCHAIN_FILE_PATH \
-DTPU_SDK_ROOT=$TPU_SDK_INSTALL_PATH \
-DCMAKE_INSTALL_PREFIX=$TPU_SDK_INSTALL_PATH \
$TOP_DIR/cvimath
cmake --build . --target install -- -v
test $? -ne 0 && echo "build cvimath failed !!" && popd && exit 1
popd
if [ ! -e $BUILD_PATH/build_samples ]; then
mkdir $BUILD_PATH/build_samples
fi
pushd $BUILD_PATH/build_samples
cmake -G Ninja $BUILD_FLAG \
-DCMAKE_TOOLCHAIN_FILE=$TOOLCHAIN_FILE_PATH \
-DTPU_SDK_PATH=$TPU_SDK_INSTALL_PATH \
-DOPENCV_PATH=$TPU_SDK_INSTALL_PATH/opencv \
-DCMAKE_INSTALL_PREFIX=$TPU_SDK_INSTALL_PATH/samples \
$DIR/samples
cmake --build . --target install -- -v
test $? -ne 0 && echo "build samples failed !!" && popd && exit 1
popd
# Copy some files for release build
mkdir -p $TPU_SDK_INSTALL_PATH/cmake
cp $TOOLCHAIN_FILE_PATH $TPU_SDK_INSTALL_PATH/cmake
cp $TOOLCHAIN_AARCH64 $TPU_SDK_INSTALL_PATH/cmake
cp $TOOLCHAIN_ARM $TPU_SDK_INSTALL_PATH/cmake
cp $TOOLCHAIN_UCLIBC $TPU_SDK_INSTALL_PATH/cmake
cp $TOOLCHAIN_RISCV64 $TPU_SDK_INSTALL_PATH/cmake
cp $TOOLCHAIN_RISCV64_MUSL $TPU_SDK_INSTALL_PATH/cmake
# copy lib
mkdir -p "$SYSTEM_OUT_DIR"/lib/
cp -a "$TPU_SDK_INSTALL_PATH"/lib/*.so* "$SYSTEM_OUT_DIR"/lib/
cp -a "$TPU_SDK_INSTALL_PATH"/opencv/lib/*.so* "$SYSTEM_OUT_DIR"/lib/


@@ -0,0 +1,21 @@
cmake_minimum_required(VERSION 2.8.0)
project(custom_cpu_function CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON)
if(NOT DEFINED MLIR_INCLUDE)
message(FATAL_ERROR "Please set MLIR_INCLUDE to point to the include path of mlir")
endif()
include_directories(${PROJECT_SOURCE_DIR})
include_directories(${MLIR_INCLUDE})
add_library(CustomOpPlugin SHARED
LeakyReluOp.cpp
ROIAlignOp.cpp
SoftmaxOp.cpp
UnPoolingOp.cpp)
install(TARGETS CustomOpPlugin DESTINATION lib/custom_op/)


@@ -0,0 +1,461 @@
/*
* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
*/
#include "LeakyReluOp.h"
#include "QuantHelper.h"
#include <cvikernel/cvikernel.h>
#define NPU_SHIFT 5
#define EU_SHIFT 4
#define NPU_NUM (1 << NPU_SHIFT)
#define EU_NUM (1 << EU_SHIFT)
#define LOCAL_MEM_SIZE (1 << 15)
#define NEURON_MEMORY 0
#define WEIGHT_MEMORY 1
namespace cvi {
void LeakyReluOp::interpretFp32(
std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
std::vector<std::vector<int64_t>> &operand_shapes,
std::shared_ptr<std::vector<float>> &result_tensor,
std::vector<int64_t> &result_shape) {
int n = operand_shapes[0][0];
int c = operand_shapes[0][1];
int h = operand_shapes[0][2];
int w = operand_shapes[0][3];
auto input = operand_tensors[0]->data();
auto output = result_tensor->data();
auto negative_slope = param.get<float>("negative_slope");
for (int i = 0; i < (int)operand_tensors[0]->size(); ++i) {
if (input[i] >= 0) {
output[i] = input[i];
} else {
output[i] = negative_slope * input[i];
}
}
}
void LeakyReluOp::interpretInt8(
std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
std::vector<std::vector<int64_t>> &operand_shapes,
std::shared_ptr<std::vector<float>> &result_tensor,
std::vector<int64_t> &result_shape) {
int n = operand_shapes[0][0];
int c = operand_shapes[0][1];
int h = operand_shapes[0][2];
int w = operand_shapes[0][3];
auto input = operand_tensors[0]->data();
auto quant_pos_rshift =
param.has("rshift_pos") ? (float)param.get<int8_t>("rshift_pos") : 0.0f;
auto quant_pos_multiplier =
param.has("m_i8_pos") ? (float)param.get<int8_t>("m_i8_pos") : 0.0f;
auto quant_neg_rshift = (float)param.get<int8_t>("rshift_neg");
auto quant_neg_multiplier = (float)param.get<int8_t>("m_i8_neg");
auto output = result_tensor->data();
// rshift and saturate on output
for (int i = 0; i < (int)operand_tensors[0]->size(); ++i) {
if (input[i] > 0) {
if (quant_pos_multiplier != 0.0f) {
output[i] = (float)applyMultiplierAndRShiftAndSaturateInt8(
input[i], (uint32_t)quant_pos_rshift, quant_pos_multiplier, false);
} else {
output[i] = input[i];
}
} else {
output[i] = (float)applyMultiplierAndRShiftAndSaturateInt8(
input[i], (uint32_t)quant_neg_rshift, quant_neg_multiplier, false);
}
}
}
void LeakyReluOp::quantizeInt8() {
// support per-tensor only for now
setOpQuantPerchannel(false);
// use rshift and INT8 multiplier
setOpQuantParamType("RSHIFT_AND_M_I8");
float negative_slope = param.get<float>("negative_slope");
std::cout << " negative_slope: " << std::to_string(negative_slope) << "\n";
// create tensors for rshift and multiplier
float rshift_pos = 0;
float multiplier_pos = 0;
float rshift_neg = 0;
float multiplier_neg = 0;
// quantization
float threshold_x = getPrevOpThreshold();
float threshold_y = getOpThreshold();
std::cout << "threshold_y = " << std::to_string(threshold_y)
<< ", threshold_x = " << std::to_string(threshold_x) << "\n";
// positive
double qscale_pos = threshold_x / threshold_y;
if (fabs(threshold_x - threshold_y) < 1e-5 * std::min(threshold_x, threshold_y)) {
// no positive scale
rshift_pos = 0;
multiplier_pos = 0;
std::cout << " Positive: no_scale\n";
} else {
uint32_t uint_multiplier_pos;
rshift_pos =
(float)findRShiftAndMultiplierFromQScale(qscale_pos, &uint_multiplier_pos, false);
multiplier_pos = (float)uint_multiplier_pos;
std::cout << " Positive: ";
std::cout << " [multiplier : rshift] = [" << std::to_string(multiplier_pos) << " : "
<< std::to_string(rshift_pos) << "]\n";
}
// negative
float qscale_neg = fabs(qscale_pos * negative_slope);
uint32_t uint_multiplier_neg = 0;
rshift_neg =
(float)findRShiftAndMultiplierFromQScale(qscale_neg, &uint_multiplier_neg, false);
multiplier_neg = (float)uint_multiplier_neg;
std::cout << " Negative: ";
std::cout << " [multiplier : rshift] = [" << std::to_string(multiplier_neg) << " : "
<< std::to_string(rshift_neg) << "]\n";
bool do_pos_scale = (multiplier_pos != 0.0) ? true : false;
if (do_pos_scale) {
param.put<int8_t>("rshift_pos", static_cast<int8_t>(rshift_pos));
param.put<int8_t>("m_i8_pos", static_cast<int8_t>(multiplier_pos));
}
param.put<int8_t>("rshift_neg", static_cast<int8_t>(rshift_neg));
param.put<int8_t>("m_i8_neg", static_cast<int8_t>(multiplier_neg));
}
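// Illustrative example of the quantization above (threshold values assumed, not taken
// from a real model): with threshold_x = 4.0, threshold_y = 2.0 and negative_slope = 0.1,
// qscale_pos = 2.0 and qscale_neg = 0.2. findRShiftAndMultiplierFromQScale (non-qdm path,
// max_multiplier = 127) then yields rshift_pos = 5, m_i8_pos = 64 (64 / 2^5 = 2.0) and
// rshift_neg = 9, m_i8_neg = 102 (102 / 2^9 ~= 0.199).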
void LeakyReluOp::codeGenInt8(void *ctx,
std::vector<std::vector<int64_t>> &operand_shapes,
std::vector<uint64_t> &operand_gaddrs,
std::vector<int64_t> &result_shape, uint64_t result_gaddr,
int layer_id) {
auto pos_rshift = param.has("rshift_pos") ? param.get<int8_t>("rshift_pos") : 0;
auto pos_m_i8 = param.has("m_i8_pos") ? param.get<int8_t>("m_i8_pos") : 0;
auto neg_rshift = param.has("rshift_neg") ? param.get<int8_t>("rshift_neg") : 0;
auto neg_m_i8 = param.has("m_i8_neg") ? param.get<int8_t>("m_i8_neg") : 0;
assert(neg_m_i8);
int n = operand_shapes[0][0];
int c = operand_shapes[0][1];
int h = operand_shapes[0][2];
int w = operand_shapes[0][3];
uint64_t operand_gaddr = operand_gaddrs[0];
uint64_t ga_output = result_gaddr;
leakyrelu_codegen((cvk_context_t *)ctx, // ctx
layer_id, // layer_id
operand_gaddr, // input_gaddr
result_gaddr, // output_gaddr
n, // input_n
c, // input_c
h, // input_h
w, // input_w
pos_rshift, // GT_right_shift_width
neg_rshift, // LE_right_shift_width
pos_m_i8, // GT_scale
neg_m_i8 // LE_scale
);
}
void LeakyReluOp::tdma_load(cvk_context_t *ctx, cvk_tl_t *tlp, uint64_t ga_src) {
cvk_tg_t ts_data;
ts_data.base_reg_index = NEURON_MEMORY;
ts_data.fmt = tlp->fmt;
ts_data.start_address = ga_src;
ts_data.shape = {tlp->shape.n, tlp->shape.c, tlp->shape.h, tlp->shape.w};
ts_data.stride = ctx->ops->tg_default_stride(ctx, ts_data.shape, ts_data.fmt);
cvk_tdma_g2l_tensor_copy_param_t p1;
p1.src = &ts_data;
p1.dst = tlp;
ctx->ops->tdma_g2l_tensor_copy(ctx, &p1);
}
void LeakyReluOp::tdma_store(cvk_context_t *ctx, cvk_tl_t *tlp, uint64_t ga_dst) {
cvk_tg_t ts_data;
ts_data.base_reg_index = NEURON_MEMORY;
ts_data.fmt = tlp->fmt;
ts_data.start_address = ga_dst;
ts_data.shape = {tlp->shape.n, tlp->shape.c, tlp->shape.h, tlp->shape.w};
ts_data.stride = ctx->ops->tg_default_stride(ctx, ts_data.shape, ts_data.fmt);
cvk_tdma_l2g_tensor_copy_param_t p1;
p1.src = tlp;
p1.dst = &ts_data;
ctx->ops->tdma_l2g_tensor_copy(ctx, &p1);
}
void LeakyReluOp::leakyrelu_kernel(cvk_context_t *ctx, int layer_id, cvk_tl_t &bottom,
cvk_tl_t &relu, cvk_tl_t &neg,
int GT_right_shift_width, int LE_right_shift_width,
int GT_scale, int LE_scale) {
bool isIgnorePosPart = (GT_scale == 0);
bool isSlopeSmallerThanOne = ((LE_scale >> LE_right_shift_width) == 0);
if (isIgnorePosPart) {
cvk_tiu_mul_param_t p4;
p4.res_high = nullptr;
p4.res_low = &relu;
p4.a = &bottom;
p4.b_const.val = LE_scale;
p4.b_const.is_signed = true;
p4.b_is_const = 1;
p4.rshift_bits = LE_right_shift_width;
p4.layer_id = layer_id;
p4.relu_enable = 0;
ctx->ops->tiu_mul(ctx, &p4);
if (isSlopeSmallerThanOne) {
cvk_tiu_max_param_t p1;
p1.max = &bottom;
p1.a = &bottom;
p1.b = &relu;
p1.b_is_const = 0;
p1.layer_id = layer_id;
ctx->ops->tiu_max(ctx, &p1);
} else {
cvk_tiu_min_param_t p1;
p1.min = &bottom;
p1.a = &bottom;
p1.b = &relu;
p1.b_is_const = 0;
p1.layer_id = layer_id;
ctx->ops->tiu_min(ctx, &p1);
}
} else {
// 0. relu = relu(bottom)
cvk_tiu_max_param_t p13;
p13.max = &relu;
p13.a = &bottom;
p13.b_is_const = 1;
p13.b_const.is_signed = 1;
p13.b_const.val = 0;
p13.layer_id = layer_id;
ctx->ops->tiu_max(ctx, &p13);
// 1. relu = (relu * GT_scale) >> GT_right_shift_width
cvk_tiu_mul_param_t p;
p.res_high = nullptr;
p.res_low = &relu;
p.a = &relu;
p.b_const.val = GT_scale;
p.b_const.is_signed = true;
p.b_is_const = 1;
p.rshift_bits = GT_right_shift_width;
p.layer_id = layer_id;
p.relu_enable = 0;
ctx->ops->tiu_mul(ctx, &p);
// 2. neg = min(0, bottom)
cvk_tiu_min_param_t p7;
p7.min = &neg;
p7.a = &bottom;
p7.b_is_const = 1;
p7.b_const.val = 0;
p7.b_const.is_signed = 1;
p7.layer_id = layer_id;
ctx->ops->tiu_min(ctx, &p7);
// 3. neg (n,c,h,w) = (neg(n,c,h,w) * slope) >> LE_right_shift_width
cvk_tiu_mul_param_t p8;
p8.res_high = nullptr;
p8.res_low = &neg;
p8.a = &neg;
p8.b_const.val = LE_scale;
p8.b_const.is_signed = true;
p8.b_is_const = 1;
p8.rshift_bits = LE_right_shift_width;
p8.layer_id = layer_id;
p8.relu_enable = 0;
ctx->ops->tiu_mul(ctx, &p8);
// 4. bottom = or relu, neg
cvk_tiu_or_int8_param_t p9;
p9.res = &bottom;
p9.a = &relu;
p9.b = &neg;
p9.layer_id = layer_id;
ctx->ops->tiu_or_int8(ctx, &p9);
}
}
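// Illustrative walk-through of the else branch above (numbers assumed, continuing the
// example after quantizeInt8): for an input value of -100 with GT_scale = 64,
// GT_right_shift_width = 5, LE_scale = 102, LE_right_shift_width = 9:
// relu = max(-100, 0) = 0, and (0 * 64) >> 5 stays 0;
// neg = min(-100, 0) = -100, and (-100 * 102) >> 9 is roughly -20 (the exact value
// depends on the TIU rounding behavior); or-ing 0 with -20 then gives -20, i.e. about
// -100 * 0.2, where 0.2 = negative_slope * threshold_x / threshold_y is approximated
// by 102 / 2^9.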
void LeakyReluOp::leakyrelu_codegen(cvk_context_t *ctx, uint32_t layer_id,
uint64_t input_gaddr, uint64_t output_gaddr,
int input_n, int input_c, int input_h, int input_w,
int GT_right_shift_width, int LE_right_shift_width,
int GT_scale, int LE_scale) {
printf("leakyrelu_codegen:\n"
" layer_id %d\n"
" input_gddr: %lx, output_gaddr: %lx\n"
" input (%d, %d, %d, %d)\n"
" GT_scale:%d, LE_scale:%d\n"
" GT_right_shift_width:%d, LE_right_shift_width:%d\n",
layer_id, input_gaddr, output_gaddr, input_n, input_c, input_h, input_w,
GT_scale, LE_scale, GT_right_shift_width, LE_right_shift_width);
// Split input based on local memory
uint32_t total_eu = NPU_NUM * EU_NUM;
uint32_t lane_size = LOCAL_MEM_SIZE;
uint32_t total_mem_size = NPU_NUM * LOCAL_MEM_SIZE;
uint32_t max_N = (1 << 12) - 1; // 1880v2: 12 bit
uint32_t max_W = (1 << 12) - 1; // 1880v2: 12 bit
uint32_t count = input_n * input_c * input_h * input_w;
uint32_t tiled_N = count / total_eu / 3; // 3 blobs
tiled_N = (tiled_N > max_N) ? max_N : tiled_N;
// local tensor shape(tiled_N, npu_num, 1, eu_num)
cvk_tl_shape_t tl_shape = {tiled_N, static_cast<uint32_t>(NPU_NUM), 1,
static_cast<uint32_t>(EU_NUM)};
cvk_tl_stride_t tl_stride = ctx->ops->tl_default_stride(ctx, tl_shape, CVK_FMT_I8, 1);
// Find max tiled_N
uint32_t required_size = 0;
do {
tl_shape.n = tiled_N;
tl_stride = ctx->ops->tl_default_stride(ctx, tl_shape, CVK_FMT_I8, 1);
required_size = 3 * tl_shape.n * tl_stride.n; // 3 blobs
if (required_size <= lane_size) {
break;
}
} while (--tiled_N);
printf(" tiled_bottom shape (%d, %d, %d, %d), stride (%d, %d, %d, %d)\n"
" required_size %d kB/lane\n",
tl_shape.n, tl_shape.c, tl_shape.h, tl_shape.w, tl_stride.n, tl_stride.c,
tl_stride.h, tl_stride.w, required_size / 1024);
assert(tiled_N);
if (!tiled_N) {
return;
}
// Tiled local memory layout:
// tiled bottom/result
// tiled relu
// tiled neg
// Tiled bottom
required_size /= 3; // for 3 blobs
cvk_tl_t tl_tiled_bottom;
tl_tiled_bottom.start_address = 0;
tl_tiled_bottom.fmt = CVK_FMT_I8;
tl_tiled_bottom.shape = tl_shape;
tl_tiled_bottom.stride = tl_stride;
// Tiled relu
cvk_tl_t tl_tiled_relu = tl_tiled_bottom;
tl_tiled_relu.start_address = tl_tiled_bottom.start_address + required_size;
// Tiled neg
cvk_tl_t tl_tiled_neg = tl_tiled_bottom;
tl_tiled_neg.start_address = tl_tiled_relu.start_address + required_size;
// In unit of tiled_N * npu_num * eu_num
uint32_t global_input_offset = 0;
for (uint32_t i = 0; i < (count / total_eu / tiled_N); i++) {
// Load as a chunk of contiguous memory in global memory, not use global
// shape/stride Local memory use tensor shape to maximize eu utilization.
tdma_load(ctx, &tl_tiled_bottom, input_gaddr + global_input_offset);
leakyrelu_kernel(ctx, layer_id, tl_tiled_bottom, tl_tiled_relu, tl_tiled_neg,
GT_right_shift_width, LE_right_shift_width, GT_scale, LE_scale);
// Store bottom as a chunk of contiguous memory, not use global shape/stride
tdma_store(ctx, &tl_tiled_bottom, output_gaddr + global_input_offset);
// Next input offset
global_input_offset += tiled_N * total_eu;
} // for (uint32_t i = 0; i < (count/total_eu/tiled_N); i++)
// Remaining count, in unit of npu_num * eu_num
if (global_input_offset < count) {
uint32_t tiled_W = (count - global_input_offset) / NPU_NUM;
tiled_N = 1;
do {
tl_shape.n = tiled_N;
tl_shape.w = tiled_W;
tl_stride = ctx->ops->tl_default_stride(ctx, tl_shape, CVK_FMT_I8, 1);
required_size = 3 * tl_shape.n * tl_stride.n; // 3 blobs
if (required_size <= lane_size && (tiled_W <= max_W)) {
break;
} else {
tiled_W /= 2;
tiled_N *= 2;
}
} while (true); // Magic number for 2^12 -1 - 32
if ((count - global_input_offset) % NPU_NUM != 0) {
std::cout << "Remaining size should align npu_num, or die";
assert(0);
}
// Update shape, stride
tl_shape.n = tiled_N;
tl_shape.w = tiled_W;
tl_stride = ctx->ops->tl_default_stride(ctx, tl_shape, CVK_FMT_I8, 1);
required_size = tl_shape.n * tl_stride.n;
printf(" tiled_bottom shape (%d, %d, %d, %d), stride (%d, %d, %d, %d)\n"
" required_size %d kB/lane\n",
tl_shape.n, tl_shape.c, tl_shape.h, tl_shape.w, tl_stride.n, tl_stride.c,
tl_stride.h, tl_stride.w, required_size / 1024);
// Tiled bottom
tl_tiled_bottom.shape = tl_shape;
tl_tiled_bottom.stride = tl_stride;
// Tiled bottom with precise stride
cvk_tl_t tl_tiled_bottom_precise_stride = tl_tiled_bottom;
tl_tiled_bottom_precise_stride.stride = {
static_cast<uint32_t>(tl_shape.h * tl_shape.w * sizeof(uint8_t)),
static_cast<uint32_t>(tl_shape.h * tl_shape.w * sizeof(uint8_t)),
static_cast<uint32_t>(tl_shape.w * sizeof(uint8_t)), sizeof(uint8_t)};
printf(" tiled_bottom_precise shape (%d, %d, %d, %d), stride (%d, %d, %d, %d)\n"
" required_size %d kB/lane\n",
tl_shape.n, tl_shape.c, tl_shape.h, tl_shape.w,
tl_tiled_bottom_precise_stride.stride.n,
tl_tiled_bottom_precise_stride.stride.c,
tl_tiled_bottom_precise_stride.stride.h,
tl_tiled_bottom_precise_stride.stride.w, required_size / 1024);
// Tiled relu
tl_tiled_relu = tl_tiled_bottom;
tl_tiled_relu.start_address = tl_tiled_bottom.start_address + required_size;
// Tiled neg
tl_tiled_neg = tl_tiled_bottom;
tl_tiled_neg.start_address = tl_tiled_relu.start_address + required_size;
// Load as a chunk of contiguous memory in global memory, not use global
// shape/stride Local memory use tensor shape to maximize eu utilization.
tdma_load(ctx, &tl_tiled_bottom, input_gaddr + global_input_offset);
leakyrelu_kernel(ctx, layer_id, tl_tiled_bottom, tl_tiled_relu, tl_tiled_neg,
GT_right_shift_width, LE_right_shift_width, GT_scale, LE_scale);
// Store bottom as a chunk of contiguous memory, not use global shape/stride
tdma_store(ctx, &tl_tiled_bottom, output_gaddr + global_input_offset);
global_input_offset += tl_tiled_bottom_precise_stride.shape.n *
tl_tiled_bottom_precise_stride.stride.n * NPU_NUM;
}
// Remaining count, in unit of eu_num
if (global_input_offset != count) {
printf("global_input_offset != count (%d != %d)/n", global_input_offset, count);
assert(0);
}
}
RegisterCustomOp(leaky_relu, LeakyReluOp);
} // namespace cvi


@@ -0,0 +1,44 @@
/*
* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
*/
#ifndef LEAKY_RELU_OP_H_
#define LEAKY_RELU_OP_H_
#include "tpuc/CustomOp.h"
#include <cvikernel/cvikernel.h>
namespace cvi {
class LeakyReluOp : public CustomOp {
public:
LeakyReluOp(OpParam &param) : CustomOp(param) {}
void interpretFp32(std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
std::vector<std::vector<int64_t>> &operand_shapes,
std::shared_ptr<std::vector<float>> &result_tensor,
std::vector<int64_t> &result_shape);
void interpretInt8(std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
std::vector<std::vector<int64_t>> &operand_shapes,
std::shared_ptr<std::vector<float>> &result_tensor,
std::vector<int64_t> &result_shape);
void quantizeInt8();
void codeGenInt8(void *ctx,
std::vector<std::vector<int64_t>> &operand_shapes,
std::vector<uint64_t> &operand_gaddrs,
std::vector<int64_t> &result_shape, uint64_t result_gaddr,
int layer_id);
private:
void tdma_load(cvk_context_t *ctx, cvk_tl_t *tlp, uint64_t ga_src);
void tdma_store(cvk_context_t *ctx, cvk_tl_t *tlp, uint64_t ga_dst);
void leakyrelu_kernel(cvk_context_t *ctx, int layer_id, cvk_tl_t &bottom,
cvk_tl_t &relu, cvk_tl_t &neg, int GT_right_shift_width,
int LE_right_shift_width, int GT_scale, int LE_scale);
void leakyrelu_codegen(cvk_context_t *ctx, uint32_t layer_id, uint64_t input_gaddr,
uint64_t output_gaddr, int input_n, int input_c, int input_h,
int input_w, int GT_right_shift_width, int LE_right_shift_width,
int GT_scale, int LE_scale);
};
} // namespace cvi
#endif


@@ -0,0 +1,191 @@
/*
* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
*/
#ifndef CVI_QUANT_HELPER_H
#define CVI_QUANT_HELPER_H
#include <assert.h>
#include <stdint.h>
#include <cmath>
#include <limits>
#include <iostream>
static int RoundingDivideByPOT(int x, int exponent) {
if (x == 0) {
return 0;
}
if (exponent == 0) {
return x;
}
assert(exponent > 0);
const int shift_vec = -exponent;
const int fixup = (x & shift_vec) >> 31;
const int fixed_up_x = x + fixup;
int nudge = 1 << (exponent - 1);
int val = (fixed_up_x + nudge) >> exponent;
return val;
}
static int SaturatingRoundingDoublingHighMul(int a, int b) {
int64_t a_64(a);
int64_t b_64(b);
int64_t ab_64 = a_64 * b_64;
int nudge = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30));
int ab_x2_high32 = static_cast<int>((ab_64 + nudge) / (1ll << 31));
return ab_x2_high32;
}
/// saturate a float to range [-128, 127]
static int8_t saturateInt8(float f) {
#if 0
// cast
int q = (int)f;
#elif 0
// away_from_zero
int q = (f >= 0) ? (int)std::ceil(f) : (int)std::floor(f);
#elif 0
// round
int q = (int)std::roundf(f);
#elif 0
// truncate (towards zero)
int q = (f >= 0) ? (int)std::floor(f) : (int)std::ceil(f);
#elif 1
// from caffe_int8
int q = (int)std::floor(f + 0.5);
#else
// looks HW is different than std::round()
// we shall apply round only for input quant()
int q = (int)std::round(f);
#endif
if (q > 127)
q = 127;
if (q < -128)
q = -128;
return (int8_t)q;
}
/// Simulate HW behavior, after accumulation
/// apply multiplier, do rshift, and then saturate to INT8
/// used in BM1880v2 per-channel mode (32bit bias)
/// qdm mode
/// use GOOGLE GEMMLOWP QDM multiply and shift
/// during multiply, a factor of (1 << 31) has been divided
static int8_t applyMultiplierAndRShiftAndSaturateInt8(float v, uint32_t rshift,
uint32_t multiplier, bool qdm) {
if (qdm) {
int32_t q = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul((int32_t)v, (int32_t)multiplier), rshift);
// llvm::errs() << "v,rshift,multiplier,q = " << v << ","
// << rshift << "," << multiplier << "," << q << "\n";
return saturateInt8((float)q);
} else {
return saturateInt8(v * multiplier / (1 << rshift));
}
}
// reference to [arxiv 1712.05877]
// This implementation comes from tensorflow
// https://github.com/tensorflow/tensorflow/blob/98ff991500a0247f8f57c60db9a206204268bc42/tensorflow/lite/kernels/internal/quantization_util.cc#L52-L90
#define Tensorflow_QuantizeMultiplier QuantizeMultiplier
static void QuantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier,
int *shift) {
if (double_multiplier == 0.) {
*quantized_multiplier = 0;
*shift = 0;
return;
}
const double q = std::frexp(double_multiplier, shift);
auto q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31)));
assert(q_fixed <= (1ll << 31));
if (q_fixed == (1ll << 31)) {
q_fixed /= 2;
++*shift;
}
assert(q_fixed <= std::numeric_limits<int32_t>::max());
// A shift amount smaller than -31 would cause all bits to be shifted out
// and thus all results would be zero. We implement that instead with
// q_fixed==0, so as to avoid hitting issues with right-shift
// operations with shift amounts greater than 31. Note that this happens
// roughly when abs(double_multiplier) < 2^-31 and the present handling means
// that we're effectively flushing tiny double_multiplier's to zero.
// We could conceivably handle values in the range (roughly) [32, 63]
// as 'denormals' i.e. (shift==0, q_fixed < 2^30). In that point of view
// the present handling is just doing 'flush denormals to zero'. We could
// reconsider and actually generate nonzero denormals if a need arises.
if (*shift < -31) {
*shift = 0;
q_fixed = 0;
}
*quantized_multiplier = static_cast<int32_t>(q_fixed);
}
/// find RShift and Multiplier from QScale
/// QScale = Multiplier / (1 << RShift)
/// Multiplier is an integer
/// case 1: specifically multiply an int8/uint8 multiplier, then rshift
/// used in layers like element_wise, pooling, concat, etc
/// qdm is false
/// a max_multiplier (127 or 255 normally) has to be provided
/// case 2: qdm mode
/// used in BM1880v2 per-channel conv mode
/// qdm is true
/// reference to [arxiv 1712.05877]
/// choose the int32 value nearest to 2^31 * M0, M0 in [0.5, 1]
/// this value is always at least 2^30 and have at least 30 bits accuracy
/// the max_multiplier argument is ignored, fixed to (1 << 31)
/// if 'uint32_t *multiplier' is present, return multiplier alongside
static int8_t findRShiftAndMultiplierFromQScale(double qscale,
uint32_t *multiplier = nullptr,
bool qdm = false,
uint32_t max_multiplier = 127) {
if (qdm) {
#if 0
max_multiplier = (1 << 31);
for (uint32_t rshift = 0; rshift < 63; ++rshift) {
if ( ((double)qscale * (1ULL << (rshift + 1))) >= (double)max_multiplier ) {
if (multiplier) {
*multiplier = (uint32_t)((double)qscale * (1ULL << rshift));
}
return rshift - 31;
}
}
#endif
// this ensures if qscale is 0, both multiplier and shift will be 0
int32_t quantized_multiplier = 0;
int lshift = 0;
Tensorflow_QuantizeMultiplier(qscale, &quantized_multiplier, &lshift);
if (multiplier)
*multiplier = quantized_multiplier;
int rshift = -lshift;
assert(rshift >= 0);
if (rshift > 25) {
std::cout << "WARNING: large rshift = " << rshift << ", qscale = " << qscale
<< "\n";
}
return (int8_t)rshift;
} else {
assert(qscale < max_multiplier);
for (int8_t rshift = 0; rshift < 63; ++rshift) {
if (((double)qscale * (1ULL << (rshift + 1))) >= (double)max_multiplier) {
if (multiplier) {
*multiplier = (uint32_t)((double)qscale * (1ULL << rshift));
}
return rshift;
}
}
// assert(false);
std::cout << "WARNING: failed to find rshift, qscale = " << std::to_string(qscale)
<< "\n";
// we are here because qscale is too small, return 0 for both shift and multiplier
if (multiplier) {
*multiplier = 0;
}
return 0;
}
}
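// Minimal usage sketch (values assumed for illustration): a qscale of 0.37 in the
// non-qdm path first satisfies 0.37 * 2^(rshift+1) >= 127 at rshift = 8, so
//   uint32_t m; int8_t r = findRShiftAndMultiplierFromQScale(0.37, &m);  // r = 8, m = 94
// approximates 0.37 as 94 / 2^8 ~= 0.367, and
//   applyMultiplierAndRShiftAndSaturateInt8(50.0f, r, m, false)
// then returns saturateInt8(50 * 94 / 256) = 18.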
#endif


@@ -0,0 +1,42 @@
CustomOp:
Attributes:
`param` : required, a PoolParam struct attribute, carrying
filter size, stride, padding, and do_relu.
`quant` : required, a QuantParam struct attribute.
`name` : required, name for calibration, comparison, or debugging.
`do_quant` : required, whether to quantize to int8/bf16 or not.
`threshold_overwrite` : required, whether to overwrite the threshold backward/forward or not.
`layer_id` : optional, id for profiling.
FrontEnd:
def convert_leaky_relu_op(self, onnx_node):
assert(onnx_node.op_type == "LeakyRelu")
alpha = onnx_node.attrs.get("alpha", 0.01)
custom_op_param = {
'tpu': True,
'do_quant': True,
'operation_name': 'leaky_relu',
'threshold_overwrite': 'backward',
'param': {
'negative_slope': float(alpha)
}
}
op, input_shape, tensor_type = self.getOperand(onnx_node.inputs[0])
operands = list()
operands.append(op)
output_shape = input_shape
custom_op = self.CVI.add_custom_op("{}_{}".format(onnx_node.name, onnx_node.op_type), operands, output_shape, **custom_op_param)
self.addOperand(onnx_node.name, custom_op, output_shape, TensorType.ACTIVATION)
Calibration:
gen_data_list.py /work/dataset/coco/val2017/ 1000 cali_list.txt
python /work/cvitek_mlir/python/run_calibration.py \
--model_name yolo_v3 yolo_v3_416_onnx_opt.mlir cali_list.txt \
--input_num=100 --custom_op_plugin libCustomOpPlugin.so
Quantization & Optimization:
add "--custom-op-plugin libCustomOpPlugin.so"
Codegen:
add "--custom-op-plugin libCustomOpPlugin.so" &
"--custom-runtime-lib libCustomOpRuntime_arm64.so,libCustomOpRuntime_x86.so"


@@ -0,0 +1,122 @@
/*
* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cmath>
#include <numeric>
#include <algorithm>
#include "ROIAlignOp.h"
namespace cvi {
void ROIAlignOp::interpretFp32(
std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
std::vector<std::vector<int64_t>> &operand_shapes,
std::shared_ptr<std::vector<float>> &result_tensor,
std::vector<int64_t> &result_shape) {
const int32_t pooled_h = param.get<int32_t>("pooled_h");
const int32_t pooled_w = param.get<int32_t>("pooled_w");
const float spatial_scale = param.get<float>("spatial_scale");
auto data_shape = operand_shapes[0];
auto roi_shape = operand_shapes[1];
const int batch = (int)data_shape[0];
const int channel = (int)data_shape[1];
const int height = (int)data_shape[2];
const int width = (int)data_shape[3];
const int rois_num = roi_shape[2];
assert(batch * rois_num == result_shape[0]);
assert(channel == result_shape[1]);
float* data = operand_tensors[0]->data();
float* rois = operand_tensors[1]->data();
float* result = result_tensor->data();
const int one_batch_output_size = rois_num * channel * pooled_h * pooled_w;
for (int b = 0; b < batch; ++b) {
float* batch_rois = rois + b * rois_num * 5;
float* batch_output = result + b * one_batch_output_size;
for (int roi_idx = 0; roi_idx < rois_num; ++roi_idx) {
const int roi_batch_idx = batch_rois[roi_idx * 5];
assert(roi_batch_idx == b);
const float roi_start_x = batch_rois[roi_idx * 5 + 1] * spatial_scale;
const float roi_start_y = batch_rois[roi_idx * 5 + 2] * spatial_scale;
const float roi_end_x = batch_rois[roi_idx * 5 + 3] * spatial_scale;
const float roi_end_y = batch_rois[roi_idx * 5 + 4] * spatial_scale;
const float roi_w = std::max(roi_end_x - roi_start_x + 1, 1.0f);
const float roi_h = std::max(roi_end_y - roi_start_y + 1, 1.0f);
float bin_size_w = roi_w / (float)pooled_w;
float bin_size_h = roi_h / (float)pooled_h;
float* batch_data = data + b * channel * height * width;
for (int c = 0; c < channel; ++c) {
for (int ph = 0; ph < pooled_h; ++ph) {
for (int pw = 0; pw < pooled_w; ++pw) {
const float region_start_x = std::min(pw * bin_size_w + roi_start_x, (float)(width));
const float region_start_y = std::min(ph * bin_size_h + roi_start_y, (float)(height));
const float region_end_x = std::min((pw+1) * bin_size_w + roi_start_x, (float)(width));
const float region_end_y = std::min((ph+1) * bin_size_h + roi_start_y, (float)(height));
const int region_grid_w = int(std::ceil(bin_size_w));
const int region_grid_h = int(std::ceil(bin_size_h));
const int output_idx = ph * pooled_w + pw;
if (region_start_x >= region_end_x || region_start_y >= region_end_y) {
batch_output[output_idx] = 0;
continue;
}
float value = 0;
float fmax = std::numeric_limits<float>::lowest();
for (int gh = 0; gh < region_grid_h; ++gh) {
for (int gw = 0; gw < region_grid_w; ++gw) {
float x = roi_start_x + gw;
float y = roi_start_y + gh;
const int x_low = x;
const int y_low = y;
const int x_high = x_low + 1;
const int y_high = y_low + 1;
const float x_ratio = x - x_low;
const float y_ratio = y - y_low;
const float w1 = (1 - y_ratio) * (1 - x_ratio);
const float w2 = (1 - y_ratio) * x_ratio;
const float w3 = y_ratio * (1 - x_ratio);
const float w4 = y_ratio * x_ratio;
const float data1 = batch_data[y_low * height + x_low];
const float data2 = batch_data[y_low * height + x_high];
const float data3 = batch_data[y_high * height + x_low];
const float data4 = batch_data[y_high * height + x_high];
value = w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
if (value > fmax) {
fmax = value;
}
}
}
batch_output[output_idx] = fmax;
}
}
batch_data += height * width;
batch_output += pooled_h * pooled_w;
}
}
}
}
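// Worked example of the bilinear weights used above (coordinates assumed): a sample
// point at x = 2.3, y = 4.7 gives x_low = 2, y_low = 4, x_ratio = 0.3, y_ratio = 0.7,
// so w1 = 0.3 * 0.7 = 0.21, w2 = 0.3 * 0.3 = 0.09, w3 = 0.7 * 0.7 = 0.49 and
// w4 = 0.7 * 0.3 = 0.21; the weights sum to 1, so each sample is a convex combination
// of its four neighbors, and the bin output keeps the maximum over the sampled grid.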
RegisterCustomOp(roialign, ROIAlignOp);
} // namespace cvi


@@ -0,0 +1,22 @@
/*
* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
*/
#ifndef ROI_ALIGN_OP_H_
#define ROI_ALIGN_OP_H_
#include "tpuc/CustomOp.h"
namespace cvi {
class ROIAlignOp : public CustomOp {
public:
ROIAlignOp(OpParam &param) : CustomOp(param) {}
void interpretFp32(std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
std::vector<std::vector<int64_t>> &operand_shapes,
std::shared_ptr<std::vector<float>> &result_tensor,
std::vector<int64_t> &result_shape);
};
} // namespace cvi
#endif


@@ -0,0 +1,74 @@
/*
* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cmath>
#include "SoftmaxOp.h"
namespace cvi {
void SoftmaxOp::interpretFp32(
std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
std::vector<std::vector<int64_t>> &operand_shapes,
std::shared_ptr<std::vector<float>> &result_tensor,
std::vector<int64_t> &result_shape) {
(void)result_shape;
auto axis = param.get<int32_t>("axis");
auto& shape = operand_shapes[0];
axis = axis % shape.size();
int32_t n = 1, inner_dim = 1;
for(int i = 0; i < axis; ++i) {
n *= shape[i];
}
for(size_t i = axis + 1; i < shape.size(); ++i) {
inner_dim *= shape[i];
}
int32_t c = shape[axis];
int32_t dim = c * inner_dim;
float *max = new float[inner_dim];
float *sum = new float[inner_dim];
float *p = operand_tensors[0]->data();
float *q = result_tensor->data();
for (int i = 0; i < n; ++i) {
memcpy(max, p, inner_dim * sizeof(float));
memset(sum, 0, inner_dim * sizeof(float));
// find max value across channel
int c_offset = i * dim;
for (int j = 0; j < c; ++j, c_offset += inner_dim) {
for (int k = 0; k < inner_dim; k++) {
if (max[k] < p[c_offset + k])
max[k] = p[c_offset + k];
}
}
// calculate exp(x)
c_offset = i * dim;
for (int j = 0; j < c; ++j, c_offset += inner_dim) {
for (int k = 0; k < inner_dim; k++) {
q[c_offset + k] = std::exp(p[c_offset + k] - max[k]);
sum[k] += q[c_offset + k];
}
}
c_offset = i * dim;
for (int j = 0; j < c; ++j, c_offset += inner_dim) {
for (int k = 0; k < inner_dim; k++) {
q[c_offset + k] /= sum[k];
}
}
}
delete[] max;
delete[] sum;
}
RegisterCustomOp(mysoftmax, SoftmaxOp);
} // namespace cvi


@@ -0,0 +1,22 @@
/*
* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
*/
#ifndef SOFTMAX_OP_H_
#define SOFTMAX_OP_H_
#include "tpuc/CustomOp.h"
namespace cvi {
class SoftmaxOp : public CustomOp {
public:
SoftmaxOp(OpParam &param) : CustomOp(param) {}
void interpretFp32(std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
std::vector<std::vector<int64_t>> &operand_shapes,
std::shared_ptr<std::vector<float>> &result_tensor,
std::vector<int64_t> &result_shape);
};
} // namespace cvi
#endif


@@ -0,0 +1,307 @@
/*
* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
*/
#include "UnPoolingOp.h"
#include "QuantHelper.h"
#include <cvikernel/cvikernel.h>
#define NPU_SHIFT 5
#define EU_SHIFT 4
#define NPU_NUM (1 << NPU_SHIFT)
#define EU_NUM (1 << EU_SHIFT)
#define LOCAL_MEM_SIZE (1 << 15)
#define NEURON_MEMORY 0
#define WEIGHT_MEMORY 1
namespace cvi {
void UnPoolingOp::interpretFp32(
std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
std::vector<std::vector<int64_t>> &operand_shapes,
std::shared_ptr<std::vector<float>> &result_tensor,
std::vector<int64_t> &result_shape) {
unpooling(operand_tensors, operand_shapes, result_tensor, result_shape);
}
void UnPoolingOp::interpretInt8(
std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
std::vector<std::vector<int64_t>> &operand_shapes,
std::shared_ptr<std::vector<float>> &result_tensor,
std::vector<int64_t> &result_shape) {
unpooling(operand_tensors, operand_shapes, result_tensor, result_shape);
}
void UnPoolingOp::quantizeInt8() {
// support per-tensor only for now
setOpQuantPerchannel(false);
// use rshift and INT8 multiplier
setOpQuantParamType("RSHIFT_AND_M_I8");
// quantization
float threshold_x = getPrevOpThreshold();
float threshold_y = getOpThreshold();
std::cout << "threshold_y = " << std::to_string(threshold_y)
<< ", threshold_x = " << std::to_string(threshold_x) << "\n";
}
void UnPoolingOp::codeGenInt8(void *ctx,
std::vector<std::vector<int64_t>> &operand_shapes,
std::vector<uint64_t> &operand_gaddrs,
std::vector<int64_t> &result_shape,
uint64_t result_gaddr, int layer_id) {
int n = operand_shapes[0][0];
int c = operand_shapes[0][1];
int h = operand_shapes[0][2];
int w = operand_shapes[0][3];
uint64_t data_gaddr = operand_gaddrs[0];
uint64_t mask_gaddr = operand_gaddrs[1];
uint64_t ga_output = result_gaddr;
int scale = param.get<int>("scale");
int unpool_h = param.get<int>("unpool_h");
int unpool_w = param.get<int>("unpool_w");
unpooling_codegen((cvk_context_t *)ctx, // ctx
layer_id, // layer_id
data_gaddr, // data_gaddr
mask_gaddr, // mask_gaddr
ga_output, // output_gaddr
n, c, h, w, // input shape
scale, unpool_h, unpool_w);
}
void UnPoolingOp::alloc_lmem(cvk_context_t *ctx, uint32_t tiling_c, uint32_t tiling_h,
uint32_t input_c, uint32_t input_h, uint32_t input_w,
uint32_t output_c, uint32_t output_h, uint32_t output_w,
cvk_fmt_t fmt, int eu_align, cvk_tl_t &tl_ifmap, cvk_tl_t &tl_working,
cvk_tl_t &tl_mask, cvk_tl_t &tl_ofmap) {
uint32_t tl_offset = 0;
ctx->ops->lmem_init_tensor(ctx, &tl_ifmap, {1, tiling_c, tiling_h, input_w}, fmt,
eu_align);
tl_ifmap.start_address = tl_offset;
tl_offset += ctx->ops->lmem_tensor_to_size(ctx, tl_ifmap.shape, tl_ifmap.fmt,
tl_ifmap.eu_align);
ctx->ops->lmem_init_tensor(ctx, &tl_working, {1, tiling_c, tiling_h, output_w}, fmt,
eu_align);
tl_working.start_address = tl_offset;
tl_offset += ctx->ops->lmem_tensor_to_size(ctx, tl_working.shape, tl_working.fmt,
tl_working.eu_align);
uint32_t tiling_oh = tiling_h * (output_h / input_h);
ctx->ops->lmem_init_tensor(ctx, &tl_mask, {1, tiling_c, tiling_oh, output_w}, fmt,
eu_align);
tl_mask.start_address = tl_offset;
tl_offset += ctx->ops->lmem_tensor_to_size(ctx, tl_mask.shape, tl_mask.fmt,
tl_mask.eu_align);
ctx->ops->lmem_init_tensor(ctx, &tl_ofmap, {1, tiling_c, tiling_oh, output_w}, fmt,
eu_align);
tl_ofmap.start_address = tl_offset;
}
void UnPoolingOp::tdma_load(cvk_context_t *ctx, cvk_tl_t *tlp, uint64_t ga_src,
cvk_tg_stride_t stride, int32_t n_pos, int32_t c_pos, int32_t h_pos) {
cvk_tg_t ts_data;
ts_data.base_reg_index = NEURON_MEMORY;
ts_data.fmt = tlp->fmt;
ts_data.start_address = ga_src + stride.n * n_pos + stride.c * c_pos + stride.h * h_pos;
ts_data.shape = {tlp->shape.n, tlp->shape.c, tlp->shape.h, tlp->shape.w};
ts_data.stride = stride;
cvk_tdma_g2l_tensor_copy_param_t p1;
p1.src = &ts_data;
p1.dst = tlp;
ctx->ops->tdma_g2l_tensor_copy(ctx, &p1);
}
void UnPoolingOp::unpooling_compute(
cvk_context_t *ctx, uint32_t layer_id, int scale_h, int scale_w,
cvk_tl_t *tl_ifmap, cvk_tl_t *tl_working, cvk_tl_t *tl_mask, cvk_tl_t *tl_ofmap) {
cvk_tl_stride_t tl_ifmap_fake_stride = {0, tl_ifmap->stride.c, tl_ifmap->stride.h, tl_ifmap->stride.w};
cvk_tl_t tl_ifmap_fake = {0};
tl_ifmap_fake.start_address = tl_ifmap->start_address;
tl_ifmap_fake.fmt = tl_ifmap->fmt;
tl_ifmap_fake.shape = {scale_w, tl_ifmap->shape.c, tl_ifmap->shape.h, tl_ifmap->shape.w};
tl_ifmap_fake.stride = tl_ifmap_fake_stride;
tl_ifmap_fake.eu_align = tl_ifmap->eu_align;
cvk_tl_stride_t tl_working_fake_stride = {
tl_working->stride.w, tl_working->stride.c,
tl_working->stride.h, tl_working->stride.w * scale_w};
cvk_tl_t tl_working_fake = {0};
tl_working_fake.start_address = tl_working->start_address;
tl_working_fake.fmt = tl_working->fmt;
tl_working_fake.shape = {scale_w, tl_ifmap->shape.c, tl_ifmap->shape.h, tl_ifmap->shape.w};
tl_working_fake.stride = tl_working_fake_stride;
tl_working_fake.eu_align = tl_working->eu_align;
cvk_tiu_copy_param_t param = {0};
param.dst = &tl_working_fake;
param.src = &tl_ifmap_fake;
param.layer_id = layer_id;
ctx->ops->tiu_copy(ctx, &param);
cvk_tl_stride_t tl_working_fake2_stride = {0, tl_working->stride.c, tl_working->stride.h, tl_working->stride.w};
cvk_tl_t tl_working_fake2 = {0};
tl_working_fake2.start_address = tl_working->start_address;
tl_working_fake2.fmt = tl_working->fmt;
tl_working_fake2.shape = {scale_h, tl_ofmap->shape.c, tl_ifmap->shape.h, tl_ofmap->shape.w};
tl_working_fake2.stride = tl_working_fake2_stride;
tl_working_fake2.eu_align = tl_working->eu_align;
cvk_tl_stride_t tl_ofmap_fake_stride = {tl_ofmap->stride.h, tl_ofmap->stride.c, tl_ofmap->stride.h * scale_h, tl_ofmap->stride.w};
cvk_tl_t tl_ofmap_fake = {0};
tl_ofmap_fake.start_address = tl_ofmap->start_address;
tl_ofmap_fake.fmt = tl_ofmap->fmt;
tl_ofmap_fake.shape = {scale_h, tl_ofmap->shape.c, tl_ifmap->shape.h, tl_ofmap->shape.w};
tl_ofmap_fake.stride = tl_ofmap_fake_stride;
tl_ofmap_fake.eu_align = tl_ofmap->eu_align;
cvk_tiu_copy_param_t param2 = {0};
param2.dst = &tl_ofmap_fake;
param2.src = &tl_working_fake2;
param2.layer_id = layer_id;
ctx->ops->tiu_copy(ctx, &param2);
cvk_tiu_mul_param_t param3 = {0};
param3.res_high = nullptr;
param3.res_low = tl_ofmap;
param3.a = tl_ofmap;
param3.b_is_const = 0;
param3.b = tl_mask;
param3.layer_id = layer_id;
param3.rshift_bits = 0;
param3.relu_enable = 0;
ctx->ops->tiu_mul(ctx, &param3);
}
void UnPoolingOp::tdma_store(cvk_context_t *ctx, cvk_tl_t *tlp,
uint64_t ga_dst, cvk_tg_stride_t stride,
uint32_t n_pos, uint32_t c_pos, uint32_t h_pos,
uint32_t crop_h, uint32_t crop_w) {
cvk_tl_t tl_src;
tl_src.start_address = tlp->start_address;
tl_src.fmt = tlp->fmt;
tl_src.shape = {tlp->shape.n, tlp->shape.c, tlp->shape.h - crop_h, tlp->shape.w - crop_w};
tl_src.stride = tlp->stride;
cvk_tg_t tg_dst;
tg_dst.base_reg_index = NEURON_MEMORY;
tg_dst.fmt = tlp->fmt;
tg_dst.start_address = ga_dst + stride.n * n_pos + stride.c * c_pos + stride.h * h_pos;
tg_dst.shape = {tlp->shape.n, tlp->shape.c, tlp->shape.h - crop_h, tlp->shape.w - crop_w};
tg_dst.stride = stride;
cvk_tdma_l2g_tensor_copy_param_t p1;
p1.src = &tl_src;
p1.dst = &tg_dst;
ctx->ops->tdma_l2g_tensor_copy(ctx, &p1);
}
void UnPoolingOp::unpooling_codegen(cvk_context_t *ctx, uint32_t layer_id,
uint64_t data_gaddr, uint64_t mask_gaddr, uint64_t output_gaddr,
int input_n, int input_c, int input_h, int input_w,
int scale, int unpool_h, int unpool_w) {
printf("unpooling_codegen:\n"
" layer_id %d\n"
" data_gddr: %lx, mask_gaddr: %lx, output_gaddr: %lx\n"
" input (%d, %d, %d, %d)\n"
" scale:%d, unpool_h:%d, unpool_w:%d\n",
layer_id, data_gaddr, mask_gaddr, output_gaddr, input_n, input_c, input_h,
input_w, scale, unpool_h, unpool_w);
// Split input based on local memory
uint32_t total_eu = NPU_NUM * EU_NUM;
uint32_t lane_size = LOCAL_MEM_SIZE;
uint32_t total_mem_size = NPU_NUM * LOCAL_MEM_SIZE;
uint32_t max_N = (1 << 12) - 1; // 1880v2: 12 bit
uint32_t max_W = (1 << 12) - 1; // 1880v2: 12 bit
uint32_t count = input_n * input_c * input_h * input_w;
uint32_t output_c = input_c;
uint32_t output_h = input_h * scale;
uint32_t output_w = input_w * scale;
uint32_t n_step = 1;
uint32_t c_step = 0;
uint32_t h_step = 0;
h_step = input_h;
uint32_t h_factor = scale;
for (; h_step > 0; --h_step) {
uint32_t total_size;
for (c_step = input_c; c_step >= (uint32_t)NPU_NUM ; --c_step) {
cvk_tl_shape_t tiled_ifmap_shape = {1, c_step, h_step, input_w};
uint32_t tiled_ifmap_size =
ctx->ops->lmem_tensor_to_size(ctx, tiled_ifmap_shape, CVK_FMT_I8, 0);
cvk_tl_shape_t tiled_working_shape = {1, c_step, h_step, output_w};
uint32_t tiled_working_size =
ctx->ops->lmem_tensor_to_size(ctx, tiled_working_shape, CVK_FMT_I8, 0);
cvk_tl_shape_t tiled_ofmap_shape = {1, c_step, h_step * h_factor, output_w};
uint32_t tiled_ofmap_size =
ctx->ops->lmem_tensor_to_size(ctx, tiled_ofmap_shape, CVK_FMT_I8, 0);
total_size = tiled_ifmap_size + tiled_working_size + tiled_ofmap_size * 2;
if (total_size <= static_cast<uint32_t>(LOCAL_MEM_SIZE))
break;
}
if (total_size <= static_cast<uint32_t>(LOCAL_MEM_SIZE))
break;
}
printf("tiling: c_step %d, h_step %d\n", c_step, h_step);
assert(c_step && h_step && "Expect valid tiling");
cvk_tg_stride_t ifmap_stride = {
input_c * input_h * input_w,
input_h * input_w,
input_w};
cvk_tg_stride_t mask_stride = {
output_c * output_h * output_w,
output_h * output_w,
output_w};
cvk_tg_stride_t output_stride = {
output_c * unpool_h * unpool_w,
unpool_h * unpool_w,
unpool_w};
uint64_t output_offset = 0;
uint32_t crop_h = 0;
uint32_t crop_w = 0;
for (uint32_t n_pos = 0; n_pos < input_n; n_pos += n_step) {
for (uint32_t c_pos = 0; c_pos < input_c; c_pos += c_step) {
uint32_t tiling_c = std::min(input_c - c_pos, c_step);
for (uint32_t h_pos = 0; h_pos < input_h; h_pos += h_step) {
uint32_t tiling_h = std::min(input_h - h_pos, h_step);
cvk_tl_t tl_ifmap, tl_ofmap, tl_mask, tl_working;
alloc_lmem(ctx, tiling_c, tiling_h, input_c, input_h, input_w, output_c,
output_h, output_w, CVK_FMT_I8, 0, tl_ifmap, tl_working,
tl_mask, tl_ofmap);
tdma_load(ctx, &tl_ifmap, data_gaddr, ifmap_stride, n_pos, c_pos, h_pos);
tdma_load(ctx, &tl_mask, mask_gaddr, mask_stride, n_pos, c_pos, h_pos * scale);
unpooling_compute(ctx, layer_id, scale, scale, &tl_ifmap, &tl_working, &tl_mask, &tl_ofmap);
uint32_t oh_pos = h_pos * scale;
crop_w = output_w - unpool_w;
if (oh_pos + tiling_h * scale > unpool_h) {
crop_h = oh_pos + tiling_h * scale - unpool_h;
} else {
crop_h = 0;
}
tdma_store(ctx, &tl_ofmap, output_gaddr, output_stride, n_pos, c_pos, h_pos * scale, crop_h, crop_w);
}
}
}
}
RegisterCustomOp(unpooling, UnPoolingOp);
} // namespace cvi


@@ -0,0 +1,104 @@
/*
* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
*/
#ifndef UNPOOLING_OP_H_
#define UNPOOLING_OP_H_
#include "tpuc/CustomOp.h"
#include <cvikernel/cvikernel.h>
namespace cvi {
class UnPoolingOp : public CustomOp {
public:
UnPoolingOp(OpParam &param) : CustomOp(param) {}
void interpretFp32(std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
std::vector<std::vector<int64_t>> &operand_shapes,
std::shared_ptr<std::vector<float>> &result_tensor,
std::vector<int64_t> &result_shape);
void interpretInt8(std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
std::vector<std::vector<int64_t>> &operand_shapes,
std::shared_ptr<std::vector<float>> &result_tensor,
std::vector<int64_t> &result_shape);
void quantizeInt8();
void codeGenInt8(void *ctx,
std::vector<std::vector<int64_t>> &operand_shapes,
std::vector<uint64_t> &operand_gaddrs,
std::vector<int64_t> &result_shape, uint64_t result_gaddr,
int layer_id);
private:
void alloc_lmem(cvk_context_t *ctx, uint32_t tiling_c, uint32_t tiling_h,
uint32_t input_c, uint32_t input_h, uint32_t input_w,
uint32_t output_c, uint32_t output_h, uint32_t output_w,
cvk_fmt_t fmt, int eu_align, cvk_tl_t &tl_ifmap, cvk_tl_t &tl_working,
cvk_tl_t &tl_mask, cvk_tl_t &tl_ofmap);
void tdma_load(cvk_context_t *ctx, cvk_tl_t *tlp, uint64_t ga_src, cvk_tg_stride_t stride,
int n_pos, int c_pos, int h_pos);
void unpooling_compute(cvk_context_t *ctx, uint32_t layer_id, int scale_h, int scale_w,
cvk_tl_t *tl_ifmap, cvk_tl_t *tl_working, cvk_tl_t *tl_mask, cvk_tl_t *tl_ofmap);
void tdma_store(cvk_context_t *ctx, cvk_tl_t *tlp, uint64_t ga_dst, cvk_tg_stride_t stride,
uint32_t n_pos, uint32_t c_pos, uint32_t h_pos, uint32_t crop_h, uint32_t crop_w);
void unpooling_codegen(cvk_context_t *ctx, uint32_t layer_id,
uint64_t data_gaddr, uint64_t mask_gaddr, uint64_t output_gaddr,
int input_n, int input_c, int input_h, int input_w,
int scale, int unpool_h, int unpool_w);
void unpooling(std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
std::vector<std::vector<int64_t>> &operand_shapes,
std::shared_ptr<std::vector<float>> &result_tensor,
std::vector<int64_t> &result_shape) {
int in = operand_shapes[0][0];
int ic = operand_shapes[0][1];
int ih = operand_shapes[0][2];
int iw = operand_shapes[0][3];
int oh = result_shape[2];
int ow = result_shape[3];
float *data = operand_tensors[0]->data();
float *mask = operand_tensors[1]->data();
float *output = result_tensor->data();
auto scale = param.get<int>("scale");
auto unpool_h = param.get<int>("unpool_h");
auto unpool_w = param.get<int>("unpool_w");
assert(oh == unpool_h);
assert(ow == unpool_w);
int sh = ih * scale;
int sw = iw * scale;
// always use float to store int8 value
std::vector<float> tmp_out(in * ic * sh * sw);
for (int n = 0; n < in; n++) {
for (int c = 0; c < ic; c++) {
for (int h = 0; h < sh; h++) {
for (int w = 0; w < sw; w++) {
int isw = w / scale;
int ish = h / scale;
int out_idx = ((n * ic + c) * sh + h) * sw + w;
int in_idx = ((n * ic + c) * ih + ish) * iw + isw;
tmp_out[out_idx] = data[in_idx] * mask[out_idx];
}
}
}
}
for (int n = 0; n < in; n++) {
for (int c = 0; c < ic; c++) {
for (int h = 0; h < oh; h++) {
for (int w = 0; w < ow; w++) {
int out_idx = ((n * ic + c) * oh + h) * ow + w;
int in_idx = ((n * ic + c) * sh + h) * sw + w;
output[out_idx] = tmp_out[in_idx];
}
}
}
}
}
};
} // namespace cvi
#endif

View File

@ -0,0 +1,36 @@
#!/bin/bash
set -e
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
INSTALL_PATH=$DIR/install
if [[ ! -e $INSTALL_PATH ]]; then
mkdir -p "$INSTALL_PATH"
fi
if [ -z "$ARM_TOOLCHAIN_GCC_PATH" ]; then
ARM_TOOLCHAIN_GCC_PATH=$TPU_BASE/host-tools/gcc/gcc-linaro-6.3.1-2017.05-x86_64_aarch64-linux-gnu
fi
export PATH=$ARM_TOOLCHAIN_GCC_PATH/bin:$PATH
export TOOLCHAIN_FILE_PATH=$DIR/cmake/toolchain-aarch64-linux.cmake
export MLIR_INCLUDE=$TPU_BASE/cvitek_mlir/include
export CVIRUNTIME_INCLUDE=$MLIR_INCLUDE
export AARCH64_SYSROOT_PATH=$TPU_BASE/cvitek_sysroot
if [[ ! -e $DIR/build ]]; then
mkdir $DIR/build
fi
pushd $DIR/build
rm -rf *
cmake -DMLIR_INCLUDE=$MLIR_INCLUDE \
-DCMAKE_INSTALL_PREFIX=$INSTALL_PATH ..
make install
rm -rf *
cmake -DCVIRUNTIME_INCLUDE=$CVIRUNTIME_INCLUDE \
-DCMAKE_INSTALL_PREFIX=$INSTALL_PATH ../runtime
make install
rm -rf *
cmake -DCMAKE_SYSROOT=$AARCH64_SYSROOT_PATH \
-DCMAKE_TOOLCHAIN_FILE=$TOOLCHAIN_FILE_PATH \
-DCVIRUNTIME_INCLUDE=$CVIRUNTIME_INCLUDE \
-DCMAKE_INSTALL_PREFIX=$INSTALL_PATH ../runtime
make install
popd

View File

@ -0,0 +1,54 @@
include(CMakeForceCompiler)
# The Generic system name is used for embedded targets (targets without OS) in
# CMake
set( CMAKE_SYSTEM_NAME Linux )
set( CMAKE_SYSTEM_PROCESSOR aarch64 )
# The toolchain prefix for all toolchain executables
set( ARCH arm64 )
# specify the cross compiler. We force the compiler so that CMake doesn't
# attempt to build a simple test program as this will fail without us using
# the -nostartfiles option on the command line
if(DEFINED ENV{CROSS_COMPILE_64})
set(CROSS_COMPILE $ENV{CROSS_COMPILE_64})
else()
set(CROSS_COMPILE aarch64-linux-gnu-)
endif()
set(CMAKE_C_COMPILER ${CROSS_COMPILE}gcc)
set(CMAKE_CXX_COMPILER ${CROSS_COMPILE}g++)
message(STATUS "CMAKE_C_COMPILER: ${CMAKE_C_COMPILER}")
message(STATUS "CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER}")
# To build the tests, we need to set where the target environment containing
# the required library is. On Debian-like systems, this is
# /usr/aarch64-linux-gnu.
SET(CMAKE_FIND_ROOT_PATH ${AARCH64_SYSROOT_PATH})
# search for programs in the build host directories
SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
# for libraries and headers in the target directories
SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
# We must set the OBJCOPY setting into cache so that it's available to the
# whole project. Otherwise, this does not get set into the CACHE and therefore
# the build doesn't know what the OBJCOPY filepath is
set(CMAKE_OBJCOPY ${CROSS_COMPILE}objcopy
CACHE FILEPATH "The toolchain objcopy command " FORCE )
# Set the CMAKE C flags (which should also be used by the assembler)
set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Os -std=gnu11" )
set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a" )
set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffunction-sections" )
set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fdata-sections" )
set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-pointer-to-int-cast" )
set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-missing-field-initializers" )
set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" )
set( CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" )
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-field-initializers" )
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses" )

View File

@ -0,0 +1,97 @@
#!/usr/bin/python3
"""
Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
"""
import onnx
from cvi_toolkit.transform.BaseConverter import TensorType
from cvi_toolkit.transform.onnx_converter import OnnxConverter
from cvi_toolkit.transform.tflite_converter_int8 import TFLiteConverter
from cvi_toolkit.transform.tensorflow_converter import TFConverter
from cvi_toolkit.utils.log_setting import setup_logger
from cvi_toolkit.data.preprocess import add_preprocess_parser, preprocess
logger = setup_logger('root', log_level="INFO")
class MyOnnxConverter(OnnxConverter):
def __init__(self, model_name, onnx_model, mlir_file_path, batch_size=1, preprocessor=None):
super().__init__(model_name, onnx_model, mlir_file_path, batch_size, preprocessor.to_dict())
def convert_graph(self):
"""convert all to mlir"""
# add input op
for idx, input in enumerate(self.input_nodes):
input_shape = list()
for i, dim in enumerate(input.type.tensor_type.shape.dim):
# batch size
# dim is zero, mean mutli batch
if i == 0 and dim.dim_value <= 0:
input_shape.append(self.batch_size)
else:
input_shape.append(dim.dim_value)
if not self.preprocess_args:
input_op = self.CVI.add_input_op(input.name, idx, **{})
else:
preprocess_hint = {
'mean': self.preprocess_args['perchannel_mean'],
'scale': self.preprocess_args['perchannel_scale'],
'pixel_format': self.preprocess_args["pixel_format"],
'channel_order': self.preprocess_args["channel_order"],
'aligned': self.preprocess_args["aligned"],
'resize_dims': self.preprocess_args['resize_dims'],
'keep_aspect_ratio': self.preprocess_args['keep_aspect_ratio']
}
# add input op with preprocess hints
input_op = self.CVI.add_input_op(input.name, idx, **preprocess_hint)
self.addOperand(input.name, input_op, input_shape, TensorType.ACTIVATION)
def NoneAndRaise(node):
raise RuntimeError("{} Op is not supported yet".format(node.op_type))
# add node op
for n in self.converted_nodes:
self.onnxop_factory.get(n.op_type, lambda x: NoneAndRaise(x))(n)
self.add_softmax_op()
# add return op
return_op = list()
# Set output
op, _, _ = self.getOperand("prob")
return_op.append(op)
self.CVI.add_return_op(return_op)
mlir_txt = self.CVI.print_module()
with open(self.mlir_file_path, "w") as f:
f.write(mlir_txt)
def add_softmax_op(self):
softmax_op_param = {
'tpu': False,
'do_quant': False,
'operation_name': 'mysoftmax',
'threshold_overwrite': 'none',
'param': {
'axis': 1
}
}
op, input_shape, tensor_type = self.getOperand('output')
operands = list()
operands.append(op)
output_shape = input_shape
custom_op = self.CVI.add_custom_op("prob_softmax", operands, output_shape, **softmax_op_param)
self.addOperand("prob", custom_op, output_shape, TensorType.ACTIVATION)
if __name__ == "__main__":
onnx_model = onnx.load('model/resnet18.onnx')
preprocessor = preprocess()
preprocessor.config(net_input_dims="224,224",
resize_dims="256,256", crop_method='center', keep_aspect_ratio=False,
raw_scale=1.0, mean='0.406,0.456,0.485', std='0.225,0.224,0.229', input_scale=1.0,
channel_order='bgr', pixel_format=None, data_format='nchw',
aligned=False, gray=False)
c = MyOnnxConverter('resnet18', 'model/resnet18.onnx',
'resnet18.mlir', batch_size=1, preprocessor=preprocessor)
c.run()

View File

@ -0,0 +1,11 @@
module {
func @tpu_func(%arg0 : tensor<1x512x38x50xf32>, %arg1 : tensor<1x1x300x5xf32>) -> tensor<300x512x7x7xf32> {
%0 = "tpu.weight_file"() {filename = "roialign_1_06eeeb7e.npz"} : () -> memref<10xf32>
%1 = "tpu.input"(%arg0) {name = "data0", quant = {is_asymmetric = false, is_perchannel = false, mode = "NONE", param_type = "NONE", threshold_max = 0.000000e+00 : f32, threshold_min = 0.000000e+00 : f32, zero_point = 0 : i32}} : (tensor<1x512x38x50xf32>) -> tensor<1x512x38x50xf32>
%2 = "tpu.input"(%arg1) {name = "data1", quant = {is_asymmetric = false, is_perchannel = false, mode = "NONE", param_type = "NONE", threshold_max = 0.000000e+00 : f32, threshold_min = 0.000000e+00 : f32, zero_point = 0 : i32}} : (tensor<1x1x300x5xf32>) -> tensor<1x1x300x5xf32>
%3 = "tpu.custom_op"(%1, %2) {name = "roi_align", operation_name = "roialign", param = {pooled_h = 7 : i32, pooled_w = 7 : i32, spatial_scale = 6.250000e-02 : f32}, quant = {is_asymmetric = false, is_perchannel = false, mode = "NONE", param_type = "NONE", threshold_max = 0.000000e+00 : f32, threshold_min = 0.000000e+00 : f32, zero_point = 0 : i32}} : (tensor<1x512x38x50xf32>, tensor<1x1x300x5xf32>) -> tensor<300x512x7x7xf32>
return %3 : tensor<300x512x7x7xf32>
}
}

View File

@ -0,0 +1,20 @@
cmake_minimum_required(VERSION 2.8.0)
project(custom_cpu_function CXX)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON)
include_directories(${PROJECT_SOURCE_DIR})
include_directories(${CVIRUNTIME_INCLUDE})
add_library(CustomOpRuntime SHARED
SoftmaxOpRuntime.cpp
ROIAlignOpRuntime.cpp
OpRuntimeRegister.cpp)
if(NOT CMAKE_CROSSCOMPILING)
set_target_properties(CustomOpRuntime PROPERTIES SUFFIX "_x86.so")
else()
set_target_properties(CustomOpRuntime PROPERTIES SUFFIX "_arm64.so")
endif()
install(TARGETS CustomOpRuntime DESTINATION lib/custom_op/)

View File

@ -0,0 +1,12 @@
/*
* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
*/
#include <runtime/cpu_function.hpp>
#include "SoftmaxOpRuntime.hpp"
#include "ROIAlignOpRuntime.hpp"
REGISTER_OP_RUNTIME_FUNCS(
{(char *)"mysoftmax", SoftmaxOpRuntime::open},
{(char *)"roialign", ROIAlignOpRuntime::open}
// add more custom op runtime func here.
);

View File

@ -0,0 +1,124 @@
#include <iostream>
#include <vector>
#include <string>
#include <algorithm>
#include <cmath>
#include <numeric>
#include <limits>
#include <cassert>
#include <runtime/neuron.hpp>
#include "ROIAlignOpRuntime.hpp"
ROIAlignOpRuntime::~ROIAlignOpRuntime() {}
void ROIAlignOpRuntime::setup(std::vector<std::shared_ptr<cvi::runtime::Neuron>> &inputs,
std::vector<std::shared_ptr<cvi::runtime::Neuron>> &outputs,
cvi::OpParam &param) {
pooled_h = param.get<int32_t>("pooled_h");
pooled_w = param.get<int32_t>("pooled_w");
spatial_scale = param.get<float>("spatial_scale");
auto on = outputs[0]->shape[0];
auto oc = outputs[0]->shape[1];
if (inputs[0]->shape[1] == oc && inputs[1]->shape[2] == on) {
_bottoms = inputs;
} else {
std::swap(inputs[0], inputs[1]);
_bottoms = inputs;
}
_tops = outputs;
}
void ROIAlignOpRuntime::run() {
auto top_data = _tops[0]->cpu_data<float>();
size_t bottom_count = _bottoms.size();
assert(bottom_count == 2);
float *data = (float *)_bottoms[0]->cpu_data<float>();
float *rois = (float *)_bottoms[1]->cpu_data<float>();
int num_rois = _bottoms[1]->shape[2];
int batch = _bottoms[0]->shape[0];
int channel = _bottoms[0]->shape[1];
int height = _bottoms[0]->shape[2];
int width = _bottoms[0]->shape[3];
for (int b = 0; b < batch; ++b) {
auto batch_rois = rois + _bottoms[1]->offset(b);
auto batch_output = top_data + b * num_rois * channel * pooled_h * pooled_w;
for (int roi_idx = 0; roi_idx < num_rois; ++roi_idx) {
const int roi_batch_idx = batch_rois[roi_idx * 5];
assert(roi_batch_idx == b);
const float roi_start_x = batch_rois[roi_idx * 5 + 1] * spatial_scale;
const float roi_start_y = batch_rois[roi_idx * 5 + 2] * spatial_scale;
const float roi_end_x = batch_rois[roi_idx * 5 + 3] * spatial_scale;
const float roi_end_y = batch_rois[roi_idx * 5 + 4] * spatial_scale;
const float roi_w = std::max(roi_end_x - roi_start_x + 1, 1.0f);
const float roi_h = std::max(roi_end_y - roi_start_y + 1, 1.0f);
float bin_size_w = roi_w / (float)pooled_w;
float bin_size_h = roi_h / (float)pooled_h;
float* batch_data = data + b * channel * height * width;
for (int c = 0; c < channel; ++c) {
for (int ph = 0; ph < pooled_h; ++ph) {
for (int pw = 0; pw < pooled_w; ++pw) {
const float region_start_x = std::min(pw * bin_size_w + roi_start_x, (float)(width));
const float region_start_y = std::min(ph * bin_size_h + roi_start_y, (float)(height));
const float region_end_x = std::min((pw+1) * bin_size_w + roi_start_x, (float)(width));
const float region_end_y = std::min((ph+1) * bin_size_h + roi_start_y, (float)(height));
const int region_grid_w = int(std::ceil(bin_size_w));
const int region_grid_h = int(std::ceil(bin_size_h));
const int output_idx = ph * pooled_w + pw;
if (region_start_x >= region_end_x || region_start_y >= region_end_y) {
batch_output[output_idx] = 0;
continue;
}
float value = 0;
// use lowest(), not min(): min() is the smallest positive float and would
// break the max reduction when all sampled values are negative
float fmax = std::numeric_limits<float>::lowest();
for (int gh = 0; gh < region_grid_h; ++gh) {
for (int gw = 0; gw < region_grid_w; ++gw) {
float x = roi_start_x + gw;
float y = roi_start_y + gh;
// clamp sample coordinates so the bilinear taps stay inside the feature map
const int x_low = std::min((int)x, width - 1);
const int y_low = std::min((int)y, height - 1);
const int x_high = std::min(x_low + 1, width - 1);
const int y_high = std::min(y_low + 1, height - 1);
const float x_ratio = x - x_low;
const float y_ratio = y - y_low;
const float w1 = (1 - y_ratio) * (1 - x_ratio);
const float w2 = (1 - y_ratio) * x_ratio;
const float w3 = y_ratio * (1 - x_ratio);
const float w4 = y_ratio * x_ratio;
// the row stride within one channel plane is 'width', not 'height'
const float data1 = batch_data[y_low * width + x_low];
const float data2 = batch_data[y_low * width + x_high];
const float data3 = batch_data[y_high * width + x_low];
const float data4 = batch_data[y_high * width + x_high];
value = w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
if (value > fmax) {
fmax = value;
}
}
}
batch_output[output_idx] = fmax;
}
}
batch_data += height * width;
batch_output += pooled_h * pooled_w;
}
}
}
}

View File

@ -0,0 +1,30 @@
#include <iostream>
#include <vector>
#include <algorithm>
#include <unordered_map>
#include <runtime/neuron.hpp>
#include <runtime/op_param.hpp>
#include <runtime/cpu_function.hpp>
class ROIAlignOpRuntime : public cvi::runtime::ICpuFunction {
public:
ROIAlignOpRuntime() {}
~ROIAlignOpRuntime();
void setup(std::vector<std::shared_ptr<cvi::runtime::Neuron>> &inputs,
std::vector<std::shared_ptr<cvi::runtime::Neuron>> &outputs,
cvi::OpParam &param);
void run();
static ICpuFunction *open() { return new ROIAlignOpRuntime(); }
static void close(ICpuFunction *func) { delete func; }
private:
std::vector<std::shared_ptr<cvi::runtime::Neuron>> _bottoms;
std::vector<std::shared_ptr<cvi::runtime::Neuron>> _tops;
int pooled_h;
int pooled_w;
float spatial_scale;
};

View File

@ -0,0 +1,74 @@
/*
* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
*/
#include <cmath>
#include <string.h>
#include "SoftmaxOpRuntime.hpp"
SoftmaxOpRuntime::~SoftmaxOpRuntime() {
if (_max)
delete[] _max;
if (_sum)
delete[] _sum;
}
void SoftmaxOpRuntime::setup(std::vector<std::shared_ptr<cvi::runtime::Neuron>> &inputs,
std::vector<std::shared_ptr<cvi::runtime::Neuron>> &outputs,
cvi::OpParam &param) {
_bottom = inputs[0];
_top = outputs[0];
_axis = param.get<int32_t>("axis");
assert(_axis >= 0);
auto shape = _bottom->shape;
_axis = _axis % shape.size();
_n = 1;
for(int i = 0; i < _axis; ++i) {
_n *= shape[i];
}
_inner_dim = 1;
for(size_t i = _axis+1; i < shape.size(); ++i) {
_inner_dim *= shape[i];
}
_c = shape[_axis];
_dim = _c * _inner_dim;
_max = new float[_inner_dim];
_sum = new float[_inner_dim];
}
void SoftmaxOpRuntime::run() {
auto bottom_data = _bottom->cpu_data<float>();
auto top_data = _top->cpu_data<float>();
for (int i = 0; i < _n; ++i) {
memcpy(_max, bottom_data, _inner_dim * sizeof(float));
memset(_sum, 0, _inner_dim * sizeof(float));
// find max value across the channel axis
int c_offset = i * _dim;
for (int j = 0; j < _c; ++j, c_offset += _inner_dim) {
for (int k = 0; k < _inner_dim; k++) {
if (_max[k] < bottom_data[c_offset + k])
_max[k] = bottom_data[c_offset + k];
}
}
// calculate exp(x)
c_offset = i * _dim;
for (int j = 0; j < _c; ++j, c_offset += _inner_dim) {
for (int k = 0; k < _inner_dim; k++) {
top_data[c_offset + k] = std::exp(bottom_data[c_offset + k] - _max[k]);
_sum[k] += top_data[c_offset + k];
}
}
c_offset = i * _dim;
for (int j = 0; j < _c; ++j, c_offset += _inner_dim) {
for (int k = 0; k < _inner_dim; k++) {
top_data[c_offset + k] /= _sum[k];
}
}
}
}

View File

@ -0,0 +1,36 @@
/*
* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
*/
#include <iostream>
#include <vector>
#include <string>
#include <runtime/neuron.hpp>
#include <runtime/cpu_function.hpp>
#include <runtime/op_param.hpp>
class SoftmaxOpRuntime : public cvi::runtime::ICpuFunction {
public:
SoftmaxOpRuntime() = default;
~SoftmaxOpRuntime();
private:
std::shared_ptr<cvi::runtime::Neuron> _bottom;
std::shared_ptr<cvi::runtime::Neuron> _top;
int _axis;
int _inner_dim;
int _dim;
int _c;
int _n;
float *_max = nullptr;
float *_sum = nullptr;
public:
static ICpuFunction *open() { return new SoftmaxOpRuntime(); }
void setup(std::vector<std::shared_ptr<cvi::runtime::Neuron>> &inputs,
std::vector<std::shared_ptr<cvi::runtime::Neuron>> &outputs,
cvi::OpParam &param);
void run();
};

View File

@ -0,0 +1,70 @@
#!/usr/bin/python3
"""
Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
"""
from argparse import ArgumentParser
from cvi_toolkit.transform.BaseConverter import TensorType
from cvi_toolkit.transform.caffe_converter import CaffeConverter
from cvi_toolkit.utils.log_setting import setup_logger
from cvi_toolkit.data.preprocess import preprocess
logger = setup_logger('root', log_level="INFO")
class MyCaffeConverter(CaffeConverter):
def __init__(self, model_name, prototxt, caffe_model, mlir_file_path, batch_size=1):
super().__init__(model_name, prototxt, caffe_model, mlir_file_path, batch_size)
self.caffeop_factory['Upsample'] = lambda layer: self.convert_unpooling_op(layer)
def convert_unpooling_op(self, layer):
assert(self.layerType(layer) == "Upsample")
data, data_shape, _ = self.getOperand(layer.bottom[0])
mask, mask_shape, _ = self.getOperand(layer.bottom[1])
operands = list()
operands.append(data)
operands.append(mask)
p = layer.upsample_param
scale = p.scale
if p.HasField("upsample_h"):
unpool_h = p.upsample_h
else:
unpool_h = mask_shape[2]
if p.HasField("upsample_w"):
unpool_w = p.upsample_w
else:
unpool_w = mask_shape[3]
output_shape = [data_shape[0], data_shape[1], unpool_h, unpool_w]
custom_op_param = {
'tpu': True,
'do_quant': True,
'operation_name': 'unpooling',
'threshold_overwrite': 'backward',
'param': {
'unpool_h': unpool_h,
'unpool_w': unpool_w,
'scale': scale
}
}
print("layer name: {}, top name: {}\n".format(layer.name, layer.top[0]))
custom_op = self.CVI.add_custom_op(layer.name,
operands, output_shape, **custom_op_param)
self.addOperand(layer.top[0], custom_op, output_shape, TensorType.ACTIVATION)
if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument("--model_path", type=str)
parser.add_argument("--model_dat", type=str)
parser.add_argument("--mlir_file_path", type=str)
args = parser.parse_args()
#preprocessor = preprocess()
#preprocessor.config(net_input_dims="360,480",
# resize_dims="360,480")
c = MyCaffeConverter('segnet', args.model_path, args.model_dat,
args.mlir_file_path, batch_size=1)
c.run()

View File

@ -0,0 +1,52 @@
#!/usr/bin/python3
"""
Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
"""
import onnx
from cvi_toolkit.transform.BaseConverter import TensorType
from cvi_toolkit.transform.onnx_converter import OnnxConverter
from cvi_toolkit.transform.tflite_converter_int8 import TFLiteConverter
from cvi_toolkit.transform.tensorflow_converter import TFConverter
from cvi_toolkit.utils.log_setting import setup_logger
from cvi_toolkit.data.preprocess import add_preprocess_parser, preprocess
logger = setup_logger('root', log_level="INFO")
class MyOnnxConverter(OnnxConverter):
def __init__(self, model_name, onnx_model, mlir_file_path, batch_size=1, preprocessor=None):
super().__init__(model_name, onnx_model, mlir_file_path, batch_size, preprocessor.to_dict())
self.onnxop_factory['LeakyRelu'] = lambda node: self.convert_leaky_relu(node)
def convert_leaky_relu(self, onnx_node):
assert(onnx_node.op_type == "LeakyRelu")
alpha = onnx_node.attrs.get("alpha", 0.01)
custom_op_param = {
'tpu': True,
'do_quant': True,
'operation_name': 'leaky_relu',
'threshold_overwrite': 'backward',
'param': {
'negative_slope': float(alpha)
}
}
op, input_shape, tensor_type = self.getOperand(onnx_node.inputs[0])
operands = list()
operands.append(op)
output_shape = input_shape
custom_op = self.CVI.add_custom_op("{}_{}".format(onnx_node.name, onnx_node.op_type),
operands, output_shape, **custom_op_param)
self.addOperand(onnx_node.name, custom_op, output_shape, TensorType.ACTIVATION)
if __name__ == "__main__":
onnx_model = onnx.load('model/yolov3-416.onnx')
preprocessor = preprocess()
preprocessor.config(net_input_dims="416,416",
resize_dims="416,416", crop_method='center', keep_aspect_ratio=True,
raw_scale=1.0, mean='0,0,0', std='1,1,1', input_scale=1.0,
channel_order='bgr', pixel_format=None, data_format='nchw',
aligned=False, gray=False)
c = MyOnnxConverter('yolo_v3', 'model/yolov3-416.onnx',
'yolo_v3_416.mlir', batch_size=1, preprocessor=preprocessor)
c.run()

Binary file not shown. (image, 37 KiB)

Binary file not shown. (image, 353 KiB)

Binary file not shown. (image, 121 KiB)

BIN
cviruntime/doc/assets/logo_0.png Executable file
Binary file not shown. (image, 90 KiB)

File diff suppressed because it is too large

View File

@ -0,0 +1,107 @@
/*
* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
*
* File Name: bmruntime.h
* Description:
*/
#ifndef _BM_RUNTIME_H_
#define _BM_RUNTIME_H_
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
struct bm_context;
typedef struct bm_context *bmctx_t;
struct bm_device;
typedef struct bm_device *bmdev_t;
typedef int32_t bmerr_t;
struct bm_memory;
typedef struct bm_memory *bmmem_t;
typedef bmmem_t bmmem_device_t;
typedef enum bmfmt_e {
BM_FMT_FP32 = 0,
BM_FMT_FP16 = 1,
BM_FMT_INT16 = 2,
BM_FMT_INT8 = 3,
BM_FMT_BF16 = 4,
BM_FMT_MAX = 5
} bmfmt_t;
#define BM_SHAPE_MAX_DIM (4)
typedef struct bmshape_s {
bmfmt_t fmt;
int dim_size;
int dim[BM_SHAPE_MAX_DIM];
} bmshape_t;
typedef struct _cvi_array_base {
uint64_t gaddr_base0;
uint64_t gaddr_base1;
uint64_t gaddr_base2;
uint64_t gaddr_base3;
uint64_t gaddr_base4;
uint64_t gaddr_base5;
uint64_t gaddr_base6;
uint64_t gaddr_base7;
} cvi_array_base;
bmerr_t bm_init(int index, bmctx_t *ctx);
void bm_exit(bmctx_t ctx);
bmmem_device_t bmmem_device_alloc_raw(bmctx_t ctx, size_t size);
bmmem_device_t bmmem_device_prealloc_raw(bmctx_t ctx, bmmem_device_t mem, uint64_t offset, size_t size);
void bmmem_device_free(bmctx_t ctx, bmmem_device_t mem);
void bmmem_device_free_ex(uint64_t p_addr);
size_t bmmem_device_size(bmmem_device_t mem);
uint64_t bmmem_device_addr(bmmem_device_t mem);
int32_t bmmem_device_inc_ref(bmmem_device_t mem);
int32_t bmmem_device_dec_ref(bmmem_device_t mem);
uint8_t* bmmem_device_v_addr(bmmem_device_t mem);
bmerr_t bm_memcpy_s2d(bmctx_t ctx, bmmem_device_t dst, uint8_t* src);
bmerr_t bm_memcpy_d2s(bmctx_t ctx, uint8_t* dst, bmmem_device_t src);
bmerr_t bm_memcpy_s2d_ex(bmctx_t ctx, bmmem_device_t dst, uint8_t* src, uint64_t offset, size_t size);
bmerr_t bm_memcpy_d2s_ex(bmctx_t ctx, uint8_t* dst, bmmem_device_t src, uint64_t offset, size_t size);
bmerr_t bm_load_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz,
uint64_t neuron_gaddr, uint64_t weight_gaddr,
bool enable_pmu, bmmem_device_t *cmdbuf_mem);
bmerr_t bm_run_cmdbuf_ex(bmctx_t ctx, bmmem_device_t cmdbuf_mem, uint16_t *seq_no,
uint64_t input_base_addr, uint64_t output_base_addr);
bmerr_t bm_run_cmdbuf_ex2(bmctx_t ctx, bmmem_device_t cmdbuf_mem, uint16_t *seq_no,
cvi_array_base *array_base);
bmerr_t cvi_run_async(bmctx_t ctx, bmmem_device_t cmdbuf_mem);
bmerr_t cvi_wait_cmdbuf_all(bmctx_t ctx);
bmerr_t bm_run_cmdbuf(bmctx_t ctx, bmmem_device_t cmdbuf_mem,
uint16_t *seq_no);
bmerr_t bm_send_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz,
uint16_t *seq_no);
bmerr_t bm_wait_cmdbuf_done(bmctx_t ctx, uint16_t seq_no);
bmerr_t bm_parse_pmubuf(bmmem_device_t cmdbuf_mem, uint8_t **buf_start, uint32_t *buf_len);
bmerr_t bm_run_cmdbuf_pio(bmctx_t ctx, uint8_t *cmdbuf, size_t sz);
bmerr_t cvi_load_cmdbuf_tee(bmctx_t ctx, uint8_t *cmdbuf, size_t sz,
uint64_t neuron_gaddr, uint64_t weight_gaddr, uint32_t weight_len, bmmem_device_t *cmdbuf_mem);
bmerr_t cvi_run_cmdbuf_tee(bmctx_t ctx, uint16_t *seq_no, uint64_t dmabuf_addr, cvi_array_base *array_base);
void bm_device_set_base_reg(bmctx_t ctx, uint32_t inx, uint64_t addr);
uint64_t bm_device_read_base_reg(bmctx_t ctx, unsigned int inx);
void cviruntime_cvikernel_create(bmctx_t ctx, void **p_bk_ctx);
void cviruntime_cvikernel_submit(bmctx_t ctx);
void cviruntime_cvikernel_destroy(bmctx_t ctx);
#ifdef __cplusplus
}
#endif
#endif /* _BM_RUNTIME_H_ */
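The legacy bmruntime API above is cmdbuf oriented: a cmdbuf is loaded once against neuron/weight base addresses, then executed and waited on. The sketch below only illustrates that sequence; the function name run_legacy and its parameters are hypothetical, the cmdbuf and weight blobs are assumed to come from a compiled model, and a return value of 0 is assumed to mean success.

#include <stddef.h>
#include "bmruntime.h"

/* run_legacy and its parameters are hypothetical; cmdbuf/weight blobs are
 * assumed to come from a compiled model. Success is assumed to be 0. */
int run_legacy(uint8_t *cmdbuf, size_t cmdbuf_sz,
               uint8_t *weight, size_t weight_sz, size_t neuron_sz) {
  bmctx_t ctx;
  if (bm_init(0, &ctx) != 0)
    return -1;

  /* Device buffers for activations (neuron) and weights. */
  bmmem_device_t neuron_mem = bmmem_device_alloc_raw(ctx, neuron_sz);
  bmmem_device_t weight_mem = bmmem_device_alloc_raw(ctx, weight_sz);
  bm_memcpy_s2d(ctx, weight_mem, weight);

  /* Load the cmdbuf against the two base addresses, then run and wait. */
  bmmem_device_t cmdbuf_mem;
  uint16_t seq_no = 0;
  bm_load_cmdbuf(ctx, cmdbuf, cmdbuf_sz,
                 bmmem_device_addr(neuron_mem),
                 bmmem_device_addr(weight_mem),
                 false /* enable_pmu */, &cmdbuf_mem);
  bm_run_cmdbuf(ctx, cmdbuf_mem, &seq_no);
  bm_wait_cmdbuf_done(ctx, seq_no);

  bmmem_device_free(ctx, cmdbuf_mem);
  bmmem_device_free(ctx, weight_mem);
  bmmem_device_free(ctx, neuron_mem);
  bm_exit(ctx);
  return 0;
}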

View File

@ -0,0 +1,26 @@
/*
* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
*
* File Name: bmruntime_bmkernel.h
* Description:
*/
#ifndef _BM_RUNTIME_BMKERNEL_H_
#define _BM_RUNTIME_BMKERNEL_H_
#include <bmruntime.h>
#ifdef __cplusplus
extern "C" {
#endif
void bmruntime_bmkernel_create(bmctx_t ctx, void **p_bk_ctx);
void bmruntime_bmkernel_destroy(bmctx_t ctx);
void bmruntime_bmkernel_submit(bmctx_t ctx);
void bmruntime_bmkernel_submit_pio(bmctx_t ctx);
#ifdef __cplusplus
}
#endif
#endif /* _BM_RUNTIME_BMKERNEL_H_ */

View File

@ -0,0 +1,301 @@
/*
* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
*
* File Name: cviruntime.h
* Description:
*/
#ifndef _CVIRUNTIME_H_
#define _CVIRUNTIME_H_
#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdarg.h>
#include "cvitpu_debug.h"
#ifdef __cplusplus
extern "C" {
#endif
// data type of tensor
typedef enum {
CVI_FMT_FP32 = 0,
CVI_FMT_INT32 = 1,
CVI_FMT_UINT32 = 2,
CVI_FMT_BF16 = 3,
CVI_FMT_INT16 = 4,
CVI_FMT_UINT16 = 5,
CVI_FMT_INT8 = 6,
CVI_FMT_UINT8 = 7
} CVI_FMT;
// memory source of Tensor buf.
typedef enum {
CVI_MEM_SYSTEM = 1,
CVI_MEM_DEVICE = 2
} CVI_MEM_TYPE_E;
// pixel format
typedef enum {
CVI_NN_PIXEL_RGB_PACKED = 0,
CVI_NN_PIXEL_BGR_PACKED = 1,
CVI_NN_PIXEL_RGB_PLANAR = 2,
CVI_NN_PIXEL_BGR_PLANAR = 3,
CVI_NN_PIXEL_YUV_NV12 = 11,
CVI_NN_PIXEL_YUV_NV21 = 12,
CVI_NN_PIXEL_YUV_420_PLANAR = 13,
CVI_NN_PIXEL_GRAYSCALE = 15,
CVI_NN_PIXEL_TENSOR = 100,
CVI_NN_PIXEL_RGBA_PLANAR = 1000,
// please don't use below values,
// only for backward compatibility
CVI_NN_PIXEL_PLANAR = 101,
CVI_NN_PIXEL_PACKED = 102
} CVI_NN_PIXEL_FORMAT_E;
typedef enum {
/*
* bool, default value is false,
* if set to true, runtime will output all tensors as
* output tensors for debugging.
*/
OPTION_OUTPUT_ALL_TENSORS = 4,
/*
* unsigned int, default value is 0,
* set program id, for switch programs in cvimodel
*/
OPTION_PROGRAM_INDEX = 9,
// DEPRECATED
OPTION_BATCH_SIZE = 1,
// DEPRECATED
OPTION_SKIP_POSTPROCESS = 6,
// DEPRECATED
OPTION_PREPARE_BUF_FOR_INPUTS = 2,
// DEPRECATED
OPTION_PREPARE_BUF_FOR_OUTPUTS = 3,
// DEPRECATED
OPTION_SKIP_PREPROCESS = 5,
// DEPRECATED
OPTION_INPUT_MEM_TYPE = 7,
// DEPRECATED
OPTION_OUTPUT_MEM_TYPE = 8
} CVI_CONFIG_OPTION;
#define CVI_DIM_MAX (6)
typedef struct {
int32_t dim[CVI_DIM_MAX];
size_t dim_size;
} CVI_SHAPE;
typedef struct {
char *name;
CVI_SHAPE shape;
CVI_FMT fmt;
size_t count;
size_t mem_size;
uint8_t *sys_mem;
uint64_t paddr;
CVI_MEM_TYPE_E mem_type;
float qscale;
int zero_point;
CVI_NN_PIXEL_FORMAT_E pixel_format;
bool aligned;
float mean[3];
float scale[3];
void *owner;
char reserved[32];
} CVI_TENSOR;
typedef CVI_NN_PIXEL_FORMAT_E CVI_FRAME_TYPE;
#define CVI_FRAME_PLANAR CVI_NN_PIXEL_PLANAR
#define CVI_FRAME_PACKAGE CVI_NN_PIXEL_PACKED
typedef struct {
CVI_FRAME_TYPE type;
CVI_SHAPE shape;
CVI_FMT fmt;
uint32_t stride[3];
uint64_t pyaddr[3];
} CVI_VIDEO_FRAME_INFO;
typedef void *CVI_MODEL_HANDLE;
typedef int CVI_RC;
/*
* Register a cvimodel file to runtime, and return a model handle.
* @param [in] model_file, file name of cvimodel.
* @param [out] model, handle to registered model.
*/
CVI_RC CVI_NN_RegisterModel(const char *model_file, CVI_MODEL_HANDLE *model);
/*
* Register a cvimodel file from memory, and return a model handle.
* @param [in] buf, buffer to store cvimodel data.
* @param [in] size, bytes of cvimodel data.
* @param [out] model, handle to registered model.
*/
CVI_RC CVI_NN_RegisterModelFromBuffer(const int8_t *buf, uint32_t size, CVI_MODEL_HANDLE *model);
CVI_RC CVI_NN_RegisterModelFromFd(const int fd, const size_t ud_offset, CVI_MODEL_HANDLE *model);
/*
* Clone the model pointed to by a previous model handle; this increments
* the reference count of the model. The returned handle shares resources
* with the previous handle, which saves considerable memory.
* @param [in] model, previous handle of model
* @param [out] cloned, cloned handle of same model.
*/
CVI_RC CVI_NN_CloneModel(CVI_MODEL_HANDLE model, CVI_MODEL_HANDLE *cloned);
/*
* Get version number of cvimodel.
* @param [in] model, previous handle of model
* @param [out] major version number.
* @param [out] minor version number.
*/
CVI_RC CVI_NN_GetModelVersion(CVI_MODEL_HANDLE model, int32_t *major, int32_t *minor);
/*
* Get target chip name of cvimodel.
* @param [in] model, handle of model
* @return target name, such as cv182x, cv183x
*/
const char * CVI_NN_GetModelTarget(CVI_MODEL_HANDLE model);
/*
* Set the configuration specified by CVI_CONFIG_OPTION.
* This API must be called before GetInputOutputTensors if the user
* wants to change the default configuration.
* Each configuration only needs to be set once.
* @param [in] model, handle of model
* @param [in] option, option defined in enum CVI_CONFIG_OPTION
* @param [in] variadic value related to the given option
*/
CVI_RC CVI_NN_SetConfig(CVI_MODEL_HANDLE model, CVI_CONFIG_OPTION option, ...);
/*
* Get input and output tensors of model. It needs to be called before
* the Forward/ForwardAsync APIs.
* @param [in] model, handle of model.
* @param [out] inputs, array of input tensors.
* @param [out] input_num, number of input tensors.
* @param [out] outputs, array of output tensors.
* @param [out] output_num, number of output tensors.
*/
CVI_RC CVI_NN_GetInputOutputTensors(CVI_MODEL_HANDLE model, CVI_TENSOR **inputs,
int32_t *input_num, CVI_TENSOR **outputs, int32_t *output_num);
/*
* Inference forwarding in blocking mode.
*/
CVI_RC CVI_NN_Forward(CVI_MODEL_HANDLE model, CVI_TENSOR inputs[], int32_t input_num,
CVI_TENSOR outputs[], int32_t output_num);
/*
* Inference forwarding in asynchronous mode;
* wait for the result by calling ForwardWait.
*/
CVI_RC CVI_NN_ForwardAsync(CVI_MODEL_HANDLE model, CVI_TENSOR inputs[], int32_t input_num,
CVI_TENSOR outputs[], int32_t output_num, void **task_no);
/*
* Waiting result after do inference forward in async mode.
*/
CVI_RC CVI_NN_ForwardWait(CVI_MODEL_HANDLE model, void *task_no);
/*
* Decrement the reference count of the model.
* All resources of the model are cleaned up once the
* reference count drops to zero.
*/
CVI_RC CVI_NN_CleanupModel(CVI_MODEL_HANDLE model);
///
/// Helper functions
///
CVI_RC CVI_NN_GetInputTensors(CVI_MODEL_HANDLE model, CVI_TENSOR **inputs, int32_t *input_num);
CVI_RC CVI_NN_GetOutputTensors(CVI_MODEL_HANDLE model, CVI_TENSOR **outputs, int32_t *output_num);
#define CVI_NN_DEFAULT_TENSOR (NULL)
/*
* Get tensor from input or output tensors by name.
* @param [in] name, name of the wanted tensor.
* if value is CVI_NN_DEFAULT_TENSOR or NULL, the first tensor is returned.
* Wildcard matching is also supported if the name ends with a '*' character.
* @param [in] tensors, array of input or output tensors.
* @param [in] num, number of input or output tensors.
*/
CVI_TENSOR *CVI_NN_GetTensorByName(const char *name, CVI_TENSOR *tensors, int32_t num);
/*
* Get Name of tensor.
*/
char *CVI_NN_TensorName(CVI_TENSOR *tensor);
/*
* Get Buffer pointer of tensor.
*/
void *CVI_NN_TensorPtr(CVI_TENSOR *tensor);
/*
* Get Byte size of tensor's buffer.
* tensor size = tensor count * sizeof(tensor data type)
*/
size_t CVI_NN_TensorSize(CVI_TENSOR *tensor);
/*
* Get Count of elements stored in tensor.
*/
size_t CVI_NN_TensorCount(CVI_TENSOR *tensor);
/*
* Get quant scale to do quantization(fp32 -> int8)
*/
float CVI_NN_TensorQuantScale(CVI_TENSOR *tensor);
/*
* Get quant zero point to do asymmetric quantization(fp32 -> int8)
*/
int CVI_NN_TensorQuantZeroPoint(CVI_TENSOR *tensor);
/*
* Get shape of a tensor.
*/
CVI_SHAPE CVI_NN_TensorShape(CVI_TENSOR *tensor);
/*
* Set system memory for tensor.
*/
CVI_RC CVI_NN_SetTensorPtr(CVI_TENSOR *tensor, void *mem);
/*
* Set physical Address for tensor.
*/
CVI_RC CVI_NN_SetTensorPhysicalAddr(CVI_TENSOR *tensor, uint64_t paddr);
/*
* Do data copy from video frame to tensor
* WARNING: this API is DEPRECATED.
*/
CVI_RC CVI_NN_SetTensorWithVideoFrame(
CVI_MODEL_HANDLE model, CVI_TENSOR* tensor,
CVI_VIDEO_FRAME_INFO* video_frame_info);
/*
* Do data copy from video frame to tensor
* WARNING: this API is DEPRECATED.
*/
CVI_RC CVI_NN_FeedTensorWithFrames(
CVI_MODEL_HANDLE model, CVI_TENSOR *tensor,
CVI_FRAME_TYPE type, CVI_FMT format,
int32_t channel_num, uint64_t *channel_paddrs,
int32_t height, int32_t width, uint32_t height_stride);
/*
* Fill frame data from VPSS into the tensor.
*/
CVI_RC CVI_NN_SetTensorWithAlignedFrames(
CVI_TENSOR *tensor, uint64_t frame_paddrs[],
int32_t frame_num, CVI_NN_PIXEL_FORMAT_E pixel_format);
/*
* Set shared memory size before registering any cvimodel.
*/
void CVI_NN_Global_SetSharedMemorySize(size_t size);
#ifdef __cplusplus
}
#endif
#endif // _CVIRUNTIME_H_
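The header above covers the whole model-inference flow. The snippet below is a minimal sketch of the typical calling sequence (register, fetch tensors, forward, cleanup), not an official sample: the model file name "sample.cvimodel" is hypothetical, the input is simply zero-filled, and a real application would add quantization/pre-processing before CVI_NN_Forward.

#include <stdio.h>
#include <string.h>
#include "cviruntime.h"

int main(void) {
  CVI_MODEL_HANDLE model = NULL;
  /* "sample.cvimodel" is a hypothetical file name used only for illustration. */
  if (CVI_NN_RegisterModel("sample.cvimodel", &model) != CVI_RC_SUCCESS) {
    printf("register model failed\n");
    return -1;
  }

  CVI_TENSOR *inputs = NULL, *outputs = NULL;
  int32_t input_num = 0, output_num = 0;
  CVI_NN_GetInputOutputTensors(model, &inputs, &input_num, &outputs, &output_num);

  /* Zero-fill the first input; a real application quantizes/preprocesses here. */
  CVI_TENSOR *in = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, inputs, input_num);
  memset(CVI_NN_TensorPtr(in), 0, CVI_NN_TensorSize(in));

  /* Blocking inference. */
  if (CVI_NN_Forward(model, inputs, input_num, outputs, output_num) == CVI_RC_SUCCESS) {
    CVI_TENSOR *out = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, outputs, output_num);
    printf("output %s: %zu bytes\n", CVI_NN_TensorName(out), CVI_NN_TensorSize(out));
  }

  /* Drop the reference; resources are released when the count reaches zero. */
  CVI_NN_CleanupModel(model);
  return 0;
}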

View File

@ -0,0 +1,119 @@
#ifndef _CVIRUNTIME_CONTEXT_H_
#define _CVIRUNTIME_CONTEXT_H_
#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>
#include "cvitpu_debug.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef void *CVI_RT_HANDLE;
typedef void *CVI_RT_SHANDLE;
typedef void *CVI_RT_KHANDLE;
typedef void *CVI_RT_MEM;
typedef int CVI_RC;
typedef struct __CVI_RT_ARRAYBASE {
uint64_t gaddr_base0;
uint64_t gaddr_base1;
uint64_t gaddr_base2;
uint64_t gaddr_base3;
uint64_t gaddr_base4;
uint64_t gaddr_base5;
uint64_t gaddr_base6;
uint64_t gaddr_base7;
} CVI_RT_ARRAYBASE;
typedef enum {
CVI_ALLOC_WEIGHT = 0,
CVI_ALLOC_PROGRAM = 1,
CVI_ALLOC_NEURON = 2,
CVI_ALLOC_SHARED = 3,
CVI_ALLOC_DMABUF = 4,
CVI_ALLOC_UNKNOWN = 5
} CVI_ALLOC_TYPE;
typedef CVI_RT_MEM (*CVI_MEM_ALLOC_CB) (CVI_RT_HANDLE, uint64_t, CVI_ALLOC_TYPE, const char *);
typedef void (*CVI_MEM_FREE_CB) (CVI_RT_HANDLE, CVI_RT_MEM);
CVI_RC CVI_RT_Init(CVI_RT_HANDLE *rt_handle);
CVI_RC CVI_RT_DeInit(CVI_RT_HANDLE rt_handle);
CVI_RT_KHANDLE CVI_RT_RegisterKernel(CVI_RT_HANDLE rt_handle, uint32_t cmdbuf_size);
CVI_RC CVI_RT_UnRegisterKernel(CVI_RT_KHANDLE rt_khandle);
CVI_RC CVI_RT_Submit(CVI_RT_KHANDLE rt_khandle);
CVI_RC CVI_RT_SubmitAsync(CVI_RT_KHANDLE rt_khandle, uint8_t submit_previous);
CVI_RC CVI_RT_WaitForAsync(CVI_RT_KHANDLE rt_khandle);
CVI_RC CVI_RT_LoadCmdbuf(
CVI_RT_HANDLE rt_handle, uint8_t *cmdbuf,
uint64_t cmdbuf_sz, uint64_t gaddr_base0,
uint64_t gaddr_base1, bool enable_pmu,
CVI_RT_MEM *cmdbuf_mem);
CVI_RC CVI_RT_LoadDmabuf(
CVI_RT_HANDLE rt_handle, CVI_RT_MEM dmabuf,
uint64_t cmdbuf_sz, uint64_t gaddr_base0,
uint64_t gaddr_base1, bool enable_pmu, CVI_RT_MEM *dmabuf_mem);
CVI_RC CVI_RT_RunCmdbuf(
CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem,
uint64_t gaddr_base2, uint64_t gaddr_base3);
CVI_RC CVI_RT_RunCmdbufEx(
CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem,
CVI_RT_ARRAYBASE *p_array_base);
CVI_RC CVI_RT_LoadCmdbufTee(
CVI_RT_HANDLE rt_handle, uint8_t *cmdbuf,
size_t sz, uint64_t neuron_gaddr, uint64_t weight_gaddr,
uint32_t weight_len, CVI_RT_MEM *cmdbuf_mem);
CVI_RC CVI_RT_RunCmdbufTee(
CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem,
CVI_RT_ARRAYBASE *p_array_base);
CVI_RT_MEM CVI_RT_MemAlloc(CVI_RT_HANDLE rt_handle, uint64_t size);
CVI_RT_MEM CVI_RT_MemPreAlloc(CVI_RT_MEM mem, uint64_t offset, uint64_t size);
void CVI_RT_MemFree(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem);
void CVI_RT_MemFreeEx(uint64_t p_addr);
uint64_t CVI_RT_MemGetSize(CVI_RT_MEM mem);
uint64_t CVI_RT_MemGetPAddr(CVI_RT_MEM mem);
uint8_t* CVI_RT_MemGetVAddr(CVI_RT_MEM mem);
int32_t CVI_RT_MemIncRef(CVI_RT_MEM mem);
int32_t CVI_RT_MemDecRef(CVI_RT_MEM mem);
CVI_RC CVI_RT_MemCopyS2D(CVI_RT_HANDLE rt_handle, CVI_RT_MEM dst, uint8_t* src);
CVI_RC CVI_RT_MemCopyD2S(CVI_RT_HANDLE rt_handle, uint8_t* dst, CVI_RT_MEM src);
CVI_RC CVI_RT_MemCopyS2DEx(
CVI_RT_HANDLE rt_handle, CVI_RT_MEM dst,
uint64_t offset, uint64_t len, uint8_t* src);
CVI_RC CVI_RT_MemFlush(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem);
CVI_RC CVI_RT_MemInvld(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem);
CVI_RC CVI_RT_MemFlushEx(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem, uint64_t len);
CVI_RC CVI_RT_MemInvldEx(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem, uint64_t len);
CVI_RC CVI_RT_ParsePmuBuf(CVI_RT_MEM cmdbuf_mem, uint8_t **buf_start, uint32_t *buf_len);
CVI_RC CVI_RT_SetBaseReg(CVI_RT_HANDLE rt_handle, uint32_t inx, uint64_t base_addr);
/*
* set memory alloc and free callback function.
* @param [in] CVI_MEM_ALLOC_CB, memory alloc function
* @param [in] CVI_MEM_FREE_CB, memory free function
*/
CVI_RC CVI_RT_Global_SetMemAllocCallback(
CVI_MEM_ALLOC_CB alloc_cb, CVI_MEM_FREE_CB free_cb);
/*
* reset to default memory alloc and free function.
*/
void CVI_RT_Global_ResetMemAllocCallback();
#ifdef __cplusplus
}
#endif
#endif // _CVIRUNTIME_CONTEXT_H_
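For the low-level context API above, the usual pattern is to create a runtime handle, move data between system and device memory, and release everything in reverse order. The sketch below only illustrates that pattern under stated assumptions: the helper name run_once is hypothetical, and the copy helpers are assumed to transfer the full allocated size of the CVI_RT_MEM buffer.

#include "cviruntime_context.h"

/* run_once is a hypothetical helper; host_buf/len stand in for any payload. */
int run_once(uint8_t *host_buf, uint64_t len) {
  CVI_RT_HANDLE rt = NULL;
  if (CVI_RT_Init(&rt) != CVI_SUCCESS)
    return -1;

  /* Allocate device memory and copy the host buffer into it. */
  CVI_RT_MEM mem = CVI_RT_MemAlloc(rt, len);
  CVI_RT_MemCopyS2D(rt, mem, host_buf);

  /* ... submit work that consumes 'mem' (e.g. a loaded cmdbuf) here ... */

  /* Copy the result back, then release in reverse order. */
  CVI_RT_MemCopyD2S(rt, host_buf, mem);
  CVI_RT_MemFree(rt, mem);
  CVI_RT_DeInit(rt);
  return 0;
}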

View File

@ -0,0 +1,48 @@
/*
* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
*
* File Name: cviruntime_extra.h
* Description:
*/
#ifndef _CVIRUNTIME_EXTRA_H_
#define _CVIRUNTIME_EXTRA_H_
#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdarg.h>
#include "cvitpu_debug.h"
#include "cviruntime.h"
#include "cviruntime_context.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef void* CVI_KFUNC_HANDLE;
/*
* Create tpu kernel function by given parameters.
*/
CVI_KFUNC_HANDLE CVI_NN_PrepareMatrixMulKernelFunc(
CVI_RT_HANDLE ctx, CVI_FMT fmt, uint32_t m, uint32_t k, uint32_t n);
/*
*/
CVI_KFUNC_HANDLE CVI_NN_PrepareGrayImageLightKernelFunc(
CVI_RT_HANDLE ctx, uint32_t ih, uint32_t iw, uint32_t kernel_sz);
/*
* Run tpu kernel function
*/
CVI_RC CVI_NN_RunKernelFunc(CVI_KFUNC_HANDLE kfun, int32_t mem_num, ...);
/*
* Destroy tpu kernel function
*/
CVI_RC CVI_NN_DestroyKernelFunc(CVI_KFUNC_HANDLE kfun);
#ifdef __cplusplus
}
#endif
#endif // _CVIRUNTIME_EXTRA_H_

View File

@ -0,0 +1,179 @@
#ifndef _CVITPU_DEBUG_H_
#define _CVITPU_DEBUG_H_
#include <stdio.h>
#include <syslog.h>
#include <assert.h>
#ifndef CVI_SUCCESS
#define CVI_SUCCESS 0
#endif
#ifndef CVI_FAILURE
#define CVI_FAILURE -1
#endif
#define CVI_RC_SUCCESS CVI_SUCCESS // The operation was successful
#define CVI_RC_AGAIN CVI_ERR_TPU_AGAIN // Not ready yet
#define CVI_RC_FAILURE CVI_FAILURE // General failure
#define CVI_RC_TIMEOUT CVI_ERR_TPU_TIMEOUT // Timeout
#define CVI_RC_UNINIT CVI_ERR_TPU_UNINIT // Uninitialized
#define CVI_RC_INVALID_ARG CVI_ERR_TPU_INVALID_ARG // Arguments invalid
#define CVI_RC_NOMEM CVI_ERR_TPU_NOMEM // Not enough memory
#define CVI_RC_DATA_ERR CVI_ERR_TPU_DATA_ERR // Data error
#define CVI_RC_BUSY CVI_ERR_TPU_BUSY // Busy
#define CVI_RC_UNSUPPORT CVI_ERR_TPU_UNSUPPORT // Not supported yet
#define LOG_TOWARD_SYSLOG
#if defined(__i386__) || defined(__x86_64__)
#undef LOG_TOWARD_SYSLOG
#endif
#ifdef LOG_TOWARD_SYSLOG
#define TPU_LOG_FATAL(...) \
do { \
syslog(LOG_LOCAL6|0, __VA_ARGS__); \
} while (0)
#define TPU_LOG_ERROR(...) \
do { \
syslog(LOG_LOCAL6|3, __VA_ARGS__); \
} while (0)
#define TPU_LOG_WARNING(...) \
do { \
syslog(LOG_LOCAL6|4, __VA_ARGS__); \
} while (0)
#define TPU_LOG_NOTICE(...) \
do { \
syslog(LOG_LOCAL6|5, __VA_ARGS__); \
} while (0)
#define TPU_LOG_INFO(...) \
do { \
syslog(LOG_LOCAL6|6, __VA_ARGS__); \
} while (0)
#define TPU_LOG_DEBUG(...) \
do { \
syslog(LOG_LOCAL6|7, __VA_ARGS__); \
} while (0)
#else
#define TPU_LOG_FATAL(...) printf(__VA_ARGS__)
#define TPU_LOG_ERROR(...) printf(__VA_ARGS__)
#define TPU_LOG_WARNING(...) printf(__VA_ARGS__)
#define TPU_LOG_NOTICE(...) printf(__VA_ARGS__)
#define TPU_LOG_INFO(...) printf(__VA_ARGS__)
#define TPU_LOG_DEBUG(...) printf(__VA_ARGS__)
#endif
#define NDEBUG_ASSERT
#ifdef NDEBUG_ASSERT
#define TPU_ASSERT(condition, message) \
do { \
if (!(condition)) { \
TPU_LOG_ERROR("%s ERROR in %s %d\n", message ? message : "", __FILE__, __LINE__); \
assert(0); \
} \
} while (0)
#else
#define TPU_ASSERT(condition, message) \
do { \
assert(condition && message); \
} while (0)
#endif
// the following follows the middleware's pre-defined error code layout
/*******************************************************************************/
/*|----------------------------------------------------------------|*/
/*| 11| APP_ID | MOD_ID | ERR_LEVEL | ERR_ID |*/
/*|----------------------------------------------------------------|*/
/*|<--><--6bits----><----8bits---><--3bits---><------13bits------->|*/
/*******************************************************************************/
#define CVI_TPU_ERR_APPID (0x00000000L)
#define CVI_TPU_RUNTIME 0x77
#define CVI_TPU_ERR(module, level, errid) \
((int)(0xC0000000L | (CVI_TPU_ERR_APPID) | ((module) << 16) | ((level)<<13) | (errid)))
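/*
 * Worked example (illustrative): with CVI_TPU_RUNTIME = 0x77,
 * TPU_EN_ERR_LEVEL_ERROR = 4 and TPU_EN_ERR_TIMEOUT = 21 (both defined
 * below), CVI_ERR_TPU_TIMEOUT expands to
 *   0xC0000000 | (0x77 << 16) | (4 << 13) | 21
 * = 0xC0000000 |  0x00770000  |  0x8000   | 0x15
 * = 0xC0778015
 */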
typedef enum _TPU_ERR_LEVEL_E {
TPU_EN_ERR_LEVEL_DEBUG = 0, /* debug-level */
TPU_EN_ERR_LEVEL_INFO, /* informational */
TPU_EN_ERR_LEVEL_NOTICE, /* normal but significant condition */
TPU_EN_ERR_LEVEL_WARNING, /* warning conditions */
TPU_EN_ERR_LEVEL_ERROR, /* error conditions */
TPU_EN_ERR_LEVEL_CRIT, /* critical conditions */
TPU_EN_ERR_LEVEL_ALERT, /* action must be taken immediately */
TPU_EN_ERR_LEVEL_FATAL, /* just for compatibility with previous version */
TPU_EN_ERR_LEVEL_BUTT
} TPU_ERR_LEVEL_E;
/* NOTE! the following defined all common error code, */
/*** all module must reserved 0~63 for their common error code*/
typedef enum _TPU_EN_ERR_CODE_E {
TPU_EN_ERR_INVALID_DEVID = 1, /* invalid device ID */
TPU_EN_ERR_INVALID_CHNID = 2, /* invalid channel ID*/
TPU_EN_ERR_ILLEGAL_PARAM = 3,
/* at least one parameter is illegal*/
/* eg, an illegal enumeration value */
TPU_EN_ERR_EXIST = 4, /* resource exists*/
TPU_EN_ERR_UNEXIST = 5, /* resource does not exist */
TPU_EN_ERR_NULL_PTR = 6, /* using a NULL point*/
TPU_EN_ERR_NOT_CONFIG = 7,
/* try to enable or initialize system, device*/
/* or channel, before configuring attributes*/
TPU_EN_ERR_NOT_SUPPORT = 8,
/* operation or type is not supported for now*/
TPU_EN_ERR_NOT_PERM = 9,
/* operation is not permitted*/
/* eg, try to change static attribute*/
TPU_EN_ERR_INVALID_PIPEID = 10,
/* invalid pipe ID*/
TPU_EN_ERR_INVALID_GRPID = 11,
/* invalid group ID*/
TPU_EN_ERR_NOMEM = 12,
/* failure caused by malloc memory*/
TPU_EN_ERR_NOBUF = 13,
/* failure caused by malloc buffer*/
TPU_EN_ERR_BUF_EMPTY = 14,
/* no data in buffer */
TPU_EN_ERR_BUF_FULL = 15,
/* no buffer for new data*/
TPU_EN_ERR_SYS_NOTREADY = 16,
/* System is not ready, maybe not initialized or*/
/* loaded; this error code is also returned when*/
/* opening a device file fails.*/
TPU_EN_ERR_BADADDR = 17,
/* bad address,*/
/* eg. used for copy_from_user & copy_to_user*/
TPU_EN_ERR_BUSY = 18,
/* resource is busy,*/
/* eg. destroy a venc chn without unregister it */
TPU_EN_ERR_SIZE_NOT_ENOUGH = 19,
/* buffer size is smaller than the actual size required */
TPU_EN_ERR_INVALID_VB = 20,
/* tpu error code extension */
TPU_EN_ERR_TIMEOUT = 21,
TPU_EN_ERR_DATAERR = 22,
/* invalid VB handle */
TPU_EN_ERR_BUTT = 63,
/* maximum code, private error code of all modules*/
/* must be greater than it */
} TPU_EN_ERR_CODE_E;
typedef enum _CVI_TPU_ERRCODE {
CVI_ERR_TPU_SUCCESS = 0,
CVI_ERR_TPU_AGAIN = CVI_TPU_ERR(CVI_TPU_RUNTIME, TPU_EN_ERR_LEVEL_ERROR, TPU_EN_ERR_SYS_NOTREADY),
CVI_ERR_TPU_FAILURE = -1,
CVI_ERR_TPU_TIMEOUT = CVI_TPU_ERR(CVI_TPU_RUNTIME, TPU_EN_ERR_LEVEL_ERROR, TPU_EN_ERR_TIMEOUT),
CVI_ERR_TPU_UNINIT = CVI_TPU_ERR(CVI_TPU_RUNTIME, TPU_EN_ERR_LEVEL_ERROR, TPU_EN_ERR_NOT_CONFIG),
CVI_ERR_TPU_INVALID_ARG = CVI_TPU_ERR(CVI_TPU_RUNTIME, TPU_EN_ERR_LEVEL_ERROR, TPU_EN_ERR_ILLEGAL_PARAM),
CVI_ERR_TPU_NOMEM = CVI_TPU_ERR(CVI_TPU_RUNTIME, TPU_EN_ERR_LEVEL_ERROR, TPU_EN_ERR_NOMEM),
CVI_ERR_TPU_DATA_ERR = CVI_TPU_ERR(CVI_TPU_RUNTIME, TPU_EN_ERR_LEVEL_ERROR, TPU_EN_ERR_DATAERR),
CVI_ERR_TPU_BUSY = CVI_TPU_ERR(CVI_TPU_RUNTIME, TPU_EN_ERR_LEVEL_ERROR, TPU_EN_ERR_BUSY),
CVI_ERR_TPU_UNSUPPORT = CVI_TPU_ERR(CVI_TPU_RUNTIME, TPU_EN_ERR_LEVEL_ERROR, TPU_EN_ERR_NOT_SUPPORT),
} CVI_TPU_ERRCODE;
#endif

View File

@ -0,0 +1,774 @@
/*
* LZ4 - Fast LZ compression algorithm
* Header File
* Copyright (C) 2011-present, Yann Collet.
BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
You can contact the author at :
- LZ4 homepage : http://www.lz4.org
- LZ4 source repository : https://github.com/lz4/lz4
*/
#if defined (__cplusplus)
extern "C" {
#endif
#ifndef LZ4_H_2983827168210
#define LZ4_H_2983827168210
/* --- Dependency --- */
#include <stddef.h> /* size_t */
/**
Introduction
LZ4 is lossless compression algorithm, providing compression speed >500 MB/s per core,
scalable with multi-cores CPU. It features an extremely fast decoder, with speed in
multiple GB/s per core, typically reaching RAM speed limits on multi-core systems.
The LZ4 compression library provides in-memory compression and decompression functions.
It gives full buffer control to user.
Compression can be done in:
- a single step (described as Simple Functions)
- a single step, reusing a context (described in Advanced Functions)
- unbounded multiple steps (described as Streaming compression)
lz4.h generates and decodes LZ4-compressed blocks (doc/lz4_Block_format.md).
Decompressing such a compressed block requires additional metadata.
Exact metadata depends on exact decompression function.
For the typical case of LZ4_decompress_safe(),
metadata includes block's compressed size, and maximum bound of decompressed size.
Each application is free to encode and pass such metadata in whichever way it wants.
lz4.h only handle blocks, it can not generate Frames.
Blocks are different from Frames (doc/lz4_Frame_format.md).
Frames bundle both blocks and metadata in a specified manner.
Embedding metadata is required for compressed data to be self-contained and portable.
Frame format is delivered through a companion API, declared in lz4frame.h.
The `lz4` CLI can only manage frames.
*/
/*^***************************************************************
* Export parameters
*****************************************************************/
/*
* LZ4_DLL_EXPORT :
* Enable exporting of functions when building a Windows DLL
* LZ4LIB_VISIBILITY :
* Control library symbols visibility.
*/
#ifndef LZ4LIB_VISIBILITY
# if defined(__GNUC__) && (__GNUC__ >= 4)
# define LZ4LIB_VISIBILITY __attribute__ ((visibility ("default")))
# else
# define LZ4LIB_VISIBILITY
# endif
#endif
#if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT==1)
# define LZ4LIB_API __declspec(dllexport) LZ4LIB_VISIBILITY
#elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT==1)
# define LZ4LIB_API __declspec(dllimport) LZ4LIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
#else
# define LZ4LIB_API LZ4LIB_VISIBILITY
#endif
/*------ Version ------*/
#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */
#define LZ4_VERSION_MINOR 9 /* for new (non-breaking) interface capabilities */
#define LZ4_VERSION_RELEASE 3 /* for tweaks, bug-fixes, or development */
#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE)
#define LZ4_LIB_VERSION LZ4_VERSION_MAJOR.LZ4_VERSION_MINOR.LZ4_VERSION_RELEASE
#define LZ4_QUOTE(str) #str
#define LZ4_EXPAND_AND_QUOTE(str) LZ4_QUOTE(str)
#define LZ4_VERSION_STRING LZ4_EXPAND_AND_QUOTE(LZ4_LIB_VERSION)
LZ4LIB_API int LZ4_versionNumber (void); /**< library version number; useful to check dll version */
LZ4LIB_API const char* LZ4_versionString (void); /**< library version string; useful to check dll version */
/*-************************************
* Tuning parameter
**************************************/
/*!
* LZ4_MEMORY_USAGE :
* Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
* Increasing memory usage improves compression ratio.
* Reduced memory usage may improve speed, thanks to better cache locality.
* Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache
*/
#ifndef LZ4_MEMORY_USAGE
# define LZ4_MEMORY_USAGE 14
#endif
/*-************************************
* Simple Functions
**************************************/
/*! LZ4_compress_default() :
* Compresses 'srcSize' bytes from buffer 'src'
* into already allocated 'dst' buffer of size 'dstCapacity'.
* Compression is guaranteed to succeed if 'dstCapacity' >= LZ4_compressBound(srcSize).
* It also runs faster, so it's a recommended setting.
* If the function cannot compress 'src' into a more limited 'dst' budget,
* compression stops *immediately*, and the function result is zero.
* In which case, 'dst' content is undefined (invalid).
* srcSize : max supported value is LZ4_MAX_INPUT_SIZE.
* dstCapacity : size of buffer 'dst' (which must be already allocated)
* @return : the number of bytes written into buffer 'dst' (necessarily <= dstCapacity)
* or 0 if compression fails
* Note : This function is protected against buffer overflow scenarios (never writes outside 'dst' buffer, nor read outside 'source' buffer).
*/
LZ4LIB_API int LZ4_compress_default(const char* src, char* dst, int srcSize, int dstCapacity);
/*! LZ4_decompress_safe() :
* compressedSize : is the exact complete size of the compressed block.
* dstCapacity : is the size of destination buffer (which must be already allocated), presumed an upper bound of decompressed size.
* @return : the number of bytes decompressed into destination buffer (necessarily <= dstCapacity)
* If destination buffer is not large enough, decoding will stop and output an error code (negative value).
* If the source stream is detected malformed, the function will stop decoding and return a negative result.
* Note 1 : This function is protected against malicious data packets :
* it will never writes outside 'dst' buffer, nor read outside 'source' buffer,
* even if the compressed block is maliciously modified to order the decoder to do these actions.
* In such case, the decoder stops immediately, and considers the compressed block malformed.
* Note 2 : compressedSize and dstCapacity must be provided to the function, the compressed block does not contain them.
* The implementation is free to send / store / derive this information in whichever way is most beneficial.
* If there is a need for a different format which bundles together both compressed data and its metadata, consider looking at lz4frame.h instead.
*/
LZ4LIB_API int LZ4_decompress_safe (const char* src, char* dst, int compressedSize, int dstCapacity);
/*-************************************
* Advanced Functions
**************************************/
#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */
#define LZ4_COMPRESSBOUND(isize) ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16)
/*! LZ4_compressBound() :
Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible)
This function is primarily useful for memory allocation purposes (destination buffer size).
Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example).
Note that LZ4_compress_default() compresses faster when dstCapacity is >= LZ4_compressBound(srcSize)
inputSize : max supported value is LZ4_MAX_INPUT_SIZE
return : maximum output size in a "worst case" scenario
or 0, if input size is incorrect (too large or negative)
*/
LZ4LIB_API int LZ4_compressBound(int inputSize);
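/* Round-trip sketch (illustrative note, not part of upstream lz4.h):
 * the simple functions above are used together as follows, assuming the
 * caller owns the buffers:
 *
 *   char src[] = "some data to compress";
 *   int bound = LZ4_compressBound(sizeof(src));
 *   char *dst = malloc(bound);
 *   int csize = LZ4_compress_default(src, dst, sizeof(src), bound);
 *   char out[sizeof(src)];
 *   int dsize = LZ4_decompress_safe(dst, out, csize, sizeof(out));
 *   // on success, dsize == sizeof(src) and out matches src
 */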
/*! LZ4_compress_fast() :
Same as LZ4_compress_default(), but allows selection of "acceleration" factor.
The larger the acceleration value, the faster the algorithm, but also the lesser the compression.
It's a trade-off. It can be fine tuned, with each successive value providing roughly +~3% to speed.
An acceleration value of "1" is the same as regular LZ4_compress_default()
Values <= 0 will be replaced by LZ4_ACCELERATION_DEFAULT (currently == 1, see lz4.c).
Values > LZ4_ACCELERATION_MAX will be replaced by LZ4_ACCELERATION_MAX (currently == 65537, see lz4.c).
*/
LZ4LIB_API int LZ4_compress_fast (const char* src, char* dst, int srcSize, int dstCapacity, int acceleration);
/*! LZ4_compress_fast_extState() :
* Same as LZ4_compress_fast(), using an externally allocated memory space for its state.
* Use LZ4_sizeofState() to know how much memory must be allocated,
* and allocate it on 8-bytes boundaries (using `malloc()` typically).
* Then, provide this buffer as `void* state` to compression function.
*/
LZ4LIB_API int LZ4_sizeofState(void);
LZ4LIB_API int LZ4_compress_fast_extState (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration);
/*! LZ4_compress_destSize() :
* Reverse the logic : compresses as much data as possible from 'src' buffer
* into already allocated buffer 'dst', of size >= 'targetDestSize'.
* This function either compresses the entire 'src' content into 'dst' if it's large enough,
* or fill 'dst' buffer completely with as much data as possible from 'src'.
* note: acceleration parameter is fixed to "default".
*
* *srcSizePtr : will be modified to indicate how many bytes where read from 'src' to fill 'dst'.
* New value is necessarily <= input value.
* @return : Nb bytes written into 'dst' (necessarily <= targetDestSize)
* or 0 if compression fails.
*
* Note : from v1.8.2 to v1.9.1, this function had a bug (fixed un v1.9.2+):
* the produced compressed content could, in specific circumstances,
* require to be decompressed into a destination buffer larger
* by at least 1 byte than the content to decompress.
* If an application uses `LZ4_compress_destSize()`,
* it's highly recommended to update liblz4 to v1.9.2 or better.
* If this can't be done or ensured,
* the receiving decompression function should provide
* a dstCapacity which is > decompressedSize, by at least 1 byte.
* See https://github.com/lz4/lz4/issues/859 for details
*/
LZ4LIB_API int LZ4_compress_destSize (const char* src, char* dst, int* srcSizePtr, int targetDstSize);
/*! LZ4_decompress_safe_partial() :
* Decompress an LZ4 compressed block, of size 'srcSize' at position 'src',
* into destination buffer 'dst' of size 'dstCapacity'.
* Up to 'targetOutputSize' bytes will be decoded.
* The function stops decoding on reaching this objective.
* This can be useful to boost performance
* whenever only the beginning of a block is required.
*
* @return : the number of bytes decoded in `dst` (necessarily <= targetOutputSize)
* If source stream is detected malformed, function returns a negative result.
*
* Note 1 : @return can be < targetOutputSize, if compressed block contains less data.
*
* Note 2 : targetOutputSize must be <= dstCapacity
*
* Note 3 : this function effectively stops decoding on reaching targetOutputSize,
* so dstCapacity is kind of redundant.
* This is because in older versions of this function,
* decoding operation would still write complete sequences.
* Therefore, there was no guarantee that it would stop writing at exactly targetOutputSize,
* it could write more bytes, though only up to dstCapacity.
* Some "margin" used to be required for this operation to work properly.
* Thankfully, this is no longer necessary.
* The function nonetheless keeps the same signature, in an effort to preserve API compatibility.
*
* Note 4 : If srcSize is the exact size of the block,
* then targetOutputSize can be any value,
* including larger than the block's decompressed size.
* The function will, at most, generate block's decompressed size.
*
* Note 5 : If srcSize is _larger_ than block's compressed size,
* then targetOutputSize **MUST** be <= block's decompressed size.
* Otherwise, *silent corruption will occur*.
*/
LZ4LIB_API int LZ4_decompress_safe_partial (const char* src, char* dst, int srcSize, int targetOutputSize, int dstCapacity);
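/* Usage sketch (illustrative only): decoding only the first bytes of a block,
* e.g. to inspect a record header. `src` and `srcSize` are assumed to describe
* one complete compressed block.
*
*   char head[256];
*   int  got = LZ4_decompress_safe_partial(src, head, srcSize, (int)sizeof(head), (int)sizeof(head));
*   if (got < 0) { ... malformed input ... }            // got may be < 256 if the block is smaller
*/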
/*-*********************************************
* Streaming Compression Functions
***********************************************/
typedef union LZ4_stream_u LZ4_stream_t; /* incomplete type (defined later) */
LZ4LIB_API LZ4_stream_t* LZ4_createStream(void);
LZ4LIB_API int LZ4_freeStream (LZ4_stream_t* streamPtr);
/*! LZ4_resetStream_fast() : v1.9.0+
* Use this to prepare an LZ4_stream_t for a new chain of dependent blocks
* (e.g., LZ4_compress_fast_continue()).
*
* An LZ4_stream_t must be initialized once before usage.
* This is automatically done when created by LZ4_createStream().
* However, should the LZ4_stream_t be simply declared on stack (for example),
* it's necessary to initialize it first, using LZ4_initStream().
*
* After init, start any new stream with LZ4_resetStream_fast().
* A same LZ4_stream_t can be re-used multiple times consecutively
* and compress multiple streams,
* provided that it starts each new stream with LZ4_resetStream_fast().
*
* LZ4_resetStream_fast() is much faster than LZ4_initStream(),
* but is not compatible with memory regions containing garbage data.
*
* Note: it's only useful to call LZ4_resetStream_fast()
* in the context of streaming compression.
* The *extState* functions perform their own resets.
* Invoking LZ4_resetStream_fast() before is redundant, and even counterproductive.
*/
LZ4LIB_API void LZ4_resetStream_fast (LZ4_stream_t* streamPtr);
/*! LZ4_loadDict() :
* Use this function to reference a static dictionary into LZ4_stream_t.
* The dictionary must remain available during compression.
* LZ4_loadDict() triggers a reset, so any previous data will be forgotten.
* The same dictionary will have to be loaded on decompression side for successful decoding.
* Dictionaries are useful for better compression of small data (KB range).
* While LZ4 accepts any input as a dictionary,
* results are generally better when using Zstandard's Dictionary Builder.
* Loading a size of 0 is allowed, and is the same as reset.
* @return : loaded dictionary size, in bytes (necessarily <= 64 KB)
*/
LZ4LIB_API int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize);
/*! LZ4_compress_fast_continue() :
* Compress 'src' content using data from previously compressed blocks, for better compression ratio.
* 'dst' buffer must be already allocated.
* If dstCapacity >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster.
*
* @return : size of compressed block
* or 0 if there is an error (typically, cannot fit into 'dst').
*
* Note 1 : Each invocation to LZ4_compress_fast_continue() generates a new block.
* Each block has precise boundaries.
* Each block must be decompressed separately, calling LZ4_decompress_*() with relevant metadata.
* It's not possible to append blocks together and expect a single invocation of LZ4_decompress_*() to decompress them together.
*
* Note 2 : The previous 64KB of source data is __assumed__ to remain present, unmodified, at same address in memory !
*
* Note 3 : When input is structured as a double-buffer, each buffer can have any size, including < 64 KB.
* Make sure that buffers are separated, by at least one byte.
* This construction ensures that each block only depends on previous block.
*
* Note 4 : If input buffer is a ring-buffer, it can have any size, including < 64 KB.
*
* Note 5 : After an error, the stream status is undefined (invalid), it can only be reset or freed.
*/
LZ4LIB_API int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration);
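/* Usage sketch (illustrative only): streaming compression with a double buffer.
* `readChunk()` and `writeBlock()` are hypothetical I/O helpers; each produced
* block must later be decoded separately, e.g. with LZ4_decompress_safe_continue().
*
*   LZ4_stream_t* s = LZ4_createStream();
*   static char inBuf[2][64 * 1024];
*   static char outBuf[LZ4_COMPRESSBOUND(64 * 1024)];
*   int idx = 0, n;
*   while ((n = readChunk(inBuf[idx], (int)sizeof(inBuf[idx]))) > 0) {
*       int cSize = LZ4_compress_fast_continue(s, inBuf[idx], outBuf, n, (int)sizeof(outBuf), 1);
*       if (cSize <= 0) break;                          // error: stream must be reset or freed
*       writeBlock(outBuf, cSize);
*       idx ^= 1;                                       // previous 64KB stays valid in the other buffer
*   }
*   LZ4_freeStream(s);
*/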
/*! LZ4_saveDict() :
* If last 64KB data cannot be guaranteed to remain available at its current memory location,
* save it into a safer place (char* safeBuffer).
* This is schematically equivalent to a memcpy() followed by LZ4_loadDict(),
* but is much faster, because LZ4_saveDict() doesn't need to rebuild tables.
* @return : saved dictionary size in bytes (necessarily <= maxDictSize), or 0 if error.
*/
LZ4LIB_API int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int maxDictSize);
/*-**********************************************
* Streaming Decompression Functions
* Bufferless synchronous API
************************************************/
typedef union LZ4_streamDecode_u LZ4_streamDecode_t; /* tracking context */
/*! LZ4_createStreamDecode() and LZ4_freeStreamDecode() :
* creation / destruction of streaming decompression tracking context.
* A tracking context can be re-used multiple times.
*/
LZ4LIB_API LZ4_streamDecode_t* LZ4_createStreamDecode(void);
LZ4LIB_API int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream);
/*! LZ4_setStreamDecode() :
* An LZ4_streamDecode_t context can be allocated once and re-used multiple times.
* Use this function to start decompression of a new stream of blocks.
* A dictionary can optionally be set. Use NULL or size 0 for a reset order.
* Dictionary is presumed stable : it must remain accessible and unmodified during next decompression.
* @return : 1 if OK, 0 if error
*/
LZ4LIB_API int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize);
/*! LZ4_decoderRingBufferSize() : v1.8.2+
* Note : in a ring buffer scenario (optional),
* blocks are presumed decompressed next to each other
* up to the moment there is not enough remaining space for next block (remainingSize < maxBlockSize),
* at which stage it resumes from beginning of ring buffer.
* When setting such a ring buffer for streaming decompression,
* this function provides the minimum size of that ring buffer
* required to be compatible with any source respecting the maxBlockSize condition.
* @return : minimum ring buffer size,
* or 0 if there is an error (invalid maxBlockSize).
*/
LZ4LIB_API int LZ4_decoderRingBufferSize(int maxBlockSize);
#define LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize) (65536 + 14 + (maxBlockSize)) /* for static allocation; maxBlockSize presumed valid */
/*! LZ4_decompress_*_continue() :
* These decoding functions allow decompression of consecutive blocks in "streaming" mode.
* A block is an unsplittable entity, it must be presented entirely to a decompression function.
* Decompression functions only accept one block at a time.
* The last 64KB of previously decoded data *must* remain available and unmodified at the memory position where they were decoded.
* If less than 64KB of data has been decoded, all the data must be present.
*
* Special : if decompression side sets a ring buffer, it must respect one of the following conditions :
* - Decompression buffer size is _at least_ LZ4_decoderRingBufferSize(maxBlockSize).
* maxBlockSize is the maximum size of any single block. It can have any value > 16 bytes.
* In which case, encoding and decoding buffers do not need to be synchronized.
* Actually, data can be produced by any source compliant with LZ4 format specification, and respecting maxBlockSize.
* - Synchronized mode :
* Decompression buffer size is _exactly_ the same as compression buffer size,
* and follows exactly same update rule (block boundaries at same positions),
* and decoding function is provided with exact decompressed size of each block (exception for last block of the stream),
* _then_ decoding & encoding ring buffer can have any size, including small ones ( < 64 KB).
* - Decompression buffer is larger than encoding buffer, by a minimum of maxBlockSize more bytes.
* In which case, encoding and decoding buffers do not need to be synchronized,
* and encoding ring buffer can have any size, including small ones ( < 64 KB).
*
* Whenever these conditions are not possible,
* save the last 64KB of decoded data into a safe buffer where it can't be modified during decompression,
* then indicate where this data is saved using LZ4_setStreamDecode(), before decompressing next block.
*/
LZ4LIB_API int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int srcSize, int dstCapacity);
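/* Usage sketch (illustrative only): streaming decompression matching the
* double-buffer scheme above. `readBlock()` and `consume()` are hypothetical
* helpers; `cmpBuf` is an assumed scratch buffer holding one compressed block.
*
*   LZ4_streamDecode_t* d = LZ4_createStreamDecode();
*   static char decBuf[2][64 * 1024];
*   int idx = 0, cSize;
*   while ((cSize = readBlock(cmpBuf, (int)sizeof(cmpBuf))) > 0) {
*       int dSize = LZ4_decompress_safe_continue(d, cmpBuf, decBuf[idx], cSize, (int)sizeof(decBuf[idx]));
*       if (dSize < 0) break;                           // malformed block
*       consume(decBuf[idx], dSize);
*       idx ^= 1;                                       // previous 64KB remains available and unmodified
*   }
*   LZ4_freeStreamDecode(d);
*/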
/*! LZ4_decompress_*_usingDict() :
* These decoding functions work the same as
* a combination of LZ4_setStreamDecode() followed by LZ4_decompress_*_continue()
* They are stand-alone, and don't need an LZ4_streamDecode_t structure.
* Dictionary is presumed stable : it must remain accessible and unmodified during decompression.
* Performance tip : Decompression speed can be substantially increased
* when dst == dictStart + dictSize.
*/
LZ4LIB_API int LZ4_decompress_safe_usingDict (const char* src, char* dst, int srcSize, int dstCapacity, const char* dictStart, int dictSize);
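/* Usage sketch (illustrative only): one-shot decompression with an external
* dictionary. `dict`/`dictSize` must be the same data that was loaded at
* compression time; all buffers are assumed caller-provided.
*
*   int dSize = LZ4_decompress_safe_usingDict(src, dst, srcSize, dstCapacity, dict, dictSize);
*   if (dSize < 0) { ... malformed input or wrong dictionary ... }
*/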
#endif /* LZ4_H_2983827168210 */
/*^*************************************
* !!!!!! STATIC LINKING ONLY !!!!!!
***************************************/
/*-****************************************************************************
* Experimental section
*
* Symbols declared in this section must be considered unstable. Their
* signatures or semantics may change, or they may be removed altogether in the
* future. They are therefore only safe to depend on when the caller is
* statically linked against the library.
*
* To protect against unsafe usage, not only are the declarations guarded,
* the definitions are hidden by default
* when building LZ4 as a shared/dynamic library.
*
* In order to access these declarations,
* define LZ4_STATIC_LINKING_ONLY in your application
* before including LZ4's headers.
*
* In order to make their implementations accessible dynamically, you must
* define LZ4_PUBLISH_STATIC_FUNCTIONS when building the LZ4 library.
******************************************************************************/
#ifdef LZ4_STATIC_LINKING_ONLY
#ifndef LZ4_STATIC_3504398509
#define LZ4_STATIC_3504398509
#ifdef LZ4_PUBLISH_STATIC_FUNCTIONS
#define LZ4LIB_STATIC_API LZ4LIB_API
#else
#define LZ4LIB_STATIC_API
#endif
/*! LZ4_compress_fast_extState_fastReset() :
* A variant of LZ4_compress_fast_extState().
*
* Using this variant avoids an expensive initialization step.
* It is only safe to call if the state buffer is known to be correctly initialized already
* (see above comment on LZ4_resetStream_fast() for a definition of "correctly initialized").
* From a high level, the difference is that
* this function initializes the provided state with a call to something like LZ4_resetStream_fast()
* while LZ4_compress_fast_extState() starts with a call to LZ4_resetStream().
*/
LZ4LIB_STATIC_API int LZ4_compress_fast_extState_fastReset (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration);
/*! LZ4_attach_dictionary() :
* This is an experimental API that allows
* efficient use of a static dictionary many times.
*
* Rather than re-loading the dictionary buffer into a working context before
* each compression, or copying a pre-loaded dictionary's LZ4_stream_t into a
* working LZ4_stream_t, this function introduces a no-copy setup mechanism,
* in which the working stream references the dictionary stream in-place.
*
* Several assumptions are made about the state of the dictionary stream.
* Currently, only streams which have been prepared by LZ4_loadDict() should
* be expected to work.
*
* Alternatively, the provided dictionaryStream may be NULL,
* in which case any existing dictionary stream is unset.
*
* If a dictionary is provided, it replaces any pre-existing stream history.
* The dictionary contents are the only history that can be referenced and
* logically immediately precede the data compressed in the first subsequent
* compression call.
*
* The dictionary will only remain attached to the working stream through the
* first compression call, at the end of which it is cleared. The dictionary
* stream (and source buffer) must remain in-place / accessible / unchanged
* through the completion of the first compression call on the stream.
*/
LZ4LIB_STATIC_API void LZ4_attach_dictionary(LZ4_stream_t* workingStream, const LZ4_stream_t* dictionaryStream);
/*! In-place compression and decompression
*
* It's possible to have input and output sharing the same buffer,
* for highly constrained memory environments.
* In both cases, it requires the input to lie at the end of the buffer,
* and decompression to start at the beginning of the buffer.
* Buffer size must feature some margin, hence be larger than final size.
*
* |<------------------------buffer--------------------------------->|
* |<-----------compressed data--------->|
* |<-----------decompressed size------------------>|
* |<----margin---->|
*
* This technique is more useful for decompression,
* since decompressed size is typically larger,
* and margin is short.
*
* In-place decompression will work inside any buffer
* which size is >= LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize).
* This presumes that decompressedSize > compressedSize.
* Otherwise, it means compression actually expanded data,
* and it would be more efficient to store such data with a flag indicating it's not compressed.
* This can happen when data is not compressible (already compressed, or encrypted).
*
* For in-place compression, margin is larger, as it must be able to cope with both
* history preservation, requiring input data to remain unmodified up to LZ4_DISTANCE_MAX,
* and data expansion, which can happen when input is not compressible.
* As a consequence, buffer size requirements are much higher,
* and memory savings offered by in-place compression are more limited.
*
* There are ways to limit this cost for compression :
* - Reduce history size, by modifying LZ4_DISTANCE_MAX.
* Note that it is a compile-time constant, so all compressions will apply this limit.
* Lower values will reduce compression ratio, except when input_size < LZ4_DISTANCE_MAX,
* so it's a reasonable trick when inputs are known to be small.
* - Require the compressor to deliver a "maximum compressed size".
* This is the `dstCapacity` parameter in `LZ4_compress*()`.
* When this size is < LZ4_COMPRESSBOUND(inputSize), then compression can fail,
* in which case, the return code will be 0 (zero).
* The caller must be ready for these cases to happen,
* and typically design a backup scheme to send data uncompressed.
* The combination of both techniques can significantly reduce
* the amount of margin required for in-place compression.
*
* In-place compression can work in any buffer
* which size is >= (maxCompressedSize)
* with maxCompressedSize == LZ4_COMPRESSBOUND(srcSize) for guaranteed compression success.
* LZ4_COMPRESS_INPLACE_BUFFER_SIZE() depends on both maxCompressedSize and LZ4_DISTANCE_MAX,
* so it's possible to reduce memory requirements by playing with them.
*/
#define LZ4_DECOMPRESS_INPLACE_MARGIN(compressedSize) (((compressedSize) >> 8) + 32)
#define LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize) ((decompressedSize) + LZ4_DECOMPRESS_INPLACE_MARGIN(decompressedSize)) /**< note: presumes that compressedSize < decompressedSize. note2: margin is overestimated a bit, since it could use compressedSize instead */
#ifndef LZ4_DISTANCE_MAX /* history window size; can be user-defined at compile time */
# define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */
#endif
#define LZ4_COMPRESS_INPLACE_MARGIN (LZ4_DISTANCE_MAX + 32) /* LZ4_DISTANCE_MAX can be safely replaced by srcSize when it's smaller */
#define LZ4_COMPRESS_INPLACE_BUFFER_SIZE(maxCompressedSize) ((maxCompressedSize) + LZ4_COMPRESS_INPLACE_MARGIN) /**< maxCompressedSize is generally LZ4_COMPRESSBOUND(inputSize), but can be set to any lower value, with the risk that compression can fail (return code 0(zero)) */
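/* Usage sketch (illustrative only): in-place decompression, with the compressed
* payload loaded at the end of an oversized buffer. `compressedSize` and
* `decompressedSize` are assumed known (e.g. stored alongside the payload).
*
*   size_t bufSize = LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize);
*   char*  buf     = (char*)malloc(bufSize);
*   char*  cSrc    = buf + bufSize - compressedSize;    // place payload at the end
*   // ... load compressedSize bytes at cSrc ...
*   int dSize = LZ4_decompress_safe(cSrc, buf, (int)compressedSize, (int)decompressedSize);
*   free(buf);
*/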
#endif /* LZ4_STATIC_3504398509 */
#endif /* LZ4_STATIC_LINKING_ONLY */
#ifndef LZ4_H_98237428734687
#define LZ4_H_98237428734687
/*-************************************************************
* Private Definitions
**************************************************************
* Do not use these definitions directly.
* They are only exposed to allow static allocation of `LZ4_stream_t` and `LZ4_streamDecode_t`.
* Accessing members will expose user code to API and/or ABI break in future versions of the library.
**************************************************************/
#define LZ4_HASHLOG (LZ4_MEMORY_USAGE-2)
#define LZ4_HASHTABLESIZE (1 << LZ4_MEMORY_USAGE)
#define LZ4_HASH_SIZE_U32 (1 << LZ4_HASHLOG) /* required as macro for static allocation */
#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
# include <stdint.h>
typedef int8_t LZ4_i8;
typedef uint8_t LZ4_byte;
typedef uint16_t LZ4_u16;
typedef uint32_t LZ4_u32;
#else
typedef signed char LZ4_i8;
typedef unsigned char LZ4_byte;
typedef unsigned short LZ4_u16;
typedef unsigned int LZ4_u32;
#endif
typedef struct LZ4_stream_t_internal LZ4_stream_t_internal;
struct LZ4_stream_t_internal {
LZ4_u32 hashTable[LZ4_HASH_SIZE_U32];
LZ4_u32 currentOffset;
LZ4_u32 tableType;
const LZ4_byte* dictionary;
const LZ4_stream_t_internal* dictCtx;
LZ4_u32 dictSize;
};
typedef struct {
const LZ4_byte* externalDict;
size_t extDictSize;
const LZ4_byte* prefixEnd;
size_t prefixSize;
} LZ4_streamDecode_t_internal;
/*! LZ4_stream_t :
* Do not use below internal definitions directly !
* Declare or allocate an LZ4_stream_t instead.
* LZ4_stream_t can also be created using LZ4_createStream(), which is recommended.
* The structure definition can be convenient for static allocation
* (on stack, or as part of larger structure).
* Init this structure with LZ4_initStream() before first use.
* note : only use this definition in association with static linking !
* this definition is not API/ABI safe, and may change in future versions.
*/
#define LZ4_STREAMSIZE 16416 /* static size, for inter-version compatibility */
#define LZ4_STREAMSIZE_VOIDP (LZ4_STREAMSIZE / sizeof(void*))
union LZ4_stream_u {
void* table[LZ4_STREAMSIZE_VOIDP];
LZ4_stream_t_internal internal_donotuse;
}; /* previously typedef'd to LZ4_stream_t */
/*! LZ4_initStream() : v1.9.0+
* An LZ4_stream_t structure must be initialized at least once.
* This is automatically done when invoking LZ4_createStream(),
* but it's not when the structure is simply declared on stack (for example).
*
* Use LZ4_initStream() to properly initialize a newly declared LZ4_stream_t.
* It can also initialize any arbitrary buffer of sufficient size,
* and will @return a pointer of proper type upon initialization.
*
* Note : initialization fails if size and alignment conditions are not respected.
* In which case, the function will @return NULL.
* Note2: An LZ4_stream_t structure guarantees correct alignment and size.
* Note3: Before v1.9.0, use LZ4_resetStream() instead
*/
LZ4LIB_API LZ4_stream_t* LZ4_initStream (void* buffer, size_t size);
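/* Usage sketch (illustrative only): a stack-declared LZ4_stream_t, initialized
* once with LZ4_initStream(); no LZ4_freeStream() is needed afterwards.
*
*   LZ4_stream_t  body;
*   LZ4_stream_t* s = LZ4_initStream(&body, sizeof(body));
*   if (s != NULL) { ... use s with LZ4_compress_fast_continue() ... }
*/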
/*! LZ4_streamDecode_t :
* information structure to track an LZ4 stream during decompression.
* init this structure using LZ4_setStreamDecode() before first use.
* note : only use in association with static linking !
* this definition is not API/ABI safe,
* and may change in a future version !
*/
#define LZ4_STREAMDECODESIZE_U64 (4 + ((sizeof(void*)==16) ? 2 : 0) /*AS-400*/ )
#define LZ4_STREAMDECODESIZE (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long))
union LZ4_streamDecode_u {
unsigned long long table[LZ4_STREAMDECODESIZE_U64];
LZ4_streamDecode_t_internal internal_donotuse;
} ; /* previously typedef'd to LZ4_streamDecode_t */
/*-************************************
* Obsolete Functions
**************************************/
/*! Deprecation warnings
*
* Deprecated functions make the compiler generate a warning when invoked.
* This is meant to invite users to update their source code.
* Should deprecation warnings be a problem, it is generally possible to disable them,
* typically with -Wno-deprecated-declarations for gcc
* or _CRT_SECURE_NO_WARNINGS in Visual.
*
* Another method is to define LZ4_DISABLE_DEPRECATE_WARNINGS
* before including the header file.
*/
#ifdef LZ4_DISABLE_DEPRECATE_WARNINGS
# define LZ4_DEPRECATED(message) /* disable deprecation warnings */
#else
# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
# define LZ4_DEPRECATED(message) [[deprecated(message)]]
# elif defined(_MSC_VER)
# define LZ4_DEPRECATED(message) __declspec(deprecated(message))
# elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 45))
# define LZ4_DEPRECATED(message) __attribute__((deprecated(message)))
# elif defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 31)
# define LZ4_DEPRECATED(message) __attribute__((deprecated))
# else
# pragma message("WARNING: LZ4_DEPRECATED needs custom implementation for this compiler")
# define LZ4_DEPRECATED(message) /* disabled */
# endif
#endif /* LZ4_DISABLE_DEPRECATE_WARNINGS */
/*! Obsolete compression functions (since v1.7.3) */
LZ4_DEPRECATED("use LZ4_compress_default() instead") LZ4LIB_API int LZ4_compress (const char* src, char* dest, int srcSize);
LZ4_DEPRECATED("use LZ4_compress_default() instead") LZ4LIB_API int LZ4_compress_limitedOutput (const char* src, char* dest, int srcSize, int maxOutputSize);
LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize);
LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize);
LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize);
LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize);
/*! Obsolete decompression functions (since v1.8.0) */
LZ4_DEPRECATED("use LZ4_decompress_fast() instead") LZ4LIB_API int LZ4_uncompress (const char* source, char* dest, int outputSize);
LZ4_DEPRECATED("use LZ4_decompress_safe() instead") LZ4LIB_API int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize);
/* Obsolete streaming functions (since v1.7.0)
* degraded functionality; do not use!
*
* In order to perform streaming compression, these functions depended on data
* that is no longer tracked in the state. They have been preserved as well as
* possible: using them will still produce a correct output. However, they don't
* actually retain any history between compression calls. The compression ratio
* achieved will therefore be no better than compressing each chunk
* independently.
*/
LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API void* LZ4_create (char* inputBuffer);
LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API int LZ4_sizeofStreamState(void);
LZ4_DEPRECATED("Use LZ4_resetStream() instead") LZ4LIB_API int LZ4_resetStreamState(void* state, char* inputBuffer);
LZ4_DEPRECATED("Use LZ4_saveDict() instead") LZ4LIB_API char* LZ4_slideInputBuffer (void* state);
/*! Obsolete streaming decoding functions (since v1.7.0) */
LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") LZ4LIB_API int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize);
LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") LZ4LIB_API int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize);
/*! Obsolete LZ4_decompress_fast variants (since v1.9.0) :
* These functions used to be faster than LZ4_decompress_safe(),
* but this is no longer the case. They are now slower.
* This is because LZ4_decompress_fast() doesn't know the input size,
* and therefore must progress more cautiously into the input buffer to not read beyond the end of block.
* On top of that `LZ4_decompress_fast()` is not protected vs malformed or malicious inputs, making it a security liability.
* As a consequence, LZ4_decompress_fast() is strongly discouraged, and deprecated.
*
* The last remaining LZ4_decompress_fast() specificity is that
* it can decompress a block without knowing its compressed size.
* Such functionality can be achieved in a more secure manner
* by employing LZ4_decompress_safe_partial().
*
* Parameters:
* originalSize : is the uncompressed size to regenerate.
* `dst` must be already allocated, its size must be >= 'originalSize' bytes.
* @return : number of bytes read from source buffer (== compressed size).
* The function expects to finish at block's end exactly.
* If the source stream is detected malformed, the function stops decoding and returns a negative result.
* note : LZ4_decompress_fast*() requires originalSize. Thanks to this information, it never writes past the output buffer.
* However, since it doesn't know its 'src' size, it may read an unknown amount of input, past input buffer bounds.
* Also, since match offsets are not validated, match reads from 'src' may underflow too.
* These issues never happen if input (compressed) data is correct.
* But they may happen if input data is invalid (error or intentional tampering).
* As a consequence, use these functions in trusted environments with trusted data **only**.
*/
LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe() instead")
LZ4LIB_API int LZ4_decompress_fast (const char* src, char* dst, int originalSize);
LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_continue() instead")
LZ4LIB_API int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int originalSize);
LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_usingDict() instead")
LZ4LIB_API int LZ4_decompress_fast_usingDict (const char* src, char* dst, int originalSize, const char* dictStart, int dictSize);
/*! LZ4_resetStream() :
* An LZ4_stream_t structure must be initialized at least once.
* This is done with LZ4_initStream(), or LZ4_resetStream().
* Consider switching to LZ4_initStream(),
* invoking LZ4_resetStream() will trigger deprecation warnings in the future.
*/
LZ4LIB_API void LZ4_resetStream (LZ4_stream_t* streamPtr);
#endif /* LZ4_H_98237428734687 */
#if defined (__cplusplus)
}
#endif

View File

@@ -0,0 +1,623 @@
/*
LZ4 auto-framing library
Header File
Copyright (C) 2011-2017, Yann Collet.
BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
You can contact the author at :
- LZ4 source repository : https://github.com/lz4/lz4
- LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
*/
/* LZ4F is a stand-alone API able to create and decode LZ4 frames
* conformant with specification v1.6.1 in doc/lz4_Frame_format.md .
* Generated frames are compatible with `lz4` CLI.
*
* LZ4F also offers streaming capabilities.
*
* lz4.h is not required when using lz4frame.h,
* except to extract common constants such as LZ4_VERSION_NUMBER.
* */
#ifndef LZ4F_H_09782039843
#define LZ4F_H_09782039843
#if defined (__cplusplus)
extern "C" {
#endif
/* --- Dependency --- */
#include <stddef.h> /* size_t */
/**
Introduction
lz4frame.h implements LZ4 frame specification (doc/lz4_Frame_format.md).
lz4frame.h provides frame compression functions that take care
of encoding standard metadata alongside LZ4-compressed blocks.
*/
/*-***************************************************************
* Compiler specifics
*****************************************************************/
/* LZ4_DLL_EXPORT :
* Enable exporting of functions when building a Windows DLL
* LZ4FLIB_VISIBILITY :
* Control library symbols visibility.
*/
#ifndef LZ4FLIB_VISIBILITY
# if defined(__GNUC__) && (__GNUC__ >= 4)
# define LZ4FLIB_VISIBILITY __attribute__ ((visibility ("default")))
# else
# define LZ4FLIB_VISIBILITY
# endif
#endif
#if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT==1)
# define LZ4FLIB_API __declspec(dllexport) LZ4FLIB_VISIBILITY
#elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT==1)
# define LZ4FLIB_API __declspec(dllimport) LZ4FLIB_VISIBILITY
#else
# define LZ4FLIB_API LZ4FLIB_VISIBILITY
#endif
#ifdef LZ4F_DISABLE_DEPRECATE_WARNINGS
# define LZ4F_DEPRECATE(x) x
#else
# if defined(_MSC_VER)
# define LZ4F_DEPRECATE(x) x /* __declspec(deprecated) x - only works with C++ */
# elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 6))
# define LZ4F_DEPRECATE(x) x __attribute__((deprecated))
# else
# define LZ4F_DEPRECATE(x) x /* no deprecation warning for this compiler */
# endif
#endif
/*-************************************
* Error management
**************************************/
typedef size_t LZ4F_errorCode_t;
LZ4FLIB_API unsigned LZ4F_isError(LZ4F_errorCode_t code); /**< tells when a function result is an error code */
LZ4FLIB_API const char* LZ4F_getErrorName(LZ4F_errorCode_t code); /**< return error code string; for debugging */
/*-************************************
* Frame compression types
************************************* */
/* #define LZ4F_ENABLE_OBSOLETE_ENUMS // uncomment to enable obsolete enums */
#ifdef LZ4F_ENABLE_OBSOLETE_ENUMS
# define LZ4F_OBSOLETE_ENUM(x) , LZ4F_DEPRECATE(x) = LZ4F_##x
#else
# define LZ4F_OBSOLETE_ENUM(x)
#endif
/* The larger the block size, the (slightly) better the compression ratio,
* though there are diminishing returns.
* Larger blocks also increase memory usage on both compression and decompression sides.
*/
typedef enum {
LZ4F_default=0,
LZ4F_max64KB=4,
LZ4F_max256KB=5,
LZ4F_max1MB=6,
LZ4F_max4MB=7
LZ4F_OBSOLETE_ENUM(max64KB)
LZ4F_OBSOLETE_ENUM(max256KB)
LZ4F_OBSOLETE_ENUM(max1MB)
LZ4F_OBSOLETE_ENUM(max4MB)
} LZ4F_blockSizeID_t;
/* Linked blocks sharply reduce inefficiencies when using small blocks,
* they compress better.
* However, some LZ4 decoders are only compatible with independent blocks */
typedef enum {
LZ4F_blockLinked=0,
LZ4F_blockIndependent
LZ4F_OBSOLETE_ENUM(blockLinked)
LZ4F_OBSOLETE_ENUM(blockIndependent)
} LZ4F_blockMode_t;
typedef enum {
LZ4F_noContentChecksum=0,
LZ4F_contentChecksumEnabled
LZ4F_OBSOLETE_ENUM(noContentChecksum)
LZ4F_OBSOLETE_ENUM(contentChecksumEnabled)
} LZ4F_contentChecksum_t;
typedef enum {
LZ4F_noBlockChecksum=0,
LZ4F_blockChecksumEnabled
} LZ4F_blockChecksum_t;
typedef enum {
LZ4F_frame=0,
LZ4F_skippableFrame
LZ4F_OBSOLETE_ENUM(skippableFrame)
} LZ4F_frameType_t;
#ifdef LZ4F_ENABLE_OBSOLETE_ENUMS
typedef LZ4F_blockSizeID_t blockSizeID_t;
typedef LZ4F_blockMode_t blockMode_t;
typedef LZ4F_frameType_t frameType_t;
typedef LZ4F_contentChecksum_t contentChecksum_t;
#endif
/*! LZ4F_frameInfo_t :
* makes it possible to set or read frame parameters.
* Structure must first be initialized to 0, using memset() or LZ4F_INIT_FRAMEINFO,
* setting all parameters to default.
* It's then possible to update selectively some parameters */
typedef struct {
LZ4F_blockSizeID_t blockSizeID; /* max64KB, max256KB, max1MB, max4MB; 0 == default */
LZ4F_blockMode_t blockMode; /* LZ4F_blockLinked, LZ4F_blockIndependent; 0 == default */
LZ4F_contentChecksum_t contentChecksumFlag; /* 1: frame terminated with 32-bit checksum of decompressed data; 0: disabled (default) */
LZ4F_frameType_t frameType; /* read-only field : LZ4F_frame or LZ4F_skippableFrame */
unsigned long long contentSize; /* Size of uncompressed content ; 0 == unknown */
unsigned dictID; /* Dictionary ID, sent by compressor to help decoder select correct dictionary; 0 == no dictID provided */
LZ4F_blockChecksum_t blockChecksumFlag; /* 1: each block followed by a checksum of block's compressed data; 0: disabled (default) */
} LZ4F_frameInfo_t;
#define LZ4F_INIT_FRAMEINFO { LZ4F_default, LZ4F_blockLinked, LZ4F_noContentChecksum, LZ4F_frame, 0ULL, 0U, LZ4F_noBlockChecksum } /* v1.8.3+ */
/*! LZ4F_preferences_t :
* makes it possible to supply advanced compression instructions to streaming interface.
* Structure must first be initialized to 0, using memset() or LZ4F_INIT_PREFERENCES,
* setting all parameters to default.
* All reserved fields must be set to zero. */
typedef struct {
LZ4F_frameInfo_t frameInfo;
int compressionLevel; /* 0: default (fast mode); values > LZ4HC_CLEVEL_MAX count as LZ4HC_CLEVEL_MAX; values < 0 trigger "fast acceleration" */
unsigned autoFlush; /* 1: always flush; reduces usage of internal buffers */
unsigned favorDecSpeed; /* 1: parser favors decompression speed vs compression ratio. Only works for high compression modes (>= LZ4HC_CLEVEL_OPT_MIN) */ /* v1.8.2+ */
unsigned reserved[3]; /* must be zero for forward compatibility */
} LZ4F_preferences_t;
#define LZ4F_INIT_PREFERENCES { LZ4F_INIT_FRAMEINFO, 0, 0u, 0u, { 0u, 0u, 0u } } /* v1.8.3+ */
/*-*********************************
* Simple compression function
***********************************/
LZ4FLIB_API int LZ4F_compressionLevel_max(void); /* v1.8.0+ */
/*! LZ4F_compressFrameBound() :
* Returns the maximum possible compressed size with LZ4F_compressFrame() given srcSize and preferences.
* `preferencesPtr` is optional. It can be replaced by NULL, in which case, the function will assume default preferences.
* Note : this result is only usable with LZ4F_compressFrame().
* It may also be used with LZ4F_compressUpdate() _if no flush() operation_ is performed.
*/
LZ4FLIB_API size_t LZ4F_compressFrameBound(size_t srcSize, const LZ4F_preferences_t* preferencesPtr);
/*! LZ4F_compressFrame() :
* Compress an entire srcBuffer into a valid LZ4 frame.
* dstCapacity MUST be >= LZ4F_compressFrameBound(srcSize, preferencesPtr).
* The LZ4F_preferences_t structure is optional : you can provide NULL as argument. All preferences will be set to default.
* @return : number of bytes written into dstBuffer.
* or an error code if it fails (can be tested using LZ4F_isError())
*/
LZ4FLIB_API size_t LZ4F_compressFrame(void* dstBuffer, size_t dstCapacity,
const void* srcBuffer, size_t srcSize,
const LZ4F_preferences_t* preferencesPtr);
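/* Usage sketch (illustrative only, not part of the upstream lz4frame.h API):
* one-shot frame compression with default preferences. `src` and `srcSize` are
* assumed caller-provided.
*
*   size_t bound = LZ4F_compressFrameBound(srcSize, NULL);
*   void*  dst   = malloc(bound);                       // allocation check elided
*   size_t cSize = LZ4F_compressFrame(dst, bound, src, srcSize, NULL);
*   if (LZ4F_isError(cSize)) { ... LZ4F_getErrorName(cSize) describes the failure ... }
*   free(dst);
*/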
/*-***********************************
* Advanced compression functions
*************************************/
typedef struct LZ4F_cctx_s LZ4F_cctx; /* incomplete type */
typedef LZ4F_cctx* LZ4F_compressionContext_t; /* for compatibility with previous API version */
typedef struct {
unsigned stableSrc; /* 1 == src content will remain present on future calls to LZ4F_compress(); skip copying src content within tmp buffer */
unsigned reserved[3];
} LZ4F_compressOptions_t;
/*--- Resource Management ---*/
#define LZ4F_VERSION 100 /* This number can be used to check for an incompatible API breaking change */
LZ4FLIB_API unsigned LZ4F_getVersion(void);
/*! LZ4F_createCompressionContext() :
* The first thing to do is to create a compressionContext object, which will be used in all compression operations.
* This is achieved using LZ4F_createCompressionContext(), which takes as argument a version.
* The version provided MUST be LZ4F_VERSION. It is intended to track potential version mismatch, notably when using DLL.
* The function will provide a pointer to a fully allocated LZ4F_cctx object.
* If @return != zero, there was an error during context creation.
* Object can release its memory using LZ4F_freeCompressionContext();
*/
LZ4FLIB_API LZ4F_errorCode_t LZ4F_createCompressionContext(LZ4F_cctx** cctxPtr, unsigned version);
LZ4FLIB_API LZ4F_errorCode_t LZ4F_freeCompressionContext(LZ4F_cctx* cctx);
/*---- Compression ----*/
#define LZ4F_HEADER_SIZE_MIN 7 /* LZ4 Frame header size can vary, depending on selected parameters */
#define LZ4F_HEADER_SIZE_MAX 19
/* Size in bytes of a block header in little-endian format. Highest bit indicates if block data is uncompressed */
#define LZ4F_BLOCK_HEADER_SIZE 4
/* Size in bytes of a block checksum footer in little-endian format. */
#define LZ4F_BLOCK_CHECKSUM_SIZE 4
/* Size in bytes of the content checksum. */
#define LZ4F_CONTENT_CHECKSUM_SIZE 4
/*! LZ4F_compressBegin() :
* will write the frame header into dstBuffer.
* dstCapacity must be >= LZ4F_HEADER_SIZE_MAX bytes.
* `prefsPtr` is optional : you can provide NULL as argument, all preferences will then be set to default.
* @return : number of bytes written into dstBuffer for the header
* or an error code (which can be tested using LZ4F_isError())
*/
LZ4FLIB_API size_t LZ4F_compressBegin(LZ4F_cctx* cctx,
void* dstBuffer, size_t dstCapacity,
const LZ4F_preferences_t* prefsPtr);
/*! LZ4F_compressBound() :
* Provides minimum dstCapacity required to guarantee success of
* LZ4F_compressUpdate(), given a srcSize and preferences, for a worst case scenario.
* When srcSize==0, LZ4F_compressBound() provides an upper bound for LZ4F_flush() and LZ4F_compressEnd() instead.
* Note that the result is only valid for a single invocation of LZ4F_compressUpdate().
* When invoking LZ4F_compressUpdate() multiple times,
* if the output buffer is gradually filled up instead of emptied and re-used from its start,
* one must check if there is enough remaining capacity before each invocation, using LZ4F_compressBound().
* @return is always the same for a srcSize and prefsPtr.
* prefsPtr is optional : when NULL is provided, preferences will be set to cover worst case scenario.
* tech details :
* @return if automatic flushing is not enabled, includes the possibility that internal buffer might already be filled by up to (blockSize-1) bytes.
* It also includes frame footer (ending + checksum), since it might be generated by LZ4F_compressEnd().
* @return doesn't include frame header, as it was already generated by LZ4F_compressBegin().
*/
LZ4FLIB_API size_t LZ4F_compressBound(size_t srcSize, const LZ4F_preferences_t* prefsPtr);
/*! LZ4F_compressUpdate() :
* LZ4F_compressUpdate() can be called repetitively to compress as much data as necessary.
* Important rule: dstCapacity MUST be large enough to ensure operation success even in worst case situations.
* This value is provided by LZ4F_compressBound().
* If this condition is not respected, LZ4F_compress() will fail (result is an errorCode).
* LZ4F_compressUpdate() doesn't guarantee error recovery.
* When an error occurs, compression context must be freed or resized.
* `cOptPtr` is optional : NULL can be provided, in which case all options are set to default.
* @return : number of bytes written into `dstBuffer` (it can be zero, meaning input data was just buffered).
* or an error code if it fails (which can be tested using LZ4F_isError())
*/
LZ4FLIB_API size_t LZ4F_compressUpdate(LZ4F_cctx* cctx,
void* dstBuffer, size_t dstCapacity,
const void* srcBuffer, size_t srcSize,
const LZ4F_compressOptions_t* cOptPtr);
/*! LZ4F_flush() :
* When data must be generated and sent immediately, without waiting for a block to be completely filled,
* it's possible to call LZ4F_flush(). It will immediately compress any data buffered within cctx.
* `dstCapacity` must be large enough to ensure the operation will be successful.
* `cOptPtr` is optional : it's possible to provide NULL, all options will be set to default.
* @return : nb of bytes written into dstBuffer (can be zero, when there is no data stored within cctx)
* or an error code if it fails (which can be tested using LZ4F_isError())
* Note : LZ4F_flush() is guaranteed to be successful when dstCapacity >= LZ4F_compressBound(0, prefsPtr).
*/
LZ4FLIB_API size_t LZ4F_flush(LZ4F_cctx* cctx,
void* dstBuffer, size_t dstCapacity,
const LZ4F_compressOptions_t* cOptPtr);
/*! LZ4F_compressEnd() :
* To properly finish an LZ4 frame, invoke LZ4F_compressEnd().
* It will flush whatever data remained within `cctx` (like LZ4F_flush())
* and properly finalize the frame, with an endMark and a checksum.
* `cOptPtr` is optional : NULL can be provided, in which case all options will be set to default.
* @return : nb of bytes written into dstBuffer, necessarily >= 4 (endMark),
* or an error code if it fails (which can be tested using LZ4F_isError())
* Note : LZ4F_compressEnd() is guaranteed to be successful when dstCapacity >= LZ4F_compressBound(0, prefsPtr).
* A successful call to LZ4F_compressEnd() makes `cctx` available again for another compression task.
*/
LZ4FLIB_API size_t LZ4F_compressEnd(LZ4F_cctx* cctx,
void* dstBuffer, size_t dstCapacity,
const LZ4F_compressOptions_t* cOptPtr);
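/* Usage sketch (illustrative only): streaming frame compression. `readChunk()`
* and `writeOut()` are hypothetical I/O helpers, `in` an assumed input buffer of
* CHUNK bytes; LZ4F_isError() checks are elided for brevity.
*
*   LZ4F_cctx* cctx; LZ4F_createCompressionContext(&cctx, LZ4F_VERSION);
*   size_t outCap = LZ4F_compressBound(CHUNK, NULL);    // for any reasonable CHUNK this also exceeds LZ4F_HEADER_SIZE_MAX
*   char*  out    = (char*)malloc(outCap);
*   size_t n;
*   writeOut(out, LZ4F_compressBegin(cctx, out, outCap, NULL));
*   while ((n = readChunk(in, CHUNK)) > 0)
*       writeOut(out, LZ4F_compressUpdate(cctx, out, outCap, in, n, NULL));
*   writeOut(out, LZ4F_compressEnd(cctx, out, outCap, NULL));
*   free(out); LZ4F_freeCompressionContext(cctx);
*/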
/*-*********************************
* Decompression functions
***********************************/
typedef struct LZ4F_dctx_s LZ4F_dctx; /* incomplete type */
typedef LZ4F_dctx* LZ4F_decompressionContext_t; /* compatibility with previous API versions */
typedef struct {
unsigned stableDst; /* pledges that last 64KB decompressed data will remain available unmodified. This optimization skips storage operations in tmp buffers. */
unsigned reserved[3]; /* must be set to zero for forward compatibility */
} LZ4F_decompressOptions_t;
/* Resource management */
/*! LZ4F_createDecompressionContext() :
* Create an LZ4F_dctx object, to track all decompression operations.
* The version provided MUST be LZ4F_VERSION.
* The function provides a pointer to an allocated and initialized LZ4F_dctx object.
* The result is an errorCode, which can be tested using LZ4F_isError().
* dctx memory can be released using LZ4F_freeDecompressionContext();
* Result of LZ4F_freeDecompressionContext() indicates current state of decompressionContext when being released.
* That is, it should be == 0 if decompression has been completed fully and correctly.
*/
LZ4FLIB_API LZ4F_errorCode_t LZ4F_createDecompressionContext(LZ4F_dctx** dctxPtr, unsigned version);
LZ4FLIB_API LZ4F_errorCode_t LZ4F_freeDecompressionContext(LZ4F_dctx* dctx);
/*-***********************************
* Streaming decompression functions
*************************************/
#define LZ4F_MIN_SIZE_TO_KNOW_HEADER_LENGTH 5
/*! LZ4F_headerSize() : v1.9.0+
* Provide the header size of a frame starting at `src`.
* `srcSize` must be >= LZ4F_MIN_SIZE_TO_KNOW_HEADER_LENGTH,
* which is enough to decode the header length.
* @return : size of frame header
* or an error code, which can be tested using LZ4F_isError()
* note : Frame header size is variable, but is guaranteed to be
* >= LZ4F_HEADER_SIZE_MIN bytes, and <= LZ4F_HEADER_SIZE_MAX bytes.
*/
LZ4FLIB_API size_t LZ4F_headerSize(const void* src, size_t srcSize);
/*! LZ4F_getFrameInfo() :
* This function extracts frame parameters (max blockSize, dictID, etc.).
* Its usage is optional: user can call LZ4F_decompress() directly.
*
* Extracted information will fill an existing LZ4F_frameInfo_t structure.
* This can be useful for allocation and dictionary identification purposes.
*
* LZ4F_getFrameInfo() can work in the following situations :
*
* 1) At the beginning of a new frame, before any invocation of LZ4F_decompress().
* It will decode header from `srcBuffer`,
* consuming the header and starting the decoding process.
*
* Input size must be large enough to contain the full frame header.
* Frame header size can be known beforehand by LZ4F_headerSize().
* Frame header size is variable, but is guaranteed to be >= LZ4F_HEADER_SIZE_MIN bytes,
* and <= LZ4F_HEADER_SIZE_MAX bytes.
* Hence, blindly providing LZ4F_HEADER_SIZE_MAX bytes or more will always work.
* It's allowed to provide more input data than the header size,
* LZ4F_getFrameInfo() will only consume the header.
*
* If input size is not large enough,
* aka if it's smaller than header size,
* function will fail and return an error code.
*
* 2) After decoding has been started,
* it's possible to invoke LZ4F_getFrameInfo() anytime
* to extract already decoded frame parameters stored within dctx.
*
* Note that, if decoding has barely started,
* and not yet read enough information to decode the header,
* LZ4F_getFrameInfo() will fail.
*
* The number of bytes consumed from srcBuffer will be updated in *srcSizePtr (necessarily <= original value).
* LZ4F_getFrameInfo() only consumes bytes when decoding has not yet started,
* and when decoding the header has been successful.
* Decompression must then resume from (srcBuffer + *srcSizePtr).
*
* @return : a hint about how many srcSize bytes LZ4F_decompress() expects for next call,
* or an error code which can be tested using LZ4F_isError().
* note 1 : in case of error, dctx is not modified. Decoding operation can resume from beginning safely.
* note 2 : frame parameters are *copied into* an already allocated LZ4F_frameInfo_t structure.
*/
LZ4FLIB_API size_t LZ4F_getFrameInfo(LZ4F_dctx* dctx,
LZ4F_frameInfo_t* frameInfoPtr,
const void* srcBuffer, size_t* srcSizePtr);
/*! LZ4F_decompress() :
* Call this function repetitively to regenerate data compressed in `srcBuffer`.
*
* The function requires a valid dctx state.
* It will read up to *srcSizePtr bytes from srcBuffer,
* and decompress data into dstBuffer, of capacity *dstSizePtr.
*
* The nb of bytes consumed from srcBuffer will be written into *srcSizePtr (necessarily <= original value).
* The nb of bytes decompressed into dstBuffer will be written into *dstSizePtr (necessarily <= original value).
*
* The function does not necessarily read all input bytes, so always check value in *srcSizePtr.
* Unconsumed source data must be presented again in subsequent invocations.
*
* `dstBuffer` can freely change between each consecutive function invocation.
* `dstBuffer` content will be overwritten.
*
* @return : a hint of how many `srcSize` bytes LZ4F_decompress() expects for next call.
* Schematically, it's the size of the current (or remaining) compressed block + header of next block.
* Respecting the hint provides some small speed benefit, because it skips intermediate buffers.
* This is just a hint though, it's always possible to provide any srcSize.
*
* When a frame is fully decoded, @return will be 0 (no more data expected).
* When provided with more bytes than necessary to decode a frame,
* LZ4F_decompress() will stop reading exactly at end of current frame, and @return 0.
*
* If decompression failed, @return is an error code, which can be tested using LZ4F_isError().
* After a decompression error, the `dctx` context is not resumable.
* Use LZ4F_resetDecompressionContext() to return to clean state.
*
* After a frame is fully decoded, dctx can be used again to decompress another frame.
*/
LZ4FLIB_API size_t LZ4F_decompress(LZ4F_dctx* dctx,
void* dstBuffer, size_t* dstSizePtr,
const void* srcBuffer, size_t* srcSizePtr,
const LZ4F_decompressOptions_t* dOptPtr);
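/* Usage sketch (illustrative only): streaming frame decompression. `readIn()`
* and `writeOut()` are hypothetical I/O helpers; `inBuf` and `outBuf` are
* assumed fixed-size scratch buffers.
*
*   LZ4F_dctx* dctx; LZ4F_createDecompressionContext(&dctx, LZ4F_VERSION);
*   size_t hint = 1;                                    // nonzero while the frame is unfinished
*   while (hint != 0) {
*       size_t inSize = readIn(inBuf, sizeof(inBuf));
*       if (inSize == 0) break;                         // EOF (frame truncated if hint != 0)
*       const char* p = inBuf;
*       while (inSize > 0 && hint != 0) {
*           size_t dstSize = sizeof(outBuf), srcSize = inSize;
*           hint = LZ4F_decompress(dctx, outBuf, &dstSize, p, &srcSize, NULL);
*           if (LZ4F_isError(hint)) { ... reset dctx before reusing it ... }
*           writeOut(outBuf, dstSize);
*           p += srcSize; inSize -= srcSize;
*       }
*   }
*   LZ4F_freeDecompressionContext(dctx);
*/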
/*! LZ4F_resetDecompressionContext() : added in v1.8.0
* In case of an error, the context is left in "undefined" state.
* In which case, it's necessary to reset it, before re-using it.
* This method can also be used to abruptly stop any unfinished decompression,
* and start a new one using same context resources. */
LZ4FLIB_API void LZ4F_resetDecompressionContext(LZ4F_dctx* dctx); /* always successful */
#if defined (__cplusplus)
}
#endif
#endif /* LZ4F_H_09782039843 */
#if defined(LZ4F_STATIC_LINKING_ONLY) && !defined(LZ4F_H_STATIC_09782039843)
#define LZ4F_H_STATIC_09782039843
#if defined (__cplusplus)
extern "C" {
#endif
/* These declarations are not stable and may change in the future.
* They are therefore only safe to depend on
* when the caller is statically linked against the library.
* To access their declarations, define LZ4F_STATIC_LINKING_ONLY.
*
* By default, these symbols aren't published into shared/dynamic libraries.
* You can override this behavior and force them to be published
* by defining LZ4F_PUBLISH_STATIC_FUNCTIONS.
* Use at your own risk.
*/
#ifdef LZ4F_PUBLISH_STATIC_FUNCTIONS
# define LZ4FLIB_STATIC_API LZ4FLIB_API
#else
# define LZ4FLIB_STATIC_API
#endif
/* --- Error List --- */
#define LZ4F_LIST_ERRORS(ITEM) \
ITEM(OK_NoError) \
ITEM(ERROR_GENERIC) \
ITEM(ERROR_maxBlockSize_invalid) \
ITEM(ERROR_blockMode_invalid) \
ITEM(ERROR_contentChecksumFlag_invalid) \
ITEM(ERROR_compressionLevel_invalid) \
ITEM(ERROR_headerVersion_wrong) \
ITEM(ERROR_blockChecksum_invalid) \
ITEM(ERROR_reservedFlag_set) \
ITEM(ERROR_allocation_failed) \
ITEM(ERROR_srcSize_tooLarge) \
ITEM(ERROR_dstMaxSize_tooSmall) \
ITEM(ERROR_frameHeader_incomplete) \
ITEM(ERROR_frameType_unknown) \
ITEM(ERROR_frameSize_wrong) \
ITEM(ERROR_srcPtr_wrong) \
ITEM(ERROR_decompressionFailed) \
ITEM(ERROR_headerChecksum_invalid) \
ITEM(ERROR_contentChecksum_invalid) \
ITEM(ERROR_frameDecoding_alreadyStarted) \
ITEM(ERROR_maxCode)
#define LZ4F_GENERATE_ENUM(ENUM) LZ4F_##ENUM,
/* enum list is exposed, to handle specific errors */
typedef enum { LZ4F_LIST_ERRORS(LZ4F_GENERATE_ENUM)
_LZ4F_dummy_error_enum_for_c89_never_used } LZ4F_errorCodes;
LZ4FLIB_STATIC_API LZ4F_errorCodes LZ4F_getErrorCode(size_t functionResult);
LZ4FLIB_STATIC_API size_t LZ4F_getBlockSize(unsigned);
/**********************************
* Bulk processing dictionary API
*********************************/
/* A Dictionary is useful for the compression of small messages (KB range).
* It dramatically improves compression efficiency.
*
* LZ4 can ingest any input as dictionary, though only the last 64 KB are useful.
* Best results are generally achieved by using Zstandard's Dictionary Builder
* to generate a high-quality dictionary from a set of samples.
*
* Loading a dictionary has a cost, since it involves construction of tables.
* The Bulk processing dictionary API makes it possible to share this cost
* over an arbitrary number of compression jobs, even concurrently,
* markedly improving compression latency for these cases.
*
* The same dictionary will have to be used on the decompression side
* for decoding to be successful.
* To help identify the correct dictionary at decoding stage,
* the frame header allows optional embedding of a dictID field.
*/
typedef struct LZ4F_CDict_s LZ4F_CDict;
/*! LZ4F_createCDict() :
* When compressing multiple messages / blocks using the same dictionary, it's recommended to load it just once.
* LZ4F_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay.
* An LZ4F_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
* `dictBuffer` can be released after LZ4F_CDict creation, since its content is copied within CDict */
LZ4FLIB_STATIC_API LZ4F_CDict* LZ4F_createCDict(const void* dictBuffer, size_t dictSize);
LZ4FLIB_STATIC_API void LZ4F_freeCDict(LZ4F_CDict* CDict);
/*! LZ4F_compressFrame_usingCDict() :
* Compress an entire srcBuffer into a valid LZ4 frame using a digested Dictionary.
* cctx must point to a context created by LZ4F_createCompressionContext().
* If cdict==NULL, compress without a dictionary.
* dstBuffer MUST be >= LZ4F_compressFrameBound(srcSize, preferencesPtr).
* If this condition is not respected, function will fail (@return an errorCode).
* The LZ4F_preferences_t structure is optional : you may provide NULL as argument,
* though it's not recommended, since the preferences structure is the only way to provide a dictID in the frame header.
* @return : number of bytes written into dstBuffer.
* or an error code if it fails (can be tested using LZ4F_isError()) */
LZ4FLIB_STATIC_API size_t LZ4F_compressFrame_usingCDict(
LZ4F_cctx* cctx,
void* dst, size_t dstCapacity,
const void* src, size_t srcSize,
const LZ4F_CDict* cdict,
const LZ4F_preferences_t* preferencesPtr);
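/* Usage sketch (illustrative only): sharing one digested dictionary across many
* small frames. `dictBuf`/`dictLen`, `msg`/`msgLen` and a `dst` buffer of at
* least `bound` bytes are assumed caller-provided; error checks are elided.
*
*   LZ4F_CDict* cdict = LZ4F_createCDict(dictBuf, dictLen);   // load once, reuse many times
*   LZ4F_cctx*  cctx;  LZ4F_createCompressionContext(&cctx, LZ4F_VERSION);
*   size_t bound = LZ4F_compressFrameBound(msgLen, NULL);
*   size_t cSize = LZ4F_compressFrame_usingCDict(cctx, dst, bound, msg, msgLen, cdict, NULL);
*   LZ4F_freeCompressionContext(cctx);
*   LZ4F_freeCDict(cdict);
*/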
/*! LZ4F_compressBegin_usingCDict() :
* Inits streaming dictionary compression, and writes the frame header into dstBuffer.
* dstCapacity must be >= LZ4F_HEADER_SIZE_MAX bytes.
* `prefsPtr` is optional : you may provide NULL as argument,
* however, it's the only way to provide dictID in the frame header.
* @return : number of bytes written into dstBuffer for the header,
* or an error code (which can be tested using LZ4F_isError()) */
LZ4FLIB_STATIC_API size_t LZ4F_compressBegin_usingCDict(
LZ4F_cctx* cctx,
void* dstBuffer, size_t dstCapacity,
const LZ4F_CDict* cdict,
const LZ4F_preferences_t* prefsPtr);
/*! LZ4F_decompress_usingDict() :
* Same as LZ4F_decompress(), using a predefined dictionary.
* Dictionary is used "in place", without any preprocessing.
* It must remain accessible throughout the entire frame decoding. */
LZ4FLIB_STATIC_API size_t LZ4F_decompress_usingDict(
LZ4F_dctx* dctxPtr,
void* dstBuffer, size_t* dstSizePtr,
const void* srcBuffer, size_t* srcSizePtr,
const void* dict, size_t dictSize,
const LZ4F_decompressOptions_t* decompressOptionsPtr);
#if defined (__cplusplus)
}
#endif
#endif /* defined(LZ4F_STATIC_LINKING_ONLY) && !defined(LZ4F_H_STATIC_09782039843) */

View File

@@ -0,0 +1,328 @@
/*
xxHash - Extremely Fast Hash algorithm
Header File
Copyright (C) 2012-2016, Yann Collet.
BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
You can contact the author at :
- xxHash source repository : https://github.com/Cyan4973/xxHash
*/
/* Notice extracted from xxHash homepage :
xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
It also successfully passes all tests from the SMHasher suite.
Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
Name Speed Q.Score Author
xxHash 5.4 GB/s 10
CrapWow 3.2 GB/s 2 Andrew
MurmurHash 3a 2.7 GB/s 10 Austin Appleby
SpookyHash 2.0 GB/s 10 Bob Jenkins
SBox 1.4 GB/s 9 Bret Mulvey
Lookup3 1.2 GB/s 9 Bob Jenkins
SuperFastHash 1.2 GB/s 1 Paul Hsieh
CityHash64 1.05 GB/s 10 Pike & Alakuijala
FNV 0.55 GB/s 5 Fowler, Noll, Vo
CRC32 0.43 GB/s 9
MD5-32 0.33 GB/s 10 Ronald L. Rivest
SHA1-32 0.28 GB/s 10
Q.Score is a measure of quality of the hash function.
It depends on successfully passing SMHasher test set.
10 is a perfect score.
A 64-bit version, named XXH64, is available since r35.
It offers much better speed, but for 64-bit applications only.
Name Speed on 64 bits Speed on 32 bits
XXH64 13.8 GB/s 1.9 GB/s
XXH32 6.8 GB/s 6.0 GB/s
*/
#ifndef XXHASH_H_5627135585666179
#define XXHASH_H_5627135585666179 1
#if defined (__cplusplus)
extern "C" {
#endif
/* ****************************
* Definitions
******************************/
#include <stddef.h> /* size_t */
typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
/* ****************************
* API modifier
******************************/
/** XXH_INLINE_ALL (and XXH_PRIVATE_API)
* This is useful to include xxhash functions in `static` mode
* in order to inline them, and remove their symbol from the public list.
* Inlining can offer dramatic performance improvement on small keys.
* Methodology :
* #define XXH_INLINE_ALL
* #include "xxhash.h"
* `xxhash.c` is automatically included.
* It's not useful to compile and link it as a separate module.
*/
#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
# ifndef XXH_STATIC_LINKING_ONLY
# define XXH_STATIC_LINKING_ONLY
# endif
# if defined(__GNUC__)
# define XXH_PUBLIC_API static __inline __attribute__((unused))
# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
# define XXH_PUBLIC_API static inline
# elif defined(_MSC_VER)
# define XXH_PUBLIC_API static __inline
# else
/* this version may generate warnings for unused static functions */
# define XXH_PUBLIC_API static
# endif
#else
# define XXH_PUBLIC_API /* do nothing */
#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
/*! XXH_NAMESPACE, aka Namespace Emulation :
*
* If you want to include _and expose_ xxHash functions from within your own library,
* but also want to avoid symbol collisions with other libraries which may also include xxHash,
*
* you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library
* with the value of XXH_NAMESPACE (therefore, avoid NULL and numeric values).
*
* Note that no change is required within the calling program as long as it includes `xxhash.h` :
* regular symbol name will be automatically translated by this header.
*/
#ifdef XXH_NAMESPACE
# define XXH_CAT(A,B) A##B
# define XXH_NAME2(A,B) XXH_CAT(A,B)
# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
#endif
/* *************************************
* Version
***************************************/
#define XXH_VERSION_MAJOR 0
#define XXH_VERSION_MINOR 6
#define XXH_VERSION_RELEASE 5
#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
XXH_PUBLIC_API unsigned XXH_versionNumber (void);
/*-**********************************************************************
* 32-bit hash
************************************************************************/
typedef unsigned int XXH32_hash_t;
/*! XXH32() :
Calculate the 32-bit hash of sequence "length" bytes stored at memory address "input".
The memory between input & input+length must be valid (allocated and read-accessible).
"seed" can be used to alter the result predictably.
Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s */
XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, unsigned int seed);
/*====== Streaming ======*/
typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */
XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr);
XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, unsigned int seed);
XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
/*
* Streaming functions generate the xxHash of an input provided in multiple segments.
* Note that, for small input, they are slower than single-call functions, due to state management.
* For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
*
* XXH state must first be allocated, using XXH*_createState() .
*
* Start a new hash by initializing state with a seed, using XXH*_reset().
*
* Then, feed the hash state by calling XXH*_update() as many times as necessary.
* The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
*
* Finally, a hash value can be produced anytime, by using XXH*_digest().
* This function returns the nn-bits hash as an int or long long.
*
* It's still possible to continue inserting input into the hash state after a digest,
* and generate some new hashes later on, by calling again XXH*_digest().
*
* When done, free XXH state space if it was allocated dynamically.
*/
/*====== Canonical representation ======*/
typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
/* Default result type for XXH functions are primitive unsigned 32 and 64 bits.
* The canonical representation uses human-readable write convention, aka big-endian (large digits first).
* These functions allow transformation of hash result into and from its canonical format.
* This way, hash values can be written into a file / memory, and remain comparable on different systems and programs.
*/
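/* Short illustrative sketch of the streaming and canonical-representation calls
 * described above: the buffer is hashed in two updates, and the result is
 * converted to its big-endian canonical form so it can be stored and compared
 * across platforms. */
#include <string.h>   /* memcpy */

static void xxh32_canonical_example(const char* data, size_t len,
                                    unsigned char out[4])
{
    XXH32_state_t* const state = XXH32_createState();
    XXH32_reset(state, 0 /* seed */);
    XXH32_update(state, data, len / 2);                  /* first half */
    XXH32_update(state, data + len / 2, len - len / 2);  /* second half */
    XXH32_hash_t const h = XXH32_digest(state);  /* equals XXH32(data, len, 0) */
    XXH32_canonical_t canonical;
    XXH32_canonicalFromHash(&canonical, h);
    memcpy(out, canonical.digest, sizeof(canonical.digest));
    XXH32_freeState(state);
}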
#ifndef XXH_NO_LONG_LONG
/*-**********************************************************************
* 64-bit hash
************************************************************************/
typedef unsigned long long XXH64_hash_t;
/*! XXH64() :
Calculate the 64-bit hash of sequence of length "len" stored at memory address "input".
"seed" can be used to alter the result predictably.
This function runs faster on 64-bit systems, but slower on 32-bit systems (see benchmark).
*/
XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, unsigned long long seed);
/*====== Streaming ======*/
typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */
XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr);
XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);
XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, unsigned long long seed);
XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr);
/*====== Canonical representation ======*/
typedef struct { unsigned char digest[8]; } XXH64_canonical_t;
XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
#endif /* XXH_NO_LONG_LONG */
#ifdef XXH_STATIC_LINKING_ONLY
/* ================================================================================================
This section contains declarations which are not guaranteed to remain stable.
They may change in future versions, becoming incompatible with a different version of the library.
These declarations should only be used with static linking.
Never use them in association with dynamic linking !
=================================================================================================== */
/* These definitions are only present to allow
* static allocation of XXH state, on stack or in a struct for example.
* Never **ever** use members directly. */
#if !defined (__VMS) \
&& (defined (__cplusplus) \
|| (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
# include <stdint.h>
struct XXH32_state_s {
uint32_t total_len_32;
uint32_t large_len;
uint32_t v1;
uint32_t v2;
uint32_t v3;
uint32_t v4;
uint32_t mem32[4];
uint32_t memsize;
uint32_t reserved; /* never read nor write, might be removed in a future version */
}; /* typedef'd to XXH32_state_t */
struct XXH64_state_s {
uint64_t total_len;
uint64_t v1;
uint64_t v2;
uint64_t v3;
uint64_t v4;
uint64_t mem64[4];
uint32_t memsize;
uint32_t reserved[2]; /* never read nor write, might be removed in a future version */
}; /* typedef'd to XXH64_state_t */
# else
struct XXH32_state_s {
unsigned total_len_32;
unsigned large_len;
unsigned v1;
unsigned v2;
unsigned v3;
unsigned v4;
unsigned mem32[4];
unsigned memsize;
unsigned reserved; /* never read nor write, might be removed in a future version */
}; /* typedef'd to XXH32_state_t */
# ifndef XXH_NO_LONG_LONG /* remove 64-bit support */
struct XXH64_state_s {
unsigned long long total_len;
unsigned long long v1;
unsigned long long v2;
unsigned long long v3;
unsigned long long v4;
unsigned long long mem64[4];
unsigned memsize;
unsigned reserved[2]; /* never read nor write, might be removed in a future version */
}; /* typedef'd to XXH64_state_t */
# endif
# endif
#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
# include "xxhash.c" /* include xxhash function bodies as `static`, for inlining */
#endif
#endif /* XXH_STATIC_LINKING_ONLY */
#if defined (__cplusplus)
}
#endif
#endif /* XXHASH_H_5627135585666179 */

View File

@ -0,0 +1,74 @@
/*
* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
*/
#ifndef RUNTIME_CPU_FUNCTION_H
#define RUNTIME_CPU_FUNCTION_H
#include <vector>
#include <stdint.h>
#include <memory>
#include <runtime/neuron.hpp>
#include <runtime/op_param.hpp>
namespace cvi {
namespace runtime {
class ICpuFunction {
public:
ICpuFunction() {}
virtual ~ICpuFunction() {}
virtual void setup(tensor_list_t &inputs,
tensor_list_t &outputs,
OpParam &param) = 0;
virtual void run() = 0;
protected:
template <typename T>
void print_data(T data) {
if (sizeof(T) == 4) {
printf("%e ", (float)data);
} else if (sizeof(T) == 1) {
printf("%4d ", (int)data);
} else {
assert(0);
std::cout << data << " ";
}
}
template <typename T>
void dump(const std::string &tag, const T *data, size_t count) {
auto ptr = (T *)data;
int loop = count / 10;
std::cout << "-------Dump " << tag << ", size:" << count << "\n";
for (int i = 0; i < loop; i++) {
for (int j = 0; j < 10; j++) {
print_data(*(ptr++));
}
std::cout << "\n";
}
for (int j = 0; j < (int)(count % 10); j++) {
print_data(*(ptr++));
}
std::cout << "\n";
}
};
typedef ICpuFunction *(*ICpuFunctionCreate)();
} // namespace runtime
} // namespace cvi
typedef struct {
char *name;
cvi::runtime::ICpuFunctionCreate func;
} CustomOpRuntimeFunc;
#define REGISTER_OP_RUNTIME_FUNCS(X, ...) \
extern "C" { \
CustomOpRuntimeFunc customOpRuntimeFuncs[] = {X, ##__VA_ARGS__}; \
int customOpRuntimeFuncsNum = \
sizeof(customOpRuntimeFuncs) / sizeof(CustomOpRuntimeFunc); \
}
#endif
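// Illustrative sketch of a custom CPU op built on the interface above. The op
// name "my_abs" is hypothetical; the Neuron accessors (cpu_data<T>(), count())
// come from runtime/neuron.hpp, and REGISTER_OP_RUNTIME_FUNCS makes the op
// discoverable by name. This is a sketch, not part of the runtime itself.
class MyAbsCpuOp : public cvi::runtime::ICpuFunction {
public:
  void setup(cvi::runtime::tensor_list_t &inputs,
             cvi::runtime::tensor_list_t &outputs,
             cvi::OpParam &param) override {
    (void)param;               // this op takes no extra parameters
    _in = inputs[0];
    _out = outputs[0];
  }
  void run() override {
    const float *src = _in->cpu_data<float>();
    float *dst = _out->cpu_data<float>();
    for (size_t i = 0; i < _in->count(); i++)
      dst[i] = src[i] < 0 ? -src[i] : src[i];
  }
  static cvi::runtime::ICpuFunction *open() { return new MyAbsCpuOp(); }
private:
  std::shared_ptr<cvi::runtime::Neuron> _in, _out;
};

// Register the op so the runtime can resolve a CPU routine named "my_abs".
REGISTER_OP_RUNTIME_FUNCS({(char *)"my_abs", MyAbsCpuOp::open})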

View File

@ -0,0 +1,23 @@
#ifndef _BM_DEBUG_H_
#define _BM_DEBUG_H_
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>
#include <assert.h>
#include <memory.h>
#include <sys/mman.h>
#include "cvitpu_debug.h"
// print the version of runtime.
void showRuntimeVersion();
// dump sysfs debug file
void dumpSysfsDebugFile(const char *path);
void mem_protect(uint8_t *vaddr, size_t size);
void mem_unprotect(uint8_t *vaddr, size_t size);
#endif /* _BM_DEBUG_H_ */

View File

@ -0,0 +1,34 @@
#ifndef RUNTIME_TDMA_COPY_HPP
#define RUNTIME_TDMA_COPY_HPP
#include "cviruntime_context.h"
#include "cviruntime.h"
#include "cvikernel/cvikernel.h"
namespace cvi {
namespace runtime {
CVI_RC runtimeExecuteKernelFunction(
CVI_RT_HANDLE ctx, CVI_RT_MEM codeBuf,
uint64_t gaddrSrc, uint64_t gaddrDst);
CVI_RT_MEM runtimeJitTdmaStrideCopy(
CVI_RT_HANDLE ctx, void *cvk, CVI_FMT fmt,
cvk_tg_shape_t *shapeDst, cvk_tg_stride_t *strideDst,
cvk_tg_shape_t *shapeSrc, cvk_tg_stride_t *strideSrc);
CVI_RT_MEM runtimeJitMatrixMul(
CVI_RT_HANDLE ctx, void* cvk_ctx, CVI_FMT fmt,
uint32_t m, uint32_t k, uint32_t n);
CVI_RT_MEM runtimeJitEuclideanDistance(
CVI_RT_HANDLE ctx, void* cvk_ctx,
uint32_t records, uint32_t feature_size);
CVI_RT_MEM runtimeJitGrayImageLight(
CVI_RT_HANDLE ctx, void* cvk_ctx,
int32_t ih, int32_t iw, int32_t kernel_sz);
}
}
#endif
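/* Rough illustrative sketch of how these JIT helpers combine: build a
 * matrix-multiply cmdbuf once with runtimeJitMatrixMul(), then replay it with
 * runtimeExecuteKernelFunction(). Obtaining ctx / cvk_ctx and the device
 * addresses (and freeing the generated cmdbuf) happens outside this header;
 * CVI_FMT_INT8 and CVI_RC_FAILURE are assumed to come from cviruntime.h. */
static CVI_RC jit_and_run_matmul(CVI_RT_HANDLE ctx, void *cvk_ctx,
                                 uint64_t gaddr_src, uint64_t gaddr_dst,
                                 uint32_t m, uint32_t k, uint32_t n) {
  // Generate a cmdbuf computing an (m x k) * (k x n) int8 matrix multiply.
  CVI_RT_MEM code = cvi::runtime::runtimeJitMatrixMul(ctx, cvk_ctx,
                                                      CVI_FMT_INT8, m, k, n);
  if (!code)
    return CVI_RC_FAILURE;
  // Replay the compiled cmdbuf: src holds the packed operands, dst the result.
  return cvi::runtime::runtimeExecuteKernelFunction(ctx, code, gaddr_src, gaddr_dst);
}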

View File

@ -0,0 +1,87 @@
#ifndef RUNTIME_MODEL_H
#define RUNTIME_MODEL_H
#include <vector>
#include <map>
#include <memory>
#include <string>
#include <cvibuilder/cvimodel_generated.h>
#include <runtime/stream.hpp>
#include <runtime/program.hpp>
#include <runtime/neuron.hpp>
#include <runtime/cpu_function.hpp>
#include <runtime/taskpool.hpp>
namespace cvi {
namespace runtime {
typedef struct {
char magic[8];
uint32_t body_size;
char major;
char minor;
char md5[16];
char chip[16];
char padding[2];
} MODEL_HEADER;
class CviModel {
public:
CviModel(CVI_RT_HANDLE ctx, int count);
CVI_RC acquire(const int8_t *buf, size_t size);
CVI_RC acquire(const std::string &modelFile);
CVI_RC acquire(const int fd, const size_t ud_offset);
void refer() { ref++; }
void release();
CVI_RC loadProgram(Program **program,
int program_id, bool export_all_tensors,
bool skip_preprocess);
static std::string getChipType(const std::string &modelFile,
const int8_t *buf = nullptr, size_t size = 0);
int32_t program_num;
int32_t major_ver = 1;
int32_t minor_ver = 2;
// global info
static std::string targetChipType;
private:
~CviModel();
CVI_RC parse(BaseStream *stream);
CVI_RC loadWeight(BaseStream *stream, size_t offset, size_t size);
CVI_RC loadDmabuf(BaseStream *stream, size_t offset, size_t size, const cvi::model::Section *section);
CVI_RC loadCmdbuf(BaseStream *stream, size_t offset, size_t size, const cvi::model::Section *section);
CVI_RC extractSections(BaseStream *stream, size_t bin_offset);
CVI_RC parseModelHeader(BaseStream *stream, size_t &payload_sz,
size_t &header_sz);
bool checkIfMatchTargetChipType(std::string &target);
CVI_RC showAndCheckVersion();
void parseProgramNum();
void createCpuWeightMap();
CVI_RT_HANDLE _ctx;
std::atomic<int32_t> ref;
TaskPool *_pool = nullptr;
cvi::model::Model *_fb_model;
uint8_t *_model_body = nullptr;
CVI_RT_MEM _weight_mem = nullptr;
CustomFunctionSection _custom_section;
std::vector<CpuRuntimeFunction *> _cpu_functions;
tensor_map_t weight_map;
dmabuf_map_t dmabuf_map;
bool encrypt_model;
bool isprotect = false; //protect cmdbuf_mem and weight_mem
int _count;
std::string _model_name;
size_t _max_shared_mem_size;
};
} // namespace runtime
} // namespace cvi
#endif
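// Illustrative sketch (not part of the runtime) of how the fixed-size
// MODEL_HEADER above maps onto the start of a .cvimodel file: it reads the
// header and prints the version, chip and body-size fields, without validating
// the magic or md5.
#include <cstdio>
#include <cstring>
#include <fstream>

static bool peek_model_header(const char *path) {
  cvi::runtime::MODEL_HEADER header;
  std::ifstream f(path, std::ios::binary);
  if (!f.read(reinterpret_cast<char *>(&header), sizeof(header)))
    return false;
  char chip[sizeof(header.chip) + 1] = {0};
  std::memcpy(chip, header.chip, sizeof(header.chip));
  std::printf("cvimodel v%d.%d, chip %s, body size %u bytes\n",
              header.major, header.minor, chip, header.body_size);
  return true;
}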

View File

@ -0,0 +1,148 @@
/*
* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
*/
#ifndef RUNTIME_NEURON_H
#define RUNTIME_NEURON_H
#include <map>
#include <vector>
#include <memory>
#include "cviruntime.h"
#include "cviruntime_context.h"
namespace cvi {
namespace runtime {
class Neuron {
public:
enum NeuronState {
TPU_MEM = 0,
CPU_MEM = 1,
};
enum NeuronType {
WEIGHT = 0,
ACTIVATION = 1,
};
Neuron(CVI_RT_HANDLE ctx, const void *model_tensor,
CVI_RT_MEM weight_mem, const char *model_name);
Neuron(CVI_RT_HANDLE ctx, CVI_RT_HANDLE cvk,
const void *model_tensor,
uint64_t *baseAddrArray,
CVI_RT_MEM *baseMemArray,
const char *model_name);
~Neuron();
template <typename T>
inline T* cpu_data() {
_state = Neuron::CPU_MEM;
return (T *)sys_mem();
}
inline size_t count() {
return _count;
}
inline size_t size() {
return _size;
}
inline size_t offset(int n, int c = 0, int h = 0, int w = 0) {
return (((n * shape[1] + c) * shape[2] + h) * shape[3] + w);
}
inline bool overwrote() {
return _overwrote;
}
inline void setState(NeuronState state) {
_state = state;
}
inline uint8_t *sys_mem() {
return (_vaddr ? _vaddr : _cpu_mem);
}
inline uint64_t paddr() {
return _paddr;
}
inline float qscale() {
return _qscale;
}
inline void setQScale(float scale) {
_qscale = scale;
}
inline int zero_point(){
return _zero_point;
}
CVI_RC preloadChannelAndCompact(int32_t channel_idx, uint64_t src_paddr);
CVI_RC preloadFrameAndCompact(int32_t frame_idx, uint64_t src_paddr);
CVI_RC preload(int32_t frame_idx, uint64_t src_paddr);
void load(CVI_TENSOR &tensor);
void store(CVI_TENSOR &tensor);
void toCpu();
void toTpu();
CVI_RC reserveIonMem(int64_t offset);
CVI_RC reserveSysMem();
void updateBaseAddr(uint64_t paddr);
bool isPacked();
private:
void updateBaseAddr(CVI_RT_MEM mem);
inline void setZeroPoint(int zp) { _zero_point = zp; }
void setPixelFormatAndSize(const std::string &pixel_format, int32_t dsize);
void setPixelAlign(CVI_NN_PIXEL_FORMAT_E format);
uint32_t yuv_size(int n, int c, int h, int w, CVI_NN_PIXEL_FORMAT_E format);
public:
std::string name;
std::vector<int> shape;
CVI_FMT fmt;
NeuronType type;
CVI_NN_PIXEL_FORMAT_E pixel_format;
std::vector<float> scale;
std::vector<float> mean;
bool aligned = false;
int vpss_w_align, vpss_y_align, vpss_channel_align;
private:
CVI_RT_HANDLE _ctx;
CVI_RT_KHANDLE _cvk;
CVI_RT_MEM _streamCopyCmdbuf = nullptr;
CVI_RT_MEM _channelPreloadCmdbuf = nullptr;
CVI_RT_MEM _framePreloadCmdbuf = nullptr;
CVI_RT_MEM _base_mem = nullptr;
CVI_RT_MEM _gmem = nullptr;
uint8_t* _cpu_mem = nullptr;
uint8_t* _vaddr = nullptr;
uint64_t _paddr = 0;
NeuronState _state;
uint32_t _id;
uint32_t _count;
uint32_t _size;
uint32_t _tensor_size = 0;
bool _overwrote = false;
float _qscale = 1.0f;
int _zero_point = 0;
uint64_t *_baseAddrArray;
CVI_RT_MEM *_baseMemArray;
int32_t _baseAddrIndex = 1;
std::string _model_name;
std::string _module_name;
};
typedef std::map<std::string, std::shared_ptr<Neuron>> tensor_map_t;
typedef std::vector<std::shared_ptr<Neuron>> tensor_list_t;
} // namespace runtime
} // namespace cvi
#endif
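// Brief illustrative sketch of the CPU-side accessors above: cpu_data<T>()
// switches the neuron to CPU_MEM and returns a typed pointer, while
// offset(n, c, h, w) turns NCHW coordinates into an element index. The helpers
// below assume an FP32 tensor; they are a sketch, not runtime code.
static void fill_constant(cvi::runtime::Neuron &t, float value) {
  float *data = t.cpu_data<float>();   // marks the neuron as CPU_MEM
  for (size_t i = 0; i < t.count(); i++)
    data[i] = value;
}

static float element_at(cvi::runtime::Neuron &t, int n, int c, int h, int w) {
  return t.cpu_data<float>()[t.offset(n, c, h, w)];
}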

View File

@ -0,0 +1,53 @@
/*
* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
*/
#ifndef CVI_RUNTIME_OP_PARAMETER_H
#define CVI_RUNTIME_OP_PARAMETER_H
#include <iostream>
#include <string>
#include <vector>
#include <map>
#include <memory>
#include <assert.h>
namespace cvi {
class FieldBase {
public:
FieldBase() = default;
virtual ~FieldBase() = default;
};
template <typename T>
class Field: public FieldBase {
public:
Field(T& val): data(val) {}
T data;
};
class OpParam {
public:
template <typename T>
void put(std::string name, T value) {
fields[name] = std::make_shared<Field<T>>(value);
}
template <typename T>
T& get(std::string name) {
auto f = dynamic_cast<Field<T>*>(fields[name].get());
assert(f);
return f->data;
}
bool has(std::string name) {
auto it = fields.find(name);
return it != fields.end();
}
private:
std::map<std::string, std::shared_ptr<FieldBase>> fields;
};
}
#endif
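// Minimal usage sketch of the typed parameter map above: values are stored by
// name with put<T>() and read back with the matching type via get<T>(); asking
// for a mismatched type trips the assert inside get(). The key names used here
// ("axis", "threshold", "pads") are purely illustrative.
static void op_param_example() {
  cvi::OpParam param;
  param.put<int>("axis", 1);
  param.put<float>("threshold", 0.5f);
  param.put<std::vector<int>>("pads", std::vector<int>{0, 0, 1, 1});

  if (param.has("axis")) {
    int axis = param.get<int>("axis");                  // 1
    auto &pads = param.get<std::vector<int>>("pads");   // {0, 0, 1, 1}
    (void)axis;
    (void)pads;
  }
}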

View File

@ -0,0 +1,145 @@
#ifndef RUNTIME_PROGRAM_H
#define RUNTIME_PROGRAM_H
#include <map>
#include <list>
#include <vector>
#include <unordered_map>
#include <iostream>
#include <fstream>
#include <runtime/neuron.hpp>
#include <runtime/stream.hpp>
#include <runtime/section.hpp>
#include <runtime/cpu_function.hpp>
#include <runtime/op_param.hpp>
#include <cviruntime_context.h>
#include "cviruntime.h"
#include <runtime/taskpool.hpp>
#include <cvibuilder/cvimodel_generated.h>
#include <cvibuilder/parameter_generated.h>
namespace cvi {
namespace runtime {
typedef std::unordered_map<std::string, CVI_RT_MEM> dmabuf_map_t;
class Routine;
class Program {
public:
Program(CVI_RT_HANDLE ctx, TaskPool *pool,
dmabuf_map_t &dmabuf_map,
std::vector<CpuRuntimeFunction *> &functions,
tensor_map_t &weight_map,
CVI_RT_MEM weight_mem,
const char *model_name,
size_t max_shared_mem_size);
~Program();
void setOptions(bool export_all_tensors,
bool skip_preprocess);
CVI_RC load(const cvi::model::Program *fb_program);
bool forward(CVI_TENSOR *inputs, int input_num,
CVI_TENSOR *outputs, int output_num);
void *forwardAsync(CVI_TENSOR *inputs, int input_num,
CVI_TENSOR *outputs, int output_num);
CVI_RC forwardWait(void *task);
const tensor_list_t &input_tensors() { return in_tensors; }
const tensor_list_t &output_tensors() { return out_tensors; }
CVI_TENSOR *exportInputs(int32_t &size);
CVI_TENSOR *exportOutputs(int32_t &size);
tensor_list_t in_tensors;
tensor_list_t out_tensors;
tensor_map_t neuron_map;
tensor_map_t &weight_map;
dmabuf_map_t &dmabuf_map;
std::vector<CpuRuntimeFunction *> &cpu_functions;
/* 0: shared_mem,
* 1: weight_mem,
* 2: private_mem,
* 3~7: io_mem
*/
uint64_t baseAddrArray[8];
CVI_RT_MEM baseMemArray[8];
private:
CVI_RC createNeuronSpace(const cvi::model::Program *fb_program);
CVI_RC createNeuronMap(const cvi::model::Program *fb_program);
CVI_RC createRoutines(const cvi::model::Program *fb_program);
bool run();
CVI_RT_HANDLE _ctx;
CVI_RT_KHANDLE _cvk;
bool _export_all_tensors;
bool _skip_preprocess;
TaskPool *_pool = nullptr;
CVI_RT_MEM private_mem = nullptr;
CVI_RT_MEM shared_mem = nullptr;
std::list<std::shared_ptr<Routine>> _routines;
std::string _model_name;
size_t _max_shared_mem_size;
};
class Routine {
public:
Routine(CVI_RT_HANDLE ctx, Program *program, bool tpu)
: tpu(tpu), _ctx(ctx), _program(program) {}
virtual ~Routine() {}
virtual bool initialize(const cvi::model::Routine *routine) = 0;
virtual CVI_RC run() = 0;
virtual void reset() = 0;
virtual CVI_RC prepare() { return CVI_RC_SUCCESS; }
tensor_list_t inputs;
tensor_list_t outputs;
bool tpu;
protected:
CVI_RT_HANDLE _ctx;
Program *_program;
};
class TpuRoutine : public Routine {
public:
TpuRoutine(CVI_RT_HANDLE ctx, Program *program)
: Routine(ctx, program, true) {}
~TpuRoutine() {
}
bool initialize(const cvi::model::Routine *routine);
int init_dmabuf (Program *program, const std::string &name);
CVI_RC run();
void reset();
private:
CVI_RT_MEM buf_mem = nullptr;
bool enable_pmu = false;
bool encrypted = false;
};
class CpuRoutine : public Routine {
public:
CpuRoutine(CVI_RT_HANDLE ctx, Program *program)
: Routine(ctx, program, false) {}
~CpuRoutine() { delete _func; }
bool initialize(const cvi::model::Routine *routine);
CVI_RC run();
CVI_RC prepare();
void reset();
private:
void fetchQscaleFromDequant(OpParam &param);
void handleFuncArgs(const uint8_t *args, OpParam &param);
ICpuFunctionCreate _func_open = nullptr;
ICpuFunction *_func = nullptr;
};
} // namespace runtime
} // namespace cvi
#endif
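// Hedged sketch of the asynchronous path this interface allows: export the
// program's I/O descriptors once, then overlap preparation of the next batch
// with forwardAsync()/forwardWait(). Creating the Program itself (via
// CviModel::loadProgram) is outside this header; this is illustrative only.
static CVI_RC run_once(cvi::runtime::Program *program) {
  int32_t in_num = 0, out_num = 0;
  CVI_TENSOR *inputs = program->exportInputs(in_num);
  CVI_TENSOR *outputs = program->exportOutputs(out_num);

  // ... fill the input tensors' system memory here ...

  void *task = program->forwardAsync(inputs, in_num, outputs, out_num);
  // ... prepare the next batch while the TPU routines run ...
  return program->forwardWait(task);
}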

View File

@ -0,0 +1,48 @@
#ifndef RUNTIME_SECTION_H
#define RUNTIME_SECTION_H
#include <vector>
#include <map>
#include <memory>
#include <string>
#include <runtime/stream.hpp>
#include <runtime/cpu_function.hpp>
#include "alloc.h"
namespace cvi {
namespace runtime {
class WeightSection {
public:
WeightSection(size_t offset, size_t size) : offset(offset), size(size) {}
size_t offset;
size_t size;
};
class CpuRuntimeFunction {
public:
CpuRuntimeFunction(const std::string &name, ICpuFunctionCreate func_open)
: name(name), func_open(func_open) {}
~CpuRuntimeFunction() = default;
const std::string name;
ICpuFunctionCreate func_open;
};
class CustomFunctionSection {
public:
CustomFunctionSection() = default;
~CustomFunctionSection();
bool load(BaseStream *stream, size_t offset, size_t size,
std::vector<CpuRuntimeFunction *> &cpu_functions);
private:
int shm_fd = 0;
void *dso_handle = nullptr;
};
} // namespace runtime
} // namespace cvi
#endif

View File

@ -0,0 +1,16 @@
#ifndef RUNTIME_SHARED_MEM_H
#define RUNTIME_SHARED_MEM_H
#include <cviruntime_context.h>
namespace cvi {
namespace runtime {
void setSharedMemSize(size_t size);
CVI_RT_MEM allocateSharedMemory(CVI_RT_HANDLE ctx, size_t size);
void deallocateSharedMemory(CVI_RT_HANDLE ctx, CVI_RT_MEM mem);
} // namespace runtime
} // namespace cvi
#endif

View File

@ -0,0 +1,57 @@
#ifndef RUNTIME_CVISTREAM_H
#define RUNTIME_CVISTREAM_H
#include <iostream>
#include <fstream>
namespace cvi {
namespace runtime {
class BaseStream {
public:
BaseStream() {}
virtual ~BaseStream() {}
size_t length() {
return _length;
}
virtual size_t read(uint8_t *buf, size_t offset, size_t size) = 0;
protected:
size_t _length = 0;
};
class FileStream : public BaseStream {
public:
FileStream(const std::string &file_name);
~FileStream();
size_t read(uint8_t *buf, size_t offset, size_t size);
private:
std::ifstream *_fstream;
};
class BufferStream : public BaseStream {
public:
BufferStream(const int8_t *buf, size_t size);
~BufferStream() {}
size_t read(uint8_t *buf, size_t offset, size_t size);
private:
const int8_t *buffer;
};
class FdStream : public BaseStream {
public:
FdStream(const int fd, const size_t ud_offset);
~FdStream() {};
size_t read(uint8_t *buf, size_t offset, size_t size);
private:
int file_descriptor;
size_t user_define_offset = 0; // offset of the model header within the fd, as defined by the user
};
} // namespace runtime
} // namespace cvi
#endif
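// Small illustrative sketch of why the loader works against BaseStream: the
// same (offset, size) read serves a cvimodel coming from a file, a memory
// buffer, or an already-open fd. The file name below is hypothetical.
#include <cstdint>

static size_t read_model_header(cvi::runtime::BaseStream &stream,
                                uint8_t *header, size_t header_size) {
  // FileStream, BufferStream and FdStream all honour (offset, size) reads.
  return stream.read(header, 0, header_size);
}

// Typical call sites (illustrative):
//   cvi::runtime::FileStream file("sample.cvimodel");
//   cvi::runtime::BufferStream mem(model_buf, model_size);
//   read_model_header(file, hdr, sizeof(hdr));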

View File

@ -0,0 +1,113 @@
/*
* This file is licensed under the zlib/libpng license, included in this
* distribution in the file COPYING.
*/
#ifndef RUNTIME_TASKQUE_H
#define RUNTIME_TASKQUE_H
#include <future>
#include <thread>
#include <deque>
#include <vector>
#include <utility>
#include <chrono>
#include <list>
#include <mutex>
#include <condition_variable>
#include "cviruntime.h"
namespace cvi {
namespace runtime {
class TaskPool;
class Task {
public:
Task(TaskPool *pool, void *program, CVI_TENSOR *inputs, int input_num,
CVI_TENSOR *outputs, int output_num);
void *program;
int input_num;
int output_num;
CVI_TENSOR *inputs;
CVI_TENSOR *outputs;
CVI_RC retCode = CVI_RC_UNINIT;
};
class RingQueue {
public:
RingQueue(int capacity) : _capacity(capacity) { _queue.resize(_capacity); }
~RingQueue() {}
void put(Task *task) {
std::unique_lock<std::mutex> lock(_mutex);
while (_capacity - _length <= 1) {
_cond_idel.wait(lock);
}
_queue[_tail] = task;
move(_tail);
_length++;
_cond_busy.notify_one();
}
Task *get() {
std::unique_lock<std::mutex> lock(_mutex);
while (_length == 0) {
_cond_busy.wait(lock);
}
if (_capacity - _length == 1) {
_cond_idel.notify_one();
}
auto task = _queue[_head];
move(_head);
_length--;
return task;
}
inline uint32_t move(uint32_t &index) {
++index;
index %= _capacity;
return index;
}
private:
uint32_t _capacity;
uint32_t _head = 0;
uint32_t _tail = 0;
uint32_t _length = 0;
std::vector<Task *> _queue;
std::mutex _mutex;
std::condition_variable _cond_busy;
std::condition_variable _cond_idel;
};
class TaskPool {
public:
TaskPool(int pool_size)
: _pool_size(pool_size), _queue(pool_size * 4),
_started(false), _done(false) {}
~TaskPool();
void startPool();
void addTask(Task *task) { _queue.put(task); }
void waitTask(Task *task);
void workFunc();
private:
void addTerminateTask() { _queue.put(nullptr); }
static void run(TaskPool *pool) { pool->workFunc(); }
int _pool_size;
RingQueue _queue;
std::atomic<bool> _started;
std::atomic<bool> _done;
std::mutex _mutex;
std::vector<std::thread> _threads;
std::condition_variable _cond_feedback;
};
}
}
#endif // RUNTIME_TASKQUE_H

View File

@ -0,0 +1,8 @@
#ifndef CVIRUNTIME_VERSION_H
#define CVIRUNTIME_VERSION_H
#define CVIRUNTIME_MAJOR_VER 1
#define CVIRUNTIME_MINOR_VER 1
#define CVIRUNTIME_SUBMINOR_VER 1
#endif

View File

@ -0,0 +1,15 @@
cmake_minimum_required(VERSION 3.1.0)
project(pyruntime)
if(CMAKE_CROSSCOMPILING)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
include_directories(${CMAKE_SYSROOT}/include)
link_directories(${CNPY_PATH})
endif()
add_subdirectory(include/pybind11)
pybind11_add_module(pyruntime pyruntime.cpp)
set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel)
target_link_libraries(pyruntime PRIVATE ${CVI_LIBS})
install(TARGETS pyruntime DESTINATION python)

21
cviruntime/python/README Normal file
View File

@ -0,0 +1,21 @@
1. Download pyruntime.tgz from the NAS server ("ai/prebuilt/pyruntime.tgz").
This package provides a complete python3.6 environment and also contains the
torch, torchvision and numpy modules, as well as cvitek's runtime python wrapper.
2. Decompress the package on the SoC platform:
$ cd /mnt/data
$ tar -zxvf pyruntime.tgz ./
3. Set up the environment variables.
$ export PATH=/mnt/data/python/bin/:/mnt/data/cvitek_tpu_sdk/bin:${PATH}
$ export LD_LIBRARY_PATH=/mnt/data/python/lib:/mnt/data/cvitek_tpu_sdk/lib:${LD_LIBRARY_PATH}
$ export PYTHON_EGG_CACHE=/mnt/data/python/.cache/
4. Run the accuracy evaluation script:
$ ./eval_imagenet.py \
--dataset imagenet/img_val_extracted/ \
--cvimodel resnet50_bs4.cvimodel \
--count 1000 --batch_size 4 \
--image_resize_dims 256,256 \
--net_input_dims 224,224 \
--mean 104.01,116.67,122.68

View File

@ -0,0 +1,16 @@
import numpy as np
import pyruntime as rt

# create a cvikernel context
cvk_ctx = rt.CvkContext("CviContext")
# a 1x1x1x1 int8 tensor in system memory
a = np.array([[[[1]]]], dtype=np.int8)
# allocate two local-memory tensors with the same shape and dtype as `a`
b = cvk_ctx.lmem_alloc_tensor(a, 1)
b.shapes()
b.address()
c = cvk_ctx.lmem_alloc_tensor(a, 1)
c.shapes()
c.address()
# DMA the data from system (global) memory into local-memory tensor b
cvk_ctx.tdma_g2l_tensor_copy(b, a)
# DMA it back out into d and verify the round trip
d = np.array([[[[0]]]], dtype=np.int8)
cvk_ctx.tdma_l2g_tensor_copy(d, b)
print(a == d)

View File

@ -0,0 +1,215 @@
#!/usr/bin/env python3
# use pytorch for dataloader
# https://github.com/pytorch/examples/blob/master/imagenet/main.py
import argparse
import os
import random
import shutil
import time
import warnings
import numpy as np
import pyruntime
from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import torch
import torch.nn as nn
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self, name, fmt=':f'):
self.name = name
self.fmt = fmt
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def __str__(self):
fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
return fmtstr.format(**self.__dict__)
class ProgressMeter(object):
def __init__(self, num_batches, meters, prefix=""):
self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
self.meters = meters
self.prefix = prefix
def display(self, batch):
entries = [self.prefix + self.batch_fmtstr.format(batch)]
entries += [str(meter) for meter in self.meters]
print('\t'.join(entries))
def _get_batch_fmtstr(self, num_batches):
num_digits = len(str(num_batches // 1))
fmt = '{:' + str(num_digits) + 'd}'
return '[' + fmt + '/' + fmt.format(num_batches) + ']'
def accuracy(output, target, topk=(1,)):
"""Computes the accuracy over the k top predictions for the specified values of k"""
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk:
correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
res.append(correct_k.mul_(100.0 / batch_size))
return res
def datasetLoader(args):
image_resize_dims = [int(s) for s in args.image_resize_dims.split(',')]
net_input_dims = [int(s) for s in args.net_input_dims.split(',')]
image_resize_dims = [ max(x,y) for (x,y) in zip(image_resize_dims, net_input_dims)]
valdir = os.path.join(args.dataset, 'val')
if (args.loader_transforms):
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
val_loader = torch.utils.data.DataLoader(
datasets.ImageFolder(valdir, transforms.Compose([
transforms.Resize(image_resize_dims),
transforms.CenterCrop(net_input_dims),
transforms.ToTensor(),
normalize,
])),
batch_size=args.batch_size, shuffle=True)
else:
val_loader = torch.utils.data.DataLoader(
datasets.ImageFolder(valdir, transforms.Compose([
transforms.Resize(image_resize_dims),
transforms.CenterCrop(net_input_dims),
transforms.ToTensor()
])),
batch_size=args.batch_size, shuffle=True)
return val_loader
def imagePreprocssing(args, images, mean, qscale):
inputs = np.array([])
for image in images:
if args.loader_transforms:
# loader do normalize already
x = image.numpy()
else:
# pytorch ToTensor() will do HWC to CHW, and change range to [0.0, 1.0]
# for pytorch, seeing errors if not include ToTensor in transforms
# change to range [0, 255]
x = image.numpy() * 255
x = x.astype('uint8')
# transposed already in ToTensor()
# x = np.transpose(x, (2, 0, 1))
# still need the swap for caffe models
x = x[[2,1,0], :, :]
x = x.astype(np.float32)
if args.raw_scale != 255.0:
x = x * args.raw_scale / 255.0
# apply mean
if mean.size != 0:
x -= mean
if qscale != 0:
x = x * qscale
# expand to 4-D again
x = np.expand_dims(x, axis=0)
if inputs.size:
inputs = np.append(inputs, x, axis=0)
else:
inputs = x
if args.input_scale != 1.0:
inputs *= args.input_scale
return inputs
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Classification Evaluation on ImageNet Dataset.")
parser.add_argument("--cvimodel", type=str)
parser.add_argument("--batch_size", type=int, default=1)
parser.add_argument("--dataset", type=str, help="The root directory of the ImageNet dataset.")
parser.add_argument("--image_resize_dims", type=str, default='256,256')
parser.add_argument("--net_input_dims", type=str, default='224,224')
parser.add_argument("--raw_scale", type=float, help="Multiply raw input image data by this scale.", default=255.0)
parser.add_argument("--mean", help="Per Channel image mean values")
parser.add_argument("--input_scale", type=float, help="Multiply input features by this scale.", default=1.0)
parser.add_argument("--count", type=int, default=50000)
parser.add_argument("--loader_transforms", type=int, help="image transform by torch loader", default=0)
args = parser.parse_args()
if args.mean:
mean = np.array([float(s) for s in args.mean.split(',')], dtype=np.float32)
mean = mean[:, np.newaxis, np.newaxis]
else:
mean = np.array([])
# load model
model = pyruntime.Model(args.cvimodel, args.batch_size)
print('load model {}'.format(args.cvimodel))
val_loader = datasetLoader(args)
batch_time = AverageMeter('Time', ':6.3f')
losses = AverageMeter('Loss', ':.4e')
top1 = AverageMeter('Acc@1', ':6.2f')
top5 = AverageMeter('Acc@5', ':6.2f')
progress = ProgressMeter(len(val_loader) * args.batch_size,
[batch_time, losses, top1, top5],
prefix='Test: ')
# define loss function (criterion) and optimizer
criterion = nn.CrossEntropyLoss()
threshold = ((50 + args.batch_size - 1) // args.batch_size) * args.batch_size
total = len(val_loader) * args.batch_size
count = 0
end = time.time()
for i, (images, target) in enumerate(val_loader):
# preprocessing
x = imagePreprocssing(args, images, mean, model.inputs[0].qscale)
# inference
model.inputs[0].data[:] = x
model.forward()
# validate output prob
assert(len(model.outputs) == 1)
res = model.outputs[0].data
prob = np.reshape(res, (res.shape[0], res.shape[1]))
output = torch.from_numpy(prob)
# loss
loss = criterion(output, target)
# measure accuracy and record loss
acc1, acc5 = accuracy(output, target, topk=(1, 5))
losses.update(loss.item(), images.size(0))
top1.update(acc1[0], images.size(0))
top5.update(acc5[0], images.size(0))
# measure elapsed time
batch_time.update(time.time() - end)
end = time.time()
count += args.batch_size
if count % threshold == 0:
progress.display(count)
if count >= args.count:
progress.display(count)
break
if count + args.batch_size > total:
progress.display(count)
break
print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
.format(top1=top1, top5=top5))

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,114 @@
# CMakeLists.txt -- Build system for the pybind11 modules
#
# Copyright (c) 2015 Wenzel Jakob <wenzel@inf.ethz.ch>
#
# All rights reserved. Use of this source code is governed by a
# BSD-style license that can be found in the LICENSE file.
cmake_minimum_required(VERSION 2.8.12)
if (POLICY CMP0048)
# cmake warns if loaded from a min-3.0-required parent dir, so silence the warning:
cmake_policy(SET CMP0048 NEW)
endif()
# CMake versions < 3.4.0 do not support try_compile/pthread checks without C as active language.
if(CMAKE_VERSION VERSION_LESS 3.4.0)
project(pybind11)
else()
project(pybind11 CXX)
endif()
# Check if pybind11 is being used directly or via add_subdirectory
set(PYBIND11_MASTER_PROJECT OFF)
if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
set(PYBIND11_MASTER_PROJECT ON)
endif()
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/tools")
include(pybind11Tools)
# Cache variables so pybind11_add_module can be used in parent projects
set(PYBIND11_INCLUDE_DIR "${CMAKE_CURRENT_LIST_DIR}/include" CACHE INTERNAL "")
set(PYTHON_INCLUDE_DIRS ${PYTHON_INCLUDE_DIRS} CACHE INTERNAL "")
set(PYTHON_LIBRARIES ${PYTHON_LIBRARIES} CACHE INTERNAL "")
set(PYTHON_MODULE_PREFIX ${PYTHON_MODULE_PREFIX} CACHE INTERNAL "")
set(PYTHON_MODULE_EXTENSION ${PYTHON_MODULE_EXTENSION} CACHE INTERNAL "")
set(PYTHON_VERSION_MAJOR ${PYTHON_VERSION_MAJOR} CACHE INTERNAL "")
set(PYTHON_VERSION_MINOR ${PYTHON_VERSION_MINOR} CACHE INTERNAL "")
# NB: when adding a header don't forget to also add it to setup.py
set(PYBIND11_HEADERS
include/pybind11/detail/class.h
include/pybind11/detail/common.h
include/pybind11/detail/descr.h
include/pybind11/detail/init.h
include/pybind11/detail/internals.h
include/pybind11/detail/typeid.h
include/pybind11/attr.h
include/pybind11/buffer_info.h
include/pybind11/cast.h
include/pybind11/chrono.h
include/pybind11/common.h
include/pybind11/complex.h
include/pybind11/options.h
include/pybind11/eigen.h
include/pybind11/embed.h
include/pybind11/eval.h
include/pybind11/functional.h
include/pybind11/numpy.h
include/pybind11/operators.h
include/pybind11/pybind11.h
include/pybind11/pytypes.h
include/pybind11/stl.h
include/pybind11/stl_bind.h
)
string(REPLACE "include/" "${CMAKE_CURRENT_SOURCE_DIR}/include/"
PYBIND11_HEADERS "${PYBIND11_HEADERS}")
include(GNUInstallDirs)
include(CMakePackageConfigHelpers)
# extract project version from source
file(STRINGS "${PYBIND11_INCLUDE_DIR}/pybind11/detail/common.h" pybind11_version_defines
REGEX "#define PYBIND11_VERSION_(MAJOR|MINOR|PATCH) ")
foreach(ver ${pybind11_version_defines})
if (ver MATCHES "#define PYBIND11_VERSION_(MAJOR|MINOR|PATCH) +([^ ]+)$")
set(PYBIND11_VERSION_${CMAKE_MATCH_1} "${CMAKE_MATCH_2}" CACHE INTERNAL "")
endif()
endforeach()
set(${PROJECT_NAME}_VERSION ${PYBIND11_VERSION_MAJOR}.${PYBIND11_VERSION_MINOR}.${PYBIND11_VERSION_PATCH})
message(STATUS "pybind11 v${${PROJECT_NAME}_VERSION}")
option (USE_PYTHON_INCLUDE_DIR "Install pybind11 headers in Python include directory instead of default installation prefix" OFF)
if (USE_PYTHON_INCLUDE_DIR)
file(RELATIVE_PATH CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_PREFIX} ${PYTHON_INCLUDE_DIRS})
endif()
if(NOT (CMAKE_VERSION VERSION_LESS 3.0)) # CMake >= 3.0
# Build an interface library target:
add_library(pybind11 INTERFACE)
add_library(pybind11::pybind11 ALIAS pybind11) # to match exported target
target_include_directories(pybind11 INTERFACE $<BUILD_INTERFACE:${PYBIND11_INCLUDE_DIR}>
$<BUILD_INTERFACE:${PYTHON_INCLUDE_DIRS}>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
target_compile_options(pybind11 INTERFACE $<BUILD_INTERFACE:${PYBIND11_CPP_STANDARD}>)
add_library(module INTERFACE)
add_library(pybind11::module ALIAS module)
if(NOT MSVC)
target_compile_options(module INTERFACE -fvisibility=hidden)
endif()
target_link_libraries(module INTERFACE pybind11::pybind11)
if(WIN32 OR CYGWIN)
target_link_libraries(module INTERFACE $<BUILD_INTERFACE:${PYTHON_LIBRARIES}>)
elseif(APPLE)
target_link_libraries(module INTERFACE "-undefined dynamic_lookup")
endif()
add_library(embed INTERFACE)
add_library(pybind11::embed ALIAS embed)
target_link_libraries(embed INTERFACE pybind11::pybind11 $<BUILD_INTERFACE:${PYTHON_LIBRARIES}>)
endif()

View File

@ -0,0 +1,49 @@
Thank you for your interest in this project! Please refer to the following
sections on how to contribute code and bug reports.
### Reporting bugs
At the moment, this project is run in the spare time of a single person
([Wenzel Jakob](http://rgl.epfl.ch/people/wjakob)) with very limited resources
for issue tracker tickets. Thus, before submitting a question or bug report,
please take a moment of your time and ensure that your issue isn't already
discussed in the project documentation provided at
[http://pybind11.readthedocs.org/en/latest](http://pybind11.readthedocs.org/en/latest).
Assuming that you have identified a previously unknown problem or an important
question, it's essential that you submit a self-contained and minimal piece of
code that reproduces the problem. In other words: no external dependencies,
isolate the function(s) that cause breakage, submit matched and complete C++
and Python snippets that can be easily compiled and run on my end.
## Pull requests
Contributions are submitted, reviewed, and accepted using Github pull requests.
Please refer to [this
article](https://help.github.com/articles/using-pull-requests) for details and
adhere to the following rules to make the process as smooth as possible:
* Make a new branch for every feature you're working on.
* Make small and clean pull requests that are easy to review but make sure they
do add value by themselves.
* Add tests for any new functionality and run the test suite (``make pytest``)
to ensure that no existing features break.
* Please run ``flake8`` and ``tools/check-style.sh`` to check your code matches
the project style. (Note that ``check-style.sh`` requires ``gawk``.)
* This project has a strong focus on providing general solutions using a
minimal amount of code, thus small pull requests are greatly preferred.
### Licensing of contributions
pybind11 is provided under a BSD-style license that can be found in the
``LICENSE`` file. By using, distributing, or contributing to this project, you
agree to the terms and conditions of this license.
You are under no obligation whatsoever to provide any bug fixes, patches, or
upgrades to the features, functionality or performance of the source code
("Enhancements") to anyone; however, if you choose to make your Enhancements
available either publicly, or directly to the author of this software, without
imposing a separate written license agreement for such Enhancements, then you
hereby grant the following license: a non-exclusive, royalty-free perpetual
license to install, use, modify, prepare derivative works, incorporate into
other computer software, distribute, and sublicense such enhancements or
derivative works thereof, in binary and source code form.

View File

@ -0,0 +1,29 @@
Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Please also refer to the file CONTRIBUTING.md, which clarifies licensing of
external contributions to this project including patches, pull requests, etc.

View File

@ -0,0 +1,129 @@
![pybind11 logo](https://github.com/pybind/pybind11/raw/master/docs/pybind11-logo.png)
# pybind11 — Seamless operability between C++11 and Python
[![Documentation Status](https://readthedocs.org/projects/pybind11/badge/?version=master)](http://pybind11.readthedocs.org/en/master/?badge=master)
[![Documentation Status](https://readthedocs.org/projects/pybind11/badge/?version=stable)](http://pybind11.readthedocs.org/en/stable/?badge=stable)
[![Gitter chat](https://img.shields.io/gitter/room/gitterHQ/gitter.svg)](https://gitter.im/pybind/Lobby)
[![Build Status](https://travis-ci.org/pybind/pybind11.svg?branch=master)](https://travis-ci.org/pybind/pybind11)
[![Build status](https://ci.appveyor.com/api/projects/status/riaj54pn4h08xy40?svg=true)](https://ci.appveyor.com/project/wjakob/pybind11)
**pybind11** is a lightweight header-only library that exposes C++ types in Python
and vice versa, mainly to create Python bindings of existing C++ code. Its
goals and syntax are similar to the excellent
[Boost.Python](http://www.boost.org/doc/libs/1_58_0/libs/python/doc/) library
by David Abrahams: to minimize boilerplate code in traditional extension
modules by inferring type information using compile-time introspection.
The main issue with Boost.Python—and the reason for creating such a similar
project—is Boost. Boost is an enormously large and complex suite of utility
libraries that works with almost every C++ compiler in existence. This
compatibility has its cost: arcane template tricks and workarounds are
necessary to support the oldest and buggiest of compiler specimens. Now that
C++11-compatible compilers are widely available, this heavy machinery has
become an excessively large and unnecessary dependency.
Think of this library as a tiny self-contained version of Boost.Python with
everything stripped away that isn't relevant for binding generation. Without
comments, the core header files only require ~4K lines of code and depend on
Python (2.7 or 3.x, or PyPy2.7 >= 5.7) and the C++ standard library. This
compact implementation was possible thanks to some of the new C++11 language
features (specifically: tuples, lambda functions and variadic templates). Since
its creation, this library has grown beyond Boost.Python in many ways, leading
to dramatically simpler binding code in many common situations.
Tutorial and reference documentation is provided at
[http://pybind11.readthedocs.org/en/master](http://pybind11.readthedocs.org/en/master).
A PDF version of the manual is available
[here](https://media.readthedocs.org/pdf/pybind11/master/pybind11.pdf).
## Core features
pybind11 can map the following core C++ features to Python
- Functions accepting and returning custom data structures per value, reference, or pointer
- Instance methods and static methods
- Overloaded functions
- Instance attributes and static attributes
- Arbitrary exception types
- Enumerations
- Callbacks
- Iterators and ranges
- Custom operators
- Single and multiple inheritance
- STL data structures
- Smart pointers with reference counting like ``std::shared_ptr``
- Internal references with correct reference counting
- C++ classes with virtual (and pure virtual) methods can be extended in Python
## Goodies
In addition to the core functionality, pybind11 provides some extra goodies:
- Python 2.7, 3.x, and PyPy (PyPy2.7 >= 5.7) are supported with an
implementation-agnostic interface.
- It is possible to bind C++11 lambda functions with captured variables. The
lambda capture data is stored inside the resulting Python function object.
- pybind11 uses C++11 move constructors and move assignment operators whenever
possible to efficiently transfer custom data types.
- It's easy to expose the internal storage of custom data types through
Python's buffer protocols. This is handy e.g. for fast conversion between
C++ matrix classes like Eigen and NumPy without expensive copy operations.
- pybind11 can automatically vectorize functions so that they are transparently
applied to all entries of one or more NumPy array arguments.
- Python's slice-based access and assignment operations can be supported with
just a few lines of code.
- Everything is contained in just a few header files; there is no need to link
against any additional libraries.
- Binaries are generally smaller by a factor of at least 2 compared to
equivalent bindings generated by Boost.Python. A recent pybind11 conversion
of PyRosetta, an enormous Boost.Python binding project,
[reported](http://graylab.jhu.edu/RosettaCon2016/PyRosetta-4.pdf) a binary
size reduction of **5.4x** and compile time reduction by **5.8x**.
- Function signatures are precomputed at compile time (using ``constexpr``),
leading to smaller binaries.
- With little extra effort, C++ types can be pickled and unpickled similar to
regular Python objects.
## Supported compilers
1. Clang/LLVM 3.3 or newer (for Apple Xcode's clang, this is 5.0.0 or newer)
2. GCC 4.8 or newer
3. Microsoft Visual Studio 2015 Update 3 or newer
4. Intel C++ compiler 17 or newer (16 with pybind11 v2.0 and 15 with pybind11 v2.0 and a [workaround](https://github.com/pybind/pybind11/issues/276))
5. Cygwin/GCC (tested on 2.5.1)
## About
This project was created by [Wenzel Jakob](http://rgl.epfl.ch/people/wjakob).
Significant features and/or improvements to the code were contributed by
Jonas Adler,
Lori A. Burns,
Sylvain Corlay,
Trent Houliston,
Axel Huebl,
@hulucc,
Sergey Lyskov,
Johan Mabille,
Tomasz Miąsko,
Dean Moldovan,
Ben Pritchard,
Jason Rhinelander,
Boris Schäling,
Pim Schellart,
Henry Schreiner,
Ivan Smirnov, and
Patrick Stewart.
### License
pybind11 is provided under a BSD-style license that can be found in the
``LICENSE`` file. By using, distributing, or contributing to this project,
you agree to the terms and conditions of this license.

View File

@ -0,0 +1,493 @@
/*
pybind11/attr.h: Infrastructure for processing custom
type and function attributes
Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/
#pragma once
#include "cast.h"
NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
/// \addtogroup annotations
/// @{
/// Annotation for methods
struct is_method { handle class_; is_method(const handle &c) : class_(c) { } };
/// Annotation for operators
struct is_operator { };
/// Annotation for parent scope
struct scope { handle value; scope(const handle &s) : value(s) { } };
/// Annotation for documentation
struct doc { const char *value; doc(const char *value) : value(value) { } };
/// Annotation for function names
struct name { const char *value; name(const char *value) : value(value) { } };
/// Annotation indicating that a function is an overload associated with a given "sibling"
struct sibling { handle value; sibling(const handle &value) : value(value.ptr()) { } };
/// Annotation indicating that a class derives from another given type
template <typename T> struct base {
PYBIND11_DEPRECATED("base<T>() was deprecated in favor of specifying 'T' as a template argument to class_")
base() { }
};
/// Keep patient alive while nurse lives
template <size_t Nurse, size_t Patient> struct keep_alive { };
/// Annotation indicating that a class is involved in a multiple inheritance relationship
struct multiple_inheritance { };
/// Annotation which enables dynamic attributes, i.e. adds `__dict__` to a class
struct dynamic_attr { };
/// Annotation which enables the buffer protocol for a type
struct buffer_protocol { };
/// Annotation which requests that a special metaclass is created for a type
struct metaclass {
handle value;
PYBIND11_DEPRECATED("py::metaclass() is no longer required. It's turned on by default now.")
metaclass() {}
/// Override pybind11's default metaclass
explicit metaclass(handle value) : value(value) { }
};
/// Annotation that marks a class as local to the module:
struct module_local { const bool value; constexpr module_local(bool v = true) : value(v) { } };
/// Annotation to mark enums as an arithmetic type
struct arithmetic { };
/** \rst
A call policy which places one or more guard variables (``Ts...``) around the function call.
For example, this definition:
.. code-block:: cpp
m.def("foo", foo, py::call_guard<T>());
is equivalent to the following pseudocode:
.. code-block:: cpp
m.def("foo", [](args...) {
T scope_guard;
return foo(args...); // forwarded arguments
});
\endrst */
template <typename... Ts> struct call_guard;
template <> struct call_guard<> { using type = detail::void_type; };
template <typename T>
struct call_guard<T> {
static_assert(std::is_default_constructible<T>::value,
"The guard type must be default constructible");
using type = T;
};
template <typename T, typename... Ts>
struct call_guard<T, Ts...> {
struct type {
T guard{}; // Compose multiple guard types with left-to-right default-constructor order
typename call_guard<Ts...>::type next{};
};
};
/// @} annotations
NAMESPACE_BEGIN(detail)
/* Forward declarations */
enum op_id : int;
enum op_type : int;
struct undefined_t;
template <op_id id, op_type ot, typename L = undefined_t, typename R = undefined_t> struct op_;
inline void keep_alive_impl(size_t Nurse, size_t Patient, function_call &call, handle ret);
/// Internal data structure which holds metadata about a keyword argument
struct argument_record {
const char *name; ///< Argument name
const char *descr; ///< Human-readable version of the argument value
handle value; ///< Associated Python object
bool convert : 1; ///< True if the argument is allowed to convert when loading
bool none : 1; ///< True if None is allowed when loading
argument_record(const char *name, const char *descr, handle value, bool convert, bool none)
: name(name), descr(descr), value(value), convert(convert), none(none) { }
};
/// Internal data structure which holds metadata about a bound function (signature, overloads, etc.)
struct function_record {
function_record()
: is_constructor(false), is_new_style_constructor(false), is_stateless(false),
is_operator(false), has_args(false), has_kwargs(false), is_method(false) { }
/// Function name
char *name = nullptr; /* why no C++ strings? They generate heavier code.. */
// User-specified documentation string
char *doc = nullptr;
/// Human-readable version of the function signature
char *signature = nullptr;
/// List of registered keyword arguments
std::vector<argument_record> args;
/// Pointer to lambda function which converts arguments and performs the actual call
handle (*impl) (function_call &) = nullptr;
/// Storage for the wrapped function pointer and captured data, if any
void *data[3] = { };
/// Pointer to custom destructor for 'data' (if needed)
void (*free_data) (function_record *ptr) = nullptr;
/// Return value policy associated with this function
return_value_policy policy = return_value_policy::automatic;
/// True if name == '__init__'
bool is_constructor : 1;
/// True if this is a new-style `__init__` defined in `detail/init.h`
bool is_new_style_constructor : 1;
/// True if this is a stateless function pointer
bool is_stateless : 1;
/// True if this is an operator (__add__), etc.
bool is_operator : 1;
/// True if the function has a '*args' argument
bool has_args : 1;
/// True if the function has a '**kwargs' argument
bool has_kwargs : 1;
/// True if this is a method
bool is_method : 1;
/// Number of arguments (including py::args and/or py::kwargs, if present)
std::uint16_t nargs;
/// Python method object
PyMethodDef *def = nullptr;
/// Python handle to the parent scope (a class or a module)
handle scope;
/// Python handle to the sibling function representing an overload chain
handle sibling;
/// Pointer to next overload
function_record *next = nullptr;
};
/// Special data structure which (temporarily) holds metadata about a bound class
struct type_record {
PYBIND11_NOINLINE type_record()
: multiple_inheritance(false), dynamic_attr(false), buffer_protocol(false),
default_holder(true), module_local(false) { }
/// Handle to the parent scope
handle scope;
/// Name of the class
const char *name = nullptr;
// Pointer to RTTI type_info data structure
const std::type_info *type = nullptr;
/// How large is the underlying C++ type?
size_t type_size = 0;
/// What is the alignment of the underlying C++ type?
size_t type_align = 0;
/// How large is the type's holder?
size_t holder_size = 0;
/// The global operator new can be overridden with a class-specific variant
void *(*operator_new)(size_t) = nullptr;
/// Function pointer to class_<..>::init_instance
void (*init_instance)(instance *, const void *) = nullptr;
/// Function pointer to class_<..>::dealloc
void (*dealloc)(detail::value_and_holder &) = nullptr;
/// List of base classes of the newly created type
list bases;
/// Optional docstring
const char *doc = nullptr;
/// Custom metaclass (optional)
handle metaclass;
/// Multiple inheritance marker
bool multiple_inheritance : 1;
/// Does the class manage a __dict__?
bool dynamic_attr : 1;
/// Does the class implement the buffer protocol?
bool buffer_protocol : 1;
/// Is the default (unique_ptr) holder type used?
bool default_holder : 1;
/// Is the class definition local to the module shared object?
bool module_local : 1;
PYBIND11_NOINLINE void add_base(const std::type_info &base, void *(*caster)(void *)) {
auto base_info = detail::get_type_info(base, false);
if (!base_info) {
std::string tname(base.name());
detail::clean_type_id(tname);
pybind11_fail("generic_type: type \"" + std::string(name) +
"\" referenced unknown base type \"" + tname + "\"");
}
if (default_holder != base_info->default_holder) {
std::string tname(base.name());
detail::clean_type_id(tname);
pybind11_fail("generic_type: type \"" + std::string(name) + "\" " +
(default_holder ? "does not have" : "has") +
" a non-default holder type while its base \"" + tname + "\" " +
(base_info->default_holder ? "does not" : "does"));
}
bases.append((PyObject *) base_info->type);
if (base_info->type->tp_dictoffset != 0)
dynamic_attr = true;
if (caster)
base_info->implicit_casts.emplace_back(type, caster);
}
};
inline function_call::function_call(const function_record &f, handle p) :
func(f), parent(p) {
args.reserve(f.nargs);
args_convert.reserve(f.nargs);
}
/// Tag for a new-style `__init__` defined in `detail/init.h`
struct is_new_style_constructor { };
/**
* Partial template specializations to process custom attributes provided to
* cpp_function_ and class_. These are either used to initialize the respective
* fields in the type_record and function_record data structures or executed at
* runtime to deal with custom call policies (e.g. keep_alive).
*/
template <typename T, typename SFINAE = void> struct process_attribute;
template <typename T> struct process_attribute_default {
/// Default implementation: do nothing
static void init(const T &, function_record *) { }
static void init(const T &, type_record *) { }
static void precall(function_call &) { }
static void postcall(function_call &, handle) { }
};
/// Process an attribute specifying the function's name
template <> struct process_attribute<name> : process_attribute_default<name> {
static void init(const name &n, function_record *r) { r->name = const_cast<char *>(n.value); }
};
/// Process an attribute specifying the function's docstring
template <> struct process_attribute<doc> : process_attribute_default<doc> {
static void init(const doc &n, function_record *r) { r->doc = const_cast<char *>(n.value); }
};
/// Process an attribute specifying the function's docstring (provided as a C-style string)
template <> struct process_attribute<const char *> : process_attribute_default<const char *> {
static void init(const char *d, function_record *r) { r->doc = const_cast<char *>(d); }
static void init(const char *d, type_record *r) { r->doc = const_cast<char *>(d); }
};
template <> struct process_attribute<char *> : process_attribute<const char *> { };
/// Process an attribute indicating the function's return value policy
template <> struct process_attribute<return_value_policy> : process_attribute_default<return_value_policy> {
static void init(const return_value_policy &p, function_record *r) { r->policy = p; }
};
/// Process an attribute which indicates that this is an overloaded function associated with a given sibling
template <> struct process_attribute<sibling> : process_attribute_default<sibling> {
static void init(const sibling &s, function_record *r) { r->sibling = s.value; }
};
/// Process an attribute which indicates that this function is a method
template <> struct process_attribute<is_method> : process_attribute_default<is_method> {
static void init(const is_method &s, function_record *r) { r->is_method = true; r->scope = s.class_; }
};
/// Process an attribute which indicates the parent scope of a method
template <> struct process_attribute<scope> : process_attribute_default<scope> {
static void init(const scope &s, function_record *r) { r->scope = s.value; }
};
/// Process an attribute which indicates that this function is an operator
template <> struct process_attribute<is_operator> : process_attribute_default<is_operator> {
static void init(const is_operator &, function_record *r) { r->is_operator = true; }
};
template <> struct process_attribute<is_new_style_constructor> : process_attribute_default<is_new_style_constructor> {
static void init(const is_new_style_constructor &, function_record *r) { r->is_new_style_constructor = true; }
};
/// Process a keyword argument attribute (*without* a default value)
template <> struct process_attribute<arg> : process_attribute_default<arg> {
static void init(const arg &a, function_record *r) {
if (r->is_method && r->args.empty())
r->args.emplace_back("self", nullptr, handle(), true /*convert*/, false /*none not allowed*/);
r->args.emplace_back(a.name, nullptr, handle(), !a.flag_noconvert, a.flag_none);
}
};
/// Process a keyword argument attribute (*with* a default value)
template <> struct process_attribute<arg_v> : process_attribute_default<arg_v> {
static void init(const arg_v &a, function_record *r) {
if (r->is_method && r->args.empty())
r->args.emplace_back("self", nullptr /*descr*/, handle() /*parent*/, true /*convert*/, false /*none not allowed*/);
if (!a.value) {
#if !defined(NDEBUG)
std::string descr("'");
if (a.name) descr += std::string(a.name) + ": ";
descr += a.type + "'";
if (r->is_method) {
if (r->name)
descr += " in method '" + (std::string) str(r->scope) + "." + (std::string) r->name + "'";
else
descr += " in method of '" + (std::string) str(r->scope) + "'";
} else if (r->name) {
descr += " in function '" + (std::string) r->name + "'";
}
pybind11_fail("arg(): could not convert default argument "
+ descr + " into a Python object (type not registered yet?)");
#else
pybind11_fail("arg(): could not convert default argument "
"into a Python object (type not registered yet?). "
"Compile in debug mode for more information.");
#endif
}
r->args.emplace_back(a.name, a.descr, a.value.inc_ref(), !a.flag_noconvert, a.flag_none);
}
};
/// Process a parent class attribute. Single inheritance only (class_ itself already guarantees that)
template <typename T>
struct process_attribute<T, enable_if_t<is_pyobject<T>::value>> : process_attribute_default<handle> {
static void init(const handle &h, type_record *r) { r->bases.append(h); }
};
/// Process a parent class attribute (deprecated, does not support multiple inheritance)
template <typename T>
struct process_attribute<base<T>> : process_attribute_default<base<T>> {
static void init(const base<T> &, type_record *r) { r->add_base(typeid(T), nullptr); }
};
/// Process a multiple inheritance attribute
template <>
struct process_attribute<multiple_inheritance> : process_attribute_default<multiple_inheritance> {
static void init(const multiple_inheritance &, type_record *r) { r->multiple_inheritance = true; }
};
template <>
struct process_attribute<dynamic_attr> : process_attribute_default<dynamic_attr> {
static void init(const dynamic_attr &, type_record *r) { r->dynamic_attr = true; }
};
template <>
struct process_attribute<buffer_protocol> : process_attribute_default<buffer_protocol> {
static void init(const buffer_protocol &, type_record *r) { r->buffer_protocol = true; }
};
template <>
struct process_attribute<metaclass> : process_attribute_default<metaclass> {
static void init(const metaclass &m, type_record *r) { r->metaclass = m.value; }
};
template <>
struct process_attribute<module_local> : process_attribute_default<module_local> {
static void init(const module_local &l, type_record *r) { r->module_local = l.value; }
};
/// Process an 'arithmetic' attribute for enums (does nothing here)
template <>
struct process_attribute<arithmetic> : process_attribute_default<arithmetic> {};
template <typename... Ts>
struct process_attribute<call_guard<Ts...>> : process_attribute_default<call_guard<Ts...>> { };
/**
* Process a keep_alive call policy -- invokes keep_alive_impl during the
* pre-call handler if both Nurse and Patient are nonzero, and during the
* post-call handler otherwise
*/
template <size_t Nurse, size_t Patient> struct process_attribute<keep_alive<Nurse, Patient>> : public process_attribute_default<keep_alive<Nurse, Patient>> {
template <size_t N = Nurse, size_t P = Patient, enable_if_t<N != 0 && P != 0, int> = 0>
static void precall(function_call &call) { keep_alive_impl(Nurse, Patient, call, handle()); }
template <size_t N = Nurse, size_t P = Patient, enable_if_t<N != 0 && P != 0, int> = 0>
static void postcall(function_call &, handle) { }
template <size_t N = Nurse, size_t P = Patient, enable_if_t<N == 0 || P == 0, int> = 0>
static void precall(function_call &) { }
template <size_t N = Nurse, size_t P = Patient, enable_if_t<N == 0 || P == 0, int> = 0>
static void postcall(function_call &call, handle ret) { keep_alive_impl(Nurse, Patient, call, ret); }
};
/// Recursively iterate over variadic template arguments
template <typename... Args> struct process_attributes {
static void init(const Args&... args, function_record *r) {
int unused[] = { 0, (process_attribute<typename std::decay<Args>::type>::init(args, r), 0) ... };
ignore_unused(unused);
}
static void init(const Args&... args, type_record *r) {
int unused[] = { 0, (process_attribute<typename std::decay<Args>::type>::init(args, r), 0) ... };
ignore_unused(unused);
}
static void precall(function_call &call) {
int unused[] = { 0, (process_attribute<typename std::decay<Args>::type>::precall(call), 0) ... };
ignore_unused(unused);
}
static void postcall(function_call &call, handle fn_ret) {
int unused[] = { 0, (process_attribute<typename std::decay<Args>::type>::postcall(call, fn_ret), 0) ... };
ignore_unused(unused);
}
};
template <typename T>
using is_call_guard = is_instantiation<call_guard, T>;
/// Extract the ``type`` from the first `call_guard` in `Extras...` (or `void_type` if none found)
template <typename... Extra>
using extract_guard_t = typename exactly_one_t<is_call_guard, call_guard<>, Extra...>::type;
/// Check the number of named arguments at compile time
template <typename... Extra,
size_t named = constexpr_sum(std::is_base_of<arg, Extra>::value...),
size_t self = constexpr_sum(std::is_same<is_method, Extra>::value...)>
constexpr bool expected_num_args(size_t nargs, bool has_args, bool has_kwargs) {
return named == 0 || (self + named + has_args + has_kwargs) == nargs;
}
NAMESPACE_END(detail)
NAMESPACE_END(PYBIND11_NAMESPACE)
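The machinery above is exercised indirectly whenever extra arguments are passed to def(). A minimal sketch, assuming a hypothetical module `example` and a made-up `scale` function that are not part of this repository:
#include <pybind11/pybind11.h>
namespace py = pybind11;
PYBIND11_MODULE(example, m) {
    // Each annotation below is routed through one process_attribute<> specialization
    // and stored in the function_record that def() builds:
    m.def("scale", [](double x, double factor) { return x * factor; },
          "Multiply x by factor",                 // const char *        -> function_record::doc
          py::arg("x"), py::arg("factor") = 2.0,  // arg / arg_v         -> function_record::args
          py::return_value_policy::copy);         // return_value_policy -> function_record::policy
    // Call policies such as py::keep_alive<1, 2>() are not stored as data; their
    // precall/postcall hooks run around every invocation instead.
}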

View File

@ -0,0 +1,114 @@
/*
pybind11/buffer_info.h: Python buffer object interface
Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/
#pragma once
#include "detail/common.h"
NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
/// Information record describing a Python buffer object
struct buffer_info {
void *ptr = nullptr; // Pointer to the underlying storage
ssize_t itemsize = 0; // Size of individual items in bytes
ssize_t size = 0; // Total number of entries
std::string format; // For homogeneous buffers, this should be set to format_descriptor<T>::format()
ssize_t ndim = 0; // Number of dimensions
std::vector<ssize_t> shape; // Shape of the tensor (1 entry per dimension)
std::vector<ssize_t> strides; // Number of bytes between adjacent entries (one entry per dimension)
bool readonly = false; // flag to indicate if the underlying storage may be written to
buffer_info() { }
buffer_info(void *ptr, ssize_t itemsize, const std::string &format, ssize_t ndim,
detail::any_container<ssize_t> shape_in, detail::any_container<ssize_t> strides_in, bool readonly=false)
: ptr(ptr), itemsize(itemsize), size(1), format(format), ndim(ndim),
shape(std::move(shape_in)), strides(std::move(strides_in)), readonly(readonly) {
if (ndim != (ssize_t) shape.size() || ndim != (ssize_t) strides.size())
pybind11_fail("buffer_info: ndim doesn't match shape and/or strides length");
for (size_t i = 0; i < (size_t) ndim; ++i)
size *= shape[i];
}
template <typename T>
buffer_info(T *ptr, detail::any_container<ssize_t> shape_in, detail::any_container<ssize_t> strides_in, bool readonly=false)
: buffer_info(private_ctr_tag(), ptr, sizeof(T), format_descriptor<T>::format(), static_cast<ssize_t>(shape_in->size()), std::move(shape_in), std::move(strides_in), readonly) { }
buffer_info(void *ptr, ssize_t itemsize, const std::string &format, ssize_t size, bool readonly=false)
: buffer_info(ptr, itemsize, format, 1, {size}, {itemsize}, readonly) { }
template <typename T>
buffer_info(T *ptr, ssize_t size, bool readonly=false)
: buffer_info(ptr, sizeof(T), format_descriptor<T>::format(), size, readonly) { }
template <typename T>
buffer_info(const T *ptr, ssize_t size, bool readonly=true)
: buffer_info(const_cast<T*>(ptr), sizeof(T), format_descriptor<T>::format(), size, readonly) { }
explicit buffer_info(Py_buffer *view, bool ownview = true)
: buffer_info(view->buf, view->itemsize, view->format, view->ndim,
{view->shape, view->shape + view->ndim}, {view->strides, view->strides + view->ndim}, view->readonly) {
this->view = view;
this->ownview = ownview;
}
buffer_info(const buffer_info &) = delete;
buffer_info& operator=(const buffer_info &) = delete;
buffer_info(buffer_info &&other) {
(*this) = std::move(other);
}
buffer_info& operator=(buffer_info &&rhs) {
ptr = rhs.ptr;
itemsize = rhs.itemsize;
size = rhs.size;
format = std::move(rhs.format);
ndim = rhs.ndim;
shape = std::move(rhs.shape);
strides = std::move(rhs.strides);
std::swap(view, rhs.view);
std::swap(ownview, rhs.ownview);
readonly = rhs.readonly;
return *this;
}
~buffer_info() {
if (view && ownview) { PyBuffer_Release(view); delete view; }
}
private:
struct private_ctr_tag { };
buffer_info(private_ctr_tag, void *ptr, ssize_t itemsize, const std::string &format, ssize_t ndim,
detail::any_container<ssize_t> &&shape_in, detail::any_container<ssize_t> &&strides_in, bool readonly)
: buffer_info(ptr, itemsize, format, ndim, std::move(shape_in), std::move(strides_in), readonly) { }
Py_buffer *view = nullptr;
bool ownview = false;
};
NAMESPACE_BEGIN(detail)
template <typename T, typename SFINAE = void> struct compare_buffer_info {
static bool compare(const buffer_info& b) {
return b.format == format_descriptor<T>::format() && b.itemsize == (ssize_t) sizeof(T);
}
};
template <typename T> struct compare_buffer_info<T, detail::enable_if_t<std::is_integral<T>::value>> {
static bool compare(const buffer_info& b) {
return (size_t) b.itemsize == sizeof(T) && (b.format == format_descriptor<T>::value ||
((sizeof(T) == sizeof(long)) && b.format == (std::is_unsigned<T>::value ? "L" : "l")) ||
((sizeof(T) == sizeof(size_t)) && b.format == (std::is_unsigned<T>::value ? "N" : "n")));
}
};
NAMESPACE_END(detail)
NAMESPACE_END(PYBIND11_NAMESPACE)
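A usage sketch of how a buffer_info is typically produced, assuming a hypothetical `Matrix` type defined here only for illustration (it does not exist in this repository):
#include <pybind11/pybind11.h>
#include <vector>
namespace py = pybind11;
struct Matrix {
    Matrix(py::ssize_t r, py::ssize_t c) : rows_(r), cols_(c), data_(static_cast<size_t>(r * c)) {}
    float *data() { return data_.data(); }
    py::ssize_t rows() const { return rows_; }
    py::ssize_t cols() const { return cols_; }
private:
    py::ssize_t rows_, cols_;
    std::vector<float> data_;
};
PYBIND11_MODULE(example, m) {
    py::class_<Matrix>(m, "Matrix", py::buffer_protocol())
        .def(py::init<py::ssize_t, py::ssize_t>())
        .def_buffer([](Matrix &mat) -> py::buffer_info {
            return py::buffer_info(
                mat.data(),                              // pointer to the underlying storage
                sizeof(float),                           // itemsize
                py::format_descriptor<float>::format(),  // format string ("f")
                2,                                       // ndim
                { mat.rows(), mat.cols() },              // shape
                { sizeof(float) * mat.cols(),            // strides for row-major layout
                  sizeof(float) });
        });
}
With this in place, something like numpy.array(example.Matrix(3, 4), copy=False) can view the C++ storage directly.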

File diff suppressed because it is too large

View File

@ -0,0 +1,184 @@
/*
pybind11/chrono.h: Transparent conversion between std::chrono and python's datetime
Copyright (c) 2016 Trent Houliston <trent@houliston.me> and
Wenzel Jakob <wenzel.jakob@epfl.ch>
All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/
#pragma once
#include "pybind11.h"
#include <cmath>
#include <ctime>
#include <chrono>
#include <datetime.h>
// Backport the PyDateTime_DELTA functions from Python3.3 if required
#ifndef PyDateTime_DELTA_GET_DAYS
#define PyDateTime_DELTA_GET_DAYS(o) (((PyDateTime_Delta*)o)->days)
#endif
#ifndef PyDateTime_DELTA_GET_SECONDS
#define PyDateTime_DELTA_GET_SECONDS(o) (((PyDateTime_Delta*)o)->seconds)
#endif
#ifndef PyDateTime_DELTA_GET_MICROSECONDS
#define PyDateTime_DELTA_GET_MICROSECONDS(o) (((PyDateTime_Delta*)o)->microseconds)
#endif
NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
NAMESPACE_BEGIN(detail)
template <typename type> class duration_caster {
public:
typedef typename type::rep rep;
typedef typename type::period period;
typedef std::chrono::duration<uint_fast32_t, std::ratio<86400>> days;
bool load(handle src, bool) {
using namespace std::chrono;
// Lazy initialise the PyDateTime import
if (!PyDateTimeAPI) { PyDateTime_IMPORT; }
if (!src) return false;
// If invoked with datetime.delta object
if (PyDelta_Check(src.ptr())) {
value = type(duration_cast<duration<rep, period>>(
days(PyDateTime_DELTA_GET_DAYS(src.ptr()))
+ seconds(PyDateTime_DELTA_GET_SECONDS(src.ptr()))
+ microseconds(PyDateTime_DELTA_GET_MICROSECONDS(src.ptr()))));
return true;
}
// If invoked with a float we assume it is seconds and convert
else if (PyFloat_Check(src.ptr())) {
value = type(duration_cast<duration<rep, period>>(duration<double>(PyFloat_AsDouble(src.ptr()))));
return true;
}
else return false;
}
// If this is a duration just return it back
static const std::chrono::duration<rep, period>& get_duration(const std::chrono::duration<rep, period> &src) {
return src;
}
// If this is a time_point get the time_since_epoch
template <typename Clock> static std::chrono::duration<rep, period> get_duration(const std::chrono::time_point<Clock, std::chrono::duration<rep, period>> &src) {
return src.time_since_epoch();
}
static handle cast(const type &src, return_value_policy /* policy */, handle /* parent */) {
using namespace std::chrono;
// Use overloaded function to get our duration from our source
// Works out if it is a duration or time_point and get the duration
auto d = get_duration(src);
// Lazy initialise the PyDateTime import
if (!PyDateTimeAPI) { PyDateTime_IMPORT; }
// Declare these special duration types so the conversions happen with the correct primitive types (int)
using dd_t = duration<int, std::ratio<86400>>;
using ss_t = duration<int, std::ratio<1>>;
using us_t = duration<int, std::micro>;
auto dd = duration_cast<dd_t>(d);
auto subd = d - dd;
auto ss = duration_cast<ss_t>(subd);
auto us = duration_cast<us_t>(subd - ss);
return PyDelta_FromDSU(dd.count(), ss.count(), us.count());
}
PYBIND11_TYPE_CASTER(type, _("datetime.timedelta"));
};
// This is for casting times on the system clock into datetime.datetime instances
template <typename Duration> class type_caster<std::chrono::time_point<std::chrono::system_clock, Duration>> {
public:
typedef std::chrono::time_point<std::chrono::system_clock, Duration> type;
bool load(handle src, bool) {
using namespace std::chrono;
// Lazy initialise the PyDateTime import
if (!PyDateTimeAPI) { PyDateTime_IMPORT; }
if (!src) return false;
std::tm cal;
microseconds msecs;
if (PyDateTime_Check(src.ptr())) {
cal.tm_sec = PyDateTime_DATE_GET_SECOND(src.ptr());
cal.tm_min = PyDateTime_DATE_GET_MINUTE(src.ptr());
cal.tm_hour = PyDateTime_DATE_GET_HOUR(src.ptr());
cal.tm_mday = PyDateTime_GET_DAY(src.ptr());
cal.tm_mon = PyDateTime_GET_MONTH(src.ptr()) - 1;
cal.tm_year = PyDateTime_GET_YEAR(src.ptr()) - 1900;
cal.tm_isdst = -1;
msecs = microseconds(PyDateTime_DATE_GET_MICROSECOND(src.ptr()));
} else if (PyDate_Check(src.ptr())) {
cal.tm_sec = 0;
cal.tm_min = 0;
cal.tm_hour = 0;
cal.tm_mday = PyDateTime_GET_DAY(src.ptr());
cal.tm_mon = PyDateTime_GET_MONTH(src.ptr()) - 1;
cal.tm_year = PyDateTime_GET_YEAR(src.ptr()) - 1900;
cal.tm_isdst = -1;
msecs = microseconds(0);
} else if (PyTime_Check(src.ptr())) {
cal.tm_sec = PyDateTime_TIME_GET_SECOND(src.ptr());
cal.tm_min = PyDateTime_TIME_GET_MINUTE(src.ptr());
cal.tm_hour = PyDateTime_TIME_GET_HOUR(src.ptr());
cal.tm_mday = 1; // This date (day, month, year) = (1, 0, 70)
cal.tm_mon = 0; // represents 1-Jan-1970, which is the
cal.tm_year = 70; // earliest date available for Python's datetime
cal.tm_isdst = -1;
msecs = microseconds(PyDateTime_TIME_GET_MICROSECOND(src.ptr()));
}
else return false;
value = system_clock::from_time_t(std::mktime(&cal)) + msecs;
return true;
}
static handle cast(const std::chrono::time_point<std::chrono::system_clock, Duration> &src, return_value_policy /* policy */, handle /* parent */) {
using namespace std::chrono;
// Lazy initialise the PyDateTime import
if (!PyDateTimeAPI) { PyDateTime_IMPORT; }
std::time_t tt = system_clock::to_time_t(time_point_cast<system_clock::duration>(src));
// this function uses static memory so it's best to copy it out asap just in case
// otherwise other code that is using localtime may break this (not just python code)
std::tm localtime = *std::localtime(&tt);
// Declare these special duration types so the conversions happen with the correct primitive types (int)
using us_t = duration<int, std::micro>;
return PyDateTime_FromDateAndTime(localtime.tm_year + 1900,
localtime.tm_mon + 1,
localtime.tm_mday,
localtime.tm_hour,
localtime.tm_min,
localtime.tm_sec,
(duration_cast<us_t>(src.time_since_epoch() % seconds(1))).count());
}
PYBIND11_TYPE_CASTER(type, _("datetime.datetime"));
};
// Other clocks that are not the system clock are not measured as datetime.datetime objects
// since they are not measured on calendar time. So instead we just make them timedeltas
// Or if they have passed us a time as a float we convert that
template <typename Clock, typename Duration> class type_caster<std::chrono::time_point<Clock, Duration>>
: public duration_caster<std::chrono::time_point<Clock, Duration>> {
};
template <typename Rep, typename Period> class type_caster<std::chrono::duration<Rep, Period>>
: public duration_caster<std::chrono::duration<Rep, Period>> {
};
NAMESPACE_END(detail)
NAMESPACE_END(PYBIND11_NAMESPACE)
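A brief sketch of what these casters enable, assuming a hypothetical module `example` (the bound function name is made up):
#include <pybind11/chrono.h>
#include <chrono>
namespace py = pybind11;
PYBIND11_MODULE(example, m) {
    // A datetime.timedelta -- or a plain float, interpreted as seconds -- converts into
    // the C++ duration, and the returned duration converts back to datetime.timedelta.
    m.def("double_delay", [](std::chrono::milliseconds d) { return 2 * d; });
}
From Python, example.double_delay(datetime.timedelta(milliseconds=250)) and example.double_delay(0.25) should both return a timedelta of 500 milliseconds.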

View File

@ -0,0 +1,2 @@
#include "detail/common.h"
#warning "Including 'common.h' is deprecated. It will be removed in v3.0. Use 'pybind11.h'."

View File

@ -0,0 +1,65 @@
/*
pybind11/complex.h: Complex number support
Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/
#pragma once
#include "pybind11.h"
#include <complex>
/// glibc defines I as a macro which breaks things, e.g., boost template names
#ifdef I
# undef I
#endif
NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
template <typename T> struct format_descriptor<std::complex<T>, detail::enable_if_t<std::is_floating_point<T>::value>> {
static constexpr const char c = format_descriptor<T>::c;
static constexpr const char value[3] = { 'Z', c, '\0' };
static std::string format() { return std::string(value); }
};
#ifndef PYBIND11_CPP17
template <typename T> constexpr const char format_descriptor<
std::complex<T>, detail::enable_if_t<std::is_floating_point<T>::value>>::value[3];
#endif
NAMESPACE_BEGIN(detail)
template <typename T> struct is_fmt_numeric<std::complex<T>, detail::enable_if_t<std::is_floating_point<T>::value>> {
static constexpr bool value = true;
static constexpr int index = is_fmt_numeric<T>::index + 3;
};
template <typename T> class type_caster<std::complex<T>> {
public:
bool load(handle src, bool convert) {
if (!src)
return false;
if (!convert && !PyComplex_Check(src.ptr()))
return false;
Py_complex result = PyComplex_AsCComplex(src.ptr());
if (result.real == -1.0 && PyErr_Occurred()) {
PyErr_Clear();
return false;
}
value = std::complex<T>((T) result.real, (T) result.imag);
return true;
}
static handle cast(const std::complex<T> &src, return_value_policy /* policy */, handle /* parent */) {
return PyComplex_FromDoubles((double) src.real(), (double) src.imag());
}
PYBIND11_TYPE_CASTER(std::complex<T>, _("complex"));
};
NAMESPACE_END(detail)
NAMESPACE_END(PYBIND11_NAMESPACE)
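A minimal sketch, again with a hypothetical module name, showing the round trip this caster provides:
#include <pybind11/complex.h>
#include <complex>
namespace py = pybind11;
PYBIND11_MODULE(example, m) {
    // A Python complex arrives as std::complex<double>; the std::complex result is
    // returned as a Python complex (e.g. conjugate(1 + 2j) -> (1 - 2j)).
    m.def("conjugate", [](std::complex<double> z) { return std::conj(z); });
}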

View File

@ -0,0 +1,639 @@
/*
pybind11/detail/class.h: Python C API implementation details for py::class_
Copyright (c) 2017 Wenzel Jakob <wenzel.jakob@epfl.ch>
All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/
#pragma once
#include "../attr.h"
#include "../options.h"
NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
NAMESPACE_BEGIN(detail)
#if PY_VERSION_HEX >= 0x03030000
# define PYBIND11_BUILTIN_QUALNAME
# define PYBIND11_SET_OLDPY_QUALNAME(obj, nameobj)
#else
// In pre-3.3 Python, we still set __qualname__ so that we can produce reliable function type
// signatures; in 3.3+ this macro expands to nothing:
# define PYBIND11_SET_OLDPY_QUALNAME(obj, nameobj) setattr((PyObject *) obj, "__qualname__", nameobj)
#endif
inline PyTypeObject *type_incref(PyTypeObject *type) {
Py_INCREF(type);
return type;
}
#if !defined(PYPY_VERSION)
/// `pybind11_static_property.__get__()`: Always pass the class instead of the instance.
extern "C" inline PyObject *pybind11_static_get(PyObject *self, PyObject * /*ob*/, PyObject *cls) {
return PyProperty_Type.tp_descr_get(self, cls, cls);
}
/// `pybind11_static_property.__set__()`: Just like the above `__get__()`.
extern "C" inline int pybind11_static_set(PyObject *self, PyObject *obj, PyObject *value) {
PyObject *cls = PyType_Check(obj) ? obj : (PyObject *) Py_TYPE(obj);
return PyProperty_Type.tp_descr_set(self, cls, value);
}
/** A `static_property` is the same as a `property` but the `__get__()` and `__set__()`
methods are modified to always use the object type instead of a concrete instance.
Return value: New reference. */
inline PyTypeObject *make_static_property_type() {
constexpr auto *name = "pybind11_static_property";
auto name_obj = reinterpret_steal<object>(PYBIND11_FROM_STRING(name));
/* Danger zone: from now (and until PyType_Ready), make sure to
issue no Python C API calls which could potentially invoke the
garbage collector (the GC will call type_traverse(), which will in
turn find the newly constructed type in an invalid state) */
auto heap_type = (PyHeapTypeObject *) PyType_Type.tp_alloc(&PyType_Type, 0);
if (!heap_type)
pybind11_fail("make_static_property_type(): error allocating type!");
heap_type->ht_name = name_obj.inc_ref().ptr();
#ifdef PYBIND11_BUILTIN_QUALNAME
heap_type->ht_qualname = name_obj.inc_ref().ptr();
#endif
auto type = &heap_type->ht_type;
type->tp_name = name;
type->tp_base = type_incref(&PyProperty_Type);
type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
type->tp_descr_get = pybind11_static_get;
type->tp_descr_set = pybind11_static_set;
if (PyType_Ready(type) < 0)
pybind11_fail("make_static_property_type(): failure in PyType_Ready()!");
setattr((PyObject *) type, "__module__", str("pybind11_builtins"));
PYBIND11_SET_OLDPY_QUALNAME(type, name_obj);
return type;
}
#else // PYPY
/** PyPy has some issues with the above C API, so we evaluate Python code instead.
This function will only be called once so performance isn't really a concern.
Return value: New reference. */
inline PyTypeObject *make_static_property_type() {
auto d = dict();
PyObject *result = PyRun_String(R"(\
class pybind11_static_property(property):
def __get__(self, obj, cls):
return property.__get__(self, cls, cls)
def __set__(self, obj, value):
cls = obj if isinstance(obj, type) else type(obj)
property.__set__(self, cls, value)
)", Py_file_input, d.ptr(), d.ptr()
);
if (result == nullptr)
throw error_already_set();
Py_DECREF(result);
return (PyTypeObject *) d["pybind11_static_property"].cast<object>().release().ptr();
}
#endif // PYPY
/** Types with static properties need to handle `Type.static_prop = x` in a specific way.
By default, Python replaces the `static_property` itself, but for wrapped C++ types
we need to call `static_property.__set__()` in order to propagate the new value to
the underlying C++ data structure. */
extern "C" inline int pybind11_meta_setattro(PyObject* obj, PyObject* name, PyObject* value) {
// Use `_PyType_Lookup()` instead of `PyObject_GetAttr()` in order to get the raw
// descriptor (`property`) instead of calling `tp_descr_get` (`property.__get__()`).
PyObject *descr = _PyType_Lookup((PyTypeObject *) obj, name);
// The following assignment combinations are possible:
// 1. `Type.static_prop = value` --> descr_set: `Type.static_prop.__set__(value)`
// 2. `Type.static_prop = other_static_prop` --> setattro: replace existing `static_prop`
// 3. `Type.regular_attribute = value` --> setattro: regular attribute assignment
const auto static_prop = (PyObject *) get_internals().static_property_type;
const auto call_descr_set = descr && PyObject_IsInstance(descr, static_prop)
&& !PyObject_IsInstance(value, static_prop);
if (call_descr_set) {
// Call `static_property.__set__()` instead of replacing the `static_property`.
#if !defined(PYPY_VERSION)
return Py_TYPE(descr)->tp_descr_set(descr, obj, value);
#else
if (PyObject *result = PyObject_CallMethod(descr, "__set__", "OO", obj, value)) {
Py_DECREF(result);
return 0;
} else {
return -1;
}
#endif
} else {
// Replace existing attribute.
return PyType_Type.tp_setattro(obj, name, value);
}
}
#if PY_MAJOR_VERSION >= 3
/**
* Python 3's PyInstanceMethod_Type hides itself via its tp_descr_get, which prevents aliasing
* methods via cls.attr("m2") = cls.attr("m1"): instead the tp_descr_get returns a plain function,
* when called on a class, or a PyMethod, when called on an instance. Override that behaviour here
* to do a special case bypass for PyInstanceMethod_Types.
*/
extern "C" inline PyObject *pybind11_meta_getattro(PyObject *obj, PyObject *name) {
PyObject *descr = _PyType_Lookup((PyTypeObject *) obj, name);
if (descr && PyInstanceMethod_Check(descr)) {
Py_INCREF(descr);
return descr;
}
else {
return PyType_Type.tp_getattro(obj, name);
}
}
#endif
/** This metaclass is assigned by default to all pybind11 types and is required in order
for static properties to function correctly. Users may override this using `py::metaclass`.
Return value: New reference. */
inline PyTypeObject* make_default_metaclass() {
constexpr auto *name = "pybind11_type";
auto name_obj = reinterpret_steal<object>(PYBIND11_FROM_STRING(name));
/* Danger zone: from now (and until PyType_Ready), make sure to
issue no Python C API calls which could potentially invoke the
garbage collector (the GC will call type_traverse(), which will in
turn find the newly constructed type in an invalid state) */
auto heap_type = (PyHeapTypeObject *) PyType_Type.tp_alloc(&PyType_Type, 0);
if (!heap_type)
pybind11_fail("make_default_metaclass(): error allocating metaclass!");
heap_type->ht_name = name_obj.inc_ref().ptr();
#ifdef PYBIND11_BUILTIN_QUALNAME
heap_type->ht_qualname = name_obj.inc_ref().ptr();
#endif
auto type = &heap_type->ht_type;
type->tp_name = name;
type->tp_base = type_incref(&PyType_Type);
type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
type->tp_setattro = pybind11_meta_setattro;
#if PY_MAJOR_VERSION >= 3
type->tp_getattro = pybind11_meta_getattro;
#endif
if (PyType_Ready(type) < 0)
pybind11_fail("make_default_metaclass(): failure in PyType_Ready()!");
setattr((PyObject *) type, "__module__", str("pybind11_builtins"));
PYBIND11_SET_OLDPY_QUALNAME(type, name_obj);
return type;
}
/// For multiple inheritance types we need to recursively register/deregister base pointers for any
/// base classes with pointers that are different from the instance value pointer so that we can
/// correctly recognize an offset base class pointer. This calls a function with any offset base ptrs.
inline void traverse_offset_bases(void *valueptr, const detail::type_info *tinfo, instance *self,
bool (*f)(void * /*parentptr*/, instance * /*self*/)) {
for (handle h : reinterpret_borrow<tuple>(tinfo->type->tp_bases)) {
if (auto parent_tinfo = get_type_info((PyTypeObject *) h.ptr())) {
for (auto &c : parent_tinfo->implicit_casts) {
if (c.first == tinfo->cpptype) {
auto *parentptr = c.second(valueptr);
if (parentptr != valueptr)
f(parentptr, self);
traverse_offset_bases(parentptr, parent_tinfo, self, f);
break;
}
}
}
}
}
inline bool register_instance_impl(void *ptr, instance *self) {
get_internals().registered_instances.emplace(ptr, self);
return true; // unused, but gives the same signature as the deregister func
}
inline bool deregister_instance_impl(void *ptr, instance *self) {
auto &registered_instances = get_internals().registered_instances;
auto range = registered_instances.equal_range(ptr);
for (auto it = range.first; it != range.second; ++it) {
if (Py_TYPE(self) == Py_TYPE(it->second)) {
registered_instances.erase(it);
return true;
}
}
return false;
}
inline void register_instance(instance *self, void *valptr, const type_info *tinfo) {
register_instance_impl(valptr, self);
if (!tinfo->simple_ancestors)
traverse_offset_bases(valptr, tinfo, self, register_instance_impl);
}
inline bool deregister_instance(instance *self, void *valptr, const type_info *tinfo) {
bool ret = deregister_instance_impl(valptr, self);
if (!tinfo->simple_ancestors)
traverse_offset_bases(valptr, tinfo, self, deregister_instance_impl);
return ret;
}
/// Instance creation function for all pybind11 types. It allocates the internal instance layout for
/// holding C++ objects and holders. Allocation is done lazily (the first time the instance is cast
/// to a reference or pointer), and initialization is done by an `__init__` function.
inline PyObject *make_new_instance(PyTypeObject *type) {
#if defined(PYPY_VERSION)
// PyPy gets tp_basicsize wrong (issue 2482) under multiple inheritance when the first inherited
// object is a plain Python type (i.e. not derived from an extension type). Fix it.
ssize_t instance_size = static_cast<ssize_t>(sizeof(instance));
if (type->tp_basicsize < instance_size) {
type->tp_basicsize = instance_size;
}
#endif
PyObject *self = type->tp_alloc(type, 0);
auto inst = reinterpret_cast<instance *>(self);
// Allocate the value/holder internals:
inst->allocate_layout();
inst->owned = true;
return self;
}
/// Instance creation function for all pybind11 types. It only allocates space for the
/// C++ object, but doesn't call the constructor -- an `__init__` function must do that.
extern "C" inline PyObject *pybind11_object_new(PyTypeObject *type, PyObject *, PyObject *) {
return make_new_instance(type);
}
/// An `__init__` function constructs the C++ object. Users should provide at least one
/// of these using `py::init` or directly with `.def(__init__, ...)`. Otherwise, the
/// following default function will be used which simply throws an exception.
extern "C" inline int pybind11_object_init(PyObject *self, PyObject *, PyObject *) {
PyTypeObject *type = Py_TYPE(self);
std::string msg;
#if defined(PYPY_VERSION)
msg += handle((PyObject *) type).attr("__module__").cast<std::string>() + ".";
#endif
msg += type->tp_name;
msg += ": No constructor defined!";
PyErr_SetString(PyExc_TypeError, msg.c_str());
return -1;
}
inline void add_patient(PyObject *nurse, PyObject *patient) {
auto &internals = get_internals();
auto instance = reinterpret_cast<detail::instance *>(nurse);
instance->has_patients = true;
Py_INCREF(patient);
internals.patients[nurse].push_back(patient);
}
inline void clear_patients(PyObject *self) {
auto instance = reinterpret_cast<detail::instance *>(self);
auto &internals = get_internals();
auto pos = internals.patients.find(self);
assert(pos != internals.patients.end());
// Clearing the patients can cause more Python code to run, which
// can invalidate the iterator. Extract the vector of patients
// from the unordered_map first.
auto patients = std::move(pos->second);
internals.patients.erase(pos);
instance->has_patients = false;
for (PyObject *&patient : patients)
Py_CLEAR(patient);
}
/// Clears all internal data from the instance and removes it from registered instances in
/// preparation for deallocation.
inline void clear_instance(PyObject *self) {
auto instance = reinterpret_cast<detail::instance *>(self);
// Deallocate any values/holders, if present:
for (auto &v_h : values_and_holders(instance)) {
if (v_h) {
// We have to deregister before we call dealloc because, for virtual MI types, we still
// need to be able to get the parent pointers.
if (v_h.instance_registered() && !deregister_instance(instance, v_h.value_ptr(), v_h.type))
pybind11_fail("pybind11_object_dealloc(): Tried to deallocate unregistered instance!");
if (instance->owned || v_h.holder_constructed())
v_h.type->dealloc(v_h);
}
}
// Deallocate the value/holder layout internals:
instance->deallocate_layout();
if (instance->weakrefs)
PyObject_ClearWeakRefs(self);
PyObject **dict_ptr = _PyObject_GetDictPtr(self);
if (dict_ptr)
Py_CLEAR(*dict_ptr);
if (instance->has_patients)
clear_patients(self);
}
/// Instance destructor function for all pybind11 types. It calls `type_info.dealloc`
/// to destroy the C++ object itself, while the rest is Python bookkeeping.
extern "C" inline void pybind11_object_dealloc(PyObject *self) {
clear_instance(self);
auto type = Py_TYPE(self);
type->tp_free(self);
#if PY_VERSION_HEX < 0x03080000
// `type->tp_dealloc != pybind11_object_dealloc` means that we're being called
// as part of a derived type's dealloc, in which case we're not allowed to decref
// the type here. For cross-module compatibility, we shouldn't compare directly
// with `pybind11_object_dealloc`, but with the common one stashed in internals.
auto pybind11_object_type = (PyTypeObject *) get_internals().instance_base;
if (type->tp_dealloc == pybind11_object_type->tp_dealloc)
Py_DECREF(type);
#else
// This was not needed before Python 3.8 (Python issue 35810)
// https://github.com/pybind/pybind11/issues/1946
Py_DECREF(type);
#endif
}
/** Create the type which can be used as a common base for all classes. This is
needed in order to satisfy Python's requirements for multiple inheritance.
Return value: New reference. */
inline PyObject *make_object_base_type(PyTypeObject *metaclass) {
constexpr auto *name = "pybind11_object";
auto name_obj = reinterpret_steal<object>(PYBIND11_FROM_STRING(name));
/* Danger zone: from now (and until PyType_Ready), make sure to
issue no Python C API calls which could potentially invoke the
garbage collector (the GC will call type_traverse(), which will in
turn find the newly constructed type in an invalid state) */
auto heap_type = (PyHeapTypeObject *) metaclass->tp_alloc(metaclass, 0);
if (!heap_type)
pybind11_fail("make_object_base_type(): error allocating type!");
heap_type->ht_name = name_obj.inc_ref().ptr();
#ifdef PYBIND11_BUILTIN_QUALNAME
heap_type->ht_qualname = name_obj.inc_ref().ptr();
#endif
auto type = &heap_type->ht_type;
type->tp_name = name;
type->tp_base = type_incref(&PyBaseObject_Type);
type->tp_basicsize = static_cast<ssize_t>(sizeof(instance));
type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
type->tp_new = pybind11_object_new;
type->tp_init = pybind11_object_init;
type->tp_dealloc = pybind11_object_dealloc;
/* Support weak references (needed for the keep_alive feature) */
type->tp_weaklistoffset = offsetof(instance, weakrefs);
if (PyType_Ready(type) < 0)
pybind11_fail("PyType_Ready failed in make_object_base_type():" + error_string());
setattr((PyObject *) type, "__module__", str("pybind11_builtins"));
PYBIND11_SET_OLDPY_QUALNAME(type, name_obj);
assert(!PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC));
return (PyObject *) heap_type;
}
/// dynamic_attr: Support for `d = instance.__dict__`.
extern "C" inline PyObject *pybind11_get_dict(PyObject *self, void *) {
PyObject *&dict = *_PyObject_GetDictPtr(self);
if (!dict)
dict = PyDict_New();
Py_XINCREF(dict);
return dict;
}
/// dynamic_attr: Support for `instance.__dict__ = dict()`.
extern "C" inline int pybind11_set_dict(PyObject *self, PyObject *new_dict, void *) {
if (!PyDict_Check(new_dict)) {
PyErr_Format(PyExc_TypeError, "__dict__ must be set to a dictionary, not a '%.200s'",
Py_TYPE(new_dict)->tp_name);
return -1;
}
PyObject *&dict = *_PyObject_GetDictPtr(self);
Py_INCREF(new_dict);
Py_CLEAR(dict);
dict = new_dict;
return 0;
}
/// dynamic_attr: Allow the garbage collector to traverse the internal instance `__dict__`.
extern "C" inline int pybind11_traverse(PyObject *self, visitproc visit, void *arg) {
PyObject *&dict = *_PyObject_GetDictPtr(self);
Py_VISIT(dict);
return 0;
}
/// dynamic_attr: Allow the GC to clear the dictionary.
extern "C" inline int pybind11_clear(PyObject *self) {
PyObject *&dict = *_PyObject_GetDictPtr(self);
Py_CLEAR(dict);
return 0;
}
/// Give instances of this type a `__dict__` and opt into garbage collection.
inline void enable_dynamic_attributes(PyHeapTypeObject *heap_type) {
auto type = &heap_type->ht_type;
#if defined(PYPY_VERSION)
pybind11_fail(std::string(type->tp_name) + ": dynamic attributes are "
"currently not supported in "
"conjunction with PyPy!");
#endif
type->tp_flags |= Py_TPFLAGS_HAVE_GC;
type->tp_dictoffset = type->tp_basicsize; // place dict at the end
type->tp_basicsize += (ssize_t)sizeof(PyObject *); // and allocate enough space for it
type->tp_traverse = pybind11_traverse;
type->tp_clear = pybind11_clear;
static PyGetSetDef getset[] = {
{const_cast<char*>("__dict__"), pybind11_get_dict, pybind11_set_dict, nullptr, nullptr},
{nullptr, nullptr, nullptr, nullptr, nullptr}
};
type->tp_getset = getset;
}
/// buffer_protocol: Fill in the view as specified by flags.
extern "C" inline int pybind11_getbuffer(PyObject *obj, Py_buffer *view, int flags) {
// Look for a `get_buffer` implementation in this type's info or any bases (following MRO).
type_info *tinfo = nullptr;
for (auto type : reinterpret_borrow<tuple>(Py_TYPE(obj)->tp_mro)) {
tinfo = get_type_info((PyTypeObject *) type.ptr());
if (tinfo && tinfo->get_buffer)
break;
}
if (view == nullptr || !tinfo || !tinfo->get_buffer) {
if (view)
view->obj = nullptr;
PyErr_SetString(PyExc_BufferError, "pybind11_getbuffer(): Internal error");
return -1;
}
std::memset(view, 0, sizeof(Py_buffer));
buffer_info *info = tinfo->get_buffer(obj, tinfo->get_buffer_data);
view->obj = obj;
view->ndim = 1;
view->internal = info;
view->buf = info->ptr;
view->itemsize = info->itemsize;
view->len = view->itemsize;
for (auto s : info->shape)
view->len *= s;
view->readonly = info->readonly;
if ((flags & PyBUF_WRITABLE) == PyBUF_WRITABLE && info->readonly) {
if (view)
view->obj = nullptr;
PyErr_SetString(PyExc_BufferError, "Writable buffer requested for readonly storage");
return -1;
}
if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT)
view->format = const_cast<char *>(info->format.c_str());
if ((flags & PyBUF_STRIDES) == PyBUF_STRIDES) {
view->ndim = (int) info->ndim;
view->strides = &info->strides[0];
view->shape = &info->shape[0];
}
Py_INCREF(view->obj);
return 0;
}
/// buffer_protocol: Release the resources of the buffer.
extern "C" inline void pybind11_releasebuffer(PyObject *, Py_buffer *view) {
delete (buffer_info *) view->internal;
}
/// Give this type a buffer interface.
inline void enable_buffer_protocol(PyHeapTypeObject *heap_type) {
heap_type->ht_type.tp_as_buffer = &heap_type->as_buffer;
#if PY_MAJOR_VERSION < 3
heap_type->ht_type.tp_flags |= Py_TPFLAGS_HAVE_NEWBUFFER;
#endif
heap_type->as_buffer.bf_getbuffer = pybind11_getbuffer;
heap_type->as_buffer.bf_releasebuffer = pybind11_releasebuffer;
}
/** Create a brand new Python type according to the `type_record` specification.
Return value: New reference. */
inline PyObject* make_new_python_type(const type_record &rec) {
auto name = reinterpret_steal<object>(PYBIND11_FROM_STRING(rec.name));
auto qualname = name;
if (rec.scope && !PyModule_Check(rec.scope.ptr()) && hasattr(rec.scope, "__qualname__")) {
#if PY_MAJOR_VERSION >= 3
qualname = reinterpret_steal<object>(
PyUnicode_FromFormat("%U.%U", rec.scope.attr("__qualname__").ptr(), name.ptr()));
#else
qualname = str(rec.scope.attr("__qualname__").cast<std::string>() + "." + rec.name);
#endif
}
object module;
if (rec.scope) {
if (hasattr(rec.scope, "__module__"))
module = rec.scope.attr("__module__");
else if (hasattr(rec.scope, "__name__"))
module = rec.scope.attr("__name__");
}
auto full_name = c_str(
#if !defined(PYPY_VERSION)
module ? str(module).cast<std::string>() + "." + rec.name :
#endif
rec.name);
char *tp_doc = nullptr;
if (rec.doc && options::show_user_defined_docstrings()) {
/* Allocate memory for docstring (using PyObject_MALLOC, since
Python will free this later on) */
size_t size = strlen(rec.doc) + 1;
tp_doc = (char *) PyObject_MALLOC(size);
memcpy((void *) tp_doc, rec.doc, size);
}
auto &internals = get_internals();
auto bases = tuple(rec.bases);
auto base = (bases.size() == 0) ? internals.instance_base
: bases[0].ptr();
/* Danger zone: from now (and until PyType_Ready), make sure to
issue no Python C API calls which could potentially invoke the
garbage collector (the GC will call type_traverse(), which will in
turn find the newly constructed type in an invalid state) */
auto metaclass = rec.metaclass.ptr() ? (PyTypeObject *) rec.metaclass.ptr()
: internals.default_metaclass;
auto heap_type = (PyHeapTypeObject *) metaclass->tp_alloc(metaclass, 0);
if (!heap_type)
pybind11_fail(std::string(rec.name) + ": Unable to create type object!");
heap_type->ht_name = name.release().ptr();
#ifdef PYBIND11_BUILTIN_QUALNAME
heap_type->ht_qualname = qualname.inc_ref().ptr();
#endif
auto type = &heap_type->ht_type;
type->tp_name = full_name;
type->tp_doc = tp_doc;
type->tp_base = type_incref((PyTypeObject *)base);
type->tp_basicsize = static_cast<ssize_t>(sizeof(instance));
if (bases.size() > 0)
type->tp_bases = bases.release().ptr();
/* Don't inherit base __init__ */
type->tp_init = pybind11_object_init;
/* Supported protocols */
type->tp_as_number = &heap_type->as_number;
type->tp_as_sequence = &heap_type->as_sequence;
type->tp_as_mapping = &heap_type->as_mapping;
#if PY_VERSION_HEX >= 0x03050000
type->tp_as_async = &heap_type->as_async;
#endif
/* Flags */
type->tp_flags |= Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
#if PY_MAJOR_VERSION < 3
type->tp_flags |= Py_TPFLAGS_CHECKTYPES;
#endif
if (rec.dynamic_attr)
enable_dynamic_attributes(heap_type);
if (rec.buffer_protocol)
enable_buffer_protocol(heap_type);
if (PyType_Ready(type) < 0)
pybind11_fail(std::string(rec.name) + ": PyType_Ready failed (" + error_string() + ")!");
assert(rec.dynamic_attr ? PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC)
: !PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC));
/* Register type with the parent scope */
if (rec.scope)
setattr(rec.scope, rec.name, (PyObject *) type);
else
Py_INCREF(type); // Keep it alive forever (reference leak)
if (module) // Needed by pydoc
setattr((PyObject *) type, "__module__", module);
PYBIND11_SET_OLDPY_QUALNAME(type, qualname);
return (PyObject *) type;
}
NAMESPACE_END(detail)
NAMESPACE_END(PYBIND11_NAMESPACE)
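The type_record options handled above surface to users as class_ annotations. A minimal sketch, with a hypothetical `Pet` struct and module name used only for illustration:
#include <pybind11/pybind11.h>
#include <string>
namespace py = pybind11;
struct Pet { std::string name; };
PYBIND11_MODULE(example, m) {
    // py::dynamic_attr() routes through enable_dynamic_attributes() and gives every
    // instance a __dict__; py::module_local() sets type_record::module_local so the
    // binding is not shared with other extension modules in the same process.
    py::class_<Pet>(m, "Pet", py::dynamic_attr(), py::module_local())
        .def(py::init<>())
        .def_readwrite("name", &Pet::name);
}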

View File

@ -0,0 +1,820 @@
/*
pybind11/detail/common.h -- Basic macros
Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/
#pragma once
#if !defined(NAMESPACE_BEGIN)
# define NAMESPACE_BEGIN(name) namespace name {
#endif
#if !defined(NAMESPACE_END)
# define NAMESPACE_END(name) }
#endif
// Robust support for some features and loading modules compiled against different pybind versions
// requires forcing hidden visibility on pybind code, so we enforce this by setting the attribute on
// the main `pybind11` namespace.
#if !defined(PYBIND11_NAMESPACE)
# ifdef __GNUG__
# define PYBIND11_NAMESPACE pybind11 __attribute__((visibility("hidden")))
# else
# define PYBIND11_NAMESPACE pybind11
# endif
#endif
#if !(defined(_MSC_VER) && __cplusplus == 199711L) && !defined(__INTEL_COMPILER)
# if __cplusplus >= 201402L
# define PYBIND11_CPP14
# if __cplusplus >= 201703L
# define PYBIND11_CPP17
# endif
# endif
#elif defined(_MSC_VER) && __cplusplus == 199711L
// MSVC sets _MSVC_LANG rather than __cplusplus (supposedly until the standard is fully implemented)
// Unless you use the /Zc:__cplusplus flag on Visual Studio 2017 15.7 Preview 3 or newer
# if _MSVC_LANG >= 201402L
# define PYBIND11_CPP14
# if _MSVC_LANG > 201402L && _MSC_VER >= 1910
# define PYBIND11_CPP17
# endif
# endif
#endif
// Compiler version assertions
#if defined(__INTEL_COMPILER)
# if __INTEL_COMPILER < 1700
# error pybind11 requires Intel C++ compiler v17 or newer
# endif
#elif defined(__clang__) && !defined(__apple_build_version__)
# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 3)
# error pybind11 requires clang 3.3 or newer
# endif
#elif defined(__clang__)
// Apple changes clang version macros to its Xcode version; the first Xcode release based on
// (upstream) clang 3.3 was Xcode 5:
# if __clang_major__ < 5
# error pybind11 requires Xcode/clang 5.0 or newer
# endif
#elif defined(__GNUG__)
# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8)
# error pybind11 requires gcc 4.8 or newer
# endif
#elif defined(_MSC_VER)
// Pybind hits various compiler bugs in 2015u2 and earlier, and also makes use of some stl features
// (e.g. std::negation) added in 2015u3:
# if _MSC_FULL_VER < 190024210
# error pybind11 requires MSVC 2015 update 3 or newer
# endif
#endif
#if !defined(PYBIND11_EXPORT)
# if defined(WIN32) || defined(_WIN32)
# define PYBIND11_EXPORT __declspec(dllexport)
# else
# define PYBIND11_EXPORT __attribute__ ((visibility("default")))
# endif
#endif
#if defined(_MSC_VER)
# define PYBIND11_NOINLINE __declspec(noinline)
#else
# define PYBIND11_NOINLINE __attribute__ ((noinline))
#endif
#if defined(PYBIND11_CPP14)
# define PYBIND11_DEPRECATED(reason) [[deprecated(reason)]]
#else
# define PYBIND11_DEPRECATED(reason) __attribute__((deprecated(reason)))
#endif
#define PYBIND11_VERSION_MAJOR 2
#define PYBIND11_VERSION_MINOR 5
#define PYBIND11_VERSION_PATCH dev1
/// Include Python header, disable linking to pythonX_d.lib on Windows in debug mode
#if defined(_MSC_VER)
# if (PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION < 4)
# define HAVE_ROUND 1
# endif
# pragma warning(push)
# pragma warning(disable: 4510 4610 4512 4005)
# if defined(_DEBUG) && !defined(Py_DEBUG)
# define PYBIND11_DEBUG_MARKER
# undef _DEBUG
# endif
#endif
#include <Python.h>
#include <frameobject.h>
#include <pythread.h>
/* Python #defines overrides on all sorts of core functions, which
tends to wreak havoc in C++ codebases that expect these to work
like regular functions (potentially with several overloads) */
#if defined(isalnum)
# undef isalnum
# undef isalpha
# undef islower
# undef isspace
# undef isupper
# undef tolower
# undef toupper
#endif
#if defined(copysign)
# undef copysign
#endif
#if defined(_MSC_VER)
# if defined(PYBIND11_DEBUG_MARKER)
# define _DEBUG
# undef PYBIND11_DEBUG_MARKER
# endif
# pragma warning(pop)
#endif
#include <cstddef>
#include <cstring>
#include <forward_list>
#include <vector>
#include <string>
#include <stdexcept>
#include <unordered_set>
#include <unordered_map>
#include <memory>
#include <typeindex>
#include <type_traits>
#if PY_MAJOR_VERSION >= 3 /// Compatibility macros for various Python versions
#define PYBIND11_INSTANCE_METHOD_NEW(ptr, class_) PyInstanceMethod_New(ptr)
#define PYBIND11_INSTANCE_METHOD_CHECK PyInstanceMethod_Check
#define PYBIND11_INSTANCE_METHOD_GET_FUNCTION PyInstanceMethod_GET_FUNCTION
#define PYBIND11_BYTES_CHECK PyBytes_Check
#define PYBIND11_BYTES_FROM_STRING PyBytes_FromString
#define PYBIND11_BYTES_FROM_STRING_AND_SIZE PyBytes_FromStringAndSize
#define PYBIND11_BYTES_AS_STRING_AND_SIZE PyBytes_AsStringAndSize
#define PYBIND11_BYTES_AS_STRING PyBytes_AsString
#define PYBIND11_BYTES_SIZE PyBytes_Size
#define PYBIND11_LONG_CHECK(o) PyLong_Check(o)
#define PYBIND11_LONG_AS_LONGLONG(o) PyLong_AsLongLong(o)
#define PYBIND11_LONG_FROM_SIGNED(o) PyLong_FromSsize_t((ssize_t) o)
#define PYBIND11_LONG_FROM_UNSIGNED(o) PyLong_FromSize_t((size_t) o)
#define PYBIND11_BYTES_NAME "bytes"
#define PYBIND11_STRING_NAME "str"
#define PYBIND11_SLICE_OBJECT PyObject
#define PYBIND11_FROM_STRING PyUnicode_FromString
#define PYBIND11_STR_TYPE ::pybind11::str
#define PYBIND11_BOOL_ATTR "__bool__"
#define PYBIND11_NB_BOOL(ptr) ((ptr)->nb_bool)
// Providing a separate declaration to make Clang's -Wmissing-prototypes happy
#define PYBIND11_PLUGIN_IMPL(name) \
extern "C" PYBIND11_EXPORT PyObject *PyInit_##name(); \
extern "C" PYBIND11_EXPORT PyObject *PyInit_##name()
#else
#define PYBIND11_INSTANCE_METHOD_NEW(ptr, class_) PyMethod_New(ptr, nullptr, class_)
#define PYBIND11_INSTANCE_METHOD_CHECK PyMethod_Check
#define PYBIND11_INSTANCE_METHOD_GET_FUNCTION PyMethod_GET_FUNCTION
#define PYBIND11_BYTES_CHECK PyString_Check
#define PYBIND11_BYTES_FROM_STRING PyString_FromString
#define PYBIND11_BYTES_FROM_STRING_AND_SIZE PyString_FromStringAndSize
#define PYBIND11_BYTES_AS_STRING_AND_SIZE PyString_AsStringAndSize
#define PYBIND11_BYTES_AS_STRING PyString_AsString
#define PYBIND11_BYTES_SIZE PyString_Size
#define PYBIND11_LONG_CHECK(o) (PyInt_Check(o) || PyLong_Check(o))
#define PYBIND11_LONG_AS_LONGLONG(o) (PyInt_Check(o) ? (long long) PyLong_AsLong(o) : PyLong_AsLongLong(o))
#define PYBIND11_LONG_FROM_SIGNED(o) PyInt_FromSsize_t((ssize_t) o) // Returns long if needed.
#define PYBIND11_LONG_FROM_UNSIGNED(o) PyInt_FromSize_t((size_t) o) // Returns long if needed.
#define PYBIND11_BYTES_NAME "str"
#define PYBIND11_STRING_NAME "unicode"
#define PYBIND11_SLICE_OBJECT PySliceObject
#define PYBIND11_FROM_STRING PyString_FromString
#define PYBIND11_STR_TYPE ::pybind11::bytes
#define PYBIND11_BOOL_ATTR "__nonzero__"
#define PYBIND11_NB_BOOL(ptr) ((ptr)->nb_nonzero)
// Providing a separate PyInit decl to make Clang's -Wmissing-prototypes happy
#define PYBIND11_PLUGIN_IMPL(name) \
static PyObject *pybind11_init_wrapper(); \
extern "C" PYBIND11_EXPORT void init##name(); \
extern "C" PYBIND11_EXPORT void init##name() { \
(void)pybind11_init_wrapper(); \
} \
PyObject *pybind11_init_wrapper()
#endif
#if PY_VERSION_HEX >= 0x03050000 && PY_VERSION_HEX < 0x03050200
extern "C" {
struct _Py_atomic_address { void *value; };
PyAPI_DATA(_Py_atomic_address) _PyThreadState_Current;
}
#endif
#define PYBIND11_TRY_NEXT_OVERLOAD ((PyObject *) 1) // special failure return code
#define PYBIND11_STRINGIFY(x) #x
#define PYBIND11_TOSTRING(x) PYBIND11_STRINGIFY(x)
#define PYBIND11_CONCAT(first, second) first##second
#define PYBIND11_ENSURE_INTERNALS_READY \
pybind11::detail::get_internals();
#define PYBIND11_CHECK_PYTHON_VERSION \
{ \
const char *compiled_ver = PYBIND11_TOSTRING(PY_MAJOR_VERSION) \
"." PYBIND11_TOSTRING(PY_MINOR_VERSION); \
const char *runtime_ver = Py_GetVersion(); \
size_t len = std::strlen(compiled_ver); \
if (std::strncmp(runtime_ver, compiled_ver, len) != 0 \
|| (runtime_ver[len] >= '0' && runtime_ver[len] <= '9')) { \
PyErr_Format(PyExc_ImportError, \
"Python version mismatch: module was compiled for Python %s, " \
"but the interpreter version is incompatible: %s.", \
compiled_ver, runtime_ver); \
return nullptr; \
} \
}
#define PYBIND11_CATCH_INIT_EXCEPTIONS \
catch (pybind11::error_already_set &e) { \
PyErr_SetString(PyExc_ImportError, e.what()); \
return nullptr; \
} catch (const std::exception &e) { \
PyErr_SetString(PyExc_ImportError, e.what()); \
return nullptr; \
} \
/** \rst
***Deprecated in favor of PYBIND11_MODULE***
This macro creates the entry point that will be invoked when the Python interpreter
imports a plugin library. Please create a `module` in the function body and return
the pointer to its underlying Python object at the end.
.. code-block:: cpp
PYBIND11_PLUGIN(example) {
pybind11::module m("example", "pybind11 example plugin");
/// Set up bindings here
return m.ptr();
}
\endrst */
#define PYBIND11_PLUGIN(name) \
PYBIND11_DEPRECATED("PYBIND11_PLUGIN is deprecated, use PYBIND11_MODULE") \
static PyObject *pybind11_init(); \
PYBIND11_PLUGIN_IMPL(name) { \
PYBIND11_CHECK_PYTHON_VERSION \
PYBIND11_ENSURE_INTERNALS_READY \
try { \
return pybind11_init(); \
} PYBIND11_CATCH_INIT_EXCEPTIONS \
} \
PyObject *pybind11_init()
/** \rst
This macro creates the entry point that will be invoked when the Python interpreter
imports an extension module. The module name is given as the first argument and it
should not be in quotes. The second macro argument defines a variable of type
`py::module` which can be used to initialize the module.
.. code-block:: cpp
PYBIND11_MODULE(example, m) {
m.doc() = "pybind11 example module";
// Add bindings here
m.def("foo", []() {
return "Hello, World!";
});
}
\endrst */
#define PYBIND11_MODULE(name, variable) \
static void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &); \
PYBIND11_PLUGIN_IMPL(name) { \
PYBIND11_CHECK_PYTHON_VERSION \
PYBIND11_ENSURE_INTERNALS_READY \
auto m = pybind11::module(PYBIND11_TOSTRING(name)); \
try { \
PYBIND11_CONCAT(pybind11_init_, name)(m); \
return m.ptr(); \
} PYBIND11_CATCH_INIT_EXCEPTIONS \
} \
void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &variable)
NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
using ssize_t = Py_ssize_t;
using size_t = std::size_t;
/// Approach used to cast a previously unknown C++ instance into a Python object
enum class return_value_policy : uint8_t {
/** This is the default return value policy, which falls back to the policy
return_value_policy::take_ownership when the return value is a pointer.
Otherwise, it uses return_value_policy::move or return_value_policy::copy for rvalue
and lvalue references, respectively. See below for a description of what
all of these different policies do. */
automatic = 0,
/** As above, but use policy return_value_policy::reference when the return
value is a pointer. This is the default conversion policy for function
arguments when calling Python functions manually from C++ code (i.e. via
handle::operator()). You probably won't need to use this. */
automatic_reference,
/** Reference an existing object (i.e. do not create a new copy) and take
ownership. Python will call the destructor and delete operator when the
object's reference count reaches zero. Undefined behavior ensues when
the C++ side does the same. */
take_ownership,
/** Create a new copy of the returned object, which will be owned by
Python. This policy is comparably safe because the lifetimes of the two
instances are decoupled. */
copy,
/** Use std::move to move the return value contents into a new instance
that will be owned by Python. This policy is comparably safe because the
lifetimes of the two instances (move source and destination) are
decoupled. */
move,
/** Reference an existing object, but do not take ownership. The C++ side
is responsible for managing the object's lifetime and deallocating it
when it is no longer used. Warning: undefined behavior will ensue when
the C++ side deletes an object that is still referenced and used by
Python. */
reference,
/** This policy only applies to methods and properties. It references the
object without taking ownership similar to the above
return_value_policy::reference policy. In contrast to that policy, the
function or property's implicit this argument (called the parent) is
considered to be the owner of the return value (the child).
pybind11 then couples the lifetime of the parent to the child via a
reference relationship that ensures that the parent cannot be garbage
collected while Python is still using the child. More advanced
variations of this scheme are also possible using combinations of
return_value_policy::reference and the keep_alive call policy */
reference_internal
};
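// Hypothetical usage sketch (the class `Registry` and its `instance()` accessor
// are illustrative, not part of pybind11): a binding that returns a pointer to
// an object owned on the C++ side would typically pick `reference` so Python
// never attempts to delete it:
//
//   // m.def("registry", &Registry::instance,
//   //       pybind11::return_value_policy::reference);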
NAMESPACE_BEGIN(detail)
inline static constexpr int log2(size_t n, int k = 0) { return (n <= 1) ? k : log2(n >> 1, k + 1); }
// Returns the size as a multiple of sizeof(void *), rounded up.
inline static constexpr size_t size_in_ptrs(size_t s) { return 1 + ((s - 1) >> log2(sizeof(void *))); }
/**
* The space to allocate for simple layout instance holders (see below) in multiple of the size of
* a pointer (e.g. 2 means 16 bytes on 64-bit architectures). The default is the minimum required
* to hold either a std::unique_ptr or std::shared_ptr (which is almost always
* sizeof(std::shared_ptr<T>)).
*/
constexpr size_t instance_simple_holder_in_ptrs() {
static_assert(sizeof(std::shared_ptr<int>) >= sizeof(std::unique_ptr<int>),
"pybind assumes std::shared_ptrs are at least as big as std::unique_ptrs");
return size_in_ptrs(sizeof(std::shared_ptr<int>));
}
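// Worked example, assuming a typical LP64 target where sizeof(void *) == 8 and
// std::shared_ptr<int> is two pointers wide (these asserts are illustrative,
// not part of the upstream header):
//
//   static_assert(log2(sizeof(void *)) == 3, "");
//   static_assert(size_in_ptrs(16) == 2, "");  // 1 + ((16 - 1) >> 3)
//
// so instance_simple_holder_in_ptrs() evaluates to 2 on such a platform.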
// Forward declarations
struct type_info;
struct value_and_holder;
struct nonsimple_values_and_holders {
void **values_and_holders;
uint8_t *status;
};
/// The 'instance' type which needs to be standard layout (need to be able to use 'offsetof')
struct instance {
PyObject_HEAD
/// Storage for pointers and holder; see simple_layout, below, for a description
union {
void *simple_value_holder[1 + instance_simple_holder_in_ptrs()];
nonsimple_values_and_holders nonsimple;
};
/// Weak references
PyObject *weakrefs;
/// If true, the pointer is owned which means we're free to manage it with a holder.
bool owned : 1;
/**
* An instance has two possible value/holder layouts.
*
* Simple layout (when this flag is true), means the `simple_value_holder` is set with a pointer
* and the holder object governing that pointer, i.e. [val1*][holder]. This layout is applied
* whenever there is no python-side multiple inheritance of bound C++ types *and* the type's
* holder will fit in the default space (which is large enough to hold either a std::unique_ptr
* or std::shared_ptr).
*
* Non-simple layout applies when using custom holders that require more space than `shared_ptr`
* (which is typically the size of two pointers), or when multiple inheritance is used on the
* python side. Non-simple layout allocates the required amount of memory to have multiple
* bound C++ classes as parents. Under this layout, `nonsimple.values_and_holders` is set to a
* pointer to allocated space large enough to hold a sequence of value pointers and
* holders followed by `status`, a set of bit flags (1 byte each), i.e.
* [val1*][holder1][val2*][holder2]...[bb...] where each [block] is rounded up to a multiple of
* `sizeof(void *)`. `nonsimple.status` is, for convenience, a pointer to the
* beginning of the [bb...] block (but not independently allocated).
*
* Status bits indicate whether the associated holder is constructed (&
* status_holder_constructed) and whether the value pointer is registered (&
* status_instance_registered) in `registered_instances`.
*/
bool simple_layout : 1;
/// For simple layout, tracks whether the holder has been constructed
bool simple_holder_constructed : 1;
/// For simple layout, tracks whether the instance is registered in `registered_instances`
bool simple_instance_registered : 1;
/// If true, get_internals().patients has an entry for this object
bool has_patients : 1;
/// Initializes all of the above type/values/holders data (but not the instance values themselves)
void allocate_layout();
/// Destroys/deallocates all of the above
void deallocate_layout();
/// Returns the value_and_holder wrapper for the given type (or the first, if `find_type`
/// omitted). Returns a default-constructed (with `.inst = nullptr`) object on failure if
/// `throw_if_missing` is false.
value_and_holder get_value_and_holder(const type_info *find_type = nullptr, bool throw_if_missing = true);
/// Bit values for the non-simple status flags
static constexpr uint8_t status_holder_constructed = 1;
static constexpr uint8_t status_instance_registered = 2;
};
static_assert(std::is_standard_layout<instance>::value, "Internal error: `pybind11::detail::instance` is not standard layout!");
/// from __cpp_future__ import (convenient aliases from C++14/17)
#if defined(PYBIND11_CPP14) && (!defined(_MSC_VER) || _MSC_VER >= 1910)
using std::enable_if_t;
using std::conditional_t;
using std::remove_cv_t;
using std::remove_reference_t;
#else
template <bool B, typename T = void> using enable_if_t = typename std::enable_if<B, T>::type;
template <bool B, typename T, typename F> using conditional_t = typename std::conditional<B, T, F>::type;
template <typename T> using remove_cv_t = typename std::remove_cv<T>::type;
template <typename T> using remove_reference_t = typename std::remove_reference<T>::type;
#endif
/// Index sequences
#if defined(PYBIND11_CPP14)
using std::index_sequence;
using std::make_index_sequence;
#else
template<size_t ...> struct index_sequence { };
template<size_t N, size_t ...S> struct make_index_sequence_impl : make_index_sequence_impl <N - 1, N - 1, S...> { };
template<size_t ...S> struct make_index_sequence_impl <0, S...> { typedef index_sequence<S...> type; };
template<size_t N> using make_index_sequence = typename make_index_sequence_impl<N>::type;
#endif
/// Make an index sequence of the indices of true arguments
template <typename ISeq, size_t, bool...> struct select_indices_impl { using type = ISeq; };
template <size_t... IPrev, size_t I, bool B, bool... Bs> struct select_indices_impl<index_sequence<IPrev...>, I, B, Bs...>
: select_indices_impl<conditional_t<B, index_sequence<IPrev..., I>, index_sequence<IPrev...>>, I + 1, Bs...> {};
template <bool... Bs> using select_indices = typename select_indices_impl<index_sequence<>, 0, Bs...>::type;
/// Backports of std::bool_constant and std::negation to accommodate older compilers
template <bool B> using bool_constant = std::integral_constant<bool, B>;
template <typename T> struct negation : bool_constant<!T::value> { };
template <typename...> struct void_t_impl { using type = void; };
template <typename... Ts> using void_t = typename void_t_impl<Ts...>::type;
/// Compile-time all/any/none of that check the boolean value of all template types
#if defined(__cpp_fold_expressions) && !(defined(_MSC_VER) && (_MSC_VER < 1916))
template <class... Ts> using all_of = bool_constant<(Ts::value && ...)>;
template <class... Ts> using any_of = bool_constant<(Ts::value || ...)>;
#elif !defined(_MSC_VER)
template <bool...> struct bools {};
template <class... Ts> using all_of = std::is_same<
bools<Ts::value..., true>,
bools<true, Ts::value...>>;
template <class... Ts> using any_of = negation<all_of<negation<Ts>...>>;
#else
// MSVC has trouble with the above, but supports std::conjunction, which we can use instead (albeit
// at a slight loss of compilation efficiency).
template <class... Ts> using all_of = std::conjunction<Ts...>;
template <class... Ts> using any_of = std::disjunction<Ts...>;
#endif
template <class... Ts> using none_of = negation<any_of<Ts...>>;
template <class T, template<class> class... Predicates> using satisfies_all_of = all_of<Predicates<T>...>;
template <class T, template<class> class... Predicates> using satisfies_any_of = any_of<Predicates<T>...>;
template <class T, template<class> class... Predicates> using satisfies_none_of = none_of<Predicates<T>...>;
/// Strip the class from a method type
template <typename T> struct remove_class { };
template <typename C, typename R, typename... A> struct remove_class<R (C::*)(A...)> { typedef R type(A...); };
template <typename C, typename R, typename... A> struct remove_class<R (C::*)(A...) const> { typedef R type(A...); };
/// Helper template to strip away type modifiers
template <typename T> struct intrinsic_type { typedef T type; };
template <typename T> struct intrinsic_type<const T> { typedef typename intrinsic_type<T>::type type; };
template <typename T> struct intrinsic_type<T*> { typedef typename intrinsic_type<T>::type type; };
template <typename T> struct intrinsic_type<T&> { typedef typename intrinsic_type<T>::type type; };
template <typename T> struct intrinsic_type<T&&> { typedef typename intrinsic_type<T>::type type; };
template <typename T, size_t N> struct intrinsic_type<const T[N]> { typedef typename intrinsic_type<T>::type type; };
template <typename T, size_t N> struct intrinsic_type<T[N]> { typedef typename intrinsic_type<T>::type type; };
template <typename T> using intrinsic_t = typename intrinsic_type<T>::type;
/// Helper type to replace 'void' in some expressions
struct void_type { };
/// Helper template which holds a list of types
template <typename...> struct type_list { };
/// Compile-time integer sum
#ifdef __cpp_fold_expressions
template <typename... Ts> constexpr size_t constexpr_sum(Ts... ns) { return (0 + ... + size_t{ns}); }
#else
constexpr size_t constexpr_sum() { return 0; }
template <typename T, typename... Ts>
constexpr size_t constexpr_sum(T n, Ts... ns) { return size_t{n} + constexpr_sum(ns...); }
#endif
NAMESPACE_BEGIN(constexpr_impl)
/// Implementation details for constexpr functions
constexpr int first(int i) { return i; }
template <typename T, typename... Ts>
constexpr int first(int i, T v, Ts... vs) { return v ? i : first(i + 1, vs...); }
constexpr int last(int /*i*/, int result) { return result; }
template <typename T, typename... Ts>
constexpr int last(int i, int result, T v, Ts... vs) { return last(i + 1, v ? i : result, vs...); }
NAMESPACE_END(constexpr_impl)
/// Return the index of the first type in Ts which satisfies Predicate<T>. Returns sizeof...(Ts) if
/// none match.
template <template<typename> class Predicate, typename... Ts>
constexpr int constexpr_first() { return constexpr_impl::first(0, Predicate<Ts>::value...); }
/// Return the index of the last type in Ts which satisfies Predicate<T>, or -1 if none match.
template <template<typename> class Predicate, typename... Ts>
constexpr int constexpr_last() { return constexpr_impl::last(0, -1, Predicate<Ts>::value...); }
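// Illustrative evaluation (the type pack is chosen arbitrarily): with
// Predicate = std::is_integral and the pack <float, int, long>,
//
//   constexpr_first<std::is_integral, float, int, long>() == 1
//   constexpr_last<std::is_integral, float, int, long>()  == 2
//
// i.e. the first and last integral positions in the pack.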
/// Return the Nth element from the parameter pack
template <size_t N, typename T, typename... Ts>
struct pack_element { using type = typename pack_element<N - 1, Ts...>::type; };
template <typename T, typename... Ts>
struct pack_element<0, T, Ts...> { using type = T; };
/// Return the one and only type which matches the predicate, or Default if none match.
/// If more than one type matches the predicate, fail at compile-time.
template <template<typename> class Predicate, typename Default, typename... Ts>
struct exactly_one {
static constexpr auto found = constexpr_sum(Predicate<Ts>::value...);
static_assert(found <= 1, "Found more than one type matching the predicate");
static constexpr auto index = found ? constexpr_first<Predicate, Ts...>() : 0;
using type = conditional_t<found, typename pack_element<index, Ts...>::type, Default>;
};
template <template<typename> class P, typename Default>
struct exactly_one<P, Default> { using type = Default; };
template <template<typename> class Predicate, typename Default, typename... Ts>
using exactly_one_t = typename exactly_one<Predicate, Default, Ts...>::type;
/// Defer the evaluation of type T until types Us are instantiated
template <typename T, typename... /*Us*/> struct deferred_type { using type = T; };
template <typename T, typename... Us> using deferred_t = typename deferred_type<T, Us...>::type;
/// Like is_base_of, but requires a strict base (i.e. `is_strict_base_of<T, T>::value == false`,
/// unlike `std::is_base_of`)
template <typename Base, typename Derived> using is_strict_base_of = bool_constant<
std::is_base_of<Base, Derived>::value && !std::is_same<Base, Derived>::value>;
/// Like is_base_of, but also requires that the base type is accessible (i.e. that a Derived pointer
/// can be converted to a Base pointer)
template <typename Base, typename Derived> using is_accessible_base_of = bool_constant<
std::is_base_of<Base, Derived>::value && std::is_convertible<Derived *, Base *>::value>;
template <template<typename...> class Base>
struct is_template_base_of_impl {
template <typename... Us> static std::true_type check(Base<Us...> *);
static std::false_type check(...);
};
/// Check if a template is the base of a type. For example:
/// `is_template_base_of<Base, T>` is true if `struct T : Base<U> {}` where U can be anything
template <template<typename...> class Base, typename T>
#if !defined(_MSC_VER)
using is_template_base_of = decltype(is_template_base_of_impl<Base>::check((intrinsic_t<T>*)nullptr));
#else // MSVC2015 has trouble with decltype in template aliases
struct is_template_base_of : decltype(is_template_base_of_impl<Base>::check((intrinsic_t<T>*)nullptr)) { };
#endif
/// Check if T is an instantiation of the template `Class`. For example:
/// `is_instantiation<shared_ptr, T>` is true if `T == shared_ptr<U>` where U can be anything.
template <template<typename...> class Class, typename T>
struct is_instantiation : std::false_type { };
template <template<typename...> class Class, typename... Us>
struct is_instantiation<Class, Class<Us...>> : std::true_type { };
/// Check if T is std::shared_ptr<U> where U can be anything
template <typename T> using is_shared_ptr = is_instantiation<std::shared_ptr, T>;
/// Check if T looks like an input iterator
template <typename T, typename = void> struct is_input_iterator : std::false_type {};
template <typename T>
struct is_input_iterator<T, void_t<decltype(*std::declval<T &>()), decltype(++std::declval<T &>())>>
: std::true_type {};
template <typename T> using is_function_pointer = bool_constant<
std::is_pointer<T>::value && std::is_function<typename std::remove_pointer<T>::type>::value>;
template <typename F> struct strip_function_object {
using type = typename remove_class<decltype(&F::operator())>::type;
};
// Extracts the function signature from a function, function pointer or lambda.
template <typename Function, typename F = remove_reference_t<Function>>
using function_signature_t = conditional_t<
std::is_function<F>::value,
F,
typename conditional_t<
std::is_pointer<F>::value || std::is_member_pointer<F>::value,
std::remove_pointer<F>,
strip_function_object<F>
>::type
>;
/// Returns true if the type looks like a lambda: that is, isn't a function, pointer or member
/// pointer. Note that this can catch all sorts of other things, too; this is intended to be used
/// in a place where passing a lambda makes sense.
template <typename T> using is_lambda = satisfies_none_of<remove_reference_t<T>,
std::is_function, std::is_pointer, std::is_member_pointer>;
/// Ignore that a variable is unused in compiler warnings
inline void ignore_unused(const int *) { }
/// Apply a function over each element of a parameter pack
#ifdef __cpp_fold_expressions
#define PYBIND11_EXPAND_SIDE_EFFECTS(PATTERN) (((PATTERN), void()), ...)
#else
using expand_side_effects = bool[];
#define PYBIND11_EXPAND_SIDE_EFFECTS(PATTERN) pybind11::detail::expand_side_effects{ ((PATTERN), void(), false)..., false }
#endif
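// Hypothetical usage sketch (`print_all` is an illustrative helper, not part
// of pybind11): apply an expression to every element of a parameter pack,
// with or without C++17 fold expressions:
//
//   template <typename... Args> void print_all(const Args &...args) {
//       PYBIND11_EXPAND_SIDE_EFFECTS(std::cout << args << '\n');
//   }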
NAMESPACE_END(detail)
/// C++ bindings of builtin Python exceptions
class builtin_exception : public std::runtime_error {
public:
using std::runtime_error::runtime_error;
/// Set the error using the Python C API
virtual void set_error() const = 0;
};
#define PYBIND11_RUNTIME_EXCEPTION(name, type) \
class name : public builtin_exception { public: \
using builtin_exception::builtin_exception; \
name() : name("") { } \
void set_error() const override { PyErr_SetString(type, what()); } \
};
PYBIND11_RUNTIME_EXCEPTION(stop_iteration, PyExc_StopIteration)
PYBIND11_RUNTIME_EXCEPTION(index_error, PyExc_IndexError)
PYBIND11_RUNTIME_EXCEPTION(key_error, PyExc_KeyError)
PYBIND11_RUNTIME_EXCEPTION(value_error, PyExc_ValueError)
PYBIND11_RUNTIME_EXCEPTION(type_error, PyExc_TypeError)
PYBIND11_RUNTIME_EXCEPTION(buffer_error, PyExc_BufferError)
PYBIND11_RUNTIME_EXCEPTION(import_error, PyExc_ImportError)
PYBIND11_RUNTIME_EXCEPTION(cast_error, PyExc_RuntimeError) /// Thrown when pybind11::cast or handle::call fail due to a type casting error
PYBIND11_RUNTIME_EXCEPTION(reference_cast_error, PyExc_RuntimeError) /// Used internally
[[noreturn]] PYBIND11_NOINLINE inline void pybind11_fail(const char *reason) { throw std::runtime_error(reason); }
[[noreturn]] PYBIND11_NOINLINE inline void pybind11_fail(const std::string &reason) { throw std::runtime_error(reason); }
template <typename T, typename SFINAE = void> struct format_descriptor { };
NAMESPACE_BEGIN(detail)
// Returns the index of the given type in the type char array below, and in the list in numpy.h
// The order here is: bool; 8 ints ((signed,unsigned)x(8,16,32,64)bits); float,double,long double;
// complex float,double,long double. Note that the long double types only participate when long
// double is actually longer than double (it isn't under MSVC).
// NB: not only the string below but also complex.h and numpy.h rely on this order.
template <typename T, typename SFINAE = void> struct is_fmt_numeric { static constexpr bool value = false; };
template <typename T> struct is_fmt_numeric<T, enable_if_t<std::is_arithmetic<T>::value>> {
static constexpr bool value = true;
static constexpr int index = std::is_same<T, bool>::value ? 0 : 1 + (
std::is_integral<T>::value ? detail::log2(sizeof(T))*2 + std::is_unsigned<T>::value : 8 + (
std::is_same<T, double>::value ? 1 : std::is_same<T, long double>::value ? 2 : 0));
};
NAMESPACE_END(detail)
template <typename T> struct format_descriptor<T, detail::enable_if_t<std::is_arithmetic<T>::value>> {
static constexpr const char c = "?bBhHiIqQfdg"[detail::is_fmt_numeric<T>::index];
static constexpr const char value[2] = { c, '\0' };
static std::string format() { return std::string(1, c); }
};
#if !defined(PYBIND11_CPP17)
template <typename T> constexpr const char format_descriptor<
T, detail::enable_if_t<std::is_arithmetic<T>::value>>::value[2];
#endif
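// Illustrative values on a conventional ILP32/LP64 platform, following the
// "?bBhHiIqQfdg" table above (exact characters depend on the platform's
// integer sizes):
//
//   format_descriptor<bool>::format()   == "?"
//   format_descriptor<int>::format()    == "i"   // 32-bit signed int
//   format_descriptor<float>::format()  == "f"
//   format_descriptor<double>::format() == "d"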
/// RAII wrapper that temporarily clears any Python error state
struct error_scope {
PyObject *type, *value, *trace;
error_scope() { PyErr_Fetch(&type, &value, &trace); }
~error_scope() { PyErr_Restore(type, value, trace); }
};
/// Dummy destructor wrapper that can be used to expose classes with a private destructor
struct nodelete { template <typename T> void operator()(T*) { } };
NAMESPACE_BEGIN(detail)
template <typename... Args>
struct overload_cast_impl {
constexpr overload_cast_impl() {} // MSVC 2015 needs this
template <typename Return>
constexpr auto operator()(Return (*pf)(Args...)) const noexcept
-> decltype(pf) { return pf; }
template <typename Return, typename Class>
constexpr auto operator()(Return (Class::*pmf)(Args...), std::false_type = {}) const noexcept
-> decltype(pmf) { return pmf; }
template <typename Return, typename Class>
constexpr auto operator()(Return (Class::*pmf)(Args...) const, std::true_type) const noexcept
-> decltype(pmf) { return pmf; }
};
NAMESPACE_END(detail)
// overload_cast requires variable templates: C++14
#if defined(PYBIND11_CPP14)
#define PYBIND11_OVERLOAD_CAST 1
/// Syntax sugar for resolving overloaded function pointers:
/// - regular: static_cast<Return (Class::*)(Arg0, Arg1, Arg2)>(&Class::func)
/// - sweet: overload_cast<Arg0, Arg1, Arg2>(&Class::func)
template <typename... Args>
static constexpr detail::overload_cast_impl<Args...> overload_cast = {};
// MSVC 2015 only accepts this particular initialization syntax for this variable template.
#endif
/// Const member function selector for overload_cast
/// - regular: static_cast<Return (Class::*)(Arg) const>(&Class::func)
/// - sweet: overload_cast<Arg>(&Class::func, const_)
static constexpr auto const_ = std::true_type{};
#if !defined(PYBIND11_CPP14) // no overload_cast: providing something that static_assert-fails:
template <typename... Args> struct overload_cast {
static_assert(detail::deferred_t<std::false_type, Args...>::value,
"pybind11::overload_cast<...> requires compiling in C++14 mode");
};
#endif // overload_cast
NAMESPACE_BEGIN(detail)
// Adaptor for converting arbitrary container arguments into a vector; implicitly convertible from
// any standard container (or C-style array) supporting std::begin/std::end, any singleton
// arithmetic type (if T is arithmetic), or explicitly constructible from an iterator pair.
template <typename T>
class any_container {
std::vector<T> v;
public:
any_container() = default;
// Can construct from a pair of iterators
template <typename It, typename = enable_if_t<is_input_iterator<It>::value>>
any_container(It first, It last) : v(first, last) { }
// Implicit conversion constructor from any arbitrary container type with values convertible to T
template <typename Container, typename = enable_if_t<std::is_convertible<decltype(*std::begin(std::declval<const Container &>())), T>::value>>
any_container(const Container &c) : any_container(std::begin(c), std::end(c)) { }
// initializer_lists aren't deducible, so they don't get matched by the above template; we need this
// to explicitly allow implicit conversion from one:
template <typename TIn, typename = enable_if_t<std::is_convertible<TIn, T>::value>>
any_container(const std::initializer_list<TIn> &c) : any_container(c.begin(), c.end()) { }
// Avoid copying if given an rvalue vector of the correct type.
any_container(std::vector<T> &&v) : v(std::move(v)) { }
// Moves the vector out of an rvalue any_container
operator std::vector<T> &&() && { return std::move(v); }
// Dereferencing obtains a reference to the underlying vector
std::vector<T> &operator*() { return v; }
const std::vector<T> &operator*() const { return v; }
// -> lets you call methods on the underlying vector
std::vector<T> *operator->() { return &v; }
const std::vector<T> *operator->() const { return &v; }
};
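// Hypothetical usage sketch (the helper `make_shape` is illustrative): a
// function taking any_container<ssize_t> accepts a std::vector, a C-style
// array or a braced initializer list alike:
//
//   void make_shape(any_container<ssize_t> shape) {
//       for (auto dim : *shape) { (void) dim; /* consume each extent */ }
//   }
//   // make_shape({2, 3, 4});               // from an initializer list
//   // make_shape(std::vector<int>{2, 3});  // elements convertible to ssize_t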
NAMESPACE_END(detail)
NAMESPACE_END(PYBIND11_NAMESPACE)

View File

@ -0,0 +1,100 @@
/*
pybind11/detail/descr.h: Helper type for concatenating type signatures at compile time
Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/
#pragma once
#include "common.h"
NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
NAMESPACE_BEGIN(detail)
#if !defined(_MSC_VER)
# define PYBIND11_DESCR_CONSTEXPR static constexpr
#else
# define PYBIND11_DESCR_CONSTEXPR const
#endif
/* Concatenate type signatures at compile time */
template <size_t N, typename... Ts>
struct descr {
char text[N + 1];
constexpr descr() : text{'\0'} { }
constexpr descr(char const (&s)[N+1]) : descr(s, make_index_sequence<N>()) { }
template <size_t... Is>
constexpr descr(char const (&s)[N+1], index_sequence<Is...>) : text{s[Is]..., '\0'} { }
template <typename... Chars>
constexpr descr(char c, Chars... cs) : text{c, static_cast<char>(cs)..., '\0'} { }
static constexpr std::array<const std::type_info *, sizeof...(Ts) + 1> types() {
return {{&typeid(Ts)..., nullptr}};
}
};
template <size_t N1, size_t N2, typename... Ts1, typename... Ts2, size_t... Is1, size_t... Is2>
constexpr descr<N1 + N2, Ts1..., Ts2...> plus_impl(const descr<N1, Ts1...> &a, const descr<N2, Ts2...> &b,
index_sequence<Is1...>, index_sequence<Is2...>) {
return {a.text[Is1]..., b.text[Is2]...};
}
template <size_t N1, size_t N2, typename... Ts1, typename... Ts2>
constexpr descr<N1 + N2, Ts1..., Ts2...> operator+(const descr<N1, Ts1...> &a, const descr<N2, Ts2...> &b) {
return plus_impl(a, b, make_index_sequence<N1>(), make_index_sequence<N2>());
}
template <size_t N>
constexpr descr<N - 1> _(char const(&text)[N]) { return descr<N - 1>(text); }
constexpr descr<0> _(char const(&)[1]) { return {}; }
template <size_t Rem, size_t... Digits> struct int_to_str : int_to_str<Rem/10, Rem%10, Digits...> { };
template <size_t...Digits> struct int_to_str<0, Digits...> {
static constexpr auto digits = descr<sizeof...(Digits)>(('0' + Digits)...);
};
// Ternary description (like std::conditional)
template <bool B, size_t N1, size_t N2>
constexpr enable_if_t<B, descr<N1 - 1>> _(char const(&text1)[N1], char const(&)[N2]) {
return _(text1);
}
template <bool B, size_t N1, size_t N2>
constexpr enable_if_t<!B, descr<N2 - 1>> _(char const(&)[N1], char const(&text2)[N2]) {
return _(text2);
}
template <bool B, typename T1, typename T2>
constexpr enable_if_t<B, T1> _(const T1 &d, const T2 &) { return d; }
template <bool B, typename T1, typename T2>
constexpr enable_if_t<!B, T2> _(const T1 &, const T2 &d) { return d; }
template <size_t Size> auto constexpr _() -> decltype(int_to_str<Size / 10, Size % 10>::digits) {
return int_to_str<Size / 10, Size % 10>::digits;
}
template <typename Type> constexpr descr<1, Type> _() { return {'%'}; }
constexpr descr<0> concat() { return {}; }
template <size_t N, typename... Ts>
constexpr descr<N, Ts...> concat(const descr<N, Ts...> &descr) { return descr; }
template <size_t N, typename... Ts, typename... Args>
constexpr auto concat(const descr<N, Ts...> &d, const Args &...args)
-> decltype(std::declval<descr<N + 2, Ts...>>() + concat(args...)) {
return d + _(", ") + concat(args...);
}
template <size_t N, typename... Ts>
constexpr descr<N + 2, Ts...> type_descr(const descr<N, Ts...> &descr) {
return _("{") + descr + _("}");
}
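// Illustrative compile-time results (text component only; the attached type
// parameter packs are omitted for brevity):
//
//   _("int")                       // descr<3> whose text is "int"
//   concat(_("int"), _("float"))   // text "int, float"
//   type_descr(_("dict"))          // text "{dict}"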
NAMESPACE_END(detail)
NAMESPACE_END(PYBIND11_NAMESPACE)

View File

@ -0,0 +1,335 @@
/*
pybind11/detail/init.h: init factory function implementation and support code.
Copyright (c) 2017 Jason Rhinelander <jason@imaginary.ca>
All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/
#pragma once
#include "class.h"
NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
NAMESPACE_BEGIN(detail)
template <>
class type_caster<value_and_holder> {
public:
bool load(handle h, bool) {
value = reinterpret_cast<value_and_holder *>(h.ptr());
return true;
}
template <typename> using cast_op_type = value_and_holder &;
operator value_and_holder &() { return *value; }
static constexpr auto name = _<value_and_holder>();
private:
value_and_holder *value = nullptr;
};
NAMESPACE_BEGIN(initimpl)
inline void no_nullptr(void *ptr) {
if (!ptr) throw type_error("pybind11::init(): factory function returned nullptr");
}
// Implementing functions for all forms of py::init<...> and py::init(...)
template <typename Class> using Cpp = typename Class::type;
template <typename Class> using Alias = typename Class::type_alias;
template <typename Class> using Holder = typename Class::holder_type;
template <typename Class> using is_alias_constructible = std::is_constructible<Alias<Class>, Cpp<Class> &&>;
// Takes a Cpp pointer and returns true if it actually is a polymorphic Alias instance.
template <typename Class, enable_if_t<Class::has_alias, int> = 0>
bool is_alias(Cpp<Class> *ptr) {
return dynamic_cast<Alias<Class> *>(ptr) != nullptr;
}
// Failing fallback version of the above for a no-alias class (always returns false)
template <typename /*Class*/>
constexpr bool is_alias(void *) { return false; }
// Constructs and returns a new object; if the given arguments don't map to a constructor, we fall
// back to brace aggregate initialization so that aggregate initialization can be used with
// py::init, e.g. `py::init<int, int>` to initialize a `struct T { int a; int b; }`. For
// non-aggregate types, we need to use an ordinary T(...) constructor (invoking as `T{...}` usually
// works, but will not do the expected thing when `T` has an `initializer_list<T>` constructor).
template <typename Class, typename... Args, detail::enable_if_t<std::is_constructible<Class, Args...>::value, int> = 0>
inline Class *construct_or_initialize(Args &&...args) { return new Class(std::forward<Args>(args)...); }
template <typename Class, typename... Args, detail::enable_if_t<!std::is_constructible<Class, Args...>::value, int> = 0>
inline Class *construct_or_initialize(Args &&...args) { return new Class{std::forward<Args>(args)...}; }
// Attempts to construct an alias using an `Alias(Cpp &&)` constructor. This allows types with
// an alias to provide only a single Cpp factory function as long as the Alias can be
// constructed from an rvalue reference of the base Cpp type. This means that Alias classes
// can, when appropriate, simply define a `Alias(Cpp &&)` constructor rather than needing to
// inherit all the base class constructors.
template <typename Class>
void construct_alias_from_cpp(std::true_type /*is_alias_constructible*/,
value_and_holder &v_h, Cpp<Class> &&base) {
v_h.value_ptr() = new Alias<Class>(std::move(base));
}
template <typename Class>
[[noreturn]] void construct_alias_from_cpp(std::false_type /*!is_alias_constructible*/,
value_and_holder &, Cpp<Class> &&) {
throw type_error("pybind11::init(): unable to convert returned instance to required "
"alias class: no `Alias<Class>(Class &&)` constructor available");
}
// Error-generating fallback for factories that don't match one of the below construction
// mechanisms.
template <typename Class>
void construct(...) {
static_assert(!std::is_same<Class, Class>::value /* always false */,
"pybind11::init(): init function must return a compatible pointer, "
"holder, or value");
}
// Pointer return v1: the factory function returns a class pointer for a registered class.
// If we don't need an alias (because this class doesn't have one, or because the final type is
// inherited on the Python side) we can simply take over ownership. Otherwise we need to try to
// construct an Alias from the returned base instance.
template <typename Class>
void construct(value_and_holder &v_h, Cpp<Class> *ptr, bool need_alias) {
no_nullptr(ptr);
if (Class::has_alias && need_alias && !is_alias<Class>(ptr)) {
// We're going to try to construct an alias by moving the cpp type. Whether or not
// that succeeds, we still need to destroy the original cpp pointer (either the
// moved away leftover, if the alias construction works, or the value itself if we
// throw an error), but we can't just call `delete ptr`: it might have a special
// deleter, or might be shared_from_this. So we construct a holder around it as if
// it was a normal instance, then steal the holder away into a local variable; thus
// the holder destruction happens when we leave the C++ scope, and the holder
// class gets to handle the destruction however it likes.
v_h.value_ptr() = ptr;
v_h.set_instance_registered(true); // To prevent init_instance from registering it
v_h.type->init_instance(v_h.inst, nullptr); // Set up the holder
Holder<Class> temp_holder(std::move(v_h.holder<Holder<Class>>())); // Steal the holder
v_h.type->dealloc(v_h); // Destroys the moved-out holder remains, resets value ptr to null
v_h.set_instance_registered(false);
construct_alias_from_cpp<Class>(is_alias_constructible<Class>{}, v_h, std::move(*ptr));
} else {
// Otherwise the type isn't inherited, so we don't need an Alias
v_h.value_ptr() = ptr;
}
}
// Pointer return v2: a factory that always returns an alias instance ptr. We simply take over
// ownership of the pointer.
template <typename Class, enable_if_t<Class::has_alias, int> = 0>
void construct(value_and_holder &v_h, Alias<Class> *alias_ptr, bool) {
no_nullptr(alias_ptr);
v_h.value_ptr() = static_cast<Cpp<Class> *>(alias_ptr);
}
// Holder return: copy its pointer, and move or copy the returned holder into the new instance's
// holder. This also handles types like std::shared_ptr<T> and std::unique_ptr<T> where T is a
// derived type (through those holders' implicit conversion from derived class holder constructors).
template <typename Class>
void construct(value_and_holder &v_h, Holder<Class> holder, bool need_alias) {
auto *ptr = holder_helper<Holder<Class>>::get(holder);
// If we need an alias, check that the held pointer is actually an alias instance
if (Class::has_alias && need_alias && !is_alias<Class>(ptr))
throw type_error("pybind11::init(): construction failed: returned holder-wrapped instance "
"is not an alias instance");
v_h.value_ptr() = ptr;
v_h.type->init_instance(v_h.inst, &holder);
}
// return-by-value version 1: returning a cpp class by value. If the class has an alias and an
// alias is required the alias must have an `Alias(Cpp &&)` constructor so that we can construct
// the alias from the base when needed (i.e. because of Python-side inheritance). When we don't
// need it, we simply move-construct the cpp value into a new instance.
template <typename Class>
void construct(value_and_holder &v_h, Cpp<Class> &&result, bool need_alias) {
static_assert(std::is_move_constructible<Cpp<Class>>::value,
"pybind11::init() return-by-value factory function requires a movable class");
if (Class::has_alias && need_alias)
construct_alias_from_cpp<Class>(is_alias_constructible<Class>{}, v_h, std::move(result));
else
v_h.value_ptr() = new Cpp<Class>(std::move(result));
}
// return-by-value version 2: returning a value of the alias type itself. We move-construct an
// Alias instance (even if no Python-side inheritance is involved). This is intended for
// cases where Alias initialization is always desired.
template <typename Class>
void construct(value_and_holder &v_h, Alias<Class> &&result, bool) {
static_assert(std::is_move_constructible<Alias<Class>>::value,
"pybind11::init() return-by-alias-value factory function requires a movable alias class");
v_h.value_ptr() = new Alias<Class>(std::move(result));
}
// Implementing class for py::init<...>()
template <typename... Args>
struct constructor {
template <typename Class, typename... Extra, enable_if_t<!Class::has_alias, int> = 0>
static void execute(Class &cl, const Extra&... extra) {
cl.def("__init__", [](value_and_holder &v_h, Args... args) {
v_h.value_ptr() = construct_or_initialize<Cpp<Class>>(std::forward<Args>(args)...);
}, is_new_style_constructor(), extra...);
}
template <typename Class, typename... Extra,
enable_if_t<Class::has_alias &&
std::is_constructible<Cpp<Class>, Args...>::value, int> = 0>
static void execute(Class &cl, const Extra&... extra) {
cl.def("__init__", [](value_and_holder &v_h, Args... args) {
if (Py_TYPE(v_h.inst) == v_h.type->type)
v_h.value_ptr() = construct_or_initialize<Cpp<Class>>(std::forward<Args>(args)...);
else
v_h.value_ptr() = construct_or_initialize<Alias<Class>>(std::forward<Args>(args)...);
}, is_new_style_constructor(), extra...);
}
template <typename Class, typename... Extra,
enable_if_t<Class::has_alias &&
!std::is_constructible<Cpp<Class>, Args...>::value, int> = 0>
static void execute(Class &cl, const Extra&... extra) {
cl.def("__init__", [](value_and_holder &v_h, Args... args) {
v_h.value_ptr() = construct_or_initialize<Alias<Class>>(std::forward<Args>(args)...);
}, is_new_style_constructor(), extra...);
}
};
// Implementing class for py::init_alias<...>()
template <typename... Args> struct alias_constructor {
template <typename Class, typename... Extra,
enable_if_t<Class::has_alias && std::is_constructible<Alias<Class>, Args...>::value, int> = 0>
static void execute(Class &cl, const Extra&... extra) {
cl.def("__init__", [](value_and_holder &v_h, Args... args) {
v_h.value_ptr() = construct_or_initialize<Alias<Class>>(std::forward<Args>(args)...);
}, is_new_style_constructor(), extra...);
}
};
// Implementation class for py::init(Func) and py::init(Func, AliasFunc)
template <typename CFunc, typename AFunc = void_type (*)(),
typename = function_signature_t<CFunc>, typename = function_signature_t<AFunc>>
struct factory;
// Specialization for py::init(Func)
template <typename Func, typename Return, typename... Args>
struct factory<Func, void_type (*)(), Return(Args...)> {
remove_reference_t<Func> class_factory;
factory(Func &&f) : class_factory(std::forward<Func>(f)) { }
// The given class either has no alias or has no separate alias factory;
// this always constructs the class itself. If the class is registered with an alias
// type and an alias instance is needed (i.e. because the final type is a Python class
// inheriting from the C++ type) the returned value needs to either already be an alias
// instance, or the alias needs to be constructible from a `Class &&` argument.
template <typename Class, typename... Extra>
void execute(Class &cl, const Extra &...extra) && {
#if defined(PYBIND11_CPP14)
cl.def("__init__", [func = std::move(class_factory)]
#else
auto &func = class_factory;
cl.def("__init__", [func]
#endif
(value_and_holder &v_h, Args... args) {
construct<Class>(v_h, func(std::forward<Args>(args)...),
Py_TYPE(v_h.inst) != v_h.type->type);
}, is_new_style_constructor(), extra...);
}
};
// Specialization for py::init(Func, AliasFunc)
template <typename CFunc, typename AFunc,
typename CReturn, typename... CArgs, typename AReturn, typename... AArgs>
struct factory<CFunc, AFunc, CReturn(CArgs...), AReturn(AArgs...)> {
static_assert(sizeof...(CArgs) == sizeof...(AArgs),
"pybind11::init(class_factory, alias_factory): class and alias factories "
"must have identical argument signatures");
static_assert(all_of<std::is_same<CArgs, AArgs>...>::value,
"pybind11::init(class_factory, alias_factory): class and alias factories "
"must have identical argument signatures");
remove_reference_t<CFunc> class_factory;
remove_reference_t<AFunc> alias_factory;
factory(CFunc &&c, AFunc &&a)
: class_factory(std::forward<CFunc>(c)), alias_factory(std::forward<AFunc>(a)) { }
// The class factory is called when the `self` type passed to `__init__` is the direct
// class (i.e. not inherited), the alias factory when `self` is a Python-side subtype.
template <typename Class, typename... Extra>
void execute(Class &cl, const Extra&... extra) && {
static_assert(Class::has_alias, "The two-argument version of `py::init()` can "
"only be used if the class has an alias");
#if defined(PYBIND11_CPP14)
cl.def("__init__", [class_func = std::move(class_factory), alias_func = std::move(alias_factory)]
#else
auto &class_func = class_factory;
auto &alias_func = alias_factory;
cl.def("__init__", [class_func, alias_func]
#endif
(value_and_holder &v_h, CArgs... args) {
if (Py_TYPE(v_h.inst) == v_h.type->type)
// If the instance type equals the registered type we don't have inheritance, so
// don't need the alias and can construct using the class function:
construct<Class>(v_h, class_func(std::forward<CArgs>(args)...), false);
else
construct<Class>(v_h, alias_func(std::forward<CArgs>(args)...), true);
}, is_new_style_constructor(), extra...);
}
};
/// Set just the C++ state. Same as `__init__`.
template <typename Class, typename T>
void setstate(value_and_holder &v_h, T &&result, bool need_alias) {
construct<Class>(v_h, std::forward<T>(result), need_alias);
}
/// Set both the C++ and Python states
template <typename Class, typename T, typename O,
enable_if_t<std::is_convertible<O, handle>::value, int> = 0>
void setstate(value_and_holder &v_h, std::pair<T, O> &&result, bool need_alias) {
construct<Class>(v_h, std::move(result.first), need_alias);
setattr((PyObject *) v_h.inst, "__dict__", result.second);
}
/// Implementation for py::pickle(GetState, SetState)
template <typename Get, typename Set,
typename = function_signature_t<Get>, typename = function_signature_t<Set>>
struct pickle_factory;
template <typename Get, typename Set,
typename RetState, typename Self, typename NewInstance, typename ArgState>
struct pickle_factory<Get, Set, RetState(Self), NewInstance(ArgState)> {
static_assert(std::is_same<intrinsic_t<RetState>, intrinsic_t<ArgState>>::value,
"The type returned by `__getstate__` must be the same "
"as the argument accepted by `__setstate__`");
remove_reference_t<Get> get;
remove_reference_t<Set> set;
pickle_factory(Get get, Set set)
: get(std::forward<Get>(get)), set(std::forward<Set>(set)) { }
template <typename Class, typename... Extra>
void execute(Class &cl, const Extra &...extra) && {
cl.def("__getstate__", std::move(get));
#if defined(PYBIND11_CPP14)
cl.def("__setstate__", [func = std::move(set)]
#else
auto &func = set;
cl.def("__setstate__", [func]
#endif
(value_and_holder &v_h, ArgState state) {
setstate<Class>(v_h, func(std::forward<ArgState>(state)),
Py_TYPE(v_h.inst) != v_h.type->type);
}, is_new_style_constructor(), extra...);
}
};
NAMESPACE_END(initimpl)
NAMESPACE_END(detail)
NAMESPACE_END(PYBIND11_NAMESPACE)

View File

@ -0,0 +1,349 @@
/*
pybind11/detail/internals.h: Internal data structure and related functions
Copyright (c) 2017 Wenzel Jakob <wenzel.jakob@epfl.ch>
All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/
#pragma once
#include "../pytypes.h"
NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
NAMESPACE_BEGIN(detail)
// Forward declarations
inline PyTypeObject *make_static_property_type();
inline PyTypeObject *make_default_metaclass();
inline PyObject *make_object_base_type(PyTypeObject *metaclass);
// The old Python Thread Local Storage (TLS) API is deprecated in Python 3.7 in favor of the new
// Thread Specific Storage (TSS) API.
#if PY_VERSION_HEX >= 0x03070000
# define PYBIND11_TLS_KEY_INIT(var) Py_tss_t *var = nullptr
# define PYBIND11_TLS_GET_VALUE(key) PyThread_tss_get((key))
# define PYBIND11_TLS_REPLACE_VALUE(key, value) PyThread_tss_set((key), (value))
# define PYBIND11_TLS_DELETE_VALUE(key) PyThread_tss_set((key), nullptr)
# define PYBIND11_TLS_FREE(key) PyThread_tss_free(key)
#else
// Usually an int but a long on Cygwin64 with Python 3.x
# define PYBIND11_TLS_KEY_INIT(var) decltype(PyThread_create_key()) var = 0
# define PYBIND11_TLS_GET_VALUE(key) PyThread_get_key_value((key))
# if PY_MAJOR_VERSION < 3
# define PYBIND11_TLS_DELETE_VALUE(key) \
PyThread_delete_key_value(key)
# define PYBIND11_TLS_REPLACE_VALUE(key, value) \
do { \
PyThread_delete_key_value((key)); \
PyThread_set_key_value((key), (value)); \
} while (false)
# else
# define PYBIND11_TLS_DELETE_VALUE(key) \
PyThread_set_key_value((key), nullptr)
# define PYBIND11_TLS_REPLACE_VALUE(key, value) \
PyThread_set_key_value((key), (value))
# endif
# define PYBIND11_TLS_FREE(key) (void)key
#endif
// Python loads modules by default with dlopen with the RTLD_LOCAL flag; under libc++ and possibly
// other STLs, this means `typeid(A)` from one module won't equal `typeid(A)` from another module
// even when `A` is the same, non-hidden-visibility type (e.g. from a common include). Under
// libstdc++, this doesn't happen: equality and the type_index hash are based on the type name,
// which works. If not under a known-good stl, provide our own name-based hash and equality
// functions that use the type name.
#if defined(__GLIBCXX__)
inline bool same_type(const std::type_info &lhs, const std::type_info &rhs) { return lhs == rhs; }
using type_hash = std::hash<std::type_index>;
using type_equal_to = std::equal_to<std::type_index>;
#else
inline bool same_type(const std::type_info &lhs, const std::type_info &rhs) {
return lhs.name() == rhs.name() || std::strcmp(lhs.name(), rhs.name()) == 0;
}
struct type_hash {
size_t operator()(const std::type_index &t) const {
size_t hash = 5381;
const char *ptr = t.name();
while (auto c = static_cast<unsigned char>(*ptr++))
hash = (hash * 33) ^ c;
return hash;
}
};
struct type_equal_to {
bool operator()(const std::type_index &lhs, const std::type_index &rhs) const {
return lhs.name() == rhs.name() || std::strcmp(lhs.name(), rhs.name()) == 0;
}
};
#endif
template <typename value_type>
using type_map = std::unordered_map<std::type_index, value_type, type_hash, type_equal_to>;
struct overload_hash {
inline size_t operator()(const std::pair<const PyObject *, const char *>& v) const {
size_t value = std::hash<const void *>()(v.first);
value ^= std::hash<const void *>()(v.second) + 0x9e3779b9 + (value<<6) + (value>>2);
return value;
}
};
/// Internal data structure used to track registered instances and types.
/// Whenever binary incompatible changes are made to this structure,
/// `PYBIND11_INTERNALS_VERSION` must be incremented.
struct internals {
type_map<type_info *> registered_types_cpp; // std::type_index -> pybind11's type information
std::unordered_map<PyTypeObject *, std::vector<type_info *>> registered_types_py; // PyTypeObject* -> base type_info(s)
std::unordered_multimap<const void *, instance*> registered_instances; // void * -> instance*
std::unordered_set<std::pair<const PyObject *, const char *>, overload_hash> inactive_overload_cache;
type_map<std::vector<bool (*)(PyObject *, void *&)>> direct_conversions;
std::unordered_map<const PyObject *, std::vector<PyObject *>> patients;
std::forward_list<void (*) (std::exception_ptr)> registered_exception_translators;
std::unordered_map<std::string, void *> shared_data; // Custom data to be shared across extensions
std::vector<PyObject *> loader_patient_stack; // Used by `loader_life_support`
std::forward_list<std::string> static_strings; // Stores the std::strings backing detail::c_str()
PyTypeObject *static_property_type;
PyTypeObject *default_metaclass;
PyObject *instance_base;
#if defined(WITH_THREAD)
PYBIND11_TLS_KEY_INIT(tstate);
PyInterpreterState *istate = nullptr;
~internals() {
// This destructor is called *after* Py_Finalize() in finalize_interpreter().
// That *SHOULD BE* fine. The following details what happens when PyThread_tss_free is called.
// PYBIND11_TLS_FREE is PyThread_tss_free on python 3.7+. On older python, it does nothing.
// PyThread_tss_free calls PyThread_tss_delete and PyMem_RawFree.
// PyThread_tss_delete just calls TlsFree (on Windows) or pthread_key_delete (on *NIX). Neither
// of those have anything to do with CPython internals.
// PyMem_RawFree *requires* that the `tstate` be allocated with the CPython allocator.
PYBIND11_TLS_FREE(tstate);
}
#endif
};
/// Additional type information which does not fit into the PyTypeObject.
/// Changes to this struct also require bumping `PYBIND11_INTERNALS_VERSION`.
struct type_info {
PyTypeObject *type;
const std::type_info *cpptype;
size_t type_size, type_align, holder_size_in_ptrs;
void *(*operator_new)(size_t);
void (*init_instance)(instance *, const void *);
void (*dealloc)(value_and_holder &v_h);
std::vector<PyObject *(*)(PyObject *, PyTypeObject *)> implicit_conversions;
std::vector<std::pair<const std::type_info *, void *(*)(void *)>> implicit_casts;
std::vector<bool (*)(PyObject *, void *&)> *direct_conversions;
buffer_info *(*get_buffer)(PyObject *, void *) = nullptr;
void *get_buffer_data = nullptr;
void *(*module_local_load)(PyObject *, const type_info *) = nullptr;
/* A simple type never occurs as a (direct or indirect) parent
* of a class that makes use of multiple inheritance */
bool simple_type : 1;
/* True if there is no multiple inheritance in this type's inheritance tree */
bool simple_ancestors : 1;
/* for base vs derived holder_type checks */
bool default_holder : 1;
/* true if this is a type registered with py::module_local */
bool module_local : 1;
};
/// Tracks the `internals` and `type_info` ABI version independent of the main library version
#define PYBIND11_INTERNALS_VERSION 4
/// On MSVC, debug and release builds are not ABI-compatible!
#if defined(_MSC_VER) && defined(_DEBUG)
# define PYBIND11_BUILD_TYPE "_debug"
#else
# define PYBIND11_BUILD_TYPE ""
#endif
/// Let's assume that different compilers are ABI-incompatible.
#if defined(_MSC_VER)
# define PYBIND11_COMPILER_TYPE "_msvc"
#elif defined(__INTEL_COMPILER)
# define PYBIND11_COMPILER_TYPE "_icc"
#elif defined(__clang__)
# define PYBIND11_COMPILER_TYPE "_clang"
#elif defined(__PGI)
# define PYBIND11_COMPILER_TYPE "_pgi"
#elif defined(__MINGW32__)
# define PYBIND11_COMPILER_TYPE "_mingw"
#elif defined(__CYGWIN__)
# define PYBIND11_COMPILER_TYPE "_gcc_cygwin"
#elif defined(__GNUC__)
# define PYBIND11_COMPILER_TYPE "_gcc"
#else
# define PYBIND11_COMPILER_TYPE "_unknown"
#endif
#if defined(_LIBCPP_VERSION)
# define PYBIND11_STDLIB "_libcpp"
#elif defined(__GLIBCXX__) || defined(__GLIBCPP__)
# define PYBIND11_STDLIB "_libstdcpp"
#else
# define PYBIND11_STDLIB ""
#endif
/// On Linux/OSX, changes in __GXX_ABI_VERSION__ indicate ABI incompatibility.
#if defined(__GXX_ABI_VERSION)
# define PYBIND11_BUILD_ABI "_cxxabi" PYBIND11_TOSTRING(__GXX_ABI_VERSION)
#else
# define PYBIND11_BUILD_ABI ""
#endif
#if defined(WITH_THREAD)
# define PYBIND11_INTERNALS_KIND ""
#else
# define PYBIND11_INTERNALS_KIND "_without_thread"
#endif
#define PYBIND11_INTERNALS_ID "__pybind11_internals_v" \
PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION) PYBIND11_INTERNALS_KIND PYBIND11_COMPILER_TYPE PYBIND11_STDLIB PYBIND11_BUILD_ABI PYBIND11_BUILD_TYPE "__"
#define PYBIND11_MODULE_LOCAL_ID "__pybind11_module_local_v" \
PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION) PYBIND11_INTERNALS_KIND PYBIND11_COMPILER_TYPE PYBIND11_STDLIB PYBIND11_BUILD_ABI PYBIND11_BUILD_TYPE "__"
/// Each module locally stores a pointer to the `internals` data. The data
/// itself is shared among modules with the same `PYBIND11_INTERNALS_ID`.
inline internals **&get_internals_pp() {
static internals **internals_pp = nullptr;
return internals_pp;
}
inline void translate_exception(std::exception_ptr p) {
try {
if (p) std::rethrow_exception(p);
} catch (error_already_set &e) { e.restore(); return;
} catch (const builtin_exception &e) { e.set_error(); return;
} catch (const std::bad_alloc &e) { PyErr_SetString(PyExc_MemoryError, e.what()); return;
} catch (const std::domain_error &e) { PyErr_SetString(PyExc_ValueError, e.what()); return;
} catch (const std::invalid_argument &e) { PyErr_SetString(PyExc_ValueError, e.what()); return;
} catch (const std::length_error &e) { PyErr_SetString(PyExc_ValueError, e.what()); return;
} catch (const std::out_of_range &e) { PyErr_SetString(PyExc_IndexError, e.what()); return;
} catch (const std::range_error &e) { PyErr_SetString(PyExc_ValueError, e.what()); return;
} catch (const std::overflow_error &e) { PyErr_SetString(PyExc_OverflowError, e.what()); return;
} catch (const std::exception &e) { PyErr_SetString(PyExc_RuntimeError, e.what()); return;
} catch (...) {
PyErr_SetString(PyExc_RuntimeError, "Caught an unknown exception!");
return;
}
}
#if !defined(__GLIBCXX__)
inline void translate_local_exception(std::exception_ptr p) {
try {
if (p) std::rethrow_exception(p);
} catch (error_already_set &e) { e.restore(); return;
} catch (const builtin_exception &e) { e.set_error(); return;
}
}
#endif
/// Return a reference to the current `internals` data
PYBIND11_NOINLINE inline internals &get_internals() {
auto **&internals_pp = get_internals_pp();
if (internals_pp && *internals_pp)
return **internals_pp;
// Ensure that the GIL is held since we will need to make Python calls.
// Cannot use py::gil_scoped_acquire here since that constructor calls get_internals.
struct gil_scoped_acquire_local {
gil_scoped_acquire_local() : state (PyGILState_Ensure()) {}
~gil_scoped_acquire_local() { PyGILState_Release(state); }
const PyGILState_STATE state;
} gil;
constexpr auto *id = PYBIND11_INTERNALS_ID;
auto builtins = handle(PyEval_GetBuiltins());
if (builtins.contains(id) && isinstance<capsule>(builtins[id])) {
internals_pp = static_cast<internals **>(capsule(builtins[id]));
// We loaded builtins through python's builtins, which means that our `error_already_set`
// and `builtin_exception` may be different local classes than the ones set up in the
// initial exception translator, below, so add another for our local exception classes.
//
// libstdc++ doesn't require this (types there are identified only by name)
#if !defined(__GLIBCXX__)
(*internals_pp)->registered_exception_translators.push_front(&translate_local_exception);
#endif
} else {
if (!internals_pp) internals_pp = new internals*();
auto *&internals_ptr = *internals_pp;
internals_ptr = new internals();
#if defined(WITH_THREAD)
PyEval_InitThreads();
PyThreadState *tstate = PyThreadState_Get();
#if PY_VERSION_HEX >= 0x03070000
internals_ptr->tstate = PyThread_tss_alloc();
if (!internals_ptr->tstate || PyThread_tss_create(internals_ptr->tstate))
pybind11_fail("get_internals: could not successfully initialize the TSS key!");
PyThread_tss_set(internals_ptr->tstate, tstate);
#else
internals_ptr->tstate = PyThread_create_key();
if (internals_ptr->tstate == -1)
pybind11_fail("get_internals: could not successfully initialize the TLS key!");
PyThread_set_key_value(internals_ptr->tstate, tstate);
#endif
internals_ptr->istate = tstate->interp;
#endif
builtins[id] = capsule(internals_pp);
internals_ptr->registered_exception_translators.push_front(&translate_exception);
internals_ptr->static_property_type = make_static_property_type();
internals_ptr->default_metaclass = make_default_metaclass();
internals_ptr->instance_base = make_object_base_type(internals_ptr->default_metaclass);
}
return **internals_pp;
}
/// Works like `internals.registered_types_cpp`, but for module-local registered types:
inline type_map<type_info *> &registered_local_types_cpp() {
static type_map<type_info *> locals{};
return locals;
}
/// Constructs a std::string with the given arguments, stores it in `internals`, and returns its
/// `c_str()`. Such string objects have a long storage duration -- the internal strings are only
/// cleared when the program exits or after interpreter shutdown (when embedding), and so are
/// suitable for c-style strings needed by Python internals (such as PyTypeObject's tp_name).
template <typename... Args>
const char *c_str(Args &&...args) {
auto &strings = get_internals().static_strings;
strings.emplace_front(std::forward<Args>(args)...);
return strings.front().c_str();
}
NAMESPACE_END(detail)
/// Returns a named pointer that is shared among all extension modules (using the same
/// pybind11 version) running in the current interpreter. Names starting with underscores
/// are reserved for internal usage. Returns `nullptr` if no matching entry was found.
inline PYBIND11_NOINLINE void *get_shared_data(const std::string &name) {
auto &internals = detail::get_internals();
auto it = internals.shared_data.find(name);
return it != internals.shared_data.end() ? it->second : nullptr;
}
/// Set the shared data that can be later recovered by `get_shared_data()`.
inline PYBIND11_NOINLINE void *set_shared_data(const std::string &name, void *data) {
detail::get_internals().shared_data[name] = data;
return data;
}
/// Returns a typed reference to a shared data entry (by using `get_shared_data()`) if
/// such entry exists. Otherwise, a new object of default-constructible type `T` is
/// added to the shared data under the given name and a reference to it is returned.
template<typename T>
T &get_or_create_shared_data(const std::string &name) {
auto &internals = detail::get_internals();
auto it = internals.shared_data.find(name);
T *ptr = (T *) (it != internals.shared_data.end() ? it->second : nullptr);
if (!ptr) {
ptr = new T();
internals.shared_data[name] = ptr;
}
return *ptr;
}
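// Hypothetical usage sketch (the entry name "frame_counter" is illustrative):
// share one counter between all extension modules built against the same
// pybind11 version in the current interpreter:
//
//   auto &count = pybind11::get_or_create_shared_data<size_t>("frame_counter");
//   ++count;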
NAMESPACE_END(PYBIND11_NAMESPACE)

View File

@ -0,0 +1,55 @@
/*
pybind11/detail/typeid.h: Compiler-independent access to type identifiers
Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/
#pragma once
#include <cstdio>
#include <cstdlib>
#if defined(__GNUG__)
#include <cxxabi.h>
#endif
#include "common.h"
NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
NAMESPACE_BEGIN(detail)
/// Erase all occurrences of a substring
inline void erase_all(std::string &string, const std::string &search) {
for (size_t pos = 0;;) {
pos = string.find(search, pos);
if (pos == std::string::npos) break;
string.erase(pos, search.length());
}
}
PYBIND11_NOINLINE inline void clean_type_id(std::string &name) {
#if defined(__GNUG__)
int status = 0;
std::unique_ptr<char, void (*)(void *)> res {
abi::__cxa_demangle(name.c_str(), nullptr, nullptr, &status), std::free };
if (status == 0)
name = res.get();
#else
detail::erase_all(name, "class ");
detail::erase_all(name, "struct ");
detail::erase_all(name, "enum ");
#endif
detail::erase_all(name, "pybind11::");
}
NAMESPACE_END(detail)
/// Return a string representation of a C++ type
template <typename T> static std::string type_id() {
std::string name(typeid(T).name());
detail::clean_type_id(name);
return name;
}
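// Usage sketch: type_id<T>() produces a human-readable (demangled where the compiler
// supports it) name, handy for diagnostics.
//
//   std::string n1 = pybind11::type_id<std::vector<int>>();   // e.g. "std::vector<int, std::allocator<int> >"
//   std::string n2 = pybind11::type_id<pybind11::object>();   // "object" (the "pybind11::" prefix is stripped)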
NAMESPACE_END(PYBIND11_NAMESPACE)

View File

@ -0,0 +1,607 @@
/*
pybind11/eigen.h: Transparent conversion for dense and sparse Eigen matrices
Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/
#pragma once
#include "numpy.h"
#if defined(__INTEL_COMPILER)
# pragma warning(disable: 1682) // implicit conversion of a 64-bit integral type to a smaller integral type (potential portability problem)
#elif defined(__GNUG__) || defined(__clang__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wconversion"
# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
# ifdef __clang__
// Eigen generates a bunch of implicit-copy-constructor-is-deprecated warnings with -Wdeprecated
// under Clang, so disable that warning here:
# pragma GCC diagnostic ignored "-Wdeprecated"
# endif
# if __GNUC__ >= 7
# pragma GCC diagnostic ignored "-Wint-in-bool-context"
# endif
#endif
#if defined(_MSC_VER)
# pragma warning(push)
# pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
# pragma warning(disable: 4996) // warning C4996: std::unary_negate is deprecated in C++17
#endif
#include <Eigen/Core>
#include <Eigen/SparseCore>
// Eigen prior to 3.2.7 doesn't have proper move constructors--but worse, some classes get implicit
// move constructors that break things. We could detect this and explicitly copy, but an extra copy
// of matrices seems highly undesirable.
static_assert(EIGEN_VERSION_AT_LEAST(3,2,7), "Eigen support in pybind11 requires Eigen >= 3.2.7");
NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
// Provide a convenience alias for easier pass-by-ref usage with fully dynamic strides:
using EigenDStride = Eigen::Stride<Eigen::Dynamic, Eigen::Dynamic>;
template <typename MatrixType> using EigenDRef = Eigen::Ref<MatrixType, 0, EigenDStride>;
template <typename MatrixType> using EigenDMap = Eigen::Map<MatrixType, 0, EigenDStride>;
NAMESPACE_BEGIN(detail)
#if EIGEN_VERSION_AT_LEAST(3,3,0)
using EigenIndex = Eigen::Index;
#else
using EigenIndex = EIGEN_DEFAULT_DENSE_INDEX_TYPE;
#endif
// Matches Eigen::Map, Eigen::Ref, blocks, etc:
template <typename T> using is_eigen_dense_map = all_of<is_template_base_of<Eigen::DenseBase, T>, std::is_base_of<Eigen::MapBase<T, Eigen::ReadOnlyAccessors>, T>>;
template <typename T> using is_eigen_mutable_map = std::is_base_of<Eigen::MapBase<T, Eigen::WriteAccessors>, T>;
template <typename T> using is_eigen_dense_plain = all_of<negation<is_eigen_dense_map<T>>, is_template_base_of<Eigen::PlainObjectBase, T>>;
template <typename T> using is_eigen_sparse = is_template_base_of<Eigen::SparseMatrixBase, T>;
// Test for objects inheriting from EigenBase<Derived> that aren't captured by the above. This
// basically covers anything that can be assigned to a dense matrix but that doesn't have a typical
// matrix data layout that can be copied from their .data(). For example, DiagonalMatrix and
// SelfAdjointView fall into this category.
template <typename T> using is_eigen_other = all_of<
is_template_base_of<Eigen::EigenBase, T>,
negation<any_of<is_eigen_dense_map<T>, is_eigen_dense_plain<T>, is_eigen_sparse<T>>>
>;
// Captures numpy/eigen conformability status (returned by EigenProps::conformable()):
template <bool EigenRowMajor> struct EigenConformable {
bool conformable = false;
EigenIndex rows = 0, cols = 0;
EigenDStride stride{0, 0}; // Only valid if negativestrides is false!
bool negativestrides = false; // If true, do not use stride!
EigenConformable(bool fits = false) : conformable{fits} {}
// Matrix type:
EigenConformable(EigenIndex r, EigenIndex c,
EigenIndex rstride, EigenIndex cstride) :
conformable{true}, rows{r}, cols{c} {
// TODO: when Eigen bug #747 is fixed, remove the tests for non-negativity. http://eigen.tuxfamily.org/bz/show_bug.cgi?id=747
if (rstride < 0 || cstride < 0) {
negativestrides = true;
} else {
stride = {EigenRowMajor ? rstride : cstride /* outer stride */,
EigenRowMajor ? cstride : rstride /* inner stride */ };
}
}
// Vector type:
EigenConformable(EigenIndex r, EigenIndex c, EigenIndex stride)
: EigenConformable(r, c, r == 1 ? c*stride : stride, c == 1 ? r : r*stride) {}
template <typename props> bool stride_compatible() const {
// To have compatible strides, we need (on both dimensions) one of fully dynamic strides,
// matching strides, or a dimension size of 1 (in which case the stride value is irrelevant)
return
!negativestrides &&
(props::inner_stride == Eigen::Dynamic || props::inner_stride == stride.inner() ||
(EigenRowMajor ? cols : rows) == 1) &&
(props::outer_stride == Eigen::Dynamic || props::outer_stride == stride.outer() ||
(EigenRowMajor ? rows : cols) == 1);
}
operator bool() const { return conformable; }
};
template <typename Type> struct eigen_extract_stride { using type = Type; };
template <typename PlainObjectType, int MapOptions, typename StrideType>
struct eigen_extract_stride<Eigen::Map<PlainObjectType, MapOptions, StrideType>> { using type = StrideType; };
template <typename PlainObjectType, int Options, typename StrideType>
struct eigen_extract_stride<Eigen::Ref<PlainObjectType, Options, StrideType>> { using type = StrideType; };
// Helper struct for extracting information from an Eigen type
template <typename Type_> struct EigenProps {
using Type = Type_;
using Scalar = typename Type::Scalar;
using StrideType = typename eigen_extract_stride<Type>::type;
static constexpr EigenIndex
rows = Type::RowsAtCompileTime,
cols = Type::ColsAtCompileTime,
size = Type::SizeAtCompileTime;
static constexpr bool
row_major = Type::IsRowMajor,
vector = Type::IsVectorAtCompileTime, // At least one dimension has fixed size 1
fixed_rows = rows != Eigen::Dynamic,
fixed_cols = cols != Eigen::Dynamic,
fixed = size != Eigen::Dynamic, // Fully-fixed size
dynamic = !fixed_rows && !fixed_cols; // Fully-dynamic size
template <EigenIndex i, EigenIndex ifzero> using if_zero = std::integral_constant<EigenIndex, i == 0 ? ifzero : i>;
static constexpr EigenIndex inner_stride = if_zero<StrideType::InnerStrideAtCompileTime, 1>::value,
outer_stride = if_zero<StrideType::OuterStrideAtCompileTime,
vector ? size : row_major ? cols : rows>::value;
static constexpr bool dynamic_stride = inner_stride == Eigen::Dynamic && outer_stride == Eigen::Dynamic;
static constexpr bool requires_row_major = !dynamic_stride && !vector && (row_major ? inner_stride : outer_stride) == 1;
static constexpr bool requires_col_major = !dynamic_stride && !vector && (row_major ? outer_stride : inner_stride) == 1;
// Takes an input array and determines whether we can make it fit into the Eigen type. If
// the array is a vector, we attempt to fit it into either an Eigen 1xN or Nx1 vector
// (preferring the latter if it will fit in either, i.e. for a fully dynamic matrix type).
static EigenConformable<row_major> conformable(const array &a) {
const auto dims = a.ndim();
if (dims < 1 || dims > 2)
return false;
if (dims == 2) { // Matrix type: require exact match (or dynamic)
EigenIndex
np_rows = a.shape(0),
np_cols = a.shape(1),
np_rstride = a.strides(0) / static_cast<ssize_t>(sizeof(Scalar)),
np_cstride = a.strides(1) / static_cast<ssize_t>(sizeof(Scalar));
if ((fixed_rows && np_rows != rows) || (fixed_cols && np_cols != cols))
return false;
return {np_rows, np_cols, np_rstride, np_cstride};
}
// Otherwise we're storing an n-vector. Only one of the strides will be used, but whichever
// is used, we want the (single) numpy stride value.
const EigenIndex n = a.shape(0),
stride = a.strides(0) / static_cast<ssize_t>(sizeof(Scalar));
if (vector) { // Eigen type is a compile-time vector
if (fixed && size != n)
return false; // Vector size mismatch
return {rows == 1 ? 1 : n, cols == 1 ? 1 : n, stride};
}
else if (fixed) {
// The type has a fixed size, but is not a vector: abort
return false;
}
else if (fixed_cols) {
// Since this isn't a vector, cols must be != 1. We allow this only if it exactly
// equals the number of elements (rows is Dynamic, and so 1 row is allowed).
if (cols != n) return false;
return {1, n, stride};
}
else {
// Otherwise it's either fully dynamic, or column dynamic; both become a column vector
if (fixed_rows && rows != n) return false;
return {n, 1, stride};
}
}
static constexpr bool show_writeable = is_eigen_dense_map<Type>::value && is_eigen_mutable_map<Type>::value;
static constexpr bool show_order = is_eigen_dense_map<Type>::value;
static constexpr bool show_c_contiguous = show_order && requires_row_major;
static constexpr bool show_f_contiguous = !show_c_contiguous && show_order && requires_col_major;
static constexpr auto descriptor =
_("numpy.ndarray[") + npy_format_descriptor<Scalar>::name +
_("[") + _<fixed_rows>(_<(size_t) rows>(), _("m")) +
_(", ") + _<fixed_cols>(_<(size_t) cols>(), _("n")) +
_("]") +
// For a reference type (e.g. Ref<MatrixXd>) we have other constraints that might need to be
// satisfied: writeable=True (for a mutable reference), and, depending on the map's stride
// options, possibly f_contiguous or c_contiguous. We include them in the descriptor output
// to provide some hint as to why a TypeError is occurring (otherwise it can be confusing to
// see that a function accepts a 'numpy.ndarray[float64[3,2]]' and an error message that you
// *gave* a numpy.ndarray of the right type and dimensions).
_<show_writeable>(", flags.writeable", "") +
_<show_c_contiguous>(", flags.c_contiguous", "") +
_<show_f_contiguous>(", flags.f_contiguous", "") +
_("]");
};
// Casts an Eigen type to numpy array. If given a base, the numpy array references the src data,
// otherwise it'll make a copy. writeable lets you turn off the writeable flag for the array.
template <typename props> handle eigen_array_cast(typename props::Type const &src, handle base = handle(), bool writeable = true) {
constexpr ssize_t elem_size = sizeof(typename props::Scalar);
array a;
if (props::vector)
a = array({ src.size() }, { elem_size * src.innerStride() }, src.data(), base);
else
a = array({ src.rows(), src.cols() }, { elem_size * src.rowStride(), elem_size * src.colStride() },
src.data(), base);
if (!writeable)
array_proxy(a.ptr())->flags &= ~detail::npy_api::NPY_ARRAY_WRITEABLE_;
return a.release();
}
// Takes an lvalue ref to some Eigen type and a (python) base object, creating a numpy array that
// references the Eigen object's data with `base` as the python-registered base class (if omitted,
// the base will be set to None, and lifetime management is up to the caller). The numpy array is
// non-writeable if the given type is const.
template <typename props, typename Type>
handle eigen_ref_array(Type &src, handle parent = none()) {
// none here is to get past array's should-we-copy detection, which currently always
// copies when there is no base. Setting the base to None should be harmless.
return eigen_array_cast<props>(src, parent, !std::is_const<Type>::value);
}
// Takes a pointer to some dense, plain Eigen type, builds a capsule around it, then returns a numpy
// array that references the encapsulated data with a python-side reference to the capsule to tie
// its destruction to that of any dependent python objects. Const-ness is determined by whether or
// not the Type of the pointer given is const.
template <typename props, typename Type, typename = enable_if_t<is_eigen_dense_plain<Type>::value>>
handle eigen_encapsulate(Type *src) {
capsule base(src, [](void *o) { delete static_cast<Type *>(o); });
return eigen_ref_array<props>(*src, base);
}
// Type caster for regular, dense matrix types (e.g. MatrixXd), but not maps/refs/etc. of dense
// types.
template<typename Type>
struct type_caster<Type, enable_if_t<is_eigen_dense_plain<Type>::value>> {
using Scalar = typename Type::Scalar;
using props = EigenProps<Type>;
bool load(handle src, bool convert) {
// If we're in no-convert mode, only load if given an array of the correct type
if (!convert && !isinstance<array_t<Scalar>>(src))
return false;
// Coerce into an array, but don't do type conversion yet; the copy below handles it.
auto buf = array::ensure(src);
if (!buf)
return false;
auto dims = buf.ndim();
if (dims < 1 || dims > 2)
return false;
auto fits = props::conformable(buf);
if (!fits)
return false;
// Allocate the new type, then build a numpy reference into it
value = Type(fits.rows, fits.cols);
auto ref = reinterpret_steal<array>(eigen_ref_array<props>(value));
if (dims == 1) ref = ref.squeeze();
else if (ref.ndim() == 1) buf = buf.squeeze();
int result = detail::npy_api::get().PyArray_CopyInto_(ref.ptr(), buf.ptr());
if (result < 0) { // Copy failed!
PyErr_Clear();
return false;
}
return true;
}
private:
// Cast implementation
template <typename CType>
static handle cast_impl(CType *src, return_value_policy policy, handle parent) {
switch (policy) {
case return_value_policy::take_ownership:
case return_value_policy::automatic:
return eigen_encapsulate<props>(src);
case return_value_policy::move:
return eigen_encapsulate<props>(new CType(std::move(*src)));
case return_value_policy::copy:
return eigen_array_cast<props>(*src);
case return_value_policy::reference:
case return_value_policy::automatic_reference:
return eigen_ref_array<props>(*src);
case return_value_policy::reference_internal:
return eigen_ref_array<props>(*src, parent);
default:
throw cast_error("unhandled return_value_policy: should not happen!");
};
}
public:
// Normal returned non-reference, non-const value:
static handle cast(Type &&src, return_value_policy /* policy */, handle parent) {
return cast_impl(&src, return_value_policy::move, parent);
}
// If you return a non-reference const, we mark the numpy array readonly:
static handle cast(const Type &&src, return_value_policy /* policy */, handle parent) {
return cast_impl(&src, return_value_policy::move, parent);
}
// lvalue reference return; default (automatic) becomes copy
static handle cast(Type &src, return_value_policy policy, handle parent) {
if (policy == return_value_policy::automatic || policy == return_value_policy::automatic_reference)
policy = return_value_policy::copy;
return cast_impl(&src, policy, parent);
}
// const lvalue reference return; default (automatic) becomes copy
static handle cast(const Type &src, return_value_policy policy, handle parent) {
if (policy == return_value_policy::automatic || policy == return_value_policy::automatic_reference)
policy = return_value_policy::copy;
return cast(&src, policy, parent);
}
// non-const pointer return
static handle cast(Type *src, return_value_policy policy, handle parent) {
return cast_impl(src, policy, parent);
}
// const pointer return
static handle cast(const Type *src, return_value_policy policy, handle parent) {
return cast_impl(src, policy, parent);
}
static constexpr auto name = props::descriptor;
operator Type*() { return &value; }
operator Type&() { return value; }
operator Type&&() && { return std::move(value); }
template <typename T> using cast_op_type = movable_cast_op_type<T>;
private:
Type value;
};
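// Usage sketch (assuming the usual `namespace py = pybind11;` alias; function and module
// names are illustrative): with this caster, plain dense Eigen values are exchanged with
// numpy arrays by copy.
//
//   #include <pybind11/eigen.h>
//
//   Eigen::MatrixXd scale(const Eigen::MatrixXd &m, double s) { return m * s; }
//
//   PYBIND11_MODULE(example, mod) {
//       mod.def("scale", &scale);   // Python: example.scale(np.ones((2, 3)), 2.0)
//   }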
// Base class for casting reference/map/block/etc. objects back to python.
template <typename MapType> struct eigen_map_caster {
private:
using props = EigenProps<MapType>;
public:
// Directly referencing a ref/map's data is a bit dangerous (whatever the map/ref points to has
// to stay around), but we'll allow it under the assumption that you know what you're doing (and
// have an appropriate keep_alive in place). We return a numpy array pointing directly at the
// ref's data (The numpy array ends up read-only if the ref was to a const matrix type.) Note
// that this means you need to ensure you don't destroy the object in some other way (e.g. with
// an appropriate keep_alive, or with a reference to a statically allocated matrix).
static handle cast(const MapType &src, return_value_policy policy, handle parent) {
switch (policy) {
case return_value_policy::copy:
return eigen_array_cast<props>(src);
case return_value_policy::reference_internal:
return eigen_array_cast<props>(src, parent, is_eigen_mutable_map<MapType>::value);
case return_value_policy::reference:
case return_value_policy::automatic:
case return_value_policy::automatic_reference:
return eigen_array_cast<props>(src, none(), is_eigen_mutable_map<MapType>::value);
default:
// move, take_ownership don't make any sense for a ref/map:
pybind11_fail("Invalid return_value_policy for Eigen Map/Ref/Block type");
}
}
static constexpr auto name = props::descriptor;
// Explicitly delete these: we do not support python -> C++ conversion on these (i.e. they can be
// return types but not bound arguments). We still provide them (explicitly deleted) so that
// you end up here if you try anyway.
bool load(handle, bool) = delete;
operator MapType() = delete;
template <typename> using cast_op_type = MapType;
};
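// Usage sketch (class and member names are illustrative; assumes a module object `m` and
// `namespace py = pybind11;`): returning a Ref bound to C++ storage.
// return_value_policy::reference_internal keeps the owner alive while the numpy view exists.
//
//   struct Holder { Eigen::MatrixXd data = Eigen::MatrixXd::Zero(3, 3); };
//
//   py::class_<Holder>(m, "Holder")
//       .def(py::init<>())
//       .def("view",
//            [](Holder &h) -> Eigen::Ref<Eigen::MatrixXd> { return h.data; },
//            py::return_value_policy::reference_internal);   // numpy array aliases h.data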
// We can return any map-like object (but can only load Refs, specialized next):
template <typename Type> struct type_caster<Type, enable_if_t<is_eigen_dense_map<Type>::value>>
: eigen_map_caster<Type> {};
// Loader for Ref<...> arguments. See the documentation for info on how to make this work without
// copying (it requires some extra effort in many cases).
template <typename PlainObjectType, typename StrideType>
struct type_caster<
Eigen::Ref<PlainObjectType, 0, StrideType>,
enable_if_t<is_eigen_dense_map<Eigen::Ref<PlainObjectType, 0, StrideType>>::value>
> : public eigen_map_caster<Eigen::Ref<PlainObjectType, 0, StrideType>> {
private:
using Type = Eigen::Ref<PlainObjectType, 0, StrideType>;
using props = EigenProps<Type>;
using Scalar = typename props::Scalar;
using MapType = Eigen::Map<PlainObjectType, 0, StrideType>;
using Array = array_t<Scalar, array::forcecast |
((props::row_major ? props::inner_stride : props::outer_stride) == 1 ? array::c_style :
(props::row_major ? props::outer_stride : props::inner_stride) == 1 ? array::f_style : 0)>;
static constexpr bool need_writeable = is_eigen_mutable_map<Type>::value;
// Delay construction (these have no default constructor)
std::unique_ptr<MapType> map;
std::unique_ptr<Type> ref;
// Our array. When possible, this is just a numpy array pointing to the source data, but
// sometimes we can't avoid copying (e.g. input is not a numpy array at all, has an incompatible
// layout, or is an array of a type that needs to be converted). Using a numpy temporary
// (rather than an Eigen temporary) saves an extra copy when we need both type conversion and
// storage order conversion. (Note that we refuse to use this temporary copy when loading an
// argument for a Ref<M> with M non-const, i.e. a read-write reference).
Array copy_or_ref;
public:
bool load(handle src, bool convert) {
// First check whether what we have is already an array of the right type. If not, we can't
// avoid a copy (because the copy is also going to do type conversion).
bool need_copy = !isinstance<Array>(src);
EigenConformable<props::row_major> fits;
if (!need_copy) {
// We don't need a converting copy, but we also need to check whether the strides are
// compatible with the Ref's stride requirements
Array aref = reinterpret_borrow<Array>(src);
if (aref && (!need_writeable || aref.writeable())) {
fits = props::conformable(aref);
if (!fits) return false; // Incompatible dimensions
if (!fits.template stride_compatible<props>())
need_copy = true;
else
copy_or_ref = std::move(aref);
}
else {
need_copy = true;
}
}
if (need_copy) {
// We need to copy: If we need a mutable reference, or we're not supposed to convert
// (either because we're in the no-convert overload pass, or because we're explicitly
// instructed not to copy via `py::arg().noconvert()`), we have to fail loading.
if (!convert || need_writeable) return false;
Array copy = Array::ensure(src);
if (!copy) return false;
fits = props::conformable(copy);
if (!fits || !fits.template stride_compatible<props>())
return false;
copy_or_ref = std::move(copy);
loader_life_support::add_patient(copy_or_ref);
}
ref.reset();
map.reset(new MapType(data(copy_or_ref), fits.rows, fits.cols, make_stride(fits.stride.outer(), fits.stride.inner())));
ref.reset(new Type(*map));
return true;
}
operator Type*() { return ref.get(); }
operator Type&() { return *ref; }
template <typename _T> using cast_op_type = pybind11::detail::cast_op_type<_T>;
private:
template <typename T = Type, enable_if_t<is_eigen_mutable_map<T>::value, int> = 0>
Scalar *data(Array &a) { return a.mutable_data(); }
template <typename T = Type, enable_if_t<!is_eigen_mutable_map<T>::value, int> = 0>
const Scalar *data(Array &a) { return a.data(); }
// Attempt to figure out a constructor of `Stride` that will work.
// If both strides are fixed, use a default constructor:
template <typename S> using stride_ctor_default = bool_constant<
S::InnerStrideAtCompileTime != Eigen::Dynamic && S::OuterStrideAtCompileTime != Eigen::Dynamic &&
std::is_default_constructible<S>::value>;
// Otherwise, if there is a two-index constructor, assume it is (outer,inner) like
// Eigen::Stride, and use it:
template <typename S> using stride_ctor_dual = bool_constant<
!stride_ctor_default<S>::value && std::is_constructible<S, EigenIndex, EigenIndex>::value>;
// Otherwise, if there is a one-index constructor, and just one of the strides is dynamic, use
// it (passing whichever stride is dynamic).
template <typename S> using stride_ctor_outer = bool_constant<
!any_of<stride_ctor_default<S>, stride_ctor_dual<S>>::value &&
S::OuterStrideAtCompileTime == Eigen::Dynamic && S::InnerStrideAtCompileTime != Eigen::Dynamic &&
std::is_constructible<S, EigenIndex>::value>;
template <typename S> using stride_ctor_inner = bool_constant<
!any_of<stride_ctor_default<S>, stride_ctor_dual<S>>::value &&
S::InnerStrideAtCompileTime == Eigen::Dynamic && S::OuterStrideAtCompileTime != Eigen::Dynamic &&
std::is_constructible<S, EigenIndex>::value>;
template <typename S = StrideType, enable_if_t<stride_ctor_default<S>::value, int> = 0>
static S make_stride(EigenIndex, EigenIndex) { return S(); }
template <typename S = StrideType, enable_if_t<stride_ctor_dual<S>::value, int> = 0>
static S make_stride(EigenIndex outer, EigenIndex inner) { return S(outer, inner); }
template <typename S = StrideType, enable_if_t<stride_ctor_outer<S>::value, int> = 0>
static S make_stride(EigenIndex outer, EigenIndex) { return S(outer); }
template <typename S = StrideType, enable_if_t<stride_ctor_inner<S>::value, int> = 0>
static S make_stride(EigenIndex, EigenIndex inner) { return S(inner); }
};
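// Usage sketch (assumes a module object `m` and `namespace py = pybind11;`): Ref<const ...>
// arguments can view compatible numpy arrays without copying; a mutable Ref additionally
// needs a writeable, correctly-strided array, and py::arg().noconvert() forbids the silent
// converting copy.
//
//   m.def("vec_sum", [](Eigen::Ref<const Eigen::VectorXd> v) { return v.sum(); });
//   m.def("scale_inplace",
//         [](Eigen::Ref<Eigen::VectorXd> v, double s) { v *= s; },
//         py::arg("v").noconvert(), py::arg("s"));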
// type_caster for special matrix types (e.g. DiagonalMatrix), which are EigenBase, but not
// EigenDense (i.e. they don't have a data(), at least not with the usual matrix layout).
// load() is not supported, but we can cast them into the python domain by first copying to a
// regular Eigen::Matrix, then casting that.
template <typename Type>
struct type_caster<Type, enable_if_t<is_eigen_other<Type>::value>> {
protected:
using Matrix = Eigen::Matrix<typename Type::Scalar, Type::RowsAtCompileTime, Type::ColsAtCompileTime>;
using props = EigenProps<Matrix>;
public:
static handle cast(const Type &src, return_value_policy /* policy */, handle /* parent */) {
handle h = eigen_encapsulate<props>(new Matrix(src));
return h;
}
static handle cast(const Type *src, return_value_policy policy, handle parent) { return cast(*src, policy, parent); }
static constexpr auto name = props::descriptor;
// Explicitly delete these: we do not support python -> C++ conversion on these (i.e. they can be
// return types but not bound arguments). We still provide them (explicitly deleted) so that
// you end up here if you try anyway.
bool load(handle, bool) = delete;
operator Type() = delete;
template <typename> using cast_op_type = Type;
};
template<typename Type>
struct type_caster<Type, enable_if_t<is_eigen_sparse<Type>::value>> {
typedef typename Type::Scalar Scalar;
typedef remove_reference_t<decltype(*std::declval<Type>().outerIndexPtr())> StorageIndex;
typedef typename Type::Index Index;
static constexpr bool rowMajor = Type::IsRowMajor;
bool load(handle src, bool) {
if (!src)
return false;
auto obj = reinterpret_borrow<object>(src);
object sparse_module = module::import("scipy.sparse");
object matrix_type = sparse_module.attr(
rowMajor ? "csr_matrix" : "csc_matrix");
if (!obj.get_type().is(matrix_type)) {
try {
obj = matrix_type(obj);
} catch (const error_already_set &) {
return false;
}
}
auto values = array_t<Scalar>((object) obj.attr("data"));
auto innerIndices = array_t<StorageIndex>((object) obj.attr("indices"));
auto outerIndices = array_t<StorageIndex>((object) obj.attr("indptr"));
auto shape = pybind11::tuple((pybind11::object) obj.attr("shape"));
auto nnz = obj.attr("nnz").cast<Index>();
if (!values || !innerIndices || !outerIndices)
return false;
value = Eigen::MappedSparseMatrix<Scalar, Type::Flags, StorageIndex>(
shape[0].cast<Index>(), shape[1].cast<Index>(), nnz,
outerIndices.mutable_data(), innerIndices.mutable_data(), values.mutable_data());
return true;
}
static handle cast(const Type &src, return_value_policy /* policy */, handle /* parent */) {
const_cast<Type&>(src).makeCompressed();
object matrix_type = module::import("scipy.sparse").attr(
rowMajor ? "csr_matrix" : "csc_matrix");
array data(src.nonZeros(), src.valuePtr());
array outerIndices((rowMajor ? src.rows() : src.cols()) + 1, src.outerIndexPtr());
array innerIndices(src.nonZeros(), src.innerIndexPtr());
return matrix_type(
std::make_tuple(data, innerIndices, outerIndices),
std::make_pair(src.rows(), src.cols())
).release();
}
PYBIND11_TYPE_CASTER(Type, _<(Type::IsRowMajor) != 0>("scipy.sparse.csr_matrix[", "scipy.sparse.csc_matrix[")
+ npy_format_descriptor<Scalar>::name + _("]"));
};
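// Usage sketch (function name illustrative; scipy must be importable at runtime): sparse
// Eigen matrices are exchanged with scipy.sparse matrices, csc_matrix for column-major
// storage and csr_matrix for row-major.
//
//   m.def("sparse_eye", [](int n) {
//       Eigen::SparseMatrix<double> s(n, n);   // column-major by default
//       s.setIdentity();
//       return s;                              // arrives as scipy.sparse.csc_matrix
//   });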
NAMESPACE_END(detail)
NAMESPACE_END(PYBIND11_NAMESPACE)
#if defined(__GNUG__) || defined(__clang__)
# pragma GCC diagnostic pop
#elif defined(_MSC_VER)
# pragma warning(pop)
#endif

View File

@ -0,0 +1,202 @@
/*
pybind11/embed.h: Support for embedding the interpreter
Copyright (c) 2017 Wenzel Jakob <wenzel.jakob@epfl.ch>
All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/
#pragma once
#include "pybind11.h"
#include "eval.h"
#if defined(PYPY_VERSION)
# error Embedding the interpreter is not supported with PyPy
#endif
#if PY_MAJOR_VERSION >= 3
# define PYBIND11_EMBEDDED_MODULE_IMPL(name) \
extern "C" PyObject *pybind11_init_impl_##name(); \
extern "C" PyObject *pybind11_init_impl_##name() { \
return pybind11_init_wrapper_##name(); \
}
#else
# define PYBIND11_EMBEDDED_MODULE_IMPL(name) \
extern "C" void pybind11_init_impl_##name(); \
extern "C" void pybind11_init_impl_##name() { \
pybind11_init_wrapper_##name(); \
}
#endif
/** \rst
Add a new module to the table of builtins for the interpreter. Must be
defined in global scope. The first macro parameter is the name of the
module (without quotes). The second parameter is the variable which will
be used as the interface to add functions and classes to the module.
.. code-block:: cpp
PYBIND11_EMBEDDED_MODULE(example, m) {
// ... initialize functions and classes here
m.def("foo", []() {
return "Hello, World!";
});
}
\endrst */
#define PYBIND11_EMBEDDED_MODULE(name, variable) \
static void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &); \
static PyObject PYBIND11_CONCAT(*pybind11_init_wrapper_, name)() { \
auto m = pybind11::module(PYBIND11_TOSTRING(name)); \
try { \
PYBIND11_CONCAT(pybind11_init_, name)(m); \
return m.ptr(); \
} catch (pybind11::error_already_set &e) { \
PyErr_SetString(PyExc_ImportError, e.what()); \
return nullptr; \
} catch (const std::exception &e) { \
PyErr_SetString(PyExc_ImportError, e.what()); \
return nullptr; \
} \
} \
PYBIND11_EMBEDDED_MODULE_IMPL(name) \
pybind11::detail::embedded_module name(PYBIND11_TOSTRING(name), \
PYBIND11_CONCAT(pybind11_init_impl_, name)); \
void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &variable)
NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
NAMESPACE_BEGIN(detail)
/// Python 2.7/3.x compatible version of `PyImport_AppendInittab` and error checks.
struct embedded_module {
#if PY_MAJOR_VERSION >= 3
using init_t = PyObject *(*)();
#else
using init_t = void (*)();
#endif
embedded_module(const char *name, init_t init) {
if (Py_IsInitialized())
pybind11_fail("Can't add new modules after the interpreter has been initialized");
auto result = PyImport_AppendInittab(name, init);
if (result == -1)
pybind11_fail("Insufficient memory to add a new module");
}
};
NAMESPACE_END(detail)
/** \rst
Initialize the Python interpreter. No other pybind11 or CPython API functions can be
called before this is done, with the exception of `PYBIND11_EMBEDDED_MODULE`. The
optional parameter can be used to skip the registration of signal handlers (see the
`Python documentation`_ for details). Calling this function again after the interpreter
has already been initialized is a fatal error.
If initializing the Python interpreter fails, then the program is terminated. (This
is controlled by the CPython runtime and is an exception to pybind11's normal behavior
of throwing exceptions on errors.)
.. _Python documentation: https://docs.python.org/3/c-api/init.html#c.Py_InitializeEx
\endrst */
inline void initialize_interpreter(bool init_signal_handlers = true) {
if (Py_IsInitialized())
pybind11_fail("The interpreter is already running");
Py_InitializeEx(init_signal_handlers ? 1 : 0);
// Make .py files in the working directory available by default
module::import("sys").attr("path").cast<list>().append(".");
}
/** \rst
Shut down the Python interpreter. No pybind11 or CPython API functions can be called
after this. In addition, pybind11 objects must not outlive the interpreter:
.. code-block:: cpp
{ // BAD
py::initialize_interpreter();
auto hello = py::str("Hello, World!");
py::finalize_interpreter();
} // <-- BOOM, hello's destructor is called after interpreter shutdown
{ // GOOD
py::initialize_interpreter();
{ // scoped
auto hello = py::str("Hello, World!");
} // <-- OK, hello is cleaned up properly
py::finalize_interpreter();
}
{ // BETTER
py::scoped_interpreter guard{};
auto hello = py::str("Hello, World!");
}
.. warning::
The interpreter can be restarted by calling `initialize_interpreter` again.
Modules created using pybind11 can be safely re-initialized. However, Python
itself cannot completely unload binary extension modules and there are several
caveats with regard to interpreter restarting. All the details can be found
in the CPython documentation. In short, not all interpreter memory may be
freed, either due to reference cycles or user-created global data.
\endrst */
inline void finalize_interpreter() {
handle builtins(PyEval_GetBuiltins());
const char *id = PYBIND11_INTERNALS_ID;
// Get the internals pointer (without creating it if it doesn't exist). It's possible for the
// internals to be created during Py_Finalize() (e.g. if a py::capsule calls `get_internals()`
// during destruction), so we get the pointer-pointer here and check it after Py_Finalize().
detail::internals **internals_ptr_ptr = detail::get_internals_pp();
// It could also be stashed in builtins, so look there too:
if (builtins.contains(id) && isinstance<capsule>(builtins[id]))
internals_ptr_ptr = capsule(builtins[id]);
Py_Finalize();
if (internals_ptr_ptr) {
delete *internals_ptr_ptr;
*internals_ptr_ptr = nullptr;
}
}
/** \rst
Scope guard version of `initialize_interpreter` and `finalize_interpreter`.
This is a move-only guard and only a single instance can exist.
.. code-block:: cpp
#include <pybind11/embed.h>
int main() {
py::scoped_interpreter guard{};
py::print("Hello, World!");
} // <-- interpreter shutdown
\endrst */
class scoped_interpreter {
public:
scoped_interpreter(bool init_signal_handlers = true) {
initialize_interpreter(init_signal_handlers);
}
scoped_interpreter(const scoped_interpreter &) = delete;
scoped_interpreter(scoped_interpreter &&other) noexcept { other.is_valid = false; }
scoped_interpreter &operator=(const scoped_interpreter &) = delete;
scoped_interpreter &operator=(scoped_interpreter &&) = delete;
~scoped_interpreter() {
if (is_valid)
finalize_interpreter();
}
private:
bool is_valid = true;
};
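// Usage sketch (module and function names are illustrative; assumes `namespace py = pybind11;`):
// a minimal embedding program that combines PYBIND11_EMBEDDED_MODULE with the scope guard above.
//
//   PYBIND11_EMBEDDED_MODULE(fast_calc, m) {
//       m.def("add", [](int a, int b) { return a + b; });
//   }
//
//   int main() {
//       py::scoped_interpreter guard{};                      // start, auto-finalize on exit
//       py::module calc = py::module::import("fast_calc");
//       int result = calc.attr("add")(1, 2).cast<int>();
//       py::print("1 + 2 =", result);
//   }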
NAMESPACE_END(PYBIND11_NAMESPACE)

View File

@ -0,0 +1,117 @@
/*
pybind11/eval.h: Support for evaluating Python expressions and statements
from strings and files
Copyright (c) 2016 Klemens Morgenstern <klemens.morgenstern@ed-chemnitz.de> and
Wenzel Jakob <wenzel.jakob@epfl.ch>
All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/
#pragma once
#include "pybind11.h"
NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
enum eval_mode {
/// Evaluate a string containing an isolated expression
eval_expr,
/// Evaluate a string containing a single statement. Returns \c none
eval_single_statement,
/// Evaluate a string containing a sequence of statements. Returns \c none
eval_statements
};
template <eval_mode mode = eval_expr>
object eval(str expr, object global = globals(), object local = object()) {
if (!local)
local = global;
/* PyRun_String does not accept a PyObject / encoding specifier,
this seems to be the only alternative */
std::string buffer = "# -*- coding: utf-8 -*-\n" + (std::string) expr;
int start;
switch (mode) {
case eval_expr: start = Py_eval_input; break;
case eval_single_statement: start = Py_single_input; break;
case eval_statements: start = Py_file_input; break;
default: pybind11_fail("invalid evaluation mode");
}
PyObject *result = PyRun_String(buffer.c_str(), start, global.ptr(), local.ptr());
if (!result)
throw error_already_set();
return reinterpret_steal<object>(result);
}
template <eval_mode mode = eval_expr, size_t N>
object eval(const char (&s)[N], object global = globals(), object local = object()) {
/* Support raw string literals by removing common leading whitespace */
auto expr = (s[0] == '\n') ? str(module::import("textwrap").attr("dedent")(s))
: str(s);
return eval<mode>(expr, global, local);
}
inline void exec(str expr, object global = globals(), object local = object()) {
eval<eval_statements>(expr, global, local);
}
template <size_t N>
void exec(const char (&s)[N], object global = globals(), object local = object()) {
eval<eval_statements>(s, global, local);
}
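// Usage sketch (the variable names are illustrative): running statements and evaluating
// an expression against an explicit scope dictionary.
//
//   py::dict scope;
//   scope["x"] = 41;
//   py::exec("y = x + 1", scope);                      // statements, returns none
//   int y = py::eval("y * 2", scope).cast<int>();      // expression -> 84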
template <eval_mode mode = eval_statements>
object eval_file(str fname, object global = globals(), object local = object()) {
if (!local)
local = global;
int start;
switch (mode) {
case eval_expr: start = Py_eval_input; break;
case eval_single_statement: start = Py_single_input; break;
case eval_statements: start = Py_file_input; break;
default: pybind11_fail("invalid evaluation mode");
}
int closeFile = 1;
std::string fname_str = (std::string) fname;
#if PY_VERSION_HEX >= 0x03040000
FILE *f = _Py_fopen_obj(fname.ptr(), "r");
#elif PY_VERSION_HEX >= 0x03000000
FILE *f = _Py_fopen(fname.ptr(), "r");
#else
/* No unicode support in open() :( */
auto fobj = reinterpret_steal<object>(PyFile_FromString(
const_cast<char *>(fname_str.c_str()),
const_cast<char*>("r")));
FILE *f = nullptr;
if (fobj)
f = PyFile_AsFile(fobj.ptr());
closeFile = 0;
#endif
if (!f) {
PyErr_Clear();
pybind11_fail("File \"" + fname_str + "\" could not be opened!");
}
#if PY_VERSION_HEX < 0x03000000 && defined(PYPY_VERSION)
PyObject *result = PyRun_File(f, fname_str.c_str(), start, global.ptr(),
local.ptr());
(void) closeFile;
#else
PyObject *result = PyRun_FileEx(f, fname_str.c_str(), start, global.ptr(),
local.ptr(), closeFile);
#endif
if (!result)
throw error_already_set();
return reinterpret_steal<object>(result);
}
NAMESPACE_END(PYBIND11_NAMESPACE)

View File

@ -0,0 +1,101 @@
/*
pybind11/functional.h: std::function<> support
Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/
#pragma once
#include "pybind11.h"
#include <functional>
NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
NAMESPACE_BEGIN(detail)
template <typename Return, typename... Args>
struct type_caster<std::function<Return(Args...)>> {
using type = std::function<Return(Args...)>;
using retval_type = conditional_t<std::is_same<Return, void>::value, void_type, Return>;
using function_type = Return (*) (Args...);
public:
bool load(handle src, bool convert) {
if (src.is_none()) {
// Defer accepting None to other overloads (if we aren't in convert mode):
if (!convert) return false;
return true;
}
if (!isinstance<function>(src))
return false;
auto func = reinterpret_borrow<function>(src);
/*
When passing a C++ function as an argument to another C++
function via Python, every function call would normally involve
a full C++ -> Python -> C++ roundtrip, which can be prohibitive.
Here, we try to at least detect the case where the function is
stateless (i.e. function pointer or lambda function without
captured variables), in which case the roundtrip can be avoided.
*/
if (auto cfunc = func.cpp_function()) {
auto c = reinterpret_borrow<capsule>(PyCFunction_GET_SELF(cfunc.ptr()));
auto rec = (function_record *) c;
if (rec && rec->is_stateless &&
same_type(typeid(function_type), *reinterpret_cast<const std::type_info *>(rec->data[1]))) {
struct capture { function_type f; };
value = ((capture *) &rec->data)->f;
return true;
}
}
// ensure GIL is held during functor destruction
struct func_handle {
function f;
func_handle(function&& f_) : f(std::move(f_)) {}
func_handle(const func_handle&) = default;
~func_handle() {
gil_scoped_acquire acq;
function kill_f(std::move(f));
}
};
// to emulate 'move initialization capture' in C++11
struct func_wrapper {
func_handle hfunc;
func_wrapper(func_handle&& hf): hfunc(std::move(hf)) {}
Return operator()(Args... args) const {
gil_scoped_acquire acq;
object retval(hfunc.f(std::forward<Args>(args)...));
/* Visual studio 2015 parser issue: need parentheses around this expression */
return (retval.template cast<Return>());
}
};
value = func_wrapper(func_handle(std::move(func)));
return true;
}
template <typename Func>
static handle cast(Func &&f_, return_value_policy policy, handle /* parent */) {
if (!f_)
return none().inc_ref();
auto result = f_.template target<function_type>();
if (result)
return cpp_function(*result, policy).release();
else
return cpp_function(std::forward<Func>(f_), policy).release();
}
PYBIND11_TYPE_CASTER(type, _("Callable[[") + concat(make_caster<Args>::name...) + _("], ")
+ make_caster<retval_type>::name + _("]"));
};
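// Usage sketch (assumes a module object `m`): a bound function can accept any Python
// callable through std::function and call it like a normal functor.
//
//   m.def("apply_twice", [](const std::function<int(int)> &f, int x) {
//       return f(f(x));
//   });
//
//   # Python:  apply_twice(lambda v: v + 3, 10)  ->  16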
NAMESPACE_END(detail)
NAMESPACE_END(PYBIND11_NAMESPACE)

View File

@ -0,0 +1,209 @@
/*
pybind11/iostream.h -- Tools to assist with redirecting cout and cerr to Python
Copyright (c) 2017 Henry F. Schreiner
All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/
#pragma once
#include "pybind11.h"
#include <streambuf>
#include <ostream>
#include <string>
#include <memory>
#include <iostream>
NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
NAMESPACE_BEGIN(detail)
// Buffer that writes to Python instead of C++
class pythonbuf : public std::streambuf {
private:
using traits_type = std::streambuf::traits_type;
const size_t buf_size;
std::unique_ptr<char[]> d_buffer;
object pywrite;
object pyflush;
int overflow(int c) {
if (!traits_type::eq_int_type(c, traits_type::eof())) {
*pptr() = traits_type::to_char_type(c);
pbump(1);
}
return sync() == 0 ? traits_type::not_eof(c) : traits_type::eof();
}
int sync() {
if (pbase() != pptr()) {
// This subtraction cannot be negative, so dropping the sign
str line(pbase(), static_cast<size_t>(pptr() - pbase()));
{
gil_scoped_acquire tmp;
pywrite(line);
pyflush();
}
setp(pbase(), epptr());
}
return 0;
}
public:
pythonbuf(object pyostream, size_t buffer_size = 1024)
: buf_size(buffer_size),
d_buffer(new char[buf_size]),
pywrite(pyostream.attr("write")),
pyflush(pyostream.attr("flush")) {
setp(d_buffer.get(), d_buffer.get() + buf_size - 1);
}
pythonbuf(pythonbuf&&) = default;
/// Sync before destroy
~pythonbuf() {
sync();
}
};
NAMESPACE_END(detail)
/** \rst
This is a move-only guard that redirects output.
.. code-block:: cpp
#include <pybind11/iostream.h>
...
{
py::scoped_ostream_redirect output;
std::cout << "Hello, World!"; // Python stdout
} // <-- return std::cout to normal
You can explicitly pass the c++ stream and the python object,
for example to guard stderr instead.
.. code-block:: cpp
{
py::scoped_ostream_redirect output{std::cerr, py::module::import("sys").attr("stderr")};
std::cerr << "Hello, World!";
}
\endrst */
class scoped_ostream_redirect {
protected:
std::streambuf *old;
std::ostream &costream;
detail::pythonbuf buffer;
public:
scoped_ostream_redirect(
std::ostream &costream = std::cout,
object pyostream = module::import("sys").attr("stdout"))
: costream(costream), buffer(pyostream) {
old = costream.rdbuf(&buffer);
}
~scoped_ostream_redirect() {
costream.rdbuf(old);
}
scoped_ostream_redirect(const scoped_ostream_redirect &) = delete;
scoped_ostream_redirect(scoped_ostream_redirect &&other) = default;
scoped_ostream_redirect &operator=(const scoped_ostream_redirect &) = delete;
scoped_ostream_redirect &operator=(scoped_ostream_redirect &&) = delete;
};
/** \rst
Like `scoped_ostream_redirect`, but redirects cerr by default. This class
is provided primarily to make ``py::call_guard`` easier to use.
.. code-block:: cpp
m.def("noisy_func", &noisy_func,
py::call_guard<scoped_ostream_redirect,
scoped_estream_redirect>());
\endrst */
class scoped_estream_redirect : public scoped_ostream_redirect {
public:
scoped_estream_redirect(
std::ostream &costream = std::cerr,
object pyostream = module::import("sys").attr("stderr"))
: scoped_ostream_redirect(costream, pyostream) {}
};
NAMESPACE_BEGIN(detail)
// Class to redirect output as a context manager. C++ backend.
class OstreamRedirect {
bool do_stdout_;
bool do_stderr_;
std::unique_ptr<scoped_ostream_redirect> redirect_stdout;
std::unique_ptr<scoped_estream_redirect> redirect_stderr;
public:
OstreamRedirect(bool do_stdout = true, bool do_stderr = true)
: do_stdout_(do_stdout), do_stderr_(do_stderr) {}
void enter() {
if (do_stdout_)
redirect_stdout.reset(new scoped_ostream_redirect());
if (do_stderr_)
redirect_stderr.reset(new scoped_estream_redirect());
}
void exit() {
redirect_stdout.reset();
redirect_stderr.reset();
}
};
NAMESPACE_END(detail)
/** \rst
This is a helper function to add a C++ redirect context manager to Python
instead of using a C++ guard. To use it, add the following to your binding code:
.. code-block:: cpp
#include <pybind11/iostream.h>
...
py::add_ostream_redirect(m, "ostream_redirect");
You now have a Python context manager that redirects your output:
.. code-block:: python
with m.ostream_redirect():
m.print_to_cout_function()
This manager can optionally be told which streams to operate on:
.. code-block:: python
with m.ostream_redirect(stdout=True, stderr=True):
m.noisy_function_with_error_printing()
\endrst */
inline class_<detail::OstreamRedirect> add_ostream_redirect(module m, std::string name = "ostream_redirect") {
return class_<detail::OstreamRedirect>(m, name.c_str(), module_local())
.def(init<bool,bool>(), arg("stdout")=true, arg("stderr")=true)
.def("__enter__", &detail::OstreamRedirect::enter)
.def("__exit__", [](detail::OstreamRedirect &self_, args) { self_.exit(); });
}
NAMESPACE_END(PYBIND11_NAMESPACE)

File diff suppressed because it is too large

View File

@ -0,0 +1,168 @@
/*
pybind11/operator.h: Metatemplates for operator overloading
Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/
#pragma once
#include "pybind11.h"
#if defined(__clang__) && !defined(__INTEL_COMPILER)
# pragma clang diagnostic ignored "-Wunsequenced" // multiple unsequenced modifications to 'self' (when using def(py::self OP Type()))
#elif defined(_MSC_VER)
# pragma warning(push)
# pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
#endif
NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
NAMESPACE_BEGIN(detail)
/// Enumeration with all supported operator types
enum op_id : int {
op_add, op_sub, op_mul, op_div, op_mod, op_divmod, op_pow, op_lshift,
op_rshift, op_and, op_xor, op_or, op_neg, op_pos, op_abs, op_invert,
op_int, op_long, op_float, op_str, op_cmp, op_gt, op_ge, op_lt, op_le,
op_eq, op_ne, op_iadd, op_isub, op_imul, op_idiv, op_imod, op_ilshift,
op_irshift, op_iand, op_ixor, op_ior, op_complex, op_bool, op_nonzero,
op_repr, op_truediv, op_itruediv, op_hash
};
enum op_type : int {
op_l, /* base type on left */
op_r, /* base type on right */
op_u /* unary operator */
};
struct self_t { };
static const self_t self = self_t();
/// Type for an unused type slot
struct undefined_t { };
/// Don't warn about an unused variable
inline self_t __self() { return self; }
/// base template of operator implementations
template <op_id, op_type, typename B, typename L, typename R> struct op_impl { };
/// Operator implementation generator
template <op_id id, op_type ot, typename L, typename R> struct op_ {
template <typename Class, typename... Extra> void execute(Class &cl, const Extra&... extra) const {
using Base = typename Class::type;
using L_type = conditional_t<std::is_same<L, self_t>::value, Base, L>;
using R_type = conditional_t<std::is_same<R, self_t>::value, Base, R>;
using op = op_impl<id, ot, Base, L_type, R_type>;
cl.def(op::name(), &op::execute, is_operator(), extra...);
#if PY_MAJOR_VERSION < 3
if (id == op_truediv || id == op_itruediv)
cl.def(id == op_itruediv ? "__idiv__" : ot == op_l ? "__div__" : "__rdiv__",
&op::execute, is_operator(), extra...);
#endif
}
template <typename Class, typename... Extra> void execute_cast(Class &cl, const Extra&... extra) const {
using Base = typename Class::type;
using L_type = conditional_t<std::is_same<L, self_t>::value, Base, L>;
using R_type = conditional_t<std::is_same<R, self_t>::value, Base, R>;
using op = op_impl<id, ot, Base, L_type, R_type>;
cl.def(op::name(), &op::execute_cast, is_operator(), extra...);
#if PY_MAJOR_VERSION < 3
if (id == op_truediv || id == op_itruediv)
cl.def(id == op_itruediv ? "__idiv__" : ot == op_l ? "__div__" : "__rdiv__",
&op::execute, is_operator(), extra...);
#endif
}
};
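// Usage sketch (the Vec2 class is illustrative and must define the matching C++ operators;
// assumes a module object `m`): py::self expressions expand, via the op_/op_impl machinery
// in this header, into the corresponding Python dunder methods.
//
//   py::class_<Vec2>(m, "Vec2")
//       .def(py::init<double, double>())
//       .def(py::self + py::self)       // __add__ / __radd__
//       .def(py::self *= double())      // __imul__
//       .def(-py::self)                 // __neg__
//       .def(py::self == py::self);     // __eq__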
#define PYBIND11_BINARY_OPERATOR(id, rid, op, expr) \
template <typename B, typename L, typename R> struct op_impl<op_##id, op_l, B, L, R> { \
static char const* name() { return "__" #id "__"; } \
static auto execute(const L &l, const R &r) -> decltype(expr) { return (expr); } \
static B execute_cast(const L &l, const R &r) { return B(expr); } \
}; \
template <typename B, typename L, typename R> struct op_impl<op_##id, op_r, B, L, R> { \
static char const* name() { return "__" #rid "__"; } \
static auto execute(const R &r, const L &l) -> decltype(expr) { return (expr); } \
static B execute_cast(const R &r, const L &l) { return B(expr); } \
}; \
inline op_<op_##id, op_l, self_t, self_t> op(const self_t &, const self_t &) { \
return op_<op_##id, op_l, self_t, self_t>(); \
} \
template <typename T> op_<op_##id, op_l, self_t, T> op(const self_t &, const T &) { \
return op_<op_##id, op_l, self_t, T>(); \
} \
template <typename T> op_<op_##id, op_r, T, self_t> op(const T &, const self_t &) { \
return op_<op_##id, op_r, T, self_t>(); \
}
#define PYBIND11_INPLACE_OPERATOR(id, op, expr) \
template <typename B, typename L, typename R> struct op_impl<op_##id, op_l, B, L, R> { \
static char const* name() { return "__" #id "__"; } \
static auto execute(L &l, const R &r) -> decltype(expr) { return expr; } \
static B execute_cast(L &l, const R &r) { return B(expr); } \
}; \
template <typename T> op_<op_##id, op_l, self_t, T> op(const self_t &, const T &) { \
return op_<op_##id, op_l, self_t, T>(); \
}
#define PYBIND11_UNARY_OPERATOR(id, op, expr) \
template <typename B, typename L> struct op_impl<op_##id, op_u, B, L, undefined_t> { \
static char const* name() { return "__" #id "__"; } \
static auto execute(const L &l) -> decltype(expr) { return expr; } \
static B execute_cast(const L &l) { return B(expr); } \
}; \
inline op_<op_##id, op_u, self_t, undefined_t> op(const self_t &) { \
return op_<op_##id, op_u, self_t, undefined_t>(); \
}
PYBIND11_BINARY_OPERATOR(sub, rsub, operator-, l - r)
PYBIND11_BINARY_OPERATOR(add, radd, operator+, l + r)
PYBIND11_BINARY_OPERATOR(mul, rmul, operator*, l * r)
PYBIND11_BINARY_OPERATOR(truediv, rtruediv, operator/, l / r)
PYBIND11_BINARY_OPERATOR(mod, rmod, operator%, l % r)
PYBIND11_BINARY_OPERATOR(lshift, rlshift, operator<<, l << r)
PYBIND11_BINARY_OPERATOR(rshift, rrshift, operator>>, l >> r)
PYBIND11_BINARY_OPERATOR(and, rand, operator&, l & r)
PYBIND11_BINARY_OPERATOR(xor, rxor, operator^, l ^ r)
PYBIND11_BINARY_OPERATOR(eq, eq, operator==, l == r)
PYBIND11_BINARY_OPERATOR(ne, ne, operator!=, l != r)
PYBIND11_BINARY_OPERATOR(or, ror, operator|, l | r)
PYBIND11_BINARY_OPERATOR(gt, lt, operator>, l > r)
PYBIND11_BINARY_OPERATOR(ge, le, operator>=, l >= r)
PYBIND11_BINARY_OPERATOR(lt, gt, operator<, l < r)
PYBIND11_BINARY_OPERATOR(le, ge, operator<=, l <= r)
//PYBIND11_BINARY_OPERATOR(pow, rpow, pow, std::pow(l, r))
PYBIND11_INPLACE_OPERATOR(iadd, operator+=, l += r)
PYBIND11_INPLACE_OPERATOR(isub, operator-=, l -= r)
PYBIND11_INPLACE_OPERATOR(imul, operator*=, l *= r)
PYBIND11_INPLACE_OPERATOR(itruediv, operator/=, l /= r)
PYBIND11_INPLACE_OPERATOR(imod, operator%=, l %= r)
PYBIND11_INPLACE_OPERATOR(ilshift, operator<<=, l <<= r)
PYBIND11_INPLACE_OPERATOR(irshift, operator>>=, l >>= r)
PYBIND11_INPLACE_OPERATOR(iand, operator&=, l &= r)
PYBIND11_INPLACE_OPERATOR(ixor, operator^=, l ^= r)
PYBIND11_INPLACE_OPERATOR(ior, operator|=, l |= r)
PYBIND11_UNARY_OPERATOR(neg, operator-, -l)
PYBIND11_UNARY_OPERATOR(pos, operator+, +l)
PYBIND11_UNARY_OPERATOR(abs, abs, std::abs(l))
PYBIND11_UNARY_OPERATOR(hash, hash, std::hash<L>()(l))
PYBIND11_UNARY_OPERATOR(invert, operator~, (~l))
PYBIND11_UNARY_OPERATOR(bool, operator!, !!l)
PYBIND11_UNARY_OPERATOR(int, int_, (int) l)
PYBIND11_UNARY_OPERATOR(float, float_, (double) l)
#undef PYBIND11_BINARY_OPERATOR
#undef PYBIND11_INPLACE_OPERATOR
#undef PYBIND11_UNARY_OPERATOR
NAMESPACE_END(detail)
using detail::self;
NAMESPACE_END(PYBIND11_NAMESPACE)
#if defined(_MSC_VER)
# pragma warning(pop)
#endif

View File

@ -0,0 +1,65 @@
/*
pybind11/options.h: global settings that are configurable at runtime.
Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/
#pragma once
#include "detail/common.h"
NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
class options {
public:
// Default RAII constructor, which leaves settings as they currently are.
options() : previous_state(global_state()) {}
// Class is non-copyable.
options(const options&) = delete;
options& operator=(const options&) = delete;
// Destructor, which restores settings that were in effect before.
~options() {
global_state() = previous_state;
}
// Setter methods (affect the global state):
options& disable_user_defined_docstrings() & { global_state().show_user_defined_docstrings = false; return *this; }
options& enable_user_defined_docstrings() & { global_state().show_user_defined_docstrings = true; return *this; }
options& disable_function_signatures() & { global_state().show_function_signatures = false; return *this; }
options& enable_function_signatures() & { global_state().show_function_signatures = true; return *this; }
// Getter methods (return the global state):
static bool show_user_defined_docstrings() { return global_state().show_user_defined_docstrings; }
static bool show_function_signatures() { return global_state().show_function_signatures; }
// This type is not meant to be allocated on the heap.
void* operator new(size_t) = delete;
private:
struct state {
bool show_user_defined_docstrings = true; //< Include user-supplied texts in docstrings.
bool show_function_signatures = true; //< Include auto-generated function signatures in docstrings.
};
static state &global_state() {
static state instance;
return instance;
}
state previous_state;
};
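// Usage sketch (module name illustrative): the RAII behaviour means the changed settings
// apply only to bindings defined while the options object is alive; defaults are restored
// when it goes out of scope.
//
//   PYBIND11_MODULE(example, m) {
//       py::options options;
//       options.disable_function_signatures();   // keep only the user-supplied docstring text
//       m.def("add", [](int a, int b) { return a + b; }, "Adds two integers.");
//   }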
NAMESPACE_END(PYBIND11_NAMESPACE)

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,386 @@
/*
pybind11/stl.h: Transparent conversion for STL data types
Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/
#pragma once
#include "pybind11.h"
#include <set>
#include <unordered_set>
#include <map>
#include <unordered_map>
#include <iostream>
#include <list>
#include <deque>
#include <valarray>
#if defined(_MSC_VER)
#pragma warning(push)
#pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
#endif
#ifdef __has_include
// std::optional (but including it in c++14 mode isn't allowed)
# if defined(PYBIND11_CPP17) && __has_include(<optional>)
# include <optional>
# define PYBIND11_HAS_OPTIONAL 1
# endif
// std::experimental::optional (but not allowed in c++11 mode)
# if defined(PYBIND11_CPP14) && (__has_include(<experimental/optional>) && \
!__has_include(<optional>))
# include <experimental/optional>
# define PYBIND11_HAS_EXP_OPTIONAL 1
# endif
// std::variant
# if defined(PYBIND11_CPP17) && __has_include(<variant>)
# include <variant>
# define PYBIND11_HAS_VARIANT 1
# endif
#elif defined(_MSC_VER) && defined(PYBIND11_CPP17)
# include <optional>
# include <variant>
# define PYBIND11_HAS_OPTIONAL 1
# define PYBIND11_HAS_VARIANT 1
#endif
NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
NAMESPACE_BEGIN(detail)
/// Extracts a const lvalue reference or rvalue reference for U based on the type of T (e.g. for
/// forwarding a container element). Typically used indirectly via forwarded_type(), below.
template <typename T, typename U>
using forwarded_type = conditional_t<
std::is_lvalue_reference<T>::value, remove_reference_t<U> &, remove_reference_t<U> &&>;
/// Forwards a value U as rvalue or lvalue according to whether T is rvalue or lvalue; typically
/// used for forwarding a container's elements.
template <typename T, typename U>
forwarded_type<T, U> forward_like(U &&u) {
return std::forward<detail::forwarded_type<T, U>>(std::forward<U>(u));
}
template <typename Type, typename Key> struct set_caster {
using type = Type;
using key_conv = make_caster<Key>;
bool load(handle src, bool convert) {
if (!isinstance<pybind11::set>(src))
return false;
auto s = reinterpret_borrow<pybind11::set>(src);
value.clear();
for (auto entry : s) {
key_conv conv;
if (!conv.load(entry, convert))
return false;
value.insert(cast_op<Key &&>(std::move(conv)));
}
return true;
}
template <typename T>
static handle cast(T &&src, return_value_policy policy, handle parent) {
if (!std::is_lvalue_reference<T>::value)
policy = return_value_policy_override<Key>::policy(policy);
pybind11::set s;
for (auto &&value : src) {
auto value_ = reinterpret_steal<object>(key_conv::cast(forward_like<T>(value), policy, parent));
if (!value_ || !s.add(value_))
return handle();
}
return s.release();
}
PYBIND11_TYPE_CASTER(type, _("Set[") + key_conv::name + _("]"));
};
template <typename Type, typename Key, typename Value> struct map_caster {
using key_conv = make_caster<Key>;
using value_conv = make_caster<Value>;
bool load(handle src, bool convert) {
if (!isinstance<dict>(src))
return false;
auto d = reinterpret_borrow<dict>(src);
value.clear();
for (auto it : d) {
key_conv kconv;
value_conv vconv;
if (!kconv.load(it.first.ptr(), convert) ||
!vconv.load(it.second.ptr(), convert))
return false;
value.emplace(cast_op<Key &&>(std::move(kconv)), cast_op<Value &&>(std::move(vconv)));
}
return true;
}
template <typename T>
static handle cast(T &&src, return_value_policy policy, handle parent) {
dict d;
return_value_policy policy_key = policy;
return_value_policy policy_value = policy;
if (!std::is_lvalue_reference<T>::value) {
policy_key = return_value_policy_override<Key>::policy(policy_key);
policy_value = return_value_policy_override<Value>::policy(policy_value);
}
for (auto &&kv : src) {
auto key = reinterpret_steal<object>(key_conv::cast(forward_like<T>(kv.first), policy_key, parent));
auto value = reinterpret_steal<object>(value_conv::cast(forward_like<T>(kv.second), policy_value, parent));
if (!key || !value)
return handle();
d[key] = value;
}
return d.release();
}
PYBIND11_TYPE_CASTER(Type, _("Dict[") + key_conv::name + _(", ") + value_conv::name + _("]"));
};
template <typename Type, typename Value> struct list_caster {
using value_conv = make_caster<Value>;
bool load(handle src, bool convert) {
if (!isinstance<sequence>(src) || isinstance<str>(src))
return false;
auto s = reinterpret_borrow<sequence>(src);
value.clear();
reserve_maybe(s, &value);
for (auto it : s) {
value_conv conv;
if (!conv.load(it, convert))
return false;
value.push_back(cast_op<Value &&>(std::move(conv)));
}
return true;
}
private:
template <typename T = Type,
enable_if_t<std::is_same<decltype(std::declval<T>().reserve(0)), void>::value, int> = 0>
void reserve_maybe(sequence s, Type *) { value.reserve(s.size()); }
void reserve_maybe(sequence, void *) { }
public:
template <typename T>
static handle cast(T &&src, return_value_policy policy, handle parent) {
if (!std::is_lvalue_reference<T>::value)
policy = return_value_policy_override<Value>::policy(policy);
list l(src.size());
size_t index = 0;
for (auto &&value : src) {
auto value_ = reinterpret_steal<object>(value_conv::cast(forward_like<T>(value), policy, parent));
if (!value_)
return handle();
PyList_SET_ITEM(l.ptr(), (ssize_t) index++, value_.release().ptr()); // steals a reference
}
return l.release();
}
PYBIND11_TYPE_CASTER(Type, _("List[") + value_conv::name + _("]"));
};
template <typename Type, typename Alloc> struct type_caster<std::vector<Type, Alloc>>
: list_caster<std::vector<Type, Alloc>, Type> { };
template <typename Type, typename Alloc> struct type_caster<std::deque<Type, Alloc>>
: list_caster<std::deque<Type, Alloc>, Type> { };
template <typename Type, typename Alloc> struct type_caster<std::list<Type, Alloc>>
: list_caster<std::list<Type, Alloc>, Type> { };
template <typename ArrayType, typename Value, bool Resizable, size_t Size = 0> struct array_caster {
using value_conv = make_caster<Value>;
private:
template <bool R = Resizable>
bool require_size(enable_if_t<R, size_t> size) {
if (value.size() != size)
value.resize(size);
return true;
}
template <bool R = Resizable>
bool require_size(enable_if_t<!R, size_t> size) {
return size == Size;
}
public:
bool load(handle src, bool convert) {
if (!isinstance<sequence>(src))
return false;
auto l = reinterpret_borrow<sequence>(src);
if (!require_size(l.size()))
return false;
size_t ctr = 0;
for (auto it : l) {
value_conv conv;
if (!conv.load(it, convert))
return false;
value[ctr++] = cast_op<Value &&>(std::move(conv));
}
return true;
}
template <typename T>
static handle cast(T &&src, return_value_policy policy, handle parent) {
list l(src.size());
size_t index = 0;
for (auto &&value : src) {
auto value_ = reinterpret_steal<object>(value_conv::cast(forward_like<T>(value), policy, parent));
if (!value_)
return handle();
PyList_SET_ITEM(l.ptr(), (ssize_t) index++, value_.release().ptr()); // steals a reference
}
return l.release();
}
PYBIND11_TYPE_CASTER(ArrayType, _("List[") + value_conv::name + _<Resizable>(_(""), _("[") + _<Size>() + _("]")) + _("]"));
};
template <typename Type, size_t Size> struct type_caster<std::array<Type, Size>>
: array_caster<std::array<Type, Size>, Type, false, Size> { };
template <typename Type> struct type_caster<std::valarray<Type>>
: array_caster<std::valarray<Type>, Type, true> { };
template <typename Key, typename Compare, typename Alloc> struct type_caster<std::set<Key, Compare, Alloc>>
: set_caster<std::set<Key, Compare, Alloc>, Key> { };
template <typename Key, typename Hash, typename Equal, typename Alloc> struct type_caster<std::unordered_set<Key, Hash, Equal, Alloc>>
: set_caster<std::unordered_set<Key, Hash, Equal, Alloc>, Key> { };
template <typename Key, typename Value, typename Compare, typename Alloc> struct type_caster<std::map<Key, Value, Compare, Alloc>>
: map_caster<std::map<Key, Value, Compare, Alloc>, Key, Value> { };
template <typename Key, typename Value, typename Hash, typename Equal, typename Alloc> struct type_caster<std::unordered_map<Key, Value, Hash, Equal, Alloc>>
: map_caster<std::unordered_map<Key, Value, Hash, Equal, Alloc>, Key, Value> { };
// This type caster is intended to be used for std::optional and std::experimental::optional
template<typename T> struct optional_caster {
using value_conv = make_caster<typename T::value_type>;
template <typename T_>
static handle cast(T_ &&src, return_value_policy policy, handle parent) {
if (!src)
return none().inc_ref();
policy = return_value_policy_override<typename T::value_type>::policy(policy);
return value_conv::cast(*std::forward<T_>(src), policy, parent);
}
bool load(handle src, bool convert) {
if (!src) {
return false;
} else if (src.is_none()) {
return true; // default-constructed value is already empty
}
value_conv inner_caster;
if (!inner_caster.load(src, convert))
return false;
value.emplace(cast_op<typename T::value_type &&>(std::move(inner_caster)));
return true;
}
PYBIND11_TYPE_CASTER(T, _("Optional[") + value_conv::name + _("]"));
};
#if PYBIND11_HAS_OPTIONAL
template<typename T> struct type_caster<std::optional<T>>
: public optional_caster<std::optional<T>> {};
template<> struct type_caster<std::nullopt_t>
: public void_caster<std::nullopt_t> {};
#endif
#if PYBIND11_HAS_EXP_OPTIONAL
template<typename T> struct type_caster<std::experimental::optional<T>>
: public optional_caster<std::experimental::optional<T>> {};
template<> struct type_caster<std::experimental::nullopt_t>
: public void_caster<std::experimental::nullopt_t> {};
#endif
/// Visit a variant and cast any found type to Python
struct variant_caster_visitor {
return_value_policy policy;
handle parent;
using result_type = handle; // required by boost::variant in C++11
template <typename T>
result_type operator()(T &&src) const {
return make_caster<T>::cast(std::forward<T>(src), policy, parent);
}
};
/// Helper class which abstracts away variant's `visit` function. `std::variant` and similar
/// `namespace::variant` types which provide a `namespace::visit()` function are handled here
/// automatically using argument-dependent lookup. Users can provide specializations for other
/// variant-like classes, e.g. `boost::variant` and `boost::apply_visitor`.
template <template<typename...> class Variant>
struct visit_helper {
template <typename... Args>
static auto call(Args &&...args) -> decltype(visit(std::forward<Args>(args)...)) {
return visit(std::forward<Args>(args)...);
}
};
/// Generic variant caster
template <typename Variant> struct variant_caster;
template <template<typename...> class V, typename... Ts>
struct variant_caster<V<Ts...>> {
static_assert(sizeof...(Ts) > 0, "Variant must consist of at least one alternative.");
template <typename U, typename... Us>
bool load_alternative(handle src, bool convert, type_list<U, Us...>) {
auto caster = make_caster<U>();
if (caster.load(src, convert)) {
value = cast_op<U>(caster);
return true;
}
return load_alternative(src, convert, type_list<Us...>{});
}
bool load_alternative(handle, bool, type_list<>) { return false; }
bool load(handle src, bool convert) {
// Do a first pass without conversions to improve constructor resolution.
// E.g. `py::int_(1).cast<variant<double, int>>()` needs to fill the `int`
// slot of the variant. Without two-pass loading `double` would be filled
// because it appears first and a conversion is possible.
if (convert && load_alternative(src, false, type_list<Ts...>{}))
return true;
return load_alternative(src, convert, type_list<Ts...>{});
}
template <typename Variant>
static handle cast(Variant &&src, return_value_policy policy, handle parent) {
return visit_helper<V>::call(variant_caster_visitor{policy, parent},
std::forward<Variant>(src));
}
using Type = V<Ts...>;
PYBIND11_TYPE_CASTER(Type, _("Union[") + detail::concat(make_caster<Ts>::name...) + _("]"));
};
#if PYBIND11_HAS_VARIANT
template <typename... Ts>
struct type_caster<std::variant<Ts...>> : variant_caster<std::variant<Ts...>> { };
#endif
NAMESPACE_END(detail)
inline std::ostream &operator<<(std::ostream &os, const handle &obj) {
os << (std::string) str(obj);
return os;
}
NAMESPACE_END(PYBIND11_NAMESPACE)
#if defined(_MSC_VER)
#pragma warning(pop)
#endif

View File

@ -0,0 +1,656 @@
/*
pybind11/std_bind.h: Binding generators for STL data types
Copyright (c) 2016 Sergey Lyskov and Wenzel Jakob
All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/
#pragma once
#include "detail/common.h"
#include "operators.h"
#include <algorithm>
#include <sstream>
NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
NAMESPACE_BEGIN(detail)
/* SFINAE helper class used by 'is_comparable' */
template <typename T> struct container_traits {
template <typename T2> static std::true_type test_comparable(decltype(std::declval<const T2 &>() == std::declval<const T2 &>())*);
template <typename T2> static std::false_type test_comparable(...);
template <typename T2> static std::true_type test_value(typename T2::value_type *);
template <typename T2> static std::false_type test_value(...);
template <typename T2> static std::true_type test_pair(typename T2::first_type *, typename T2::second_type *);
template <typename T2> static std::false_type test_pair(...);
static constexpr const bool is_comparable = std::is_same<std::true_type, decltype(test_comparable<T>(nullptr))>::value;
static constexpr const bool is_pair = std::is_same<std::true_type, decltype(test_pair<T>(nullptr, nullptr))>::value;
static constexpr const bool is_vector = std::is_same<std::true_type, decltype(test_value<T>(nullptr))>::value;
static constexpr const bool is_element = !is_pair && !is_vector;
};
/* Default: is_comparable -> std::false_type */
template <typename T, typename SFINAE = void>
struct is_comparable : std::false_type { };
/* For non-map data structures, check whether operator== can be instantiated */
template <typename T>
struct is_comparable<
T, enable_if_t<container_traits<T>::is_element &&
container_traits<T>::is_comparable>>
: std::true_type { };
/* For a vector/map data structure, recursively check the value type (which is std::pair for maps) */
template <typename T>
struct is_comparable<T, enable_if_t<container_traits<T>::is_vector>> {
static constexpr const bool value =
is_comparable<typename T::value_type>::value;
};
/* For pairs, recursively check the two data types */
template <typename T>
struct is_comparable<T, enable_if_t<container_traits<T>::is_pair>> {
static constexpr const bool value =
is_comparable<typename T::first_type>::value &&
is_comparable<typename T::second_type>::value;
};
/* Fallback functions */
template <typename, typename, typename... Args> void vector_if_copy_constructible(const Args &...) { }
template <typename, typename, typename... Args> void vector_if_equal_operator(const Args &...) { }
template <typename, typename, typename... Args> void vector_if_insertion_operator(const Args &...) { }
template <typename, typename, typename... Args> void vector_modifiers(const Args &...) { }
template<typename Vector, typename Class_>
void vector_if_copy_constructible(enable_if_t<is_copy_constructible<Vector>::value, Class_> &cl) {
cl.def(init<const Vector &>(), "Copy constructor");
}
template<typename Vector, typename Class_>
void vector_if_equal_operator(enable_if_t<is_comparable<Vector>::value, Class_> &cl) {
using T = typename Vector::value_type;
cl.def(self == self);
cl.def(self != self);
cl.def("count",
[](const Vector &v, const T &x) {
return std::count(v.begin(), v.end(), x);
},
arg("x"),
"Return the number of times ``x`` appears in the list"
);
cl.def("remove", [](Vector &v, const T &x) {
auto p = std::find(v.begin(), v.end(), x);
if (p != v.end())
v.erase(p);
else
throw value_error();
},
arg("x"),
"Remove the first item from the list whose value is x. "
"It is an error if there is no such item."
);
cl.def("__contains__",
[](const Vector &v, const T &x) {
return std::find(v.begin(), v.end(), x) != v.end();
},
arg("x"),
"Return true the container contains ``x``"
);
}
// Vector modifiers -- requires a copyable vector_type:
// (Technically, some of these (pop and __delitem__) don't actually require copyability, but it seems
// silly to allow deletion but not insertion, so include them here too.)
template <typename Vector, typename Class_>
void vector_modifiers(enable_if_t<is_copy_constructible<typename Vector::value_type>::value, Class_> &cl) {
using T = typename Vector::value_type;
using SizeType = typename Vector::size_type;
using DiffType = typename Vector::difference_type;
auto wrap_i = [](DiffType i, SizeType n) {
if (i < 0)
i += n;
if (i < 0 || (SizeType)i >= n)
throw index_error();
return i;
};
cl.def("append",
[](Vector &v, const T &value) { v.push_back(value); },
arg("x"),
"Add an item to the end of the list");
cl.def(init([](iterable it) {
auto v = std::unique_ptr<Vector>(new Vector());
v->reserve(len_hint(it));
for (handle h : it)
v->push_back(h.cast<T>());
return v.release();
}));
cl.def("clear",
[](Vector &v) {
v.clear();
},
"Clear the contents"
);
cl.def("extend",
[](Vector &v, const Vector &src) {
v.insert(v.end(), src.begin(), src.end());
},
arg("L"),
"Extend the list by appending all the items in the given list"
);
cl.def("extend",
[](Vector &v, iterable it) {
const size_t old_size = v.size();
v.reserve(old_size + len_hint(it));
try {
for (handle h : it) {
v.push_back(h.cast<T>());
}
} catch (const cast_error &) {
v.erase(v.begin() + static_cast<typename Vector::difference_type>(old_size), v.end());
try {
v.shrink_to_fit();
} catch (const std::exception &) {
// Do nothing
}
throw;
}
},
arg("L"),
"Extend the list by appending all the items in the given list"
);
cl.def("insert",
[](Vector &v, DiffType i, const T &x) {
// Can't use wrap_i; i == v.size() is OK
if (i < 0)
i += v.size();
if (i < 0 || (SizeType)i > v.size())
throw index_error();
v.insert(v.begin() + i, x);
},
arg("i") , arg("x"),
"Insert an item at a given position."
);
cl.def("pop",
[](Vector &v) {
if (v.empty())
throw index_error();
T t = v.back();
v.pop_back();
return t;
},
"Remove and return the last item"
);
cl.def("pop",
[wrap_i](Vector &v, DiffType i) {
i = wrap_i(i, v.size());
T t = v[(SizeType) i];
v.erase(v.begin() + i);
return t;
},
arg("i"),
"Remove and return the item at index ``i``"
);
cl.def("__setitem__",
[wrap_i](Vector &v, DiffType i, const T &t) {
i = wrap_i(i, v.size());
v[(SizeType)i] = t;
}
);
/// Slicing protocol
cl.def("__getitem__",
[](const Vector &v, slice slice) -> Vector * {
size_t start, stop, step, slicelength;
if (!slice.compute(v.size(), &start, &stop, &step, &slicelength))
throw error_already_set();
Vector *seq = new Vector();
seq->reserve((size_t) slicelength);
for (size_t i=0; i<slicelength; ++i) {
seq->push_back(v[start]);
start += step;
}
return seq;
},
arg("s"),
"Retrieve list elements using a slice object"
);
cl.def("__setitem__",
[](Vector &v, slice slice, const Vector &value) {
size_t start, stop, step, slicelength;
if (!slice.compute(v.size(), &start, &stop, &step, &slicelength))
throw error_already_set();
if (slicelength != value.size())
throw std::runtime_error("Left and right hand size of slice assignment have different sizes!");
for (size_t i=0; i<slicelength; ++i) {
v[start] = value[i];
start += step;
}
},
"Assign list elements using a slice object"
);
cl.def("__delitem__",
[wrap_i](Vector &v, DiffType i) {
i = wrap_i(i, v.size());
v.erase(v.begin() + i);
},
"Delete the list elements at index ``i``"
);
cl.def("__delitem__",
[](Vector &v, slice slice) {
size_t start, stop, step, slicelength;
if (!slice.compute(v.size(), &start, &stop, &step, &slicelength))
throw error_already_set();
if (step == 1 && false) {
v.erase(v.begin() + (DiffType) start, v.begin() + DiffType(start + slicelength));
} else {
for (size_t i = 0; i < slicelength; ++i) {
v.erase(v.begin() + DiffType(start));
start += step - 1;
}
}
},
"Delete list elements using a slice object"
);
}
// If the type has an operator[] that doesn't return a reference (most notably std::vector<bool>),
// we have to access by copying; otherwise we return by reference.
template <typename Vector> using vector_needs_copy = negation<
std::is_same<decltype(std::declval<Vector>()[typename Vector::size_type()]), typename Vector::value_type &>>;
// The usual case: access and iterate by reference
template <typename Vector, typename Class_>
void vector_accessor(enable_if_t<!vector_needs_copy<Vector>::value, Class_> &cl) {
using T = typename Vector::value_type;
using SizeType = typename Vector::size_type;
using DiffType = typename Vector::difference_type;
using ItType = typename Vector::iterator;
auto wrap_i = [](DiffType i, SizeType n) {
if (i < 0)
i += n;
if (i < 0 || (SizeType)i >= n)
throw index_error();
return i;
};
cl.def("__getitem__",
[wrap_i](Vector &v, DiffType i) -> T & {
i = wrap_i(i, v.size());
return v[(SizeType)i];
},
return_value_policy::reference_internal // ref + keepalive
);
cl.def("__iter__",
[](Vector &v) {
return make_iterator<
return_value_policy::reference_internal, ItType, ItType, T&>(
v.begin(), v.end());
},
keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
);
}
// The case for special objects, like std::vector<bool>, that have to be returned-by-copy:
template <typename Vector, typename Class_>
void vector_accessor(enable_if_t<vector_needs_copy<Vector>::value, Class_> &cl) {
using T = typename Vector::value_type;
using SizeType = typename Vector::size_type;
using DiffType = typename Vector::difference_type;
using ItType = typename Vector::iterator;
cl.def("__getitem__",
[](const Vector &v, DiffType i) -> T {
if (i < 0 && (i += v.size()) < 0)
throw index_error();
if ((SizeType)i >= v.size())
throw index_error();
return v[(SizeType)i];
}
);
cl.def("__iter__",
[](Vector &v) {
return make_iterator<
return_value_policy::copy, ItType, ItType, T>(
v.begin(), v.end());
},
keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
);
}
template <typename Vector, typename Class_> auto vector_if_insertion_operator(Class_ &cl, std::string const &name)
-> decltype(std::declval<std::ostream&>() << std::declval<typename Vector::value_type>(), void()) {
using size_type = typename Vector::size_type;
cl.def("__repr__",
[name](Vector &v) {
std::ostringstream s;
s << name << '[';
for (size_type i=0; i < v.size(); ++i) {
s << v[i];
if (i != v.size() - 1)
s << ", ";
}
s << ']';
return s.str();
},
"Return the canonical string representation of this list."
);
}
// Provide the buffer interface for vectors if we have data() and we have a format for it
// GCC seems to have "void std::vector<bool>::data()" - doing SFINAE on the existence of data() is insufficient, we need to check it returns an appropriate pointer
template <typename Vector, typename = void>
struct vector_has_data_and_format : std::false_type {};
template <typename Vector>
struct vector_has_data_and_format<Vector, enable_if_t<std::is_same<decltype(format_descriptor<typename Vector::value_type>::format(), std::declval<Vector>().data()), typename Vector::value_type*>::value>> : std::true_type {};
// Add the buffer interface to a vector
template <typename Vector, typename Class_, typename... Args>
enable_if_t<detail::any_of<std::is_same<Args, buffer_protocol>...>::value>
vector_buffer(Class_& cl) {
using T = typename Vector::value_type;
static_assert(vector_has_data_and_format<Vector>::value, "There is not an appropriate format descriptor for this vector");
// numpy.h declares this for arbitrary types, but it may raise an exception and crash hard at runtime if PYBIND11_NUMPY_DTYPE hasn't been called, so check here
format_descriptor<T>::format();
cl.def_buffer([](Vector& v) -> buffer_info {
return buffer_info(v.data(), static_cast<ssize_t>(sizeof(T)), format_descriptor<T>::format(), 1, {v.size()}, {sizeof(T)});
});
cl.def(init([](buffer buf) {
auto info = buf.request();
if (info.ndim != 1 || info.strides[0] % static_cast<ssize_t>(sizeof(T)))
throw type_error("Only valid 1D buffers can be copied to a vector");
if (!detail::compare_buffer_info<T>::compare(info) || (ssize_t) sizeof(T) != info.itemsize)
throw type_error("Format mismatch (Python: " + info.format + " C++: " + format_descriptor<T>::format() + ")");
auto vec = std::unique_ptr<Vector>(new Vector());
vec->reserve((size_t) info.shape[0]);
T *p = static_cast<T*>(info.ptr);
ssize_t step = info.strides[0] / static_cast<ssize_t>(sizeof(T));
T *end = p + info.shape[0] * step;
for (; p != end; p += step)
vec->push_back(*p);
return vec.release();
}));
return;
}
template <typename Vector, typename Class_, typename... Args>
enable_if_t<!detail::any_of<std::is_same<Args, buffer_protocol>...>::value> vector_buffer(Class_&) {}
NAMESPACE_END(detail)
//
// std::vector
//
template <typename Vector, typename holder_type = std::unique_ptr<Vector>, typename... Args>
class_<Vector, holder_type> bind_vector(handle scope, std::string const &name, Args&&... args) {
using Class_ = class_<Vector, holder_type>;
// If the value_type is unregistered (e.g. a converting type) or is itself registered
// module-local then make the vector binding module-local as well:
using vtype = typename Vector::value_type;
auto vtype_info = detail::get_type_info(typeid(vtype));
bool local = !vtype_info || vtype_info->module_local;
Class_ cl(scope, name.c_str(), pybind11::module_local(local), std::forward<Args>(args)...);
// Declare the buffer interface if a buffer_protocol() is passed in
detail::vector_buffer<Vector, Class_, Args...>(cl);
cl.def(init<>());
// Register copy constructor (if possible)
detail::vector_if_copy_constructible<Vector, Class_>(cl);
// Register comparison-related operators and functions (if possible)
detail::vector_if_equal_operator<Vector, Class_>(cl);
// Register stream insertion operator (if possible)
detail::vector_if_insertion_operator<Vector, Class_>(cl, name);
// Modifiers require copyable vector value type
detail::vector_modifiers<Vector, Class_>(cl);
// Accessor and iterator; return by value if copyable, otherwise we return by ref + keep-alive
detail::vector_accessor<Vector, Class_>(cl);
cl.def("__bool__",
[](const Vector &v) -> bool {
return !v.empty();
},
"Check whether the list is nonempty"
);
cl.def("__len__", &Vector::size);
#if 0
// C++ style functions deprecated, leaving it here as an example
cl.def(init<size_type>());
cl.def("resize",
(void (Vector::*) (size_type count)) & Vector::resize,
"changes the number of elements stored");
cl.def("erase",
[](Vector &v, SizeType i) {
if (i >= v.size())
throw index_error();
v.erase(v.begin() + i);
}, "erases element at index ``i``");
cl.def("empty", &Vector::empty, "checks whether the container is empty");
cl.def("size", &Vector::size, "returns the number of elements");
cl.def("push_back", (void (Vector::*)(const T&)) &Vector::push_back, "adds an element to the end");
cl.def("pop_back", &Vector::pop_back, "removes the last element");
cl.def("max_size", &Vector::max_size, "returns the maximum possible number of elements");
cl.def("reserve", &Vector::reserve, "reserves storage");
cl.def("capacity", &Vector::capacity, "returns the number of elements that can be held in currently allocated storage");
cl.def("shrink_to_fit", &Vector::shrink_to_fit, "reduces memory usage by freeing unused memory");
cl.def("clear", &Vector::clear, "clears the contents");
cl.def("swap", &Vector::swap, "swaps the contents");
cl.def("front", [](Vector &v) {
if (v.size()) return v.front();
else throw index_error();
}, "access the first element");
cl.def("back", [](Vector &v) {
if (v.size()) return v.back();
else throw index_error();
}, "access the last element ");
#endif
return cl;
}
//
// std::map, std::unordered_map
//
NAMESPACE_BEGIN(detail)
/* Fallback functions */
template <typename, typename, typename... Args> void map_if_insertion_operator(const Args &...) { }
template <typename, typename, typename... Args> void map_assignment(const Args &...) { }
// Map assignment when copy-assignable: just copy the value
template <typename Map, typename Class_>
void map_assignment(enable_if_t<is_copy_assignable<typename Map::mapped_type>::value, Class_> &cl) {
using KeyType = typename Map::key_type;
using MappedType = typename Map::mapped_type;
cl.def("__setitem__",
[](Map &m, const KeyType &k, const MappedType &v) {
auto it = m.find(k);
if (it != m.end()) it->second = v;
else m.emplace(k, v);
}
);
}
// Not copy-assignable, but still copy-constructible: we can update the value by erasing and reinserting
template<typename Map, typename Class_>
void map_assignment(enable_if_t<
!is_copy_assignable<typename Map::mapped_type>::value &&
is_copy_constructible<typename Map::mapped_type>::value,
Class_> &cl) {
using KeyType = typename Map::key_type;
using MappedType = typename Map::mapped_type;
cl.def("__setitem__",
[](Map &m, const KeyType &k, const MappedType &v) {
// We can't use m[k] = v; because the value type might not be default constructible
auto r = m.emplace(k, v);
if (!r.second) {
// value type is not copy assignable so the only way to insert it is to erase it first...
m.erase(r.first);
m.emplace(k, v);
}
}
);
}
template <typename Map, typename Class_> auto map_if_insertion_operator(Class_ &cl, std::string const &name)
-> decltype(std::declval<std::ostream&>() << std::declval<typename Map::key_type>() << std::declval<typename Map::mapped_type>(), void()) {
cl.def("__repr__",
[name](Map &m) {
std::ostringstream s;
s << name << '{';
bool f = false;
for (auto const &kv : m) {
if (f)
s << ", ";
s << kv.first << ": " << kv.second;
f = true;
}
s << '}';
return s.str();
},
"Return the canonical string representation of this map."
);
}
NAMESPACE_END(detail)
template <typename Map, typename holder_type = std::unique_ptr<Map>, typename... Args>
class_<Map, holder_type> bind_map(handle scope, const std::string &name, Args&&... args) {
using KeyType = typename Map::key_type;
using MappedType = typename Map::mapped_type;
using Class_ = class_<Map, holder_type>;
// If either type is a non-module-local bound type then make the map binding non-local as well;
// otherwise (e.g. both types are either module-local or converting) the map will be
// module-local.
auto tinfo = detail::get_type_info(typeid(MappedType));
bool local = !tinfo || tinfo->module_local;
if (local) {
tinfo = detail::get_type_info(typeid(KeyType));
local = !tinfo || tinfo->module_local;
}
Class_ cl(scope, name.c_str(), pybind11::module_local(local), std::forward<Args>(args)...);
cl.def(init<>());
// Register stream insertion operator (if possible)
detail::map_if_insertion_operator<Map, Class_>(cl, name);
cl.def("__bool__",
[](const Map &m) -> bool { return !m.empty(); },
"Check whether the map is nonempty"
);
cl.def("__iter__",
[](Map &m) { return make_key_iterator(m.begin(), m.end()); },
keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
);
cl.def("items",
[](Map &m) { return make_iterator(m.begin(), m.end()); },
keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
);
cl.def("__getitem__",
[](Map &m, const KeyType &k) -> MappedType & {
auto it = m.find(k);
if (it == m.end())
throw key_error();
return it->second;
},
return_value_policy::reference_internal // ref + keepalive
);
cl.def("__contains__",
[](Map &m, const KeyType &k) -> bool {
auto it = m.find(k);
if (it == m.end())
return false;
return true;
}
);
// Assignment provided only if the type is copyable
detail::map_assignment<Map, Class_>(cl);
cl.def("__delitem__",
[](Map &m, const KeyType &k) {
auto it = m.find(k);
if (it == m.end())
throw key_error();
m.erase(it);
}
);
cl.def("__len__", &Map::size);
return cl;
}
NAMESPACE_END(PYBIND11_NAMESPACE)

View File

@ -0,0 +1,57 @@
# - Find the Catch test framework or download it (single header)
#
# This is a quick module for internal use. It assumes that Catch is
# REQUIRED and that a minimum version is provided (not EXACT). If
# a suitable version isn't found locally, the single header file
# will be downloaded and placed in the build dir: PROJECT_BINARY_DIR.
#
# This code sets the following variables:
# CATCH_INCLUDE_DIR - path to catch.hpp
# CATCH_VERSION - version number
if(NOT Catch_FIND_VERSION)
message(FATAL_ERROR "A version number must be specified.")
elseif(Catch_FIND_REQUIRED)
message(FATAL_ERROR "This module assumes Catch is not required.")
elseif(Catch_FIND_VERSION_EXACT)
message(FATAL_ERROR "Exact version numbers are not supported, only minimum.")
endif()
# Extract the version number from catch.hpp
function(_get_catch_version)
file(STRINGS "${CATCH_INCLUDE_DIR}/catch.hpp" version_line REGEX "Catch v.*" LIMIT_COUNT 1)
if(version_line MATCHES "Catch v([0-9]+)\\.([0-9]+)\\.([0-9]+)")
set(CATCH_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}" PARENT_SCOPE)
endif()
endfunction()
# Download the single-header version of Catch
function(_download_catch version destination_dir)
message(STATUS "Downloading catch v${version}...")
set(url https://github.com/philsquared/Catch/releases/download/v${version}/catch.hpp)
file(DOWNLOAD ${url} "${destination_dir}/catch.hpp" STATUS status)
list(GET status 0 error)
if(error)
message(FATAL_ERROR "Could not download ${url}")
endif()
set(CATCH_INCLUDE_DIR "${destination_dir}" CACHE INTERNAL "")
endfunction()
# Look for catch locally
find_path(CATCH_INCLUDE_DIR NAMES catch.hpp PATH_SUFFIXES catch)
if(CATCH_INCLUDE_DIR)
_get_catch_version()
endif()
# Download the header if it wasn't found or if it's outdated
if(NOT CATCH_VERSION OR CATCH_VERSION VERSION_LESS ${Catch_FIND_VERSION})
if(DOWNLOAD_CATCH)
_download_catch(${Catch_FIND_VERSION} "${PROJECT_BINARY_DIR}/catch/")
_get_catch_version()
else()
set(CATCH_FOUND FALSE)
return()
endif()
endif()
set(CATCH_FOUND TRUE)

View File

@ -0,0 +1,81 @@
# - Try to find Eigen3 lib
#
# This module supports requiring a minimum version, e.g. you can do
# find_package(Eigen3 3.1.2)
# to require version 3.1.2 or newer of Eigen3.
#
# Once done this will define
#
# EIGEN3_FOUND - system has eigen lib with correct version
# EIGEN3_INCLUDE_DIR - the eigen include directory
# EIGEN3_VERSION - eigen version
# Copyright (c) 2006, 2007 Montel Laurent, <montel@kde.org>
# Copyright (c) 2008, 2009 Gael Guennebaud, <g.gael@free.fr>
# Copyright (c) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
# Redistribution and use is allowed according to the terms of the 2-clause BSD license.
if(NOT Eigen3_FIND_VERSION)
if(NOT Eigen3_FIND_VERSION_MAJOR)
set(Eigen3_FIND_VERSION_MAJOR 2)
endif(NOT Eigen3_FIND_VERSION_MAJOR)
if(NOT Eigen3_FIND_VERSION_MINOR)
set(Eigen3_FIND_VERSION_MINOR 91)
endif(NOT Eigen3_FIND_VERSION_MINOR)
if(NOT Eigen3_FIND_VERSION_PATCH)
set(Eigen3_FIND_VERSION_PATCH 0)
endif(NOT Eigen3_FIND_VERSION_PATCH)
set(Eigen3_FIND_VERSION "${Eigen3_FIND_VERSION_MAJOR}.${Eigen3_FIND_VERSION_MINOR}.${Eigen3_FIND_VERSION_PATCH}")
endif(NOT Eigen3_FIND_VERSION)
macro(_eigen3_check_version)
file(READ "${EIGEN3_INCLUDE_DIR}/Eigen/src/Core/util/Macros.h" _eigen3_version_header)
string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen3_world_version_match "${_eigen3_version_header}")
set(EIGEN3_WORLD_VERSION "${CMAKE_MATCH_1}")
string(REGEX MATCH "define[ \t]+EIGEN_MAJOR_VERSION[ \t]+([0-9]+)" _eigen3_major_version_match "${_eigen3_version_header}")
set(EIGEN3_MAJOR_VERSION "${CMAKE_MATCH_1}")
string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen3_minor_version_match "${_eigen3_version_header}")
set(EIGEN3_MINOR_VERSION "${CMAKE_MATCH_1}")
set(EIGEN3_VERSION ${EIGEN3_WORLD_VERSION}.${EIGEN3_MAJOR_VERSION}.${EIGEN3_MINOR_VERSION})
if(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
set(EIGEN3_VERSION_OK FALSE)
else(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
set(EIGEN3_VERSION_OK TRUE)
endif(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
if(NOT EIGEN3_VERSION_OK)
message(STATUS "Eigen3 version ${EIGEN3_VERSION} found in ${EIGEN3_INCLUDE_DIR}, "
"but at least version ${Eigen3_FIND_VERSION} is required")
endif(NOT EIGEN3_VERSION_OK)
endmacro(_eigen3_check_version)
if (EIGEN3_INCLUDE_DIR)
# in cache already
_eigen3_check_version()
set(EIGEN3_FOUND ${EIGEN3_VERSION_OK})
else (EIGEN3_INCLUDE_DIR)
find_path(EIGEN3_INCLUDE_DIR NAMES signature_of_eigen3_matrix_library
PATHS
${CMAKE_INSTALL_PREFIX}/include
${KDE4_INCLUDE_DIR}
PATH_SUFFIXES eigen3 eigen
)
if(EIGEN3_INCLUDE_DIR)
_eigen3_check_version()
endif(EIGEN3_INCLUDE_DIR)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Eigen3 DEFAULT_MSG EIGEN3_INCLUDE_DIR EIGEN3_VERSION_OK)
mark_as_advanced(EIGEN3_INCLUDE_DIR)
endif(EIGEN3_INCLUDE_DIR)

View File

@ -0,0 +1,202 @@
# - Find python libraries
# This module finds the libraries corresponding to the Python interpreter
# FindPythonInterp provides.
# This code sets the following variables:
#
# PYTHONLIBS_FOUND - have the Python libs been found
# PYTHON_PREFIX - path to the Python installation
# PYTHON_LIBRARIES - path to the python library
# PYTHON_INCLUDE_DIRS - path to where Python.h is found
# PYTHON_MODULE_EXTENSION - lib extension, e.g. '.so' or '.pyd'
# PYTHON_MODULE_PREFIX - lib name prefix: usually an empty string
# PYTHON_SITE_PACKAGES - path to installation site-packages
# PYTHON_IS_DEBUG - whether the Python interpreter is a debug build
#
# Thanks to talljimbo for the patch adding the 'LDVERSION' config
# variable usage.
#=============================================================================
# Copyright 2001-2009 Kitware, Inc.
# Copyright 2012 Continuum Analytics, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# * Neither the names of Kitware, Inc., the Insight Software Consortium,
# nor the names of their contributors may be used to endorse or promote
# products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#=============================================================================
# Checking for the extension makes sure that `LibsNew` was found and not just `Libs`.
if(PYTHONLIBS_FOUND AND PYTHON_MODULE_EXTENSION)
return()
endif()
# Use the Python interpreter to find the libs.
if(PythonLibsNew_FIND_REQUIRED)
find_package(PythonInterp ${PythonLibsNew_FIND_VERSION} REQUIRED)
else()
find_package(PythonInterp ${PythonLibsNew_FIND_VERSION})
endif()
if(NOT PYTHONINTERP_FOUND)
set(PYTHONLIBS_FOUND FALSE)
set(PythonLibsNew_FOUND FALSE)
return()
endif()
# According to http://stackoverflow.com/questions/646518/python-how-to-detect-debug-interpreter
# testing whether sys has the gettotalrefcount function is a reliable, cross-platform
# way to detect a CPython debug interpreter.
#
# The library suffix is from the config var LDVERSION sometimes, otherwise
# VERSION. VERSION will typically be like "2.7" on unix, and "27" on windows.
execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
"from distutils import sysconfig as s;import sys;import struct;
print('.'.join(str(v) for v in sys.version_info));
print(sys.prefix);
print(s.get_python_inc(plat_specific=True));
print(s.get_python_lib(plat_specific=True));
print(s.get_config_var('SO'));
print(hasattr(sys, 'gettotalrefcount')+0);
print(struct.calcsize('@P'));
print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION'));
print(s.get_config_var('LIBDIR') or '');
print(s.get_config_var('MULTIARCH') or '');
"
RESULT_VARIABLE _PYTHON_SUCCESS
OUTPUT_VARIABLE _PYTHON_VALUES
ERROR_VARIABLE _PYTHON_ERROR_VALUE)
if(NOT _PYTHON_SUCCESS MATCHES 0)
if(PythonLibsNew_FIND_REQUIRED)
message(FATAL_ERROR
"Python config failure:\n${_PYTHON_ERROR_VALUE}")
endif()
set(PYTHONLIBS_FOUND FALSE)
set(PythonLibsNew_FOUND FALSE)
return()
endif()
# Convert the process output into a list
if(WIN32)
string(REGEX REPLACE "\\\\" "/" _PYTHON_VALUES ${_PYTHON_VALUES})
endif()
string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES})
string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES})
list(GET _PYTHON_VALUES 0 _PYTHON_VERSION_LIST)
list(GET _PYTHON_VALUES 1 PYTHON_PREFIX)
list(GET _PYTHON_VALUES 2 PYTHON_INCLUDE_DIR)
list(GET _PYTHON_VALUES 3 PYTHON_SITE_PACKAGES)
list(GET _PYTHON_VALUES 4 PYTHON_MODULE_EXTENSION)
list(GET _PYTHON_VALUES 5 PYTHON_IS_DEBUG)
list(GET _PYTHON_VALUES 6 PYTHON_SIZEOF_VOID_P)
list(GET _PYTHON_VALUES 7 PYTHON_LIBRARY_SUFFIX)
list(GET _PYTHON_VALUES 8 PYTHON_LIBDIR)
list(GET _PYTHON_VALUES 9 PYTHON_MULTIARCH)
# Make sure the Python has the same pointer-size as the chosen compiler
# Skip if CMAKE_SIZEOF_VOID_P is not defined
if(CMAKE_SIZEOF_VOID_P AND (NOT "${PYTHON_SIZEOF_VOID_P}" STREQUAL "${CMAKE_SIZEOF_VOID_P}"))
if(PythonLibsNew_FIND_REQUIRED)
math(EXPR _PYTHON_BITS "${PYTHON_SIZEOF_VOID_P} * 8")
math(EXPR _CMAKE_BITS "${CMAKE_SIZEOF_VOID_P} * 8")
message(FATAL_ERROR
"Python config failure: Python is ${_PYTHON_BITS}-bit, "
"chosen compiler is ${_CMAKE_BITS}-bit")
endif()
set(PYTHONLIBS_FOUND FALSE)
set(PythonLibsNew_FOUND FALSE)
return()
endif()
# The built-in FindPython didn't always give the version numbers
string(REGEX REPLACE "\\." ";" _PYTHON_VERSION_LIST ${_PYTHON_VERSION_LIST})
list(GET _PYTHON_VERSION_LIST 0 PYTHON_VERSION_MAJOR)
list(GET _PYTHON_VERSION_LIST 1 PYTHON_VERSION_MINOR)
list(GET _PYTHON_VERSION_LIST 2 PYTHON_VERSION_PATCH)
# Make sure all directory separators are '/'
string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX "${PYTHON_PREFIX}")
string(REGEX REPLACE "\\\\" "/" PYTHON_INCLUDE_DIR "${PYTHON_INCLUDE_DIR}")
string(REGEX REPLACE "\\\\" "/" PYTHON_SITE_PACKAGES "${PYTHON_SITE_PACKAGES}")
if(CMAKE_HOST_WIN32 AND NOT (MINGW AND DEFINED ENV{MSYSTEM}))
set(PYTHON_LIBRARY
"${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
# when run in a venv, PYTHON_PREFIX points to it. But the libraries remain in the
# original python installation. They may be found relative to PYTHON_INCLUDE_DIR.
if(NOT EXISTS "${PYTHON_LIBRARY}")
get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY)
set(PYTHON_LIBRARY
"${_PYTHON_ROOT}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
endif()
# raise an error if the python libs are still not found.
if(NOT EXISTS "${PYTHON_LIBRARY}")
message(FATAL_ERROR "Python libraries not found")
endif()
else()
if(PYTHON_MULTIARCH)
set(_PYTHON_LIBS_SEARCH "${PYTHON_LIBDIR}/${PYTHON_MULTIARCH}" "${PYTHON_LIBDIR}")
else()
set(_PYTHON_LIBS_SEARCH "${PYTHON_LIBDIR}")
endif()
#message(STATUS "Searching for Python libs in ${_PYTHON_LIBS_SEARCH}")
# Probably this needs to be more involved. It would be nice if the config
# information the python interpreter itself gave us were more complete.
find_library(PYTHON_LIBRARY
NAMES "python${PYTHON_LIBRARY_SUFFIX}"
PATHS ${_PYTHON_LIBS_SEARCH}
NO_DEFAULT_PATH)
# If all else fails, just set the name/version and let the linker figure out the path.
if(NOT PYTHON_LIBRARY)
set(PYTHON_LIBRARY python${PYTHON_LIBRARY_SUFFIX})
endif()
endif()
MARK_AS_ADVANCED(
PYTHON_LIBRARY
PYTHON_INCLUDE_DIR
)
# We use PYTHON_INCLUDE_DIR, PYTHON_LIBRARY and PYTHON_DEBUG_LIBRARY for the
# cache entries because they are meant to specify the location of a single
# library. We now set the variables listed by the documentation for this
# module.
SET(PYTHON_INCLUDE_DIRS "${PYTHON_INCLUDE_DIR}")
SET(PYTHON_LIBRARIES "${PYTHON_LIBRARY}")
SET(PYTHON_DEBUG_LIBRARIES "${PYTHON_DEBUG_LIBRARY}")
find_package_message(PYTHON
"Found PythonLibs: ${PYTHON_LIBRARY}"
"${PYTHON_EXECUTABLE}${PYTHON_VERSION}")
set(PYTHONLIBS_FOUND TRUE)
set(PythonLibsNew_FOUND TRUE)

View File

@ -0,0 +1,104 @@
# pybind11Config.cmake
# --------------------
#
# PYBIND11 cmake module.
# This module sets the following variables in your project::
#
# pybind11_FOUND - true if pybind11 and all required components found on the system
# pybind11_VERSION - pybind11 version in format Major.Minor.Release
# pybind11_INCLUDE_DIRS - Directories where pybind11 and python headers are located.
# pybind11_INCLUDE_DIR - Directory where pybind11 headers are located.
# pybind11_DEFINITIONS - Definitions necessary to use pybind11, namely USING_pybind11.
# pybind11_LIBRARIES - compile flags and python libraries (as needed) to link against.
# pybind11_LIBRARY - empty.
# CMAKE_MODULE_PATH - appends location of accompanying FindPythonLibsNew.cmake and
# pybind11Tools.cmake modules.
#
#
# Available components: None
#
#
# Exported targets::
#
# If pybind11 is found, this module defines the following :prop_tgt:`IMPORTED`
# interface library targets::
#
# pybind11::module - for extension modules
# pybind11::embed - for embedding the Python interpreter
#
# Python headers, libraries (as needed by platform), and the C++ standard
# are attached to the target. Set PythonLibsNew variables to influence
# python detection and PYBIND11_CPP_STANDARD (-std=c++11 or -std=c++14) to
# influence standard setting. ::
#
# find_package(pybind11 CONFIG REQUIRED)
# message(STATUS "Found pybind11 v${pybind11_VERSION}: ${pybind11_INCLUDE_DIRS}")
#
# # Create an extension module
# add_library(mylib MODULE main.cpp)
# target_link_libraries(mylib pybind11::module)
#
# # Or embed the Python interpreter into an executable
# add_executable(myexe main.cpp)
# target_link_libraries(myexe pybind11::embed)
#
# Suggested usage::
#
# find_package with version info is not recommended except for release versions. ::
#
# find_package(pybind11 CONFIG)
# find_package(pybind11 2.0 EXACT CONFIG REQUIRED)
#
#
# The following variables can be set to guide the search for this package::
#
# pybind11_DIR - CMake variable, set to directory containing this Config file
# CMAKE_PREFIX_PATH - CMake variable, set to root directory of this package
# PATH - environment variable, set to bin directory of this package
# CMAKE_DISABLE_FIND_PACKAGE_pybind11 - CMake variable, disables
# find_package(pybind11) when not REQUIRED, perhaps to force internal build
@PACKAGE_INIT@
set(PN pybind11)
# location of pybind11/pybind11.h
set(${PN}_INCLUDE_DIR "${PACKAGE_PREFIX_DIR}/@CMAKE_INSTALL_INCLUDEDIR@")
set(${PN}_LIBRARY "")
set(${PN}_DEFINITIONS USING_${PN})
check_required_components(${PN})
# make detectable the FindPythonLibsNew.cmake module
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR})
include(pybind11Tools)
if(NOT (CMAKE_VERSION VERSION_LESS 3.0))
#-----------------------------------------------------------------------------
# Don't include targets if this file is being picked up by another
# project which has already built this as a subproject
#-----------------------------------------------------------------------------
if(NOT TARGET ${PN}::pybind11)
include("${CMAKE_CURRENT_LIST_DIR}/${PN}Targets.cmake")
find_package(PythonLibsNew ${PYBIND11_PYTHON_VERSION} MODULE REQUIRED)
set_property(TARGET ${PN}::pybind11 APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${PYTHON_INCLUDE_DIRS})
set_property(TARGET ${PN}::embed APPEND PROPERTY INTERFACE_LINK_LIBRARIES ${PYTHON_LIBRARIES})
if(WIN32 OR CYGWIN)
set_property(TARGET ${PN}::module APPEND PROPERTY INTERFACE_LINK_LIBRARIES ${PYTHON_LIBRARIES})
endif()
if(CMAKE_VERSION VERSION_LESS 3.3)
set_property(TARGET ${PN}::pybind11 APPEND PROPERTY INTERFACE_COMPILE_OPTIONS "${PYBIND11_CPP_STANDARD}")
else()
set_property(TARGET ${PN}::pybind11 APPEND PROPERTY INTERFACE_COMPILE_OPTIONS $<$<COMPILE_LANGUAGE:CXX>:${PYBIND11_CPP_STANDARD}>)
endif()
get_property(_iid TARGET ${PN}::pybind11 PROPERTY INTERFACE_INCLUDE_DIRECTORIES)
get_property(_ill TARGET ${PN}::module PROPERTY INTERFACE_LINK_LIBRARIES)
set(${PN}_INCLUDE_DIRS ${_iid})
set(${PN}_LIBRARIES ${_ico} ${_ill})
endif()
endif()

View File

@ -0,0 +1,227 @@
# tools/pybind11Tools.cmake -- Build system for the pybind11 modules
#
# Copyright (c) 2015 Wenzel Jakob <wenzel@inf.ethz.ch>
#
# All rights reserved. Use of this source code is governed by a
# BSD-style license that can be found in the LICENSE file.
cmake_minimum_required(VERSION 2.8.12)
# Add a CMake parameter for choosing a desired Python version
if(NOT PYBIND11_PYTHON_VERSION)
set(PYBIND11_PYTHON_VERSION "" CACHE STRING "Python version to use for compiling modules")
endif()
set(Python_ADDITIONAL_VERSIONS 3.9 3.8 3.7 3.6 3.5 3.4)
find_package(PythonLibsNew ${PYBIND11_PYTHON_VERSION} REQUIRED)
include(CheckCXXCompilerFlag)
include(CMakeParseArguments)
if(NOT PYBIND11_CPP_STANDARD AND NOT CMAKE_CXX_STANDARD)
if(NOT MSVC)
check_cxx_compiler_flag("-std=c++14" HAS_CPP14_FLAG)
if (HAS_CPP14_FLAG)
set(PYBIND11_CPP_STANDARD -std=c++14)
else()
check_cxx_compiler_flag("-std=c++11" HAS_CPP11_FLAG)
if (HAS_CPP11_FLAG)
set(PYBIND11_CPP_STANDARD -std=c++11)
else()
message(FATAL_ERROR "Unsupported compiler -- pybind11 requires C++11 support!")
endif()
endif()
elseif(MSVC)
set(PYBIND11_CPP_STANDARD /std:c++14)
endif()
set(PYBIND11_CPP_STANDARD ${PYBIND11_CPP_STANDARD} CACHE STRING
"C++ standard flag, e.g. -std=c++11, -std=c++14, /std:c++14. Defaults to C++14 mode." FORCE)
endif()
# Checks whether the given CXX/linker flags can compile and link a cxx file. cxxflags and
# linkerflags are lists of flags to use. The result variable is a unique variable name for each set
# of flags: the compilation result will be cached base on the result variable. If the flags work,
# sets them in cxxflags_out/linkerflags_out internal cache variables (in addition to ${result}).
function(_pybind11_return_if_cxx_and_linker_flags_work result cxxflags linkerflags cxxflags_out linkerflags_out)
set(CMAKE_REQUIRED_LIBRARIES ${linkerflags})
check_cxx_compiler_flag("${cxxflags}" ${result})
if (${result})
set(${cxxflags_out} "${cxxflags}" CACHE INTERNAL "" FORCE)
set(${linkerflags_out} "${linkerflags}" CACHE INTERNAL "" FORCE)
endif()
endfunction()
# Internal: find the appropriate link time optimization flags for this compiler
function(_pybind11_add_lto_flags target_name prefer_thin_lto)
if (NOT DEFINED PYBIND11_LTO_CXX_FLAGS)
set(PYBIND11_LTO_CXX_FLAGS "" CACHE INTERNAL "")
set(PYBIND11_LTO_LINKER_FLAGS "" CACHE INTERNAL "")
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
set(cxx_append "")
set(linker_append "")
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NOT APPLE)
# Clang Gold plugin does not support -Os; append -O3 to MinSizeRel builds to override it
set(linker_append ";$<$<CONFIG:MinSizeRel>:-O3>")
elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
set(cxx_append ";-fno-fat-lto-objects")
endif()
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND prefer_thin_lto)
_pybind11_return_if_cxx_and_linker_flags_work(HAS_FLTO_THIN
"-flto=thin${cxx_append}" "-flto=thin${linker_append}"
PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS)
endif()
if (NOT HAS_FLTO_THIN)
_pybind11_return_if_cxx_and_linker_flags_work(HAS_FLTO
"-flto${cxx_append}" "-flto${linker_append}"
PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS)
endif()
elseif (CMAKE_CXX_COMPILER_ID MATCHES "Intel")
# Intel equivalent to LTO is called IPO
_pybind11_return_if_cxx_and_linker_flags_work(HAS_INTEL_IPO
"-ipo" "-ipo" PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS)
elseif(MSVC)
# cmake only interprets libraries as linker flags when they start with a - (otherwise it
# converts /LTCG to \LTCG as if it was a Windows path). Luckily MSVC supports passing flags
# with - instead of /, even if it is a bit non-standard:
_pybind11_return_if_cxx_and_linker_flags_work(HAS_MSVC_GL_LTCG
"/GL" "-LTCG" PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS)
endif()
if (PYBIND11_LTO_CXX_FLAGS)
message(STATUS "LTO enabled")
else()
message(STATUS "LTO disabled (not supported by the compiler and/or linker)")
endif()
endif()
# Enable LTO flags if found, except for Debug builds
if (PYBIND11_LTO_CXX_FLAGS)
target_compile_options(${target_name} PRIVATE "$<$<NOT:$<CONFIG:Debug>>:${PYBIND11_LTO_CXX_FLAGS}>")
endif()
if (PYBIND11_LTO_LINKER_FLAGS)
target_link_libraries(${target_name} PRIVATE "$<$<NOT:$<CONFIG:Debug>>:${PYBIND11_LTO_LINKER_FLAGS}>")
endif()
endfunction()
# Build a Python extension module:
# pybind11_add_module(<name> [MODULE | SHARED] [EXCLUDE_FROM_ALL]
# [NO_EXTRAS] [SYSTEM] [THIN_LTO] source1 [source2 ...])
#
function(pybind11_add_module target_name)
set(options MODULE SHARED EXCLUDE_FROM_ALL NO_EXTRAS SYSTEM THIN_LTO)
cmake_parse_arguments(ARG "${options}" "" "" ${ARGN})
if(ARG_MODULE AND ARG_SHARED)
message(FATAL_ERROR "Can't be both MODULE and SHARED")
elseif(ARG_SHARED)
set(lib_type SHARED)
else()
set(lib_type MODULE)
endif()
if(ARG_EXCLUDE_FROM_ALL)
set(exclude_from_all EXCLUDE_FROM_ALL)
endif()
add_library(${target_name} ${lib_type} ${exclude_from_all} ${ARG_UNPARSED_ARGUMENTS})
if(ARG_SYSTEM)
set(inc_isystem SYSTEM)
endif()
target_include_directories(${target_name} ${inc_isystem}
PRIVATE ${PYBIND11_INCLUDE_DIR} # from project CMakeLists.txt
PRIVATE ${pybind11_INCLUDE_DIR} # from pybind11Config
PRIVATE ${PYTHON_INCLUDE_DIRS})
# Python debug libraries expose slightly different objects
# https://docs.python.org/3.6/c-api/intro.html#debugging-builds
# https://stackoverflow.com/questions/39161202/how-to-work-around-missing-pymodule-create2-in-amd64-win-python35-d-lib
if(PYTHON_IS_DEBUG)
target_compile_definitions(${target_name} PRIVATE Py_DEBUG)
endif()
# The prefix and extension are provided by FindPythonLibsNew.cmake
set_target_properties(${target_name} PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}")
set_target_properties(${target_name} PROPERTIES SUFFIX "${PYTHON_MODULE_EXTENSION}")
# -fvisibility=hidden is required to allow multiple modules compiled against
# different pybind versions to work properly, and for some features (e.g.
# py::module_local). We force it on everything inside the `pybind11`
# namespace; also turning it on for a pybind module compilation here avoids
# potential warnings or issues from having mixed hidden/non-hidden types.
set_target_properties(${target_name} PROPERTIES CXX_VISIBILITY_PRESET "hidden")
set_target_properties(${target_name} PROPERTIES CUDA_VISIBILITY_PRESET "hidden")
if(WIN32 OR CYGWIN)
# Link against the Python shared library on Windows
target_link_libraries(${target_name} PRIVATE ${PYTHON_LIBRARIES})
elseif(APPLE)
# It's quite common to have multiple copies of the same Python version
# installed on one's system. E.g.: one copy from the OS and another copy
# that's statically linked into an application like Blender or Maya.
# If we link our plugin library against the OS Python here and import it
# into Blender or Maya later on, this will cause segfaults when multiple
# conflicting Python instances are active at the same time (even when they
# are of the same version).
# Windows is not affected by this issue since it handles DLL imports
# differently. The solution for Linux and Mac OS is simple: we just don't
# link against the Python library. The resulting shared library will have
# missing symbols, but that's perfectly fine -- they will be resolved at
# import time.
target_link_libraries(${target_name} PRIVATE "-undefined dynamic_lookup")
if(ARG_SHARED)
# Suppress CMake >= 3.0 warning for shared libraries
set_target_properties(${target_name} PROPERTIES MACOSX_RPATH ON)
endif()
endif()
# Make sure C++11/14 are enabled
if(CMAKE_VERSION VERSION_LESS 3.3)
target_compile_options(${target_name} PUBLIC ${PYBIND11_CPP_STANDARD})
else()
target_compile_options(${target_name} PUBLIC $<$<COMPILE_LANGUAGE:CXX>:${PYBIND11_CPP_STANDARD}>)
endif()
if(ARG_NO_EXTRAS)
return()
endif()
_pybind11_add_lto_flags(${target_name} ${ARG_THIN_LTO})
if (NOT MSVC AND NOT ${CMAKE_BUILD_TYPE} MATCHES Debug|RelWithDebInfo)
# Strip unnecessary sections of the binary on Linux/Mac OS
if(CMAKE_STRIP)
if(APPLE)
add_custom_command(TARGET ${target_name} POST_BUILD
COMMAND ${CMAKE_STRIP} -x $<TARGET_FILE:${target_name}>)
else()
add_custom_command(TARGET ${target_name} POST_BUILD
COMMAND ${CMAKE_STRIP} $<TARGET_FILE:${target_name}>)
endif()
endif()
endif()
if(MSVC)
# /MP enables multithreaded builds (relevant when there are many files), /bigobj is
# needed for bigger binding projects due to the limit to 64k addressable sections
target_compile_options(${target_name} PRIVATE /bigobj)
if(CMAKE_VERSION VERSION_LESS 3.11)
target_compile_options(${target_name} PRIVATE $<$<NOT:$<CONFIG:Debug>>:/MP>)
else()
# Only set these options for C++ files. This is important so that, for
# instance, projects that include other types of source files like CUDA
# .cu files don't get these options propagated to nvcc since that would
# cause the build to fail.
target_compile_options(${target_name} PRIVATE $<$<NOT:$<CONFIG:Debug>>:$<$<COMPILE_LANGUAGE:CXX>:/MP>>)
endif()
endif()
endfunction()

View File

@ -0,0 +1,356 @@
#include <pybind11/pybind11.h>
#include <pybind11/pytypes.h>
#include <pybind11/stl.h>
#include <pybind11/numpy.h>
#include "cviruntime.h"
#include <runtime/debug.h>
#include <cvikernel/cvikernel.h>
#include "cviruntime_context.h"
namespace py = pybind11;
PYBIND11_DECLARE_HOLDER_TYPE(T, std::shared_ptr<T>);
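// Declare std::shared_ptr as a pybind11 holder type so the
// std::shared_ptr<PythonTensor> objects stored in the inputs/outputs vectors
// below can be handed to Python with shared ownership.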
struct PythonTensor {
PythonTensor(CVI_TENSOR *tensor) {
name = std::string(tensor->name);
qscale = tensor->qscale;
zpoint = tensor->zero_point;
std::vector<size_t> shape;
for (int i = 0; i < (int)tensor->shape.dim_size; i++) {
shape.push_back(tensor->shape.dim[i]);
}
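// Wrap the tensor's existing buffer as a numpy array without copying:
// CVI_NN_TensorPtr() returns the runtime-owned data pointer, and passing
// py::cast(*this) as the base object keeps this PythonTensor alive for as
// long as the numpy array is referenced from Python.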
data = py::array(getDtype(tensor->fmt), shape, (void *)CVI_NN_TensorPtr(tensor),
py::cast(*this));
}
std::string name;
float qscale;
int zpoint;
py::array data;
private:
py::dtype getDtype(CVI_FMT fmt) {
switch (fmt) {
case CVI_FMT_FP32:
return py::dtype("single");
case CVI_FMT_INT8:
return py::dtype("int8");
case CVI_FMT_UINT8:
return py::dtype("uint8");
case CVI_FMT_INT16:
return py::dtype("int16");
case CVI_FMT_UINT16:
return py::dtype("uint16");
case CVI_FMT_INT32:
return py::dtype("int32");
case CVI_FMT_UINT32:
return py::dtype("uint32");
case CVI_FMT_BF16:
// numpy has no bf16 type, so use uint16 instead of bf16.
return py::dtype("uint16");
default:
assert(0);
return py::dtype("uint8"); // unreachable fallback so every path returns a value even when assert() is compiled out
}
}
};
struct PythonCviModel {
PythonCviModel(const std::string &model_file, int program_id, bool output_all_tensors) {
int ret = CVI_NN_RegisterModel(model_file.c_str(), &model);
if (ret != 0) {
assert(0);
}
this->config(program_id, output_all_tensors);
}
~PythonCviModel() { CVI_NN_CleanupModel(model); }
py::object clone() {
auto new_cvimodel = new PythonCviModel();
int ret = CVI_NN_CloneModel(model, &new_cvimodel->model);
if (ret != 0) {
assert(0);
}
return py::cast(new_cvimodel);
}
void config(int program_id, bool output_all_tensors) {
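// OPTION_PROGRAM_INDEX selects which program inside the cvimodel to run;
// OPTION_OUTPUT_ALL_TENSORS (presumably intended for debugging) asks the
// runtime to expose every tensor as an output rather than only the model's
// final outputs.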
CVI_NN_SetConfig(model, OPTION_PROGRAM_INDEX, program_id);
CVI_NN_SetConfig(model, OPTION_OUTPUT_ALL_TENSORS, output_all_tensors);
int32_t ret = CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num,
&output_tensors, &output_num);
if (ret != 0) {
assert(0);
}
for (int i = 0; i < input_num; i++) {
inputs.push_back(std::make_shared<PythonTensor>(&input_tensors[i]));
}
for (int i = 0; i < output_num; i++) {
outputs.push_back(std::make_shared<PythonTensor>(&output_tensors[i]));
}
}
void forward() {
int ret = CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num);
if (ret != 0) {
assert(0);
}
}
std::vector<std::shared_ptr<PythonTensor>> inputs;
std::vector<std::shared_ptr<PythonTensor>> outputs;
private:
PythonCviModel() {}
CVI_MODEL_HANDLE model = nullptr;
int32_t input_num = 0;
int32_t output_num = 0;
CVI_TENSOR *input_tensors = nullptr;
CVI_TENSOR *output_tensors = nullptr;
};
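// Rough Python-side usage of the binding above (the actual module and class
// names are set by the PYBIND11_MODULE definition later in this file;
// "pyruntime" and "Model" below are illustrative only):
//
//   model = pyruntime.Model("sample.cvimodel", 0, False)
//   model.inputs[0].data[:] = preprocessed_input
//   model.forward()
//   result = model.outputs[0].data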
class PyCvkLmTensor {
public:
PyCvkLmTensor() {}
PyCvkLmTensor(cvk_context_t *cvk_ctx, cvk_tl_shape_t shape,
cvk_fmt_t fmt, int eu_align) : cvk_ctx(cvk_ctx), fmt(fmt),
eu_align(eu_align) {
if (!cvk_ctx)
throw std::runtime_error("Expect valid kernel context");
lmTensor =
cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, eu_align);
if (!lmTensor)
throw std::runtime_error("Fail to allocate tensor in local memory");
}
std::vector<int> shapes() {
std::vector<int> shapes = {0, 0, 0, 0};
if (lmTensor) {
shapes[0] = lmTensor->shape.n;
shapes[1] = lmTensor->shape.c;
shapes[2] = lmTensor->shape.h;
shapes[3] = lmTensor->shape.w;
}
return shapes;
}
int address() {
if (lmTensor)
return static_cast<int>(lmTensor->start_address);
return 0;
}
cvk_tl_t *allocated() {
return lmTensor;
}
cvk_fmt_t format() {
if (lmTensor)
return lmTensor->fmt;
return CVK_FMT_I8;
}
private:
cvk_context_t *cvk_ctx = nullptr;
cvk_tl_shape_t shape;
cvk_fmt_t fmt = CVK_FMT_I8;
int eu_align = 0;
cvk_tl_t *lmTensor = nullptr;
};
class PyCviKernelContext {
public:
const uint32_t CMDBUF_SIZE = 512 * 1024;
PyCviKernelContext(const std::string &name) : name(name) {
CVI_RT_Init(&rt_handle);
assert(rt_handle);
submit_handle = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
assert(submit_handle);
cvk_ctx = (cvk_context_t *)submit_handle;
}
~PyCviKernelContext() {
CVI_RT_UnRegisterKernel(cvk_ctx);
CVI_RT_DeInit(rt_handle);
}
cvk_fmt_t getCvkDataFormat(std::string format);
void checkTdmaParameters(py::buffer b, PyCvkLmTensor *lmTensor);
void setupGmTensor(cvk_tg_t &tg, py::buffer_info &info, CVI_RT_MEM mem);
// Kernel Operations
PyCvkLmTensor lmem_alloc_tensor(py::buffer b, int eu_align);
void tdma_g2l_tensor_copy(PyCvkLmTensor *lmTensor, py::buffer b);
void tdma_l2g_tensor_copy(py::buffer b, PyCvkLmTensor *lmTensor);
private:
std::string name;
CVI_RT_HANDLE rt_handle = nullptr;
CVI_RT_KHANDLE submit_handle = nullptr;
cvk_context_t *cvk_ctx = nullptr;
};
cvk_fmt_t PyCviKernelContext::getCvkDataFormat(std::string format) {
// only int8 ("b") numpy buffers are supported for now; other formats fall back to int8
if (format == "b")
return CVK_FMT_I8;
return CVK_FMT_I8;
}
void PyCviKernelContext::checkTdmaParameters(py::buffer b,
PyCvkLmTensor *lmTensor) {
if (!lmTensor)
throw std::runtime_error("Tensor in Local memory not assigned");
if (!lmTensor->allocated())
throw std::runtime_error("Tensor in local memory not allocated yet");
py::buffer_info info = b.request();
if (info.ndim != 4)
throw std::runtime_error("Only support NCHW 4D tensor");
if ((info.shape[0] != lmTensor->shapes()[0]) ||
(info.shape[1] != lmTensor->shapes()[1]) ||
(info.shape[2] != lmTensor->shapes()[2]) ||
(info.shape[3] != lmTensor->shapes()[3]))
throw std::runtime_error("Shape mismatched");
}
void PyCviKernelContext::setupGmTensor(cvk_tg_t &tg, py::buffer_info &info,
CVI_RT_MEM mem) {
memset(&tg, 0, sizeof(tg));
cvk_tg_shape_t tg_shape = {
static_cast<uint32_t>(info.shape[0]),
static_cast<uint32_t>(info.shape[1]),
static_cast<uint32_t>(info.shape[2]),
static_cast<uint32_t>(info.shape[3])};
cvk_ctx->ops->gmem_init_tensor(cvk_ctx, &tg, tg_shape,
getCvkDataFormat(info.format));
tg.start_address = CVI_RT_MemGetPAddr(mem);
}
PyCvkLmTensor PyCviKernelContext::lmem_alloc_tensor(py::buffer b,
int eu_align) {
py::buffer_info info = b.request();
if (info.ndim != 4)
throw std::runtime_error("Local memory only support NCHW");
if (!info.shape[0] || !info.shape[1] || !info.shape[2] || !info.shape[3])
throw std::runtime_error("Shape should not zero");
cvk_tl_shape_t shape = {
static_cast<uint32_t>(info.shape[0]),
static_cast<uint32_t>(info.shape[1]),
static_cast<uint32_t>(info.shape[2]),
static_cast<uint32_t>(info.shape[3])};
cvk_fmt_t fmt = getCvkDataFormat(info.format);
PyCvkLmTensor lmTensor(cvk_ctx, shape, fmt, eu_align);
return lmTensor;
}
void PyCviKernelContext::tdma_g2l_tensor_copy(PyCvkLmTensor *lmTensor,
py::buffer b) {
checkTdmaParameters(b, lmTensor);
if (!lmTensor)
throw std::runtime_error("Tensor in Local memory not assigned");
if (!lmTensor->allocated())
throw std::runtime_error("Tensor in local memory not allocated yet");
py::buffer_info info = b.request();
size_t gm_size = info.shape[0] * info.shape[1] * info.shape[2] *
info.shape[3];
CVI_RT_MEM mem = CVI_RT_MemAlloc(rt_handle, gm_size);
// Copy from system memory to device memory
CVI_RT_MemCopyS2D(rt_handle, mem, static_cast<uint8_t *>(info.ptr));
// Setup global memory
cvk_tg_t tg;
setupGmTensor(tg, info, mem);
cvk_tdma_g2l_tensor_copy_param_t p;
memset(&p, 0, sizeof(p));
p.src = &tg;
p.dst = lmTensor->allocated();
cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, &p);
CVI_RT_Submit(cvk_ctx);
// free the device memory
CVI_RT_MemFree(rt_handle, mem);
}
void PyCviKernelContext::tdma_l2g_tensor_copy(py::buffer b,
PyCvkLmTensor *lmTensor) {
checkTdmaParameters(b, lmTensor);
py::buffer_info info = b.request();
size_t gm_size = info.shape[0] * info.shape[1] * info.shape[2] *
info.shape[3];
CVI_RT_MEM mem = CVI_RT_MemAlloc(rt_handle, gm_size);
// Setup global memory
cvk_tg_t tg;
setupGmTensor(tg, info, mem);
cvk_tdma_l2g_tensor_copy_param_t p;
memset(&p, 0, sizeof(p));
p.src = lmTensor->allocated();
p.dst = &tg;
cvk_ctx->ops->tdma_l2g_tensor_copy(cvk_ctx, &p);
CVI_RT_Submit(cvk_ctx);
// Copy from device memory to system memory
CVI_RT_MemCopyD2S(rt_handle, static_cast<uint8_t *>(info.ptr), mem);
// Free the device memory
CVI_RT_MemFree(rt_handle, mem);
}
PYBIND11_MODULE(pyruntime, m) {
py::class_<PythonTensor, std::shared_ptr<PythonTensor>>(m, "Tensor")
.def_readonly("name", &PythonTensor::name)
.def_readonly("qscale", &PythonTensor::qscale)
.def_readonly("zpoint", &PythonTensor::zpoint)
.def_readwrite("data", &PythonTensor::data);
py::class_<PythonCviModel>(m, "Model")
.def(py::init<const std::string &, int, bool>(),
py::arg("cvimodel"), py::arg("program_id") = 0,
py::arg("output_all_tensors") = false)
.def("forward", &PythonCviModel::forward)
.def_readwrite("inputs", &PythonCviModel::inputs)
.def_readwrite("outputs", &PythonCviModel::outputs);
py::class_<PyCvkLmTensor>(m, "CvkLmTensor")
.def(py::init<>())
.def("shapes", &PyCvkLmTensor::shapes, "Get shape.")
.def("address", &PyCvkLmTensor::address, "Get address.");
py::class_<PyCviKernelContext>(m, "CvkContext")
.def(py::init<const std::string &>())
.def("lmem_alloc_tensor", &PyCviKernelContext::lmem_alloc_tensor,
"Allocate tensor in TPU local memory.")
.def("tdma_g2l_tensor_copy", &PyCviKernelContext::tdma_g2l_tensor_copy,
"Transfer data from host to TPU local memory.")
.def("tdma_l2g_tensor_copy", &PyCviKernelContext::tdma_l2g_tensor_copy,
"Transfer data from TPU local memory to host.");
}
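A minimal usage sketch of the kernel bindings defined above (assumes the pyruntime extension built from this file is importable and a cmodel or TPU runtime is available; the tensor shape is arbitrary):

``` python
import numpy as np
import pyruntime as rt

ctx = rt.CvkContext("demo")                  # CVI_RT_Init + CVI_RT_RegisterKernel
src = (np.arange(1 * 4 * 8 * 8) % 128).astype(np.int8).reshape(1, 4, 8, 8)
lm = ctx.lmem_alloc_tensor(src, 1)           # NCHW int8 tensor in local memory, eu_align=1
ctx.tdma_g2l_tensor_copy(lm, src)            # host -> TPU local memory
dst = np.zeros_like(src)
ctx.tdma_l2g_tensor_copy(dst, lm)            # TPU local memory -> host
assert (src == dst).all()
print("shape:", lm.shapes(), "lmem address:", lm.address())
```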

79
cviruntime/python/test.py Normal file
View File

@ -0,0 +1,79 @@
#!/usr/bin/python3
import argparse
import struct
import pyruntime as rt
import numpy as np
def bf16_to_fp32(d_bf16):
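# reinterpret each uint16 value as the high half of a float32 (bf16 -> fp32)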
s = d_bf16.shape
d_bf16 = d_bf16.flatten()
assert d_bf16.dtype == np.uint16
d_fp32 = np.empty_like(d_bf16, dtype=np.float32)
for i in range(len(d_bf16)):
d_fp32[i] = struct.unpack('<f', struct.pack('<HH', 0, d_bf16[i]))[0]
return d_fp32.reshape(s)
def compare_one_array(a, b):
if a.dtype == np.uint16:
a = bf16_to_fp32(a)
return np.array_equal(a.astype(np.float32).flatten(),
b.astype(np.float32).flatten())
def max_name_sz(tensors):
max_sz = 0
for out in tensors:
sz = len(out.name)
max_sz = max_sz if sz < max_sz else sz
return max_sz
def compare_with_ref(tensors, refs):
result = [0, 0]
style = "{:<" + str(max_name_sz(tensors) + 4) + "}";
print("To compare outputs with refernece npz:")
for out in tensors:
ref = refs[out.name]
same = compare_one_array(out.data, ref)
result[int(same)] += 1
print(" {} [{}]".format(str.format(style, out.name),
"PASS" if same else "FAIL"))
print("{} passed, {} failed, Compare {}!!!"
.format(result[1], result[0], "OK" if result[0] == 0 else "ERROR"))
return result[0] == 0
def quant(x, scale):
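# scale, round half away from zero, then saturate to the int8 range [-128, 127]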
x = x * scale
x = np.trunc(x + np.copysign(.5, x))
x[x > 127.0] = 127.0
x[x < -128.0] = -128.0
return x.astype(np.int8)
def test(input_npz, cvimodel, ref_npz):
# load cvimodel
model = rt.Model(cvimodel, output_all_tensors=True)
# fill data to inputs
data = model.inputs[0].data
qscale = model.inputs[0].qscale
# load input data and quant to int8
input_npz = np.load(input_npz)
input = input_npz[input_npz.files[0]]
print(input.shape)
input = quant(input, qscale)
# fill input data to input tensor of model
data[:] = input.reshape(data.shape)
for out in model.outputs:
print(out.name)
print(out.data.dtype)
print(out.data.shape)
# forward
model.forward()
# compare result with reference
refs = np.load(ref_npz)
compare_with_ref(model.outputs, refs)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Test python runtime API.")
parser.add_argument("--cvimodel", type=str, help="cvimodel")
parser.add_argument("--input", type=str, help="input npz")
parser.add_argument("--reference", type=str, help="reference to output npz")
args = parser.parse_args()
test(args.input, args.cvimodel, args.reference)

View File

@ -0,0 +1,24 @@
cmake_minimum_required(VERSION 2.8.0)
project(cvitek_samples C CXX)
add_subdirectory(utils)
add_subdirectory(runner)
add_subdirectory(classifier)
add_subdirectory(classifier_bf16)
add_subdirectory(classifier_fused_preprocess)
add_subdirectory(classifier_multi_batch)
add_subdirectory(samples_extra)
set(SCRIPT_FILES
run_classifier.sh
run_classifier_bf16.sh
run_classifier_fused_preprocess.sh
run_classifier_multi_batch.sh
)
install(FILES ${SCRIPT_FILES}
PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
DESTINATION .)
install(DIRECTORY data/ DESTINATION data FILES_MATCHING PATTERN "*")

View File

@ -0,0 +1,190 @@
# Samples for CVI TPU SDK
## Catalogue
| name | description |
| --------------------------------- | :------------: |
| classifier | sample without fuse_preprocess and quant to int8 |
| classifier_bf16 | sample without fuse_preprocess and quant to bf16 |
| classifier_fused_preprocess | sample with fuse_preprocess and quant to int8 |
| classifier_multi_batch | sample with multiple batch merged model |
## Sample introduction
### classifier_bf16
When deploying a model, refer to this sample first: convert the model to bf16 and evaluate its accuracy in your business scenario. Preprocessing can be implemented with OpenCV as shown in the sample.
### classifier
Once the bf16 model runs correctly, refer to this sample to see how the model is quantized to int8. Preprocessing is likewise implemented with OpenCV.
### classifier_fused_preprocess
If model preprocessing takes a long time, pass the --fuse_preprocess option when deploying the model so that the TPU performs part of the preprocessing, reducing preprocessing time and memory copies; a sketch of feeding such a model follows.
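A minimal Python sketch with the pyruntime bindings from this repository, assuming a hypothetical fused-preprocess model named `mobilenet_v2_fused.cvimodel`; the exact input dtype and layout depend on the deploy options used:

``` python
import cv2
import pyruntime as rt

# Hypothetical fused-preprocess model: mean/scale/quantization run on the TPU,
# so the input tensor usually takes raw uint8 pixels directly.
model = rt.Model("mobilenet_v2_fused.cvimodel")
inp = model.inputs[0]
img = cv2.resize(cv2.imread("cat.jpg"), (224, 224))           # BGR, HWC, uint8
inp.data[:] = img.transpose(2, 0, 1).reshape(inp.data.shape)  # to NCHW planar
model.forward()
```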
### classifier_multi_batch
Two models can be merged so that they share weights and memory; this also supports running the same model with different batch sizes. Refer to this sample, and see the Python sketch below for selecting a program by index.
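For example, with the pyruntime bindings a program inside a merged cvimodel is selected by its index (file name and indices below are hypothetical):

``` python
import pyruntime as rt

# A merged cvimodel can contain several programs, e.g. batch 1 and batch 4.
model_b1 = rt.Model("mobilenet_v2_bs1_bs4.cvimodel", program_id=0)
model_b4 = rt.Model("mobilenet_v2_bs1_bs4.cvimodel", program_id=1)
print([t.data.shape for t in model_b1.inputs])
print([t.data.shape for t in model_b4.inputs])
```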
## How to Compile image input sample in docker
The following files are required:
* cvitek_tpu_sdk_[cv182x|cv182x_uclibc|cv183x|cv181x_glibc32|cv181x_musl_riscv64].tar.gz
* cvitek_tpu_samples.tar.gz
**64 bit platform**
``` shell
tar zxf cvitek_tpu_sdk_cv183x.tar.gz
export TPU_SDK_PATH=$PWD/cvitek_tpu_sdk
cd cvitek_tpu_sdk && source ./envs_tpu_sdk.sh && cd ..
tar zxf cvitek_tpu_samples.tar.gz
cd cvitek_tpu_samples
mkdir build_soc
cd build_soc
cmake -G Ninja \
-DCMAKE_BUILD_TYPE=RELEASE \
-DCMAKE_C_FLAGS_RELEASE=-O3 -DCMAKE_CXX_FLAGS_RELEASE=-O3 \
-DCMAKE_TOOLCHAIN_FILE=$TPU_SDK_PATH/cmake/toolchain-aarch64-linux.cmake \
-DTPU_SDK_PATH=$TPU_SDK_PATH \
-DOPENCV_PATH=$TPU_SDK_PATH/opencv \
-DCMAKE_INSTALL_PREFIX=../install_samples \
..
cmake --build . --target install
```
**32 bit platform**
``` shell
tar zxf cvitek_tpu_sdk_[cv182x|cv181x_glibc32].tar.gz
export TPU_SDK_PATH=$PWD/cvitek_tpu_sdk
cd cvitek_tpu_sdk && source ./envs_tpu_sdk.sh && cd ..
tar zxf cvitek_tpu_samples.tar.gz
cd cvitek_tpu_samples
mkdir build_soc
cd build_soc
cmake -G Ninja \
-DCMAKE_BUILD_TYPE=RELEASE \
-DCMAKE_C_FLAGS_RELEASE=-O3 -DCMAKE_CXX_FLAGS_RELEASE=-O3 \
-DCMAKE_TOOLCHAIN_FILE=$TPU_SDK_PATH/cmake/toolchain-linux-gnueabihf.cmake \
-DTPU_SDK_PATH=$TPU_SDK_PATH \
-DOPENCV_PATH=$TPU_SDK_PATH/opencv \
-DCMAKE_INSTALL_PREFIX=../install_samples \
..
cmake --build . --target install
```
**uclibc platform**
``` shell
tar zxf cvitek_tpu_sdk_cv182x_uclibc.tar.gz
export TPU_SDK_PATH=$PWD/cvitek_tpu_sdk
cd cvitek_tpu_sdk && source ./envs_tpu_sdk.sh && cd ..
tar zxf cvitek_tpu_samples.tar.gz
cd cvitek_tpu_samples
mkdir build_soc
cd build_soc
cmake -G Ninja \
-DCMAKE_BUILD_TYPE=RELEASE \
-DCMAKE_C_FLAGS_RELEASE=-O3 -DCMAKE_CXX_FLAGS_RELEASE=-O3 \
-DCMAKE_TOOLCHAIN_FILE=$TPU_SDK_PATH/cmake/toolchain-linux-uclibc.cmake \
-DTPU_SDK_PATH=$TPU_SDK_PATH \
-DOPENCV_PATH=$TPU_SDK_PATH/opencv \
-DCMAKE_INSTALL_PREFIX=../install_samples \
..
cmake --build . --target install
```
**cv181x musl riscv64 platform**
``` shell
tar zxf cvitek_tpu_sdk_cv181x_musl_riscv64.tar.gz
export TPU_SDK_PATH=$PWD/cvitek_tpu_sdk
cd cvitek_tpu_sdk && source ./envs_tpu_sdk.sh && cd ..
tar zxf cvitek_tpu_samples.tar.gz
cd cvitek_tpu_samples
mkdir build_soc
cd build_soc
cmake -G Ninja \
-DCMAKE_BUILD_TYPE=RELEASE \
-DCMAKE_C_FLAGS_RELEASE=-O3 -DCMAKE_CXX_FLAGS_RELEASE=-O3 \
-DCMAKE_TOOLCHAIN_FILE=$TPU_SDK_PATH/cmake/toolchain-riscv64-linux-musl-x86_64.cmake \
-DTPU_SDK_PATH=$TPU_SDK_PATH \
-DOPENCV_PATH=$TPU_SDK_PATH/opencv \
-DCMAKE_INSTALL_PREFIX=../install_samples \
..
cmake --build . --target install
```
Finally, copy the install_samples folder to the development board.
## How to Compile vpss input sample in docker
Compared with the image input samples above, just add -DMW_PATH (and the chip option) to the cmake command, as shown below.
The following files are required:
* cvitek_tpu_sdk_[cv182x|cv182x_uclibc|cv183x|cv181x_glibc32|cv181x_musl_riscv64].tar.gz
* cvitek_tpu_samples.tar.gz
* mw.tar.gz
**64 bit platform**
``` shell
mkdir mw_path
tar -zxvf mw.tar.gz -C mw_path
export MW_PATH=$PWD/mw_path
tar zxf cvitek_tpu_sdk_cv183x.tar.gz
export TPU_SDK_PATH=$PWD/cvitek_tpu_sdk
cd cvitek_tpu_sdk && source ./envs_tpu_sdk.sh && cd ..
tar zxf cvitek_tpu_samples.tar.gz
cd cvitek_tpu_samples
mkdir build_soc
cd build_soc
cmake -G Ninja \
-DCMAKE_BUILD_TYPE=RELEASE \
-DCMAKE_C_FLAGS_RELEASE=-O3 -DCMAKE_CXX_FLAGS_RELEASE=-O3 \
-DCMAKE_TOOLCHAIN_FILE=$TPU_SDK_PATH/cmake/toolchain-aarch64-linux.cmake \
-DTPU_SDK_PATH=$TPU_SDK_PATH \
-DOPENCV_PATH=$TPU_SDK_PATH/opencv \
-DMW_PATH=$MW_PATH \
-DCHIP=183x \
-DCMAKE_INSTALL_PREFIX=../install_samples \
..
cmake --build . --target install
```
Finally, copy the install_samples folder to the development board.
**musl platform**
``` shell
mkdir mw_path
tar -zxvf mw.tar.gz -C mw_path
export MW_PATH=$PWD/mw_path
tar zxf cvitek_tpu_sdk_cv181x_musl_riscv64.tar.gz
export TPU_SDK_PATH=$PWD/cvitek_tpu_sdk
cd cvitek_tpu_sdk && source ./envs_tpu_sdk.sh && cd ..
tar zxf cvitek_tpu_samples.tar.gz
cd cvitek_tpu_samples
mkdir build_soc
cd build_soc
cmake -G Ninja \
-DCMAKE_BUILD_TYPE=RELEASE \
-DCMAKE_C_FLAGS_RELEASE=-O3 -DCMAKE_CXX_FLAGS_RELEASE=-O3 \
-DCMAKE_TOOLCHAIN_FILE=$TPU_SDK_PATH/cmake/toolchain-riscv64-linux-musl-x86_64.cmake \
-DTPU_SDK_PATH=$TPU_SDK_PATH \
-DSDK_VER=musl_riscv64 \
-DOPENCV_PATH=$TPU_SDK_PATH/opencv \
-DMW_PATH=$MW_PATH \
-DCHIP=mars \
-DCMAKE_INSTALL_PREFIX=../install_samples \
..
cmake --build . --target install
```
Finally, copy the install_samples folder to the development board.

View File

@ -0,0 +1,40 @@
cmake_minimum_required(VERSION 2.8.0)
project(cvi_sample_classifier C CXX)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON)
if(NOT DEFINED TPU_SDK_PATH)
message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation")
endif()
include_directories(${TPU_SDK_PATH}/include)
link_directories(${TPU_SDK_PATH}/lib)
if(NOT DEFINED OPENCV_PATH)
message(FATAL_ERROR "Please set OPENCV_PATH to point to the opencvn installation")
endif()
include_directories(${OPENCV_PATH}/include)
link_directories(${OPENCV_PATH}/lib)
set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel)
if(NOT CMAKE_CROSSCOMPILING)
set(CVI_LIBS ${CVI_LIBS} cvicmodel)
endif()
set(OPENCV_LIBS ${OPENCV_LIBS} opencv_core opencv_imgcodecs opencv_imgproc)
if(NOT CMAKE_CROSSCOMPILING)
set(OPENCV_LIBS ${OPENCV_LIBS} opencv_highgui)
endif()
set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z)
add_executable(cvi_sample_classifier
classifier.cpp)
target_link_libraries(cvi_sample_classifier
${CVI_LIBS}
${OPENCV_LIBS}
${EXTRA_LIBS})
install(TARGETS cvi_sample_classifier DESTINATION bin)

View File

@ -0,0 +1,117 @@
# Mobilev2 Sample without fuse_preprocess and quantized to int8
Copy the unzipped mobilenet_v2.cvimodel to the EVB board.
### Download and convert the model under docker (optional)
The Mobilenet v2 Caffe model can be cloned from: https://github.com/shicai/MobileNet-Caffe
#### Guide for the new toolchain (tpu-mlir)
The following files are required:
* tpu-mlir_xxxx.tar.gz (The release package of tpu-mlir)
Shell commands to convert the model into a cvimodel:
``` shell
tar zxf tpu-mlir_xxxx.tar.gz
source tpu-mlir_xxxx/envsetup.sh
mkdir workspace && cd workspace
cp $TPUC_ROOT/regression/image/cat.jpg .
cp -rf $TPUC_ROOT/regression/dataset/ILSVRC2012 .
model_transform.py \
--model_name mobilenet_v2 \
--model_def ./mobilenet_v2_deploy.prototxt \
--model_data ./mobilenet_v2.caffemodel \
--test_input ./cat.jpg \
--test_result mobilenet_v2_top_output.npz \
--input_shapes [[1,3,224,224]] \
--resize_dims 256,256 \
--mean 103.94,116.78,123.68 \
--scale 0.017,0.017,0.017 \
--pixel_format "bgr" \
--tolerance 0.99,0.99 \
--excepts prob \
--mlir mobilenet_v2.mlir
run_calibration.py \
mobilenet_v2.mlir \
--dataset=./ILSVRC2012 \
--input_num=100 \
-o mobilenet_v2_calibration_table
model_deploy.py \
--mlir mobilenet_v2.mlir \
--calibration_table mobilenet_v2_calibration_table \
--chip cv183x \
--quantize INT8 \
--quant_input \
--test_input mobilenet_v2_in_f32.npz \
--test_reference mobilenet_v2_top_output.npz \
--excepts prob \
--tolerance 0.9,0.6 \
--model mobilenet_v2.cvimodel
```
#### Guide for the old toolchain (cvitek_mlir)
The following files are required:
* cvitek_mlir_ubuntu-18.04.tar.gz
Shell commands to convert the model into a cvimodel:
``` shell
tar zxf cvitek_mlir_ubuntu-18.04.tar.gz
source cvitek_mlir/cvitek_envs.sh
mkdir workspace && cd workspace
cp $MLIR_PATH/tpuc/regression/data/cat.jpg .
cp -rf $MLIR_PATH/tpuc/regression/data/images .
model_transform.py \
--model_type caffe \
--model_name mobilenet_v2 \
--model_def ./mobilenet_v2_deploy.prototxt \
--model_data ./mobilenet_v2.caffemodel \
--image ./cat.jpg \
--image_resize_dims 256,256 \
--net_input_dims 224,224 \
--mean 103.94,116.78,123.68 \
--input_scale 0.017 \
--model_channel_order "bgr" \
--tolerance 0.99,0.99,0.99 \
--excepts prob \
--mlir mobilenet_v2_fp32.mlir
run_calibration.py \
mobilenet_v2_fp32.mlir \
--dataset=./images \
--input_num=100 \
-o mobilenet_v2_calibration_table
model_deploy.py \
--model_name mobilenet_v2 \
--mlir mobilenet_v2_fp32.mlir \
--calibration_table mobilenet_v2_calibration_table \
--chip cv183x \
--quantize INT8 \
--image cat.jpg \
--excepts prob \
--tolerance 0.9,0.9,0.6 \
--correctness 0.95,0.95,0.9 \
--cvimodel mobilenet_v2.cvimodel
```
Copy the generated mobilenet_v2.cvimodel to the EVB board.
## How To Compile Sample In Docker
See the top-level README.md or the cvitek_tpu_quick_start_guide.md.
## Run Samples In EVB Board
```
cd install_samples
./bin/cvi_sample_classifier \
./mobilenet_v2.cvimodel \
./data/cat.jpg \
./data/synset_words.txt
```
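The same preprocessing and inference flow can also be scripted with the pyruntime Python bindings from this repository. A minimal sketch, assuming OpenCV's Python package and the mobilenet_v2.cvimodel generated above (deployed with --quant_input, so the input tensor is int8):

``` python
import cv2
import numpy as np
import pyruntime as rt

model = rt.Model("./mobilenet_v2.cvimodel")
inp = model.inputs[0]
qscale = inp.qscale                                   # int8 quantization scale of the input

img = cv2.imread("./data/cat.jpg")                    # BGR, HWC, uint8
img = cv2.resize(img, (256, 256))                     # linear interpolation, as in classifier.cpp
img = img[16:240, 16:240, :]                          # center crop to 224x224
img = img.transpose(2, 0, 1).astype(np.float32)       # HWC -> CHW
mean = np.array([103.94, 116.78, 123.68]).reshape(3, 1, 1)
img = (img - mean) * 0.017 * qscale                   # same normalization as classifier.cpp
img = np.clip(np.trunc(img + np.copysign(0.5, img)), -128, 127).astype(np.int8)

inp.data[:] = img.reshape(inp.data.shape)
model.forward()
prob = model.outputs[0].data.flatten()
print("top-5 indices:", np.argsort(prob)[::-1][:5])
```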

View File

@ -0,0 +1,127 @@
#include <stdio.h>
#include <string.h>   // memcpy
#include <assert.h>
#include <fstream>
#include <string>
#include <vector>
#include <numeric>    // std::iota
#include <algorithm>  // std::sort
#include <cviruntime.h>
#include <opencv2/opencv.hpp>
#define IMG_RESIZE_DIMS 256,256
#define BGR_MEAN 103.94,116.78,123.68
#define INPUT_SCALE 0.017
static void usage(char **argv) {
printf("Usage:\n");
printf(" %s cvimodel image.jpg label_file\n", argv[0]);
}
int main(int argc, char **argv) {
if (argc != 4) {
usage(argv);
exit(-1);
}
// load model file
const char *model_file = argv[1];
CVI_MODEL_HANDLE model = nullptr;
int ret = CVI_NN_RegisterModel(model_file, &model);
if (CVI_RC_SUCCESS != ret) {
printf("CVI_NN_RegisterModel failed, err %d\n", ret);
exit(1);
}
printf("CVI_NN_RegisterModel succeeded\n");
// get input output tensors
CVI_TENSOR *input_tensors;
CVI_TENSOR *output_tensors;
int32_t input_num;
int32_t output_num;
CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num, &output_tensors,
&output_num);
CVI_TENSOR *input = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, input_tensors, input_num);
assert(input);
printf("input, name:%s\n", input->name);
CVI_TENSOR *output = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, output_tensors, output_num);
assert(output);
float qscale = CVI_NN_TensorQuantScale(input);
printf("qscale:%f\n", qscale);
CVI_SHAPE shape = CVI_NN_TensorShape(input);
// nchw
int32_t height = shape.dim[2];
int32_t width = shape.dim[3];
// imread
cv::Mat image;
image = cv::imread(argv[2]);
if (!image.data) {
printf("Could not open or find the image\n");
return -1;
}
// resize
cv::resize(image, image, cv::Size(IMG_RESIZE_DIMS)); // linear is default
// crop
cv::Size size = cv::Size(width, height); // cv::Size takes (width, height); both are 224 here
cv::Rect crop(cv::Point(0.5 * (image.cols - size.width),
0.5 * (image.rows - size.height)), size);
image = image(crop);
// split
cv::Mat channels[3];
for (int i = 0; i < 3; i++) {
channels[i] = cv::Mat(height, width, CV_8SC1);
}
cv::split(image, channels);
// normalize: (pixel - mean) * INPUT_SCALE * qscale, implemented via convertTo(alpha, beta)
float mean[] = {BGR_MEAN};
for (int i = 0; i < 3; ++i) {
channels[i].convertTo(channels[i], CV_8SC1, INPUT_SCALE * qscale,
-1 * mean[i] * INPUT_SCALE * qscale);
}
// fill to input tensor
int8_t *ptr = (int8_t *)CVI_NN_TensorPtr(input);
int channel_size = height * width;
for (int i = 0; i < 3; ++i) {
memcpy(ptr + i * channel_size, channels[i].data, channel_size);
}
// run inference
CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num);
printf("CVI_NN_Forward succeeded\n");
// output result
std::vector<std::string> labels;
std::ifstream file(argv[3]);
if (!file) {
printf("Didn't find synset_words file\n");
exit(1);
} else {
std::string line;
while (std::getline(file, line)) {
labels.push_back(std::string(line));
}
}
int32_t top_num = 5;
float *prob = (float *)CVI_NN_TensorPtr(output);
int32_t count = CVI_NN_TensorCount(output);
// find top-k prob and cls
std::vector<size_t> idx(count);
std::iota(idx.begin(), idx.end(), 0);
std::sort(idx.begin(), idx.end(), [&prob](size_t idx_0, size_t idx_1) {return prob[idx_0] > prob[idx_1];});
// show results.
printf("------\n");
for (int32_t i = 0; i < top_num; i++) {
int top_k_idx = idx[i];
printf(" %f, idx %d", prob[top_k_idx], top_k_idx);
if (!labels.empty())
printf(", %s", labels[top_k_idx].c_str());
printf("\n");
}
printf("------\n");
CVI_NN_CleanupModel(model);
printf("CVI_NN_CleanupModel succeeded\n");
return 0;
}

Some files were not shown because too many files have changed in this diff.