add cvikernel
commit 9f1f57a19c3c281a931dfc71b318494487193d56 Author: sophgo-forum-service <forum_service@sophgo.com> Date: Mon May 13 13:58:23 2024 +0800 [feat] cvikernel opensource for cv18xx soc. - 79b6a7, set lookup_interp_table layer_id.
This commit is contained in:
@ -17,3 +17,4 @@
|
||||
| FreeRTOS-Kernel | freertos/Source | https://github.com/sophgo/FreeRTOS-Kernel.git | sg200x-dev | d52c1b6e6 |
|
||||
| Lab-Project-FreeRTOS-POSIX | freertos/Source/FreeRTOS-Plus-POSIX | https://github.com/sophgo/Lab-Project-FreeRTOS-POSIX.git | sg200x-dev | 5042bfd |
|
||||
| cvibuilder | cvibuilder | https://github.com/sophgo/cvibuilder.git | sg200x-dev | 4309f2a |
|
||||
| cvikernel | cvikernel | https://github.com/sophgo/cvikernel.git | sg200x-dev | 9f1f57a |
|
||||
|
||||
1
cvikernel/.gitignore
vendored
Normal file
1
cvikernel/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
build/
|
||||
128
cvikernel/CMakeLists.txt
Normal file
128
cvikernel/CMakeLists.txt
Normal file
@ -0,0 +1,128 @@
|
||||
# cvikernel build script.
#
# Selects the per-chip TPU backend sources via the CHIP cache variable
# (cv181x | cv180x | cv183x | cv182x; any other value builds the PC cmodel
# with every backend), builds shared + static libraries, and installs the
# public headers.
cmake_minimum_required(VERSION 3.12)

project(cvikernel C CXX)

set(CMAKE_C_STANDARD 99)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# Warnings-as-errors stays ON by default to preserve the original build
# behaviour, but can now be switched off for toolchains with extra warnings.
option(CVIKERNEL_WERROR "Treat compiler warnings as errors" ON)
set(safety_flags -Wall -Wextra -fno-strict-aliasing)
if(CVIKERNEL_WERROR)
  list(APPEND safety_flags -Werror)
endif()

# --- public headers --------------------------------------------------------
install(FILES
  include/bmkernel/bm_kernel.h
  include/bmkernel/bm_kernel_legacy.h
  include/bmkernel/bm_regcpu.h
  include/bmkernel/reg_tiu.h
  include/bmkernel/reg_tdma.h
  DESTINATION include/bmkernel)
install(FILES
  include/bmkernel/bm1880v2/bmkernel_1880v2.h
  include/bmkernel/bm1880v2/non_atomic.h
  include/bmkernel/bm1880v2/1880v2_fp_convert.h
  include/bmkernel/bm1880v2/bm_vlc_compress.h
  include/bmkernel/bm1880v2/compression.h
  include/bmkernel/bm1880v2/bm1880v2_tiu_reg.h
  include/bmkernel/bm1880v2/bm1880v2_tdma_reg.h
  include/bmkernel/bm1880v2/bm1880v2_tpu_cfg.h
  DESTINATION include/bmkernel/bm1880v2)
install(FILES
  include/bmkernel/bm1822/bmkernel_1822.h
  include/bmkernel/bm1822/1822_fp_convert.h
  include/bmkernel/bm1822/bm_vlc_compress.h
  include/bmkernel/bm1822/compression.h
  include/bmkernel/bm1822/bm1822_tiu_reg.h
  include/bmkernel/bm1822/bm1822_tdma_reg.h
  include/bmkernel/bm1822/bm1822_tpu_cfg.h
  DESTINATION include/bmkernel/bm1822)
install(FILES
  include/cvikernel/cv181x/cv181x_tiu_reg.h
  include/cvikernel/cv181x/cv181x_tdma_reg.h
  include/cvikernel/cv181x/cv181x_tpu_cfg.h
  DESTINATION include/cvikernel/cv181x)
install(FILES
  include/cvikernel/cv180x/cv180x_tiu_reg.h
  include/cvikernel/cv180x/cv180x_tdma_reg.h
  include/cvikernel/cv180x/cv180x_tpu_cfg.h
  DESTINATION include/cvikernel/cv180x)
install(FILES
  include/cvikernel/cvikernel.h
  include/cvikernel/cvk_fp_convert.h
  include/cvikernel/cvk_vlc_compress.h
  DESTINATION include/cvikernel)

enable_testing()

# --- sources ---------------------------------------------------------------
# Globbing is retained because the backend directory layout is the contract
# here; CONFIGURE_DEPENDS (3.12+) makes CMake re-scan the directories on each
# build so newly added files are picked up.
file(GLOB COMMON_SOURCES CONFIGURE_DEPENDS "src/*.c")
file(GLOB_RECURSE BM1822_SOURCES CONFIGURE_DEPENDS "src/bm1822/*.c")
file(GLOB_RECURSE BM1880v2_SOURCES CONFIGURE_DEPENDS "src/bm1880v2/*.c")
file(GLOB_RECURSE CV1822_SOURCES CONFIGURE_DEPENDS "src/cv1822/*.c")
file(GLOB_RECURSE CV1880v2_SOURCES CONFIGURE_DEPENDS "src/cv1880v2/*.c")
file(GLOB_RECURSE CV181X_SOURCES CONFIGURE_DEPENDS "src/cv181x/*.c")
file(GLOB_RECURSE CV180X_SOURCES CONFIGURE_DEPENDS "src/cv180x/*.c")

set(_SOURCES ${COMMON_SOURCES} ${BM1822_SOURCES} ${CV1822_SOURCES})

# Per-chip backend selection; CHIPID is consumed by the C sources.
if("${CHIP}" STREQUAL "cv181x")
  list(APPEND _SOURCES ${CV181X_SOURCES})
  set(chip_id 0x3)
elseif("${CHIP}" STREQUAL "cv180x")
  list(APPEND _SOURCES ${CV180X_SOURCES})
  set(chip_id 0x4)
elseif("${CHIP}" STREQUAL "cv183x")
  set(chip_id 0x1)
  list(APPEND _SOURCES ${BM1880v2_SOURCES} ${CV1880v2_SOURCES})
elseif("${CHIP}" STREQUAL "cv182x")
  set(chip_id 0x2)
else()
  # pc cmodel: no target chip, build every backend
  set(chip_id 0x0)
  list(APPEND _SOURCES
    ${CV180X_SOURCES}
    ${CV181X_SOURCES}
    ${BM1880v2_SOURCES}
    ${CV1880v2_SOURCES})
endif()

# Optional `enum-compare` check: the C compiler does not treat enum-compare
# as an error (gcc bug 30357); compiling src/bm1880v2/non_atomic/*.c as C++
# (set_source_files_properties ... LANGUAGE CXX) enables it when needed.

# --- targets ---------------------------------------------------------------
add_library(cvikernel SHARED ${_SOURCES})
add_library(cvikernel-static STATIC ${_SOURCES})
foreach(tgt IN ITEMS cvikernel cvikernel-static)
  # Target-scoped equivalents of the former include_directories /
  # add_definitions / CMAKE_{C,CXX}_FLAGS settings.
  target_include_directories(${tgt}
    PUBLIC include
    PRIVATE src)
  target_compile_definitions(${tgt} PRIVATE CHIPID=${chip_id})
  target_compile_options(${tgt} PRIVATE ${safety_flags})
endforeach()
target_link_libraries(cvikernel PRIVATE m) # m for <math.h>

install(TARGETS cvikernel cvikernel-static DESTINATION lib)

set(CVI_LIBS ${CVI_LIBS} cvikernel)
|
||||
56
cvikernel/README.md
Normal file
56
cvikernel/README.md
Normal file
@ -0,0 +1,56 @@
|
||||
# bmkernel
|
||||
|
||||
## overview
|
||||
|
||||
bmkernel is a library for generating TPU instructions; it plays the role of an assembler for the TPU instruction set.
|
||||
|
||||
## dependency
|
||||
|
||||
none
|
||||
|
||||
## build
|
||||
|
||||
assuming install to ../install_bmkernel
|
||||
|
||||
```
|
||||
$ cd bmkernel
|
||||
$ mkdir build
|
||||
$ cd build
|
||||
$ cmake -G Ninja -DCHIP=BM1880v2 -DCMAKE_INSTALL_PREFIX=../../install_bmkernel ..
|
||||
|
||||
Build
|
||||
$ cmake --build .
|
||||
$ cmake --build . -- -v
|
||||
|
||||
Install
|
||||
$ cmake --build . --target install
|
||||
$ cmake --build . --target install -- -v
|
||||
|
||||
Test
|
||||
$ cmake --build . --target test -- -v
|
||||
|
||||
Uninstall
|
||||
$ xargs rm < install_manifest.txt
|
||||
```
|
||||
|
||||
## output
|
||||
|
||||
```
|
||||
├── bin
|
||||
│ └── readcmdbuf
|
||||
├── include
|
||||
│ └── bmkernel
|
||||
│ ├── bm1880v2
|
||||
│ │ └── bmkernel_1880v2.h
|
||||
│ ├── bm_kernel.h
|
||||
│ └── bm_kernel_legacy.h
|
||||
└── lib
|
||||
├── libbmkernel.so
|
||||
└── libbmkernel-static.a
|
||||
```
|
||||
|
||||
## TODO
|
||||
|
||||
* add more testing
|
||||
* mv assembly & disassembly here
|
||||
* round trip testing, asm %s | disasm
|
||||
334
cvikernel/include/bmkernel/bm1822/1822_fp_convert.h
Normal file
334
cvikernel/include/bmkernel/bm1822/1822_fp_convert.h
Normal file
@ -0,0 +1,334 @@
|
||||
#ifndef ATOMIC_FP_H_
|
||||
#define ATOMIC_FP_H_
|
||||
|
||||
#if __arm__
|
||||
#define __DISABLE_FENV__
|
||||
#endif
|
||||
|
||||
#ifndef __DISABLE_FENV__
|
||||
#include <fenv.h>
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
static inline uint8_t convert_bf16_u8(uint16_t data);
|
||||
static inline uint8_t _convert_bf16_u8(uint16_t data, int int8_rnd_md);
|
||||
static inline int8_t _convert_bf16_s8(uint16_t data, int int8_rnd_md);
|
||||
static inline int8_t convert_bf16_s8(uint16_t data);
|
||||
static inline uint16_t convert_int8_bf16(uint8_t data, uint8_t sign);
|
||||
static inline uint32_t convert_fp32_u32(float fp32);
|
||||
static inline uint32_t convert_fp32_hex(float val);
|
||||
static inline float convert_hex_fp32(uint32_t hval);
|
||||
|
||||
static inline float convert_bf16_fp32(uint16_t bf16);
|
||||
static inline uint16_t convert_fp32_bf16(float fp32);
|
||||
|
||||
static inline void f32_integer(void *if32, void *o_integer, int integer_size, int accumulate, int int8_signed, int int8_rnd_md);
|
||||
//static inline void f32_integer(void *if32, void *o_integer,
|
||||
// 0 for 32 bit , 1 for 16 bit , 2 for 8 bit
|
||||
// int integer_size, int accumulate = 0, int int8_signed = 1, int int8_rnd_md = 0);
|
||||
|
||||
union convert_type_float {
|
||||
float fval;
|
||||
uint16_t bf16[2];
|
||||
uint32_t ival;
|
||||
};
|
||||
|
||||
typedef union convert_type_float convert_int_float;
|
||||
static const uint16_t NAN_VALUE = 0x7FC0;
|
||||
|
||||
//static int round_mode = 0;
|
||||
/* NaN test without <math.h>: per IEEE-754, NaN is the only value that
 * compares unequal to itself.  Returns nonzero iff x is NaN. */
static uint8_t float_isnan(const float x) {
    uint8_t is_nan = (x == x) ? 0 : 1;
    return is_nan;
}
|
||||
|
||||
/* Switch the FPU rounding mode to round-toward-zero and return the
 * previously active mode so the caller can restore it later with
 * restore_feround().  When <fenv.h> is unavailable (__DISABLE_FENV__,
 * enabled above for __arm__ builds) this is a no-op that returns 0. */
static inline int set_store_feround()
{
#ifndef __DISABLE_FENV__
    int round_mode = fegetround();
    fesetround(FE_TOWARDZERO);
    return round_mode;
#else
    return 0;
#endif
}
|
||||
|
||||
/* Restore the FPU rounding mode previously returned by
 * set_store_feround().  No-op when <fenv.h> is unavailable. */
static inline void restore_feround(int round_mode)
{
#ifndef __DISABLE_FENV__
    fesetround(round_mode);
#else
    (void)round_mode;
#endif
}
|
||||
|
||||
/* Convert a bf16 value (raw bits) to uint8 with saturation to [0,255].
 * int8_rnd_md selects the rounding mode forwarded to f32_integer
 * (0 = round-to-nearest-even, nonzero = truncate toward zero). */
static inline uint8_t _convert_bf16_u8(uint16_t data, int int8_rnd_md)
{
    /* convert bf16 to float32 */
    float fp32;
    convert_int_float convert_val;
    fp32 = convert_bf16_fp32(data);
    /* convert float32 to uint8_t: size 2 = 8-bit, unsigned (int8_signed = 0) */
    f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 0, int8_rnd_md);
    return (uint8_t) convert_val.ival;
}
|
||||
|
||||
/* bf16 -> uint8 using the default rounding mode (0 = round-to-nearest-even). */
static inline uint8_t convert_bf16_u8(uint16_t data)
{
    const int default_rnd_md = 0;
    return (uint8_t) _convert_bf16_u8(data, default_rnd_md);
}
|
||||
|
||||
/* Convert a bf16 value (raw bits) to int8 with saturation to [-128,127].
 * int8_rnd_md selects the rounding mode forwarded to f32_integer
 * (0 = round-to-nearest-even, nonzero = truncate toward zero). */
static inline int8_t _convert_bf16_s8(uint16_t data, int int8_rnd_md)
{
    /* convert bf16 to float32 */
    float fp32;
    convert_int_float convert_val;
    fp32 = convert_bf16_fp32(data);
    /* convert float32 to int8_t: size 2 = 8-bit, signed (int8_signed = 1) */
    f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 1, int8_rnd_md);
    return (int8_t) convert_val.ival;
}
|
||||
|
||||
/* bf16 -> int8 using the default rounding mode (0 = round-to-nearest-even). */
static inline int8_t convert_bf16_s8(uint16_t data)
{
    const int default_rnd_md = 0;
    return (int8_t) _convert_bf16_s8(data, default_rnd_md);
}
|
||||
|
||||
/* Widen an 8-bit integer (interpreted as signed when `sign` is nonzero,
 * unsigned otherwise) and convert it to bf16 raw bits; rounding to bf16
 * is handled by convert_fp32_bf16. */
static inline uint16_t convert_int8_bf16(uint8_t data, uint8_t sign)
{
    int32_t widened;
    if (sign)
        widened = (int8_t) data;  /* reinterpret the byte as signed */
    else
        widened = data;
    return convert_fp32_bf16((float) widened);
}
|
||||
|
||||
/* Convert float32 to bf16 (returned as raw bits) using
 * round-to-nearest-even on the 16 mantissa bits that are dropped.
 * NaN maps to the canonical quiet-NaN pattern 0x7FC0; any result whose
 * exponent field is all-ones (infinity, or overflow created by the
 * rounding add) is clamped to 0x7f7f — per the "HW behavior" note below,
 * this mirrors what the TPU hardware produces. */
static inline uint16_t convert_fp32_bf16(float fp32)
{
    if (float_isnan(fp32))
        return NAN_VALUE;
    convert_int_float convert_val;
    convert_val.fval = fp32;
    uint32_t input = convert_val.ival;
    /* round-to-nearest-even: bias by 0x7fff plus the LSB of the kept half,
     * so exact ties round toward the even 16-bit result */
    uint32_t lsb = (input >> 16) & 1;
    uint32_t rounding_bias = 0x7fff + lsb;
    input += rounding_bias;
    convert_val.bf16[1] = (uint16_t) (input >> 16);

    /* HW behavior: saturate all-ones exponents to the largest finite bf16
     * (note this also discards the sign bit) */
    if ((convert_val.bf16[1] & 0x7f80) == 0x7f80) {
        convert_val.bf16[1] = 0x7f7f;
    }
    return convert_val.bf16[1];
}
|
||||
|
||||
/* float32 -> uint8 with saturation to [0,255], using rounding mode 0
 * (round-to-nearest-even); thin wrapper over f32_integer. */
static inline uint8_t convert_fp32_u8(float fp32)
{
    convert_int_float convert_val;
    f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 0, 0);
    return (uint8_t) convert_val.ival;
}
|
||||
|
||||
/* float32 -> int8 with saturation to [-128,127], using rounding mode 0
 * (round-to-nearest-even); thin wrapper over f32_integer. */
static inline int8_t convert_fp32_s8(float fp32)
{
    convert_int_float convert_val;
    f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 1, 0);
    return (int8_t) convert_val.ival;
}
|
||||
|
||||
/* float32 -> 32-bit integer stored as uint32 (f32_integer size 0).
 * NOTE(review): the conversion goes through flt2int, which clamps to the
 * signed int32 range — values above INT32_MAX are not representable here;
 * confirm that callers never rely on the full uint32 range. */
static inline uint32_t convert_fp32_u32(float fp32)
{
    convert_int_float convert_val;
    f32_integer((void*)&fp32, &convert_val.ival, 0, 0, 0, 0);
    return (uint32_t) convert_val.ival;
}
|
||||
|
||||
/* float32 -> int32 (f32_integer size 0); flt2int clamps the result to
 * the int32 range, using rounding mode 0 (round-to-nearest-even). */
static inline int32_t convert_fp32_s32(float fp32)
{
    convert_int_float convert_val;
    f32_integer((void*)&fp32, &convert_val.ival, 0, 0, 1, 0);
    return (int32_t) convert_val.ival;
}
|
||||
|
||||
/* convert hex to float directly */
|
||||
static inline float convert_hex_fp32(uint32_t hval)
|
||||
{
|
||||
convert_int_float convert_val;
|
||||
convert_val.ival = hval;
|
||||
return convert_val.fval;
|
||||
}
|
||||
/* convert float to hex directly */
|
||||
static inline uint32_t convert_fp32_hex(float val)
|
||||
{
|
||||
convert_int_float convert_val;
|
||||
convert_val.fval = val;
|
||||
return convert_val.ival;
|
||||
}
|
||||
/* Expand bf16 raw bits to float32 by placing them in the high 16 bits of
 * the float and zeroing the low 16 (exact: bf16 is a truncated float32).
 * NOTE(review): writing bf16[1] as the high half assumes a little-endian
 * host — confirm before using on big-endian targets. */
static inline float convert_bf16_fp32(uint16_t bf16)
{
    convert_int_float convert_val;
    convert_val.bf16[1] = bf16;
    convert_val.bf16[0] = 0;
    return convert_val.fval;
}
|
||||
|
||||
static inline void flt2int_flt(float x, unsigned long long* integer_part, float * sub_part, uint8_t sign)
|
||||
{
|
||||
convert_int_float work_x;
|
||||
int level_code;
|
||||
unsigned long tail_code;
|
||||
work_x.fval = x;
|
||||
level_code = ((work_x.ival >> 23) & 0xff) - 127;
|
||||
|
||||
//if the level code is negaive, the integer part of the float is zero
|
||||
if ( level_code < 0 ){
|
||||
*integer_part = 0;
|
||||
*sub_part = x;
|
||||
}
|
||||
else {
|
||||
tail_code = (work_x.ival) & 0x7fffff;
|
||||
tail_code = tail_code | 0x800000;
|
||||
|
||||
if (level_code < 23){
|
||||
tail_code >>= (23 - level_code);
|
||||
*integer_part = tail_code;
|
||||
work_x.ival &= 0xffffffff << (23 - level_code);
|
||||
*sub_part = x - work_x.fval;
|
||||
}
|
||||
else {
|
||||
tail_code <<= (level_code - 23);
|
||||
*integer_part = tail_code;
|
||||
if(level_code>30){
|
||||
*integer_part = 0x7fffffff;
|
||||
if(sign)*integer_part = 0x800000000;
|
||||
}
|
||||
*sub_part = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Convert a float to a 32-bit integer.
 * int8_rnd_md == 0 selects round-to-nearest-even (ties on .5 go to the
 * even integer); any other value truncates toward zero.  The magnitude is
 * split by flt2int_flt(), rounded here, then clamped to the int32 range
 * (magnitude 0x80000000 for negatives, 0x7fffffff for positives).
 * NOTE(review): `false`/`true` are used but no <stdbool.h> include is
 * visible in this header — presumably an includer provides it; verify.
 * NOTE(review): `unsigned long intNum` overlays a 4-byte float; this works
 * where the low word aliases the float (e.g. LP64 little-endian) — confirm
 * for other ABIs. */
inline static int flt2int(float ifval, int int8_rnd_md)
{
    union {
        float floatNum;
        unsigned long intNum;
    } tempIfval;
    tempIfval.floatNum = ifval;
    /* sign test on the raw bits (this treats -0.0f as negative) */
    uint8_t isPositive = ((tempIfval.intNum & 0x80000000UL) == 0x80000000UL) ? false : true ;
    float abs_fval = (!isPositive) ? -ifval : ifval;
    float sub_part;
    unsigned long long integer_part;
    uint8_t sign = !isPositive;
    flt2int_flt(abs_fval, &integer_part, &sub_part, sign);
    if (!isPositive)
    {
        unsigned long long result;
        if(int8_rnd_md == 0) { // round to nearest even
            if ( sub_part > 0.5f )
            {
                result = integer_part + 1;
            }
            else if (sub_part == 0.5f)
            {
                /* exact tie: round the magnitude to the even neighbour */
                if ( integer_part & 0x1 )
                {
                    result = integer_part + 1;
                }
                else
                {
                    result = integer_part;
                }
            }
            else
            {
                result = integer_part;
            }
        } else { //round to zero
            result = integer_part;
        }
        /* clamp the magnitude to |INT32_MIN| before negating */
        if ( result > 0x80000000UL )
        {
            result = 0x80000000UL;
        }
        /* negate the unsigned magnitude; converted to int on return
         * (yields INT32_MIN for the clamped case on two's-complement) */
        return -result;
    }
    else
    {
        unsigned long long result;
        if(int8_rnd_md == 0) { // round to nearest even
            if ( sub_part > 0.5f )
            {
                result = integer_part + 1;
            }
            else if ( sub_part == 0.5f )
            {
                /* exact tie: round to the even neighbour */
                if ( integer_part & 0x1 )
                {
                    result = integer_part + 1;
                }
                else
                {
                    result = integer_part;
                }
            }
            else
            {
                result = integer_part;
            }
        } else {
            result = integer_part;
        }
        /* clamp to INT32_MAX */
        if ( result > 0x7fffffff )
        {
            result = 0x7fffffff;
        }
        return result;
    }
}
|
||||
|
||||
/* Convert the float pointed to by if32 to an integer of the requested
 * width and store it through o_integer.
 *   integer_size: 0 = 32-bit, 1 = 16-bit, anything else = 8-bit
 *   accumulate:   when nonzero, add the destination's previous value to
 *                 the stored result
 *   int8_signed:  8-bit only — saturate to [-128,127] when set, else [0,255]
 *   int8_rnd_md:  rounding mode forwarded to flt2int
 *                 (0 = round-to-nearest-even, nonzero = truncate)
 * Widths other than 8-bit are not saturated here beyond flt2int's own
 * int32 clamp. */
static inline void f32_integer(void *if32, void *o_integer, int integer_size, int accumulate, int int8_signed, int int8_rnd_md)
{
    int i_tmp;
    float *f_tmp;
    f_tmp = (float *)if32;
    i_tmp = flt2int(*f_tmp, int8_rnd_md);
    int *o32 = (int *)o_integer;
    int dst_f32 = *o32;       /* previous destination value, for accumulate */
    short *o16 = (short *)o_integer;
    short dst_o16 = *o16;     /* BUGFIX: was `*o32`, which read the old 16-bit
                                 value through the 32-bit pointer — wrong on
                                 big-endian hosts (benign-by-accident on LE) */
    char *o8 = (char *)o_integer;
    char dst_o8 = *o8;

    if (integer_size == 0) {
        *o32 = i_tmp;
    } else if (integer_size == 1) {
        *o16 = i_tmp;
    } else {
        *o8 = i_tmp;
        /* saturate the 8-bit result */
        int min = (int8_signed) ? -128 : 0;
        int max = (int8_signed) ? 127 : 255;
        if (i_tmp < min ){
            *o8 = min;
        }
        else if (i_tmp > max){
            *o8 = max;
        }
    }
    if (accumulate) {
        if (integer_size == 0) {
            *o32 += dst_f32;
        } else if (integer_size == 1) {
            *o16 += dst_o16;
        } else
            *o8 += dst_o8;
    }
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* ATOMIC_FP_H_ */
|
||||
|
||||
306
cvikernel/include/bmkernel/bm1822/bm1822_tdma_reg.h
Normal file
306
cvikernel/include/bmkernel/bm1822/bm1822_tdma_reg.h
Normal file
@ -0,0 +1,306 @@
|
||||
#ifndef BM1822_TDMA_REG_H
|
||||
#define BM1822_TDMA_REG_H
|
||||
|
||||
/*
|
||||
* This file is generated by tools. Do not edit it manually.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
typedef unsigned long long ullong;
|
||||
|
||||
typedef struct {
|
||||
uint32_t vld;
|
||||
uint32_t compress_en;
|
||||
uint32_t eod;
|
||||
uint32_t intp_en;
|
||||
uint32_t bar_en;
|
||||
uint32_t check_bf16_value;
|
||||
uint32_t trans_dir;
|
||||
uint32_t rsv00;
|
||||
uint32_t trans_fmt;
|
||||
uint32_t transpose_md;
|
||||
uint32_t rsv01;
|
||||
uint32_t intra_cmd_paral;
|
||||
uint32_t outstanding_en;
|
||||
uint32_t cmd_id;
|
||||
uint32_t spec_func;
|
||||
uint32_t dst_fmt;
|
||||
uint32_t src_fmt;
|
||||
uint32_t cmprs_fmt;
|
||||
uint32_t sys_dtype;
|
||||
uint32_t rsv2_1;
|
||||
uint32_t int8_sign;
|
||||
uint32_t compress_zero_guard;
|
||||
uint32_t int8_rnd_mode;
|
||||
uint32_t wait_id_tpu;
|
||||
uint32_t wait_id_other_tdma;
|
||||
uint32_t wait_id_sdma;
|
||||
uint32_t const_val;
|
||||
uint32_t src_base_reg_sel;
|
||||
uint32_t mv_lut_idx;
|
||||
uint32_t dst_base_reg_sel;
|
||||
uint32_t mv_lut_base;
|
||||
uint32_t rsv4_5;
|
||||
uint32_t dst_h_stride;
|
||||
uint32_t dst_c_stride_low;
|
||||
uint32_t dst_n_stride;
|
||||
uint32_t src_h_stride;
|
||||
uint32_t src_c_stride_low;
|
||||
uint32_t src_n_stride;
|
||||
uint32_t dst_c;
|
||||
uint32_t src_c;
|
||||
uint32_t dst_w;
|
||||
uint32_t dst_h;
|
||||
uint32_t src_w;
|
||||
uint32_t src_h;
|
||||
uint32_t dst_base_addr_low;
|
||||
uint32_t src_base_addr_low;
|
||||
uint32_t src_n;
|
||||
uint32_t dst_base_addr_high;
|
||||
uint32_t src_base_addr_high;
|
||||
uint32_t src_c_stride_high;
|
||||
uint32_t dst_c_stride_high;
|
||||
uint32_t compress_bias0;
|
||||
uint32_t compress_bias1;
|
||||
uint32_t layer_ID;
|
||||
} tdma_reg_t;
|
||||
|
||||
static inline void parse_tdma_reg(tdma_reg_t *r, const uint32_t *p)
|
||||
{
|
||||
r->vld = p[0] & 1;
|
||||
r->compress_en = (p[0] >> 1) & 1;
|
||||
r->eod = (p[0] >> 2) & 1;
|
||||
r->intp_en = (p[0] >> 3) & 1;
|
||||
r->bar_en = (p[0] >> 4) & 1;
|
||||
r->check_bf16_value = (p[0] >> 5) & 1;
|
||||
r->trans_dir = (p[0] >> 6) & ((1u << 2) - 1);
|
||||
r->rsv00 = (p[0] >> 8) & ((1u << 2) - 1);
|
||||
r->trans_fmt = (p[0] >> 10) & 1;
|
||||
r->transpose_md = (p[0] >> 11) & ((1u << 2) - 1);
|
||||
r->rsv01 = (p[0] >> 13) & 1;
|
||||
r->intra_cmd_paral = (p[0] >> 14) & 1;
|
||||
r->outstanding_en = (p[0] >> 15) & 1;
|
||||
r->cmd_id = (p[0] >> 16) & ((1u << 16) - 1);
|
||||
r->spec_func = p[1] & ((1u << 3) - 1);
|
||||
r->dst_fmt = (p[1] >> 3) & ((1u << 2) - 1);
|
||||
r->src_fmt = (p[1] >> 5) & ((1u << 2) - 1);
|
||||
r->cmprs_fmt = (p[1] >> 7) & 1;
|
||||
r->sys_dtype = (p[1] >> 8) & 1;
|
||||
r->rsv2_1 = (p[1] >> 9) & ((1u << 4) - 1);
|
||||
r->int8_sign = (p[1] >> 13) & 1;
|
||||
r->compress_zero_guard = (p[1] >> 14) & 1;
|
||||
r->int8_rnd_mode = (p[1] >> 15) & 1;
|
||||
r->wait_id_tpu = (p[1] >> 16) & ((1u << 16) - 1);
|
||||
r->wait_id_other_tdma = p[2] & ((1u << 16) - 1);
|
||||
r->wait_id_sdma = (p[2] >> 16) & ((1u << 16) - 1);
|
||||
r->const_val = p[3] & ((1u << 16) - 1);
|
||||
r->src_base_reg_sel = (p[3] >> 16) & ((1u << 3) - 1);
|
||||
r->mv_lut_idx = (p[3] >> 19) & 1;
|
||||
r->dst_base_reg_sel = (p[3] >> 20) & ((1u << 3) - 1);
|
||||
r->mv_lut_base = (p[3] >> 23) & 1;
|
||||
r->rsv4_5 = (p[3] >> 24) & ((1u << 8) - 1);
|
||||
r->dst_h_stride = p[4] & ((1u << 16) - 1);
|
||||
r->dst_c_stride_low = (p[4] >> 16) & ((1u << 16) - 1);
|
||||
r->dst_n_stride = p[5];
|
||||
r->src_h_stride = p[6] & ((1u << 16) - 1);
|
||||
r->src_c_stride_low = (p[6] >> 16) & ((1u << 16) - 1);
|
||||
r->src_n_stride = p[7];
|
||||
r->dst_c = p[8] & ((1u << 16) - 1);
|
||||
r->src_c = (p[8] >> 16) & ((1u << 16) - 1);
|
||||
r->dst_w = p[9] & ((1u << 16) - 1);
|
||||
r->dst_h = (p[9] >> 16) & ((1u << 16) - 1);
|
||||
r->src_w = p[10] & ((1u << 16) - 1);
|
||||
r->src_h = (p[10] >> 16) & ((1u << 16) - 1);
|
||||
r->dst_base_addr_low = p[11];
|
||||
r->src_base_addr_low = p[12];
|
||||
r->src_n = p[13] & ((1u << 16) - 1);
|
||||
r->dst_base_addr_high = (p[13] >> 16) & ((1u << 8) - 1);
|
||||
r->src_base_addr_high = (p[13] >> 24) & ((1u << 8) - 1);
|
||||
r->src_c_stride_high = p[14] & ((1u << 16) - 1);
|
||||
r->dst_c_stride_high = (p[14] >> 16) & ((1u << 16) - 1);
|
||||
r->compress_bias0 = p[15] & ((1u << 8) - 1);
|
||||
r->compress_bias1 = (p[15] >> 8) & ((1u << 8) - 1);
|
||||
r->layer_ID = (p[15] >> 16) & ((1u << 16) - 1);
|
||||
}
|
||||
|
||||
static inline void emit_tdma_reg(const tdma_reg_t *r, uint32_t *_p)
|
||||
{
|
||||
volatile uint32_t *p = (typeof(p))_p;
|
||||
p[15] = (r->compress_bias0 & ((1u << 8) - 1)) |
|
||||
((r->compress_bias1 & ((1u << 8) - 1)) << 8) |
|
||||
((r->layer_ID & ((1u << 16) - 1)) << 16);
|
||||
p[14] = (r->src_c_stride_high & ((1u << 16) - 1)) |
|
||||
((r->dst_c_stride_high & ((1u << 16) - 1)) << 16);
|
||||
p[13] = (r->src_n & ((1u << 16) - 1)) |
|
||||
((r->dst_base_addr_high & ((1u << 8) - 1)) << 16) |
|
||||
((r->src_base_addr_high & ((1u << 8) - 1)) << 24);
|
||||
p[12] = (r->src_base_addr_low & (((uint64_t)1 << 32) - 1));
|
||||
p[11] = (r->dst_base_addr_low & (((uint64_t)1 << 32) - 1));
|
||||
p[10] = (r->src_w & ((1u << 16) - 1)) |
|
||||
((r->src_h & ((1u << 16) - 1)) << 16);
|
||||
p[9] = (r->dst_w & ((1u << 16) - 1)) |
|
||||
((r->dst_h & ((1u << 16) - 1)) << 16);
|
||||
p[8] = (r->dst_c & ((1u << 16) - 1)) |
|
||||
((r->src_c & ((1u << 16) - 1)) << 16);
|
||||
p[7] = (r->src_n_stride & (((uint64_t)1 << 32) - 1));
|
||||
p[6] = (r->src_h_stride & ((1u << 16) - 1)) |
|
||||
((r->src_c_stride_low & ((1u << 16) - 1)) << 16);
|
||||
p[5] = (r->dst_n_stride & (((uint64_t)1 << 32) - 1));
|
||||
p[4] = (r->dst_h_stride & ((1u << 16) - 1)) |
|
||||
((r->dst_c_stride_low & ((1u << 16) - 1)) << 16);
|
||||
p[3] = (r->const_val & ((1u << 16) - 1)) |
|
||||
((r->src_base_reg_sel & ((1u << 3) - 1)) << 16) |
|
||||
((r->mv_lut_idx & 1) << 19) |
|
||||
((r->dst_base_reg_sel & ((1u << 3) - 1)) << 20) |
|
||||
((r->mv_lut_base & 1) << 23) |
|
||||
((r->rsv4_5 & ((1u << 8) - 1)) << 24);
|
||||
p[2] = (r->wait_id_other_tdma & ((1u << 16) - 1)) |
|
||||
((r->wait_id_sdma & ((1u << 16) - 1)) << 16);
|
||||
p[1] = (r->spec_func & ((1u << 3) - 1)) |
|
||||
((r->dst_fmt & ((1u << 2) - 1)) << 3) |
|
||||
((r->src_fmt & ((1u << 2) - 1)) << 5) |
|
||||
((r->cmprs_fmt & 1) << 7) |
|
||||
((r->sys_dtype & 1) << 8) |
|
||||
((r->rsv2_1 & ((1u << 4) - 1)) << 9) |
|
||||
((r->int8_sign & 1) << 13) |
|
||||
((r->compress_zero_guard & 1) << 14) |
|
||||
((r->int8_rnd_mode & 1) << 15) |
|
||||
((r->wait_id_tpu & ((1u << 16) - 1)) << 16);
|
||||
p[0] = (r->vld & 1) |
|
||||
((r->compress_en & 1) << 1) |
|
||||
((r->eod & 1) << 2) |
|
||||
((r->intp_en & 1) << 3) |
|
||||
((r->bar_en & 1) << 4) |
|
||||
((r->check_bf16_value & 1) << 5) |
|
||||
((r->trans_dir & ((1u << 2) - 1)) << 6) |
|
||||
((r->rsv00 & ((1u << 2) - 1)) << 8) |
|
||||
((r->trans_fmt & 1) << 10) |
|
||||
((r->transpose_md & ((1u << 2) - 1)) << 11) |
|
||||
((r->rsv01 & 1) << 13) |
|
||||
((r->intra_cmd_paral & 1) << 14) |
|
||||
((r->outstanding_en & 1) << 15) |
|
||||
((r->cmd_id & ((1u << 16) - 1)) << 16);
|
||||
}
|
||||
|
||||
static inline void reset_tdma_reg(tdma_reg_t *r)
|
||||
{
|
||||
r->vld = 0x0;
|
||||
r->compress_en = 0x0;
|
||||
r->eod = 0x0;
|
||||
r->intp_en = 0x0;
|
||||
r->bar_en = 0x0;
|
||||
r->check_bf16_value = 0x0;
|
||||
r->trans_dir = 0x0;
|
||||
r->rsv00 = 0x0;
|
||||
r->trans_fmt = 0x0;
|
||||
r->transpose_md = 0x0;
|
||||
r->rsv01 = 0x0;
|
||||
r->intra_cmd_paral = 0x0;
|
||||
r->outstanding_en = 0x0;
|
||||
r->cmd_id = 0x0;
|
||||
r->spec_func = 0x0;
|
||||
r->dst_fmt = 0x1;
|
||||
r->src_fmt = 0x1;
|
||||
r->cmprs_fmt = 0x0;
|
||||
r->sys_dtype = 0x0;
|
||||
r->rsv2_1 = 0x0;
|
||||
r->int8_sign = 0x0;
|
||||
r->compress_zero_guard = 0x0;
|
||||
r->int8_rnd_mode = 0x0;
|
||||
r->wait_id_tpu = 0x0;
|
||||
r->wait_id_other_tdma = 0x0;
|
||||
r->wait_id_sdma = 0x0;
|
||||
r->const_val = 0x0;
|
||||
r->src_base_reg_sel = 0x0;
|
||||
r->mv_lut_idx = 0x0;
|
||||
r->dst_base_reg_sel = 0x0;
|
||||
r->mv_lut_base = 0x0;
|
||||
r->rsv4_5 = 0x0;
|
||||
r->dst_h_stride = 0x1;
|
||||
r->dst_c_stride_low = 0x1;
|
||||
r->dst_n_stride = 0x1;
|
||||
r->src_h_stride = 0x1;
|
||||
r->src_c_stride_low = 0x1;
|
||||
r->src_n_stride = 0x1;
|
||||
r->dst_c = 0x1;
|
||||
r->src_c = 0x1;
|
||||
r->dst_w = 0x1;
|
||||
r->dst_h = 0x1;
|
||||
r->src_w = 0x1;
|
||||
r->src_h = 0x1;
|
||||
r->dst_base_addr_low = 0x0;
|
||||
r->src_base_addr_low = 0x0;
|
||||
r->src_n = 0x1;
|
||||
r->dst_base_addr_high = 0x0;
|
||||
r->src_base_addr_high = 0x0;
|
||||
r->src_c_stride_high = 0x0;
|
||||
r->dst_c_stride_high = 0x0;
|
||||
r->compress_bias0 = 0x0;
|
||||
r->compress_bias1 = 0x0;
|
||||
r->layer_ID = 0x0;
|
||||
}
|
||||
|
||||
static inline void trace_tdma_reg(tdma_reg_t *r, const char *tag)
|
||||
{
|
||||
#define trace_one_reg(name) \
|
||||
printf(" %s: 0x%llx\n", #name, (ullong)r->name)
|
||||
|
||||
printf("--- %s ---\n", tag);
|
||||
trace_one_reg(vld);
|
||||
trace_one_reg(compress_en);
|
||||
trace_one_reg(eod);
|
||||
trace_one_reg(intp_en);
|
||||
trace_one_reg(bar_en);
|
||||
trace_one_reg(check_bf16_value);
|
||||
trace_one_reg(trans_dir);
|
||||
trace_one_reg(rsv00);
|
||||
trace_one_reg(trans_fmt);
|
||||
trace_one_reg(transpose_md);
|
||||
trace_one_reg(rsv01);
|
||||
trace_one_reg(intra_cmd_paral);
|
||||
trace_one_reg(outstanding_en);
|
||||
trace_one_reg(cmd_id);
|
||||
trace_one_reg(spec_func);
|
||||
trace_one_reg(dst_fmt);
|
||||
trace_one_reg(src_fmt);
|
||||
trace_one_reg(cmprs_fmt);
|
||||
trace_one_reg(sys_dtype);
|
||||
trace_one_reg(rsv2_1);
|
||||
trace_one_reg(int8_sign);
|
||||
trace_one_reg(compress_zero_guard);
|
||||
trace_one_reg(int8_rnd_mode);
|
||||
trace_one_reg(wait_id_tpu);
|
||||
trace_one_reg(wait_id_other_tdma);
|
||||
trace_one_reg(wait_id_sdma);
|
||||
trace_one_reg(const_val);
|
||||
trace_one_reg(src_base_reg_sel);
|
||||
trace_one_reg(mv_lut_idx);
|
||||
trace_one_reg(dst_base_reg_sel);
|
||||
trace_one_reg(mv_lut_base);
|
||||
trace_one_reg(rsv4_5);
|
||||
trace_one_reg(dst_h_stride);
|
||||
trace_one_reg(dst_c_stride_low);
|
||||
trace_one_reg(dst_n_stride);
|
||||
trace_one_reg(src_h_stride);
|
||||
trace_one_reg(src_c_stride_low);
|
||||
trace_one_reg(src_n_stride);
|
||||
trace_one_reg(dst_c);
|
||||
trace_one_reg(src_c);
|
||||
trace_one_reg(dst_w);
|
||||
trace_one_reg(dst_h);
|
||||
trace_one_reg(src_w);
|
||||
trace_one_reg(src_h);
|
||||
trace_one_reg(dst_base_addr_low);
|
||||
trace_one_reg(src_base_addr_low);
|
||||
trace_one_reg(src_n);
|
||||
trace_one_reg(dst_base_addr_high);
|
||||
trace_one_reg(src_base_addr_high);
|
||||
trace_one_reg(src_c_stride_high);
|
||||
trace_one_reg(dst_c_stride_high);
|
||||
trace_one_reg(compress_bias0);
|
||||
trace_one_reg(compress_bias1);
|
||||
trace_one_reg(layer_ID);
|
||||
}
|
||||
#endif /* BM1822_TDMA_REG_H */
|
||||
599
cvikernel/include/bmkernel/bm1822/bm1822_tiu_reg.h
Normal file
599
cvikernel/include/bmkernel/bm1822/bm1822_tiu_reg.h
Normal file
@ -0,0 +1,599 @@
|
||||
#ifndef BM1822_TIU_REG_H
|
||||
#define BM1822_TIU_REG_H
|
||||
|
||||
/*
|
||||
* This file is generated by tools. Do not edit it manually.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
typedef unsigned long long ullong;
|
||||
|
||||
typedef struct {
|
||||
uint32_t cmd_en;
|
||||
uint32_t cmd_end;
|
||||
uint32_t cmd_id_en;
|
||||
uint32_t cmd_keep;
|
||||
uint32_t cmd_intr_en;
|
||||
uint32_t tsk_typ;
|
||||
uint32_t tsk_eu_typ;
|
||||
uint32_t tsk_opd_num;
|
||||
uint32_t opt_res_shift;
|
||||
uint32_t opt_left_shift;
|
||||
uint32_t opt_shift_typ;
|
||||
uint32_t opt_rshift_typ;
|
||||
uint32_t dummy1;
|
||||
uint32_t opd_typ;
|
||||
uint32_t opt_chl_quan;
|
||||
uint32_t cmd_id_tpu;
|
||||
uint32_t cmd_id_gdma;
|
||||
uint32_t quan_m;
|
||||
uint32_t opt_res0_sign;
|
||||
uint32_t opt_opd0_sign;
|
||||
uint32_t opt_opd1_sign;
|
||||
uint32_t opt_opd2_sign;
|
||||
uint32_t opt_res0_seg;
|
||||
uint32_t opt_opd0_seg;
|
||||
uint32_t opt_opd1_seg;
|
||||
uint32_t opt_opd2_seg;
|
||||
uint32_t ps32_md;
|
||||
uint32_t double_conv;
|
||||
uint32_t opt_left_tran;
|
||||
uint32_t fp_round_typ;
|
||||
uint32_t opt_relu_typ;
|
||||
uint32_t opt_relu_value;
|
||||
uint32_t cmd_pre_exe_typ;
|
||||
uint32_t opt_res_add;
|
||||
uint32_t rsvd0;
|
||||
uint32_t conv_opd0_x_ins0;
|
||||
uint32_t conv_opd0_y_ins0;
|
||||
uint32_t conv_opd0_x_ins0_last;
|
||||
uint32_t conv_opd0_y_ins0_last;
|
||||
uint32_t conv_opd1_x_ins0;
|
||||
uint32_t conv_opd1_y_ins0;
|
||||
uint32_t dummy0;
|
||||
uint32_t opd0_ins_val;
|
||||
uint32_t conv_opd0_up_pad;
|
||||
uint32_t conv_opd0_dn_pad;
|
||||
uint32_t conv_opd0_lf_pad;
|
||||
uint32_t conv_opd0_rt_pad;
|
||||
uint32_t res0_n;
|
||||
uint32_t res0_c;
|
||||
uint32_t res0_h;
|
||||
uint32_t res0_w;
|
||||
uint32_t conv_op_x_str;
|
||||
uint32_t conv_op_y_str;
|
||||
uint32_t cmd_pre_exe;
|
||||
uint32_t rsvd1;
|
||||
uint32_t res0_addr;
|
||||
uint32_t opd0_addr;
|
||||
uint32_t opd1_addr;
|
||||
uint32_t opd2_addr;
|
||||
uint32_t opt_opd0_const;
|
||||
uint32_t opt_opd1_const;
|
||||
uint32_t opt_opd2_const;
|
||||
uint32_t short_nchwstr_same;
|
||||
uint32_t short_res0_str;
|
||||
uint32_t short_opd0_str;
|
||||
uint32_t short_opd1_str;
|
||||
uint32_t short_opd2_str;
|
||||
uint32_t dummy2;
|
||||
uint32_t opd0_n;
|
||||
uint32_t opd0_c;
|
||||
uint32_t dummy3;
|
||||
uint32_t rsvd2;
|
||||
uint32_t opd0_h;
|
||||
uint32_t opd0_w;
|
||||
uint32_t opd1_n;
|
||||
uint32_t opd1_c;
|
||||
uint32_t opd1_h;
|
||||
uint32_t opd1_w;
|
||||
uint32_t opd2_n;
|
||||
uint32_t opd2_c;
|
||||
uint32_t opd2_h;
|
||||
uint32_t opd2_w;
|
||||
uint32_t dummy4;
|
||||
uint32_t rsvd3;
|
||||
uint32_t layer_info;
|
||||
uint32_t res0_n_str;
|
||||
uint32_t res0_c_str;
|
||||
uint32_t res0_h_str;
|
||||
uint32_t res0_w_str;
|
||||
uint32_t res0_b_str;
|
||||
uint32_t opd0_n_str;
|
||||
uint32_t dummy5;
|
||||
uint32_t rsvd4;
|
||||
uint32_t opd0_c_str;
|
||||
uint32_t opd0_h_str;
|
||||
uint32_t opd0_w_str;
|
||||
uint32_t opd0_b_str;
|
||||
uint32_t opd1_n_str;
|
||||
uint32_t opd1_c_str;
|
||||
uint32_t opd1_h_str;
|
||||
uint32_t dummy6;
|
||||
uint32_t rsvd5;
|
||||
uint32_t opd1_w_str;
|
||||
uint32_t opd1_b_str;
|
||||
uint32_t opd2_n_str;
|
||||
uint32_t opd2_c_str;
|
||||
uint32_t opd2_h_str;
|
||||
uint32_t opd2_w_str;
|
||||
uint32_t opd2_b_str;
|
||||
uint32_t dummy7;
|
||||
uint32_t rsvd6;
|
||||
} tiu_reg_t;
|
||||
|
||||
/* Decode a packed TIU command (array of 28 little-endian 32-bit words)
 * into the expanded tiu_reg_t view.  Each statement extracts one field
 * from its fixed bit position; the four fields that straddle a word
 * boundary (res0_h, opd0_addr, opd1_n, opd1_w) are assembled from two
 * words with a shift-OR.  Must stay the exact inverse of
 * emit_tiu_reg(). */
static inline void parse_tiu_reg(tiu_reg_t *r, const uint32_t *p)
{
  /* word 0: command control and task/shift options */
  r->cmd_en = p[0] & 1;
  r->cmd_end = (p[0] >> 1) & 1;
  r->cmd_id_en = (p[0] >> 2) & 1;
  r->cmd_keep = (p[0] >> 3) & 1;
  r->cmd_intr_en = (p[0] >> 4) & 1;
  r->tsk_typ = (p[0] >> 5) & ((1u << 4) - 1);
  r->tsk_eu_typ = (p[0] >> 9) & ((1u << 5) - 1);
  r->tsk_opd_num = (p[0] >> 14) & ((1u << 2) - 1);
  r->opt_res_shift = (p[0] >> 16) & ((1u << 6) - 1);
  r->opt_left_shift = (p[0] >> 22) & ((1u << 5) - 1);
  r->opt_shift_typ = (p[0] >> 27) & 1;
  r->opt_rshift_typ = (p[0] >> 28) & 1;
  r->dummy1 = (p[0] >> 29) & 1;
  r->opd_typ = (p[0] >> 30) & 1;
  r->opt_chl_quan = (p[0] >> 31) & 1;
  /* word 1: command-ID synchronization counters */
  r->cmd_id_tpu = p[1] & ((1u << 16) - 1);
  r->cmd_id_gdma = (p[1] >> 16) & ((1u << 16) - 1);
  /* word 2: quantization multiplier (full 32 bits) */
  r->quan_m = p[2];
  /* word 3: operand sign/segment flags and misc options */
  r->opt_res0_sign = p[3] & 1;
  r->opt_opd0_sign = (p[3] >> 1) & 1;
  r->opt_opd1_sign = (p[3] >> 2) & 1;
  r->opt_opd2_sign = (p[3] >> 3) & 1;
  r->opt_res0_seg = (p[3] >> 4) & ((1u << 2) - 1);
  r->opt_opd0_seg = (p[3] >> 6) & ((1u << 2) - 1);
  r->opt_opd1_seg = (p[3] >> 8) & ((1u << 2) - 1);
  r->opt_opd2_seg = (p[3] >> 10) & 1;
  r->ps32_md = (p[3] >> 11) & ((1u << 2) - 1);
  r->double_conv = (p[3] >> 13) & 1;
  r->opt_left_tran = (p[3] >> 14) & 1;
  r->fp_round_typ = (p[3] >> 15) & 1;
  r->opt_relu_typ = (p[3] >> 16) & ((1u << 2) - 1);
  r->opt_relu_value = (p[3] >> 18) & ((1u << 8) - 1);
  r->cmd_pre_exe_typ = (p[3] >> 26) & 1;
  r->opt_res_add = (p[3] >> 27) & 1;
  r->rsvd0 = (p[3] >> 28) & ((1u << 4) - 1);
  /* word 4: convolution insertion (dilation) parameters */
  r->conv_opd0_x_ins0 = p[4] & ((1u << 4) - 1);
  r->conv_opd0_y_ins0 = (p[4] >> 4) & ((1u << 4) - 1);
  r->conv_opd0_x_ins0_last = (p[4] >> 8) & ((1u << 4) - 1);
  r->conv_opd0_y_ins0_last = (p[4] >> 12) & ((1u << 4) - 1);
  r->conv_opd1_x_ins0 = (p[4] >> 16) & ((1u << 4) - 1);
  r->conv_opd1_y_ins0 = (p[4] >> 20) & ((1u << 4) - 1);
  r->dummy0 = (p[4] >> 24) & ((1u << 8) - 1);
  /* word 5: insertion fill value and padding */
  r->opd0_ins_val = p[5] & ((1u << 16) - 1);
  r->conv_opd0_up_pad = (p[5] >> 16) & ((1u << 4) - 1);
  r->conv_opd0_dn_pad = (p[5] >> 20) & ((1u << 4) - 1);
  r->conv_opd0_lf_pad = (p[5] >> 24) & ((1u << 4) - 1);
  r->conv_opd0_rt_pad = (p[5] >> 28) & ((1u << 4) - 1);
  /* words 6-7: result shape; res0_h is split 8+4 bits across the two words */
  r->res0_n = p[6] & ((1u << 12) - 1);
  r->res0_c = (p[6] >> 12) & ((1u << 12) - 1);
  r->res0_h = (p[6] >> 24) & ((1u << 8) - 1);
  r->res0_h |= (uint64_t)(p[7] & ((1u << 4) - 1)) << 8;
  r->res0_w = (p[7] >> 4) & ((1u << 12) - 1);
  r->conv_op_x_str = (p[7] >> 16) & ((1u << 5) - 1);
  r->conv_op_y_str = (p[7] >> 21) & ((1u << 5) - 1);
  r->cmd_pre_exe = (p[7] >> 26) & ((1u << 2) - 1);
  r->rsvd1 = (p[7] >> 28) & ((1u << 4) - 1);
  /* words 8-10: local-memory addresses; opd0_addr is split 8+16 bits */
  r->res0_addr = p[8] & ((1u << 24) - 1);
  r->opd0_addr = (p[8] >> 24) & ((1u << 8) - 1);
  r->opd0_addr |= (uint64_t)(p[9] & ((1u << 16) - 1)) << 8;
  r->opd1_addr = (p[9] >> 16) & ((1u << 16) - 1);
  r->opd2_addr = p[10] & ((1u << 16) - 1);
  r->opt_opd0_const = (p[10] >> 16) & 1;
  r->opt_opd1_const = (p[10] >> 17) & 1;
  r->opt_opd2_const = (p[10] >> 18) & 1;
  r->short_nchwstr_same = (p[10] >> 19) & 1;
  r->short_res0_str = (p[10] >> 20) & ((1u << 2) - 1);
  r->short_opd0_str = (p[10] >> 22) & ((1u << 2) - 1);
  r->short_opd1_str = (p[10] >> 24) & ((1u << 2) - 1);
  r->short_opd2_str = (p[10] >> 26) & ((1u << 2) - 1);
  r->dummy2 = (p[10] >> 28) & ((1u << 4) - 1);
  /* words 11-15: operand shapes; opd1_n split 8+4, opd1_w split 4+8 */
  r->opd0_n = p[11] & ((1u << 12) - 1);
  r->opd0_c = (p[11] >> 12) & ((1u << 12) - 1);
  r->dummy3 = (p[11] >> 24) & ((1u << 4) - 1);
  r->rsvd2 = (p[11] >> 28) & ((1u << 4) - 1);
  r->opd0_h = p[12] & ((1u << 12) - 1);
  r->opd0_w = (p[12] >> 12) & ((1u << 12) - 1);
  r->opd1_n = (p[12] >> 24) & ((1u << 8) - 1);
  r->opd1_n |= (uint64_t)(p[13] & ((1u << 4) - 1)) << 8;
  r->opd1_c = (p[13] >> 4) & ((1u << 12) - 1);
  r->opd1_h = (p[13] >> 16) & ((1u << 12) - 1);
  r->opd1_w = (p[13] >> 28) & ((1u << 4) - 1);
  r->opd1_w |= (uint64_t)(p[14] & ((1u << 8) - 1)) << 4;
  r->opd2_n = (p[14] >> 8) & ((1u << 12) - 1);
  r->opd2_c = (p[14] >> 20) & ((1u << 12) - 1);
  r->opd2_h = p[15] & ((1u << 12) - 1);
  r->opd2_w = (p[15] >> 12) & ((1u << 12) - 1);
  r->dummy4 = (p[15] >> 24) & ((1u << 4) - 1);
  r->rsvd3 = (p[15] >> 28) & ((1u << 4) - 1);
  /* words 16-27: layer id and per-dimension strides, 16 bits each */
  r->layer_info = p[16] & ((1u << 16) - 1);
  r->res0_n_str = (p[16] >> 16) & ((1u << 16) - 1);
  r->res0_c_str = p[17] & ((1u << 16) - 1);
  r->res0_h_str = (p[17] >> 16) & ((1u << 16) - 1);
  r->res0_w_str = p[18] & ((1u << 16) - 1);
  r->res0_b_str = (p[18] >> 16) & ((1u << 16) - 1);
  r->opd0_n_str = p[19] & ((1u << 16) - 1);
  r->dummy5 = (p[19] >> 16) & ((1u << 12) - 1);
  r->rsvd4 = (p[19] >> 28) & ((1u << 4) - 1);
  r->opd0_c_str = p[20] & ((1u << 16) - 1);
  r->opd0_h_str = (p[20] >> 16) & ((1u << 16) - 1);
  r->opd0_w_str = p[21] & ((1u << 16) - 1);
  r->opd0_b_str = (p[21] >> 16) & ((1u << 16) - 1);
  r->opd1_n_str = p[22] & ((1u << 16) - 1);
  r->opd1_c_str = (p[22] >> 16) & ((1u << 16) - 1);
  r->opd1_h_str = p[23] & ((1u << 16) - 1);
  r->dummy6 = (p[23] >> 16) & ((1u << 12) - 1);
  r->rsvd5 = (p[23] >> 28) & ((1u << 4) - 1);
  r->opd1_w_str = p[24] & ((1u << 16) - 1);
  r->opd1_b_str = (p[24] >> 16) & ((1u << 16) - 1);
  r->opd2_n_str = p[25] & ((1u << 16) - 1);
  r->opd2_c_str = (p[25] >> 16) & ((1u << 16) - 1);
  r->opd2_h_str = p[26] & ((1u << 16) - 1);
  r->opd2_w_str = (p[26] >> 16) & ((1u << 16) - 1);
  r->opd2_b_str = p[27] & ((1u << 16) - 1);
  r->dummy7 = (p[27] >> 16) & ((1u << 12) - 1);
  r->rsvd6 = (p[27] >> 28) & ((1u << 4) - 1);
}
|
||||
|
||||
/* Pack the expanded tiu_reg_t back into the 28-word hardware command
 * layout.  Exact inverse of parse_tiu_reg(); every field is masked to
 * its bit width and OR-ed into place.  Words are written high-index
 * first through a volatile pointer (the destination may be a
 * memory-mapped command buffer; volatile keeps the stores from being
 * reordered or elided — NOTE(review): typeof(p) is a GNU extension). */
static inline void emit_tiu_reg(const tiu_reg_t *r, uint32_t *_p)
{
  volatile uint32_t *p = (typeof(p))_p;
  p[27] = (r->opd2_b_str & ((1u << 16) - 1)) |
          ((r->dummy7 & ((1u << 12) - 1)) << 16) |
          ((r->rsvd6 & ((1u << 4) - 1)) << 28);
  p[26] = (r->opd2_h_str & ((1u << 16) - 1)) |
          ((r->opd2_w_str & ((1u << 16) - 1)) << 16);
  p[25] = (r->opd2_n_str & ((1u << 16) - 1)) |
          ((r->opd2_c_str & ((1u << 16) - 1)) << 16);
  p[24] = (r->opd1_w_str & ((1u << 16) - 1)) |
          ((r->opd1_b_str & ((1u << 16) - 1)) << 16);
  p[23] = (r->opd1_h_str & ((1u << 16) - 1)) |
          ((r->dummy6 & ((1u << 12) - 1)) << 16) |
          ((r->rsvd5 & ((1u << 4) - 1)) << 28);
  p[22] = (r->opd1_n_str & ((1u << 16) - 1)) |
          ((r->opd1_c_str & ((1u << 16) - 1)) << 16);
  p[21] = (r->opd0_w_str & ((1u << 16) - 1)) |
          ((r->opd0_b_str & ((1u << 16) - 1)) << 16);
  p[20] = (r->opd0_c_str & ((1u << 16) - 1)) |
          ((r->opd0_h_str & ((1u << 16) - 1)) << 16);
  p[19] = (r->opd0_n_str & ((1u << 16) - 1)) |
          ((r->dummy5 & ((1u << 12) - 1)) << 16) |
          ((r->rsvd4 & ((1u << 4) - 1)) << 28);
  p[18] = (r->res0_w_str & ((1u << 16) - 1)) |
          ((r->res0_b_str & ((1u << 16) - 1)) << 16);
  p[17] = (r->res0_c_str & ((1u << 16) - 1)) |
          ((r->res0_h_str & ((1u << 16) - 1)) << 16);
  p[16] = (r->layer_info & ((1u << 16) - 1)) |
          ((r->res0_n_str & ((1u << 16) - 1)) << 16);
  p[15] = (r->opd2_h & ((1u << 12) - 1)) |
          ((r->opd2_w & ((1u << 12) - 1)) << 12) |
          ((r->dummy4 & ((1u << 4) - 1)) << 24) |
          ((r->rsvd3 & ((1u << 4) - 1)) << 28);
  /* opd1_w straddles words 13/14: low 4 bits in p[13], high 8 in p[14] */
  p[14] = ((r->opd1_w >> 4) & ((1u << 8) - 1)) |
          ((r->opd2_n & ((1u << 12) - 1)) << 8) |
          ((r->opd2_c & ((1u << 12) - 1)) << 20);
  /* opd1_n straddles words 12/13: low 8 bits in p[12], high 4 in p[13] */
  p[13] = ((r->opd1_n >> 8) & ((1u << 4) - 1)) |
          ((r->opd1_c & ((1u << 12) - 1)) << 4) |
          ((r->opd1_h & ((1u << 12) - 1)) << 16) |
          ((r->opd1_w & ((1u << 4) - 1)) << 28);
  p[12] = (r->opd0_h & ((1u << 12) - 1)) |
          ((r->opd0_w & ((1u << 12) - 1)) << 12) |
          ((r->opd1_n & ((1u << 8) - 1)) << 24);
  p[11] = (r->opd0_n & ((1u << 12) - 1)) |
          ((r->opd0_c & ((1u << 12) - 1)) << 12) |
          ((r->dummy3 & ((1u << 4) - 1)) << 24) |
          ((r->rsvd2 & ((1u << 4) - 1)) << 28);
  p[10] = (r->opd2_addr & ((1u << 16) - 1)) |
          ((r->opt_opd0_const & 1) << 16) |
          ((r->opt_opd1_const & 1) << 17) |
          ((r->opt_opd2_const & 1) << 18) |
          ((r->short_nchwstr_same & 1) << 19) |
          ((r->short_res0_str & ((1u << 2) - 1)) << 20) |
          ((r->short_opd0_str & ((1u << 2) - 1)) << 22) |
          ((r->short_opd1_str & ((1u << 2) - 1)) << 24) |
          ((r->short_opd2_str & ((1u << 2) - 1)) << 26) |
          ((r->dummy2 & ((1u << 4) - 1)) << 28);
  /* opd0_addr straddles words 8/9: low 8 bits in p[8], high 16 in p[9] */
  p[9] = ((r->opd0_addr >> 8) & ((1u << 16) - 1)) |
         ((r->opd1_addr & ((1u << 16) - 1)) << 16);
  p[8] = (r->res0_addr & ((1u << 24) - 1)) |
         ((r->opd0_addr & ((1u << 8) - 1)) << 24);
  /* res0_h straddles words 6/7: low 8 bits in p[6], high 4 in p[7] */
  p[7] = ((r->res0_h >> 8) & ((1u << 4) - 1)) |
         ((r->res0_w & ((1u << 12) - 1)) << 4) |
         ((r->conv_op_x_str & ((1u << 5) - 1)) << 16) |
         ((r->conv_op_y_str & ((1u << 5) - 1)) << 21) |
         ((r->cmd_pre_exe & ((1u << 2) - 1)) << 26) |
         ((r->rsvd1 & ((1u << 4) - 1)) << 28);
  p[6] = (r->res0_n & ((1u << 12) - 1)) |
         ((r->res0_c & ((1u << 12) - 1)) << 12) |
         ((r->res0_h & ((1u << 8) - 1)) << 24);
  p[5] = (r->opd0_ins_val & ((1u << 16) - 1)) |
         ((r->conv_opd0_up_pad & ((1u << 4) - 1)) << 16) |
         ((r->conv_opd0_dn_pad & ((1u << 4) - 1)) << 20) |
         ((r->conv_opd0_lf_pad & ((1u << 4) - 1)) << 24) |
         ((r->conv_opd0_rt_pad & ((1u << 4) - 1)) << 28);
  p[4] = (r->conv_opd0_x_ins0 & ((1u << 4) - 1)) |
         ((r->conv_opd0_y_ins0 & ((1u << 4) - 1)) << 4) |
         ((r->conv_opd0_x_ins0_last & ((1u << 4) - 1)) << 8) |
         ((r->conv_opd0_y_ins0_last & ((1u << 4) - 1)) << 12) |
         ((r->conv_opd1_x_ins0 & ((1u << 4) - 1)) << 16) |
         ((r->conv_opd1_y_ins0 & ((1u << 4) - 1)) << 20) |
         ((r->dummy0 & ((1u << 8) - 1)) << 24);
  p[3] = (r->opt_res0_sign & 1) |
         ((r->opt_opd0_sign & 1) << 1) |
         ((r->opt_opd1_sign & 1) << 2) |
         ((r->opt_opd2_sign & 1) << 3) |
         ((r->opt_res0_seg & ((1u << 2) - 1)) << 4) |
         ((r->opt_opd0_seg & ((1u << 2) - 1)) << 6) |
         ((r->opt_opd1_seg & ((1u << 2) - 1)) << 8) |
         ((r->opt_opd2_seg & 1) << 10) |
         ((r->ps32_md & ((1u << 2) - 1)) << 11) |
         ((r->double_conv & 1) << 13) |
         ((r->opt_left_tran & 1) << 14) |
         ((r->fp_round_typ & 1) << 15) |
         ((r->opt_relu_typ & ((1u << 2) - 1)) << 16) |
         ((r->opt_relu_value & ((1u << 8) - 1)) << 18) |
         ((r->cmd_pre_exe_typ & 1) << 26) |
         ((r->opt_res_add & 1) << 27) |
         ((r->rsvd0 & ((1u << 4) - 1)) << 28);
  p[2] = (r->quan_m & (((uint64_t)1 << 32) - 1));
  p[1] = (r->cmd_id_tpu & ((1u << 16) - 1)) |
         ((r->cmd_id_gdma & ((1u << 16) - 1)) << 16);
  p[0] = (r->cmd_en & 1) |
         ((r->cmd_end & 1) << 1) |
         ((r->cmd_id_en & 1) << 2) |
         ((r->cmd_keep & 1) << 3) |
         ((r->cmd_intr_en & 1) << 4) |
         ((r->tsk_typ & ((1u << 4) - 1)) << 5) |
         ((r->tsk_eu_typ & ((1u << 5) - 1)) << 9) |
         ((r->tsk_opd_num & ((1u << 2) - 1)) << 14) |
         ((r->opt_res_shift & ((1u << 6) - 1)) << 16) |
         ((r->opt_left_shift & ((1u << 5) - 1)) << 22) |
         ((r->opt_shift_typ & 1) << 27) |
         ((r->opt_rshift_typ & 1) << 28) |
         ((r->dummy1 & 1) << 29) |
         ((r->opd_typ & 1) << 30) |
         ((r->opt_chl_quan & 1) << 31);
}
|
||||
|
||||
/* Load the hardware default values into a tiu_reg_t.  Most fields reset
 * to 0; shape fields default to a 1x1x1x16 tensor and the rsvdN fields
 * carry distinct non-zero patterns (0x1..0x6) — presumably a debug
 * signature; TODO confirm against the BM1822 register manual. */
static inline void reset_tiu_reg(tiu_reg_t *r)
{
  r->cmd_en = 0x0;
  r->cmd_end = 0x0;
  r->cmd_id_en = 0x0;
  r->cmd_keep = 0x0;
  r->cmd_intr_en = 0x0;
  r->tsk_typ = 0x0;
  r->tsk_eu_typ = 0x0;
  r->tsk_opd_num = 0x3;
  r->opt_res_shift = 0xa;
  r->opt_left_shift = 0x2;
  r->opt_shift_typ = 0x1;
  r->opt_rshift_typ = 0x1;
  r->dummy1 = 0x0;
  r->opd_typ = 0x0;
  r->opt_chl_quan = 0x0;
  r->cmd_id_tpu = 0x0;
  r->cmd_id_gdma = 0x0;
  r->quan_m = 0x0;
  r->opt_res0_sign = 0x0;
  r->opt_opd0_sign = 0x0;
  r->opt_opd1_sign = 0x1;
  r->opt_opd2_sign = 0x1;
  r->opt_res0_seg = 0x1;
  r->opt_opd0_seg = 0x1;
  r->opt_opd1_seg = 0x1;
  r->opt_opd2_seg = 0x0;
  r->ps32_md = 0x0;
  r->double_conv = 0x0;
  r->opt_left_tran = 0x0;
  r->fp_round_typ = 0x0;
  r->opt_relu_typ = 0x0;
  r->opt_relu_value = 0x0;
  r->cmd_pre_exe_typ = 0x0;
  r->opt_res_add = 0x0;
  r->rsvd0 = 0x0;
  r->conv_opd0_x_ins0 = 0x0;
  r->conv_opd0_y_ins0 = 0x0;
  r->conv_opd0_x_ins0_last = 0x0;
  r->conv_opd0_y_ins0_last = 0x0;
  r->conv_opd1_x_ins0 = 0x0;
  r->conv_opd1_y_ins0 = 0x0;
  r->dummy0 = 0x0;
  r->opd0_ins_val = 0x0;
  r->conv_opd0_up_pad = 0x0;
  r->conv_opd0_dn_pad = 0x0;
  r->conv_opd0_lf_pad = 0x0;
  r->conv_opd0_rt_pad = 0x0;
  /* default result shape: n=1, c=1, h=1, w=16 */
  r->res0_n = 0x1;
  r->res0_c = 0x1;
  r->res0_h = 0x1;
  r->res0_w = 0x10;
  r->conv_op_x_str = 0x1;
  r->conv_op_y_str = 0x1;
  r->cmd_pre_exe = 0x0;
  r->rsvd1 = 0x1;
  r->res0_addr = 0x0;
  r->opd0_addr = 0x0;
  r->opd1_addr = 0x0;
  r->opd2_addr = 0x0;
  r->opt_opd0_const = 0x0;
  r->opt_opd1_const = 0x0;
  r->opt_opd2_const = 0x0;
  r->short_nchwstr_same = 0x0;
  r->short_res0_str = 0x0;
  r->short_opd0_str = 0x0;
  r->short_opd1_str = 0x0;
  r->short_opd2_str = 0x0;
  r->dummy2 = 0x0;
  /* default operand shapes: n=1, c=1, h=1, w=16 for opd0/1/2 */
  r->opd0_n = 0x1;
  r->opd0_c = 0x1;
  r->dummy3 = 0x0;
  r->rsvd2 = 0x2;
  r->opd0_h = 0x1;
  r->opd0_w = 0x10;
  r->opd1_n = 0x1;
  r->opd1_c = 0x1;
  r->opd1_h = 0x1;
  r->opd1_w = 0x10;
  r->opd2_n = 0x1;
  r->opd2_c = 0x1;
  r->opd2_h = 0x1;
  r->opd2_w = 0x10;
  r->dummy4 = 0x0;
  r->rsvd3 = 0x3;
  r->layer_info = 0x0;
  r->res0_n_str = 0x10;
  r->res0_c_str = 0x10;
  r->res0_h_str = 0x0;
  r->res0_w_str = 0x1;
  r->res0_b_str = 0x10;
  r->opd0_n_str = 0x10;
  r->dummy5 = 0x0;
  r->rsvd4 = 0x4;
  r->opd0_c_str = 0x10;
  r->opd0_h_str = 0x0;
  r->opd0_w_str = 0x1;
  r->opd0_b_str = 0x10;
  r->opd1_n_str = 0x10;
  r->opd1_c_str = 0x10;
  r->opd1_h_str = 0x0;
  r->dummy6 = 0x0;
  r->rsvd5 = 0x5;
  r->opd1_w_str = 0x1;
  r->opd1_b_str = 0x10;
  r->opd2_n_str = 0x10;
  r->opd2_c_str = 0x10;
  r->opd2_h_str = 0x0;
  r->opd2_w_str = 0x1;
  r->opd2_b_str = 0x10;
  r->dummy7 = 0x0;
  r->rsvd6 = 0x6;
}
|
||||
|
||||
/* Dump every tiu_reg_t field to stdout as hex, prefixed by `tag`, in
 * the same order as the register layout.  Debug aid only. */
static inline void trace_tiu_reg(tiu_reg_t *r, const char *tag)
{
/* Print one field as "  name: 0xVALUE"; `ullong` is a project typedef. */
#define trace_one_reg(name) \
  printf(" %s: 0x%llx\n", #name, (ullong)r->name)

  printf("--- %s ---\n", tag);
  trace_one_reg(cmd_en);
  trace_one_reg(cmd_end);
  trace_one_reg(cmd_id_en);
  trace_one_reg(cmd_keep);
  trace_one_reg(cmd_intr_en);
  trace_one_reg(tsk_typ);
  trace_one_reg(tsk_eu_typ);
  trace_one_reg(tsk_opd_num);
  trace_one_reg(opt_res_shift);
  trace_one_reg(opt_left_shift);
  trace_one_reg(opt_shift_typ);
  trace_one_reg(opt_rshift_typ);
  trace_one_reg(dummy1);
  trace_one_reg(opd_typ);
  trace_one_reg(opt_chl_quan);
  trace_one_reg(cmd_id_tpu);
  trace_one_reg(cmd_id_gdma);
  trace_one_reg(quan_m);
  trace_one_reg(opt_res0_sign);
  trace_one_reg(opt_opd0_sign);
  trace_one_reg(opt_opd1_sign);
  trace_one_reg(opt_opd2_sign);
  trace_one_reg(opt_res0_seg);
  trace_one_reg(opt_opd0_seg);
  trace_one_reg(opt_opd1_seg);
  trace_one_reg(opt_opd2_seg);
  trace_one_reg(ps32_md);
  trace_one_reg(double_conv);
  trace_one_reg(opt_left_tran);
  trace_one_reg(fp_round_typ);
  trace_one_reg(opt_relu_typ);
  trace_one_reg(opt_relu_value);
  trace_one_reg(cmd_pre_exe_typ);
  trace_one_reg(opt_res_add);
  trace_one_reg(rsvd0);
  trace_one_reg(conv_opd0_x_ins0);
  trace_one_reg(conv_opd0_y_ins0);
  trace_one_reg(conv_opd0_x_ins0_last);
  trace_one_reg(conv_opd0_y_ins0_last);
  trace_one_reg(conv_opd1_x_ins0);
  trace_one_reg(conv_opd1_y_ins0);
  trace_one_reg(dummy0);
  trace_one_reg(opd0_ins_val);
  trace_one_reg(conv_opd0_up_pad);
  trace_one_reg(conv_opd0_dn_pad);
  trace_one_reg(conv_opd0_lf_pad);
  trace_one_reg(conv_opd0_rt_pad);
  trace_one_reg(res0_n);
  trace_one_reg(res0_c);
  trace_one_reg(res0_h);
  trace_one_reg(res0_w);
  trace_one_reg(conv_op_x_str);
  trace_one_reg(conv_op_y_str);
  trace_one_reg(cmd_pre_exe);
  trace_one_reg(rsvd1);
  trace_one_reg(res0_addr);
  trace_one_reg(opd0_addr);
  trace_one_reg(opd1_addr);
  trace_one_reg(opd2_addr);
  trace_one_reg(opt_opd0_const);
  trace_one_reg(opt_opd1_const);
  trace_one_reg(opt_opd2_const);
  trace_one_reg(short_nchwstr_same);
  trace_one_reg(short_res0_str);
  trace_one_reg(short_opd0_str);
  trace_one_reg(short_opd1_str);
  trace_one_reg(short_opd2_str);
  trace_one_reg(dummy2);
  trace_one_reg(opd0_n);
  trace_one_reg(opd0_c);
  trace_one_reg(dummy3);
  trace_one_reg(rsvd2);
  trace_one_reg(opd0_h);
  trace_one_reg(opd0_w);
  trace_one_reg(opd1_n);
  trace_one_reg(opd1_c);
  trace_one_reg(opd1_h);
  trace_one_reg(opd1_w);
  trace_one_reg(opd2_n);
  trace_one_reg(opd2_c);
  trace_one_reg(opd2_h);
  trace_one_reg(opd2_w);
  trace_one_reg(dummy4);
  trace_one_reg(rsvd3);
  trace_one_reg(layer_info);
  trace_one_reg(res0_n_str);
  trace_one_reg(res0_c_str);
  trace_one_reg(res0_h_str);
  trace_one_reg(res0_w_str);
  trace_one_reg(res0_b_str);
  trace_one_reg(opd0_n_str);
  trace_one_reg(dummy5);
  trace_one_reg(rsvd4);
  trace_one_reg(opd0_c_str);
  trace_one_reg(opd0_h_str);
  trace_one_reg(opd0_w_str);
  trace_one_reg(opd0_b_str);
  trace_one_reg(opd1_n_str);
  trace_one_reg(opd1_c_str);
  trace_one_reg(opd1_h_str);
  trace_one_reg(dummy6);
  trace_one_reg(rsvd5);
  trace_one_reg(opd1_w_str);
  trace_one_reg(opd1_b_str);
  trace_one_reg(opd2_n_str);
  trace_one_reg(opd2_c_str);
  trace_one_reg(opd2_h_str);
  trace_one_reg(opd2_w_str);
  trace_one_reg(opd2_b_str);
  trace_one_reg(dummy7);
  trace_one_reg(rsvd6);
}
|
||||
#endif /* BM1822_TIU_REG_H */
|
||||
38
cvikernel/include/bmkernel/bm1822/bm1822_tpu_cfg.h
Normal file
38
cvikernel/include/bmkernel/bm1822/bm1822_tpu_cfg.h
Normal file
@ -0,0 +1,38 @@
|
||||
#ifndef __BM1822_TPU_CFG__
#define __BM1822_TPU_CFG__

/* BM1822 TPU hardware geometry.  Derived counts are expressed via the
 * *_SHIFT macros so the power-of-two relationships hold by construction. */
#define BM1822_VER 1822
#define BM1822_HW_NPU_SHIFT 3
#define BM1822_HW_EU_SHIFT 4
#define BM1822_HW_LMEM_SHIFT 15
#define BM1822_HW_LMEM_BANKS 8
#define BM1822_HW_LMEM_BANK_SIZE 0x1000
#define BM1822_HW_NODE_CHIP_SHIFT 0
#define BM1822_HW_NPU_NUM (1 << BM1822_HW_NPU_SHIFT)
#define BM1822_HW_EU_NUM (1 << BM1822_HW_EU_SHIFT)
#define BM1822_HW_LMEM_SIZE (1 << BM1822_HW_LMEM_SHIFT)
#define BM1822_HW_LMEM_START_ADDR 0x0C000000
#define BM1822_HW_NODE_CHIP_NUM (1 << BM1822_HW_NODE_CHIP_SHIFT)

/* Consistency check: total local memory must equal banks x bank size.
 * (Fixed typo "configuraiton" in the diagnostic message.) */
#if (BM1822_HW_LMEM_SIZE != (BM1822_HW_LMEM_BANK_SIZE * BM1822_HW_LMEM_BANKS))
#error "Set wrong TPU configuration."
#endif

/* Global (DRAM) memory window. */
#define BM1822_GLOBAL_MEM_START_ADDR 0x0
#define BM1822_GLOBAL_MEM_SIZE 0x100000000

/* Fixed command-buffer carve-outs inside global memory. */
#define BM1822_GLOBAL_TIU_CMDBUF_ADDR 0x00000000
#define BM1822_GLOBAL_TDMA_CMDBUF_ADDR 0x10000000
#define BM1822_GLOBAL_TIU_CMDBUF_RESERVED_SIZE 0x10000000
#define BM1822_GLOBAL_TDMA_CMDBUF_RESERVED_SIZE 0x10000000
#define BM1822_GLOBAL_POOL_RESERVED_SIZE (BM1822_GLOBAL_MEM_SIZE - BM1822_GLOBAL_TIU_CMDBUF_RESERVED_SIZE - BM1822_GLOBAL_TDMA_CMDBUF_RESERVED_SIZE)

#define BM1822_UART_CTLR_BASE_ADDR 0x04140000

/* Engine MMIO windows. */
#define BM1822_TDMA_ENGINE_BASE_ADDR 0x0C100000
#define BM1822_TDMA_ENGINE_END_ADDR (BM1822_TDMA_ENGINE_BASE_ADDR + 0x1000)

#define BM1822_TIU_ENGINE_BASE_ADDR 0x0C101000 //"NPS Register" in memory map?
#define BM1822_TIU_ENGINE_END_ADDR (BM1822_TIU_ENGINE_BASE_ADDR + 0x1000)

#endif
|
||||
703
cvikernel/include/bmkernel/bm1822/bm_vlc_compress.h
Normal file
703
cvikernel/include/bmkernel/bm1822/bm_vlc_compress.h
Normal file
@ -0,0 +1,703 @@
|
||||
#ifndef __BM_VLC_COMPRESS_H__
|
||||
#define __BM_VLC_COMPRESS_H__
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
#define MAX_UNARY_FIELD_SIZE 47
|
||||
#define MAX_ORDER_K 5
|
||||
|
||||
/**
|
||||
* \data_type 0 means 8bit, 1 means 16bit
|
||||
*/
|
||||
/**
 * Worst-case output-buffer size for a VLC-compressed stream.
 * \data_type 0 means 8bit, 1 means 16bit
 * Input is rounded up to whole 16-byte (int8) / 32-byte (bf16) blocks;
 * the extra ceiling_func term presumably covers per-block header bits
 * plus a 16-byte tail — TODO confirm against the encoder layout.
 */
static inline size_t get_out_bs_buf_size(uint64_t in_size, uint8_t data_type) {
  size_t blk_num;
  if (data_type)
    blk_num = (in_size + 31) >> 5;   /* bf16: 16 symbols = 32 bytes */
  else
    blk_num = (in_size + 15) >> 4;   /* int8: 16 symbols = 16 bytes */
  size_t padded_payload = blk_num << (4 + data_type);
  return padded_payload + (ceiling_func(blk_num, 16) << 4) + 16;
}
|
||||
|
||||
/* Per-stream compression parameters; serialized into the 48-bit VLC
 * header by vlc_enc_header() / recovered by vlc_dec_header_ext(). */
typedef struct
{
  uint8_t signedness;     /* header bit[28]: int8 payload is signed */
  uint8_t is_bfloat16;    /* header bit[29]: payload is bf16, not int8 */
  uint8_t bias0;          /* header bits[39:32]: symbol-remapping bias */
  uint8_t bias1;          /* header bits[46:40]: symbol-remapping bias */
  uint8_t zero_guard_en;  /* header bit[47]: keep code 0 reserved for literal zero */
} CommandInfo;
/* Bit-granular cursor over a byte buffer; advanced by write_stream(),
 * parse_stream() and move_stream_ptr(). */
typedef struct
{
  uint8_t *stream; // stream buffer pointer
  int bit_pos; // current pointer (in bit)
  int buf_size; // in byte
} StreamBuffer;
|
||||
|
||||
static inline int8_t two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1);
|
||||
static inline int8_t inv_two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1);
|
||||
static inline uint8_t center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard);
|
||||
static inline uint8_t inv_center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard);
|
||||
|
||||
static inline void init_stream(StreamBuffer *bs, const uint8_t *buf, int buf_size, uint8_t read_only);
|
||||
|
||||
static inline void bm_vlc_est_weight_bias(const uint8_t *ibuf, size_t isz, uint8_t signedness, uint8_t isBfloat16, CommandInfo *cmd_info);
|
||||
static inline void bm_vlc_enc_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info);
|
||||
static inline void bm_vlc_dec_int8_ext(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *bs_size);
|
||||
static inline void bm_vlc_dec_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf);
|
||||
static inline void bm_vlc_enc_bf16(const uint16_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info);
|
||||
static inline void bm_vlc_dec_bf16_ext(const uint8_t *ibuf, size_t isz, uint16_t *obuf, size_t *bs_size);
|
||||
static inline void bm_vlc_dec_bf16(const uint8_t *ibuf, size_t isz, uint16_t *obuf);
|
||||
|
||||
/* Return bit `bit_idx` (0 = LSB) of buf[byte_idx] as 0 or 1. */
static inline uint8_t get_bit_val(uint8_t *buf, int byte_idx, int bit_idx)
{
  uint8_t byte = buf[byte_idx];
  return (uint8_t)((byte & (1u << bit_idx)) != 0);
}
|
||||
|
||||
/* Fold a two's-complement int8 (carried in a uint8_t) onto the
 * non-negative code used by the GR coder:
 *   0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ...
 * Inverse of unsign_to_sign(). */
static inline uint8_t sign_to_unsign(uint8_t val)
{
  uint8_t negative = (uint8_t)((val >> 7) & 0x1);
  int magnitude = abs((int8_t)val);
  return (uint8_t)((magnitude << 1) - negative);
}
|
||||
|
||||
/* Unfold a GR code back to a two's-complement int8:
 *   0 -> 0, 1 -> -1, 2 -> 1, 3 -> -2, 4 -> 2, ...
 * Inverse of sign_to_unsign(). */
static inline int8_t unsign_to_sign(uint8_t val)
{
  uint8_t was_negative = val & 0x1;            /* odd codes encode negatives */
  int magnitude = (((int)val) + 1) >> 1;
  return (uint8_t)(was_negative ? -magnitude : magnitude);
}
|
||||
|
||||
/* Split isz bf16 words into two byte planes: the 8-bit exponent, and a
 * byte packing the sign bit (bit 7) with the 7-bit mantissa.
 * Inverse of merge_bf16_data(). */
static inline void dispatch_bf16_data(const uint16_t *bf16_in, uint8_t *exp, uint8_t *frac, size_t isz)
{
  size_t i;
  for (i = 0; i < isz; i++)
  {
    uint16_t w = bf16_in[i];
    exp[i] = (uint8_t)((w >> 7) & 0xFF);                 /* exponent bits 14..7 */
    frac[i] = (uint8_t)(((w >> 15) << 7) | (w & 0x7F));  /* sign | mantissa */
  }
}
|
||||
|
||||
/* Reassemble isz bf16 words from the exponent / (sign|mantissa) planes
 * produced by dispatch_bf16_data().
 * Bug fix: the original memset cleared only sizeof(uint16_t) == 2 bytes
 * regardless of isz; clear the whole output array. */
static inline void merge_bf16_data(const uint8_t *exp_in, const uint8_t *frac_in, uint16_t *bf16_out, size_t isz)
{
  memset(bf16_out, 0, sizeof(uint16_t) * isz);
  for (size_t i = 0; i < isz; i++)
  {
    /* sign from frac bit 7, exponent in bits 14..7, mantissa in bits 6..0 */
    bf16_out[i] = ((frac_in[i] >> 7) << 15) | (exp_in[i] << 7) | (frac_in[i] & 0x7F);
  }
}
|
||||
|
||||
// -- streaming operation handler --
|
||||
/* Attach a StreamBuffer cursor to `buf` and rewind it to bit 0.
 * In writer mode (read_only == 0) the buffer is zero-filled first,
 * because write_stream() ORs bits in and never clears them. */
static inline void init_stream(StreamBuffer *bs, const uint8_t *buf, int buf_size, uint8_t read_only)
{
  bs->stream = (uint8_t *)buf;
  bs->buf_size = buf_size;
  bs->bit_pos = 0;
  if (!read_only)
  {
    memset((uint8_t *)buf, 0, sizeof(uint8_t) * buf_size);
  }
}
|
||||
|
||||
/* Append the low bit_len bits of `src` (LSB-first within each byte) at
 * the current stream cursor, then advance the cursor.  Bits are OR-ed
 * into the destination, so the buffer must have been zeroed by
 * init_stream() in writer mode. */
static inline void write_stream(StreamBuffer *bs, uint8_t *src, int bit_len)
{
  for (int bit = 0; bit < bit_len; bit++)
  {
    int src_byte_i = bit / 8;
    int src_bit_i = bit % 8;
    int dest_byte_i = (bs->bit_pos + bit) / 8;
    int dest_bit_i = (bs->bit_pos + bit) % 8;
    bs->stream[dest_byte_i] |= (get_bit_val(src, src_byte_i, src_bit_i) << dest_bit_i);
  }
  bs->bit_pos += bit_len;
}
|
||||
|
||||
/* Advance the stream cursor by bit_len bits without reading or writing
 * (used to skip reserved header fields). */
static inline void move_stream_ptr(StreamBuffer *bs, int bit_len)
{
  bs->bit_pos = bs->bit_pos + bit_len;
}
|
||||
|
||||
/* Read bit_len bits from the stream cursor into `dest` (LSB-first) and
 * advance the cursor.  Only the ceil(bit_len/8) destination bytes that
 * receive bits are cleared first; wider destinations must be zeroed by
 * the caller. */
static inline void parse_stream(StreamBuffer *bs, uint8_t *dest, int bit_len)
{
  /* Fix: the original size expression `sizeof(uint8_t) * (bit_len + 7) >> 3`
   * only worked because `*` binds tighter than `>>` and sizeof(uint8_t)
   * is 1; spell the byte count out directly. */
  memset(dest, 0, (size_t)(bit_len + 7) / 8);
  for (int bit = 0; bit < bit_len; bit++)
  {
    int dest_byte_i = bit / 8;
    int dest_bit_i = bit % 8;
    int bs_byte_i = (bs->bit_pos + bit) / 8;
    int bs_bit_i = (bs->bit_pos + bit) % 8;
    dest[dest_byte_i] |= (get_bit_val(bs->stream, bs_byte_i, bs_bit_i) << dest_bit_i);
  }
  bs->bit_pos += bit_len;
}
|
||||
|
||||
// -- header read/write operation handler --
|
||||
/* Serialize the 48-bit per-stream VLC header: compressed size plus the
 * symbol-remapping parameters.  Field layout must stay in sync with
 * vlc_dec_header_ext().  NOTE(review): writing blk_bs_size through a
 * uint8_t* presumes a little-endian host — verify before porting. */
static inline void vlc_enc_header(StreamBuffer *bs_header, CommandInfo *cmd_info, size_t blk_bs_size)
{
  write_stream(bs_header, (uint8_t *)&blk_bs_size, 24); // bit[23:0] compressed block stream size
  move_stream_ptr(bs_header, 4); // bit[27:24] reserved
  write_stream(bs_header, (uint8_t *)&cmd_info->signedness, 1); // bit[28] signedness
  write_stream(bs_header, (uint8_t *)&cmd_info->is_bfloat16, 1); // bit[29] data type
  move_stream_ptr(bs_header, 2); // bit[31:30] bit depth
  write_stream(bs_header, (uint8_t *)&cmd_info->bias0, 8); // bit[39:32] bias0 for symbol remapping
  write_stream(bs_header, (uint8_t *)&cmd_info->bias1, 7); // bit[46:40] bias1 for symbol remapping
  write_stream(bs_header, (uint8_t *)&cmd_info->zero_guard_en, 1); // bit[47] zero guard
}
|
||||
|
||||
/* Deserialize the 48-bit VLC header written by vlc_enc_header(),
 * returning the compressed block-stream size through blk_bs_size.
 * NOTE(review): reading into a size_t through a uint8_t* presumes a
 * little-endian host — verify before porting. */
static inline void vlc_dec_header_ext(StreamBuffer *bs_header, CommandInfo *cmd_info, size_t *blk_bs_size)
{
  /* Fix: parse_stream clears only the 3 bytes it writes; a size_t is
   * wider, so zero it first or its upper bytes stay uninitialized. */
  *blk_bs_size = 0;
  parse_stream(bs_header, (uint8_t *)blk_bs_size, 24); // bit[23:0] compressed block stream size
  move_stream_ptr(bs_header, 4); // bit[27:24] reserved
  parse_stream(bs_header, (uint8_t *)&cmd_info->signedness, 1); // bit[28] signedness
  parse_stream(bs_header, (uint8_t *)&cmd_info->is_bfloat16, 1); // bit[29] data type
  move_stream_ptr(bs_header, 2); // bit[31:30] bit depth
  parse_stream(bs_header, (uint8_t *)&cmd_info->bias0, 8); // bit[39:32] bias0 for symbol remapping
  parse_stream(bs_header, (uint8_t *)&cmd_info->bias1, 7); // bit[46:40] bias1 for symbol remapping
  parse_stream(bs_header, (uint8_t *)&cmd_info->zero_guard_en, 1); // bit[47] zero guard
}
|
||||
|
||||
/* Decode the VLC header into cmd_info, discarding the block-stream
 * size.  Fix: initialize the local size_t — vlc_dec_header_ext parses
 * only 24 bits into it, leaving the upper bytes otherwise
 * indeterminate. */
static inline void vlc_dec_header(StreamBuffer *bs_header, CommandInfo *cmd_info)
{
  size_t blk_bs_size = 0;
  vlc_dec_header_ext(bs_header, cmd_info, &blk_bs_size);
  (void)blk_bs_size; /* size intentionally unused in this variant */
}
|
||||
|
||||
// -- symbol remmaping handler --
|
||||
/* Forward "center shift" remapping for the bf16 exponent plane:
 * re-indexes values by signed distance from `bias` (folded via
 * sign_to_unsign), presumably so frequent exponents get small codes —
 * confirm against bm_vlc_est_weight_bias.  When zero_guard is set, the
 * literal value 0 keeps code 0 and all other codes shift up by one.
 * NOTE(review): the bias > 128 branch relies on uint8_t wraparound in
 * `bias - range` and `range + bias - val` — verify together with
 * inv_center_shift before restructuring. */
static inline uint8_t center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard)
{
  if (val == 0 && zero_guard)
    return 0;

  int16_t shift_data_i = val - bias;                 /* signed distance from bias */
  uint8_t range = (bias <= 128) ? bias : 255 - bias; /* symmetric span around bias */
  if (bias <= 128)
  {
    /* values beyond the symmetric span keep their raw code */
    return (val >= (range << 1)) ? val : sign_to_unsign(shift_data_i) + zero_guard;
  }
  else
  {
    return (val < (bias - range)) ? (range + bias - val + zero_guard) : (sign_to_unsign(shift_data_i) + zero_guard);
  }
}
|
||||
|
||||
/* Inverse of center_shift(): maps a remapped exponent code back to its
 * original value.  Code 0 stays 0 when zero_guard is set.
 * NOTE(review): the bias > 128 branch mirrors center_shift's wraparound
 * arithmetic; round-trip equivalence was not independently verified
 * here — test center_shift/inv_center_shift as a pair when touching
 * either. */
static inline uint8_t inv_center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard)
{
  if (val == 0 && zero_guard)
    return 0;

  uint8_t unsign_data_i = val - zero_guard;          /* undo the zero-guard offset */
  uint8_t range = (bias <= 128) ? bias : 255 - bias; /* symmetric span around bias */
  if (bias <= 128)
  {
    /* values beyond the symmetric span were stored raw */
    return (val >= (range << 1)) ? val : unsign_to_sign(unsign_data_i) + bias;
  }
  else
  {
    return (unsign_data_i > (range << 1)) ? (range + bias - val + zero_guard) : unsign_to_sign(unsign_data_i) + bias;
  }
}
|
||||
|
||||
/* Remap a signed symbol by rotating its magnitude: subtract bias0
 * (positives) or bias1 (negatives) and wrap within [1, 127] / [1, 128]
 * respectively.  Zero always maps to zero.  Inverse is
 * inv_two_side_circular_shift(). */
static inline int8_t two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1)
{
  if (val == 0)
    return 0;

  uint8_t neg = (val < 0) ? 1 : 0;
  int32_t mag = abs(val);
  mag -= neg ? bias1 : bias0;
  if (mag <= 0)
    mag += 127 + neg;   /* wrap around; negatives have one extra code (-128) */
  return neg ? -mag : mag;
}
|
||||
|
||||
/* Undo two_side_circular_shift(): add the bias back to the magnitude
 * and unwrap modulo 127 (positives) / 128 (negatives).  Zero always
 * maps to zero. */
static inline int8_t inv_two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1)
{
  if (val == 0)
    return 0;

  uint8_t neg = (val < 0) ? 1 : 0;
  uint32_t mag = abs(val);
  mag += neg ? bias1 : bias0;
  int32_t wrapped = mag - (127 + neg);           /* unwrap if it overflowed the range */
  uint8_t low = ((wrapped <= 0) ? (int32_t)mag : wrapped) & 0xFF;
  return neg ? -low : low;
}
|
||||
|
||||
/* Forward symbol remapping for one 16-symbol block before GR coding:
 *  - unsigned int8 data:     copied through unchanged;
 *  - bf16 exponent plane:    center circular shift around bias0;
 *  - signed int8 data:       two-side circular shift, then sign fold.
 * Inverse is inv_symbol_remapping(). */
static inline void symbol_remapping(uint8_t *blk_in, uint8_t *blk_out, uint8_t bias0, uint8_t bias1, uint8_t signedness, uint8_t is_bf16_exp, uint8_t zero_guard)
{
  int i;

  if (!is_bf16_exp && !signedness)
  {
    /* remapping bypass */
    memcpy(blk_out, blk_in, sizeof(uint8_t) * 16);
    return;
  }

  if (is_bf16_exp)
  {
    for (i = 0; i < 16; i++)
      blk_out[i] = center_shift(blk_in[i], bias0, zero_guard);
  }
  else
  {
    for (i = 0; i < 16; i++)
    {
      int8_t shifted = two_side_circular_shift((int8_t)blk_in[i], bias0, bias1);
      blk_out[i] = sign_to_unsign(shifted);
    }
  }
}
|
||||
|
||||
/* Inverse symbol remapping for one decoded 16-symbol block; undoes
 * symbol_remapping() branch for branch (bypass / inverse center shift /
 * sign unfold + inverse two-side shift). */
static inline void inv_symbol_remapping(uint8_t *blk_in, uint8_t *blk_out, uint8_t bias0, uint8_t bias1, uint8_t signedness, uint8_t is_bf16_exp, uint8_t zero_guard)
{
  int i;

  if (!is_bf16_exp && !signedness)
  {
    /* remapping bypass */
    memcpy(blk_out, blk_in, sizeof(uint8_t) * 16);
    return;
  }

  if (is_bf16_exp)
  {
    for (i = 0; i < 16; i++)
      blk_out[i] = inv_center_shift(blk_in[i], bias0, zero_guard);
  }
  else
  {
    for (i = 0; i < 16; i++)
    {
      int8_t unfolded = unsign_to_sign(blk_in[i]);
      blk_out[i] = (uint8_t)inv_two_side_circular_shift(unfolded, bias0, bias1);
    }
  }
}
|
||||
|
||||
/* Choose the Golomb-Rice order k (0..MAX_ORDER_K) that minimizes the
 * encoded size of one 16-symbol block.  Returns -1 when even the best
 * k exceeds 128 bits, signalling the caller to store the block
 * uncompressed (see vlc_gr_enc_block_data). */
static inline int vlc_estimate_block_order(uint8_t *blk_in, uint8_t bf16_zvc_en)
{
  int best_k = 0;
  int best_bs_size = 0x7FFFFFFF;

  for (int k = 0; k <= (int)MAX_ORDER_K; k++)
  {
    uint8_t remain_field_size = k << 4;  /* k remainder bits per symbol x 16 symbols */
    int unary_field_len = 0;
    for (int i = 0; i < 16; i++)
    {
      uint8_t group_idx = blk_in[i] >> k;  /* quotient = unary prefix length - 1 */
      unary_field_len += (group_idx + 1);
    }
    /* 4-bit zero-count side channel when bf16 zero-value compression is on */
    int znum_bit = (bf16_zvc_en && k > 0) ? 4 : 0;
    /* disqualify k when the unary field would overflow its hardware limit */
    int blk_size = (unary_field_len <= MAX_UNARY_FIELD_SIZE)
                       ? remain_field_size + unary_field_len + znum_bit
                       : 255;
    if (blk_size < best_bs_size)
    {
      best_k = k;
      best_bs_size = blk_size;
    }
  }

  /* worse than the 128-bit raw block: tell the caller to skip compression */
  best_k = (best_bs_size > 128) ? -1 : best_k;
  return best_k;
}
|
||||
// -- vlc block parallel GR encode/decode --
|
||||
// Encode one 16-symbol block with block-parallel Golomb-Rice coding at
// order `order_k` (-1 selects the raw 128-bit fallback).
// Stream layout: [remain field: 16*k bits] [optional 4-bit zero count]
// [unary field]. Returns the 5-bit length code stored in the k-map,
// (unary_field_len - 16) & 0x1F, or 128 in uncompressed mode.
static inline uint8_t vlc_gr_enc_block_data(uint8_t *blk_in, StreamBuffer *bs, int order_k, uint8_t bf16_zvc_en)
{
    // uncompressed mode
    if (order_k == -1)
    {
        write_stream(bs, blk_in, 128);
        return 128;
    }

    // remain field
    uint8_t remain_field[16] = {0};
    uint8_t unary_field[8] = {0};
    uint8_t sym_end_pos[16] = {0};
    uint8_t unary_field_len = 0;
    int sym_end_pos_accum = -1;

    // bit plane encode for remain field: bit k of all 16 symbols is
    // transposed into two bytes (symbols 0-7 and symbols 8-15)
    for (int k = 0; k < order_k; k++)
    {
        uint8_t bit_plane0 = 0, bit_plane1 = 0;
        for (int i = 0; i < 8; i++)
        {
            bit_plane0 |= (get_bit_val(blk_in, i, k) << i);
            bit_plane1 |= (get_bit_val(blk_in, i + 8, k) << i);
        }
        remain_field[k << 1] = bit_plane0;
        remain_field[(k << 1) + 1] = bit_plane1;
    }
    write_stream(bs, remain_field, order_k << 4);

    // optional zero-count side info for bf16 zero-value compression
    if (bf16_zvc_en && order_k > 0)
    {
        int zero_num = 0;
        for (int i = 0; i < 16; i++)
        {
            if (blk_in[i] == 0)
                zero_num++;
        }
        assert(zero_num < 16); // an all-zero block must not reach this path
        write_stream(bs, (uint8_t *)&zero_num, 4);
    }

    // unary encode for unary field: each symbol's quotient is a run of
    // zeros terminated by a 1-bit at its cumulative end position
    for (int i = 0; i < 16; i++)
    {
        int group_idx = blk_in[i] >> order_k;
        sym_end_pos_accum += (group_idx + 1);
        sym_end_pos[i] = sym_end_pos_accum;
        int byte_idx = sym_end_pos[i] / 8;
        int bit_idx = sym_end_pos[i] % 8;
        unary_field[byte_idx] |= (1 << (bit_idx));
    }
    unary_field_len = sym_end_pos[15] + 1; // last terminator position + 1
    assert(unary_field_len <= MAX_UNARY_FIELD_SIZE);
    uint8_t ulen = (unary_field_len - 16) & 0x1F; // decoder adds the 16 back
    write_stream(bs, unary_field, unary_field_len);

    return ulen;
}
|
||||
|
||||
// Decode one 16-symbol Golomb-Rice block of `bs_size` bits encoded at
// order `order_k` (-1 = raw 128-bit block). Inverse of
// vlc_gr_enc_block_data; `rec` receives the 16 remapped symbols.
static inline void vlc_gr_dec_block_data(StreamBuffer *bs, uint8_t bs_size, uint8_t *rec, int order_k, uint8_t bf16_zvc_en)
{
    assert(bs_size <= 128);
    // uncompressed mode
    if (order_k == -1)
    {
        parse_stream(bs, rec, 128);
        return;
    }

    // remain field
    uint8_t remain_data[16] = {0};
    uint8_t remain_bs[16] = {0};
    uint8_t unary_field[8] = {0};
    uint8_t sym_end_pos[16] = {0};
    uint8_t unary_sym[16] = {0};
    uint8_t remain_field_size = order_k << 4;

    parse_stream(bs, remain_bs, remain_field_size);
    // undo the bit-plane transposition of the remain field
    for (int k = 0; k < order_k; k++)
    {
        for (int i = 0; i < 8; i++)
        {
            remain_data[i] |= (get_bit_val(remain_bs, k << 1, i) << k);
            remain_data[i + 8] |= (get_bit_val(remain_bs, (k << 1) + 1, i) << k);
        }
    }

    // zero number info: parsed only to advance the stream; the count
    // itself is consumed later by the bf16 fraction stage, not here
    int znum_bit = (bf16_zvc_en && order_k > 0) ? 4 : 0;
    uint8_t znum = 0;
    parse_stream(bs, &znum, znum_bit);

    // unary field: each 1-bit terminates one symbol's quotient run
    uint8_t unary_field_len = bs_size - remain_field_size - znum_bit;
    parse_stream(bs, unary_field, unary_field_len);

    int sym_cnt = 0;
    for (uint8_t ubit_i = 0; ubit_i < unary_field_len; ubit_i++)
    {
        int byte_idx = ubit_i / 8;
        int bit_idx = ubit_i % 8;
        if (get_bit_val(unary_field, byte_idx, bit_idx) == 1)
        {
            sym_end_pos[sym_cnt] = ubit_i;
            sym_cnt++;
            // BUGFIX: bound sym_cnt — a corrupt bitstream carrying more
            // than 16 terminator bits previously wrote past
            // sym_end_pos[16] (undefined behavior). A valid stream has
            // exactly 16 terminators with the last on the final bit, so
            // stopping at 16 cannot change a valid stream's result.
            if (sym_cnt == 16)
                break;
        }
    }
    // convert terminator positions back into quotients
    unary_sym[0] = sym_end_pos[0];
    for (int i = 1; i < 16; i++)
    {
        unary_sym[i] = sym_end_pos[i] - sym_end_pos[i - 1] - 1;
    }
    // recombine quotient and remainder
    for (int i = 0; i < 16; i++)
    {
        rec[i] = (unary_sym[i] << order_k) + remain_data[i];
    }
}
|
||||
|
||||
// -- vlc encode int8 entry function --
|
||||
// VLC-compress `isz` int8 bytes from ibuf into obuf; *osz receives the
// compressed size. Output layout: 16B header | k-map (1 byte per block,
// 16B aligned) | GR block data (16B aligned). obuf must hold the worst
// case: header + kmap + 16 bytes per block.
static inline void bm_vlc_enc_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info)
{
    StreamBuffer bs_header, bs_kmap, bs_data;
    size_t blk_num = (isz + 15) >> 4; // 16 input bytes per block
    size_t header_size = 16;
    size_t kmap_size = ceiling_func(blk_num, 16) << 4;
    size_t bs_buf_size = header_size + kmap_size + (blk_num << 4);
    uint8_t *bsbuf = (uint8_t *)calloc(bs_buf_size, sizeof(uint8_t));
    // BUGFIX: guard against allocation failure instead of dereferencing NULL
    if (bsbuf == NULL)
    {
        *osz = 0;
        return;
    }

    // block encode
    init_stream(&bs_kmap, bsbuf + header_size, kmap_size, false);
    init_stream(&bs_data, bsbuf + header_size + kmap_size, blk_num << 4, false);

    for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++)
    {
        uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0};
        // last block may be partial; the remainder stays zero-padded
        size_t in_size = (blk_idx == (blk_num - 1)) ? isz - (blk_idx << 4) : 16;
        memcpy(blk_data, &ibuf[blk_idx << 4], sizeof(uint8_t) * in_size);

        symbol_remapping(blk_data, blk_sr_data, cmd_info->bias0, cmd_info->bias1, cmd_info->signedness, false, false);

        int k = vlc_estimate_block_order(blk_sr_data, false);
        uint8_t ulen = vlc_gr_enc_block_data(blk_sr_data, &bs_data, k, false);
        uint8_t k_info = (k == -1) ? 0xE0 : (k << 5) + ulen; // 0xE0 marks a raw block
        write_stream(&bs_kmap, &k_info, 8);
    }

    int blk_bs_size = ceiling_func(((bs_data.bit_pos + 7) >> 3), 16) << 4; // 16 byte align
    *osz = header_size + kmap_size + blk_bs_size;

    // write header
    init_stream(&bs_header, bsbuf, header_size, false);
    vlc_enc_header(&bs_header, cmd_info, blk_bs_size);

    memcpy(obuf, bsbuf, (*osz) * sizeof(uint8_t));
    free(bsbuf);
}
|
||||
|
||||
// -- vlc decode int8 entry function --
|
||||
// VLC-decompress an int8 stream produced by bm_vlc_enc_int8.
// ibuf holds the compressed stream, isz is the ORIGINAL (decompressed)
// size in bytes, obuf receives the decoded bytes, and *bs_size receives
// the bitstream size reported by the header.
// NOTE(review): ibuf is const but is handed to init_stream — assumes
// init_stream takes/needs no mutable access; confirm its signature.
static inline void bm_vlc_dec_int8_ext(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *bs_size)
{
    StreamBuffer bs_header, bs_kmap, bs_data;
    CommandInfo cmd_info;
    memset(&cmd_info, 0, sizeof(CommandInfo));

    size_t blk_num = (isz + 15) >> 4; // 16 output bytes per block
    int header_size = 16;
    int kmap_size = ceiling_func(blk_num, 16) << 4;

    // parse header
    init_stream(&bs_header, ibuf, header_size, true);
    vlc_dec_header_ext(&bs_header, &cmd_info, bs_size);

    // Check whether valid header
    size_t bs_buf_size = get_out_bs_buf_size(isz, 0); // int8
    ASSERT(*bs_size <= bs_buf_size);
    ASSERT(cmd_info.is_bfloat16 == 0);

    // block decode
    init_stream(&bs_kmap, ibuf + header_size, kmap_size, true);
    init_stream(&bs_data, ibuf + header_size + kmap_size, blk_num << 4, true);

    for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++)
    {
        uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0};
        uint8_t k_info = 0;
        parse_stream(&bs_kmap, &k_info, 8);
        // k-map byte: top 3 bits = order k (7 means raw), low 5 = ulen
        uint8_t ulen = k_info & 0x1F;
        int k = (k_info >> 5 == 7) ? -1 : k_info >> 5;
        // encoded block bit count: remain (16k) + unary (ulen + 16)
        int blk_bs_size = (k == -1) ? 128 : (k << 4) + ulen + 16;
        vlc_gr_dec_block_data(&bs_data, blk_bs_size, blk_data, k, false);

        inv_symbol_remapping(blk_data, blk_sr_data, cmd_info.bias0, cmd_info.bias1, cmd_info.signedness, false, false);

        // last block may be partial
        int out_size = (blk_idx == (blk_num - 1)) ? isz - (blk_idx << 4) : 16;
        memcpy(&obuf[blk_idx << 4], blk_sr_data, sizeof(uint8_t) * out_size);
    }
}
|
||||
|
||||
/* Convenience wrapper around bm_vlc_dec_int8_ext that discards the
 * header-reported bitstream size. */
static inline void bm_vlc_dec_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf)
{
    size_t ignored_bs_size;
    bm_vlc_dec_int8_ext(ibuf, isz, obuf, &ignored_bs_size);
}
|
||||
|
||||
// -- vlc encode bfloat16 entry function --
|
||||
// VLC-compress `isz` bytes of bf16 data (isz/2 values). Each 32-byte
// block is split into 16 exponents (Golomb-Rice coded after center
// remapping) and 16 fraction bytes (stored raw, with zeros elided when
// zero_guard_en is set). Output layout matches bm_vlc_enc_int8.
static inline void bm_vlc_enc_bf16(const uint16_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info)
{
    StreamBuffer bs_header, bs_kmap, bs_data;
    size_t blk_num = (isz + 31) >> 5; // 32 bytes per block
    size_t header_size = 16;
    size_t kmap_size = ceiling_func(blk_num, 16) << 4;
    size_t bs_buf_size = header_size + kmap_size + (blk_num << 5);
    uint8_t *bsbuf = (uint8_t *)calloc(bs_buf_size, sizeof(uint8_t));
    // BUGFIX: guard against allocation failure instead of dereferencing
    // NULL (mirrors the same fix in bm_vlc_enc_int8)
    if (bsbuf == NULL)
    {
        *osz = 0;
        return;
    }

    // block encode
    init_stream(&bs_kmap, bsbuf + header_size, kmap_size, false);
    init_stream(&bs_data, bsbuf + header_size + kmap_size, blk_num << 5, false);

    for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++)
    {
        uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0}, blk_data_frac[16] = {0};
        // last block may hold fewer than 16 bf16 values
        size_t in_num = (blk_idx == (blk_num - 1)) ? ((isz >> 1) - (blk_idx << 4)) : 16;
        dispatch_bf16_data(&ibuf[blk_idx << 4], blk_data, blk_data_frac, in_num);

        // exp: BGR encode
        symbol_remapping(blk_data, blk_sr_data, cmd_info->bias0, cmd_info->bias1, false, true, cmd_info->zero_guard_en);

        int k = vlc_estimate_block_order(blk_sr_data, cmd_info->zero_guard_en);
        uint8_t ulen = vlc_gr_enc_block_data(blk_sr_data, &bs_data, k, cmd_info->zero_guard_en);
        uint8_t k_info = (k == -1) ? 0xE0 : (k << 5) + ulen; // 0xE0 marks a raw block
        write_stream(&bs_kmap, &k_info, 8);

        // frac: implicit zero compression — fractions of zero exponents
        // are skipped when zero guarding is active
        for (size_t i = 0; i < 16; i++)
        {
            if (!cmd_info->zero_guard_en || blk_data[i] != 0)
            {
                write_stream(&bs_data, &blk_data_frac[i], 8);
            }
        }
    }

    int blk_bs_size = ceiling_func(((bs_data.bit_pos + 7) >> 3), 16) << 4; // 16 byte align
    *osz = header_size + kmap_size + blk_bs_size;

    // write header
    init_stream(&bs_header, bsbuf, header_size, false);
    vlc_enc_header(&bs_header, cmd_info, blk_bs_size);

    memcpy(obuf, bsbuf, (*osz) * sizeof(uint8_t));
    free(bsbuf);
}
|
||||
|
||||
// -- vlc decode bfloat16 entry function --
|
||||
// VLC-decompress a bf16 stream produced by bm_vlc_enc_bf16.
// isz is the ORIGINAL (decompressed) size in bytes (isz/2 bf16 values);
// *bs_size receives the bitstream size reported by the header.
static inline void bm_vlc_dec_bf16_ext(const uint8_t *ibuf, size_t isz, uint16_t *obuf, size_t *bs_size)
{
    StreamBuffer bs_header, bs_kmap, bs_data;
    CommandInfo cmd_info;
    memset(&cmd_info, 0, sizeof(CommandInfo));

    size_t blk_num = (isz + 31) >> 5; // 32 bytes per block
    int header_size = 16;
    int kmap_size = ceiling_func(blk_num, 16) << 4;

    // parse header
    init_stream(&bs_header, ibuf, header_size, true);
    vlc_dec_header_ext(&bs_header, &cmd_info, bs_size);

    // Check whether valid header
    size_t bs_buf_size = get_out_bs_buf_size(isz, 1); // bf16
    ASSERT(*bs_size <= bs_buf_size);
    ASSERT(cmd_info.is_bfloat16 == 1);

    // block decode
    init_stream(&bs_kmap, ibuf + header_size, kmap_size, true);
    init_stream(&bs_data, ibuf + header_size + kmap_size, blk_num << 5, true);

    for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++)
    {
        uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0}, blk_data_frac[16] = {0};
        uint8_t k_info = 0;
        parse_stream(&bs_kmap, &k_info, 8);
        // k-map byte: top 3 bits = order k (7 means raw), low 5 = ulen
        uint8_t ulen = k_info & 0x1F;
        int k = (k_info >> 5 == 7) ? -1 : k_info >> 5;
        int znum_bit = (cmd_info.zero_guard_en && k > 0) ? 4 : 0;
        // encoded exponent block bits: remain (16k) + unary (ulen+16) + znum
        uint8_t blk_bs_size = (k == -1) ? 128 : (k << 4) + ulen + 16 + znum_bit;

        // exp: BGR decode
        vlc_gr_dec_block_data(&bs_data, blk_bs_size, blk_data, k, cmd_info.zero_guard_en);

        inv_symbol_remapping(blk_data, blk_sr_data, cmd_info.bias0, cmd_info.bias1, false, true, cmd_info.zero_guard_en);

        // last block may hold fewer than 16 values
        size_t out_num = (blk_idx == (blk_num - 1)) ? ((isz >> 1) - (blk_idx << 4)) : 16;

        // frac: implicit zero compression — a zero exponent has no
        // stored fraction byte; blk_data_frac stays 0 for those entries
        for (size_t i = 0; i < out_num; i++)
        {
            if (!cmd_info.zero_guard_en || blk_sr_data[i] != 0)
            {
                parse_stream(&bs_data, &blk_data_frac[i], 8);
            }
        }
        merge_bf16_data(blk_sr_data, blk_data_frac, &obuf[blk_idx << 4], out_num);
    }
}
|
||||
|
||||
/* Convenience wrapper around bm_vlc_dec_bf16_ext that discards the
 * header-reported bitstream size. */
static inline void bm_vlc_dec_bf16(const uint8_t *ibuf, size_t isz, uint16_t *obuf)
{
    size_t ignored_bs_size;
    bm_vlc_dec_bf16_ext(ibuf, isz, obuf, &ignored_bs_size);
}
|
||||
|
||||
// -- offline estimate model weight params --
|
||||
// Offline estimation of the symbol-remapping biases for VLC compression.
// Signed int8: bias0/bias1 = gap between zero and the first used positive /
// negative value in the histogram. bf16: bias0 = rounded mean of the
// non-zero exponents, zero_guard_en set when any exponent is zero.
static inline void bm_vlc_est_weight_bias(const uint8_t *ibuf, size_t isz, uint8_t signedness, uint8_t isBfloat16, CommandInfo *cmd_info)
{
    assert(!(isBfloat16 && signedness)); // WARNING: signedness MUST be 0 as isBfloat16==True

    cmd_info->is_bfloat16 = isBfloat16;
    if (isBfloat16 == false && signedness == true)
    {
        // two-side circular shift: histogram all byte values
        int hist[256] = {0};
        for (size_t i = 0; i < isz; i++)
        {
            hist[ibuf[i]]++;
        }

        // BUGFIX: the previous int8_t `while (true)` scans overflowed the
        // counter (undefined behavior) and could spin forever when one
        // side of the histogram was empty. Scan with a bounded int index,
        // restoring the bounds the commented-out code originally intended.
        int pos_v = 1; // first used positive symbol
        while (pos_v < 128 && hist[pos_v] == 0)
        {
            pos_v++;
        }
        cmd_info->bias0 = (pos_v > 1 && pos_v < 128) ? (pos_v - 1) : 0;

        int neg_v = -1; // first used negative symbol (hist index neg_v+256)
        while (neg_v >= -128 && hist[neg_v + 256] == 0)
        {
            neg_v--;
        }
        cmd_info->bias1 = (neg_v < -1 && neg_v >= -128) ? -(neg_v + 1) : 0;
        cmd_info->signedness = true;
    }

    if (isBfloat16 == true)
    {
        // center shift: average the non-zero bf16 exponents
        int64_t exp_accum = 0;
        uint16_t *bf16_in = (uint16_t *)ibuf;
        size_t inum = (isz >> 1), cnt = 0;
        for (size_t i = 0; i < inum; i++)
        {
            uint8_t exp = ((bf16_in[i] >> 7) & 0xFF); // exponent bits 14..7
            if (exp != 0)
            {
                exp_accum += exp;
                cnt++;
            }
        }
        if (cnt > 0)
        {
            cmd_info->bias0 = (uint8_t)((exp_accum / (float)cnt) + 0.5); // round to nearest
        }
        // zero guarding is only needed when zero exponents exist
        cmd_info->zero_guard_en = (inum == cnt) ? false : true;
        cmd_info->signedness = false;
    }
}
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __BM_VLC_COMPRESS_H__ */
|
||||
1176
cvikernel/include/bmkernel/bm1822/bmkernel_1822.h
Normal file
1176
cvikernel/include/bmkernel/bm1822/bmkernel_1822.h
Normal file
File diff suppressed because it is too large
Load Diff
369
cvikernel/include/bmkernel/bm1822/compression.h
Normal file
369
cvikernel/include/bmkernel/bm1822/compression.h
Normal file
@ -0,0 +1,369 @@
|
||||
#ifndef COMPRESSION_H
|
||||
#define COMPRESSION_H
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
// Describes one zero-map compression job: mode, input geometry, and the
// byte budget of each output section (header | non-zero bitmap | data).
typedef struct {
    uint32_t compress_md;       // mode selector 0..3 (see compression_bit_length)
    uint32_t bit_length;        // bits stored per non-zero value: 8/4/2/1
    int is_signed;              // non-zero: values are treated as int8_t

    uint64_t total_data_num;    // number of input elements
    uint32_t non_zero_data_num; // elements whose saturated value != 0

    uint64_t header_bytes;      // fixed 16-byte header region
    uint64_t map_bytes;         // non-zero bitmap, padded to 16 bytes
    uint64_t data_bytes;        // packed non-zero values (0 when bit_length==1)
    uint64_t total_bytes;       // header_bytes + map_bytes + data_bytes

    int compressed_min;         // saturation lower bound for this mode
    int compressed_max;         // saturation upper bound for this mode
} compression_info_t;
|
||||
|
||||
// Offsets and sizes of the three sections inside a compressed buffer,
// as reported back to the caller by compress().
typedef struct {
    uint64_t header_offset; // always 0
    uint64_t header_size;
    uint64_t map_offset;    // bitmap starts right after the header
    uint64_t map_size;
    uint64_t data_offset;   // packed values start after the bitmap
    uint64_t data_size;
    uint64_t total_size;    // full compressed buffer size in bytes
} compress_addr_info;
|
||||
|
||||
// Bytes reserved for the non-zero bitmap: one bit per element,
// rounded up to a 16-byte boundary.
static uint64_t compression_map_bytes(uint64_t total_data_num)
{
    uint64_t bit_alignment = 16 * 8;
    uint64_t bits = total_data_num;

    return ceiling_func(bits, bit_alignment)*16;
}
|
||||
|
||||
// Bytes of the bitmap that fill_map() zero-initialises before setting
// bits: one bit per element rounded up to a 2-byte boundary (the extra
// 16-byte padding from compression_map_bytes is left untouched).
static uint64_t compression_map_clear_bytes(uint64_t total_data_num)
{
    uint64_t bit_alignment = 2 * 8;
    uint64_t bits = total_data_num;

    return ceiling_func(bits, bit_alignment)*2;
}
|
||||
|
||||
|
||||
// Bytes needed to pack the non-zero values at bit_length bits each.
// The 1-bit mode stores everything in the bitmap, so it has no data section.
static uint64_t compression_data_bytes(uint64_t non_zero_data_num, uint32_t bit_length)
{
    if (bit_length == 1)
        return 0;

    uint64_t bit_alignment = 8;
    uint64_t bits = non_zero_data_num * bit_length;

    return ceiling_func(bits, bit_alignment);
}
|
||||
|
||||
// Map the 2-bit compression mode to its stored value width in bits:
// 0 -> 8, 1 -> 4, 2 -> 2, 3 -> 1. Any other mode is a programming error.
static inline uint32_t compression_bit_length(uint32_t compress_md)
{
    switch (compress_md) {
    case 0:
        return 8;
    case 1:
        return 4;
    case 2:
        return 2;
    case 3:
        return 1;
    default:
        assert(0);
        // BUGFIX: with NDEBUG the assert vanishes and control previously
        // fell off the end of a value-returning function (UB); return a
        // defined sentinel instead.
        return 0;
    }
}
|
||||
|
||||
// Representable range of a bit_length-wide value (1/2/4/8 bits),
// two's complement when is_signed, otherwise unsigned. Any other
// width is a programming error.
static inline void compute_compressed_range(
    uint32_t bit_length, int is_signed, int *min, int *max)
{
    if (is_signed) {
        switch (bit_length) {
        case 1:
        case 2:
        case 4:
        case 8:
            *min = -(1 << (bit_length - 1));
            *max = (1 << (bit_length - 1)) - 1;
            return;
        }
    } else {
        *min = 0;
        switch (bit_length) {
        case 1:
        case 2:
        case 4:
        case 8:
            *max = (1 << bit_length) - 1;
            return;
        }
    }
    assert(0);
}
|
||||
|
||||
// Clamp val into the inclusive range [min, max].
static inline int saturate(int val, int max, int min)
{
    if (val > max)
        return max;
    return (val < min) ? min : val;
}
|
||||
|
||||
// Count how many elements of buf remain non-zero after being read as
// signed/unsigned bytes and clamped into [min, max].
static inline uint64_t count_non_zero_results(
    uint8_t buf[], uint64_t size, int is_signed, int max, int min)
{
    uint64_t nz = 0;

    for (uint64_t idx = 0; idx < size; idx++) {
        int v = is_signed ? (int)(int8_t)buf[idx] : (int)buf[idx];
        /* clamp into [min, max] (inlined saturate) */
        if (v > max)
            v = max;
        else if (v < min)
            v = min;
        if (v != 0)
            nz++;
    }

    return nz;
}
|
||||
|
||||
// Set bit i in the LSB-first bitmap (marks element i as non-zero).
static inline void set_map_bit(uint8_t map[], uint64_t i)
{
    map[i >> 3] |= (uint8_t)(1u << (i & 7));
}
|
||||
|
||||
// Read bit i from the LSB-first bitmap (1 when element i is non-zero).
static inline uint8_t read_map_bit(uint8_t map[], uint64_t i)
{
    return (uint8_t)((map[i >> 3] >> (i & 7)) & 1u);
}
|
||||
|
||||
// Unpack the 32-bit compression header word:
// bit 29 = signedness flag, bits 25:24 = mode, bits 23:0 = non-zero count.
// (Inverse of fill_header; bit 28, set there, is ignored here.)
static inline void parse_header(
    uint32_t header, int *is_signed, uint32_t *compress_md, uint32_t *nz_num)
{
    *is_signed = (header >> 29) & 1;
    *compress_md = (header >> 24) & 0b11;
    *nz_num = header & 0xffffff;
}
|
||||
|
||||
// Pack the header word: bit 29 = signedness, bit 28 = always set,
// bits 25:24 = mode, bits 23:0 = non-zero count (omitted for the
// 1-bit mode, where the map alone carries all information).
static inline void fill_header(uint32_t *hdr, compression_info_t *info)
{
    uint32_t word = (info->is_signed << 29) | (1 << 28) |
                    (info->compress_md << 24);
    if (compression_bit_length(info->compress_md) != 1)
        word |= info->non_zero_data_num;
    *hdr = word;
}
|
||||
|
||||
// Build the non-zero bitmap: bit i is set when element i of buf is
// non-zero after saturation into [compressed_min, compressed_max].
// Only the "clear bytes" prefix of the map is zeroed first; the rest of
// the 16-byte-aligned region is left as-is (see compression_map_clear_bytes).
static inline void fill_map(uint8_t map[], uint8_t buf[], compression_info_t *info)
{
    int min = info->compressed_min;
    int max = info->compressed_max;

    uint64_t clear_map = compression_map_clear_bytes(info->total_data_num);
    for (uint64_t i = 0; i < clear_map; i++)
        map[i] = 0;

    for (uint64_t i = 0; i < info->total_data_num; i++) {
        int val = info->is_signed? (int8_t)buf[i]: buf[i];
        int res = saturate(val, max, min);
        if (res != 0)
            set_map_bit(map, i);
    }
}
|
||||
|
||||
// OR the i-th non-zero value into its bit_length-wide slot of the packed
// data array (LSB-first within each byte). Caller must pre-zero `data`.
static inline void compress_one_data(
    uint8_t data[], uint64_t i, uint8_t val, compression_info_t *info)
{
    uint32_t width = info->bit_length;
    uint32_t slots_per_byte = 8 / width;

    uint32_t byte_idx = i / slots_per_byte;
    uint32_t shift = (i % slots_per_byte) * width;
    uint8_t value_mask = (1 << width) - 1;

    data[byte_idx] |= (val & value_mask) << shift;
}
|
||||
|
||||
// Sign-extend a bit_len-wide two's-complement value to 8 bits using the
// shift-left / arithmetic-shift-right trick.
static inline uint8_t sign_extend(uint8_t val, uint32_t bit_len)
{
    int unused = 8 - bit_len;
    int8_t widened = (int8_t)(val << unused);
    return (uint8_t)(widened >> unused);
}
|
||||
|
||||
// Extract the i-th packed non-zero value from the data array (inverse of
// compress_one_data), sign-extending it when the stream is signed.
static inline uint8_t decompress_one_data(
    uint8_t data[], uint64_t i, compression_info_t *info)
{
    uint32_t width = info->bit_length;
    uint32_t slots_per_byte = 8 / width;

    uint32_t byte_idx = i / slots_per_byte;
    uint32_t shift = (i % slots_per_byte) * width;
    uint8_t value_mask = (1 << width) - 1;

    uint8_t val = (data[byte_idx] >> shift) & value_mask;
    return info->is_signed ? sign_extend(val, width) : val;
}
|
||||
|
||||
// Pack every saturated non-zero element of buf, in order, into the data
// section at bit_length bits each. The bitmap (fill_map) records which
// positions the packed values belong to.
static inline void fill_data(uint8_t data[], uint8_t buf[], compression_info_t *info)
{
    int min = info->compressed_min;
    int max = info->compressed_max;

    // compress_one_data ORs into place, so the section must start zeroed
    for (uint64_t i = 0; i < info->data_bytes; i++)
        data[i] = 0;

    uint64_t nz_i = 0;
    for (uint64_t i = 0; i < info->total_data_num; i++) {
        int val = info->is_signed? (int8_t)buf[i]: buf[i];
        int res = saturate(val, max, min);
        if (res != 0) {
            compress_one_data(data, nz_i, res, info);
            nz_i++;
        }
    }
}
|
||||
|
||||
// Analyse the input and compute the full compression layout (ranges,
// non-zero count, and per-section byte sizes) for the given mode.
static inline compression_info_t make_compression_info(
    uint8_t buf[], uint64_t size, uint32_t compress_md, int is_signed)
{
    uint32_t bit_length = compression_bit_length(compress_md);

    int min, max;
    compute_compressed_range(bit_length, is_signed, &min, &max);

    uint32_t nz_num = count_non_zero_results(buf, size, is_signed, max, min);
    assert(nz_num <= 0xffffff); // must fit the 24-bit header field

    compression_info_t info;
    info.compress_md = compress_md;
    info.bit_length = bit_length;
    info.is_signed = is_signed;
    info.total_data_num = size;
    info.non_zero_data_num = nz_num;
    info.header_bytes = 16;
    info.map_bytes = compression_map_bytes(size);
    info.data_bytes = compression_data_bytes(nz_num, bit_length);
    info.total_bytes = info.header_bytes + info.map_bytes + info.data_bytes;
    info.compressed_min = min;
    info.compressed_max = max;
    return info;
}
|
||||
|
||||
// Rebuild the compression layout from a compressed buffer's header word.
// total_data_num must be supplied by the caller (it is not stored in the
// stream); max_size bounds the buffer for validation.
static inline compression_info_t parse_compression_info(
    uint8_t compressed_buf[], uint64_t max_size, uint64_t total_data_num)
{
    uint64_t header_bytes = 16;
    assert(header_bytes <= max_size);

    int is_signed;
    uint32_t compress_md, nz_num;
    // NOTE(review): reads the header via a type-punned uint32_t* — assumes
    // compressed_buf is suitably aligned and host-endian matches the writer.
    parse_header(*(uint32_t *)compressed_buf, &is_signed, &compress_md, &nz_num);

    uint32_t bit_length = compression_bit_length(compress_md);
    int min, max;
    compute_compressed_range(bit_length, is_signed, &min, &max);

    compression_info_t info;
    info.compress_md = compress_md;
    info.bit_length = compression_bit_length(compress_md);
    info.is_signed = is_signed;
    info.total_data_num = total_data_num;
    info.non_zero_data_num = nz_num;
    info.header_bytes = header_bytes;
    info.map_bytes = compression_map_bytes(total_data_num);
    info.data_bytes = compression_data_bytes(nz_num, info.bit_length);
    info.total_bytes = header_bytes + info.map_bytes + info.data_bytes;
    info.compressed_min = min;
    info.compressed_max = max;

    assert(info.total_bytes <= max_size);

    return info;
}
|
||||
|
||||
// Compress `size` bytes of buf using the zero-map scheme and report the
// section layout through compressed_data. Returns a pointer to a
// function-local buffer allocated once with new[] (C++ only).
// NOTE(review): the static 1 MiB buffer makes this non-reentrant and not
// thread-safe, it is never freed, and bytes from a previous call can
// linger in padding regions that fill_map/fill_data do not overwrite.
// NOTE(review): header_size is reported as 4 while the layout reserves
// header_bytes == 16 — looks intentional (only 4 header bytes are
// meaningful) but confirm against consumers.
static inline uint8_t * compress(
    uint8_t buf[], uint64_t size, uint32_t compress_md, int is_signed, compress_addr_info *compressed_data)
{
    compression_info_t info =
        make_compression_info(buf, size, compress_md, is_signed);

    assert(info.total_bytes < 0x100000); // must fit the fixed work buffer
    static uint8_t *result = new uint8_t[0x100000];
    uint32_t *hdr = (uint32_t *)result;
    uint8_t *map = &result[info.header_bytes];
    uint8_t *data = &map[info.map_bytes];

    fill_header(hdr, &info);
    fill_map(map, buf, &info);
    if (info.bit_length != 1)
        fill_data(data, buf, &info); // 1-bit mode stores values in the map only

    compressed_data->header_offset = 0;
    compressed_data->header_size = 4;
    compressed_data->map_offset = info.header_bytes;
    compressed_data->map_size = compression_map_clear_bytes(info.total_data_num);
    compressed_data->data_offset = info.map_bytes + info.header_bytes;
    compressed_data->data_size = info.data_bytes;
    compressed_data->total_size = info.total_bytes;

    return result;
}
|
||||
|
||||
// Decompress a zero-map stream into buf (`size` = expected element count).
// 1-bit mode: the bitmap itself holds the values (sign-extended when
// signed). Other modes: the bitmap selects positions, packed data
// supplies the non-zero values, everything else becomes 0.
static inline void decompress(
    uint8_t buf[], uint64_t size, uint8_t compressed_buf[], uint64_t max_size)
{
    compression_info_t info =
        parse_compression_info(compressed_buf, max_size, size);
    assert(info.total_bytes <= max_size);
    assert(info.total_data_num == size);

    uint8_t *map = &compressed_buf[info.header_bytes];
    if (info.bit_length == 1) {
        for (uint64_t i = 0; i < size; i++) {
            uint8_t val = read_map_bit(map, i);
            buf[i] = info.is_signed? sign_extend(val, 1): val;
        }
    } else {
        uint8_t *data = &map[info.map_bytes];
        uint64_t data_i = 0; // index into the packed non-zero values
        for (uint64_t i = 0; i < size; i++) {
            uint8_t val = read_map_bit(map, i);
            if (val == 0) {
                buf[i] = 0;
            } else {
                buf[i] = decompress_one_data(data, data_i, &info);
                data_i++;
            }
        }
    }
}
|
||||
|
||||
#endif /* COMPRESSION_H */
|
||||
338
cvikernel/include/bmkernel/bm1880v2/1880v2_fp_convert.h
Executable file
338
cvikernel/include/bmkernel/bm1880v2/1880v2_fp_convert.h
Executable file
@ -0,0 +1,338 @@
|
||||
#ifndef ATOMIC_FP_H_
|
||||
#define ATOMIC_FP_H_
|
||||
|
||||
#if __arm__
|
||||
#define __DISABLE_FENV__
|
||||
#endif
|
||||
|
||||
#ifndef __DISABLE_FENV__
|
||||
#include <fenv.h>
|
||||
#endif
|
||||
#include <math.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
static inline uint8_t convert_bf16_u8(uint16_t data);
|
||||
static inline uint8_t _convert_bf16_u8(uint16_t data, int int8_rnd_md);
|
||||
static inline int8_t _convert_bf16_s8(uint16_t data, int int8_rnd_md);
|
||||
static inline int8_t convert_bf16_s8(uint16_t data);
|
||||
static inline uint16_t convert_int8_bf16(uint8_t data, uint8_t sign);
|
||||
static inline uint32_t convert_fp32_u32(float fp32);
|
||||
static inline uint32_t convert_fp32_hex(float val);
|
||||
static inline float convert_hex_fp32(uint32_t hval);
|
||||
|
||||
static inline float convert_bf16_fp32(uint16_t bf16);
|
||||
static inline uint16_t convert_fp32_bf16(float fp32);
|
||||
|
||||
static inline void f32_integer(void *if32, void *o_integer, int integer_size, int accumulate, int int8_signed, int int8_rnd_md);
|
||||
//static inline void f32_integer(void *if32, void *o_integer,
|
||||
// 0 for 32 bit , 1 for 16 bit , 2 for 8 bit
|
||||
// int integer_size, int accumulate = 0, int int8_signed = 1, int int8_rnd_md = 0);
|
||||
|
||||
// Bit-level reinterpretation helper between an fp32 value, its raw
// 32-bit pattern, and its two 16-bit halves. Code below uses bf16[1] as
// the high half-word, which assumes a little-endian host.
union convert_type_float {
    float fval;
    uint16_t bf16[2];
    uint32_t ival;
};
|
||||
|
||||
typedef union convert_type_float convert_int_float;
|
||||
static const uint16_t NAN_VALUE = 0x7FC0;
|
||||
|
||||
//static int round_mode = 0;
|
||||
static uint8_t float_isnan(const float x) {
|
||||
//return isnan(x);
|
||||
return x != x;
|
||||
}
|
||||
|
||||
static inline int set_store_feround()
|
||||
{
|
||||
#ifndef __DISABLE_FENV__
|
||||
int round_mode = fegetround();
|
||||
fesetround(FE_TOWARDZERO);
|
||||
return round_mode;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Restore the FP rounding mode previously returned by set_store_feround().
static inline void restore_feround(int round_mode)
{
#ifndef __DISABLE_FENV__
    fesetround(round_mode);
#else
    (void)round_mode;
#endif
}
|
||||
|
||||
// bf16 -> uint8_t with selectable integer rounding mode, routed through
// the fixed-point converter f32_integer (size code 2 = 8-bit, unsigned).
static inline uint8_t _convert_bf16_u8(uint16_t data, int int8_rnd_md)
{
    /* convert bf16 to float32 */
    float fp32;
    convert_int_float convert_val;
    fp32 = convert_bf16_fp32(data);
    /* convert float32 to uint8_t */
    f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 0, int8_rnd_md);
    return (uint8_t) convert_val.ival;
}
|
||||
|
||||
// bf16 -> uint8_t with the default rounding mode (0).
static inline uint8_t convert_bf16_u8(uint16_t data)
{
    return (uint8_t)_convert_bf16_u8(data, 0);
}
|
||||
|
||||
// bf16 -> int8_t with selectable integer rounding mode, routed through
// the fixed-point converter f32_integer (size code 2 = 8-bit, signed).
static inline int8_t _convert_bf16_s8(uint16_t data, int int8_rnd_md)
{
    /* convert bf16 to float32 */
    float fp32;
    convert_int_float convert_val;
    fp32 = convert_bf16_fp32(data);
    /* convert float32 to int8_t */
    f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 1, int8_rnd_md);
    return (int8_t) convert_val.ival;
}
|
||||
|
||||
// bf16 -> int8_t with the default rounding mode (0).
static inline int8_t convert_bf16_s8(uint16_t data)
{
    return (int8_t)_convert_bf16_s8(data, 0);
}
|
||||
|
||||
// int8/uint8 -> bf16: widen to a 32-bit integer with the requested
// signedness, then round the float through convert_fp32_bf16.
static inline uint16_t convert_int8_bf16(uint8_t data, uint8_t sign)
{
    int32_t widened;
    if (sign)
        widened = (int8_t)data;
    else
        widened = data;
    /* need to round to bf16 mode */
    return convert_fp32_bf16((float)widened);
}
|
||||
|
||||
// fp32 -> bf16 with round-to-nearest-even on the 16 dropped bits.
// NaN maps to the canonical NAN_VALUE. After rounding, any pattern whose
// exponent field is all ones (inf/NaN) is clamped to 0x7f7f to match the
// hardware — note this drops the sign bit of a negative infinity.
static inline uint16_t convert_fp32_bf16(float fp32)
{
    if (float_isnan(fp32))
        return NAN_VALUE;
    convert_int_float convert_val;
    convert_val.fval = fp32;
    uint32_t input = convert_val.ival;
    // round-to-nearest-even: bias by 0x7fff plus the LSB of the kept half
    uint32_t lsb = (input >> 16) & 1;
    uint32_t rounding_bias = 0x7fff + lsb;
    input += rounding_bias;
    convert_val.bf16[1] = (uint16_t) (input >> 16);

    /* HW behavior */
    if ((convert_val.bf16[1] & 0x7f80) == 0x7f80) {
        convert_val.bf16[1] = 0x7f7f;
    }
    return convert_val.bf16[1];
}
|
||||
|
||||
// fp32 -> uint8_t via f32_integer (size code 2 = 8-bit, unsigned, round mode 0).
static inline uint8_t convert_fp32_u8(float fp32)
{
    convert_int_float convert_val;
    f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 0, 0);
    return (uint8_t) convert_val.ival;
}
|
||||
|
||||
// fp32 -> int8_t via f32_integer (size code 2 = 8-bit, signed, round mode 0).
static inline int8_t convert_fp32_s8(float fp32)
{
    convert_int_float convert_val;
    f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 1, 0);
    return (int8_t) convert_val.ival;
}
|
||||
|
||||
// fp32 -> uint32_t via f32_integer (size code 0 = 32-bit, unsigned).
static inline uint32_t convert_fp32_u32(float fp32)
{
    convert_int_float convert_val;
    f32_integer((void*)&fp32, &convert_val.ival, 0, 0, 0, 0);
    return (uint32_t) convert_val.ival;
}
|
||||
|
||||
// fp32 -> int32_t via f32_integer (size code 0 = 32-bit, signed).
static inline int32_t convert_fp32_s32(float fp32)
{
    convert_int_float convert_val;
    f32_integer((void*)&fp32, &convert_val.ival, 0, 0, 1, 0);
    return (int32_t) convert_val.ival;
}
|
||||
|
||||
/* convert hex to float directly */
|
||||
/* Reinterpret a raw 32-bit pattern as a float (no numeric conversion). */
static inline float convert_hex_fp32(uint32_t hval)
{
    float out;
    memcpy(&out, &hval, sizeof(out));
    return out;
}
|
||||
/* convert float to hex directly */
|
||||
/* Reinterpret a float as its raw 32-bit pattern (no numeric conversion). */
static inline uint32_t convert_fp32_hex(float val)
{
    uint32_t bits;
    memcpy(&bits, &val, sizeof(bits));
    return bits;
}
|
||||
/* bf16 -> fp32: place the bf16 pattern in the second half-word and zero
 * the first, reproducing the original union layout byte-for-byte, then
 * reinterpret the 4 bytes as a float. */
static inline float convert_bf16_fp32(uint16_t bf16)
{
    uint16_t halves[2];
    halves[0] = 0;    /* low-address half-word (fraction LSBs on little-endian) */
    halves[1] = bf16; /* high-address half-word */
    float out;
    memcpy(&out, halves, sizeof(out));
    return out;
}
|
||||
|
||||
/*
 * Split |x| into its whole-number part and fractional remainder.
 *
 * integer_part: receives the integer part of x (magnitude).
 * sub_part:     receives x minus that integer part.
 * sign:         nonzero when the original value was negative; selects
 *               the saturation value on int32 overflow.
 *
 * When the unbiased exponent exceeds 30 the value cannot fit in a
 * 32-bit signed integer, so the result saturates to the int32
 * magnitude limit: 0x80000000 for negative inputs, 0x7fffffff for
 * positive ones.
 */
static inline void flt2int_flt(float x, unsigned long long* integer_part, float * sub_part, uint8_t sign)
{
  union { float f; uint32_t u; } w; /* local bit view of x */
  int level_code;                   /* unbiased exponent */
  unsigned long long tail_code;     /* mantissa with implicit leading 1 */

  w.f = x;
  level_code = ((w.u >> 23) & 0xff) - 127;

  /* If the exponent is negative, |x| < 1 and the integer part is zero. */
  if (level_code < 0) {
    *integer_part = 0;
    *sub_part = x;
    return;
  }

  /* Saturate BEFORE shifting.  BUGFIX: the old code shifted first (an
   * undefined over-wide shift for large exponents) and clamped the
   * negative case to 0x800000000 -- a stray extra zero that sets bit 35
   * and truncates to 0 in a 32-bit register; the int32 magnitude limit
   * is 0x80000000. */
  if (level_code > 30) {
    *integer_part = sign ? 0x80000000ULL : 0x7fffffffULL;
    *sub_part = 0;
    return;
  }

  tail_code = (w.u & 0x7fffff) | 0x800000;

  if (level_code < 23) {
    /* Some mantissa bits are fractional: shift them off the integer,
     * then zero them in the float to recover the fractional remainder. */
    *integer_part = tail_code >> (23 - level_code);
    w.u &= 0xffffffffu << (23 - level_code);
    *sub_part = x - w.f;
  } else {
    /* All mantissa bits are integer bits; no fractional remainder. */
    *integer_part = tail_code << (level_code - 23);
    *sub_part = 0;
  }
}
|
||||
|
||||
/*
 * Round a float to a 32-bit signed integer.
 *
 * int8_rnd_md == 0: round to nearest, ties to even.
 * int8_rnd_md != 0: truncate toward zero.
 *
 * The magnitude saturates to the int32 limits (0x80000000 for negative
 * values, 0x7fffffff for positive values), matching the original
 * clamping behavior.
 *
 * BUGFIX: the old union used `unsigned long intNum`, which is 8 bytes
 * on LP64 targets, so reading it after storing a 4-byte float read
 * uninitialized bytes (they happened to be masked out).  A uint32_t
 * view is exact.  The identical rounding logic that was duplicated in
 * the positive and negative branches is now shared.
 */
inline static int flt2int(float ifval, int int8_rnd_md)
{
  union {
    float floatNum;
    uint32_t intNum;
  } tempIfval;
  tempIfval.floatNum = ifval;

  uint8_t isPositive = (tempIfval.intNum & 0x80000000u) ? 0 : 1;
  float abs_fval = isPositive ? ifval : -ifval;

  /* Split |x| into integer part and fraction in [0, 1). */
  double integer;
  double sub_part = modf((double)abs_fval, &integer);
  unsigned long long result = (unsigned long long)integer;

  if (int8_rnd_md == 0) {
    /* Round to nearest even: >0.5 always rounds up; exactly 0.5 rounds
     * up only when the integer part is odd. */
    if (sub_part > 0.5)
      result += 1;
    else if (sub_part == 0.5 && (result & 0x1))
      result += 1;
  }
  /* else: round toward zero -- keep the truncated integer part. */

  if (!isPositive) {
    if (result > 0x80000000ULL)
      result = 0x80000000ULL;
    /* Negate in a 64-bit type first so 0x80000000 maps to INT32_MIN
     * without the implementation-defined unsigned->int wrap the old
     * `return -result;` relied on. */
    return (int)(-(int64_t)result);
  }

  if (result > 0x7fffffffULL)
    result = 0x7fffffffULL;
  return (int)result;
}
||||
|
||||
/*
 * Convert the float at if32 into an integer stored at o_integer.
 *
 * integer_size: 0 = 32-bit, 1 = 16-bit, anything else = 8-bit.
 * accumulate:   nonzero adds the destination's previous value to the
 *               converted result.
 * int8_signed:  for the 8-bit path, clamp to [-128, 127] instead of
 *               [0, 255].
 * int8_rnd_md:  rounding mode forwarded to flt2int() (0 = round half
 *               to even, else truncate toward zero).
 *
 * NOTE(review): the 16-/32-bit paths store flt2int()'s result with a
 * plain truncating conversion and no clamp -- presumably intentional
 * for this hardware model; confirm against the TDMA spec.
 */
static inline void f32_integer(void *if32, void *o_integer, int integer_size, int accumulate, int int8_signed, int int8_rnd_md)
{
  float *f_tmp = (float *)if32;
  int i_tmp = flt2int(*f_tmp, int8_rnd_md);

  /* Snapshot the destination at every width before overwriting it,
   * so the accumulate step below can add the old value back. */
  int *o32 = (int *)o_integer;
  int dst_f32 = *o32;
  short *o16 = (short *)o_integer;
  short dst_o16 = *o16; /* BUGFIX: previously read *o32 (wrong bytes on big-endian) */
  char *o8 = (char *)o_integer;
  char dst_o8 = *o8;

  if (integer_size == 0) {
    *o32 = i_tmp;
  } else if (integer_size == 1) {
    *o16 = i_tmp;
  } else {
    int min = (int8_signed) ? -128 : 0;
    int max = (int8_signed) ? 127 : 255;
    if (i_tmp < min)
      *o8 = min;
    else if (i_tmp > max)
      *o8 = max;
    else
      *o8 = i_tmp;
  }

  if (accumulate) {
    if (integer_size == 0)
      *o32 += dst_f32;
    else if (integer_size == 1)
      *o16 += dst_o16;
    else
      *o8 += dst_o8;
  }
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* ATOMIC_FP_H_ */
|
||||
|
||||
301
cvikernel/include/bmkernel/bm1880v2/bm1880v2_tdma_reg.h
Normal file
301
cvikernel/include/bmkernel/bm1880v2/bm1880v2_tdma_reg.h
Normal file
@ -0,0 +1,301 @@
|
||||
#ifndef BM1880v2_TDMA_REG_V1_32_H
|
||||
#define BM1880v2_TDMA_REG_V1_32_H
|
||||
|
||||
/*
|
||||
* This file is generated by tools. Do not edit it manually.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
typedef unsigned long long ullong;
|
||||
|
||||
/* Software view of one TDMA command descriptor.  Each field mirrors a
 * bit-field of the 16x32-bit hardware command words; the word/bit
 * mapping is defined by parse_tdma_reg()/emit_tdma_reg() below.
 * Generated code -- field order follows the hardware layout. */
typedef struct {
  /* word 0 */
  uint32_t vld;
  uint32_t compress_en;
  uint32_t eod;
  uint32_t intp_en;
  uint32_t bar_en;
  uint32_t check_bf16_value;
  uint32_t trans_dir;
  uint32_t rsv00;
  uint32_t trans_fmt;
  uint32_t transpose_md;
  uint32_t rsv01;
  uint32_t outstanding_en;
  uint32_t cmd_id;
  /* word 1 */
  uint32_t spec_func;
  uint32_t dst_fmt;
  uint32_t src_fmt;
  uint32_t cmprs_fmt;
  uint32_t sys_dtype;
  uint32_t rsv2_1;
  uint32_t int8_sign;
  uint32_t compress_zero_guard;
  uint32_t int8_rnd_mode;
  uint32_t wait_id_tpu;
  /* word 2 */
  uint32_t wait_id_other_tdma;
  uint32_t wait_id_sdma;
  /* word 3 */
  uint32_t const_val;
  uint32_t src_base_reg_sel;
  uint32_t mv_lut_idx;
  uint32_t dst_base_reg_sel;
  uint32_t mv_lut_base;
  uint32_t rsv4_5;
  /* words 4-7: strides */
  uint32_t dst_h_stride;
  uint32_t dst_c_stride_low;
  uint32_t dst_n_stride;
  uint32_t src_h_stride;
  uint32_t src_c_stride_low;
  uint32_t src_n_stride;
  /* words 8-10: shapes */
  uint32_t dst_c;
  uint32_t src_c;
  uint32_t dst_w;
  uint32_t dst_h;
  uint32_t src_w;
  uint32_t src_h;
  /* words 11-13: addresses (split low/high) */
  uint32_t dst_base_addr_low;
  uint32_t src_base_addr_low;
  uint32_t src_n;
  uint32_t dst_base_addr_high;
  uint32_t src_base_addr_high;
  /* word 14 */
  uint32_t src_c_stride_high;
  uint32_t dst_c_stride_high;
  /* word 15 */
  uint32_t compress_bias0;
  uint32_t compress_bias1;
  uint32_t layer_ID;
} tdma_reg_t;
|
||||
|
||||
/* Unpack the 16x32-bit TDMA command words at p into the named fields
 * of *r.  Mechanical inverse of emit_tdma_reg(); each line extracts
 * one bit-field with a shift and a width mask. */
static inline void parse_tdma_reg(tdma_reg_t *r, const uint32_t *p)
{
  r->vld = p[0] & 1;
  r->compress_en = (p[0] >> 1) & 1;
  r->eod = (p[0] >> 2) & 1;
  r->intp_en = (p[0] >> 3) & 1;
  r->bar_en = (p[0] >> 4) & 1;
  r->check_bf16_value = (p[0] >> 5) & 1;
  r->trans_dir = (p[0] >> 6) & ((1u << 2) - 1);
  r->rsv00 = (p[0] >> 8) & ((1u << 2) - 1);
  r->trans_fmt = (p[0] >> 10) & 1;
  r->transpose_md = (p[0] >> 11) & ((1u << 2) - 1);
  r->rsv01 = (p[0] >> 13) & ((1u << 2) - 1);
  r->outstanding_en = (p[0] >> 15) & 1;
  r->cmd_id = (p[0] >> 16) & ((1u << 16) - 1);
  r->spec_func = p[1] & ((1u << 3) - 1);
  r->dst_fmt = (p[1] >> 3) & ((1u << 2) - 1);
  r->src_fmt = (p[1] >> 5) & ((1u << 2) - 1);
  r->cmprs_fmt = (p[1] >> 7) & 1;
  r->sys_dtype = (p[1] >> 8) & 1;
  r->rsv2_1 = (p[1] >> 9) & ((1u << 4) - 1);
  r->int8_sign = (p[1] >> 13) & 1;
  r->compress_zero_guard = (p[1] >> 14) & 1;
  r->int8_rnd_mode = (p[1] >> 15) & 1;
  r->wait_id_tpu = (p[1] >> 16) & ((1u << 16) - 1);
  r->wait_id_other_tdma = p[2] & ((1u << 16) - 1);
  r->wait_id_sdma = (p[2] >> 16) & ((1u << 16) - 1);
  r->const_val = p[3] & ((1u << 16) - 1);
  r->src_base_reg_sel = (p[3] >> 16) & ((1u << 3) - 1);
  r->mv_lut_idx = (p[3] >> 19) & 1;
  r->dst_base_reg_sel = (p[3] >> 20) & ((1u << 3) - 1);
  r->mv_lut_base = (p[3] >> 23) & 1;
  r->rsv4_5 = (p[3] >> 24) & ((1u << 8) - 1);
  r->dst_h_stride = p[4] & ((1u << 16) - 1);
  r->dst_c_stride_low = (p[4] >> 16) & ((1u << 16) - 1);
  r->dst_n_stride = p[5];
  r->src_h_stride = p[6] & ((1u << 16) - 1);
  r->src_c_stride_low = (p[6] >> 16) & ((1u << 16) - 1);
  r->src_n_stride = p[7];
  r->dst_c = p[8] & ((1u << 16) - 1);
  r->src_c = (p[8] >> 16) & ((1u << 16) - 1);
  r->dst_w = p[9] & ((1u << 16) - 1);
  r->dst_h = (p[9] >> 16) & ((1u << 16) - 1);
  r->src_w = p[10] & ((1u << 16) - 1);
  r->src_h = (p[10] >> 16) & ((1u << 16) - 1);
  r->dst_base_addr_low = p[11];
  r->src_base_addr_low = p[12];
  /* word 13 carries src_n plus the high address bytes of both buffers */
  r->src_n = p[13] & ((1u << 16) - 1);
  r->dst_base_addr_high = (p[13] >> 16) & ((1u << 8) - 1);
  r->src_base_addr_high = (p[13] >> 24) & ((1u << 8) - 1);
  r->src_c_stride_high = p[14] & ((1u << 16) - 1);
  r->dst_c_stride_high = (p[14] >> 16) & ((1u << 16) - 1);
  r->compress_bias0 = p[15] & ((1u << 8) - 1);
  r->compress_bias1 = (p[15] >> 8) & ((1u << 8) - 1);
  r->layer_ID = (p[15] >> 16) & ((1u << 16) - 1);
}
|
||||
|
||||
/* Pack the fields of *r back into the 16x32-bit TDMA command words at
 * _p.  Mechanical inverse of parse_tdma_reg(); each field is masked to
 * its hardware width before being shifted into place.  The pointer is
 * written through a volatile view, so every store is performed (the
 * words may be a memory-mapped command buffer).
 * NOTE(review): `typeof` is a GNU extension. */
static inline void emit_tdma_reg(const tdma_reg_t *r, uint32_t *_p)
{
  volatile uint32_t *p = (typeof(p))_p;
  p[15] = (r->compress_bias0 & ((1u << 8) - 1)) |
          ((r->compress_bias1 & ((1u << 8) - 1)) << 8) |
          ((r->layer_ID & ((1u << 16) - 1)) << 16);
  p[14] = (r->src_c_stride_high & ((1u << 16) - 1)) |
          ((r->dst_c_stride_high & ((1u << 16) - 1)) << 16);
  p[13] = (r->src_n & ((1u << 16) - 1)) |
          ((r->dst_base_addr_high & ((1u << 8) - 1)) << 16) |
          ((r->src_base_addr_high & ((1u << 8) - 1)) << 24);
  p[12] = (r->src_base_addr_low & (((uint64_t)1 << 32) - 1));
  p[11] = (r->dst_base_addr_low & (((uint64_t)1 << 32) - 1));
  p[10] = (r->src_w & ((1u << 16) - 1)) |
          ((r->src_h & ((1u << 16) - 1)) << 16);
  p[9] = (r->dst_w & ((1u << 16) - 1)) |
         ((r->dst_h & ((1u << 16) - 1)) << 16);
  p[8] = (r->dst_c & ((1u << 16) - 1)) |
         ((r->src_c & ((1u << 16) - 1)) << 16);
  p[7] = (r->src_n_stride & (((uint64_t)1 << 32) - 1));
  p[6] = (r->src_h_stride & ((1u << 16) - 1)) |
         ((r->src_c_stride_low & ((1u << 16) - 1)) << 16);
  p[5] = (r->dst_n_stride & (((uint64_t)1 << 32) - 1));
  p[4] = (r->dst_h_stride & ((1u << 16) - 1)) |
         ((r->dst_c_stride_low & ((1u << 16) - 1)) << 16);
  p[3] = (r->const_val & ((1u << 16) - 1)) |
         ((r->src_base_reg_sel & ((1u << 3) - 1)) << 16) |
         ((r->mv_lut_idx & 1) << 19) |
         ((r->dst_base_reg_sel & ((1u << 3) - 1)) << 20) |
         ((r->mv_lut_base & 1) << 23) |
         ((r->rsv4_5 & ((1u << 8) - 1)) << 24);
  p[2] = (r->wait_id_other_tdma & ((1u << 16) - 1)) |
         ((r->wait_id_sdma & ((1u << 16) - 1)) << 16);
  p[1] = (r->spec_func & ((1u << 3) - 1)) |
         ((r->dst_fmt & ((1u << 2) - 1)) << 3) |
         ((r->src_fmt & ((1u << 2) - 1)) << 5) |
         ((r->cmprs_fmt & 1) << 7) |
         ((r->sys_dtype & 1) << 8) |
         ((r->rsv2_1 & ((1u << 4) - 1)) << 9) |
         ((r->int8_sign & 1) << 13) |
         ((r->compress_zero_guard & 1) << 14) |
         ((r->int8_rnd_mode & 1) << 15) |
         ((r->wait_id_tpu & ((1u << 16) - 1)) << 16);
  /* word 0 is written last: it holds the vld bit */
  p[0] = (r->vld & 1) |
         ((r->compress_en & 1) << 1) |
         ((r->eod & 1) << 2) |
         ((r->intp_en & 1) << 3) |
         ((r->bar_en & 1) << 4) |
         ((r->check_bf16_value & 1) << 5) |
         ((r->trans_dir & ((1u << 2) - 1)) << 6) |
         ((r->rsv00 & ((1u << 2) - 1)) << 8) |
         ((r->trans_fmt & 1) << 10) |
         ((r->transpose_md & ((1u << 2) - 1)) << 11) |
         ((r->rsv01 & ((1u << 2) - 1)) << 13) |
         ((r->outstanding_en & 1) << 15) |
         ((r->cmd_id & ((1u << 16) - 1)) << 16);
}
|
||||
|
||||
/* Load *r with the TDMA descriptor defaults: every shape/stride field
 * becomes 1, formats default to 0x1, and all control, wait-id and
 * address fields are cleared. */
static inline void reset_tdma_reg(tdma_reg_t *r)
{
  const tdma_reg_t defaults = {
    .vld = 0x0,
    .compress_en = 0x0,
    .eod = 0x0,
    .intp_en = 0x0,
    .bar_en = 0x0,
    .check_bf16_value = 0x0,
    .trans_dir = 0x0,
    .rsv00 = 0x0,
    .trans_fmt = 0x0,
    .transpose_md = 0x0,
    .rsv01 = 0x0,
    .outstanding_en = 0x0,
    .cmd_id = 0x0,
    .spec_func = 0x0,
    .dst_fmt = 0x1,
    .src_fmt = 0x1,
    .cmprs_fmt = 0x0,
    .sys_dtype = 0x0,
    .rsv2_1 = 0x0,
    .int8_sign = 0x0,
    .compress_zero_guard = 0x0,
    .int8_rnd_mode = 0x0,
    .wait_id_tpu = 0x0,
    .wait_id_other_tdma = 0x0,
    .wait_id_sdma = 0x0,
    .const_val = 0x0,
    .src_base_reg_sel = 0x0,
    .mv_lut_idx = 0x0,
    .dst_base_reg_sel = 0x0,
    .mv_lut_base = 0x0,
    .rsv4_5 = 0x0,
    .dst_h_stride = 0x1,
    .dst_c_stride_low = 0x1,
    .dst_n_stride = 0x1,
    .src_h_stride = 0x1,
    .src_c_stride_low = 0x1,
    .src_n_stride = 0x1,
    .dst_c = 0x1,
    .src_c = 0x1,
    .dst_w = 0x1,
    .dst_h = 0x1,
    .src_w = 0x1,
    .src_h = 0x1,
    .dst_base_addr_low = 0x0,
    .src_base_addr_low = 0x0,
    .src_n = 0x1,
    .dst_base_addr_high = 0x0,
    .src_base_addr_high = 0x0,
    .src_c_stride_high = 0x0,
    .dst_c_stride_high = 0x0,
    .compress_bias0 = 0x0,
    .compress_bias1 = 0x0,
    .layer_ID = 0x0,
  };
  *r = defaults;
}
|
||||
|
||||
/* Print every field of *r to stdout, one "name: 0xvalue" line per
 * field, preceded by a "--- tag ---" banner.  Debug aid only.
 * NOTE(review): trace_one_reg stays defined after this function (no
 * #undef); a later identical redefinition is harmless, but confirm no
 * including file defines a different trace_one_reg. */
static inline void trace_tdma_reg(tdma_reg_t *r, const char *tag)
{
#define trace_one_reg(name) \
  printf(" %s: 0x%llx\n", #name, (ullong)r->name)

  printf("--- %s ---\n", tag);
  trace_one_reg(vld);
  trace_one_reg(compress_en);
  trace_one_reg(eod);
  trace_one_reg(intp_en);
  trace_one_reg(bar_en);
  trace_one_reg(check_bf16_value);
  trace_one_reg(trans_dir);
  trace_one_reg(rsv00);
  trace_one_reg(trans_fmt);
  trace_one_reg(transpose_md);
  trace_one_reg(rsv01);
  trace_one_reg(outstanding_en);
  trace_one_reg(cmd_id);
  trace_one_reg(spec_func);
  trace_one_reg(dst_fmt);
  trace_one_reg(src_fmt);
  trace_one_reg(cmprs_fmt);
  trace_one_reg(sys_dtype);
  trace_one_reg(rsv2_1);
  trace_one_reg(int8_sign);
  trace_one_reg(compress_zero_guard);
  trace_one_reg(int8_rnd_mode);
  trace_one_reg(wait_id_tpu);
  trace_one_reg(wait_id_other_tdma);
  trace_one_reg(wait_id_sdma);
  trace_one_reg(const_val);
  trace_one_reg(src_base_reg_sel);
  trace_one_reg(mv_lut_idx);
  trace_one_reg(dst_base_reg_sel);
  trace_one_reg(mv_lut_base);
  trace_one_reg(rsv4_5);
  trace_one_reg(dst_h_stride);
  trace_one_reg(dst_c_stride_low);
  trace_one_reg(dst_n_stride);
  trace_one_reg(src_h_stride);
  trace_one_reg(src_c_stride_low);
  trace_one_reg(src_n_stride);
  trace_one_reg(dst_c);
  trace_one_reg(src_c);
  trace_one_reg(dst_w);
  trace_one_reg(dst_h);
  trace_one_reg(src_w);
  trace_one_reg(src_h);
  trace_one_reg(dst_base_addr_low);
  trace_one_reg(src_base_addr_low);
  trace_one_reg(src_n);
  trace_one_reg(dst_base_addr_high);
  trace_one_reg(src_base_addr_high);
  trace_one_reg(src_c_stride_high);
  trace_one_reg(dst_c_stride_high);
  trace_one_reg(compress_bias0);
  trace_one_reg(compress_bias1);
  trace_one_reg(layer_ID);
}
|
||||
#endif /* BM1880v2_TDMA_REG_V1_32_H */
|
||||
574
cvikernel/include/bmkernel/bm1880v2/bm1880v2_tiu_reg.h
Normal file
574
cvikernel/include/bmkernel/bm1880v2/bm1880v2_tiu_reg.h
Normal file
@ -0,0 +1,574 @@
|
||||
#ifndef BM1880v2_TIU_REG_V2_11_H
#define BM1880v2_TIU_REG_V2_11_H

/*
 * This file is generated by tools. Do not edit it manually.
 */

#include <stdint.h>
#include <stdio.h>

/* BUGFIX: dropped "typedef uint8_t uint8_t;" and
 * "typedef uint64_t uint64_t;".  <stdint.h> already declares both, and
 * redeclaring a typedef name -- even to the identical type -- is a
 * constraint violation in C99 (only C11 permits it), which breaks
 * strict -std=c99 -Werror builds. */
typedef unsigned long long ullong;
||||
/* Software view of one TIU command descriptor.  Each field mirrors a
 * bit-field of the 28x32-bit hardware command words; the word/bit
 * mapping is defined by parse_tiu_reg()/emit_tiu_reg() below.
 * Generated code -- field order follows the hardware layout. */
typedef struct {
  /* words 0-1: command control and ids */
  uint32_t cmd_en;
  uint32_t cmd_end;
  uint32_t cmd_id_en;
  uint32_t cmd_id_tpu;
  uint32_t cmd_id_gdma;
  uint32_t cmd_keep;
  uint32_t cmd_intr_en;
  uint32_t tsk_typ;
  uint32_t tsk_eu_typ;
  uint32_t tsk_opd_num;
  uint32_t opt_right_shift;
  uint32_t opt_left_shift;
  uint32_t opt_shift_typ;
  uint32_t opt_rshift_typ;
  uint32_t opt_res_add;
  /* word 2: per-operand options */
  uint32_t opt_relu;
  uint32_t opt_left_tran;
  uint32_t opt_chl_quan;
  uint32_t tens_mdsum;
  uint32_t tens_lookup;
  uint32_t opt_res0_sign;
  uint32_t opt_opd0_sign;
  uint32_t opt_opd1_sign;
  uint32_t opt_opd2_sign;
  uint32_t opt_res0_int8;
  uint32_t opt_opd0_int8;
  uint32_t opt_opd1_int8;
  uint32_t opt_opd2_int8;
  uint32_t opt_opd0_const;
  uint32_t opt_opd1_const;
  uint32_t opt_opd2_const;
  uint32_t short_nchwstr_same;
  uint32_t short_res0_str;
  uint32_t short_opd0_str;
  uint32_t short_opd1_str;
  uint32_t short_opd2_str;
  /* word 3: convolution insertion parameters */
  uint32_t conv_opd0_x_ins0;
  uint32_t conv_opd0_y_ins0;
  uint32_t conv_opd0_x_ins0_last;
  uint32_t conv_opd0_y_ins0_last;
  uint32_t conv_opd1_x_ins0;
  uint32_t conv_opd1_y_ins0;
  uint32_t opd0_ins_val;
  uint32_t ps32_md;
  uint32_t double_conv;
  uint32_t rsvd0;
  /* words 4-8: result/operand shapes and addresses */
  uint32_t res0_n;
  uint32_t res0_c;
  uint32_t res0_h;
  uint32_t res0_w;
  uint32_t res0_addr;
  uint32_t opd0_addr;
  uint32_t opd1_addr;
  uint32_t rsvd1;
  uint32_t opd2_addr;
  uint32_t opd0_c;
  uint32_t opd0_h;
  uint32_t opd0_w;
  uint32_t opd1_h;
  uint32_t opd1_w;
  /* words 10-11: convolution padding and strides */
  uint32_t conv_opd0_up_pad;
  uint32_t conv_opd0_dn_pad;
  uint32_t conv_opd0_lf_pad;
  uint32_t conv_opd0_rt_pad;
  uint32_t conv_op_x_str;
  uint32_t conv_op_y_str;
  uint32_t opd0_ins_fp;
  uint32_t rsvd2;
  /* words 12-15: remaining shapes, quantization multiplier, rounding */
  uint32_t opd0_n;
  uint32_t opd1_n;
  uint32_t opd1_c;
  uint32_t opd2_n;
  uint32_t opd2_c;
  uint32_t opd2_h;
  uint32_t opd2_w;
  uint32_t quan_m;
  uint32_t opd_typ;
  uint32_t fp_round_typ;
  uint32_t rsvd7;
  uint32_t rsvd3;
  /* words 16-26: strides per operand */
  uint32_t res0_n_str;
  uint32_t res0_c_str;
  uint32_t res0_h_str;
  uint32_t res0_w_str;
  uint32_t res0_b_str;
  uint32_t opd0_n_str;
  uint32_t opd0_c_str;
  uint32_t rsvd4;
  uint32_t opd0_h_str;
  uint32_t opd0_w_str;
  uint32_t opd0_b_str;
  uint32_t opd1_n_str;
  uint32_t opd1_c_str;
  uint32_t opd1_h_str;
  uint32_t opd1_w_str;
  uint32_t rsvd5;
  uint32_t opd1_b_str;
  uint32_t opd2_n_str;
  uint32_t opd2_c_str;
  uint32_t opd2_h_str;
  uint32_t opd2_w_str;
  uint32_t opd2_b_str;
  /* word 27 */
  uint32_t layer_info;
  uint32_t rsvd6;
} tiu_reg_t;
|
||||
|
||||
/* Unpack the 28x32-bit TIU command words at p into the named fields of
 * *r.  Mechanical inverse of emit_tiu_reg().  Fields that straddle a
 * word boundary are assembled with a second "|=" line that ORs in the
 * high bits from the next word. */
static inline void parse_tiu_reg(tiu_reg_t *r, const uint32_t *p)
{
  r->cmd_en = p[0] & 1;
  r->cmd_end = (p[0] >> 1) & 1;
  r->cmd_id_en = (p[0] >> 2) & 1;
  r->cmd_id_tpu = (p[0] >> 3) & ((1u << 16) - 1);
  r->cmd_id_gdma = (p[0] >> 19) & ((1u << 13) - 1);
  r->cmd_id_gdma |= (uint64_t)(p[1] & ((1u << 3) - 1)) << 13;
  r->cmd_keep = (p[1] >> 3) & 1;
  r->cmd_intr_en = (p[1] >> 4) & 1;
  r->tsk_typ = (p[1] >> 5) & ((1u << 4) - 1);
  r->tsk_eu_typ = (p[1] >> 9) & ((1u << 8) - 1);
  r->tsk_opd_num = (p[1] >> 17) & ((1u << 2) - 1);
  r->opt_right_shift = (p[1] >> 19) & ((1u << 5) - 1);
  r->opt_left_shift = (p[1] >> 24) & ((1u << 5) - 1);
  r->opt_shift_typ = (p[1] >> 29) & 1;
  r->opt_rshift_typ = (p[1] >> 30) & 1;
  r->opt_res_add = (p[1] >> 31) & 1;
  r->opt_relu = p[2] & 1;
  r->opt_left_tran = (p[2] >> 1) & 1;
  r->opt_chl_quan = (p[2] >> 2) & 1;
  r->tens_mdsum = (p[2] >> 3) & 1;
  r->tens_lookup = (p[2] >> 4) & 1;
  r->opt_res0_sign = (p[2] >> 5) & 1;
  r->opt_opd0_sign = (p[2] >> 6) & 1;
  r->opt_opd1_sign = (p[2] >> 7) & 1;
  r->opt_opd2_sign = (p[2] >> 8) & 1;
  r->opt_res0_int8 = (p[2] >> 9) & 1;
  r->opt_opd0_int8 = (p[2] >> 10) & 1;
  r->opt_opd1_int8 = (p[2] >> 11) & 1;
  r->opt_opd2_int8 = (p[2] >> 12) & 1;
  r->opt_opd0_const = (p[2] >> 13) & 1;
  r->opt_opd1_const = (p[2] >> 14) & 1;
  r->opt_opd2_const = (p[2] >> 15) & 1;
  r->short_nchwstr_same = (p[2] >> 16) & 1;
  r->short_res0_str = (p[2] >> 17) & ((1u << 2) - 1);
  r->short_opd0_str = (p[2] >> 19) & ((1u << 2) - 1);
  r->short_opd1_str = (p[2] >> 21) & ((1u << 2) - 1);
  r->short_opd2_str = (p[2] >> 23) & ((1u << 2) - 1);
  r->conv_opd0_x_ins0 = (p[2] >> 25) & ((1u << 4) - 1);
  r->conv_opd0_y_ins0 = (p[2] >> 29) & ((1u << 3) - 1);
  r->conv_opd0_y_ins0 |= (uint64_t)(p[3] & 1) << 3;
  r->conv_opd0_x_ins0_last = (p[3] >> 1) & ((1u << 4) - 1);
  r->conv_opd0_y_ins0_last = (p[3] >> 5) & ((1u << 4) - 1);
  r->conv_opd1_x_ins0 = (p[3] >> 9) & ((1u << 4) - 1);
  r->conv_opd1_y_ins0 = (p[3] >> 13) & ((1u << 4) - 1);
  r->opd0_ins_val = (p[3] >> 17) & ((1u << 8) - 1);
  r->ps32_md = (p[3] >> 25) & ((1u << 2) - 1);
  r->double_conv = (p[3] >> 27) & 1;
  r->rsvd0 = (p[3] >> 28) & ((1u << 4) - 1);
  r->res0_n = p[4] & ((1u << 12) - 1);
  r->res0_c = (p[4] >> 12) & ((1u << 12) - 1);
  r->res0_h = (p[4] >> 24) & ((1u << 8) - 1);
  r->res0_h |= (uint64_t)(p[5] & ((1u << 4) - 1)) << 8;
  r->res0_w = (p[5] >> 4) & ((1u << 12) - 1);
  r->res0_addr = (p[5] >> 16) & ((1u << 16) - 1);
  r->res0_addr |= (uint64_t)(p[6] & ((1u << 8) - 1)) << 16;
  r->opd0_addr = (p[6] >> 8) & ((1u << 24) - 1);
  r->opd1_addr = p[7] & ((1u << 16) - 1);
  r->rsvd1 = (p[7] >> 16) & ((1u << 16) - 1);
  r->opd2_addr = p[8] & ((1u << 16) - 1);
  r->opd0_c = (p[8] >> 16) & ((1u << 12) - 1);
  r->opd0_h = (p[8] >> 28) & ((1u << 4) - 1);
  r->opd0_h |= (uint64_t)(p[9] & ((1u << 8) - 1)) << 4;
  r->opd0_w = (p[9] >> 8) & ((1u << 12) - 1);
  r->opd1_h = (p[9] >> 20) & ((1u << 12) - 1);
  r->opd1_w = p[10] & ((1u << 12) - 1);
  r->conv_opd0_up_pad = (p[10] >> 12) & ((1u << 4) - 1);
  r->conv_opd0_dn_pad = (p[10] >> 16) & ((1u << 4) - 1);
  r->conv_opd0_lf_pad = (p[10] >> 20) & ((1u << 4) - 1);
  r->conv_opd0_rt_pad = (p[10] >> 24) & ((1u << 4) - 1);
  r->conv_op_x_str = (p[10] >> 28) & ((1u << 4) - 1);
  r->conv_op_y_str = p[11] & ((1u << 4) - 1);
  r->opd0_ins_fp = (p[11] >> 4) & ((1u << 16) - 1);
  r->rsvd2 = (p[11] >> 20) & ((1u << 12) - 1);
  r->opd0_n = p[12] & ((1u << 12) - 1);
  r->opd1_n = (p[12] >> 12) & ((1u << 12) - 1);
  r->opd1_c = (p[12] >> 24) & ((1u << 8) - 1);
  r->opd1_c |= (uint64_t)(p[13] & ((1u << 4) - 1)) << 8;
  r->opd2_n = (p[13] >> 4) & ((1u << 12) - 1);
  r->opd2_c = (p[13] >> 16) & ((1u << 12) - 1);
  r->opd2_h = (p[13] >> 28) & ((1u << 4) - 1);
  r->opd2_h |= (uint64_t)(p[14] & ((1u << 8) - 1)) << 4;
  r->opd2_w = (p[14] >> 8) & ((1u << 12) - 1);
  r->quan_m = (p[14] >> 20) & ((1u << 12) - 1);
  r->quan_m |= (uint64_t)(p[15] & ((1u << 20) - 1)) << 12;
  r->opd_typ = (p[15] >> 20) & 1;
  r->fp_round_typ = (p[15] >> 21) & ((1u << 3) - 1);
  r->rsvd7 = (p[15] >> 24) & ((1u << 4) - 1);
  r->rsvd3 = (p[15] >> 28) & ((1u << 4) - 1);
  r->res0_n_str = p[16] & ((1u << 16) - 1);
  r->res0_c_str = (p[16] >> 16) & ((1u << 16) - 1);
  r->res0_h_str = p[17] & ((1u << 16) - 1);
  r->res0_w_str = (p[17] >> 16) & ((1u << 16) - 1);
  r->res0_b_str = p[18] & ((1u << 16) - 1);
  r->opd0_n_str = (p[18] >> 16) & ((1u << 16) - 1);
  r->opd0_c_str = p[19] & ((1u << 16) - 1);
  r->rsvd4 = (p[19] >> 16) & ((1u << 16) - 1);
  r->opd0_h_str = p[20] & ((1u << 16) - 1);
  r->opd0_w_str = (p[20] >> 16) & ((1u << 16) - 1);
  r->opd0_b_str = p[21] & ((1u << 16) - 1);
  r->opd1_n_str = (p[21] >> 16) & ((1u << 16) - 1);
  r->opd1_c_str = p[22] & ((1u << 16) - 1);
  r->opd1_h_str = (p[22] >> 16) & ((1u << 16) - 1);
  r->opd1_w_str = p[23] & ((1u << 16) - 1);
  r->rsvd5 = (p[23] >> 16) & ((1u << 16) - 1);
  r->opd1_b_str = p[24] & ((1u << 16) - 1);
  r->opd2_n_str = (p[24] >> 16) & ((1u << 16) - 1);
  r->opd2_c_str = p[25] & ((1u << 16) - 1);
  r->opd2_h_str = (p[25] >> 16) & ((1u << 16) - 1);
  r->opd2_w_str = p[26] & ((1u << 16) - 1);
  r->opd2_b_str = (p[26] >> 16) & ((1u << 16) - 1);
  r->layer_info = p[27] & ((1u << 28) - 1);
  r->rsvd6 = (p[27] >> 28) & ((1u << 4) - 1);
}
|
||||
|
||||
/* Pack the fields of *r back into the 28x32-bit TIU command words at
 * _p.  Mechanical inverse of parse_tiu_reg(); each field is masked to
 * its hardware width before being shifted into place, and fields that
 * straddle a word boundary contribute their high bits to the following
 * word (quan_m, opd2_h, opd1_c, opd0_h, res0_addr, res0_h,
 * conv_opd0_y_ins0, cmd_id_gdma).  The pointer is written through a
 * volatile view so every store is performed (the words may be a
 * memory-mapped command buffer).
 * NOTE(review): `typeof` is a GNU extension. */
static inline void emit_tiu_reg(const tiu_reg_t *r, uint32_t *_p)
{
  volatile uint32_t *p = (typeof(p))_p;
  p[27] = (r->layer_info & ((1u << 28) - 1)) |
          ((r->rsvd6 & ((1u << 4) - 1)) << 28);
  p[26] = (r->opd2_w_str & ((1u << 16) - 1)) |
          ((r->opd2_b_str & ((1u << 16) - 1)) << 16);
  p[25] = (r->opd2_c_str & ((1u << 16) - 1)) |
          ((r->opd2_h_str & ((1u << 16) - 1)) << 16);
  p[24] = (r->opd1_b_str & ((1u << 16) - 1)) |
          ((r->opd2_n_str & ((1u << 16) - 1)) << 16);
  p[23] = (r->opd1_w_str & ((1u << 16) - 1)) |
          ((r->rsvd5 & ((1u << 16) - 1)) << 16);
  p[22] = (r->opd1_c_str & ((1u << 16) - 1)) |
          ((r->opd1_h_str & ((1u << 16) - 1)) << 16);
  p[21] = (r->opd0_b_str & ((1u << 16) - 1)) |
          ((r->opd1_n_str & ((1u << 16) - 1)) << 16);
  p[20] = (r->opd0_h_str & ((1u << 16) - 1)) |
          ((r->opd0_w_str & ((1u << 16) - 1)) << 16);
  p[19] = (r->opd0_c_str & ((1u << 16) - 1)) |
          ((r->rsvd4 & ((1u << 16) - 1)) << 16);
  p[18] = (r->res0_b_str & ((1u << 16) - 1)) |
          ((r->opd0_n_str & ((1u << 16) - 1)) << 16);
  p[17] = (r->res0_h_str & ((1u << 16) - 1)) |
          ((r->res0_w_str & ((1u << 16) - 1)) << 16);
  p[16] = (r->res0_n_str & ((1u << 16) - 1)) |
          ((r->res0_c_str & ((1u << 16) - 1)) << 16);
  p[15] = ((r->quan_m >> 12) & ((1u << 20) - 1)) |
          ((r->opd_typ & 1) << 20) |
          ((r->fp_round_typ & ((1u << 3) - 1)) << 21) |
          ((r->rsvd7 & ((1u << 4) - 1)) << 24) |
          ((r->rsvd3 & ((1u << 4) - 1)) << 28);
  p[14] = ((r->opd2_h >> 4) & ((1u << 8) - 1)) |
          ((r->opd2_w & ((1u << 12) - 1)) << 8) |
          ((r->quan_m & ((1u << 12) - 1)) << 20);
  p[13] = ((r->opd1_c >> 8) & ((1u << 4) - 1)) |
          ((r->opd2_n & ((1u << 12) - 1)) << 4) |
          ((r->opd2_c & ((1u << 12) - 1)) << 16) |
          ((r->opd2_h & ((1u << 4) - 1)) << 28);
  p[12] = (r->opd0_n & ((1u << 12) - 1)) |
          ((r->opd1_n & ((1u << 12) - 1)) << 12) |
          ((r->opd1_c & ((1u << 8) - 1)) << 24);
  p[11] = (r->conv_op_y_str & ((1u << 4) - 1)) |
          ((r->opd0_ins_fp & ((1u << 16) - 1)) << 4) |
          ((r->rsvd2 & ((1u << 12) - 1)) << 20);
  p[10] = (r->opd1_w & ((1u << 12) - 1)) |
          ((r->conv_opd0_up_pad & ((1u << 4) - 1)) << 12) |
          ((r->conv_opd0_dn_pad & ((1u << 4) - 1)) << 16) |
          ((r->conv_opd0_lf_pad & ((1u << 4) - 1)) << 20) |
          ((r->conv_opd0_rt_pad & ((1u << 4) - 1)) << 24) |
          ((r->conv_op_x_str & ((1u << 4) - 1)) << 28);
  p[9] = ((r->opd0_h >> 4) & ((1u << 8) - 1)) |
         ((r->opd0_w & ((1u << 12) - 1)) << 8) |
         ((r->opd1_h & ((1u << 12) - 1)) << 20);
  p[8] = (r->opd2_addr & ((1u << 16) - 1)) |
         ((r->opd0_c & ((1u << 12) - 1)) << 16) |
         ((r->opd0_h & ((1u << 4) - 1)) << 28);
  p[7] = (r->opd1_addr & ((1u << 16) - 1)) |
         ((r->rsvd1 & ((1u << 16) - 1)) << 16);
  p[6] = ((r->res0_addr >> 16) & ((1u << 8) - 1)) |
         ((r->opd0_addr & ((1u << 24) - 1)) << 8);
  p[5] = ((r->res0_h >> 8) & ((1u << 4) - 1)) |
         ((r->res0_w & ((1u << 12) - 1)) << 4) |
         ((r->res0_addr & ((1u << 16) - 1)) << 16);
  p[4] = (r->res0_n & ((1u << 12) - 1)) |
         ((r->res0_c & ((1u << 12) - 1)) << 12) |
         ((r->res0_h & ((1u << 8) - 1)) << 24);
  p[3] = ((r->conv_opd0_y_ins0 >> 3) & 1) |
         ((r->conv_opd0_x_ins0_last & ((1u << 4) - 1)) << 1) |
         ((r->conv_opd0_y_ins0_last & ((1u << 4) - 1)) << 5) |
         ((r->conv_opd1_x_ins0 & ((1u << 4) - 1)) << 9) |
         ((r->conv_opd1_y_ins0 & ((1u << 4) - 1)) << 13) |
         ((r->opd0_ins_val & ((1u << 8) - 1)) << 17) |
         ((r->ps32_md & ((1u << 2) - 1)) << 25) |
         ((r->double_conv & 1) << 27) |
         ((r->rsvd0 & ((1u << 4) - 1)) << 28);
  p[2] = (r->opt_relu & 1) |
         ((r->opt_left_tran & 1) << 1) |
         ((r->opt_chl_quan & 1) << 2) |
         ((r->tens_mdsum & 1) << 3) |
         ((r->tens_lookup & 1) << 4) |
         ((r->opt_res0_sign & 1) << 5) |
         ((r->opt_opd0_sign & 1) << 6) |
         ((r->opt_opd1_sign & 1) << 7) |
         ((r->opt_opd2_sign & 1) << 8) |
         ((r->opt_res0_int8 & 1) << 9) |
         ((r->opt_opd0_int8 & 1) << 10) |
         ((r->opt_opd1_int8 & 1) << 11) |
         ((r->opt_opd2_int8 & 1) << 12) |
         ((r->opt_opd0_const & 1) << 13) |
         ((r->opt_opd1_const & 1) << 14) |
         ((r->opt_opd2_const & 1) << 15) |
         ((r->short_nchwstr_same & 1) << 16) |
         ((r->short_res0_str & ((1u << 2) - 1)) << 17) |
         ((r->short_opd0_str & ((1u << 2) - 1)) << 19) |
         ((r->short_opd1_str & ((1u << 2) - 1)) << 21) |
         ((r->short_opd2_str & ((1u << 2) - 1)) << 23) |
         ((r->conv_opd0_x_ins0 & ((1u << 4) - 1)) << 25) |
         ((r->conv_opd0_y_ins0 & ((1u << 3) - 1)) << 29);
  p[1] = ((r->cmd_id_gdma >> 13) & ((1u << 3) - 1)) |
         ((r->cmd_keep & 1) << 3) |
         ((r->cmd_intr_en & 1) << 4) |
         ((r->tsk_typ & ((1u << 4) - 1)) << 5) |
         ((r->tsk_eu_typ & ((1u << 8) - 1)) << 9) |
         ((r->tsk_opd_num & ((1u << 2) - 1)) << 17) |
         ((r->opt_right_shift & ((1u << 5) - 1)) << 19) |
         ((r->opt_left_shift & ((1u << 5) - 1)) << 24) |
         ((r->opt_shift_typ & 1) << 29) |
         ((r->opt_rshift_typ & 1) << 30) |
         ((r->opt_res_add & 1) << 31);
  /* word 0 is written last: it holds the cmd_en bit */
  p[0] = (r->cmd_en & 1) |
         ((r->cmd_end & 1) << 1) |
         ((r->cmd_id_en & 1) << 2) |
         ((r->cmd_id_tpu & ((1u << 16) - 1)) << 3) |
         ((r->cmd_id_gdma & ((1u << 13) - 1)) << 19);
}
|
||||
|
||||
/* Load *r with the TIU descriptor defaults: unit shapes/strides of 1
 * or 16, shift/quantization defaults (tsk_opd_num=3, right shift 10,
 * left shift 2), relu enabled, int8 modes enabled, everything else
 * cleared.  Values are identical to the generated reset sequence;
 * binary literals are written as plain decimal. */
static inline void reset_tiu_reg(tiu_reg_t *r)
{
  const tiu_reg_t defaults = {
    .cmd_en = 0,
    .cmd_end = 0,
    .cmd_id_en = 0,
    .cmd_id_tpu = 0,
    .cmd_id_gdma = 0,
    .cmd_keep = 0,
    .cmd_intr_en = 0,
    .tsk_typ = 0,
    .tsk_eu_typ = 0,
    .tsk_opd_num = 3,
    .opt_right_shift = 10,
    .opt_left_shift = 2,
    .opt_shift_typ = 1,
    .opt_rshift_typ = 1,
    .opt_res_add = 0,
    .opt_relu = 1,
    .opt_left_tran = 0,
    .opt_chl_quan = 0,
    .tens_mdsum = 0,
    .tens_lookup = 0,
    .opt_res0_sign = 0,
    .opt_opd0_sign = 0,
    .opt_opd1_sign = 1,
    .opt_opd2_sign = 1,
    .opt_res0_int8 = 1,
    .opt_opd0_int8 = 1,
    .opt_opd1_int8 = 1,
    .opt_opd2_int8 = 0,
    .opt_opd0_const = 0,
    .opt_opd1_const = 0,
    .opt_opd2_const = 0,
    .short_nchwstr_same = 0,
    .short_res0_str = 0,
    .short_opd0_str = 0,
    .short_opd1_str = 0,
    .short_opd2_str = 0,
    .conv_opd0_x_ins0 = 0,
    .conv_opd0_y_ins0 = 0,
    .conv_opd0_x_ins0_last = 0,
    .conv_opd0_y_ins0_last = 0,
    .conv_opd1_x_ins0 = 0,
    .conv_opd1_y_ins0 = 0,
    .opd0_ins_val = 0,
    .ps32_md = 0,
    .double_conv = 0,
    .rsvd0 = 0,
    .res0_n = 1,
    .res0_c = 1,
    .res0_h = 1,
    .res0_w = 16,
    .res0_addr = 0,
    .opd0_addr = 0,
    .opd1_addr = 0,
    .rsvd1 = 0,
    .opd2_addr = 0,
    .opd0_c = 1,
    .opd0_h = 1,
    .opd0_w = 16,
    .opd1_h = 1,
    .opd1_w = 16,
    .conv_opd0_up_pad = 0,
    .conv_opd0_dn_pad = 0,
    .conv_opd0_lf_pad = 0,
    .conv_opd0_rt_pad = 0,
    .conv_op_x_str = 1,
    .conv_op_y_str = 1,
    .opd0_ins_fp = 0,
    .rsvd2 = 0,
    .opd0_n = 1,
    .opd1_n = 1,
    .opd1_c = 1,
    .opd2_n = 1,
    .opd2_c = 1,
    .opd2_h = 1,
    .opd2_w = 16,
    .quan_m = 0,
    .opd_typ = 0,
    .fp_round_typ = 0,
    .rsvd7 = 0,
    .rsvd3 = 0,
    .res0_n_str = 16,
    .res0_c_str = 16,
    .res0_h_str = 0,
    .res0_w_str = 1,
    .res0_b_str = 16,
    .opd0_n_str = 16,
    .opd0_c_str = 16,
    .rsvd4 = 0,
    .opd0_h_str = 0,
    .opd0_w_str = 1,
    .opd0_b_str = 16,
    .opd1_n_str = 16,
    .opd1_c_str = 16,
    .opd1_h_str = 0,
    .opd1_w_str = 1,
    .rsvd5 = 0,
    .opd1_b_str = 16,
    .opd2_n_str = 16,
    .opd2_c_str = 16,
    .opd2_h_str = 0,
    .opd2_w_str = 1,
    .opd2_b_str = 16,
    .layer_info = 0,
    .rsvd6 = 0,
  };
  *r = defaults;
}
|
||||
|
||||
/* Dump every field of a TIU command register descriptor to stdout, one
 * "name: hex value" line per field, preceded by a "--- tag ---" banner.
 * Field order follows the hardware register layout used by the reset
 * routine above; purely diagnostic, no side effects on *r. */
static inline void trace_tiu_reg(tiu_reg_t *r, const char *tag)
{
/* Helper macro: print one field by name; widened to ullong so the one
 * format string works for every field width. */
#define trace_one_reg(name) \
  printf(" %s: 0x%llx\n", #name, (ullong)r->name)

  printf("--- %s ---\n", tag);
  /* command / task control fields */
  trace_one_reg(cmd_en);
  trace_one_reg(cmd_end);
  trace_one_reg(cmd_id_en);
  trace_one_reg(cmd_id_tpu);
  trace_one_reg(cmd_id_gdma);
  trace_one_reg(cmd_keep);
  trace_one_reg(cmd_intr_en);
  trace_one_reg(tsk_typ);
  trace_one_reg(tsk_eu_typ);
  trace_one_reg(tsk_opd_num);
  /* option / quantization flags */
  trace_one_reg(opt_right_shift);
  trace_one_reg(opt_left_shift);
  trace_one_reg(opt_shift_typ);
  trace_one_reg(opt_rshift_typ);
  trace_one_reg(opt_res_add);
  trace_one_reg(opt_relu);
  trace_one_reg(opt_left_tran);
  trace_one_reg(opt_chl_quan);
  trace_one_reg(tens_mdsum);
  trace_one_reg(tens_lookup);
  trace_one_reg(opt_res0_sign);
  trace_one_reg(opt_opd0_sign);
  trace_one_reg(opt_opd1_sign);
  trace_one_reg(opt_opd2_sign);
  trace_one_reg(opt_res0_int8);
  trace_one_reg(opt_opd0_int8);
  trace_one_reg(opt_opd1_int8);
  trace_one_reg(opt_opd2_int8);
  trace_one_reg(opt_opd0_const);
  trace_one_reg(opt_opd1_const);
  trace_one_reg(opt_opd2_const);
  /* stride-mode selectors */
  trace_one_reg(short_nchwstr_same);
  trace_one_reg(short_res0_str);
  trace_one_reg(short_opd0_str);
  trace_one_reg(short_opd1_str);
  trace_one_reg(short_opd2_str);
  /* convolution insertion / padding parameters */
  trace_one_reg(conv_opd0_x_ins0);
  trace_one_reg(conv_opd0_y_ins0);
  trace_one_reg(conv_opd0_x_ins0_last);
  trace_one_reg(conv_opd0_y_ins0_last);
  trace_one_reg(conv_opd1_x_ins0);
  trace_one_reg(conv_opd1_y_ins0);
  trace_one_reg(opd0_ins_val);
  trace_one_reg(ps32_md);
  trace_one_reg(double_conv);
  trace_one_reg(rsvd0);
  /* result / operand shapes and addresses */
  trace_one_reg(res0_n);
  trace_one_reg(res0_c);
  trace_one_reg(res0_h);
  trace_one_reg(res0_w);
  trace_one_reg(res0_addr);
  trace_one_reg(opd0_addr);
  trace_one_reg(opd1_addr);
  trace_one_reg(rsvd1);
  trace_one_reg(opd2_addr);
  trace_one_reg(opd0_c);
  trace_one_reg(opd0_h);
  trace_one_reg(opd0_w);
  trace_one_reg(opd1_h);
  trace_one_reg(opd1_w);
  trace_one_reg(conv_opd0_up_pad);
  trace_one_reg(conv_opd0_dn_pad);
  trace_one_reg(conv_opd0_lf_pad);
  trace_one_reg(conv_opd0_rt_pad);
  trace_one_reg(conv_op_x_str);
  trace_one_reg(conv_op_y_str);
  trace_one_reg(opd0_ins_fp);
  trace_one_reg(rsvd2);
  trace_one_reg(opd0_n);
  trace_one_reg(opd1_n);
  trace_one_reg(opd1_c);
  trace_one_reg(opd2_n);
  trace_one_reg(opd2_c);
  trace_one_reg(opd2_h);
  trace_one_reg(opd2_w);
  trace_one_reg(quan_m);
  trace_one_reg(opd_typ);
  trace_one_reg(fp_round_typ);
  trace_one_reg(rsvd7);
  trace_one_reg(rsvd3);
  /* stride fields */
  trace_one_reg(res0_n_str);
  trace_one_reg(res0_c_str);
  trace_one_reg(res0_h_str);
  trace_one_reg(res0_w_str);
  trace_one_reg(res0_b_str);
  trace_one_reg(opd0_n_str);
  trace_one_reg(opd0_c_str);
  trace_one_reg(rsvd4);
  trace_one_reg(opd0_h_str);
  trace_one_reg(opd0_w_str);
  trace_one_reg(opd0_b_str);
  trace_one_reg(opd1_n_str);
  trace_one_reg(opd1_c_str);
  trace_one_reg(opd1_h_str);
  trace_one_reg(opd1_w_str);
  trace_one_reg(rsvd5);
  trace_one_reg(opd1_b_str);
  trace_one_reg(opd2_n_str);
  trace_one_reg(opd2_c_str);
  trace_one_reg(opd2_h_str);
  trace_one_reg(opd2_w_str);
  trace_one_reg(opd2_b_str);
  trace_one_reg(layer_info);
  trace_one_reg(rsvd6);
}
|
||||
#endif /* BM1880v2_TIU_REG_V2_11_H */
|
||||
37
cvikernel/include/bmkernel/bm1880v2/bm1880v2_tpu_cfg.h
Normal file
37
cvikernel/include/bmkernel/bm1880v2/bm1880v2_tpu_cfg.h
Normal file
@ -0,0 +1,37 @@
|
||||
#ifndef __BM1880V2_TPU_CFG__
#define __BM1880V2_TPU_CFG__

/* BM1880v2 TPU hardware configuration.
 * All *_NUM / *_SIZE values are derived as powers of two from the
 * corresponding *_SHIFT constants below. */

#define BM1880V2_VER 18802
#define BM1880V2_HW_NPU_SHIFT 5          /* 2^5  = 32 NPU lanes */
#define BM1880V2_HW_EU_SHIFT 4           /* 2^4  = 16 execution units */
#define BM1880V2_HW_LMEM_SHIFT 15        /* 2^15 = 32 KiB local memory */
#define BM1880V2_HW_LMEM_BANKS 8
#define BM1880V2_HW_LMEM_BANK_SIZE 0x1000
#define BM1880V2_HW_NODE_CHIP_SHIFT 0    /* single-chip configuration */
#define BM1880V2_HW_NPU_NUM (1 << BM1880V2_HW_NPU_SHIFT)
#define BM1880V2_HW_EU_NUM (1 << BM1880V2_HW_EU_SHIFT)
#define BM1880V2_HW_LMEM_SIZE (1 << BM1880V2_HW_LMEM_SHIFT)
#define BM1880V2_HW_NODE_CHIP_NUM (1 << BM1880V2_HW_NODE_CHIP_SHIFT)

/* Compile-time sanity check: banks * bank size must equal the total
 * local-memory size declared above. */
#if (BM1880V2_HW_LMEM_SIZE != (BM1880V2_HW_LMEM_BANK_SIZE * BM1880V2_HW_LMEM_BANKS))
#error "Set wrong TPU configuraiton."
#endif

/* Global (DRAM) memory window: 4 GiB starting at 4 GiB. */
#define BM1880V2_GLOBAL_MEM_START_ADDR 0x100000000
#define BM1880V2_GLOBAL_MEM_SIZE 0x100000000

/* Command-buffer carve-outs at the start of global memory: 20 MiB each
 * for TIU and TDMA; the remainder is the general allocation pool. */
#define BM1880V2_GLOBAL_TIU_CMDBUF_ADDR 0x00000000
#define BM1880V2_GLOBAL_TDMA_CMDBUF_ADDR 0x01400000
#define BM1880V2_GLOBAL_TIU_CMDBUF_RESERVED_SIZE 0x01400000
#define BM1880V2_GLOBAL_TDMA_CMDBUF_RESERVED_SIZE 0x01400000
#define BM1880V2_GLOBAL_POOL_RESERVED_SIZE (BM1880V2_GLOBAL_MEM_SIZE - BM1880V2_GLOBAL_TIU_CMDBUF_RESERVED_SIZE - BM1880V2_GLOBAL_TDMA_CMDBUF_RESERVED_SIZE)

#define BM1880V2_UART_CTLR_BASE_ADDR 0x04140000

/* Engine MMIO windows (4 KiB each). */
#define BM1880V2_TDMA_ENGINE_BASE_ADDR 0x0C100000
#define BM1880V2_TDMA_ENGINE_END_ADDR (BM1880V2_TDMA_ENGINE_BASE_ADDR + 0x1000)

#define BM1880V2_TIU_ENGINE_BASE_ADDR 0x0C101000 // "NPS Register" in memory map? -- TODO confirm name
#define BM1880V2_TIU_ENGINE_END_ADDR (BM1880V2_TIU_ENGINE_BASE_ADDR + 0x1000)

#endif
|
||||
708
cvikernel/include/bmkernel/bm1880v2/bm_vlc_compress.h
Normal file
708
cvikernel/include/bmkernel/bm1880v2/bm_vlc_compress.h
Normal file
@ -0,0 +1,708 @@
|
||||
#ifndef __BM_VLC_COMPRESS_H__
|
||||
#define __BM_VLC_COMPRESS_H__
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
#define MAX_UNARY_FIELD_SIZE 47
|
||||
#define MAX_ORDER_K 5
|
||||
|
||||
/*
 * Worst-case size (in bytes) of the output bit-stream buffer for a
 * VLC-compressed payload of `in_size` input bytes.
 * data_type: 0 means 8-bit elements, 1 means 16-bit (bf16) elements.
 * Layout accounted for: padded payload + 16-byte-aligned k-map + 16-byte header.
 */
static inline size_t get_out_bs_buf_size(uint64_t in_size, uint8_t data_type) {
  size_t num_blocks;
  if (data_type)
    num_blocks = (in_size + 31) >> 5;   /* bf16: 32 input bytes per block */
  else
    num_blocks = (in_size + 15) >> 4;   /* int8: 16 input bytes per block */
  size_t padded_payload = num_blocks << (4 + data_type);
  size_t kmap_area = ceiling_func(num_blocks, 16) << 4;  /* 1 byte/block, 16-byte aligned */
  return padded_payload + kmap_area + 16;                /* +16-byte header */
}
|
||||
|
||||
/* Per-stream VLC parameters; serialized into the 16-byte stream header by
 * vlc_enc_header() and recovered by vlc_dec_header*(). */
typedef struct
{
    uint8_t signedness;    /* int8 payloads: 1 = signed samples, use two-side remap */
    uint8_t is_bfloat16;   /* 0 = int8 payload, 1 = bf16 payload */
    uint8_t bias0;         /* remap bias (positive side for int8; exponent mean for bf16) */
    uint8_t bias1;         /* remap bias, negative side (int8 only; 7 bits in header) */
    uint8_t zero_guard_en; /* bf16 only: reserve symbol 0 for zero exponents */
} CommandInfo;
|
||||
/* Bit-granular cursor over a byte buffer; bits are written/read LSB-first
 * within each byte (see write_stream/parse_stream). */
typedef struct
{
    uint8_t *stream; // stream buffer pointer
    int bit_pos;     // current cursor position (in bits from the buffer start)
    int buf_size;    // buffer capacity, in bytes
} StreamBuffer;
|
||||
|
||||
static inline int8_t two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1);
|
||||
static inline int8_t inv_two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1);
|
||||
static inline uint8_t center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard);
|
||||
static inline uint8_t inv_center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard);
|
||||
|
||||
static inline void init_stream(StreamBuffer *bs, const uint8_t *buf, int buf_size, uint8_t read_only);
|
||||
|
||||
static inline void bm_vlc_est_weight_bias(const uint8_t *ibuf, size_t isz, uint8_t signedness, uint8_t isBfloat16, CommandInfo *cmd_info);
|
||||
static inline void bm_vlc_enc_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info);
|
||||
static inline void bm_vlc_dec_int8_ext(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *bs_size);
|
||||
static inline void bm_vlc_dec_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf);
|
||||
static inline void bm_vlc_enc_bf16(const uint16_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info);
|
||||
static inline void bm_vlc_dec_bf16_ext(const uint8_t *ibuf, size_t isz, uint16_t *obuf, size_t *bs_size);
|
||||
static inline void bm_vlc_dec_bf16(const uint8_t *ibuf, size_t isz, uint16_t *obuf);
|
||||
|
||||
/* Return bit `bit_idx` (0 = LSB) of byte `byte_idx` in `buf`, as 0 or 1. */
static inline uint8_t get_bit_val(uint8_t *buf, int byte_idx, int bit_idx)
{
    uint8_t byte = buf[byte_idx];
    return (byte >> bit_idx) & 0x1;
}
|
||||
|
||||
/* Zig-zag map a signed byte onto an unsigned code:
 * 0 -> 0, +v -> 2v, -v -> 2v-1 (so small magnitudes get small codes). */
static inline uint8_t sign_to_unsign(uint8_t val)
{
    uint8_t is_negative = (val >> 7) & 0x1;
    int magnitude = abs((int8_t)val);
    return (magnitude << 1) - is_negative;
}
|
||||
|
||||
/* Inverse zig-zag map: 0 -> 0, even code 2v -> +v, odd code 2v-1 -> -v.
 * Exact inverse of sign_to_unsign(). */
static inline int8_t unsign_to_sign(uint8_t val)
{
    uint8_t sign_i = val & 0x1;
    int abs_data_i = (((int)val) + 1) >> 1;
    /* FIX: cast directly to the declared int8_t return type. The original
     * cast to uint8_t produced the same values only via an additional
     * implementation-defined unsigned->signed conversion at the return. */
    return (int8_t)((sign_i == 1) ? (-abs_data_i) : abs_data_i);
}
|
||||
|
||||
/* Split `isz` bf16 words into two byte planes:
 * exp[i]  = the 8 exponent bits (bits 14..7 of the word);
 * frac[i] = sign bit (bit 15) in bit 7, mantissa (bits 6..0) in bits 6..0. */
static inline void dispatch_bf16_data(const uint16_t *bf16_in, uint8_t *exp, uint8_t *frac, size_t isz)
{
    size_t idx;
    for (idx = 0; idx < isz; idx++)
    {
        uint16_t word = bf16_in[idx];
        exp[idx] = (uint8_t)((word >> 7) & 0xFF);
        frac[idx] = (uint8_t)(((word >> 15) << 7) | (word & 0x7F));
    }
}
|
||||
|
||||
/* Recombine the exponent and sign/mantissa byte planes (see
 * dispatch_bf16_data) back into `isz` bf16 words. */
static inline void merge_bf16_data(const uint8_t *exp_in, const uint8_t *frac_in, uint16_t *bf16_out, size_t isz)
{
    /* FIX: the original memset cleared only sizeof(uint16_t) bytes (one
     * element) regardless of isz; clear the whole output range. Benign in
     * practice because the loop overwrites every element, but the size
     * expression was clearly wrong. */
    memset(bf16_out, 0, sizeof(uint16_t) * isz);
    for (size_t i = 0; i < isz; i++)
    {
        /* sign from frac bit 7, exponent into bits 14..7, mantissa bits 6..0 */
        bf16_out[i] = ((frac_in[i] >> 7) << 15) | (exp_in[i] << 7) | (frac_in[i] & 0x7F);
    }
}
|
||||
|
||||
// -- streaming operation handler --
|
||||
/* Attach a StreamBuffer cursor to `buf` (capacity `buf_size` bytes) and
 * rewind it to bit 0. When opened for writing (read_only == 0) the backing
 * buffer is zero-filled so write_stream can OR bits in. */
static inline void init_stream(StreamBuffer *bs, const uint8_t *buf, int buf_size, uint8_t read_only)
{
    bs->stream = (uint8_t *)buf;  /* const is cast away for write-mode streams */
    bs->buf_size = buf_size;
    bs->bit_pos = 0;
    if (!read_only)
        memset((uint8_t *)buf, 0, sizeof(uint8_t) * buf_size);
}
|
||||
|
||||
/* Append the low `bit_len` bits of `src` (LSB-first within each byte) at the
 * stream's current bit cursor, then advance the cursor. Destination bytes
 * must start zeroed (init_stream write mode) since bits are OR-ed in. */
static inline void write_stream(StreamBuffer *bs, uint8_t *src, int bit_len)
{
    int pos;
    for (pos = 0; pos < bit_len; pos++)
    {
        int out_bit = bs->bit_pos + pos;
        uint8_t bit = get_bit_val(src, pos / 8, pos % 8);
        bs->stream[out_bit / 8] |= (uint8_t)(bit << (out_bit % 8));
    }
    bs->bit_pos += bit_len;
}
|
||||
|
||||
/* Advance the bit cursor by `bit_len` without reading or writing any data
 * (used to skip reserved header fields). */
static inline void move_stream_ptr(StreamBuffer *bs, int bit_len)
{
    bs->bit_pos = bs->bit_pos + bit_len;
}
|
||||
|
||||
/* Read `bit_len` bits from the stream's current cursor into `dest`
 * (LSB-first within each byte), then advance the cursor. */
static inline void parse_stream(StreamBuffer *bs, uint8_t *dest, int bit_len)
{
    /* FIX: the original size expression `sizeof(uint8_t) * (bit_len + 7) >> 3`
     * only computed the intended byte count because `*` binds tighter than
     * `>>` and sizeof(uint8_t) == 1; spell the byte count out explicitly. */
    memset(dest, 0, (size_t)((bit_len + 7) >> 3));
    for (int bit = 0; bit < bit_len; bit++)
    {
        int dest_byte_i = bit / 8;
        int dest_bit_i = bit % 8;
        int bs_byte_i = (bs->bit_pos + bit) / 8;
        int bs_bit_i = (bs->bit_pos + bit) % 8;
        dest[dest_byte_i] |= (get_bit_val(bs->stream, bs_byte_i, bs_bit_i) << dest_bit_i);
    }
    bs->bit_pos += bit_len;
}
|
||||
|
||||
// -- header read/write operation handler --
|
||||
/* Serialize the 48-bit stream header (bit layout in the comments below).
 * NOTE(review): fields are written from the raw bytes of the host objects,
 * so this assumes a little-endian host for blk_bs_size -- TODO confirm. */
static inline void vlc_enc_header(StreamBuffer *bs_header, CommandInfo *cmd_info, size_t blk_bs_size)
{
    write_stream(bs_header, (uint8_t *)&blk_bs_size, 24); // bit[23:0] compressed block stream size
    move_stream_ptr(bs_header, 4); // bit[27:24] reserved
    write_stream(bs_header, (uint8_t *)&cmd_info->signedness, 1); // bit[28] signedness
    write_stream(bs_header, (uint8_t *)&cmd_info->is_bfloat16, 1); // bit[29] data type
    move_stream_ptr(bs_header, 2); // bit[31:30] bit depth
    write_stream(bs_header, (uint8_t *)&cmd_info->bias0, 8); // bit[39:32] bias0 for symbol remapping
    write_stream(bs_header, (uint8_t *)&cmd_info->bias1, 7); // bit[46:40] bias1 for symbol remapping
    write_stream(bs_header, (uint8_t *)&cmd_info->zero_guard_en, 1); // bit[47] zero guard
}
|
||||
|
||||
/* Parse the stream header into cmd_info, skipping the 24-bit stream size and
 * the reserved bits (bits 0..27) in one move. Mirror of vlc_enc_header;
 * see vlc_dec_header_ext when the stream size is also needed. */
static inline void vlc_dec_header(StreamBuffer *bs_header, CommandInfo *cmd_info)
{
    move_stream_ptr(bs_header, 28); // skip bit[23:0] size + bit[27:24] reserved
    parse_stream(bs_header, (uint8_t *)&cmd_info->signedness, 1); // bit[28] signedness
    parse_stream(bs_header, (uint8_t *)&cmd_info->is_bfloat16, 1); // bit[29] data type
    move_stream_ptr(bs_header, 2); // bit[31:30] bit depth (unused here)
    parse_stream(bs_header, (uint8_t *)&cmd_info->bias0, 8); // bit[39:32] bias0 for symbol remapping
    parse_stream(bs_header, (uint8_t *)&cmd_info->bias1, 7); // bit[46:40] bias1 for symbol remapping
    parse_stream(bs_header, (uint8_t *)&cmd_info->zero_guard_en, 1); // bit[47] zero guard
}
|
||||
|
||||
/* Parse the stream header like vlc_dec_header, additionally returning the
 * 24-bit compressed block-stream size through *bs_size. The caller must
 * zero *bs_size's upper bytes beforehand (parse_stream only clears the
 * 3 bytes it fills). */
static inline void vlc_dec_header_ext(StreamBuffer *bs_header, CommandInfo *cmd_info, size_t *bs_size)
{
    parse_stream(bs_header, (uint8_t *)bs_size, 24); // bit[23:0] compressed block stream size
    move_stream_ptr(bs_header, 4); // bit[27:24] reserved
    parse_stream(bs_header, (uint8_t *)&cmd_info->signedness, 1); // bit[28] signedness
    parse_stream(bs_header, (uint8_t *)&cmd_info->is_bfloat16, 1); // bit[29] data type
    move_stream_ptr(bs_header, 2); // bit[31:30] bit depth (unused here)
    parse_stream(bs_header, (uint8_t *)&cmd_info->bias0, 8); // bit[39:32] bias0 for symbol remapping
    parse_stream(bs_header, (uint8_t *)&cmd_info->bias1, 7); // bit[46:40] bias1 for symbol remapping
    parse_stream(bs_header, (uint8_t *)&cmd_info->zero_guard_en, 1); // bit[47] zero guard
}
|
||||
|
||||
// -- symbol remmaping handler --
|
||||
/* Remap a bf16 exponent byte so values near `bias` get small codes
 * (center-shift around the mean exponent, zig-zag folded to unsigned).
 * When zero_guard is set, code 0 stays reserved for val == 0 and all other
 * codes are shifted up by one. Values outside the symmetric range around
 * `bias` keep/receive codes past the folded range.
 * NOTE(review): shift_data_i is int16_t but sign_to_unsign takes uint8_t,
 * so only the low byte is used -- presumably intentional; confirm against
 * inv_center_shift. */
static inline uint8_t center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard)
{
    if (val == 0 && zero_guard)
        return 0;

    int16_t shift_data_i = val - bias;
    /* symmetric fold width around bias */
    uint8_t range = (bias <= 128) ? bias : 255 - bias;
    if (bias <= 128)
    {
        /* values beyond the folded region pass through unchanged */
        return (val >= (range << 1)) ? val : sign_to_unsign(shift_data_i) + zero_guard;
    }
    else
    {
        /* mirror case: fold the low tail instead of the high tail */
        return (val < (bias - range)) ? (range + bias - val + zero_guard) : (sign_to_unsign(shift_data_i) + zero_guard);
    }
}
|
||||
|
||||
/* Inverse of center_shift: map a remapped exponent code back to the
 * original exponent byte. zero_guard semantics mirror the encoder: code 0
 * decodes to 0 and all other codes are shifted down by one first. */
static inline uint8_t inv_center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard)
{
    if (val == 0 && zero_guard)
        return 0;

    uint8_t unsign_data_i = val - zero_guard;
    /* symmetric fold width around bias (same as the encoder) */
    uint8_t range = (bias <= 128) ? bias : 255 - bias;
    if (bias <= 128)
    {
        /* codes beyond the folded region passed through unchanged */
        return (val >= (range << 1)) ? val : unsign_to_sign(unsign_data_i) + bias;
    }
    else
    {
        /* mirror case: undo the low-tail fold */
        return (unsign_data_i > (range << 1)) ? (range + bias - val + zero_guard) : unsign_to_sign(unsign_data_i) + bias;
    }
}
|
||||
|
||||
/* Shrink a signed sample's magnitude by the side-specific bias (bias0 for
 * positives, bias1 for negatives), wrapping circularly so the result stays
 * nonzero with the same sign. Zero is a fixed point. */
static inline int8_t two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1)
{
    if (val == 0)
        return 0;

    uint8_t negative = (val < 0);
    int32_t magnitude = abs(val);
    if (negative)
        magnitude -= bias1;
    else
        magnitude -= bias0;
    /* wrap underflow back to the top of the magnitude range
     * (127 for positives, 128 for negatives) */
    if (magnitude <= 0)
        magnitude += 127 + negative;
    return negative ? -magnitude : magnitude;
}
|
||||
|
||||
/* Inverse of two_side_circular_shift: re-add the side-specific bias and
 * undo the circular wrap. Zero is a fixed point. */
static inline int8_t inv_two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1)
{
    if (val == 0)
        return 0;

    uint8_t negative = (val < 0);
    uint32_t magnitude = abs(val);
    magnitude += negative ? bias1 : bias0;
    /* if re-adding the bias overflowed the wrap point, subtract the range */
    int32_t wrapped = magnitude - (127 + negative);
    uint8_t low_byte = ((wrapped <= 0) ? magnitude : (uint32_t)wrapped) & 0xFF;
    return negative ? -low_byte : low_byte;
}
|
||||
|
||||
/* Remap one 16-byte block so frequent symbols get small unsigned codes
 * before Golomb-Rice coding. Mode selection:
 *   - unsigned int8 data: identity (copy through);
 *   - bf16 exponents (is_bf16_exp): center-shift around bias0;
 *   - signed int8 data: two-side circular shift, then zig-zag to unsigned.
 * blk_in/blk_out must each hold 16 bytes. */
static inline void symbol_remapping(uint8_t *blk_in, uint8_t *blk_out, uint8_t bias0, uint8_t bias1, uint8_t signedness, uint8_t is_bf16_exp, uint8_t zero_guard)
{
    if (is_bf16_exp == false && signedness == false)
    {
        // remapping bypass
        memcpy(blk_out, blk_in, sizeof(uint8_t) * 16);
        return;
    }

    if (is_bf16_exp == true)
    {
        // center circular shift (bias1 unused in this mode)
        for (int i = 0; i < 16; i++)
        {
            blk_out[i] = center_shift(blk_in[i], bias0, zero_guard);
        }
    }
    else
    {
        // two-side circular shift, then fold sign into an unsigned code
        for (int i = 0; i < 16; i++)
        {
            int8_t shift_data_i = two_side_circular_shift((int8_t)blk_in[i], bias0, bias1);
            blk_out[i] = sign_to_unsign(shift_data_i);
        }
    }
}
|
||||
|
||||
/* Inverse of symbol_remapping: map one 16-byte block of unsigned codes back
 * to the original samples using the same mode selection (bypass / inverse
 * center-shift / inverse two-side shift). blk_in/blk_out hold 16 bytes. */
static inline void inv_symbol_remapping(uint8_t *blk_in, uint8_t *blk_out, uint8_t bias0, uint8_t bias1, uint8_t signedness, uint8_t is_bf16_exp, uint8_t zero_guard)
{
    if (is_bf16_exp == false && signedness == false)
    {
        // remapping bypass
        memcpy(blk_out, blk_in, sizeof(uint8_t) * 16);
        return;
    }

    if (is_bf16_exp == true)
    {
        // inverse center circular shift (bias1 unused in this mode)
        for (int i = 0; i < 16; i++)
        {
            blk_out[i] = inv_center_shift(blk_in[i], bias0, zero_guard);
        }
    }
    else
    {
        // unfold the zig-zag code, then inverse two-side circular shift
        for (int i = 0; i < 16; i++)
        {
            int8_t sign_data_i = unsign_to_sign(blk_in[i]);
            blk_out[i] = (uint8_t)inv_two_side_circular_shift(sign_data_i, bias0, bias1);
        }
    }
}
|
||||
|
||||
/* Pick the Golomb-Rice order k (0..MAX_ORDER_K) that minimizes the encoded
 * size of one 16-symbol block, or -1 when even the best k exceeds the
 * 128-bit uncompressed fallback. bf16_zvc_en adds the 4-bit zero-count
 * field that bf16 streams carry for k > 0. */
static inline int vlc_estimate_block_order(uint8_t *blk_in, uint8_t bf16_zvc_en)
{
    int best_k = 0;
    int best_bs_size = 0x7FFFFFFF;

    for (int k = 0; k <= (int)MAX_ORDER_K; k++)
    {
        uint8_t remain_field_size = k << 4;  /* k remainder bits per symbol, 16 symbols */
        int unary_field_len = 0;
        for (int i = 0; i < 16; i++)
        {
            uint8_t group_idx = blk_in[i] >> k;  /* quotient -> unary length - 1 */
            unary_field_len += (group_idx + 1);
        }
        int znum_bit = (bf16_zvc_en && k > 0) ? 4 : 0;
        /* a unary field over the hardware limit disqualifies this k (255 > 128) */
        int blk_size = (unary_field_len <= MAX_UNARY_FIELD_SIZE)
                           ? remain_field_size + unary_field_len + znum_bit
                           : 255;
        if (blk_size < best_bs_size)
        {
            best_k = k;
            best_bs_size = blk_size;
        }
    }

    /* fall back to raw 128-bit storage when compression would not help */
    best_k = (best_bs_size > 128) ? -1 : best_k;
    return best_k;
}
|
||||
// -- vlc block parrelel GR encode/decode --
|
||||
/* Golomb-Rice encode one 16-symbol block into `bs` and return the 5-bit
 * unary-length code stored in the k-map (unary_field_len - 16, mod 32).
 * order_k == -1 writes the block uncompressed (128 bits) and returns 128.
 * Layout per block: k*16 remainder bits (bit-plane interleaved), optional
 * 4-bit zero count (bf16 zero-value compression, k > 0), then the unary
 * field terminated per symbol by a 1 bit. */
static inline uint8_t vlc_gr_enc_block_data(uint8_t *blk_in, StreamBuffer *bs, int order_k, uint8_t bf16_zvc_en)
{
    // uncompressed mode
    if (order_k == -1)
    {
        write_stream(bs, blk_in, 128);
        return 128;
    }

    // remain field
    uint8_t remain_field[16] = {0};
    uint8_t unary_field[8] = {0};   /* 47-bit max fits in 6 bytes */
    uint8_t sym_end_pos[16] = {0};
    uint8_t unary_field_len = 0;
    int sym_end_pos_accum = -1;

    // bit plane encode for remain field: plane k holds bit k of symbols
    // 0..7 in one byte and of symbols 8..15 in the next
    for (int k = 0; k < order_k; k++)
    {
        uint8_t bit_plane0 = 0, bit_plane1 = 0;
        for (int i = 0; i < 8; i++)
        {
            bit_plane0 |= (get_bit_val(blk_in, i, k) << i);
            bit_plane1 |= (get_bit_val(blk_in, i + 8, k) << i);
        }
        remain_field[k << 1] = bit_plane0;
        remain_field[(k << 1) + 1] = bit_plane1;
    }
    write_stream(bs, remain_field, order_k << 4);

    if (bf16_zvc_en && order_k > 0)
    {
        // 4-bit count of zero symbols (must be < 16 to fit)
        int zero_num = 0;
        for (int i = 0; i < 16; i++)
        {
            if (blk_in[i] == 0)
                zero_num++;
        }
        assert(zero_num < 16);
        write_stream(bs, (uint8_t *)&zero_num, 4);
    }

    // unary encode: set a 1 bit at each symbol's terminating position
    for (int i = 0; i < 16; i++)
    {
        int group_idx = blk_in[i] >> order_k;  /* quotient of symbol i */
        sym_end_pos_accum += (group_idx + 1);
        sym_end_pos[i] = sym_end_pos_accum;
        int byte_idx = sym_end_pos[i] / 8;
        int bit_idx = sym_end_pos[i] % 8;
        unary_field[byte_idx] |= (1 << (bit_idx));
    }
    unary_field_len = sym_end_pos[15] + 1;
    assert(unary_field_len <= MAX_UNARY_FIELD_SIZE);
    // 5-bit k-map code; minimum length is 16 (one bit per symbol)
    uint8_t ulen = (unary_field_len - 16) & 0x1F;
    write_stream(bs, unary_field, unary_field_len);

    return ulen;
}
|
||||
|
||||
/* Golomb-Rice decode one block of `bs_size` bits into 16 symbols in `rec`.
 * Mirror of vlc_gr_enc_block_data: order_k == -1 reads 128 raw bits;
 * otherwise remainder bit-planes, the optional 4-bit zero count (read and
 * discarded -- the unary field itself recovers the zeros), then the unary
 * field. */
static inline void vlc_gr_dec_block_data(StreamBuffer *bs, uint8_t bs_size, uint8_t *rec, int order_k, uint8_t bf16_zvc_en)
{
    assert(bs_size <= 128);
    // uncompressed mode
    if (order_k == -1)
    {
        parse_stream(bs, rec, 128);
        return;
    }

    // remain field
    uint8_t remain_data[16] = {0};
    uint8_t remain_bs[16] = {0};
    uint8_t unary_field[8] = {0};
    uint8_t sym_end_pos[16] = {0};
    uint8_t unary_sym[16] = {0};
    uint8_t remain_field_size = order_k << 4;

    parse_stream(bs, remain_bs, remain_field_size);
    // undo the bit-plane interleave: plane k byte 0 -> symbols 0..7,
    // byte 1 -> symbols 8..15
    for (int k = 0; k < order_k; k++)
    {
        for (int i = 0; i < 8; i++)
        {
            remain_data[i] |= (get_bit_val(remain_bs, k << 1, i) << k);
            remain_data[i + 8] |= (get_bit_val(remain_bs, (k << 1) + 1, i) << k);
        }
    }

    // zero number info (consumed to keep the cursor aligned; value unused)
    int znum_bit = (bf16_zvc_en && order_k > 0) ? 4 : 0;
    uint8_t znum = 0;
    parse_stream(bs, &znum, znum_bit);

    // unary field: each 1 bit terminates one symbol's quotient run
    uint8_t unary_field_len = bs_size - remain_field_size - znum_bit;
    parse_stream(bs, unary_field, unary_field_len);

    int sym_cnt = 0;
    for (uint8_t ubit_i = 0; ubit_i < unary_field_len; ubit_i++)
    {
        int byte_idx = ubit_i / 8;
        int bit_idx = ubit_i % 8;
        if (get_bit_val(unary_field, byte_idx, bit_idx) == 1)
        {
            sym_end_pos[sym_cnt] = ubit_i;
            sym_cnt++;
        }
    }
    // quotient of symbol i = gap between consecutive terminator positions
    unary_sym[0] = sym_end_pos[0];
    for (int i = 1; i < 16; i++)
    {
        unary_sym[i] = sym_end_pos[i] - sym_end_pos[i - 1] - 1;
    }
    // symbol = (quotient << k) + remainder
    for (int i = 0; i < 16; i++)
    {
        rec[i] = (unary_sym[i] << order_k) + remain_data[i];
    }
}
|
||||
|
||||
// -- vlc encode int8 entry funtion --
|
||||
// -- vlc encode int8 entry function --
/* Compress `isz` int8 samples from ibuf into obuf; *osz receives the total
 * output size (16-byte header + k-map + 16-byte-aligned data section).
 * obuf must be at least get_out_bs_buf_size(isz, 0) bytes.
 * NOTE(review): the calloc result is not checked for NULL -- a failed
 * allocation would crash in init_stream; confirm whether callers guarantee
 * small enough sizes. */
static inline void bm_vlc_enc_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info)
{
    StreamBuffer bs_header, bs_kmap, bs_data;
    size_t blk_num = (isz + 15) >> 4;                 /* 16 input bytes per block */
    size_t header_size = 16;
    size_t kmap_size = ceiling_func(blk_num, 16) << 4; /* 1 k-map byte per block, 16-byte aligned */
    size_t bs_buf_size = header_size + kmap_size + (blk_num << 4);
    uint8_t *bsbuf = (uint8_t *)calloc(bs_buf_size, sizeof(uint8_t));

    // block encode
    init_stream(&bs_kmap, bsbuf + header_size, kmap_size, false);
    init_stream(&bs_data, bsbuf + header_size + kmap_size, blk_num << 4, false);

    for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++)
    {
        uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0};
        /* the last block may be partial; the remainder stays zero-padded */
        size_t in_size = (blk_idx == (blk_num - 1)) ? isz - (blk_idx << 4) : 16;
        memcpy(blk_data, &ibuf[blk_idx << 4], sizeof(uint8_t) * in_size);

        symbol_remapping(blk_data, blk_sr_data, cmd_info->bias0, cmd_info->bias1, cmd_info->signedness, false, false);

        int k = vlc_estimate_block_order(blk_sr_data, false);
        uint8_t ulen = vlc_gr_enc_block_data(blk_sr_data, &bs_data, k, false);
        /* k-map byte: k in bits 7..5 (0xE0 = uncompressed), unary length code in bits 4..0 */
        uint8_t k_info = (k == -1) ? 0xE0 : (k << 5) + ulen;
        write_stream(&bs_kmap, &k_info, 8);
    }

    int blk_bs_size = ceiling_func(((bs_data.bit_pos + 7) >> 3), 16) << 4; // 16 byte align
    *osz = header_size + kmap_size + blk_bs_size;

    // write header
    init_stream(&bs_header, bsbuf, header_size, false);
    vlc_enc_header(&bs_header, cmd_info, blk_bs_size);

    memcpy(obuf, bsbuf, (*osz) * sizeof(uint8_t));
    free(bsbuf);
}
|
||||
|
||||
// -- vlc decode int8 entry funtion --
|
||||
// -- vlc decode int8 entry function --
/* Decompress a bm_vlc_enc_int8 stream back into `isz` int8 samples in obuf;
 * *bs_size receives the data-section size recorded in the header. The header
 * is sanity-checked with the project ASSERT macro. */
static inline void bm_vlc_dec_int8_ext(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *bs_size)
{
    StreamBuffer bs_header, bs_kmap, bs_data;
    CommandInfo cmd_info;
    memset(&cmd_info, 0, sizeof(CommandInfo));

    size_t blk_num = (isz + 15) >> 4;                 /* 16 output bytes per block */
    int header_size = 16;
    int kmap_size = ceiling_func(blk_num, 16) << 4;

    // parse header
    init_stream(&bs_header, ibuf, header_size, true);
    vlc_dec_header_ext(&bs_header, &cmd_info, bs_size);

    // Check whether valid header
    size_t bs_buf_size = get_out_bs_buf_size(isz, 0); // int8
    ASSERT(*bs_size <= bs_buf_size);
    ASSERT(cmd_info.is_bfloat16 == 0);

    // block decode
    init_stream(&bs_kmap, ibuf + header_size, kmap_size, true);
    init_stream(&bs_data, ibuf + header_size + kmap_size, blk_num << 4, true);

    for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++)
    {
        uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0};
        uint8_t k_info = 0;
        parse_stream(&bs_kmap, &k_info, 8);
        /* k-map byte: k in bits 7..5 (7 = uncompressed), unary length code in bits 4..0 */
        uint8_t ulen = k_info & 0x1F;
        int k = (k_info >> 5 == 7) ? -1 : k_info >> 5;
        /* per-block bit count: k*16 remainder bits + unary bits (ulen + 16) */
        int blk_bs_size = (k == -1) ? 128 : (k << 4) + ulen + 16;
        vlc_gr_dec_block_data(&bs_data, blk_bs_size, blk_data, k, false);

        inv_symbol_remapping(blk_data, blk_sr_data, cmd_info.bias0, cmd_info.bias1, cmd_info.signedness, false, false);

        /* the last block may be partial; copy only the valid tail */
        int out_size = (blk_idx == (blk_num - 1)) ? isz - (blk_idx << 4) : 16;
        memcpy(&obuf[blk_idx << 4], blk_sr_data, sizeof(uint8_t) * out_size);
    }
}
|
||||
|
||||
/* Convenience wrapper around bm_vlc_dec_int8_ext that discards the reported
 * data-section size. */
static inline void bm_vlc_dec_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf)
{
    size_t ignored_bs_size;
    bm_vlc_dec_int8_ext(ibuf, isz, obuf, &ignored_bs_size);
}
|
||||
|
||||
// -- vlc encode bfloat16 entry funtion --
|
||||
// -- vlc encode bfloat16 entry function --
/* Compress `isz` BYTES (isz/2 bf16 words) from ibuf into obuf. Each 32-byte
 * block splits into an exponent plane (GR coded with center-shift remap) and
 * a sign/mantissa plane (stored raw, with fractions of zero-exponent values
 * skipped when zero_guard_en -- implicit zero compression).
 * NOTE(review): the calloc result is not checked for NULL. */
static inline void bm_vlc_enc_bf16(const uint16_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info)
{
    StreamBuffer bs_header, bs_kmap, bs_data;
    size_t blk_num = (isz + 31) >> 5; // 32 bytes per block
    size_t header_size = 16;
    size_t kmap_size = ceiling_func(blk_num, 16) << 4;
    size_t bs_buf_size = header_size + kmap_size + (blk_num << 5);
    uint8_t *bsbuf = (uint8_t *)calloc(bs_buf_size, sizeof(uint8_t));

    // block encode
    init_stream(&bs_kmap, bsbuf + header_size, kmap_size, false);
    init_stream(&bs_data, bsbuf + header_size + kmap_size, blk_num << 5, false);

    for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++)
    {
        uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0}, blk_data_frac[16] = {0};
        /* words in this block (last block may be partial; isz is in bytes) */
        size_t in_num = (blk_idx == (blk_num - 1)) ? ((isz >> 1) - (blk_idx << 4)) : 16;
        dispatch_bf16_data(&ibuf[blk_idx << 4], blk_data, blk_data_frac, in_num);

        // exp: BGR encode
        symbol_remapping(blk_data, blk_sr_data, cmd_info->bias0, cmd_info->bias1, false, true, cmd_info->zero_guard_en);

        int k = vlc_estimate_block_order(blk_sr_data, cmd_info->zero_guard_en);
        uint8_t ulen = vlc_gr_enc_block_data(blk_sr_data, &bs_data, k, cmd_info->zero_guard_en);
        /* k-map byte: k in bits 7..5 (0xE0 = uncompressed), unary length code in bits 4..0 */
        uint8_t k_info = (k == -1) ? 0xE0 : (k << 5) + ulen;
        write_stream(&bs_kmap, &k_info, 8);

        // frac: implicit zero compression (skip fractions of zero exponents)
        for (size_t i = 0; i < 16; i++)
        {
            if (!cmd_info->zero_guard_en || blk_data[i] != 0)
            {
                write_stream(&bs_data, &blk_data_frac[i], 8);
            }
        }
    }

    int blk_bs_size = ceiling_func(((bs_data.bit_pos + 7) >> 3), 16) << 4; // 16 byte align
    *osz = header_size + kmap_size + blk_bs_size;

    // write header
    init_stream(&bs_header, bsbuf, header_size, false);
    vlc_enc_header(&bs_header, cmd_info, blk_bs_size);

    memcpy(obuf, bsbuf, (*osz) * sizeof(uint8_t));
    free(bsbuf);
}
|
||||
|
||||
// -- vlc decode bfloat16 entry funtion --
|
||||
// -- vlc decode bfloat16 entry function --
/* Decompress a bm_vlc_enc_bf16 stream back into isz/2 bf16 words in obuf
 * (`isz` is in BYTES); *bs_size receives the data-section size from the
 * header. Mirror of the encoder: GR-decode the exponent plane per block,
 * then read raw fractions, skipping those of zero exponents when
 * zero_guard_en (implicit zero compression). */
static inline void bm_vlc_dec_bf16_ext(const uint8_t *ibuf, size_t isz, uint16_t *obuf, size_t *bs_size)
{
    StreamBuffer bs_header, bs_kmap, bs_data;
    CommandInfo cmd_info;
    memset(&cmd_info, 0, sizeof(CommandInfo));

    size_t blk_num = (isz + 31) >> 5; // 32 bytes per block
    int header_size = 16;
    int kmap_size = ceiling_func(blk_num, 16) << 4;

    // parse header
    init_stream(&bs_header, ibuf, header_size, true);
    vlc_dec_header_ext(&bs_header, &cmd_info, bs_size);

    // Check whether valid header
    size_t bs_buf_size = get_out_bs_buf_size(isz, 1); // bf16
    ASSERT(*bs_size <= bs_buf_size);
    ASSERT(cmd_info.is_bfloat16 == 1);

    // block decode
    init_stream(&bs_kmap, ibuf + header_size, kmap_size, true);
    init_stream(&bs_data, ibuf + header_size + kmap_size, blk_num << 5, true);

    for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++)
    {
        uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0}, blk_data_frac[16] = {0};
        uint8_t k_info = 0;
        parse_stream(&bs_kmap, &k_info, 8);
        /* k-map byte: k in bits 7..5 (7 = uncompressed), unary length code in bits 4..0 */
        uint8_t ulen = k_info & 0x1F;
        int k = (k_info >> 5 == 7) ? -1 : k_info >> 5;
        int znum_bit = (cmd_info.zero_guard_en && k > 0) ? 4 : 0;
        /* per-block bit count: remainder + unary (ulen + 16) + zero-count field */
        uint8_t blk_bs_size = (k == -1) ? 128 : (k << 4) + ulen + 16 + znum_bit;

        // exp: BGR decode
        vlc_gr_dec_block_data(&bs_data, blk_bs_size, blk_data, k, cmd_info.zero_guard_en);

        inv_symbol_remapping(blk_data, blk_sr_data, cmd_info.bias0, cmd_info.bias1, false, true, cmd_info.zero_guard_en);

        /* words in this block (last block may be partial; isz is in bytes) */
        size_t out_num = (blk_idx == (blk_num - 1)) ? ((isz >> 1) - (blk_idx << 4)) : 16;

        // frac: implicit zero compression (zero exponents carried no fraction)
        for (size_t i = 0; i < out_num; i++)
        {
            if (!cmd_info.zero_guard_en || blk_sr_data[i] != 0)
            {
                parse_stream(&bs_data, &blk_data_frac[i], 8);
            }
        }
        merge_bf16_data(blk_sr_data, blk_data_frac, &obuf[blk_idx << 4], out_num);
    }
}
|
||||
|
||||
/* Convenience wrapper around bm_vlc_dec_bf16_ext that discards the reported
 * data-section size. */
static inline void bm_vlc_dec_bf16(const uint8_t *ibuf, size_t isz, uint16_t *obuf)
{
    size_t ignored_bs_size;
    bm_vlc_dec_bf16_ext(ibuf, isz, obuf, &ignored_bs_size);
}
|
||||
|
||||
// -- offline estimate model weight params --
|
||||
// -- offline estimate model weight params --
/* Fill cmd_info with remapping parameters estimated from the raw data:
 * signed int8 -> bias0/bias1 = smallest present positive/negative magnitudes
 * minus one (so the two-side shift pulls the histogram toward zero);
 * bf16 -> bias0 = rounded mean of the non-zero exponents, zero_guard_en set
 * when any zero exponent exists. `isz` is in bytes.
 * NOTE(review): with signedness set and all-zero (or empty) input, the
 * scan loops only terminate by wrapping pos_v/neg_v through the full int8
 * range back to a populated bucket -- confirm callers never pass isz == 0. */
static inline void bm_vlc_est_weight_bias(const uint8_t *ibuf, size_t isz, uint8_t signedness, uint8_t isBfloat16, CommandInfo *cmd_info)
{
    assert(!(isBfloat16 && signedness)); // WARNING: signedness MUST be 0 as isBfloat16==True

    cmd_info->is_bfloat16 = isBfloat16;
    if (isBfloat16 == false && signedness == true)
    {
        // two-side circular shift: histogram over the raw byte values
        int hist[256] = {0};
        for (size_t i = 0; i < isz; i++)
        {
            hist[ibuf[i]]++;
        }

        // scan upward for the smallest positive value actually present
        int8_t pos_v = 1;
        // (written as while(true): a `pos_v < 128` guard is always true for
        // int8_t and trips -Werror=type-limits)
        while (true)
        {
            if (hist[((uint8_t)pos_v)] == 0)
            {
                pos_v++;
            }
            else
            {
                break;
            }
        }
        // bias0 = gap below the smallest positive value
        // (the `pos_v < 128` bound was dropped for the same -Werror reason)
        cmd_info->bias0 = (pos_v > 1) ? (pos_v - 1) : 0;
        // scan downward for the negative value of smallest magnitude present
        int8_t neg_v = -1;
        // (written as while(true): a `neg_v >= -128` guard is always true
        // for int8_t and trips -Werror=type-limits)
        while (true)
        {
            if (hist[(uint8_t)neg_v] == 0)
            {
                neg_v--;
            }
            else
            {
                break;
            }
        }
        // bias1 = gap above the smallest-magnitude negative value
        // (the `neg_v >= -128` bound was dropped for the same -Werror reason)
        cmd_info->bias1 = (neg_v < -1) ? abs(neg_v + 1) : 0;
        cmd_info->signedness = true;
    }

    if (isBfloat16 == true)
    {
        // center shift: bias0 = mean of the non-zero bf16 exponents
        int64_t exp_accum = 0;
        uint16_t *bf16_in = (uint16_t *)ibuf;
        size_t inum = (isz >> 1), cnt = 0;
        for (size_t i = 0; i < inum; i++)
        {
            uint8_t exp = ((bf16_in[i] >> 7) & 0xFF);
            if (exp != 0)
            {
                exp_accum += exp;
                cnt++;
            }
        }
        if (cnt > 0)
        {
            // round-to-nearest of the mean exponent
            cmd_info->bias0 = (uint8_t)((exp_accum / (float)cnt) + 0.5);
        }
        // zero guard needed only when zero exponents occur in the data
        cmd_info->zero_guard_en = (inum == cnt) ? false : true;
        cmd_info->signedness = false;
    }
}
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __BM_VLC_COMPRESS_H__ */
|
||||
1042
cvikernel/include/bmkernel/bm1880v2/bmkernel_1880v2.h
Normal file
1042
cvikernel/include/bmkernel/bm1880v2/bmkernel_1880v2.h
Normal file
File diff suppressed because it is too large
Load Diff
369
cvikernel/include/bmkernel/bm1880v2/compression.h
Normal file
369
cvikernel/include/bmkernel/bm1880v2/compression.h
Normal file
@ -0,0 +1,369 @@
|
||||
#ifndef COMPRESSION_H
|
||||
#define COMPRESSION_H
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
typedef struct {
|
||||
uint32_t compress_md;
|
||||
uint32_t bit_length;
|
||||
int is_signed;
|
||||
|
||||
uint64_t total_data_num;
|
||||
uint32_t non_zero_data_num;
|
||||
|
||||
uint64_t header_bytes;
|
||||
uint64_t map_bytes;
|
||||
uint64_t data_bytes;
|
||||
uint64_t total_bytes;
|
||||
|
||||
int compressed_min;
|
||||
int compressed_max;
|
||||
} compression_info_t;
|
||||
|
||||
typedef struct {
|
||||
uint64_t header_offset;
|
||||
uint64_t header_size;
|
||||
uint64_t map_offset;
|
||||
uint64_t map_size;
|
||||
uint64_t data_offset;
|
||||
uint64_t data_size;
|
||||
uint64_t total_size;
|
||||
} compress_addr_info;
|
||||
|
||||
/* Bytes needed for the non-zero bitmap covering total_data_num elements:
 * one bit per element, padded up to a 16-byte (128-bit) boundary.
 * Fix: the original routed the uint64_t bit count through int-typed
 * ceiling_func(), which truncates for element counts >= 2^31; the
 * ceiling is now computed entirely in 64-bit arithmetic. */
static uint64_t compression_map_bytes(uint64_t total_data_num)
{
  uint64_t bit_alignment = 16 * 8;
  uint64_t bits = total_data_num;

  return (bits + bit_alignment - 1) / bit_alignment * 16;
}
|
||||
|
||||
/* Bytes of the bitmap that must be cleared before filling: one bit per
 * element, padded up to a 2-byte (16-bit) boundary.
 * Fix: computed in 64-bit; the original narrowed the bit count to int
 * via ceiling_func(), truncating for counts >= 2^31. */
static uint64_t compression_map_clear_bytes(uint64_t total_data_num)
{
  uint64_t bit_alignment = 2 * 8;
  uint64_t bits = total_data_num;

  return (bits + bit_alignment - 1) / bit_alignment * 2;
}
|
||||
|
||||
|
||||
/* Bytes of the packed-data section: non_zero_data_num values at
 * bit_length bits each, rounded up to whole bytes. 1-bit mode stores
 * values in the bitmap itself, so it needs no data section.
 * Fix: ceiling computed in 64-bit; the original narrowed the bit count
 * to int via ceiling_func(), truncating for large inputs. */
static uint64_t compression_data_bytes(uint64_t non_zero_data_num, uint32_t bit_length)
{
  if (bit_length == 1)
    return 0;

  uint64_t bit_alignment = 8;
  uint64_t bits = non_zero_data_num * bit_length;

  return (bits + bit_alignment - 1) / bit_alignment;
}
|
||||
|
||||
/* Bits per stored datum for a compression mode: 0->8, 1->4, 2->2, 3->1. */
static inline uint32_t compression_bit_length(uint32_t compress_md)
{
  switch (compress_md) {
  case 0:
    return 8;
  case 1:
    return 4;
  case 2:
    return 2;
  case 3:
    return 1;
  default:
    /* Invalid mode: trap in debug builds. Fix: the original fell off
     * the end of a value-returning function when NDEBUG disabled the
     * assert, which is undefined behavior; return 0 instead. */
    assert(0);
    return 0;
  }
}
|
||||
|
||||
/* Write the two's-complement (or unsigned) representable range of a
 * bit_length-wide value into *min / *max. Only widths 1/2/4/8 are
 * valid; anything else trips the assert. */
static inline void compute_compressed_range(
    uint32_t bit_length, int is_signed, int *min, int *max)
{
  switch (bit_length) {
  case 1:
  case 2:
  case 4:
  case 8:
    if (is_signed) {
      *min = -(1 << (bit_length - 1));
      *max = (1 << (bit_length - 1)) - 1;
    } else {
      *min = 0;
      *max = (int)((1u << bit_length) - 1);
    }
    return;
  }
  assert(0);
}
|
||||
|
||||
/* Clamp val into the inclusive range [min, max]. */
static inline int saturate(int val, int max, int min)
{
  return (val < min) ? min : ((val > max) ? max : val);
}
|
||||
|
||||
/* Count how many elements of buf remain non-zero after being clamped
 * into [min, max]. Signed buffers are reinterpreted as int8 first. */
static inline uint64_t count_non_zero_results(
    uint8_t buf[], uint64_t size, int is_signed, int max, int min)
{
  uint64_t count = 0;

  for (uint64_t idx = 0; idx < size; idx++) {
    int v = is_signed ? (int)(int8_t)buf[idx] : (int)buf[idx];
    /* clamp inline (same result as saturate()) */
    if (v < min)
      v = min;
    else if (v > max)
      v = max;
    if (v != 0)
      count++;
  }

  return count;
}
|
||||
|
||||
/* Set bit i of the bitmap (little-endian bit order within each byte). */
static inline void set_map_bit(uint8_t map[], uint64_t i)
{
  map[i >> 3] |= (uint8_t)(1u << (i & 7));
}
|
||||
|
||||
/* Read bit i of the bitmap (little-endian bit order within each byte). */
static inline uint8_t read_map_bit(uint8_t map[], uint64_t i)
{
  return (uint8_t)((map[i >> 3] >> (i & 7)) & 1u);
}
|
||||
|
||||
/* Split a 32-bit compression header into its fields:
 * bit 29 = signedness, bits 25:24 = compression mode,
 * bits 23:0 = non-zero element count. */
static inline void parse_header(
    uint32_t header, int *is_signed, uint32_t *compress_md, uint32_t *nz_num)
{
  *is_signed = (int)((header >> 29) & 1u);
  *compress_md = (header >> 24) & 3u;
  *nz_num = header & 0xffffffu;
}
|
||||
|
||||
/* Build the 32-bit compression header: sign flag in bit 29, a fixed
 * valid bit in bit 28, the mode in bits 25:24. Modes wider than one
 * bit additionally record the non-zero count in bits 23:0 (1-bit mode
 * encodes values directly in the bitmap, so no count is stored). */
static inline void fill_header(uint32_t *hdr, compression_info_t *info)
{
  uint32_t h = ((uint32_t)info->is_signed << 29) | (1u << 28) |
               (info->compress_md << 24);
  if (compression_bit_length(info->compress_md) != 1)
    h |= info->non_zero_data_num;
  *hdr = h;
}
|
||||
|
||||
/* Populate the non-zero bitmap: clear the consumed (2-byte aligned)
 * portion, then set one bit per element whose saturated value is
 * non-zero. */
static inline void fill_map(uint8_t map[], uint8_t buf[], compression_info_t *info)
{
  int lo = info->compressed_min;
  int hi = info->compressed_max;

  uint64_t clear_bytes = compression_map_clear_bytes(info->total_data_num);
  for (uint64_t b = 0; b < clear_bytes; b++)
    map[b] = 0;

  for (uint64_t i = 0; i < info->total_data_num; i++) {
    int v = info->is_signed ? (int8_t)buf[i] : buf[i];
    if (saturate(v, hi, lo) != 0)
      set_map_bit(map, i);
  }
}
|
||||
|
||||
/* OR the i-th non-zero value into the packed data stream, bit_length
 * bits per datum, filling each byte from the LSB upward. The caller
 * must have zeroed the data area first. */
static inline void compress_one_data(
    uint8_t data[], uint64_t i, uint8_t val, compression_info_t *info)
{
  uint32_t bits = info->bit_length;
  uint32_t per_byte = 8 / bits;

  uint32_t byte_idx = i / per_byte;
  uint32_t shift = (i % per_byte) * bits;
  uint8_t mask = (1 << bits) - 1;

  data[byte_idx] |= (val & mask) << shift;
}
|
||||
|
||||
/* Sign-extend the low bit_len bits of val to a full byte using an
 * arithmetic right shift after positioning the sign bit at bit 7. */
static inline uint8_t sign_extend(uint8_t val, uint32_t bit_len)
{
  int s = 8 - (int)bit_len;
  return (uint8_t)(((int8_t)(uint8_t)(val << s)) >> s);
}
|
||||
|
||||
/* Extract the i-th packed value from the data stream (bit_length bits
 * per datum, LSB-first within each byte), sign-extending when the
 * stream holds signed values. */
static inline uint8_t decompress_one_data(
    uint8_t data[], uint64_t i, compression_info_t *info)
{
  uint32_t bits = info->bit_length;
  uint32_t per_byte = 8 / bits;

  uint32_t byte_idx = i / per_byte;
  uint32_t shift = (i % per_byte) * bits;
  uint8_t mask = (1 << bits) - 1;

  uint8_t val = (data[byte_idx] >> shift) & mask;
  return info->is_signed ? sign_extend(val, bits) : val;
}
|
||||
|
||||
/* Populate the packed-data section: zero it, then append each
 * saturated non-zero input value in order. The emitted stream must
 * line up index-for-index with the bits set by fill_map(). */
static inline void fill_data(uint8_t data[], uint8_t buf[], compression_info_t *info)
{
  int lo = info->compressed_min;
  int hi = info->compressed_max;

  for (uint64_t b = 0; b < info->data_bytes; b++)
    data[b] = 0;

  uint64_t nz = 0;
  for (uint64_t i = 0; i < info->total_data_num; i++) {
    int v = info->is_signed ? (int8_t)buf[i] : buf[i];
    int res = saturate(v, hi, lo);
    if (res != 0) {
      compress_one_data(data, nz, (uint8_t)res, info);
      nz++;
    }
  }
}
|
||||
|
||||
static inline compression_info_t make_compression_info(
|
||||
uint8_t buf[], uint64_t size, uint32_t compress_md, int is_signed)
|
||||
{
|
||||
uint32_t bit_length = compression_bit_length(compress_md);
|
||||
|
||||
int min, max;
|
||||
compute_compressed_range(bit_length, is_signed, &min, &max);
|
||||
|
||||
uint32_t nz_num = count_non_zero_results(buf, size, is_signed, max, min);
|
||||
assert(nz_num <= 0xffffff);
|
||||
|
||||
compression_info_t info;
|
||||
info.compress_md = compress_md;
|
||||
info.bit_length = bit_length;
|
||||
info.is_signed = is_signed;
|
||||
info.total_data_num = size;
|
||||
info.non_zero_data_num = nz_num;
|
||||
info.header_bytes = 16;
|
||||
info.map_bytes = compression_map_bytes(size);
|
||||
info.data_bytes = compression_data_bytes(nz_num, bit_length);
|
||||
info.total_bytes = info.header_bytes + info.map_bytes + info.data_bytes;
|
||||
info.compressed_min = min;
|
||||
info.compressed_max = max;
|
||||
return info;
|
||||
}
|
||||
|
||||
static inline compression_info_t parse_compression_info(
|
||||
uint8_t compressed_buf[], uint64_t max_size, uint64_t total_data_num)
|
||||
{
|
||||
uint64_t header_bytes = 16;
|
||||
assert(header_bytes <= max_size);
|
||||
|
||||
int is_signed;
|
||||
uint32_t compress_md, nz_num;
|
||||
parse_header(*(uint32_t *)compressed_buf, &is_signed, &compress_md, &nz_num);
|
||||
|
||||
uint32_t bit_length = compression_bit_length(compress_md);
|
||||
int min, max;
|
||||
compute_compressed_range(bit_length, is_signed, &min, &max);
|
||||
|
||||
compression_info_t info;
|
||||
info.compress_md = compress_md;
|
||||
info.bit_length = compression_bit_length(compress_md);
|
||||
info.is_signed = is_signed;
|
||||
info.total_data_num = total_data_num;
|
||||
info.non_zero_data_num = nz_num;
|
||||
info.header_bytes = header_bytes;
|
||||
info.map_bytes = compression_map_bytes(total_data_num);
|
||||
info.data_bytes = compression_data_bytes(nz_num, info.bit_length);
|
||||
info.total_bytes = header_bytes + info.map_bytes + info.data_bytes;
|
||||
info.compressed_min = min;
|
||||
info.compressed_max = max;
|
||||
|
||||
assert(info.total_bytes <= max_size);
|
||||
|
||||
return info;
|
||||
}
|
||||
|
||||
static inline uint8_t * compress(
|
||||
uint8_t buf[], uint64_t size, uint32_t compress_md, int is_signed, compress_addr_info *compressed_data)
|
||||
{
|
||||
compression_info_t info =
|
||||
make_compression_info(buf, size, compress_md, is_signed);
|
||||
|
||||
assert(info.total_bytes < 0x100000);
|
||||
static uint8_t *result = new uint8_t[0x100000];
|
||||
uint32_t *hdr = (uint32_t *)result;
|
||||
uint8_t *map = &result[info.header_bytes];
|
||||
uint8_t *data = &map[info.map_bytes];
|
||||
|
||||
fill_header(hdr, &info);
|
||||
fill_map(map, buf, &info);
|
||||
if (info.bit_length != 1)
|
||||
fill_data(data, buf, &info);
|
||||
|
||||
compressed_data->header_offset = 0;
|
||||
compressed_data->header_size = 4;
|
||||
compressed_data->map_offset = info.header_bytes;
|
||||
compressed_data->map_size = compression_map_clear_bytes(info.total_data_num);
|
||||
compressed_data->data_offset = info.map_bytes + info.header_bytes;
|
||||
compressed_data->data_size = info.data_bytes;
|
||||
compressed_data->total_size = info.total_bytes;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline void decompress(
|
||||
uint8_t buf[], uint64_t size, uint8_t compressed_buf[], uint64_t max_size)
|
||||
{
|
||||
compression_info_t info =
|
||||
parse_compression_info(compressed_buf, max_size, size);
|
||||
assert(info.total_bytes <= max_size);
|
||||
assert(info.total_data_num == size);
|
||||
|
||||
uint8_t *map = &compressed_buf[info.header_bytes];
|
||||
if (info.bit_length == 1) {
|
||||
for (uint64_t i = 0; i < size; i++) {
|
||||
uint8_t val = read_map_bit(map, i);
|
||||
buf[i] = info.is_signed? sign_extend(val, 1): val;
|
||||
}
|
||||
} else {
|
||||
uint8_t *data = &map[info.map_bytes];
|
||||
uint64_t data_i = 0;
|
||||
for (uint64_t i = 0; i < size; i++) {
|
||||
uint8_t val = read_map_bit(map, i);
|
||||
if (val == 0) {
|
||||
buf[i] = 0;
|
||||
} else {
|
||||
buf[i] = decompress_one_data(data, data_i, &info);
|
||||
data_i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* COMPRESSION_H */
|
||||
300
cvikernel/include/bmkernel/bm1880v2/non_atomic.h
Normal file
300
cvikernel/include/bmkernel/bm1880v2/non_atomic.h
Normal file
@ -0,0 +1,300 @@
|
||||
#ifndef __BMKERNEL_1880v2_NON_ATOMIC_H__
|
||||
#define __BMKERNEL_1880v2_NON_ATOMIC_H__
|
||||
|
||||
#include "bmkernel_1880v2.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// non atomic
|
||||
void bf16_table_shape(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_shape_t *s);
|
||||
|
||||
int bf16_emit_sqrt(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tbl_answer,
|
||||
bmk1880v2_tensor_lmem_t *tbl_answer_mantissa,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16);
|
||||
void bf16_gen_sqrt(uint16_t *table_data, bmk1880v2_tensor_lmem_shape_t *table_shape);
|
||||
void bf16_gen_sqrt_mantissa(uint16_t *table_mantissa, bmk1880v2_tensor_lmem_shape_t *table_shape);
|
||||
void bf16_sqrt_tbl(uint16_t *sqrt_table_data, uint16_t *sqrt_table_data_mantissa,
|
||||
bmk1880v2_tensor_lmem_shape_t *table_shape);
|
||||
|
||||
int bf16_emit_reciprocal(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tbl_answer,
|
||||
bmk1880v2_tensor_lmem_t *tbl_answer_mantissa,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16);
|
||||
void bf16_gen_reciprocal(uint16_t *table_data, bmk1880v2_tensor_lmem_shape_t *table_shape);
|
||||
void bf16_gen_reciprocal_mantissa(uint16_t *table_mantissa, bmk1880v2_tensor_lmem_shape_t *table_shape);
|
||||
void bf16_reciprocal_tbl(uint16_t *table_data, uint16_t *table_mantissa,
|
||||
bmk1880v2_tensor_lmem_shape_t *table_shape);
|
||||
|
||||
void bf16_atan_y0(uint16_t *table_data_y0, bmk1880v2_tensor_lmem_shape_t *table_shape);
|
||||
void bf16_atan_fast_degree_y0(uint16_t *table_data_y0, bmk1880v2_tensor_lmem_shape_t *table_shape);
|
||||
void bf16_atan_slope(uint16_t *table_slope, bmk1880v2_tensor_lmem_shape_t *table_shape);
|
||||
void bf16_atan_s_01(uint16_t *table_invert, bmk1880v2_tensor_lmem_shape_t *table_shape);
|
||||
void bf16_atan_pos_neg(uint16_t *table_pos_neg, bmk1880v2_tensor_lmem_shape_t *table_shape);
|
||||
int bf16_atan_slope_multipilier(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf2, bmk1880v2_tensor_lmem_t *tl_buf3,
|
||||
bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt);
|
||||
|
||||
int bf16_atan_emit(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tl_buf2,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf3, bmk1880v2_tensor_lmem_t *tl_y0_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_slope_buf, bmk1880v2_tensor_lmem_t *tl_invert_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_pos_neg_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_table_answer,
|
||||
bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt);
|
||||
|
||||
int bf16_atan_fast_emit(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tl_buf2,
|
||||
bmk1880v2_tensor_lmem_t *tl_y0_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_invert_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_pos_neg_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_table_answer,
|
||||
bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt,
|
||||
uint8_t is_dirty_ifmap);
|
||||
|
||||
void bf16_atan2_fast_emit(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *y,
|
||||
bmk1880v2_tensor_lmem_t *x, bmk1880v2_tensor_lmem_t *tl_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf2, bmk1880v2_tensor_lmem_t *tl_buf3,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf4, bmk1880v2_tensor_lmem_t *tl_y0_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_slope_buf, bmk1880v2_tensor_lmem_t *tl_invert_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_pos_neg_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_table_answer,
|
||||
bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa,
|
||||
bmk1880v2_tensor_lmem_t *tl_0_idx_table,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt);
|
||||
|
||||
void bf16_atan2_fast_degree_emit(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *y,
|
||||
bmk1880v2_tensor_lmem_t *x, bmk1880v2_tensor_lmem_t *tl_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf2, bmk1880v2_tensor_lmem_t *tl_buf3,
|
||||
bmk1880v2_tensor_lmem_t *tl_y0_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_invert_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_pos_neg_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_table_answer,
|
||||
bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt);
|
||||
|
||||
void bf16_atan2_merge_emit(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *y,
|
||||
bmk1880v2_tensor_lmem_t *x, bmk1880v2_tensor_lmem_t *tl_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf2, bmk1880v2_tensor_lmem_t *tl_buf3,
|
||||
bmk1880v2_tensor_lmem_t *tl_y0_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_invert_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_pos_neg_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_table_answer,
|
||||
bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt);
|
||||
|
||||
|
||||
uint64_t bf16_lut_tbl_bytesize(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_shape_t *table_shape,
|
||||
fmt_t fmt);
|
||||
|
||||
void bf16_atan_tbl(uint16_t *table_data_atan_y0, uint16_t *table_data_atan_slope, uint16_t *table_data_atan_invert,
|
||||
uint16_t *table_data_atan_pos_neg, bmk1880v2_tensor_lmem_shape_t *table_shape);
|
||||
|
||||
void bf16_atan_fast_degree_tbl(uint16_t *table_data_atan_y0, uint16_t *table_data_atan_invert,
|
||||
uint16_t *table_data_atan_pos_neg, bmk1880v2_tensor_lmem_shape_t *table_shape);
|
||||
|
||||
void bf16_gen_0_tbl(uint16_t *table_0, bmk1880v2_tensor_lmem_shape_t *table_shape);
|
||||
|
||||
int bf16_emit_0_idx(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tbl_answer,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt);
|
||||
|
||||
int bf16_emit_neg_idx(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tl_pos_neg_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt);
|
||||
|
||||
int bf16_emit_pos_idx(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tl_pos_neg_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt);
|
||||
|
||||
int bf16_emit_0_1_revert_input(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt);
|
||||
|
||||
void bf16_atan2_emit(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *y,
|
||||
bmk1880v2_tensor_lmem_t *x, bmk1880v2_tensor_lmem_t *tl_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf2, bmk1880v2_tensor_lmem_t *tl_buf3,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf4, bmk1880v2_tensor_lmem_t *tl_buf5,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf6, bmk1880v2_tensor_lmem_t *tl_y0_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_slope_buf, bmk1880v2_tensor_lmem_t *tl_invert_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_pos_neg_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_table_answer,
|
||||
bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa,
|
||||
bmk1880v2_tensor_lmem_t *tl_sqrt_table_answer,
|
||||
bmk1880v2_tensor_lmem_t *tl_sqrt_table_answer_mantissa,
|
||||
bmk1880v2_tensor_lmem_t *tl_0_idx_table,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt);
|
||||
|
||||
// nn function
|
||||
int bf16_emit_pythagoras(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *y,
|
||||
bmk1880v2_tensor_lmem_t *x, bmk1880v2_tensor_lmem_t *tl_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf2,
|
||||
bmk1880v2_tensor_lmem_t *tl_sqrt_table_answer,
|
||||
bmk1880v2_tensor_lmem_t *tl_sqrt_table_answer_mantissa,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt);
|
||||
|
||||
int bf16_emit_max_const(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt, float b);
|
||||
|
||||
int bf16_emit_min_const(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt, float b);
|
||||
|
||||
int bf16_emit_0_1_revert(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tbl_answer,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt);
|
||||
|
||||
int bf16_emit_mul(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_ifmap2, bmk1880v2_tensor_lmem_t *tl_ofmap_bf16,
|
||||
fmt_t fmt);
|
||||
|
||||
int bf16_emit_add(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_ifmap2, bmk1880v2_tensor_lmem_t *tl_ofmap_bf16,
|
||||
fmt_t fmt);
|
||||
|
||||
int bf16_emit_add_const(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt, float b);
|
||||
|
||||
int bf16_emit_mul_const(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt, float b);
|
||||
|
||||
// mask please refer \BF16_MASK_TYPE for supported case
|
||||
int bf16_emit_mask_gt0(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tl_buf2,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf3, bmk1880v2_tensor_lmem_t *tl_pos_neg_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_0_idx_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt);
|
||||
|
||||
int bf16_emit_mask_ge0(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tl_pos_neg_table,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt);
|
||||
|
||||
int bf16_emit_mask_le0(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tl_pos_neg_table,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt);
|
||||
|
||||
int bf16_emit_mask_eq0(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tl_0_idx_table,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt);
|
||||
enum BF16_MASK_TYPE {
|
||||
BF16_MASK_TYPE_GT_0 = 0, // remain > 0
|
||||
BF16_MASK_TYPE_GE_0, // remain >= 0
|
||||
BF16_MASK_TYPE_EQ_0, // remain = 0
|
||||
BF16_MASK_TYPE_LT_0, // remain < 0
|
||||
BF16_MASK_TYPE_LE_0, // remain <= 0
|
||||
BF16_MASK_MAX
|
||||
};
|
||||
|
||||
int bf16_emit_mask(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tl_buf2,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf3, bmk1880v2_tensor_lmem_t *tl_pos_neg_table,
|
||||
bmk1880v2_tensor_lmem_t *tl_0_idx_table, bmk1880v2_tensor_lmem_t *tl_ofmap_bf16,
|
||||
fmt_t fmt, enum BF16_MASK_TYPE mask);
|
||||
|
||||
int bf16_emit_mask_lt0(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tl_pos_neg_table,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt);
|
||||
|
||||
int _bf16_atan_emit(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tl_buf2,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf3, bmk1880v2_tensor_lmem_t *tl_y0_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_slope_buf, bmk1880v2_tensor_lmem_t *tl_invert_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_pos_neg_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_table_answer,
|
||||
bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt, float b);
|
||||
|
||||
uint32_t *bm1880v2_reshape_channel_bias(uint8_t *bias, int ni, int ci, int hi, int wi, int old_bias_c,
|
||||
fmt_t fmt);
|
||||
|
||||
int bm1880v2_reshape_channel_same(bmk1880v2_context_t *bk_ctx, int ic, int ih, int iw, int kh,
|
||||
int kw, int pad_right, int pad_left, int stride_h, int stride_w,
|
||||
bmk1880v2_tensor_lmem_shape_t *tl_load_shape,
|
||||
bmk1880v2_tensor_lmem_stride_t *new_tl_ifmap_stride,
|
||||
bmk1880v2_tensor_tgmem_shape_t *new_tg_ifmap_shape,
|
||||
bmk1880v2_tensor_tgmem_stride_t *new_tg_ifmap_stride,
|
||||
bmk1880v2_tensor_lmem_shape_t *new_tl_weight_shape,
|
||||
bmk1880v2_tensor_lmem_shape_t *new_tl_bias_shape,
|
||||
bmk1880v2_tensor_lmem_shape_t *new_tl_ofmap_shape, fmt_t fmt,
|
||||
int eu_align);
|
||||
|
||||
uint8_t *bm1880v2_reshape_channel_weight(uint8_t *weight, int ni, int ci, int hi, int wi, int old_weight_c,
|
||||
fmt_t fmt);
|
||||
|
||||
|
||||
int bm1880v2_reshape_channel_same_pad(
|
||||
bmk1880v2_context_t *bk_ctx,
|
||||
int ic, int ih, int iw, int kh, int kw,
|
||||
int pad_right, int pad_left, int stride_h, int stride_w,
|
||||
bmk1880v2_tensor_lmem_shape_t* tl_load_shape,
|
||||
bmk1880v2_tensor_lmem_stride_t* new_tl_ifmap_stride,
|
||||
bmk1880v2_tensor_tgmem_shape_t* new_tg_ifmap_shape,
|
||||
bmk1880v2_tensor_tgmem_stride_t* new_tg_ifmap_stride,
|
||||
bmk1880v2_tensor_lmem_shape_t* new_tl_weight_shape,
|
||||
bmk1880v2_tensor_lmem_shape_t* new_tl_bias_shape,
|
||||
bmk1880v2_tensor_lmem_shape_t* new_tl_ofmap_shape,
|
||||
fmt_t fmt, int eu_align);
|
||||
|
||||
int bf16_emit_sigmoid(bmk1880v2_context_t *ctx,
|
||||
bmk1880v2_tensor_lmem_t* tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t* tl_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_table_answer,
|
||||
bmk1880v2_tensor_lmem_t *tl_table_answer_slope,
|
||||
bmk1880v2_tensor_lmem_t* tl_ofmap_bf16,
|
||||
float scale);
|
||||
|
||||
void bf16_sigmoid_tbl(uint16_t *sigmoid_table_data, uint16_t* sigmoid_table_data_slope,
|
||||
bmk1880v2_tensor_lmem_shape_t* table_shape,
|
||||
int range_start, int range_end);
|
||||
|
||||
float bf16_sigmoid_scale(int range_start, int range_end);
|
||||
|
||||
void bf16_emit_mask_ge0_lt0(
|
||||
bmk1880v2_context_t *ctx,
|
||||
bmk1880v2_tensor_lmem_t* y,
|
||||
bmk1880v2_tensor_lmem_t* index_i8,
|
||||
bmk1880v2_tensor_lmem_t* tl_buf3,
|
||||
fmt_t fmt
|
||||
);
|
||||
|
||||
void bf16_emit_mask_eq_0(
|
||||
bmk1880v2_context_t *ctx,
|
||||
bmk1880v2_tensor_lmem_t* y,
|
||||
bmk1880v2_tensor_lmem_t* tl_buf,
|
||||
bmk1880v2_tensor_lmem_t* index_i8,
|
||||
bmk1880v2_tensor_lmem_t* tl_buf3,
|
||||
fmt_t fmt
|
||||
);
|
||||
|
||||
int bf16_lut_exp_mantissa(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf,
|
||||
bmk1880v2_tensor_lmem_t *tbl_answer,
|
||||
bmk1880v2_tensor_lmem_t *tbl_answer_mantissa,
|
||||
bmk1880v2_tensor_lmem_t *tl_ofmap_bf16);
|
||||
|
||||
int bf16_s2s_fp32_bf16(bmk1880v2_context_t *ctx, uint64_t gaddr_fp32,
|
||||
bmk1880v2_tensor_tgmem_shape_t fp32_shape, uint64_t gaddr_bf16,
|
||||
bmk1880v2_tensor_tgmem_shape_t bf16_shape, fmt_t fmt);
|
||||
|
||||
/**
|
||||
* \gaddr_nc_image for temp gaddr, it could be the same as \gaddr_image
|
||||
* \re_order_gaddr_svm means we re-ordered weight by \unit_size and oc/ic transpose
|
||||
* \svm_shape as alias as weight of conv, record actually shape likes (oc, ic, kh, kw),
|
||||
* the passible shape is <oc, \unit_size, 15, 7>
|
||||
* \unit_size as vecotr size, it should be 36 in HOG
|
||||
*/
|
||||
int bf16_hists_svm(bmk1880v2_context_t *ctx, uint64_t gaddr_image, uint64_t gaddr_nc_image,
|
||||
bmk1880v2_tensor_tgmem_shape_t image_shape, uint64_t re_order_gaddr_svm,
|
||||
bmk1880v2_tensor_tgmem_shape_t svm_shape, // (oc, ic, kh, kw)
|
||||
uint64_t gaddr_output, int unit_size, fmt_t fmt);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __BMKERNEL_1880v2_NON_ATOMIC_H__ */
|
||||
|
||||
115
cvikernel/include/bmkernel/bm_kernel.h
Normal file
115
cvikernel/include/bmkernel/bm_kernel.h
Normal file
@ -0,0 +1,115 @@
|
||||
#ifndef __BM_KERNEL_H__
|
||||
#define __BM_KERNEL_H__
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include <math.h>
|
||||
#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <cvikernel/cvikernel.h>
|
||||
|
||||
typedef uint8_t u8;
|
||||
typedef uint16_t u16;
|
||||
typedef uint32_t u32;
|
||||
typedef uint64_t u64;
|
||||
|
||||
typedef int8_t s8;
|
||||
typedef int16_t s16;
|
||||
typedef int32_t s32;
|
||||
typedef int64_t s64;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef int bmerr_t;
|
||||
#define BM_SUCCESS 0 // The operation was successful
|
||||
#define BM_ERR_AGAIN 1 // Not ready yet
|
||||
#define BM_ERR_FAILURE 2 // General failure
|
||||
#define BM_ERR_TIMEOUT 3 // Timeout
|
||||
#define BM_ERR_UNINITIALIZED 4 // Uninitialzed
|
||||
#define BM_ERR_INVALID_ARGUMENT 5 // Arguments invalid
|
||||
#define BM_ERR_NOMEM 6 // Not enough memory
|
||||
#define BM_ERR_DATA 7 // Data error
|
||||
#define BM_ERR_BUSY 8 // Busy
|
||||
#define BM_ERR_NOT_SUPPORTED 9 // Not supported yet
|
||||
|
||||
#define CVI_TPU_TIU 0 // Tensor Instruction Unit
|
||||
#define CVI_TPU_CPU 1 // CPU, Reserved for common cpu op
|
||||
#define CVI_TPU_TDMA 2 // TPU DMA
|
||||
#define CVI_TPU_ENGINE_NUM 3 // Number of Engines
|
||||
|
||||
typedef cvk_fmt_t fmt_t;
|
||||
#define FMT_F32 CVK_FMT_F32
|
||||
#define FMT_F16 CVK_FMT_F16
|
||||
#define FMT_I32 CVK_FMT_I32
|
||||
#define FMT_I16 CVK_FMT_I16
|
||||
#define FMT_I8 CVK_FMT_I8
|
||||
#define FMT_I4 CVK_FMT_I4
|
||||
#define FMT_I2 CVK_FMT_I2
|
||||
#define FMT_I1 CVK_FMT_I1
|
||||
#define FMT_U32 CVK_FMT_U32
|
||||
#define FMT_U16 CVK_FMT_U16
|
||||
#define FMT_U8 CVK_FMT_U8
|
||||
#define FMT_BF16 CVK_FMT_BF16
|
||||
#define FMT_INVALID CVK_FMT_INVALID
|
||||
|
||||
typedef enum _Cmdbuf_Head_Magic {
|
||||
CMDBUF_HDR_MAGIC_1880v2 = 0xA5,
|
||||
CMDBUF_HDR_MAGIC_1822 = 0xA6,
|
||||
CMDBUF_HDR_MAGIC_181X = 0xA7,
|
||||
CMDBUF_HDR_MAGIC_180X = 0xA8,
|
||||
} Cmdbuf_Head_Magic;
|
||||
|
||||
#define BM_CMB_HDR_FLAG_NEURON (0x1)
|
||||
#define BM_CMB_HDR_FLAG_WEIGHT (0x2)
|
||||
|
||||
typedef struct __cmd_hdr_s {
|
||||
uint8_t magic; // 0xA5
|
||||
uint8_t len; // lens in bytes
|
||||
uint8_t engine_id: 4; // TPU, GDMA, CDMA
|
||||
uint8_t __deprecated: 4;
|
||||
uint8_t flags; // CMD_ID, sync flags, etc. TBD
|
||||
uint32_t mask; // bit mask for which register need to write
|
||||
uint8_t cmd[0];
|
||||
} __attribute__((packed)) cmd_hdr_t;
|
||||
|
||||
typedef struct {
|
||||
uint32_t chip_version;
|
||||
uint32_t cmdbuf_size;
|
||||
uint8_t *cmdbuf;
|
||||
} bmk_info_t;
|
||||
|
||||
cvk_chip_info_t bmk1880v2_chip_info(void);
|
||||
cvk_chip_info_t bmk1822_chip_info(void);
|
||||
|
||||
/* Integer ceiling of numerator / denominator (intended for
 * non-negative inputs and a positive denominator). */
static inline int ceiling_func(int numerator, int denominator)
{
  int biased = numerator + denominator - 1;
  return biased / denominator;
}
|
||||
|
||||
/* Integer ceiling of numerator / 2^shift via shifting. */
static inline int ceiling_func_shift(int numerator, int shift)
{
  int unit = 1 << shift;
  return (numerator + unit - 1) >> shift;
}
|
||||
|
||||
/* Round x up to the next multiple of n (n > 0). */
static inline uint64_t align_up(uint64_t x, uint64_t n)
{
  uint64_t units = (x + n - 1) / n;
  return units * n;
}
|
||||
|
||||
// len max number is 255, sometimes cmd larger than 255
|
||||
/* Effective command length in bytes. The len field is only 8 bits;
 * commands longer than 255 bytes store the real length in mask and
 * set len to 0. */
static inline uint32_t cmd_hdr_len(cmd_hdr_t * hdr) {
  return hdr->len ? hdr->len : hdr->mask;
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __BM_KERNEL_H__ */
|
||||
223
cvikernel/include/bmkernel/bm_kernel_legacy.h
Normal file
223
cvikernel/include/bmkernel/bm_kernel_legacy.h
Normal file
@ -0,0 +1,223 @@
|
||||
#ifndef __BM_KERNEL_LEGACY_H__
|
||||
#define __BM_KERNEL_LEGACY_H__
|
||||
|
||||
#include <bmkernel/bm_kernel.h>
|
||||
|
||||
typedef uint32_t laddr_t;
|
||||
typedef uint64_t gaddr_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define LADDR_INVALID (0xFFFFFFFF)
|
||||
#define GADDR_INVALID (0x000000FFFFFFFFFFULL)
|
||||
|
||||
#define FMT_U8_to_F32 0xFF
|
||||
|
||||
#define ENGINE_BD 0 // Broadcast Engine
|
||||
#define ENGINE_CPU 1 // CPU, Reserved
|
||||
#define ENGINE_GDMA 2 // GDMA Engine
|
||||
#define ENGINE_CDMA 3 // CDMA Engine
|
||||
#define ENGINE_END 4 // Invalid
|
||||
|
||||
typedef struct __dma_hdr_t {
|
||||
uint16_t dmabuf_magic_m;
|
||||
uint16_t dmabuf_magic_s;
|
||||
uint32_t dmabuf_size;
|
||||
uint32_t cpu_desc_count;
|
||||
uint32_t bd_desc_count; //16bytes
|
||||
uint32_t tdma_desc_count;
|
||||
uint32_t tpu_clk_rate;
|
||||
uint32_t pmubuf_size;
|
||||
uint32_t pmubuf_offset; //32bytes
|
||||
uint32_t arraybase_0_L;
|
||||
uint32_t arraybase_0_H;
|
||||
uint32_t arraybase_1_L;
|
||||
uint32_t arraybase_1_H; //48bytes
|
||||
uint32_t arraybase_2_L;
|
||||
uint32_t arraybase_2_H;
|
||||
uint32_t arraybase_3_L;
|
||||
uint32_t arraybase_3_H; //64bytes
|
||||
|
||||
uint32_t arraybase_4_L;
|
||||
uint32_t arraybase_4_H;
|
||||
uint32_t arraybase_5_L;
|
||||
uint32_t arraybase_5_H;
|
||||
uint32_t arraybase_6_L;
|
||||
uint32_t arraybase_6_H;
|
||||
uint32_t arraybase_7_L;
|
||||
uint32_t arraybase_7_H;
|
||||
uint32_t reserve[8]; //128bytes, 128bytes align
|
||||
} dma_hdr_t;
|
||||
|
||||
typedef struct {
|
||||
uint32_t version;
|
||||
uint32_t npu_num;
|
||||
uint32_t eu_num;
|
||||
uint32_t lmem_size;
|
||||
uint32_t lmem_banks;
|
||||
uint32_t lmem_bank_size;
|
||||
} bmk_chip_info_t;
|
||||
|
||||
#define FLOAT_SIZE 4
|
||||
#define INT8_SIZE 1
|
||||
#define BF16_SIZE 2
|
||||
|
||||
#define UNUSED(x) (void)(x)
|
||||
|
||||
#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask))
|
||||
#define ALIGN(x,a) __ALIGN_MASK(x,(__typeof__(x))(a)-1)
|
||||
#define ALIGN_DOWN(x, a) ((x) / (a) * (a))
|
||||
|
||||
#define math_min(x, y) ((x) < (y) ? (x) : (y))
|
||||
#define math_max(x, y) ((x) > (y) ? (x) : (y))
|
||||
|
||||
/*
 * Count trailing zero bits of num — the shift k for which
 * num == (num >> k) << k with (num >> k) odd.  Used to turn a
 * power-of-two quantity into its shift amount.
 *
 * Fix: the original looped forever for num == 0, because 0 has no set
 * bit and the "while lowest bit is clear" loop never terminated.
 * Return 0 for that input instead of hanging.
 */
static inline int get_num_shift(uint64_t num)
{
  if (num == 0)
    return 0;  /* no set bit; 0 trailing zeros by convention here */
  int n = 0;
  while (!(num & 1)) {
    n++;
    num >>= 1;
  }
  return n;
}
|
||||
|
||||
typedef struct {
|
||||
uint32_t dim;
|
||||
uint32_t n;
|
||||
uint32_t c;
|
||||
union {
|
||||
uint32_t h;
|
||||
uint32_t row;
|
||||
};
|
||||
union {
|
||||
uint32_t w;
|
||||
uint32_t col;
|
||||
};
|
||||
} shape_t;
|
||||
|
||||
shape_t shape_t4(int n, int c, int h, int w);
|
||||
shape_t shape_t3(int d3, int d2, int d1);
|
||||
shape_t shape_t2(int row, int col);
|
||||
shape_t shape_t1(int len);
|
||||
|
||||
uint8_t shape_equal(shape_t s1, shape_t s2);
|
||||
|
||||
typedef struct {
|
||||
uint32_t n;
|
||||
uint32_t c;
|
||||
union {
|
||||
uint32_t h;
|
||||
uint32_t row;
|
||||
};
|
||||
union {
|
||||
uint32_t w;
|
||||
uint32_t col;
|
||||
};
|
||||
} stride_t;
|
||||
|
||||
static inline stride_t stride_st4(int n, int c, int h, int w)
|
||||
{
|
||||
stride_t st;
|
||||
st.n = n;
|
||||
st.c = c;
|
||||
st.h = h;
|
||||
st.w = w;
|
||||
return st;
|
||||
}
|
||||
|
||||
typedef uint32_t ctrl_t;
|
||||
#define CTRL_NULL 0
|
||||
#define CTRL_AL (1 << 0) // alloc aligned with EU_NUM
|
||||
#define CTRL_RA (1 << 2) // result add
|
||||
#define CTRL_BN (1 << 3) // B_N_is_1 broadcast to A
|
||||
#define CTRL_TP (1 << 5) // transpose
|
||||
#define CTRL_ADDR_ALIGN (1 << 7)
|
||||
#define CTRL_RELU (1 << 8)
|
||||
#define CTRL_KFLIP (1 << 9) // kernel flip
|
||||
#define CTRL_WEIGHT (1 << 10) // mark weight address in GDMA
|
||||
#define CTRL_NEURON (1 << 11) // mark neuron address in GDMA
|
||||
#define CTRL_WINOGRAD (1 << 12) // GDMA reshap winograd kernel
|
||||
#define CTRL_WINOGRAD_SCALE_FACTOR (1 << 28) // GDMA reshap winograd kernel
|
||||
|
||||
typedef uint32_t tl_type;
|
||||
#define TL_TYPE_TENSOR 0
|
||||
#define TL_TYPE_TENSOR_PREALLOC 1
|
||||
#define TL_TYPE_CONSTANT 2
|
||||
#define TL_TYPE_SLICE 3
|
||||
#define TL_TYPE_CLONE 4
|
||||
|
||||
typedef union {
|
||||
uint32_t reg_val;
|
||||
float fp32_val;
|
||||
} const_fp32_t;
|
||||
|
||||
typedef union {
|
||||
laddr_t laddr;
|
||||
const_fp32_t const_fp32;
|
||||
} opd_t;
|
||||
|
||||
typedef struct {
|
||||
tl_type type;
|
||||
opd_t operand;
|
||||
shape_t shape;
|
||||
stride_t *stride;
|
||||
uint8_t aligned;
|
||||
fmt_t fmt;
|
||||
uint32_t bank_id;
|
||||
int reserved_size;
|
||||
} tensor_lmem;
|
||||
|
||||
/* Local-memory address of a tensor; constants carry a value instead of
 * an address, so they report LADDR_INVALID. */
static inline laddr_t tl_address(tensor_lmem *tlp)
{
  return (tlp->type == TL_TYPE_CONSTANT) ? LADDR_INVALID : tlp->operand.laddr;
}
|
||||
|
||||
void tl_reshape(tensor_lmem * tlp, shape_t shape);
|
||||
|
||||
typedef struct {
|
||||
uint64_t addr;
|
||||
shape_t shape;
|
||||
stride_t stride;
|
||||
} tensor_gmem;
|
||||
|
||||
/* A global-memory tensor is treated as a matrix when it is 2-dimensional. */
static inline int tg_is_matrix(tensor_gmem *t)
{
  return (t->shape.dim == 2) ? 1 : 0;
}
|
||||
|
||||
/* Row count of a global tensor viewed as a matrix. */
static inline int tg_matrix_row(tensor_gmem *t)
{
  return (int)t->shape.row;
}
|
||||
|
||||
/* Column count of a global tensor viewed as a matrix. */
static inline int tg_matrix_col(tensor_gmem *t)
{
  return (int)t->shape.col;
}
|
||||
|
||||
/* True when the local tensor holds a constant rather than an address. */
static inline int tl_is_const(tensor_lmem *tlp)
{
  return (tlp->type == TL_TYPE_CONSTANT) ? 1 : 0;
}
|
||||
|
||||
/* True when the local tensor was placed at a pre-allocated address. */
static inline int tl_is_prealloc(tensor_lmem *tlp)
{
  return (tlp->type == TL_TYPE_TENSOR_PREALLOC) ? 1 : 0;
}
|
||||
|
||||
/* 1-D and 2-D local tensors are both treated as matrices. */
static inline int tl_is_matrix(tensor_lmem *tlp)
{
  switch (tlp->shape.dim) {
  case 1:
  case 2:
    return 1;
  default:
    return 0;
  }
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __BM_KERNEL_LEGACY_H__ */
|
||||
72
cvikernel/include/bmkernel/bm_regcpu.h
Normal file
72
cvikernel/include/bmkernel/bm_regcpu.h
Normal file
@ -0,0 +1,72 @@
|
||||
/*
|
||||
* Copyright Bitmain Technologies Inc.
|
||||
*
|
||||
* Written by:
|
||||
* Wanwei CAI <wanwei.cai@bitmain.com>
|
||||
* Created Time: 2017-06-29 15:33
|
||||
*/
|
||||
|
||||
#ifndef _BM_REG_CPU_H
|
||||
#define _BM_REG_CPU_H
|
||||
|
||||
#include <bmkernel/bm_kernel.h>
|
||||
|
||||
#define CPU_ENGINE_DESCRIPTOR_NUM 56
|
||||
#define CPU_ENGINE_DESCRIPTOR_DMA_NUM CPU_ENGINE_DESCRIPTOR_NUM
|
||||
#define CPU_ENGINE_BYTES (CPU_ENGINE_DESCRIPTOR_NUM*sizeof(uint32_t))
|
||||
#define CPU_ENGINE_STR_LIMIT_BYTE (CPU_ENGINE_DESCRIPTOR_NUM - 7) * sizeof(uint32_t)
|
||||
|
||||
#define CPU_CMD_ACCPI0 0
|
||||
#define CPU_CMD_ACCPI1 1
|
||||
#define CPU_CMD_ACCPI2 2
|
||||
#define CPU_CMD_ACCPI3 3
|
||||
#define CPU_CMD_ACCPI4 4
|
||||
/* CPU_CMD_ACCPI5 ~ CPU_CMD_ACCPI63
|
||||
defined here if needed */
|
||||
|
||||
#define CPU_ACCPI0_OP_BIT 0
|
||||
#define CPU_ACCPI1_BD_CMDID_BIT 0
|
||||
#define CPU_ACCPI1_CPU_CMDID_BIT 16
|
||||
#define CPU_ACCPI2_GDMA_CMDID_BIT 0
|
||||
#define CPU_ACCPI2_CDMA_CMDID_BIT 16
|
||||
#define CPU_ACCPI3_NEXT_BD_ADDR_BIT 0
|
||||
#define CPU_ACCPI4_NEXT_GDMA_ADDR_BIT 0
|
||||
#define CPU_ACCPI5_NEXT_CDMA_ADDR_BIT 0
|
||||
|
||||
typedef enum {
|
||||
CPU_OP_SYNC = 2,
|
||||
CPU_OP_INST = 3,
|
||||
CPU_OP_END
|
||||
} CPU_OP;
|
||||
|
||||
// CPU common structure
|
||||
typedef struct {
|
||||
uint32_t regs[CPU_ENGINE_DESCRIPTOR_NUM];
|
||||
} bmk_cpu_desc_t;
|
||||
|
||||
// CPU_OP_SYNC structure
|
||||
typedef struct {
|
||||
uint32_t op_type; // CPU_CMD_ACCPI0
|
||||
uint32_t num_bd; // CPU_CMD_ACCPI1
|
||||
uint32_t num_gdma; // CPU_CMD_ACCPI2
|
||||
uint32_t offset_bd; // CPU_CMD_ACCPI3
|
||||
uint32_t offset_gdma; // CPU_CMD_ACCPI4
|
||||
uint32_t reserved[2]; // CPU_CMD_ACCPI5-CPU_CMD_ACCPI6
|
||||
char str[CPU_ENGINE_STR_LIMIT_BYTE];
|
||||
} __attribute__((packed)) bmk_cpu_sync_desc_t;
|
||||
|
||||
// CPU_OP_INST structure
|
||||
#define CPU_INST_HEADER_COUNT 12
|
||||
typedef struct {
|
||||
uint32_t op_type; // CPU_CMD_ACCPI0
|
||||
uint32_t num_bd; // CPU_CMD_ACCPI1
|
||||
uint32_t num_gdma; // CPU_CMD_ACCPI2
|
||||
uint32_t offset_bd; // CPU_CMD_ACCPI3
|
||||
uint32_t offset_gdma; // CPU_CMD_ACCPI4
|
||||
uint32_t reserved[2]; // CPU_CMD_ACCPI5-CPU_CMD_ACCPI6
|
||||
char lib_name[4*sizeof(uint32_t)]; // CPU_CMD_ACCPI7~CPU_CMD_ACCPI10
|
||||
uint32_t param_size; //CPU_CMD_ACCPI11
|
||||
uint8_t param[0];
|
||||
} __attribute__((packed)) bmk_cpu_inst_desc_t;
|
||||
|
||||
#endif
|
||||
37
cvikernel/include/bmkernel/reg_bdcast.h
Normal file
37
cvikernel/include/bmkernel/reg_bdcast.h
Normal file
@ -0,0 +1,37 @@
|
||||
#ifndef REG_BDCAST_H
|
||||
#define REG_BDCAST_H
|
||||
|
||||
#define BD_ENGINE_DESCRIPTOR_NUM 28
|
||||
#define BD_REG_BYTES (BD_ENGINE_DESCRIPTOR_NUM * 4)
|
||||
#define BDC_ENGINE_CMD_ALIGNED_BIT 8
|
||||
|
||||
#define BD_CMD_BASE_ADDR (TIU_ENGINE_BASE_ADDR + 0)
|
||||
#define BD_CTRL_BASE_ADDR (TIU_ENGINE_BASE_ADDR + 0x100)
|
||||
#define BD_ENGINE_MAIN_CTRL (TIU_ENGINE_BASE_ADDR + 0)
|
||||
#define BD_ENGINE_DESC_ADDR (TIU_ENGINE_BASE_ADDR + 0x4)
|
||||
|
||||
//
|
||||
// BD operations for BIRD
|
||||
//
|
||||
#define DCR_TYPE_CONV_FIX8B 0
|
||||
#define DCR_TYPE_DEPTHWISE_POOL_FIX8B 1
|
||||
#define DCR_TYPE_FC_FIX8B 2
|
||||
#define DCR_TYPE_TENSOR_ARITH_FIX8B 3
|
||||
#define DCR_TYPE_FC_TYPE_2_FIX8B 4
|
||||
|
||||
// BD control bits base on BD_CTRL_BASE_ADDR
|
||||
#define BD_TPU_EN 0 // TPU Enable bit
|
||||
#define BD_LANE_NUM 22 // Lane number bit[29:22]
|
||||
#define BD_DES_ADDR_VLD 30 // enable descriptor mode
|
||||
#define BD_INTR_ENABLE 31 // TIU interrupt global enable
|
||||
|
||||
typedef enum _TIU_LANNUM {
|
||||
TIU_LANNUM_2 = 0x1,
|
||||
TIU_LANNUM_4 = 0x2,
|
||||
TIU_LANNUM_8 = 0x3,
|
||||
TIU_LANNUM_16 = 0x4,
|
||||
TIU_LANNUM_32 = 0x5,
|
||||
TIU_LANNUM_64 = 0x6,
|
||||
} TIU_LANNUM;
|
||||
|
||||
#endif /* REG_BDCAST_H */
|
||||
98
cvikernel/include/bmkernel/reg_tdma.h
Normal file
98
cvikernel/include/bmkernel/reg_tdma.h
Normal file
@ -0,0 +1,98 @@
|
||||
#ifndef REG_GDMA_H
|
||||
#define REG_GDMA_H
|
||||
|
||||
#define TDMA_DESC_REG_BYTES (0x40)
|
||||
#define TDMA_ENGINE_DESCRIPTOR_NUM (TDMA_DESC_REG_BYTES >> 2)
|
||||
#define TDMA_NUM_BASE_REGS (0x8)
|
||||
|
||||
//backward compatible?
|
||||
#define GDMA_TYPE_f32 0
|
||||
#define GDMA_TYPE_f16 1
|
||||
#define GDMA_TYPE_i32 2
|
||||
#define GDMA_TYPE_i16 3
|
||||
#define GDMA_TYPE_i8 4
|
||||
#define GDMA_TYPE_i4 5
|
||||
#define GDMA_TYPE_i2 6
|
||||
#define GDMA_TYPE_i1 7
|
||||
#define LAST_GDMA_TYPE_i1 8
|
||||
|
||||
|
||||
//tdma descriptor define
|
||||
#define TDMA_DESCRIPTOR_ALIGNED_BIT 6
|
||||
|
||||
#define TDMA_CMD_ACCP0 0
|
||||
#define TDMA_CMD_ACCP1 4
|
||||
#define TDMA_CMD_ACCP2 8
|
||||
#define TDMA_CMD_ACCP3 12
|
||||
#define TDMA_CMD_ACCP4 16
|
||||
#define TDMA_CMD_ACCP5 20
|
||||
#define TDMA_CMD_ACCP6 24
|
||||
#define TDMA_CMD_ACCP7 28
|
||||
#define TDMA_CMD_ACCP8 32
|
||||
#define TDMA_CMD_ACCP9 36
|
||||
#define TDMA_CMD_ACCP10 40
|
||||
#define TDMA_CMD_ACCP11 44
|
||||
#define TDMA_CMD_ACCP12 48
|
||||
#define TDMA_CMD_ACCP13 52
|
||||
#define TDMA_CMD_ACCP14 56
|
||||
|
||||
#define TDMA_ACCPI0_CMD_VALID_BIT 0
|
||||
#define TDMA_ACCPI0_EOD_BIT 2
|
||||
#define TDMA_ACCPI0_INTERRUPT_BIT 3
|
||||
#define TDMA_ACCPI0_BARRIER_ENABLE_BIT 4
|
||||
|
||||
|
||||
//tdma control define
|
||||
#define TDMA_CTRL (TDMA_ENGINE_BASE_ADDR + 0x0)
|
||||
#define TDMA_DES_BASE (TDMA_ENGINE_BASE_ADDR + 0x4)
|
||||
#define TDMA_INT_MASK (TDMA_ENGINE_BASE_ADDR + 0x8)
|
||||
#define TDMA_SYNC_STATUS (TDMA_ENGINE_BASE_ADDR + 0xC)
|
||||
#define TDMA_ARRAYBASE0_L (TDMA_ENGINE_BASE_ADDR + 0x70)
|
||||
#define TDMA_ARRAYBASE1_L (TDMA_ENGINE_BASE_ADDR + 0x74)
|
||||
#define TDMA_ARRAYBASE2_L (TDMA_ENGINE_BASE_ADDR + 0x78)
|
||||
#define TDMA_ARRAYBASE3_L (TDMA_ENGINE_BASE_ADDR + 0x7C)
|
||||
#define TDMA_ARRAYBASE4_L (TDMA_ENGINE_BASE_ADDR + 0x80)
|
||||
#define TDMA_ARRAYBASE5_L (TDMA_ENGINE_BASE_ADDR + 0x84)
|
||||
#define TDMA_ARRAYBASE6_L (TDMA_ENGINE_BASE_ADDR + 0x88)
|
||||
#define TDMA_ARRAYBASE7_L (TDMA_ENGINE_BASE_ADDR + 0x8C)
|
||||
#define TDMA_ARRAYBASE0_H (TDMA_ENGINE_BASE_ADDR + 0x90)
|
||||
#define TDMA_ARRAYBASE1_H (TDMA_ENGINE_BASE_ADDR + 0x94)
|
||||
#define TDMA_DEBUG_MODE (TDMA_ENGINE_BASE_ADDR + 0xA0)
|
||||
|
||||
|
||||
|
||||
#define TDMA_CTRL_ENABLE_BIT 0
|
||||
#define TDMA_CTRL_MODESEL_BIT 1
|
||||
#define TDMA_CTRL_RESET_SYNCID_BIT 2
|
||||
#define TDMA_CTRL_FORCE_1ARRAY 5
|
||||
#define TDMA_CTRL_FORCE_2ARRAY 6
|
||||
#define TDMA_CTRL_BURSTLEN_BIT 8
|
||||
#define TDMA_CTRL_64BYTE_ALIGN_EN 10
|
||||
#define TDMA_CTRL_DESNUM_BIT 16
|
||||
|
||||
|
||||
|
||||
|
||||
//This function only supports the following condition
|
||||
//localmem2tensor or tensor2localmem
|
||||
//The source and dst shares the the same format
|
||||
//Data is 32 bit
|
||||
//no stride
|
||||
//We use it in the forward_cpu backward_cpu
|
||||
static inline int get_index_data_format(int size)
|
||||
{
|
||||
if (size == 1) {
|
||||
return GDMA_TYPE_i1;
|
||||
} else if (size <= 16) {
|
||||
return GDMA_TYPE_i4;
|
||||
} else if (size <= 256){
|
||||
return GDMA_TYPE_i8;
|
||||
} else {
|
||||
return GDMA_TYPE_i16;
|
||||
}
|
||||
}
|
||||
#define LRN_LEFT_SHIFT 0
|
||||
#define LRN_RIGHT_SHIFT 1
|
||||
|
||||
#endif /* REG_GDMA_H */
|
||||
|
||||
20
cvikernel/include/bmkernel/reg_tiu.h
Normal file
20
cvikernel/include/bmkernel/reg_tiu.h
Normal file
@ -0,0 +1,20 @@
|
||||
#ifndef REG_TIU_H
|
||||
#define REG_TIU_H
|
||||
|
||||
#define TIU_DESC_REG_BYTES (0x70)
|
||||
#define TIU_ENGINE_DESCRIPTOR_NUM (TIU_DESC_REG_BYTES >> 2)
|
||||
|
||||
// TIU operation data type
|
||||
#define DCR_TYPE_CONV_FIX8B 0
|
||||
#define DCR_TYPE_DEPTHWISE_POOL_FIX8B 1
|
||||
#define DCR_TYPE_FC_FIX8B 2
|
||||
#define DCR_TYPE_TENSOR_ARITH_FIX8B 3
|
||||
#define NR_DCR_TYPES 4
|
||||
|
||||
// BD control bits base on BD_CTRL_BASE_ADDR
|
||||
#define BD_TPU_EN 0 // TPU Enable bit
|
||||
#define BD_LANE_NUM 22 // Lane number bit[29:22]
|
||||
#define BD_DES_ADDR_VLD 30 // enable descriptor mode
|
||||
#define BD_INTR_ENABLE 31 // TIU interrupt global enable
|
||||
|
||||
#endif /* REG_TIU_H */
|
||||
310
cvikernel/include/cvikernel/cv180x/cv180x_tdma_reg.h
Normal file
310
cvikernel/include/cvikernel/cv180x/cv180x_tdma_reg.h
Normal file
@ -0,0 +1,310 @@
|
||||
#ifndef CV180X_TDMA_REG_H
|
||||
#define CV180X_TDMA_REG_H
|
||||
|
||||
/*
|
||||
* This file is generated by tools. Do not edit it manually.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#define TDMA_DESC_REG_BYTES (0x40)
|
||||
#define TDMA_ENGINE_DESCRIPTOR_NUM (TDMA_DESC_REG_BYTES >> 2)
|
||||
#define TDMA_NUM_BASE_REGS (0x8)
|
||||
|
||||
typedef unsigned long long ullong;
|
||||
|
||||
typedef struct {
|
||||
uint32_t vld;
|
||||
uint32_t compress_en;
|
||||
uint32_t eod;
|
||||
uint32_t intp_en;
|
||||
uint32_t bar_en;
|
||||
uint32_t check_bf16_value;
|
||||
uint32_t trans_dir;
|
||||
uint32_t rsv00;
|
||||
uint32_t trans_fmt;
|
||||
uint32_t transpose_md;
|
||||
uint32_t rsv01;
|
||||
uint32_t intra_cmd_paral;
|
||||
uint32_t outstanding_en;
|
||||
uint32_t cmd_id;
|
||||
uint32_t spec_func;
|
||||
uint32_t dst_fmt;
|
||||
uint32_t src_fmt;
|
||||
uint32_t cmprs_fmt;
|
||||
uint32_t sys_dtype;
|
||||
uint32_t rsv2_1;
|
||||
uint32_t int8_sign;
|
||||
uint32_t compress_zero_guard;
|
||||
uint32_t int8_rnd_mode;
|
||||
uint32_t wait_id_tpu;
|
||||
uint32_t wait_id_other_tdma;
|
||||
uint32_t wait_id_sdma;
|
||||
uint32_t const_val;
|
||||
uint32_t src_base_reg_sel;
|
||||
uint32_t mv_lut_idx;
|
||||
uint32_t dst_base_reg_sel;
|
||||
uint32_t mv_lut_base;
|
||||
uint32_t rsv4_5;
|
||||
uint32_t dst_h_stride;
|
||||
uint32_t dst_c_stride_low;
|
||||
uint32_t dst_n_stride;
|
||||
uint32_t src_h_stride;
|
||||
uint32_t src_c_stride_low;
|
||||
uint32_t src_n_stride;
|
||||
uint32_t dst_c;
|
||||
uint32_t src_c;
|
||||
uint32_t dst_w;
|
||||
uint32_t dst_h;
|
||||
uint32_t src_w;
|
||||
uint32_t src_h;
|
||||
uint32_t dst_base_addr_low;
|
||||
uint32_t src_base_addr_low;
|
||||
uint32_t src_n;
|
||||
uint32_t dst_base_addr_high;
|
||||
uint32_t src_base_addr_high;
|
||||
uint32_t src_c_stride_high;
|
||||
uint32_t dst_c_stride_high;
|
||||
uint32_t compress_bias0;
|
||||
uint32_t compress_bias1;
|
||||
uint32_t layer_ID;
|
||||
} tdma_reg_t;
|
||||
|
||||
/* Decode a 16-word TDMA descriptor image into the unpacked register view.
 * The bit layout is the exact inverse of emit_tdma_reg; the file header
 * says this layout is tool-generated, so field order/offsets must not be
 * hand-tuned. */
static inline void parse_tdma_reg(tdma_reg_t *r, const uint32_t *p)
{
/* Extract `width` bits of word `word` starting at bit `off`.
 * width == 32 is special-cased because a 32-bit shift of 1u is undefined. */
#define CV_FIELD(word, off, width) \
  (((width) == 32) ? p[(word)] : ((p[(word)] >> (off)) & ((1u << (width)) - 1u)))
  r->vld                 = CV_FIELD(0, 0, 1);
  r->compress_en         = CV_FIELD(0, 1, 1);
  r->eod                 = CV_FIELD(0, 2, 1);
  r->intp_en             = CV_FIELD(0, 3, 1);
  r->bar_en              = CV_FIELD(0, 4, 1);
  r->check_bf16_value    = CV_FIELD(0, 5, 1);
  r->trans_dir           = CV_FIELD(0, 6, 2);
  r->rsv00               = CV_FIELD(0, 8, 2);
  r->trans_fmt           = CV_FIELD(0, 10, 1);
  r->transpose_md        = CV_FIELD(0, 11, 2);
  r->rsv01               = CV_FIELD(0, 13, 1);
  r->intra_cmd_paral     = CV_FIELD(0, 14, 1);
  r->outstanding_en      = CV_FIELD(0, 15, 1);
  r->cmd_id              = CV_FIELD(0, 16, 16);
  r->spec_func           = CV_FIELD(1, 0, 3);
  r->dst_fmt             = CV_FIELD(1, 3, 2);
  r->src_fmt             = CV_FIELD(1, 5, 2);
  r->cmprs_fmt           = CV_FIELD(1, 7, 1);
  r->sys_dtype           = CV_FIELD(1, 8, 1);
  r->rsv2_1              = CV_FIELD(1, 9, 4);
  r->int8_sign           = CV_FIELD(1, 13, 1);
  r->compress_zero_guard = CV_FIELD(1, 14, 1);
  r->int8_rnd_mode       = CV_FIELD(1, 15, 1);
  r->wait_id_tpu         = CV_FIELD(1, 16, 16);
  r->wait_id_other_tdma  = CV_FIELD(2, 0, 16);
  r->wait_id_sdma        = CV_FIELD(2, 16, 16);
  r->const_val           = CV_FIELD(3, 0, 16);
  r->src_base_reg_sel    = CV_FIELD(3, 16, 3);
  r->mv_lut_idx          = CV_FIELD(3, 19, 1);
  r->dst_base_reg_sel    = CV_FIELD(3, 20, 3);
  r->mv_lut_base         = CV_FIELD(3, 23, 1);
  r->rsv4_5              = CV_FIELD(3, 24, 8);
  r->dst_h_stride        = CV_FIELD(4, 0, 16);
  r->dst_c_stride_low    = CV_FIELD(4, 16, 16);
  r->dst_n_stride        = CV_FIELD(5, 0, 32);
  r->src_h_stride        = CV_FIELD(6, 0, 16);
  r->src_c_stride_low    = CV_FIELD(6, 16, 16);
  r->src_n_stride        = CV_FIELD(7, 0, 32);
  r->dst_c               = CV_FIELD(8, 0, 16);
  r->src_c               = CV_FIELD(8, 16, 16);
  r->dst_w               = CV_FIELD(9, 0, 16);
  r->dst_h               = CV_FIELD(9, 16, 16);
  r->src_w               = CV_FIELD(10, 0, 16);
  r->src_h               = CV_FIELD(10, 16, 16);
  r->dst_base_addr_low   = CV_FIELD(11, 0, 32);
  r->src_base_addr_low   = CV_FIELD(12, 0, 32);
  r->src_n               = CV_FIELD(13, 0, 16);
  r->dst_base_addr_high  = CV_FIELD(13, 16, 8);
  r->src_base_addr_high  = CV_FIELD(13, 24, 8);
  r->src_c_stride_high   = CV_FIELD(14, 0, 16);
  r->dst_c_stride_high   = CV_FIELD(14, 16, 16);
  r->compress_bias0      = CV_FIELD(15, 0, 8);
  r->compress_bias1      = CV_FIELD(15, 8, 8);
  r->layer_ID            = CV_FIELD(15, 16, 16);
#undef CV_FIELD
}
|
||||
|
||||
static inline void emit_tdma_reg(const tdma_reg_t *r, uint32_t *_p)
|
||||
{
|
||||
volatile uint32_t *p = (typeof(p))_p;
|
||||
p[15] = (r->compress_bias0 & ((1u << 8) - 1)) |
|
||||
((r->compress_bias1 & ((1u << 8) - 1)) << 8) |
|
||||
((r->layer_ID & ((1u << 16) - 1)) << 16);
|
||||
p[14] = (r->src_c_stride_high & ((1u << 16) - 1)) |
|
||||
((r->dst_c_stride_high & ((1u << 16) - 1)) << 16);
|
||||
p[13] = (r->src_n & ((1u << 16) - 1)) |
|
||||
((r->dst_base_addr_high & ((1u << 8) - 1)) << 16) |
|
||||
((r->src_base_addr_high & ((1u << 8) - 1)) << 24);
|
||||
p[12] = (r->src_base_addr_low & (((uint64_t)1 << 32) - 1));
|
||||
p[11] = (r->dst_base_addr_low & (((uint64_t)1 << 32) - 1));
|
||||
p[10] = (r->src_w & ((1u << 16) - 1)) |
|
||||
((r->src_h & ((1u << 16) - 1)) << 16);
|
||||
p[9] = (r->dst_w & ((1u << 16) - 1)) |
|
||||
((r->dst_h & ((1u << 16) - 1)) << 16);
|
||||
p[8] = (r->dst_c & ((1u << 16) - 1)) |
|
||||
((r->src_c & ((1u << 16) - 1)) << 16);
|
||||
p[7] = (r->src_n_stride & (((uint64_t)1 << 32) - 1));
|
||||
p[6] = (r->src_h_stride & ((1u << 16) - 1)) |
|
||||
((r->src_c_stride_low & ((1u << 16) - 1)) << 16);
|
||||
p[5] = (r->dst_n_stride & (((uint64_t)1 << 32) - 1));
|
||||
p[4] = (r->dst_h_stride & ((1u << 16) - 1)) |
|
||||
((r->dst_c_stride_low & ((1u << 16) - 1)) << 16);
|
||||
p[3] = (r->const_val & ((1u << 16) - 1)) |
|
||||
((r->src_base_reg_sel & ((1u << 3) - 1)) << 16) |
|
||||
((r->mv_lut_idx & 1) << 19) |
|
||||
((r->dst_base_reg_sel & ((1u << 3) - 1)) << 20) |
|
||||
((r->mv_lut_base & 1) << 23) |
|
||||
((r->rsv4_5 & ((1u << 8) - 1)) << 24);
|
||||
p[2] = (r->wait_id_other_tdma & ((1u << 16) - 1)) |
|
||||
((r->wait_id_sdma & ((1u << 16) - 1)) << 16);
|
||||
p[1] = (r->spec_func & ((1u << 3) - 1)) |
|
||||
((r->dst_fmt & ((1u << 2) - 1)) << 3) |
|
||||
((r->src_fmt & ((1u << 2) - 1)) << 5) |
|
||||
((r->cmprs_fmt & 1) << 7) |
|
||||
((r->sys_dtype & 1) << 8) |
|
||||
((r->rsv2_1 & ((1u << 4) - 1)) << 9) |
|
||||
((r->int8_sign & 1) << 13) |
|
||||
((r->compress_zero_guard & 1) << 14) |
|
||||
((r->int8_rnd_mode & 1) << 15) |
|
||||
((r->wait_id_tpu & ((1u << 16) - 1)) << 16);
|
||||
p[0] = (r->vld & 1) |
|
||||
((r->compress_en & 1) << 1) |
|
||||
((r->eod & 1) << 2) |
|
||||
((r->intp_en & 1) << 3) |
|
||||
((r->bar_en & 1) << 4) |
|
||||
((r->check_bf16_value & 1) << 5) |
|
||||
((r->trans_dir & ((1u << 2) - 1)) << 6) |
|
||||
((r->rsv00 & ((1u << 2) - 1)) << 8) |
|
||||
((r->trans_fmt & 1) << 10) |
|
||||
((r->transpose_md & ((1u << 2) - 1)) << 11) |
|
||||
((r->rsv01 & 1) << 13) |
|
||||
((r->intra_cmd_paral & 1) << 14) |
|
||||
((r->outstanding_en & 1) << 15) |
|
||||
((r->cmd_id & ((1u << 16) - 1)) << 16);
|
||||
}
|
||||
|
||||
/* Restore a TDMA descriptor to its defaults: control, wait-id, address
 * and compression fields are zeroed; data formats, geometry (n/c/h/w)
 * and stride fields default to 1. */
static inline void reset_tdma_reg(tdma_reg_t *r)
{
  /* Zero-valued defaults. */
  r->vld = r->compress_en = r->eod = r->intp_en = r->bar_en = 0x0;
  r->check_bf16_value = r->trans_dir = r->rsv00 = r->trans_fmt = 0x0;
  r->transpose_md = r->rsv01 = r->intra_cmd_paral = r->outstanding_en = 0x0;
  r->cmd_id = r->spec_func = 0x0;
  r->cmprs_fmt = r->sys_dtype = r->rsv2_1 = 0x0;
  r->int8_sign = r->compress_zero_guard = r->int8_rnd_mode = 0x0;
  r->wait_id_tpu = r->wait_id_other_tdma = r->wait_id_sdma = 0x0;
  r->const_val = r->src_base_reg_sel = r->mv_lut_idx = 0x0;
  r->dst_base_reg_sel = r->mv_lut_base = r->rsv4_5 = 0x0;
  r->dst_base_addr_low = r->src_base_addr_low = 0x0;
  r->dst_base_addr_high = r->src_base_addr_high = 0x0;
  r->src_c_stride_high = r->dst_c_stride_high = 0x0;
  r->compress_bias0 = r->compress_bias1 = r->layer_ID = 0x0;

  /* One-valued defaults: formats, strides and tensor geometry. */
  r->dst_fmt = r->src_fmt = 0x1;
  r->dst_h_stride = r->dst_c_stride_low = r->dst_n_stride = 0x1;
  r->src_h_stride = r->src_c_stride_low = r->src_n_stride = 0x1;
  r->dst_c = r->src_c = r->dst_w = r->dst_h = 0x1;
  r->src_w = r->src_h = r->src_n = 0x1;
}
|
||||
|
||||
/* Dump every field of a TDMA descriptor to stdout, one line per field,
 * under a `tag` banner.  Debug aid only. */
static inline void trace_tdma_reg(tdma_reg_t *r, const char *tag)
{
/* Print one field as "name: 0xVALUE". */
#define CV_TRACE_FIELD(field) \
  printf(" %s: 0x%llx\n", #field, (ullong)r->field)

  printf("--- %s ---\n", tag);
  CV_TRACE_FIELD(vld);
  CV_TRACE_FIELD(compress_en);
  CV_TRACE_FIELD(eod);
  CV_TRACE_FIELD(intp_en);
  CV_TRACE_FIELD(bar_en);
  CV_TRACE_FIELD(check_bf16_value);
  CV_TRACE_FIELD(trans_dir);
  CV_TRACE_FIELD(rsv00);
  CV_TRACE_FIELD(trans_fmt);
  CV_TRACE_FIELD(transpose_md);
  CV_TRACE_FIELD(rsv01);
  CV_TRACE_FIELD(intra_cmd_paral);
  CV_TRACE_FIELD(outstanding_en);
  CV_TRACE_FIELD(cmd_id);
  CV_TRACE_FIELD(spec_func);
  CV_TRACE_FIELD(dst_fmt);
  CV_TRACE_FIELD(src_fmt);
  CV_TRACE_FIELD(cmprs_fmt);
  CV_TRACE_FIELD(sys_dtype);
  CV_TRACE_FIELD(rsv2_1);
  CV_TRACE_FIELD(int8_sign);
  CV_TRACE_FIELD(compress_zero_guard);
  CV_TRACE_FIELD(int8_rnd_mode);
  CV_TRACE_FIELD(wait_id_tpu);
  CV_TRACE_FIELD(wait_id_other_tdma);
  CV_TRACE_FIELD(wait_id_sdma);
  CV_TRACE_FIELD(const_val);
  CV_TRACE_FIELD(src_base_reg_sel);
  CV_TRACE_FIELD(mv_lut_idx);
  CV_TRACE_FIELD(dst_base_reg_sel);
  CV_TRACE_FIELD(mv_lut_base);
  CV_TRACE_FIELD(rsv4_5);
  CV_TRACE_FIELD(dst_h_stride);
  CV_TRACE_FIELD(dst_c_stride_low);
  CV_TRACE_FIELD(dst_n_stride);
  CV_TRACE_FIELD(src_h_stride);
  CV_TRACE_FIELD(src_c_stride_low);
  CV_TRACE_FIELD(src_n_stride);
  CV_TRACE_FIELD(dst_c);
  CV_TRACE_FIELD(src_c);
  CV_TRACE_FIELD(dst_w);
  CV_TRACE_FIELD(dst_h);
  CV_TRACE_FIELD(src_w);
  CV_TRACE_FIELD(src_h);
  CV_TRACE_FIELD(dst_base_addr_low);
  CV_TRACE_FIELD(src_base_addr_low);
  CV_TRACE_FIELD(src_n);
  CV_TRACE_FIELD(dst_base_addr_high);
  CV_TRACE_FIELD(src_base_addr_high);
  CV_TRACE_FIELD(src_c_stride_high);
  CV_TRACE_FIELD(dst_c_stride_high);
  CV_TRACE_FIELD(compress_bias0);
  CV_TRACE_FIELD(compress_bias1);
  CV_TRACE_FIELD(layer_ID);
#undef CV_TRACE_FIELD
}
|
||||
#endif /* CV180X_TDMA_REG_H */
|
||||
622
cvikernel/include/cvikernel/cv180x/cv180x_tiu_reg.h
Normal file
622
cvikernel/include/cvikernel/cv180x/cv180x_tiu_reg.h
Normal file
@ -0,0 +1,622 @@
|
||||
#ifndef CV180X_TIU_REG_H
|
||||
#define CV180X_TIU_REG_H
|
||||
|
||||
/*
|
||||
* This file is generated by tools. Do not edit it manually.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#define TIU_DESC_REG_BYTES (0x70)
|
||||
#define TIU_ENGINE_DESCRIPTOR_NUM (TIU_DESC_REG_BYTES >> 2)
|
||||
|
||||
// TIU operation data type
|
||||
#define DCR_TYPE_CONV_FIX8B 0
|
||||
#define DCR_TYPE_DEPTHWISE_POOL_FIX8B 1
|
||||
#define DCR_TYPE_FC_FIX8B 2
|
||||
#define DCR_TYPE_TENSOR_ARITH_FIX8B 3
|
||||
#define NR_DCR_TYPES 4
|
||||
|
||||
#define TENSOR_MUL_FIX8B 0
|
||||
#define TENSOR_MAC_FIX8B 1
|
||||
#define TENSOR_ADD_FIX8B 2
|
||||
#define TENSOR_SUB_FIX8B 3
|
||||
#define TENSOR_MAX_FIX8B 4
|
||||
#define TENSOR_MIN_FIX8B 5
|
||||
#define TENSOR_SHIFT_FIX8B 6
|
||||
#define TENSOR_AND_FIX8B 7
|
||||
#define TENSOR_OR_FIX8B 8
|
||||
#define TENSOR_XOR_FIX8B 9
|
||||
#define TENSOR_COPY_FIX8B 10
|
||||
#define TENSOR_GE_FIX8B 11
|
||||
|
||||
typedef unsigned long long ullong;
|
||||
|
||||
typedef struct {
|
||||
uint32_t cmd_en;
|
||||
uint32_t cmd_end;
|
||||
uint32_t cmd_id_en;
|
||||
uint32_t cmd_keep;
|
||||
uint32_t cmd_intr_en;
|
||||
uint32_t tsk_typ;
|
||||
uint32_t tsk_eu_typ;
|
||||
uint32_t tsk_opd_num;
|
||||
uint32_t opt_res_shift;
|
||||
uint32_t opt_left_shift;
|
||||
uint32_t opt_shift_typ;
|
||||
uint32_t opt_rshift_typ;
|
||||
uint32_t dummy1;
|
||||
uint32_t opd_typ;
|
||||
uint32_t opt_chl_quan;
|
||||
uint32_t cmd_id_tpu;
|
||||
uint32_t cmd_id_gdma;
|
||||
uint32_t quan_m;
|
||||
uint32_t opt_res0_sign;
|
||||
uint32_t opt_opd0_sign;
|
||||
uint32_t opt_opd1_sign;
|
||||
uint32_t opt_opd2_sign;
|
||||
uint32_t opt_res0_seg;
|
||||
uint32_t opt_opd0_seg;
|
||||
uint32_t opt_opd1_seg;
|
||||
uint32_t opt_opd2_seg;
|
||||
uint32_t ps32_md;
|
||||
uint32_t double_conv;
|
||||
uint32_t opt_left_tran;
|
||||
uint32_t fp_round_typ;
|
||||
uint32_t opt_relu_typ;
|
||||
uint32_t opt_relu_value;
|
||||
uint32_t cmd_pre_exe_typ;
|
||||
uint32_t opt_res_add;
|
||||
uint32_t rsvd0;
|
||||
uint32_t conv_opd0_x_ins0;
|
||||
uint32_t conv_opd0_y_ins0;
|
||||
uint32_t conv_opd0_x_ins0_last;
|
||||
uint32_t conv_opd0_y_ins0_last;
|
||||
uint32_t conv_opd1_x_ins0;
|
||||
uint32_t conv_opd1_y_ins0;
|
||||
uint32_t dummy0;
|
||||
uint32_t opd0_ins_val;
|
||||
uint32_t conv_opd0_up_pad;
|
||||
uint32_t conv_opd0_dn_pad;
|
||||
uint32_t conv_opd0_lf_pad;
|
||||
uint32_t conv_opd0_rt_pad;
|
||||
uint32_t res0_n;
|
||||
uint32_t res0_c;
|
||||
uint32_t res0_h;
|
||||
uint32_t res0_w;
|
||||
uint32_t conv_op_x_str;
|
||||
uint32_t conv_op_y_str;
|
||||
uint32_t cmd_pre_exe;
|
||||
uint32_t rsvd1;
|
||||
uint32_t res0_addr;
|
||||
uint32_t opd0_addr;
|
||||
uint32_t opd1_addr;
|
||||
uint32_t opd2_addr;
|
||||
uint32_t opt_opd0_const;
|
||||
uint32_t opt_opd1_const;
|
||||
uint32_t opt_opd2_const;
|
||||
uint32_t short_nchwstr_same;
|
||||
uint32_t short_res0_str;
|
||||
uint32_t short_opd0_str;
|
||||
uint32_t short_opd1_str;
|
||||
uint32_t short_opd2_str;
|
||||
uint32_t dummy2;
|
||||
uint32_t opd0_n;
|
||||
uint32_t opd0_c;
|
||||
uint32_t dummy3;
|
||||
uint32_t rsvd2;
|
||||
uint32_t opd0_h;
|
||||
uint32_t opd0_w;
|
||||
uint32_t opd1_n;
|
||||
uint32_t opd1_c;
|
||||
uint32_t opd1_h;
|
||||
uint32_t opd1_w;
|
||||
uint32_t opd2_n;
|
||||
uint32_t opd2_c;
|
||||
uint32_t opd2_h;
|
||||
uint32_t opd2_w;
|
||||
uint32_t dummy4;
|
||||
uint32_t rsvd3;
|
||||
uint32_t layer_info;
|
||||
uint32_t res0_n_str;
|
||||
uint32_t res0_c_str;
|
||||
uint32_t res0_h_str;
|
||||
uint32_t res0_w_str;
|
||||
uint32_t res0_b_str;
|
||||
uint32_t opd0_n_str;
|
||||
uint32_t dummy5;
|
||||
uint32_t rsvd4;
|
||||
uint32_t opd0_c_str;
|
||||
uint32_t opd0_h_str;
|
||||
uint32_t opd0_w_str;
|
||||
uint32_t opd0_b_str;
|
||||
uint32_t opd1_n_str;
|
||||
uint32_t opd1_c_str;
|
||||
uint32_t opd1_h_str;
|
||||
uint32_t dummy6;
|
||||
uint32_t rsvd5;
|
||||
uint32_t opd1_w_str;
|
||||
uint32_t opd1_b_str;
|
||||
uint32_t opd2_n_str;
|
||||
uint32_t opd2_c_str;
|
||||
uint32_t opd2_h_str;
|
||||
uint32_t opd2_w_str;
|
||||
uint32_t opd2_b_str;
|
||||
uint32_t dummy7;
|
||||
uint32_t rsvd6;
|
||||
} tiu_reg_t;
|
||||
|
||||
/*
 * Unpack a raw 28-word TIU command descriptor p[0..27] into the
 * expanded one-field-per-bit-field view *r.
 *
 * Generated code: each assignment extracts one field at a fixed
 * (word, shift, width) position.  Fields that straddle a 32-bit word
 * boundary (res0_h, opd0_addr, opd1_n, opd1_w) are assembled from two
 * words with a second "|=" step.  Do not hand-edit bit positions.
 */
static inline void parse_tiu_reg(tiu_reg_t *r, const uint32_t *p)
{
  /* word 0: command control, task type, shift options */
  r->cmd_en = p[0] & 1;
  r->cmd_end = (p[0] >> 1) & 1;
  r->cmd_id_en = (p[0] >> 2) & 1;
  r->cmd_keep = (p[0] >> 3) & 1;
  r->cmd_intr_en = (p[0] >> 4) & 1;
  r->tsk_typ = (p[0] >> 5) & ((1u << 4) - 1);
  r->tsk_eu_typ = (p[0] >> 9) & ((1u << 5) - 1);
  r->tsk_opd_num = (p[0] >> 14) & ((1u << 2) - 1);
  r->opt_res_shift = (p[0] >> 16) & ((1u << 6) - 1);
  r->opt_left_shift = (p[0] >> 22) & ((1u << 5) - 1);
  r->opt_shift_typ = (p[0] >> 27) & 1;
  r->opt_rshift_typ = (p[0] >> 28) & 1;
  r->dummy1 = (p[0] >> 29) & 1;
  r->opd_typ = (p[0] >> 30) & 1;
  r->opt_chl_quan = (p[0] >> 31) & 1;
  /* word 1: TIU / GDMA command ids */
  r->cmd_id_tpu = p[1] & ((1u << 16) - 1);
  r->cmd_id_gdma = (p[1] >> 16) & ((1u << 16) - 1);
  /* word 2: full-width quantization multiplier */
  r->quan_m = p[2];
  /* word 3: sign/segment/relu/rounding options */
  r->opt_res0_sign = p[3] & 1;
  r->opt_opd0_sign = (p[3] >> 1) & 1;
  r->opt_opd1_sign = (p[3] >> 2) & 1;
  r->opt_opd2_sign = (p[3] >> 3) & 1;
  r->opt_res0_seg = (p[3] >> 4) & ((1u << 2) - 1);
  r->opt_opd0_seg = (p[3] >> 6) & ((1u << 2) - 1);
  r->opt_opd1_seg = (p[3] >> 8) & ((1u << 2) - 1);
  r->opt_opd2_seg = (p[3] >> 10) & 1;
  r->ps32_md = (p[3] >> 11) & ((1u << 2) - 1);
  r->double_conv = (p[3] >> 13) & 1;
  r->opt_left_tran = (p[3] >> 14) & 1;
  r->fp_round_typ = (p[3] >> 15) & 1;
  r->opt_relu_typ = (p[3] >> 16) & ((1u << 2) - 1);
  r->opt_relu_value = (p[3] >> 18) & ((1u << 8) - 1);
  r->cmd_pre_exe_typ = (p[3] >> 26) & 1;
  r->opt_res_add = (p[3] >> 27) & 1;
  r->rsvd0 = (p[3] >> 28) & ((1u << 4) - 1);
  /* word 4: convolution insertion (dilation) parameters */
  r->conv_opd0_x_ins0 = p[4] & ((1u << 4) - 1);
  r->conv_opd0_y_ins0 = (p[4] >> 4) & ((1u << 4) - 1);
  r->conv_opd0_x_ins0_last = (p[4] >> 8) & ((1u << 4) - 1);
  r->conv_opd0_y_ins0_last = (p[4] >> 12) & ((1u << 4) - 1);
  r->conv_opd1_x_ins0 = (p[4] >> 16) & ((1u << 4) - 1);
  r->conv_opd1_y_ins0 = (p[4] >> 20) & ((1u << 4) - 1);
  r->dummy0 = (p[4] >> 24) & ((1u << 8) - 1);
  /* word 5: insertion value and padding */
  r->opd0_ins_val = p[5] & ((1u << 16) - 1);
  r->conv_opd0_up_pad = (p[5] >> 16) & ((1u << 4) - 1);
  r->conv_opd0_dn_pad = (p[5] >> 20) & ((1u << 4) - 1);
  r->conv_opd0_lf_pad = (p[5] >> 24) & ((1u << 4) - 1);
  r->conv_opd0_rt_pad = (p[5] >> 28) & ((1u << 4) - 1);
  /* words 6-7: result shape and conv strides; res0_h spans both words */
  r->res0_n = p[6] & ((1u << 12) - 1);
  r->res0_c = (p[6] >> 12) & ((1u << 12) - 1);
  r->res0_h = (p[6] >> 24) & ((1u << 8) - 1);
  r->res0_h |= (uint64_t)(p[7] & ((1u << 4) - 1)) << 8;
  r->res0_w = (p[7] >> 4) & ((1u << 12) - 1);
  r->conv_op_x_str = (p[7] >> 16) & ((1u << 5) - 1);
  r->conv_op_y_str = (p[7] >> 21) & ((1u << 5) - 1);
  r->cmd_pre_exe = (p[7] >> 26) & ((1u << 2) - 1);
  r->rsvd1 = (p[7] >> 28) & ((1u << 4) - 1);
  /* words 8-10: local-memory addresses; opd0_addr spans words 8 and 9 */
  r->res0_addr = p[8] & ((1u << 24) - 1);
  r->opd0_addr = (p[8] >> 24) & ((1u << 8) - 1);
  r->opd0_addr |= (uint64_t)(p[9] & ((1u << 16) - 1)) << 8;
  r->opd1_addr = (p[9] >> 16) & ((1u << 16) - 1);
  r->opd2_addr = p[10] & ((1u << 16) - 1);
  r->opt_opd0_const = (p[10] >> 16) & 1;
  r->opt_opd1_const = (p[10] >> 17) & 1;
  r->opt_opd2_const = (p[10] >> 18) & 1;
  r->short_nchwstr_same = (p[10] >> 19) & 1;
  r->short_res0_str = (p[10] >> 20) & ((1u << 2) - 1);
  r->short_opd0_str = (p[10] >> 22) & ((1u << 2) - 1);
  r->short_opd1_str = (p[10] >> 24) & ((1u << 2) - 1);
  r->short_opd2_str = (p[10] >> 26) & ((1u << 2) - 1);
  r->dummy2 = (p[10] >> 28) & ((1u << 4) - 1);
  /* words 11-15: operand shapes; opd1_n and opd1_w straddle words */
  r->opd0_n = p[11] & ((1u << 12) - 1);
  r->opd0_c = (p[11] >> 12) & ((1u << 12) - 1);
  r->dummy3 = (p[11] >> 24) & ((1u << 4) - 1);
  r->rsvd2 = (p[11] >> 28) & ((1u << 4) - 1);
  r->opd0_h = p[12] & ((1u << 12) - 1);
  r->opd0_w = (p[12] >> 12) & ((1u << 12) - 1);
  r->opd1_n = (p[12] >> 24) & ((1u << 8) - 1);
  r->opd1_n |= (uint64_t)(p[13] & ((1u << 4) - 1)) << 8;
  r->opd1_c = (p[13] >> 4) & ((1u << 12) - 1);
  r->opd1_h = (p[13] >> 16) & ((1u << 12) - 1);
  r->opd1_w = (p[13] >> 28) & ((1u << 4) - 1);
  r->opd1_w |= (uint64_t)(p[14] & ((1u << 8) - 1)) << 4;
  r->opd2_n = (p[14] >> 8) & ((1u << 12) - 1);
  r->opd2_c = (p[14] >> 20) & ((1u << 12) - 1);
  r->opd2_h = p[15] & ((1u << 12) - 1);
  r->opd2_w = (p[15] >> 12) & ((1u << 12) - 1);
  r->dummy4 = (p[15] >> 24) & ((1u << 4) - 1);
  r->rsvd3 = (p[15] >> 28) & ((1u << 4) - 1);
  /* words 16-27: layer info and per-dimension strides */
  r->layer_info = p[16] & ((1u << 16) - 1);
  r->res0_n_str = (p[16] >> 16) & ((1u << 16) - 1);
  r->res0_c_str = p[17] & ((1u << 16) - 1);
  r->res0_h_str = (p[17] >> 16) & ((1u << 16) - 1);
  r->res0_w_str = p[18] & ((1u << 16) - 1);
  r->res0_b_str = (p[18] >> 16) & ((1u << 16) - 1);
  r->opd0_n_str = p[19] & ((1u << 16) - 1);
  r->dummy5 = (p[19] >> 16) & ((1u << 12) - 1);
  r->rsvd4 = (p[19] >> 28) & ((1u << 4) - 1);
  r->opd0_c_str = p[20] & ((1u << 16) - 1);
  r->opd0_h_str = (p[20] >> 16) & ((1u << 16) - 1);
  r->opd0_w_str = p[21] & ((1u << 16) - 1);
  r->opd0_b_str = (p[21] >> 16) & ((1u << 16) - 1);
  r->opd1_n_str = p[22] & ((1u << 16) - 1);
  r->opd1_c_str = (p[22] >> 16) & ((1u << 16) - 1);
  r->opd1_h_str = p[23] & ((1u << 16) - 1);
  r->dummy6 = (p[23] >> 16) & ((1u << 12) - 1);
  r->rsvd5 = (p[23] >> 28) & ((1u << 4) - 1);
  r->opd1_w_str = p[24] & ((1u << 16) - 1);
  r->opd1_b_str = (p[24] >> 16) & ((1u << 16) - 1);
  r->opd2_n_str = p[25] & ((1u << 16) - 1);
  r->opd2_c_str = (p[25] >> 16) & ((1u << 16) - 1);
  r->opd2_h_str = p[26] & ((1u << 16) - 1);
  r->opd2_w_str = (p[26] >> 16) & ((1u << 16) - 1);
  r->opd2_b_str = p[27] & ((1u << 16) - 1);
  r->dummy7 = (p[27] >> 16) & ((1u << 12) - 1);
  r->rsvd6 = (p[27] >> 28) & ((1u << 4) - 1);
}
|
||||
|
||||
/*
 * Pack the expanded view *r back into a raw 28-word TIU descriptor.
 * Exact inverse of parse_tiu_reg(): same (word, shift, width) layout.
 *
 * Words are stored from p[27] down to p[0] through a volatile pointer,
 * so the compiler performs every store in order.
 * NOTE(review): `typeof` is a GNU C extension; presumably the target
 * buffer may be a memory-mapped command queue -- confirm.
 */
static inline void emit_tiu_reg(const tiu_reg_t *r, uint32_t *_p)
{
  volatile uint32_t *p = (typeof(p))_p;
  p[27] = (r->opd2_b_str & ((1u << 16) - 1)) |
          ((r->dummy7 & ((1u << 12) - 1)) << 16) |
          ((r->rsvd6 & ((1u << 4) - 1)) << 28);
  p[26] = (r->opd2_h_str & ((1u << 16) - 1)) |
          ((r->opd2_w_str & ((1u << 16) - 1)) << 16);
  p[25] = (r->opd2_n_str & ((1u << 16) - 1)) |
          ((r->opd2_c_str & ((1u << 16) - 1)) << 16);
  p[24] = (r->opd1_w_str & ((1u << 16) - 1)) |
          ((r->opd1_b_str & ((1u << 16) - 1)) << 16);
  p[23] = (r->opd1_h_str & ((1u << 16) - 1)) |
          ((r->dummy6 & ((1u << 12) - 1)) << 16) |
          ((r->rsvd5 & ((1u << 4) - 1)) << 28);
  p[22] = (r->opd1_n_str & ((1u << 16) - 1)) |
          ((r->opd1_c_str & ((1u << 16) - 1)) << 16);
  p[21] = (r->opd0_w_str & ((1u << 16) - 1)) |
          ((r->opd0_b_str & ((1u << 16) - 1)) << 16);
  p[20] = (r->opd0_c_str & ((1u << 16) - 1)) |
          ((r->opd0_h_str & ((1u << 16) - 1)) << 16);
  p[19] = (r->opd0_n_str & ((1u << 16) - 1)) |
          ((r->dummy5 & ((1u << 12) - 1)) << 16) |
          ((r->rsvd4 & ((1u << 4) - 1)) << 28);
  p[18] = (r->res0_w_str & ((1u << 16) - 1)) |
          ((r->res0_b_str & ((1u << 16) - 1)) << 16);
  p[17] = (r->res0_c_str & ((1u << 16) - 1)) |
          ((r->res0_h_str & ((1u << 16) - 1)) << 16);
  p[16] = (r->layer_info & ((1u << 16) - 1)) |
          ((r->res0_n_str & ((1u << 16) - 1)) << 16);
  p[15] = (r->opd2_h & ((1u << 12) - 1)) |
          ((r->opd2_w & ((1u << 12) - 1)) << 12) |
          ((r->dummy4 & ((1u << 4) - 1)) << 24) |
          ((r->rsvd3 & ((1u << 4) - 1)) << 28);
  /* opd1_w and opd1_n straddle word boundaries: their high parts land
   * in words 14 and 13, the low parts in words 13 and 12. */
  p[14] = ((r->opd1_w >> 4) & ((1u << 8) - 1)) |
          ((r->opd2_n & ((1u << 12) - 1)) << 8) |
          ((r->opd2_c & ((1u << 12) - 1)) << 20);
  p[13] = ((r->opd1_n >> 8) & ((1u << 4) - 1)) |
          ((r->opd1_c & ((1u << 12) - 1)) << 4) |
          ((r->opd1_h & ((1u << 12) - 1)) << 16) |
          ((r->opd1_w & ((1u << 4) - 1)) << 28);
  p[12] = (r->opd0_h & ((1u << 12) - 1)) |
          ((r->opd0_w & ((1u << 12) - 1)) << 12) |
          ((r->opd1_n & ((1u << 8) - 1)) << 24);
  p[11] = (r->opd0_n & ((1u << 12) - 1)) |
          ((r->opd0_c & ((1u << 12) - 1)) << 12) |
          ((r->dummy3 & ((1u << 4) - 1)) << 24) |
          ((r->rsvd2 & ((1u << 4) - 1)) << 28);
  p[10] = (r->opd2_addr & ((1u << 16) - 1)) |
          ((r->opt_opd0_const & 1) << 16) |
          ((r->opt_opd1_const & 1) << 17) |
          ((r->opt_opd2_const & 1) << 18) |
          ((r->short_nchwstr_same & 1) << 19) |
          ((r->short_res0_str & ((1u << 2) - 1)) << 20) |
          ((r->short_opd0_str & ((1u << 2) - 1)) << 22) |
          ((r->short_opd1_str & ((1u << 2) - 1)) << 24) |
          ((r->short_opd2_str & ((1u << 2) - 1)) << 26) |
          ((r->dummy2 & ((1u << 4) - 1)) << 28);
  /* opd0_addr straddles words 8/9; res0_h straddles words 6/7 */
  p[9] = ((r->opd0_addr >> 8) & ((1u << 16) - 1)) |
         ((r->opd1_addr & ((1u << 16) - 1)) << 16);
  p[8] = (r->res0_addr & ((1u << 24) - 1)) |
         ((r->opd0_addr & ((1u << 8) - 1)) << 24);
  p[7] = ((r->res0_h >> 8) & ((1u << 4) - 1)) |
         ((r->res0_w & ((1u << 12) - 1)) << 4) |
         ((r->conv_op_x_str & ((1u << 5) - 1)) << 16) |
         ((r->conv_op_y_str & ((1u << 5) - 1)) << 21) |
         ((r->cmd_pre_exe & ((1u << 2) - 1)) << 26) |
         ((r->rsvd1 & ((1u << 4) - 1)) << 28);
  p[6] = (r->res0_n & ((1u << 12) - 1)) |
         ((r->res0_c & ((1u << 12) - 1)) << 12) |
         ((r->res0_h & ((1u << 8) - 1)) << 24);
  p[5] = (r->opd0_ins_val & ((1u << 16) - 1)) |
         ((r->conv_opd0_up_pad & ((1u << 4) - 1)) << 16) |
         ((r->conv_opd0_dn_pad & ((1u << 4) - 1)) << 20) |
         ((r->conv_opd0_lf_pad & ((1u << 4) - 1)) << 24) |
         ((r->conv_opd0_rt_pad & ((1u << 4) - 1)) << 28);
  p[4] = (r->conv_opd0_x_ins0 & ((1u << 4) - 1)) |
         ((r->conv_opd0_y_ins0 & ((1u << 4) - 1)) << 4) |
         ((r->conv_opd0_x_ins0_last & ((1u << 4) - 1)) << 8) |
         ((r->conv_opd0_y_ins0_last & ((1u << 4) - 1)) << 12) |
         ((r->conv_opd1_x_ins0 & ((1u << 4) - 1)) << 16) |
         ((r->conv_opd1_y_ins0 & ((1u << 4) - 1)) << 20) |
         ((r->dummy0 & ((1u << 8) - 1)) << 24);
  p[3] = (r->opt_res0_sign & 1) |
         ((r->opt_opd0_sign & 1) << 1) |
         ((r->opt_opd1_sign & 1) << 2) |
         ((r->opt_opd2_sign & 1) << 3) |
         ((r->opt_res0_seg & ((1u << 2) - 1)) << 4) |
         ((r->opt_opd0_seg & ((1u << 2) - 1)) << 6) |
         ((r->opt_opd1_seg & ((1u << 2) - 1)) << 8) |
         ((r->opt_opd2_seg & 1) << 10) |
         ((r->ps32_md & ((1u << 2) - 1)) << 11) |
         ((r->double_conv & 1) << 13) |
         ((r->opt_left_tran & 1) << 14) |
         ((r->fp_round_typ & 1) << 15) |
         ((r->opt_relu_typ & ((1u << 2) - 1)) << 16) |
         ((r->opt_relu_value & ((1u << 8) - 1)) << 18) |
         ((r->cmd_pre_exe_typ & 1) << 26) |
         ((r->opt_res_add & 1) << 27) |
         ((r->rsvd0 & ((1u << 4) - 1)) << 28);
  p[2] = (r->quan_m & (((uint64_t)1 << 32) - 1));
  p[1] = (r->cmd_id_tpu & ((1u << 16) - 1)) |
         ((r->cmd_id_gdma & ((1u << 16) - 1)) << 16);
  p[0] = (r->cmd_en & 1) |
         ((r->cmd_end & 1) << 1) |
         ((r->cmd_id_en & 1) << 2) |
         ((r->cmd_keep & 1) << 3) |
         ((r->cmd_intr_en & 1) << 4) |
         ((r->tsk_typ & ((1u << 4) - 1)) << 5) |
         ((r->tsk_eu_typ & ((1u << 5) - 1)) << 9) |
         ((r->tsk_opd_num & ((1u << 2) - 1)) << 14) |
         ((r->opt_res_shift & ((1u << 6) - 1)) << 16) |
         ((r->opt_left_shift & ((1u << 5) - 1)) << 22) |
         ((r->opt_shift_typ & 1) << 27) |
         ((r->opt_rshift_typ & 1) << 28) |
         ((r->dummy1 & 1) << 29) |
         ((r->opd_typ & 1) << 30) |
         ((r->opt_chl_quan & 1) << 31);
}
|
||||
|
||||
/*
 * Initialize *r with the generated default values for a TIU
 * descriptor.  Most fields are zero; shapes default to n=1, c=1, h=1,
 * w=0x10, and the reserved fields follow the fixed pattern
 * rsvdN == N.  These values come from the descriptor generator --
 * do not change them by hand.
 */
static inline void reset_tiu_reg(tiu_reg_t *r)
{
  r->cmd_en = 0x0;
  r->cmd_end = 0x0;
  r->cmd_id_en = 0x0;
  r->cmd_keep = 0x0;
  r->cmd_intr_en = 0x0;
  r->tsk_typ = 0x0;
  r->tsk_eu_typ = 0x0;
  r->tsk_opd_num = 0x3;
  r->opt_res_shift = 0xa;
  r->opt_left_shift = 0x2;
  r->opt_shift_typ = 0x1;
  r->opt_rshift_typ = 0x1;
  r->dummy1 = 0x0;
  r->opd_typ = 0x0;
  r->opt_chl_quan = 0x0;
  r->cmd_id_tpu = 0x0;
  r->cmd_id_gdma = 0x0;
  r->quan_m = 0x0;
  r->opt_res0_sign = 0x0;
  r->opt_opd0_sign = 0x0;
  r->opt_opd1_sign = 0x1;
  r->opt_opd2_sign = 0x1;
  r->opt_res0_seg = 0x1;
  r->opt_opd0_seg = 0x1;
  r->opt_opd1_seg = 0x1;
  r->opt_opd2_seg = 0x0;
  r->ps32_md = 0x0;
  r->double_conv = 0x0;
  r->opt_left_tran = 0x0;
  r->fp_round_typ = 0x0;
  r->opt_relu_typ = 0x0;
  r->opt_relu_value = 0x0;
  r->cmd_pre_exe_typ = 0x0;
  r->opt_res_add = 0x0;
  r->rsvd0 = 0x0;
  r->conv_opd0_x_ins0 = 0x0;
  r->conv_opd0_y_ins0 = 0x0;
  r->conv_opd0_x_ins0_last = 0x0;
  r->conv_opd0_y_ins0_last = 0x0;
  r->conv_opd1_x_ins0 = 0x0;
  r->conv_opd1_y_ins0 = 0x0;
  r->dummy0 = 0x0;
  r->opd0_ins_val = 0x0;
  r->conv_opd0_up_pad = 0x0;
  r->conv_opd0_dn_pad = 0x0;
  r->conv_opd0_lf_pad = 0x0;
  r->conv_opd0_rt_pad = 0x0;
  /* default result/operand shape: 1 x 1 x 1 x 16 */
  r->res0_n = 0x1;
  r->res0_c = 0x1;
  r->res0_h = 0x1;
  r->res0_w = 0x10;
  r->conv_op_x_str = 0x1;
  r->conv_op_y_str = 0x1;
  r->cmd_pre_exe = 0x0;
  r->rsvd1 = 0x1;
  r->res0_addr = 0x0;
  r->opd0_addr = 0x0;
  r->opd1_addr = 0x0;
  r->opd2_addr = 0x0;
  r->opt_opd0_const = 0x0;
  r->opt_opd1_const = 0x0;
  r->opt_opd2_const = 0x0;
  r->short_nchwstr_same = 0x0;
  r->short_res0_str = 0x0;
  r->short_opd0_str = 0x0;
  r->short_opd1_str = 0x0;
  r->short_opd2_str = 0x0;
  r->dummy2 = 0x0;
  r->opd0_n = 0x1;
  r->opd0_c = 0x1;
  r->dummy3 = 0x0;
  r->rsvd2 = 0x2;
  r->opd0_h = 0x1;
  r->opd0_w = 0x10;
  r->opd1_n = 0x1;
  r->opd1_c = 0x1;
  r->opd1_h = 0x1;
  r->opd1_w = 0x10;
  r->opd2_n = 0x1;
  r->opd2_c = 0x1;
  r->opd2_h = 0x1;
  r->opd2_w = 0x10;
  r->dummy4 = 0x0;
  r->rsvd3 = 0x3;
  r->layer_info = 0x0;
  r->res0_n_str = 0x10;
  r->res0_c_str = 0x10;
  r->res0_h_str = 0x0;
  r->res0_w_str = 0x1;
  r->res0_b_str = 0x10;
  r->opd0_n_str = 0x10;
  r->dummy5 = 0x0;
  r->rsvd4 = 0x4;
  r->opd0_c_str = 0x10;
  r->opd0_h_str = 0x0;
  r->opd0_w_str = 0x1;
  r->opd0_b_str = 0x10;
  r->opd1_n_str = 0x10;
  r->opd1_c_str = 0x10;
  r->opd1_h_str = 0x0;
  r->dummy6 = 0x0;
  r->rsvd5 = 0x5;
  r->opd1_w_str = 0x1;
  r->opd1_b_str = 0x10;
  r->opd2_n_str = 0x10;
  r->opd2_c_str = 0x10;
  r->opd2_h_str = 0x0;
  r->opd2_w_str = 0x1;
  r->opd2_b_str = 0x10;
  r->dummy7 = 0x0;
  r->rsvd6 = 0x6;
}
|
||||
|
||||
/*
 * Debug helper: print every field of *r to stdout as "name: 0xVAL",
 * preceded by a "--- tag ---" banner.  Field order matches the
 * struct declaration order.
 */
static inline void trace_tiu_reg(tiu_reg_t *r, const char *tag)
{
/* Print one field; the ullong cast keeps %llx portable. */
#define trace_one_reg(name) \
  printf("  %s: 0x%llx\n", #name, (ullong)r->name)

  printf("--- %s ---\n", tag);
  trace_one_reg(cmd_en);
  trace_one_reg(cmd_end);
  trace_one_reg(cmd_id_en);
  trace_one_reg(cmd_keep);
  trace_one_reg(cmd_intr_en);
  trace_one_reg(tsk_typ);
  trace_one_reg(tsk_eu_typ);
  trace_one_reg(tsk_opd_num);
  trace_one_reg(opt_res_shift);
  trace_one_reg(opt_left_shift);
  trace_one_reg(opt_shift_typ);
  trace_one_reg(opt_rshift_typ);
  trace_one_reg(dummy1);
  trace_one_reg(opd_typ);
  trace_one_reg(opt_chl_quan);
  trace_one_reg(cmd_id_tpu);
  trace_one_reg(cmd_id_gdma);
  trace_one_reg(quan_m);
  trace_one_reg(opt_res0_sign);
  trace_one_reg(opt_opd0_sign);
  trace_one_reg(opt_opd1_sign);
  trace_one_reg(opt_opd2_sign);
  trace_one_reg(opt_res0_seg);
  trace_one_reg(opt_opd0_seg);
  trace_one_reg(opt_opd1_seg);
  trace_one_reg(opt_opd2_seg);
  trace_one_reg(ps32_md);
  trace_one_reg(double_conv);
  trace_one_reg(opt_left_tran);
  trace_one_reg(fp_round_typ);
  trace_one_reg(opt_relu_typ);
  trace_one_reg(opt_relu_value);
  trace_one_reg(cmd_pre_exe_typ);
  trace_one_reg(opt_res_add);
  trace_one_reg(rsvd0);
  trace_one_reg(conv_opd0_x_ins0);
  trace_one_reg(conv_opd0_y_ins0);
  trace_one_reg(conv_opd0_x_ins0_last);
  trace_one_reg(conv_opd0_y_ins0_last);
  trace_one_reg(conv_opd1_x_ins0);
  trace_one_reg(conv_opd1_y_ins0);
  trace_one_reg(dummy0);
  trace_one_reg(opd0_ins_val);
  trace_one_reg(conv_opd0_up_pad);
  trace_one_reg(conv_opd0_dn_pad);
  trace_one_reg(conv_opd0_lf_pad);
  trace_one_reg(conv_opd0_rt_pad);
  trace_one_reg(res0_n);
  trace_one_reg(res0_c);
  trace_one_reg(res0_h);
  trace_one_reg(res0_w);
  trace_one_reg(conv_op_x_str);
  trace_one_reg(conv_op_y_str);
  trace_one_reg(cmd_pre_exe);
  trace_one_reg(rsvd1);
  trace_one_reg(res0_addr);
  trace_one_reg(opd0_addr);
  trace_one_reg(opd1_addr);
  trace_one_reg(opd2_addr);
  trace_one_reg(opt_opd0_const);
  trace_one_reg(opt_opd1_const);
  trace_one_reg(opt_opd2_const);
  trace_one_reg(short_nchwstr_same);
  trace_one_reg(short_res0_str);
  trace_one_reg(short_opd0_str);
  trace_one_reg(short_opd1_str);
  trace_one_reg(short_opd2_str);
  trace_one_reg(dummy2);
  trace_one_reg(opd0_n);
  trace_one_reg(opd0_c);
  trace_one_reg(dummy3);
  trace_one_reg(rsvd2);
  trace_one_reg(opd0_h);
  trace_one_reg(opd0_w);
  trace_one_reg(opd1_n);
  trace_one_reg(opd1_c);
  trace_one_reg(opd1_h);
  trace_one_reg(opd1_w);
  trace_one_reg(opd2_n);
  trace_one_reg(opd2_c);
  trace_one_reg(opd2_h);
  trace_one_reg(opd2_w);
  trace_one_reg(dummy4);
  trace_one_reg(rsvd3);
  trace_one_reg(layer_info);
  trace_one_reg(res0_n_str);
  trace_one_reg(res0_c_str);
  trace_one_reg(res0_h_str);
  trace_one_reg(res0_w_str);
  trace_one_reg(res0_b_str);
  trace_one_reg(opd0_n_str);
  trace_one_reg(dummy5);
  trace_one_reg(rsvd4);
  trace_one_reg(opd0_c_str);
  trace_one_reg(opd0_h_str);
  trace_one_reg(opd0_w_str);
  trace_one_reg(opd0_b_str);
  trace_one_reg(opd1_n_str);
  trace_one_reg(opd1_c_str);
  trace_one_reg(opd1_h_str);
  trace_one_reg(dummy6);
  trace_one_reg(rsvd5);
  trace_one_reg(opd1_w_str);
  trace_one_reg(opd1_b_str);
  trace_one_reg(opd2_n_str);
  trace_one_reg(opd2_c_str);
  trace_one_reg(opd2_h_str);
  trace_one_reg(opd2_w_str);
  trace_one_reg(opd2_b_str);
  trace_one_reg(dummy7);
  trace_one_reg(rsvd6);
}
|
||||
#endif /* CV180X_TIU_REG_H */
|
||||
38
cvikernel/include/cvikernel/cv180x/cv180x_tpu_cfg.h
Normal file
38
cvikernel/include/cvikernel/cv180x/cv180x_tpu_cfg.h
Normal file
@ -0,0 +1,38 @@
|
||||
#ifndef __CV180X_TPU_CFG__
#define __CV180X_TPU_CFG__

/* CV180X TPU hardware configuration.
 * The *_NUM / *_SIZE values are derived from the *_SHIFT constants
 * below; the #error check guards their mutual consistency. */
#define CV180X_VER 182203
#define CV180X_HW_NPU_SHIFT 1
#define CV180X_HW_EU_SHIFT 4
#define CV180X_HW_LMEM_SHIFT 15
#define CV180X_HW_LMEM_BANKS 8
#define CV180X_HW_LMEM_BANK_SIZE 0x1000
#define CV180X_HW_NODE_CHIP_SHIFT 0
#define CV180X_HW_NPU_NUM (1 << CV180X_HW_NPU_SHIFT)
#define CV180X_HW_EU_NUM (1 << CV180X_HW_EU_SHIFT)
#define CV180X_HW_LMEM_SIZE (1 << CV180X_HW_LMEM_SHIFT)
#define CV180X_HW_LMEM_START_ADDR 0x0C000000
#define CV180X_HW_NODE_CHIP_NUM (1 << CV180X_HW_NODE_CHIP_SHIFT)

/* Local memory size must equal banks x bank size. */
#if (CV180X_HW_LMEM_SIZE != (CV180X_HW_LMEM_BANK_SIZE * CV180X_HW_LMEM_BANKS))
#error "Set wrong TPU configuration."
#endif

#define CV180X_GLOBAL_MEM_START_ADDR 0x0
#define CV180X_GLOBAL_MEM_SIZE 0x100000000 // 4 GiB

/* Global-memory layout: TIU cmdbuf, then TDMA cmdbuf, then the pool. */
#define CV180X_GLOBAL_TIU_CMDBUF_ADDR 0x00000000
#define CV180X_GLOBAL_TDMA_CMDBUF_ADDR 0x00800000
#define CV180X_GLOBAL_TIU_CMDBUF_RESERVED_SIZE 0x00800000 // 8MB
#define CV180X_GLOBAL_TDMA_CMDBUF_RESERVED_SIZE 0x00800000 // 8MB
#define CV180X_GLOBAL_POOL_RESERVED_SIZE (CV180X_GLOBAL_MEM_SIZE - CV180X_GLOBAL_TIU_CMDBUF_RESERVED_SIZE - CV180X_GLOBAL_TDMA_CMDBUF_RESERVED_SIZE)

#define CV180X_UART_CTLR_BASE_ADDR 0x04140000

/* Engine MMIO windows (4 KiB each). */
#define CV180X_TDMA_ENGINE_BASE_ADDR 0x0C100000
#define CV180X_TDMA_ENGINE_END_ADDR (CV180X_TDMA_ENGINE_BASE_ADDR + 0x1000)

#define CV180X_TIU_ENGINE_BASE_ADDR 0x0C101000 //"NPS Register" in memory map?
#define CV180X_TIU_ENGINE_END_ADDR (CV180X_TIU_ENGINE_BASE_ADDR + 0x1000)

#endif
|
||||
310
cvikernel/include/cvikernel/cv181x/cv181x_tdma_reg.h
Normal file
310
cvikernel/include/cvikernel/cv181x/cv181x_tdma_reg.h
Normal file
@ -0,0 +1,310 @@
|
||||
#ifndef CV181X_TDMA_REG_H
|
||||
#define CV181X_TDMA_REG_H
|
||||
|
||||
/*
|
||||
* This file is generated by tools. Do not edit it manually.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
/* One TDMA descriptor is 16 32-bit words (0x40 bytes). */
#define TDMA_DESC_REG_BYTES (0x40)
#define TDMA_ENGINE_DESCRIPTOR_NUM (TDMA_DESC_REG_BYTES >> 2)
/* Base-address registers selectable via src/dst_base_reg_sel (3 bits). */
#define TDMA_NUM_BASE_REGS (0x8)

/* Shorthand for the %llx casts in the trace helper. */
typedef unsigned long long ullong;
|
||||
|
||||
/*
 * Expanded (one member per bit-field) view of a TDMA transfer
 * descriptor.  parse_tdma_reg()/emit_tdma_reg() convert between this
 * struct and the packed 16-word hardware layout; the word groupings
 * below follow that layout.
 */
typedef struct {
  /* word 0: control flags and command id */
  uint32_t vld;
  uint32_t compress_en;
  uint32_t eod;
  uint32_t intp_en;
  uint32_t bar_en;
  uint32_t check_bf16_value;
  uint32_t trans_dir;
  uint32_t rsv00;
  uint32_t trans_fmt;
  uint32_t transpose_md;
  uint32_t rsv01;
  uint32_t intra_cmd_paral;
  uint32_t outstanding_en;
  uint32_t cmd_id;
  /* word 1: formats and TPU wait id */
  uint32_t spec_func;
  uint32_t dst_fmt;
  uint32_t src_fmt;
  uint32_t cmprs_fmt;
  uint32_t sys_dtype;
  uint32_t rsv2_1;
  uint32_t int8_sign;
  uint32_t compress_zero_guard;
  uint32_t int8_rnd_mode;
  uint32_t wait_id_tpu;
  /* word 2: other-engine wait ids */
  uint32_t wait_id_other_tdma;
  uint32_t wait_id_sdma;
  /* word 3: constant fill value, base-register selects, LUT controls */
  uint32_t const_val;
  uint32_t src_base_reg_sel;
  uint32_t mv_lut_idx;
  uint32_t dst_base_reg_sel;
  uint32_t mv_lut_base;
  uint32_t rsv4_5;
  /* words 4-7: destination and source strides
   * (n-strides occupy a full 32-bit word each) */
  uint32_t dst_h_stride;
  uint32_t dst_c_stride_low;
  uint32_t dst_n_stride;
  uint32_t src_h_stride;
  uint32_t src_c_stride_low;
  uint32_t src_n_stride;
  /* words 8-10: shapes */
  uint32_t dst_c;
  uint32_t src_c;
  uint32_t dst_w;
  uint32_t dst_h;
  uint32_t src_w;
  uint32_t src_h;
  /* words 11-13: base addresses (low 32 bits plus 8-bit high parts) */
  uint32_t dst_base_addr_low;
  uint32_t src_base_addr_low;
  uint32_t src_n;
  uint32_t dst_base_addr_high;
  uint32_t src_base_addr_high;
  /* word 14: high halves of the channel strides */
  uint32_t src_c_stride_high;
  uint32_t dst_c_stride_high;
  /* word 15: compression biases and layer id */
  uint32_t compress_bias0;
  uint32_t compress_bias1;
  uint32_t layer_ID;
} tdma_reg_t;
|
||||
|
||||
/*
 * Unpack a raw 16-word TDMA descriptor p[0..15] into the expanded
 * view *r.  Generated code: each assignment extracts one field at a
 * fixed (word, shift, width) position.  Do not hand-edit bit layout.
 */
static inline void parse_tdma_reg(tdma_reg_t *r, const uint32_t *p)
{
  /* word 0: control flags and command id */
  r->vld = p[0] & 1;
  r->compress_en = (p[0] >> 1) & 1;
  r->eod = (p[0] >> 2) & 1;
  r->intp_en = (p[0] >> 3) & 1;
  r->bar_en = (p[0] >> 4) & 1;
  r->check_bf16_value = (p[0] >> 5) & 1;
  r->trans_dir = (p[0] >> 6) & ((1u << 2) - 1);
  r->rsv00 = (p[0] >> 8) & ((1u << 2) - 1);
  r->trans_fmt = (p[0] >> 10) & 1;
  r->transpose_md = (p[0] >> 11) & ((1u << 2) - 1);
  r->rsv01 = (p[0] >> 13) & 1;
  r->intra_cmd_paral = (p[0] >> 14) & 1;
  r->outstanding_en = (p[0] >> 15) & 1;
  r->cmd_id = (p[0] >> 16) & ((1u << 16) - 1);
  /* word 1: formats and TPU wait id */
  r->spec_func = p[1] & ((1u << 3) - 1);
  r->dst_fmt = (p[1] >> 3) & ((1u << 2) - 1);
  r->src_fmt = (p[1] >> 5) & ((1u << 2) - 1);
  r->cmprs_fmt = (p[1] >> 7) & 1;
  r->sys_dtype = (p[1] >> 8) & 1;
  r->rsv2_1 = (p[1] >> 9) & ((1u << 4) - 1);
  r->int8_sign = (p[1] >> 13) & 1;
  r->compress_zero_guard = (p[1] >> 14) & 1;
  r->int8_rnd_mode = (p[1] >> 15) & 1;
  r->wait_id_tpu = (p[1] >> 16) & ((1u << 16) - 1);
  /* word 2: wait ids of the other DMA engines */
  r->wait_id_other_tdma = p[2] & ((1u << 16) - 1);
  r->wait_id_sdma = (p[2] >> 16) & ((1u << 16) - 1);
  /* word 3: constant value, base-register selects, LUT controls */
  r->const_val = p[3] & ((1u << 16) - 1);
  r->src_base_reg_sel = (p[3] >> 16) & ((1u << 3) - 1);
  r->mv_lut_idx = (p[3] >> 19) & 1;
  r->dst_base_reg_sel = (p[3] >> 20) & ((1u << 3) - 1);
  r->mv_lut_base = (p[3] >> 23) & 1;
  r->rsv4_5 = (p[3] >> 24) & ((1u << 8) - 1);
  /* words 4-7: strides (n-strides take a full word) */
  r->dst_h_stride = p[4] & ((1u << 16) - 1);
  r->dst_c_stride_low = (p[4] >> 16) & ((1u << 16) - 1);
  r->dst_n_stride = p[5];
  r->src_h_stride = p[6] & ((1u << 16) - 1);
  r->src_c_stride_low = (p[6] >> 16) & ((1u << 16) - 1);
  r->src_n_stride = p[7];
  /* words 8-10: shapes */
  r->dst_c = p[8] & ((1u << 16) - 1);
  r->src_c = (p[8] >> 16) & ((1u << 16) - 1);
  r->dst_w = p[9] & ((1u << 16) - 1);
  r->dst_h = (p[9] >> 16) & ((1u << 16) - 1);
  r->src_w = p[10] & ((1u << 16) - 1);
  r->src_h = (p[10] >> 16) & ((1u << 16) - 1);
  /* words 11-14: base addresses and stride high halves */
  r->dst_base_addr_low = p[11];
  r->src_base_addr_low = p[12];
  r->src_n = p[13] & ((1u << 16) - 1);
  r->dst_base_addr_high = (p[13] >> 16) & ((1u << 8) - 1);
  r->src_base_addr_high = (p[13] >> 24) & ((1u << 8) - 1);
  r->src_c_stride_high = p[14] & ((1u << 16) - 1);
  r->dst_c_stride_high = (p[14] >> 16) & ((1u << 16) - 1);
  /* word 15: compression biases and layer id */
  r->compress_bias0 = p[15] & ((1u << 8) - 1);
  r->compress_bias1 = (p[15] >> 8) & ((1u << 8) - 1);
  r->layer_ID = (p[15] >> 16) & ((1u << 16) - 1);
}
|
||||
|
||||
/*
 * Pack the expanded view *r back into a raw 16-word TDMA descriptor.
 * Exact inverse of parse_tdma_reg(); words are stored from p[15] down
 * to p[0] through a volatile pointer so every store is performed.
 * NOTE(review): `typeof` is a GNU C extension; presumably the target
 * may be a memory-mapped command buffer -- confirm.
 */
static inline void emit_tdma_reg(const tdma_reg_t *r, uint32_t *_p)
{
  volatile uint32_t *p = (typeof(p))_p;
  p[15] = (r->compress_bias0 & ((1u << 8) - 1)) |
          ((r->compress_bias1 & ((1u << 8) - 1)) << 8) |
          ((r->layer_ID & ((1u << 16) - 1)) << 16);
  p[14] = (r->src_c_stride_high & ((1u << 16) - 1)) |
          ((r->dst_c_stride_high & ((1u << 16) - 1)) << 16);
  p[13] = (r->src_n & ((1u << 16) - 1)) |
          ((r->dst_base_addr_high & ((1u << 8) - 1)) << 16) |
          ((r->src_base_addr_high & ((1u << 8) - 1)) << 24);
  p[12] = (r->src_base_addr_low & (((uint64_t)1 << 32) - 1));
  p[11] = (r->dst_base_addr_low & (((uint64_t)1 << 32) - 1));
  p[10] = (r->src_w & ((1u << 16) - 1)) |
          ((r->src_h & ((1u << 16) - 1)) << 16);
  p[9] = (r->dst_w & ((1u << 16) - 1)) |
         ((r->dst_h & ((1u << 16) - 1)) << 16);
  p[8] = (r->dst_c & ((1u << 16) - 1)) |
         ((r->src_c & ((1u << 16) - 1)) << 16);
  p[7] = (r->src_n_stride & (((uint64_t)1 << 32) - 1));
  p[6] = (r->src_h_stride & ((1u << 16) - 1)) |
         ((r->src_c_stride_low & ((1u << 16) - 1)) << 16);
  p[5] = (r->dst_n_stride & (((uint64_t)1 << 32) - 1));
  p[4] = (r->dst_h_stride & ((1u << 16) - 1)) |
         ((r->dst_c_stride_low & ((1u << 16) - 1)) << 16);
  p[3] = (r->const_val & ((1u << 16) - 1)) |
         ((r->src_base_reg_sel & ((1u << 3) - 1)) << 16) |
         ((r->mv_lut_idx & 1) << 19) |
         ((r->dst_base_reg_sel & ((1u << 3) - 1)) << 20) |
         ((r->mv_lut_base & 1) << 23) |
         ((r->rsv4_5 & ((1u << 8) - 1)) << 24);
  p[2] = (r->wait_id_other_tdma & ((1u << 16) - 1)) |
         ((r->wait_id_sdma & ((1u << 16) - 1)) << 16);
  p[1] = (r->spec_func & ((1u << 3) - 1)) |
         ((r->dst_fmt & ((1u << 2) - 1)) << 3) |
         ((r->src_fmt & ((1u << 2) - 1)) << 5) |
         ((r->cmprs_fmt & 1) << 7) |
         ((r->sys_dtype & 1) << 8) |
         ((r->rsv2_1 & ((1u << 4) - 1)) << 9) |
         ((r->int8_sign & 1) << 13) |
         ((r->compress_zero_guard & 1) << 14) |
         ((r->int8_rnd_mode & 1) << 15) |
         ((r->wait_id_tpu & ((1u << 16) - 1)) << 16);
  p[0] = (r->vld & 1) |
         ((r->compress_en & 1) << 1) |
         ((r->eod & 1) << 2) |
         ((r->intp_en & 1) << 3) |
         ((r->bar_en & 1) << 4) |
         ((r->check_bf16_value & 1) << 5) |
         ((r->trans_dir & ((1u << 2) - 1)) << 6) |
         ((r->rsv00 & ((1u << 2) - 1)) << 8) |
         ((r->trans_fmt & 1) << 10) |
         ((r->transpose_md & ((1u << 2) - 1)) << 11) |
         ((r->rsv01 & 1) << 13) |
         ((r->intra_cmd_paral & 1) << 14) |
         ((r->outstanding_en & 1) << 15) |
         ((r->cmd_id & ((1u << 16) - 1)) << 16);
}
|
||||
|
||||
/*
 * Initialize *r with the generated default values for a TDMA
 * descriptor: flags/addresses zero, formats default to 0x1, shapes
 * and strides default to 1.  These values come from the descriptor
 * generator -- do not change them by hand.
 */
static inline void reset_tdma_reg(tdma_reg_t *r)
{
  r->vld = 0x0;
  r->compress_en = 0x0;
  r->eod = 0x0;
  r->intp_en = 0x0;
  r->bar_en = 0x0;
  r->check_bf16_value = 0x0;
  r->trans_dir = 0x0;
  r->rsv00 = 0x0;
  r->trans_fmt = 0x0;
  r->transpose_md = 0x0;
  r->rsv01 = 0x0;
  r->intra_cmd_paral = 0x0;
  r->outstanding_en = 0x0;
  r->cmd_id = 0x0;
  r->spec_func = 0x0;
  r->dst_fmt = 0x1;
  r->src_fmt = 0x1;
  r->cmprs_fmt = 0x0;
  r->sys_dtype = 0x0;
  r->rsv2_1 = 0x0;
  r->int8_sign = 0x0;
  r->compress_zero_guard = 0x0;
  r->int8_rnd_mode = 0x0;
  r->wait_id_tpu = 0x0;
  r->wait_id_other_tdma = 0x0;
  r->wait_id_sdma = 0x0;
  r->const_val = 0x0;
  r->src_base_reg_sel = 0x0;
  r->mv_lut_idx = 0x0;
  r->dst_base_reg_sel = 0x0;
  r->mv_lut_base = 0x0;
  r->rsv4_5 = 0x0;
  r->dst_h_stride = 0x1;
  r->dst_c_stride_low = 0x1;
  r->dst_n_stride = 0x1;
  r->src_h_stride = 0x1;
  r->src_c_stride_low = 0x1;
  r->src_n_stride = 0x1;
  r->dst_c = 0x1;
  r->src_c = 0x1;
  r->dst_w = 0x1;
  r->dst_h = 0x1;
  r->src_w = 0x1;
  r->src_h = 0x1;
  r->dst_base_addr_low = 0x0;
  r->src_base_addr_low = 0x0;
  r->src_n = 0x1;
  r->dst_base_addr_high = 0x0;
  r->src_base_addr_high = 0x0;
  r->src_c_stride_high = 0x0;
  r->dst_c_stride_high = 0x0;
  r->compress_bias0 = 0x0;
  r->compress_bias1 = 0x0;
  r->layer_ID = 0x0;
}
|
||||
|
||||
/*
 * Debug helper: print every field of *r to stdout as "name: 0xVAL",
 * preceded by a "--- tag ---" banner.  Field order matches the
 * struct declaration order.
 */
static inline void trace_tdma_reg(tdma_reg_t *r, const char *tag)
{
/* Print one field; the ullong cast keeps %llx portable. */
#define trace_one_reg(name) \
  printf("  %s: 0x%llx\n", #name, (ullong)r->name)

  printf("--- %s ---\n", tag);
  trace_one_reg(vld);
  trace_one_reg(compress_en);
  trace_one_reg(eod);
  trace_one_reg(intp_en);
  trace_one_reg(bar_en);
  trace_one_reg(check_bf16_value);
  trace_one_reg(trans_dir);
  trace_one_reg(rsv00);
  trace_one_reg(trans_fmt);
  trace_one_reg(transpose_md);
  trace_one_reg(rsv01);
  trace_one_reg(intra_cmd_paral);
  trace_one_reg(outstanding_en);
  trace_one_reg(cmd_id);
  trace_one_reg(spec_func);
  trace_one_reg(dst_fmt);
  trace_one_reg(src_fmt);
  trace_one_reg(cmprs_fmt);
  trace_one_reg(sys_dtype);
  trace_one_reg(rsv2_1);
  trace_one_reg(int8_sign);
  trace_one_reg(compress_zero_guard);
  trace_one_reg(int8_rnd_mode);
  trace_one_reg(wait_id_tpu);
  trace_one_reg(wait_id_other_tdma);
  trace_one_reg(wait_id_sdma);
  trace_one_reg(const_val);
  trace_one_reg(src_base_reg_sel);
  trace_one_reg(mv_lut_idx);
  trace_one_reg(dst_base_reg_sel);
  trace_one_reg(mv_lut_base);
  trace_one_reg(rsv4_5);
  trace_one_reg(dst_h_stride);
  trace_one_reg(dst_c_stride_low);
  trace_one_reg(dst_n_stride);
  trace_one_reg(src_h_stride);
  trace_one_reg(src_c_stride_low);
  trace_one_reg(src_n_stride);
  trace_one_reg(dst_c);
  trace_one_reg(src_c);
  trace_one_reg(dst_w);
  trace_one_reg(dst_h);
  trace_one_reg(src_w);
  trace_one_reg(src_h);
  trace_one_reg(dst_base_addr_low);
  trace_one_reg(src_base_addr_low);
  trace_one_reg(src_n);
  trace_one_reg(dst_base_addr_high);
  trace_one_reg(src_base_addr_high);
  trace_one_reg(src_c_stride_high);
  trace_one_reg(dst_c_stride_high);
  trace_one_reg(compress_bias0);
  trace_one_reg(compress_bias1);
  trace_one_reg(layer_ID);
}
|
||||
#endif /* CV181X_TDMA_REG_H */
|
||||
622
cvikernel/include/cvikernel/cv181x/cv181x_tiu_reg.h
Normal file
622
cvikernel/include/cvikernel/cv181x/cv181x_tiu_reg.h
Normal file
@ -0,0 +1,622 @@
|
||||
#ifndef CV181X_TIU_REG_H
#define CV181X_TIU_REG_H

/*
 * This file is generated by tools. Do not edit it manually.
 */

#include <stdint.h>
#include <stdio.h>

/* One TIU command descriptor is 0x70 bytes = 28 32-bit words. */
#define TIU_DESC_REG_BYTES (0x70)
#define TIU_ENGINE_DESCRIPTOR_NUM (TIU_DESC_REG_BYTES >> 2)

// TIU operation data type
#define DCR_TYPE_CONV_FIX8B 0
#define DCR_TYPE_DEPTHWISE_POOL_FIX8B 1
#define DCR_TYPE_FC_FIX8B 2
#define DCR_TYPE_TENSOR_ARITH_FIX8B 3
#define NR_DCR_TYPES 4

/* Sub-opcodes used with DCR_TYPE_TENSOR_ARITH_FIX8B. */
#define TENSOR_MUL_FIX8B 0
#define TENSOR_MAC_FIX8B 1
#define TENSOR_ADD_FIX8B 2
#define TENSOR_SUB_FIX8B 3
#define TENSOR_MAX_FIX8B 4
#define TENSOR_MIN_FIX8B 5
#define TENSOR_SHIFT_FIX8B 6
#define TENSOR_AND_FIX8B 7
#define TENSOR_OR_FIX8B 8
#define TENSOR_XOR_FIX8B 9
#define TENSOR_COPY_FIX8B 10
#define TENSOR_GE_FIX8B 11

/* Shorthand used by the trace helpers' printf casts below. */
typedef unsigned long long ullong;
|
||||
|
||||
/*
 * Unpacked view of one TIU command descriptor.
 *
 * Each member mirrors one bit-field of the 28-word hardware descriptor;
 * parse_tiu_reg()/emit_tiu_reg() convert between this struct and the raw
 * word array. All members are plain uint32_t even when the hardware field
 * is narrower; dummy*/rsvd* members hold don't-care/reserved bits.
 */
typedef struct {
  /* word 0: command control and task type */
  uint32_t cmd_en;
  uint32_t cmd_end;
  uint32_t cmd_id_en;
  uint32_t cmd_keep;
  uint32_t cmd_intr_en;
  uint32_t tsk_typ;
  uint32_t tsk_eu_typ;
  uint32_t tsk_opd_num;
  uint32_t opt_res_shift;
  uint32_t opt_left_shift;
  uint32_t opt_shift_typ;
  uint32_t opt_rshift_typ;
  uint32_t dummy1;
  uint32_t opd_typ;
  uint32_t opt_chl_quan;
  /* word 1: TPU/GDMA sync ids; word 2: quantization multiplier */
  uint32_t cmd_id_tpu;
  uint32_t cmd_id_gdma;
  uint32_t quan_m;
  /* word 3: operand sign/segment options, relu, ps32 mode */
  uint32_t opt_res0_sign;
  uint32_t opt_opd0_sign;
  uint32_t opt_opd1_sign;
  uint32_t opt_opd2_sign;
  uint32_t opt_res0_seg;
  uint32_t opt_opd0_seg;
  uint32_t opt_opd1_seg;
  uint32_t opt_opd2_seg;
  uint32_t ps32_md;
  uint32_t double_conv;
  uint32_t opt_left_tran;
  uint32_t fp_round_typ;
  uint32_t opt_relu_typ;
  uint32_t opt_relu_value;
  uint32_t cmd_pre_exe_typ;
  uint32_t opt_res_add;
  uint32_t rsvd0;
  /* words 4-5: convolution insertion/padding parameters */
  uint32_t conv_opd0_x_ins0;
  uint32_t conv_opd0_y_ins0;
  uint32_t conv_opd0_x_ins0_last;
  uint32_t conv_opd0_y_ins0_last;
  uint32_t conv_opd1_x_ins0;
  uint32_t conv_opd1_y_ins0;
  uint32_t dummy0;
  uint32_t opd0_ins_val;
  uint32_t conv_opd0_up_pad;
  uint32_t conv_opd0_dn_pad;
  uint32_t conv_opd0_lf_pad;
  uint32_t conv_opd0_rt_pad;
  /* words 6-7: result shape and convolution strides */
  uint32_t res0_n;
  uint32_t res0_c;
  uint32_t res0_h;
  uint32_t res0_w;
  uint32_t conv_op_x_str;
  uint32_t conv_op_y_str;
  uint32_t cmd_pre_exe;
  uint32_t rsvd1;
  /* words 8-10: local-memory addresses and const-operand flags */
  uint32_t res0_addr;
  uint32_t opd0_addr;
  uint32_t opd1_addr;
  uint32_t opd2_addr;
  uint32_t opt_opd0_const;
  uint32_t opt_opd1_const;
  uint32_t opt_opd2_const;
  uint32_t short_nchwstr_same;
  uint32_t short_res0_str;
  uint32_t short_opd0_str;
  uint32_t short_opd1_str;
  uint32_t short_opd2_str;
  uint32_t dummy2;
  /* words 11-15: operand shapes */
  uint32_t opd0_n;
  uint32_t opd0_c;
  uint32_t dummy3;
  uint32_t rsvd2;
  uint32_t opd0_h;
  uint32_t opd0_w;
  uint32_t opd1_n;
  uint32_t opd1_c;
  uint32_t opd1_h;
  uint32_t opd1_w;
  uint32_t opd2_n;
  uint32_t opd2_c;
  uint32_t opd2_h;
  uint32_t opd2_w;
  uint32_t dummy4;
  uint32_t rsvd3;
  /* words 16-27: layer id and per-operand strides */
  uint32_t layer_info;
  uint32_t res0_n_str;
  uint32_t res0_c_str;
  uint32_t res0_h_str;
  uint32_t res0_w_str;
  uint32_t res0_b_str;
  uint32_t opd0_n_str;
  uint32_t dummy5;
  uint32_t rsvd4;
  uint32_t opd0_c_str;
  uint32_t opd0_h_str;
  uint32_t opd0_w_str;
  uint32_t opd0_b_str;
  uint32_t opd1_n_str;
  uint32_t opd1_c_str;
  uint32_t opd1_h_str;
  uint32_t dummy6;
  uint32_t rsvd5;
  uint32_t opd1_w_str;
  uint32_t opd1_b_str;
  uint32_t opd2_n_str;
  uint32_t opd2_c_str;
  uint32_t opd2_h_str;
  uint32_t opd2_w_str;
  uint32_t opd2_b_str;
  uint32_t dummy7;
  uint32_t rsvd6;
} tiu_reg_t;
|
||||
|
||||
/*
 * Unpack a raw 28-word TIU descriptor `p` into the field struct `r`.
 *
 * Exact inverse of emit_tiu_reg(): every field is extracted with a shift
 * and a width mask. Fields that straddle a word boundary (res0_h,
 * opd0_addr, opd1_n, opd1_w) are assembled from two words with `|=`.
 * The (uint64_t) casts on those lines are harmless no-ops — the
 * destination members are uint32_t and the shifted values fit in 32 bits.
 */
static inline void parse_tiu_reg(tiu_reg_t *r, const uint32_t *p)
{
  /* word 0 */
  r->cmd_en = p[0] & 1;
  r->cmd_end = (p[0] >> 1) & 1;
  r->cmd_id_en = (p[0] >> 2) & 1;
  r->cmd_keep = (p[0] >> 3) & 1;
  r->cmd_intr_en = (p[0] >> 4) & 1;
  r->tsk_typ = (p[0] >> 5) & ((1u << 4) - 1);
  r->tsk_eu_typ = (p[0] >> 9) & ((1u << 5) - 1);
  r->tsk_opd_num = (p[0] >> 14) & ((1u << 2) - 1);
  r->opt_res_shift = (p[0] >> 16) & ((1u << 6) - 1);
  r->opt_left_shift = (p[0] >> 22) & ((1u << 5) - 1);
  r->opt_shift_typ = (p[0] >> 27) & 1;
  r->opt_rshift_typ = (p[0] >> 28) & 1;
  r->dummy1 = (p[0] >> 29) & 1;
  r->opd_typ = (p[0] >> 30) & 1;
  r->opt_chl_quan = (p[0] >> 31) & 1;
  /* words 1-2 */
  r->cmd_id_tpu = p[1] & ((1u << 16) - 1);
  r->cmd_id_gdma = (p[1] >> 16) & ((1u << 16) - 1);
  r->quan_m = p[2];
  /* word 3 */
  r->opt_res0_sign = p[3] & 1;
  r->opt_opd0_sign = (p[3] >> 1) & 1;
  r->opt_opd1_sign = (p[3] >> 2) & 1;
  r->opt_opd2_sign = (p[3] >> 3) & 1;
  r->opt_res0_seg = (p[3] >> 4) & ((1u << 2) - 1);
  r->opt_opd0_seg = (p[3] >> 6) & ((1u << 2) - 1);
  r->opt_opd1_seg = (p[3] >> 8) & ((1u << 2) - 1);
  r->opt_opd2_seg = (p[3] >> 10) & 1;
  r->ps32_md = (p[3] >> 11) & ((1u << 2) - 1);
  r->double_conv = (p[3] >> 13) & 1;
  r->opt_left_tran = (p[3] >> 14) & 1;
  r->fp_round_typ = (p[3] >> 15) & 1;
  r->opt_relu_typ = (p[3] >> 16) & ((1u << 2) - 1);
  r->opt_relu_value = (p[3] >> 18) & ((1u << 8) - 1);
  r->cmd_pre_exe_typ = (p[3] >> 26) & 1;
  r->opt_res_add = (p[3] >> 27) & 1;
  r->rsvd0 = (p[3] >> 28) & ((1u << 4) - 1);
  /* word 4 */
  r->conv_opd0_x_ins0 = p[4] & ((1u << 4) - 1);
  r->conv_opd0_y_ins0 = (p[4] >> 4) & ((1u << 4) - 1);
  r->conv_opd0_x_ins0_last = (p[4] >> 8) & ((1u << 4) - 1);
  r->conv_opd0_y_ins0_last = (p[4] >> 12) & ((1u << 4) - 1);
  r->conv_opd1_x_ins0 = (p[4] >> 16) & ((1u << 4) - 1);
  r->conv_opd1_y_ins0 = (p[4] >> 20) & ((1u << 4) - 1);
  r->dummy0 = (p[4] >> 24) & ((1u << 8) - 1);
  /* word 5 */
  r->opd0_ins_val = p[5] & ((1u << 16) - 1);
  r->conv_opd0_up_pad = (p[5] >> 16) & ((1u << 4) - 1);
  r->conv_opd0_dn_pad = (p[5] >> 20) & ((1u << 4) - 1);
  r->conv_opd0_lf_pad = (p[5] >> 24) & ((1u << 4) - 1);
  r->conv_opd0_rt_pad = (p[5] >> 28) & ((1u << 4) - 1);
  /* words 6-7: res0_h is split 8 low bits / 4 high bits across the boundary */
  r->res0_n = p[6] & ((1u << 12) - 1);
  r->res0_c = (p[6] >> 12) & ((1u << 12) - 1);
  r->res0_h = (p[6] >> 24) & ((1u << 8) - 1);
  r->res0_h |= (uint64_t)(p[7] & ((1u << 4) - 1)) << 8;
  r->res0_w = (p[7] >> 4) & ((1u << 12) - 1);
  r->conv_op_x_str = (p[7] >> 16) & ((1u << 5) - 1);
  r->conv_op_y_str = (p[7] >> 21) & ((1u << 5) - 1);
  r->cmd_pre_exe = (p[7] >> 26) & ((1u << 2) - 1);
  r->rsvd1 = (p[7] >> 28) & ((1u << 4) - 1);
  /* words 8-10: opd0_addr is split 8/16 across words 8 and 9 */
  r->res0_addr = p[8] & ((1u << 24) - 1);
  r->opd0_addr = (p[8] >> 24) & ((1u << 8) - 1);
  r->opd0_addr |= (uint64_t)(p[9] & ((1u << 16) - 1)) << 8;
  r->opd1_addr = (p[9] >> 16) & ((1u << 16) - 1);
  r->opd2_addr = p[10] & ((1u << 16) - 1);
  r->opt_opd0_const = (p[10] >> 16) & 1;
  r->opt_opd1_const = (p[10] >> 17) & 1;
  r->opt_opd2_const = (p[10] >> 18) & 1;
  r->short_nchwstr_same = (p[10] >> 19) & 1;
  r->short_res0_str = (p[10] >> 20) & ((1u << 2) - 1);
  r->short_opd0_str = (p[10] >> 22) & ((1u << 2) - 1);
  r->short_opd1_str = (p[10] >> 24) & ((1u << 2) - 1);
  r->short_opd2_str = (p[10] >> 26) & ((1u << 2) - 1);
  r->dummy2 = (p[10] >> 28) & ((1u << 4) - 1);
  /* words 11-15: operand shapes; opd1_n split 8/4, opd1_w split 4/8 */
  r->opd0_n = p[11] & ((1u << 12) - 1);
  r->opd0_c = (p[11] >> 12) & ((1u << 12) - 1);
  r->dummy3 = (p[11] >> 24) & ((1u << 4) - 1);
  r->rsvd2 = (p[11] >> 28) & ((1u << 4) - 1);
  r->opd0_h = p[12] & ((1u << 12) - 1);
  r->opd0_w = (p[12] >> 12) & ((1u << 12) - 1);
  r->opd1_n = (p[12] >> 24) & ((1u << 8) - 1);
  r->opd1_n |= (uint64_t)(p[13] & ((1u << 4) - 1)) << 8;
  r->opd1_c = (p[13] >> 4) & ((1u << 12) - 1);
  r->opd1_h = (p[13] >> 16) & ((1u << 12) - 1);
  r->opd1_w = (p[13] >> 28) & ((1u << 4) - 1);
  r->opd1_w |= (uint64_t)(p[14] & ((1u << 8) - 1)) << 4;
  r->opd2_n = (p[14] >> 8) & ((1u << 12) - 1);
  r->opd2_c = (p[14] >> 20) & ((1u << 12) - 1);
  r->opd2_h = p[15] & ((1u << 12) - 1);
  r->opd2_w = (p[15] >> 12) & ((1u << 12) - 1);
  r->dummy4 = (p[15] >> 24) & ((1u << 4) - 1);
  r->rsvd3 = (p[15] >> 28) & ((1u << 4) - 1);
  /* words 16-27: layer id and strides, two 16-bit fields per word */
  r->layer_info = p[16] & ((1u << 16) - 1);
  r->res0_n_str = (p[16] >> 16) & ((1u << 16) - 1);
  r->res0_c_str = p[17] & ((1u << 16) - 1);
  r->res0_h_str = (p[17] >> 16) & ((1u << 16) - 1);
  r->res0_w_str = p[18] & ((1u << 16) - 1);
  r->res0_b_str = (p[18] >> 16) & ((1u << 16) - 1);
  r->opd0_n_str = p[19] & ((1u << 16) - 1);
  r->dummy5 = (p[19] >> 16) & ((1u << 12) - 1);
  r->rsvd4 = (p[19] >> 28) & ((1u << 4) - 1);
  r->opd0_c_str = p[20] & ((1u << 16) - 1);
  r->opd0_h_str = (p[20] >> 16) & ((1u << 16) - 1);
  r->opd0_w_str = p[21] & ((1u << 16) - 1);
  r->opd0_b_str = (p[21] >> 16) & ((1u << 16) - 1);
  r->opd1_n_str = p[22] & ((1u << 16) - 1);
  r->opd1_c_str = (p[22] >> 16) & ((1u << 16) - 1);
  r->opd1_h_str = p[23] & ((1u << 16) - 1);
  r->dummy6 = (p[23] >> 16) & ((1u << 12) - 1);
  r->rsvd5 = (p[23] >> 28) & ((1u << 4) - 1);
  r->opd1_w_str = p[24] & ((1u << 16) - 1);
  r->opd1_b_str = (p[24] >> 16) & ((1u << 16) - 1);
  r->opd2_n_str = p[25] & ((1u << 16) - 1);
  r->opd2_c_str = (p[25] >> 16) & ((1u << 16) - 1);
  r->opd2_h_str = p[26] & ((1u << 16) - 1);
  r->opd2_w_str = (p[26] >> 16) & ((1u << 16) - 1);
  r->opd2_b_str = p[27] & ((1u << 16) - 1);
  r->dummy7 = (p[27] >> 16) & ((1u << 12) - 1);
  r->rsvd6 = (p[27] >> 28) & ((1u << 4) - 1);
}
|
||||
|
||||
/*
 * Pack the field struct `r` back into a raw 28-word descriptor at `_p`.
 *
 * Exact inverse of parse_tiu_reg(). Words are written high-index first
 * (p[27] down to p[0]) through a volatile pointer — presumably so the
 * writes are not reordered/elided when `_p` maps hardware registers and
 * word 0 (with cmd_en) lands last; TODO confirm against the HW manual.
 * NOTE(review): `typeof` is a GNU extension; this header is not strict
 * ISO C.
 */
static inline void emit_tiu_reg(const tiu_reg_t *r, uint32_t *_p)
{
  volatile uint32_t *p = (typeof(p))_p;
  p[27] = (r->opd2_b_str & ((1u << 16) - 1)) |
          ((r->dummy7 & ((1u << 12) - 1)) << 16) |
          ((r->rsvd6 & ((1u << 4) - 1)) << 28);
  p[26] = (r->opd2_h_str & ((1u << 16) - 1)) |
          ((r->opd2_w_str & ((1u << 16) - 1)) << 16);
  p[25] = (r->opd2_n_str & ((1u << 16) - 1)) |
          ((r->opd2_c_str & ((1u << 16) - 1)) << 16);
  p[24] = (r->opd1_w_str & ((1u << 16) - 1)) |
          ((r->opd1_b_str & ((1u << 16) - 1)) << 16);
  p[23] = (r->opd1_h_str & ((1u << 16) - 1)) |
          ((r->dummy6 & ((1u << 12) - 1)) << 16) |
          ((r->rsvd5 & ((1u << 4) - 1)) << 28);
  p[22] = (r->opd1_n_str & ((1u << 16) - 1)) |
          ((r->opd1_c_str & ((1u << 16) - 1)) << 16);
  p[21] = (r->opd0_w_str & ((1u << 16) - 1)) |
          ((r->opd0_b_str & ((1u << 16) - 1)) << 16);
  p[20] = (r->opd0_c_str & ((1u << 16) - 1)) |
          ((r->opd0_h_str & ((1u << 16) - 1)) << 16);
  p[19] = (r->opd0_n_str & ((1u << 16) - 1)) |
          ((r->dummy5 & ((1u << 12) - 1)) << 16) |
          ((r->rsvd4 & ((1u << 4) - 1)) << 28);
  p[18] = (r->res0_w_str & ((1u << 16) - 1)) |
          ((r->res0_b_str & ((1u << 16) - 1)) << 16);
  p[17] = (r->res0_c_str & ((1u << 16) - 1)) |
          ((r->res0_h_str & ((1u << 16) - 1)) << 16);
  p[16] = (r->layer_info & ((1u << 16) - 1)) |
          ((r->res0_n_str & ((1u << 16) - 1)) << 16);
  p[15] = (r->opd2_h & ((1u << 12) - 1)) |
          ((r->opd2_w & ((1u << 12) - 1)) << 12) |
          ((r->dummy4 & ((1u << 4) - 1)) << 24) |
          ((r->rsvd3 & ((1u << 4) - 1)) << 28);
  /* opd1_w is split 4 (word 13) / 8 (word 14); opd1_n split 8 (12) / 4 (13) */
  p[14] = ((r->opd1_w >> 4) & ((1u << 8) - 1)) |
          ((r->opd2_n & ((1u << 12) - 1)) << 8) |
          ((r->opd2_c & ((1u << 12) - 1)) << 20);
  p[13] = ((r->opd1_n >> 8) & ((1u << 4) - 1)) |
          ((r->opd1_c & ((1u << 12) - 1)) << 4) |
          ((r->opd1_h & ((1u << 12) - 1)) << 16) |
          ((r->opd1_w & ((1u << 4) - 1)) << 28);
  p[12] = (r->opd0_h & ((1u << 12) - 1)) |
          ((r->opd0_w & ((1u << 12) - 1)) << 12) |
          ((r->opd1_n & ((1u << 8) - 1)) << 24);
  p[11] = (r->opd0_n & ((1u << 12) - 1)) |
          ((r->opd0_c & ((1u << 12) - 1)) << 12) |
          ((r->dummy3 & ((1u << 4) - 1)) << 24) |
          ((r->rsvd2 & ((1u << 4) - 1)) << 28);
  p[10] = (r->opd2_addr & ((1u << 16) - 1)) |
          ((r->opt_opd0_const & 1) << 16) |
          ((r->opt_opd1_const & 1) << 17) |
          ((r->opt_opd2_const & 1) << 18) |
          ((r->short_nchwstr_same & 1) << 19) |
          ((r->short_res0_str & ((1u << 2) - 1)) << 20) |
          ((r->short_opd0_str & ((1u << 2) - 1)) << 22) |
          ((r->short_opd1_str & ((1u << 2) - 1)) << 24) |
          ((r->short_opd2_str & ((1u << 2) - 1)) << 26) |
          ((r->dummy2 & ((1u << 4) - 1)) << 28);
  /* opd0_addr is split 8 (word 8) / 16 (word 9) */
  p[9] = ((r->opd0_addr >> 8) & ((1u << 16) - 1)) |
         ((r->opd1_addr & ((1u << 16) - 1)) << 16);
  p[8] = (r->res0_addr & ((1u << 24) - 1)) |
         ((r->opd0_addr & ((1u << 8) - 1)) << 24);
  /* res0_h is split 8 (word 6) / 4 (word 7) */
  p[7] = ((r->res0_h >> 8) & ((1u << 4) - 1)) |
         ((r->res0_w & ((1u << 12) - 1)) << 4) |
         ((r->conv_op_x_str & ((1u << 5) - 1)) << 16) |
         ((r->conv_op_y_str & ((1u << 5) - 1)) << 21) |
         ((r->cmd_pre_exe & ((1u << 2) - 1)) << 26) |
         ((r->rsvd1 & ((1u << 4) - 1)) << 28);
  p[6] = (r->res0_n & ((1u << 12) - 1)) |
         ((r->res0_c & ((1u << 12) - 1)) << 12) |
         ((r->res0_h & ((1u << 8) - 1)) << 24);
  p[5] = (r->opd0_ins_val & ((1u << 16) - 1)) |
         ((r->conv_opd0_up_pad & ((1u << 4) - 1)) << 16) |
         ((r->conv_opd0_dn_pad & ((1u << 4) - 1)) << 20) |
         ((r->conv_opd0_lf_pad & ((1u << 4) - 1)) << 24) |
         ((r->conv_opd0_rt_pad & ((1u << 4) - 1)) << 28);
  p[4] = (r->conv_opd0_x_ins0 & ((1u << 4) - 1)) |
         ((r->conv_opd0_y_ins0 & ((1u << 4) - 1)) << 4) |
         ((r->conv_opd0_x_ins0_last & ((1u << 4) - 1)) << 8) |
         ((r->conv_opd0_y_ins0_last & ((1u << 4) - 1)) << 12) |
         ((r->conv_opd1_x_ins0 & ((1u << 4) - 1)) << 16) |
         ((r->conv_opd1_y_ins0 & ((1u << 4) - 1)) << 20) |
         ((r->dummy0 & ((1u << 8) - 1)) << 24);
  p[3] = (r->opt_res0_sign & 1) |
         ((r->opt_opd0_sign & 1) << 1) |
         ((r->opt_opd1_sign & 1) << 2) |
         ((r->opt_opd2_sign & 1) << 3) |
         ((r->opt_res0_seg & ((1u << 2) - 1)) << 4) |
         ((r->opt_opd0_seg & ((1u << 2) - 1)) << 6) |
         ((r->opt_opd1_seg & ((1u << 2) - 1)) << 8) |
         ((r->opt_opd2_seg & 1) << 10) |
         ((r->ps32_md & ((1u << 2) - 1)) << 11) |
         ((r->double_conv & 1) << 13) |
         ((r->opt_left_tran & 1) << 14) |
         ((r->fp_round_typ & 1) << 15) |
         ((r->opt_relu_typ & ((1u << 2) - 1)) << 16) |
         ((r->opt_relu_value & ((1u << 8) - 1)) << 18) |
         ((r->cmd_pre_exe_typ & 1) << 26) |
         ((r->opt_res_add & 1) << 27) |
         ((r->rsvd0 & ((1u << 4) - 1)) << 28);
  p[2] = (r->quan_m & (((uint64_t)1 << 32) - 1));
  p[1] = (r->cmd_id_tpu & ((1u << 16) - 1)) |
         ((r->cmd_id_gdma & ((1u << 16) - 1)) << 16);
  p[0] = (r->cmd_en & 1) |
         ((r->cmd_end & 1) << 1) |
         ((r->cmd_id_en & 1) << 2) |
         ((r->cmd_keep & 1) << 3) |
         ((r->cmd_intr_en & 1) << 4) |
         ((r->tsk_typ & ((1u << 4) - 1)) << 5) |
         ((r->tsk_eu_typ & ((1u << 5) - 1)) << 9) |
         ((r->tsk_opd_num & ((1u << 2) - 1)) << 14) |
         ((r->opt_res_shift & ((1u << 6) - 1)) << 16) |
         ((r->opt_left_shift & ((1u << 5) - 1)) << 22) |
         ((r->opt_shift_typ & 1) << 27) |
         ((r->opt_rshift_typ & 1) << 28) |
         ((r->dummy1 & 1) << 29) |
         ((r->opd_typ & 1) << 30) |
         ((r->opt_chl_quan & 1) << 31);
}
|
||||
|
||||
/*
 * Initialize `r` to the descriptor's default/reset values.
 *
 * The values below are the tool-generated power-on defaults (note the
 * non-zero ones: tsk_opd_num=3, shift options, unit shapes of 1x1x1x16,
 * stride defaults of 0x10, and rsvd2..rsvd6 counting 2..6). Do not
 * "clean up" these constants — they encode hardware reset state.
 */
static inline void reset_tiu_reg(tiu_reg_t *r)
{
  r->cmd_en = 0x0;
  r->cmd_end = 0x0;
  r->cmd_id_en = 0x0;
  r->cmd_keep = 0x0;
  r->cmd_intr_en = 0x0;
  r->tsk_typ = 0x0;
  r->tsk_eu_typ = 0x0;
  r->tsk_opd_num = 0x3;
  r->opt_res_shift = 0xa;
  r->opt_left_shift = 0x2;
  r->opt_shift_typ = 0x1;
  r->opt_rshift_typ = 0x1;
  r->dummy1 = 0x0;
  r->opd_typ = 0x0;
  r->opt_chl_quan = 0x0;
  r->cmd_id_tpu = 0x0;
  r->cmd_id_gdma = 0x0;
  r->quan_m = 0x0;
  r->opt_res0_sign = 0x0;
  r->opt_opd0_sign = 0x0;
  r->opt_opd1_sign = 0x1;
  r->opt_opd2_sign = 0x1;
  r->opt_res0_seg = 0x1;
  r->opt_opd0_seg = 0x1;
  r->opt_opd1_seg = 0x1;
  r->opt_opd2_seg = 0x0;
  r->ps32_md = 0x0;
  r->double_conv = 0x0;
  r->opt_left_tran = 0x0;
  r->fp_round_typ = 0x0;
  r->opt_relu_typ = 0x0;
  r->opt_relu_value = 0x0;
  r->cmd_pre_exe_typ = 0x0;
  r->opt_res_add = 0x0;
  r->rsvd0 = 0x0;
  r->conv_opd0_x_ins0 = 0x0;
  r->conv_opd0_y_ins0 = 0x0;
  r->conv_opd0_x_ins0_last = 0x0;
  r->conv_opd0_y_ins0_last = 0x0;
  r->conv_opd1_x_ins0 = 0x0;
  r->conv_opd1_y_ins0 = 0x0;
  r->dummy0 = 0x0;
  r->opd0_ins_val = 0x0;
  r->conv_opd0_up_pad = 0x0;
  r->conv_opd0_dn_pad = 0x0;
  r->conv_opd0_lf_pad = 0x0;
  r->conv_opd0_rt_pad = 0x0;
  r->res0_n = 0x1;
  r->res0_c = 0x1;
  r->res0_h = 0x1;
  r->res0_w = 0x10;
  r->conv_op_x_str = 0x1;
  r->conv_op_y_str = 0x1;
  r->cmd_pre_exe = 0x0;
  r->rsvd1 = 0x1;
  r->res0_addr = 0x0;
  r->opd0_addr = 0x0;
  r->opd1_addr = 0x0;
  r->opd2_addr = 0x0;
  r->opt_opd0_const = 0x0;
  r->opt_opd1_const = 0x0;
  r->opt_opd2_const = 0x0;
  r->short_nchwstr_same = 0x0;
  r->short_res0_str = 0x0;
  r->short_opd0_str = 0x0;
  r->short_opd1_str = 0x0;
  r->short_opd2_str = 0x0;
  r->dummy2 = 0x0;
  r->opd0_n = 0x1;
  r->opd0_c = 0x1;
  r->dummy3 = 0x0;
  r->rsvd2 = 0x2;
  r->opd0_h = 0x1;
  r->opd0_w = 0x10;
  r->opd1_n = 0x1;
  r->opd1_c = 0x1;
  r->opd1_h = 0x1;
  r->opd1_w = 0x10;
  r->opd2_n = 0x1;
  r->opd2_c = 0x1;
  r->opd2_h = 0x1;
  r->opd2_w = 0x10;
  r->dummy4 = 0x0;
  r->rsvd3 = 0x3;
  r->layer_info = 0x0;
  r->res0_n_str = 0x10;
  r->res0_c_str = 0x10;
  r->res0_h_str = 0x0;
  r->res0_w_str = 0x1;
  r->res0_b_str = 0x10;
  r->opd0_n_str = 0x10;
  r->dummy5 = 0x0;
  r->rsvd4 = 0x4;
  r->opd0_c_str = 0x10;
  r->opd0_h_str = 0x0;
  r->opd0_w_str = 0x1;
  r->opd0_b_str = 0x10;
  r->opd1_n_str = 0x10;
  r->opd1_c_str = 0x10;
  r->opd1_h_str = 0x0;
  r->dummy6 = 0x0;
  r->rsvd5 = 0x5;
  r->opd1_w_str = 0x1;
  r->opd1_b_str = 0x10;
  r->opd2_n_str = 0x10;
  r->opd2_c_str = 0x10;
  r->opd2_h_str = 0x0;
  r->opd2_w_str = 0x1;
  r->opd2_b_str = 0x10;
  r->dummy7 = 0x0;
  r->rsvd6 = 0x6;
}
|
||||
|
||||
/*
 * Debug helper: print every field of `r` to stdout, one per line,
 * preceded by a "--- tag ---" banner. Field order follows the struct
 * layout. The trace_one_reg helper macro is #undef'd at the end so it
 * does not leak into translation units that include this header
 * (fix: previously it stayed defined past this function).
 */
static inline void trace_tiu_reg(tiu_reg_t *r, const char *tag)
{
#define trace_one_reg(name) \
  printf("  %s: 0x%llx\n", #name, (ullong)r->name)

  printf("--- %s ---\n", tag);
  trace_one_reg(cmd_en);
  trace_one_reg(cmd_end);
  trace_one_reg(cmd_id_en);
  trace_one_reg(cmd_keep);
  trace_one_reg(cmd_intr_en);
  trace_one_reg(tsk_typ);
  trace_one_reg(tsk_eu_typ);
  trace_one_reg(tsk_opd_num);
  trace_one_reg(opt_res_shift);
  trace_one_reg(opt_left_shift);
  trace_one_reg(opt_shift_typ);
  trace_one_reg(opt_rshift_typ);
  trace_one_reg(dummy1);
  trace_one_reg(opd_typ);
  trace_one_reg(opt_chl_quan);
  trace_one_reg(cmd_id_tpu);
  trace_one_reg(cmd_id_gdma);
  trace_one_reg(quan_m);
  trace_one_reg(opt_res0_sign);
  trace_one_reg(opt_opd0_sign);
  trace_one_reg(opt_opd1_sign);
  trace_one_reg(opt_opd2_sign);
  trace_one_reg(opt_res0_seg);
  trace_one_reg(opt_opd0_seg);
  trace_one_reg(opt_opd1_seg);
  trace_one_reg(opt_opd2_seg);
  trace_one_reg(ps32_md);
  trace_one_reg(double_conv);
  trace_one_reg(opt_left_tran);
  trace_one_reg(fp_round_typ);
  trace_one_reg(opt_relu_typ);
  trace_one_reg(opt_relu_value);
  trace_one_reg(cmd_pre_exe_typ);
  trace_one_reg(opt_res_add);
  trace_one_reg(rsvd0);
  trace_one_reg(conv_opd0_x_ins0);
  trace_one_reg(conv_opd0_y_ins0);
  trace_one_reg(conv_opd0_x_ins0_last);
  trace_one_reg(conv_opd0_y_ins0_last);
  trace_one_reg(conv_opd1_x_ins0);
  trace_one_reg(conv_opd1_y_ins0);
  trace_one_reg(dummy0);
  trace_one_reg(opd0_ins_val);
  trace_one_reg(conv_opd0_up_pad);
  trace_one_reg(conv_opd0_dn_pad);
  trace_one_reg(conv_opd0_lf_pad);
  trace_one_reg(conv_opd0_rt_pad);
  trace_one_reg(res0_n);
  trace_one_reg(res0_c);
  trace_one_reg(res0_h);
  trace_one_reg(res0_w);
  trace_one_reg(conv_op_x_str);
  trace_one_reg(conv_op_y_str);
  trace_one_reg(cmd_pre_exe);
  trace_one_reg(rsvd1);
  trace_one_reg(res0_addr);
  trace_one_reg(opd0_addr);
  trace_one_reg(opd1_addr);
  trace_one_reg(opd2_addr);
  trace_one_reg(opt_opd0_const);
  trace_one_reg(opt_opd1_const);
  trace_one_reg(opt_opd2_const);
  trace_one_reg(short_nchwstr_same);
  trace_one_reg(short_res0_str);
  trace_one_reg(short_opd0_str);
  trace_one_reg(short_opd1_str);
  trace_one_reg(short_opd2_str);
  trace_one_reg(dummy2);
  trace_one_reg(opd0_n);
  trace_one_reg(opd0_c);
  trace_one_reg(dummy3);
  trace_one_reg(rsvd2);
  trace_one_reg(opd0_h);
  trace_one_reg(opd0_w);
  trace_one_reg(opd1_n);
  trace_one_reg(opd1_c);
  trace_one_reg(opd1_h);
  trace_one_reg(opd1_w);
  trace_one_reg(opd2_n);
  trace_one_reg(opd2_c);
  trace_one_reg(opd2_h);
  trace_one_reg(opd2_w);
  trace_one_reg(dummy4);
  trace_one_reg(rsvd3);
  trace_one_reg(layer_info);
  trace_one_reg(res0_n_str);
  trace_one_reg(res0_c_str);
  trace_one_reg(res0_h_str);
  trace_one_reg(res0_w_str);
  trace_one_reg(res0_b_str);
  trace_one_reg(opd0_n_str);
  trace_one_reg(dummy5);
  trace_one_reg(rsvd4);
  trace_one_reg(opd0_c_str);
  trace_one_reg(opd0_h_str);
  trace_one_reg(opd0_w_str);
  trace_one_reg(opd0_b_str);
  trace_one_reg(opd1_n_str);
  trace_one_reg(opd1_c_str);
  trace_one_reg(opd1_h_str);
  trace_one_reg(dummy6);
  trace_one_reg(rsvd5);
  trace_one_reg(opd1_w_str);
  trace_one_reg(opd1_b_str);
  trace_one_reg(opd2_n_str);
  trace_one_reg(opd2_c_str);
  trace_one_reg(opd2_h_str);
  trace_one_reg(opd2_w_str);
  trace_one_reg(opd2_b_str);
  trace_one_reg(dummy7);
  trace_one_reg(rsvd6);

#undef trace_one_reg /* keep the helper macro local to this function */
}
|
||||
#endif /* CV181X_TIU_REG_H */
|
||||
38
cvikernel/include/cvikernel/cv181x/cv181x_tpu_cfg.h
Normal file
38
cvikernel/include/cvikernel/cv181x/cv181x_tpu_cfg.h
Normal file
@ -0,0 +1,38 @@
|
||||
#ifndef __CV181X_TPU_CFG__
#define __CV181X_TPU_CFG__

/* CV181x TPU hardware configuration constants. */
#define CV181X_VER 182202
/* 2^3 = 8 NPU lanes, 2^4 = 16 EUs per lane, 2^15 = 32 KiB local memory. */
#define CV181X_HW_NPU_SHIFT 3
#define CV181X_HW_EU_SHIFT 4
#define CV181X_HW_LMEM_SHIFT 15
#define CV181X_HW_LMEM_BANKS 8
#define CV181X_HW_LMEM_BANK_SIZE 0x1000
#define CV181X_HW_NODE_CHIP_SHIFT 0
#define CV181X_HW_NPU_NUM (1 << CV181X_HW_NPU_SHIFT)
#define CV181X_HW_EU_NUM (1 << CV181X_HW_EU_SHIFT)
#define CV181X_HW_LMEM_SIZE (1 << CV181X_HW_LMEM_SHIFT)
#define CV181X_HW_LMEM_START_ADDR 0x0C000000
#define CV181X_HW_NODE_CHIP_NUM (1 << CV181X_HW_NODE_CHIP_SHIFT)

/* Sanity check: the banks must tile the local memory exactly. */
#if (CV181X_HW_LMEM_SIZE != (CV181X_HW_LMEM_BANK_SIZE * CV181X_HW_LMEM_BANKS))
#error "Set wrong TPU configuration."
#endif

/* Global (DRAM) memory layout: 4 GiB total, with 8 MB command buffers
 * for TIU and TDMA carved out at the front; the rest is the pool. */
#define CV181X_GLOBAL_MEM_START_ADDR 0x0
#define CV181X_GLOBAL_MEM_SIZE 0x100000000 //

#define CV181X_GLOBAL_TIU_CMDBUF_ADDR 0x00000000
#define CV181X_GLOBAL_TDMA_CMDBUF_ADDR 0x00800000
#define CV181X_GLOBAL_TIU_CMDBUF_RESERVED_SIZE 0x00800000 // 8MB
#define CV181X_GLOBAL_TDMA_CMDBUF_RESERVED_SIZE 0x00800000 // 8MB
#define CV181X_GLOBAL_POOL_RESERVED_SIZE (CV181X_GLOBAL_MEM_SIZE - CV181X_GLOBAL_TIU_CMDBUF_RESERVED_SIZE - CV181X_GLOBAL_TDMA_CMDBUF_RESERVED_SIZE)

#define CV181X_UART_CTLR_BASE_ADDR 0x04140000

/* Engine MMIO windows (4 KiB each). */
#define CV181X_TDMA_ENGINE_BASE_ADDR 0x0C100000
#define CV181X_TDMA_ENGINE_END_ADDR (CV181X_TDMA_ENGINE_BASE_ADDR + 0x1000)

#define CV181X_TIU_ENGINE_BASE_ADDR 0x0C101000 //"NPS Register" in memory map?
#define CV181X_TIU_ENGINE_END_ADDR (CV181X_TIU_ENGINE_BASE_ADDR + 0x1000)

#endif
|
||||
1171
cvikernel/include/cvikernel/cvikernel.h
Normal file
1171
cvikernel/include/cvikernel/cvikernel.h
Normal file
File diff suppressed because it is too large
Load Diff
333
cvikernel/include/cvikernel/cvk_fp_convert.h
Normal file
333
cvikernel/include/cvikernel/cvk_fp_convert.h
Normal file
@ -0,0 +1,333 @@
|
||||
#ifndef CVK_FP_CONVERT_H
#define CVK_FP_CONVERT_H

/* On ARM builds the fenv-based rounding-mode control is disabled. */
#if __arm__
#define __DISABLE_FENV__
#endif

#ifndef __DISABLE_FENV__
#include <fenv.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

/* Forward declarations: bf16 <-> int8 / fp32 conversion helpers. */
static inline uint8_t cvk_convert_bf16_u8(uint16_t data);
static inline uint8_t cvk_convert_bf16_u8_rnd(uint16_t data, int int8_rnd_md);
static inline int8_t cvk_convert_bf16_s8_rnd(uint16_t data, int int8_rnd_md);
static inline int8_t cvk_convert_bf16_s8(uint16_t data);
static inline uint16_t cvk_convert_int8_bf16(uint8_t data, uint8_t sign);
static inline uint32_t cvk_convert_fp32_u32(float fp32);
static inline uint32_t cvk_convert_fp32_hex(float val);
static inline float cvk_convert_hex_fp32(uint32_t hval);

static inline float cvk_convert_bf16_fp32(uint16_t bf16);
static inline uint16_t cvk_convert_fp32_bf16(float fp32);

/* Core float32 -> integer quantizer; integer_size: 0 = 32-bit, 1 = 16-bit,
 * 2 = 8-bit (per the comment below); int8_rnd_md selects the rounding mode. */
static inline void cvk_f32_integer(void *if32, void *o_integer, int integer_size, int accumulate, int int8_signed, int int8_rnd_md);
//static inline void f32_integer(void *if32, void *o_integer,
// 0 for 32 bit , 1 for 16 bit , 2 for 8 bit
// int integer_size, int accumulate = 0, int int8_signed = 1, int int8_rnd_md = 0);

/* Type-punning view of a float32.
 * NOTE(review): bf16[1] is used as the high half of the float everywhere
 * below — this assumes a little-endian host; confirm for big-endian ports. */
union convert_type_float {
  float fval;
  uint16_t bf16[2];
  uint32_t ival;
};

typedef union convert_type_float convert_int_float;
/* Canonical quiet-NaN bit pattern in bf16. */
static const uint16_t NAN_VALUE = 0x7FC0;

//static int round_mode = 0;
|
||||
/* Return 1 iff x is NaN (NaN is the only float that compares unequal
 * to itself); avoids depending on math.h's isnan. */
static uint8_t cvk_float_isnan(const float x) {
  return (x == x) ? 0 : 1;
}
|
||||
|
||||
static inline int cvk_set_store_feround()
|
||||
{
|
||||
#ifndef __DISABLE_FENV__
|
||||
int round_mode = fegetround();
|
||||
fesetround(FE_TOWARDZERO);
|
||||
return round_mode;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void cvk_restore_feround(int round_mode)
|
||||
{
|
||||
#ifndef __DISABLE_FENV__
|
||||
fesetround(round_mode);
|
||||
#else
|
||||
(void)round_mode;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline uint8_t cvk_convert_bf16_u8_rnd(uint16_t data, int int8_rnd_md)
|
||||
{
|
||||
/* convert bf16 to float32*/
|
||||
float fp32;
|
||||
convert_int_float convert_val;
|
||||
fp32 = cvk_convert_bf16_fp32(data);
|
||||
/* convert float32 to uint8_t*/
|
||||
cvk_f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 0, int8_rnd_md);
|
||||
return (uint8_t) convert_val.ival;
|
||||
}
|
||||
|
||||
/* Convert bf16 to uint8 with the default rounding mode (nearest even). */
static inline uint8_t cvk_convert_bf16_u8(uint16_t data)
{
  return cvk_convert_bf16_u8_rnd(data, 0);
}
|
||||
|
||||
static inline int8_t cvk_convert_bf16_s8_rnd(uint16_t data, int int8_rnd_md)
|
||||
{
|
||||
/* convert bf16 to float32 */
|
||||
float fp32;
|
||||
convert_int_float convert_val;
|
||||
fp32 = cvk_convert_bf16_fp32(data);
|
||||
/* convert float32 to uint8_t*/
|
||||
cvk_f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 1, int8_rnd_md);
|
||||
return (int8_t) convert_val.ival;
|
||||
}
|
||||
|
||||
/* Convert bf16 to int8 with the default rounding mode (nearest even). */
static inline int8_t cvk_convert_bf16_s8(uint16_t data)
{
  return cvk_convert_bf16_s8_rnd(data, 0);
}
|
||||
|
||||
/* Convert an 8-bit integer to bf16; `sign` selects whether `data` is
 * reinterpreted as int8 (non-zero) or kept as uint8 (zero). */
static inline uint16_t cvk_convert_int8_bf16(uint8_t data, uint8_t sign)
{
  int32_t widened;
  if (sign)
    widened = (int8_t) data;
  else
    widened = data;
  /* need to round to bf16 mode */
  return cvk_convert_fp32_bf16((float) widened);
}
|
||||
|
||||
/*
 * Convert float32 to bf16 with round-to-nearest-even, mimicking the
 * hardware converter. NaN maps to the canonical NAN_VALUE; any result
 * whose exponent field is all ones (inf, or overflow from rounding) is
 * clamped to 0x7f7f (+max finite bf16).
 */
static inline uint16_t cvk_convert_fp32_bf16(float fp32)
{
  if (cvk_float_isnan(fp32))
    return NAN_VALUE;
  convert_int_float convert_val;
  convert_val.fval = fp32;
  uint32_t input = convert_val.ival;
  /* round to nearest even: bias = 0x7fff plus the LSB of the kept half */
  uint32_t lsb = (input >> 16) & 1;
  uint32_t rounding_bias = 0x7fff + lsb;
  input += rounding_bias;  /* uint32 wrap-around is well defined */
  convert_val.bf16[1] = (uint16_t) (input >> 16);

  /* HW behavior */
  /* NOTE(review): the check ignores the sign bit, so -inf also becomes
   * +0x7f7f (sign is dropped). Presumably intentional HW emulation —
   * confirm before relying on it. */
  if ((convert_val.bf16[1] & 0x7f80) == 0x7f80) {
    convert_val.bf16[1] = 0x7f7f;
  }
  return convert_val.bf16[1];
}
|
||||
|
||||
static inline uint8_t cvk_convert_fp32_u8(float fp32)
|
||||
{
|
||||
convert_int_float convert_val;
|
||||
cvk_f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 0, 0);
|
||||
return (uint8_t) convert_val.ival;
|
||||
}
|
||||
|
||||
static inline int8_t cvk_convert_fp32_s8(float fp32)
|
||||
{
|
||||
convert_int_float convert_val;
|
||||
cvk_f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 1, 0);
|
||||
return (int8_t) convert_val.ival;
|
||||
}
|
||||
|
||||
static inline uint32_t cvk_convert_fp32_u32(float fp32)
|
||||
{
|
||||
convert_int_float convert_val;
|
||||
cvk_f32_integer((void*)&fp32, &convert_val.ival, 0, 0, 0, 0);
|
||||
return (uint32_t) convert_val.ival;
|
||||
}
|
||||
|
||||
static inline int32_t cvk_convert_fp32_s32(float fp32)
|
||||
{
|
||||
convert_int_float convert_val;
|
||||
cvk_f32_integer((void*)&fp32, &convert_val.ival, 0, 0, 1, 0);
|
||||
return (int32_t) convert_val.ival;
|
||||
}
|
||||
|
||||
/* convert hex to float directly */
|
||||
static inline float cvk_convert_hex_fp32(uint32_t hval)
|
||||
{
|
||||
convert_int_float convert_val;
|
||||
convert_val.ival = hval;
|
||||
return convert_val.fval;
|
||||
}
|
||||
/* convert float to hex directly */
|
||||
static inline uint32_t cvk_convert_fp32_hex(float val)
|
||||
{
|
||||
convert_int_float convert_val;
|
||||
convert_val.fval = val;
|
||||
return convert_val.ival;
|
||||
}
|
||||
static inline float cvk_convert_bf16_fp32(uint16_t bf16)
|
||||
{
|
||||
convert_int_float convert_val;
|
||||
convert_val.bf16[1] = bf16;
|
||||
convert_val.bf16[0] = 0;
|
||||
return convert_val.fval;
|
||||
}
|
||||
|
||||
/*
 * Split a non-negative fp32 x into its integer part and fractional remainder
 * by decoding the IEEE-754 fields directly (no libm).
 *   x            - input value (callers pass |x|)
 *   integer_part - out: floor(x), saturated for large exponents
 *   sub_part     - out: x - floor(x) (0 when the value has no fraction bits)
 *   sign         - original sign of the value, used only for the saturation limit
 */
static inline void cvk_flt2int_flt(float x, unsigned long long* integer_part, float * sub_part, uint8_t sign)
{
  convert_int_float work_x;
  int level_code;           // unbiased exponent
  unsigned long tail_code;  // mantissa with the implicit leading 1 restored
  work_x.fval = x;
  level_code = ((work_x.ival >> 23) & 0xff) - 127;

  // if the exponent is negative, |x| < 1 and the integer part is zero
  if ( level_code < 0 ){
    *integer_part = 0;
    *sub_part = x;
  }
  else {
    tail_code = (work_x.ival) & 0x7fffff;
    tail_code = tail_code | 0x800000;  // restore the implicit mantissa bit

    if (level_code < 23){
      // Fraction bits remain: shift the mantissa down to get the integer part,
      // then clear those fraction bits in-place to recover the fraction by
      // subtraction (exact, since both share the same exponent).
      tail_code >>= (23 - level_code);
      *integer_part = tail_code;
      work_x.ival &= 0xffffffff << (23 - level_code);
      *sub_part = x - work_x.fval;
    }
    else {
      // Exponent >= 23: the value is already an integer.
      tail_code <<= (level_code - 23);
      *integer_part = tail_code;
      if(level_code>30){
        // Saturate beyond the 32-bit signed range.
        *integer_part = 0x7fffffff;
        // NOTE(review): 0x800000000 has nine hex digits (bit 35 set);
        // 0x80000000 was presumably intended.  The downstream clamp in
        // cvk_flt2int masks the difference — confirm before changing.
        if(sign)*integer_part = 0x800000000;
      }
      *sub_part = 0;
    }
  }
}
|
||||
|
||||
/*
 * Convert an fp32 value to a 32-bit signed integer.
 *   ifval       - input value
 *   int8_rnd_md - 0: round half to even; non-zero: truncate toward zero
 * Returns the rounded value, clamped to [-0x80000000, 0x7fffffff].
 */
inline static int cvk_flt2int(float ifval, int int8_rnd_md)
{
  // Type-pun the float to inspect its sign bit directly.
  // NOTE(review): `unsigned long` is 8 bytes on LP64, so the upper bytes of
  // intNum are indeterminate; the `& 0x80000000UL` mask only touches the
  // 4 written bytes on little-endian — confirm for other ABIs.
  union {
    float floatNum;
    unsigned long intNum;
  } tempIfval;
  tempIfval.floatNum = ifval;
  uint8_t isPositive = ((tempIfval.intNum & 0x80000000UL) == 0x80000000UL) ? 0 : 1;
  float abs_fval = (!isPositive) ? -ifval : ifval;
  float sub_part;
  unsigned long long integer_part;
  uint8_t sign = !isPositive;
  // Decompose |ifval| into integer and fractional parts.
  cvk_flt2int_flt(abs_fval, &integer_part, &sub_part, sign);
  if (!isPositive)
  {
    // Negative branch: round the magnitude, clamp at 2^31, then negate.
    unsigned long long result;
    if(int8_rnd_md == 0) { // round to nearest even
      if ( sub_part > 0.5f )
      {
        result = integer_part + 1;
      }
      else if (sub_part == 0.5f)
      {
        // Tie: round to the even neighbour.
        if ( integer_part & 0x1 )
        {
          result = integer_part + 1;
        }
        else
        {
          result = integer_part;
        }
      }
      else
      {
        result = integer_part;
      }
    } else { //round to zero
      result = integer_part;
    }
    if ( result > 0x80000000UL )
    {
      result = 0x80000000UL;
    }
    return -result;
  }
  else
  {
    // Positive branch: same rounding, clamped at 2^31 - 1.
    unsigned long long result;
    if(int8_rnd_md == 0) { // round to nearest even
      if ( sub_part > 0.5f )
      {
        result = integer_part + 1;
      }
      else if ( sub_part == 0.5f )
      {
        // Tie: round to the even neighbour.
        if ( integer_part & 0x1 )
        {
          result = integer_part + 1;
        }
        else
        {
          result = integer_part;
        }
      }
      else
      {
        result = integer_part;
      }
    } else {
      result = integer_part;
    }
    if ( result > 0x7fffffff )
    {
      result = 0x7fffffff;
    }
    return result;
  }
}
|
||||
|
||||
/*
 * Convert one fp32 value to an integer of the requested width and store it
 * through the type-punned output pointer.
 *   if32         - pointer to the fp32 input
 *   o_integer    - destination; interpreted as int32/int16/int8 per integer_size
 *   integer_size - 0: 32-bit, 1: 16-bit, otherwise: 8-bit (saturated)
 *   accumulate   - non-zero: add the destination's previous value back in
 *   int8_signed  - 8-bit path clamp range: [-128,127] when set, else [0,255]
 *   int8_rnd_md  - 0: round half to even; non-zero: truncate toward zero
 */
static inline void cvk_f32_integer(void *if32, void *o_integer, int integer_size, int accumulate, int int8_signed, int int8_rnd_md)
{
  int i_tmp;
  float *f_tmp;
  f_tmp = (float *)if32;
  i_tmp = cvk_flt2int(*f_tmp, int8_rnd_md);

  /* Snapshot the previous destination value at each width before it is
   * overwritten, so `accumulate` can add it back afterwards. */
  int *o32 = (int *)o_integer;
  int dst_o32 = *o32;
  short *o16 = (short *)o_integer;
  /* Bug fix: the old code read `*o32` here; the 16-bit snapshot must be read
   * through the 16-bit pointer (the old form only matched on little-endian). */
  short dst_o16 = *o16;
  char *o8 = (char *)o_integer;
  char dst_o8 = *o8;

  if (integer_size == 0) {
    *o32 = i_tmp;
  } else if (integer_size == 1) {
    *o16 = i_tmp;
  } else {
    /* 8-bit path saturates to the selected range. */
    *o8 = i_tmp;
    int min = (int8_signed) ? -128 : 0;
    int max = (int8_signed) ? 127 : 255;
    if (i_tmp < min) {
      *o8 = min;
    }
    else if (i_tmp > max) {
      *o8 = max;
    }
  }
  if (accumulate) {
    if (integer_size == 0) {
      *o32 += dst_o32;
    } else if (integer_size == 1) {
      *o16 += dst_o16;
    } else
      *o8 += dst_o8;
  }
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* CVK_FP_CONVERT_H */
|
||||
728
cvikernel/include/cvikernel/cvk_vlc_compress.h
Normal file
728
cvikernel/include/cvikernel/cvk_vlc_compress.h
Normal file
@ -0,0 +1,728 @@
|
||||
#ifndef __CVK_VLC_COMPRESS_H__
|
||||
#define __CVK_VLC_COMPRESS_H__
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
#define MAX_UNARY_FIELD_SIZE 47
|
||||
#define MAX_ORDER_K 5
|
||||
|
||||
/* Integer ceiling division for non-negative operands. */
static inline int divide_ceil(int numerator, int denominator)
{
  int biased = numerator + denominator - 1;
  return biased / denominator;
}
|
||||
|
||||
/**
 * Worst-case output buffer size for the VLC encoder.
 * \data_type 0 means 8bit, 1 means 16bit
 * Layout: 16-byte header + kmap (1 byte per block, padded to 16 bytes)
 *         + payload padded to whole 16/32-byte blocks.
 */
static inline size_t get_out_bs_buf_size(uint64_t in_size, uint8_t data_type) {
  /* Number of 16-symbol blocks: 16 input bytes each for int8, 32 for bf16. */
  size_t blk_num = data_type ? ((in_size + 31) >> 5) : ((in_size + 15) >> 4);
  size_t padded_payload = blk_num << (4 + data_type);
  /* kmap bytes, rounded up to a 16-byte boundary (int arithmetic as before). */
  size_t kmap_bytes = (size_t)((((int)blk_num + 16 - 1) / 16) << 4);
  return padded_payload + kmap_bytes + 16;
}
|
||||
|
||||
/* Per-stream compression parameters; serialized into the 16-byte header by
 * vlc_enc_header / parsed back by vlc_dec_header_ext. */
typedef struct
{
  uint8_t signedness;     // int8 payload is signed (two-side-shift remapping)
  uint8_t is_bfloat16;    // 0: int8 payload, 1: bf16 payload
  uint8_t bias0;          // remap bias: positive side (int8) / exponent center (bf16)
  uint8_t bias1;          // remap bias: negative side (int8 only)
  uint8_t zero_guard_en;  // bf16: reserve symbol 0 and skip zero fractions
} CommandInfo;
|
||||
/* Cursor over a byte buffer that is read/written one bit at a time
 * (see init_stream / write_stream / parse_stream / move_stream_ptr). */
typedef struct
{
  uint8_t *stream; // stream buffer pointer
  int bit_pos;     // current pointer (in bit)
  int buf_size;    // in byte
} StreamBuffer;
|
||||
|
||||
/* Forward declarations: symbol-remapping primitives (definitions below). */
static inline int8_t two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1);
static inline int8_t inv_two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1);
static inline uint8_t center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard);
static inline uint8_t inv_center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard);

/* Forward declaration: bit-stream cursor setup. */
static inline void init_stream(StreamBuffer *bs, const uint8_t *buf, int buf_size, uint8_t read_only);

/* Forward declarations: public codec entry points. */
static inline void cvk_vlc_est_weight_bias(const uint8_t *ibuf, size_t isz, uint8_t signedness, uint8_t isBfloat16, CommandInfo *cmd_info);
static inline void cvk_vlc_enc_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info);
static inline void cvk_vlc_dec_int8_ext(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *bs_size);
static inline void cvk_vlc_dec_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf);
static inline void cvk_vlc_enc_bf16(const uint16_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info);
static inline void cvk_vlc_dec_bf16_ext(const uint8_t *ibuf, size_t isz, uint16_t *obuf, size_t *bs_size);
static inline void cvk_vlc_dec_bf16(const uint8_t *ibuf, size_t isz, uint16_t *obuf);
|
||||
|
||||
/* Extract a single bit from a byte buffer (bit 0 = LSB of each byte). */
static inline uint8_t get_bit_val(uint8_t *buf, int byte_idx, int bit_idx)
{
  uint8_t byte = buf[byte_idx];
  return (byte >> bit_idx) & 0x1;
}
|
||||
|
||||
/* Zig-zag map an int8 bit pattern to unsigned: 0->0, 1->2, -1->1, 2->4, ...
 * so symbols near zero stay small for Golomb-Rice coding. */
static inline uint8_t sign_to_unsign(uint8_t val)
{
  int magnitude = abs((int8_t)val);
  uint8_t negative = (val >> 7) & 0x1;
  return (uint8_t)((magnitude << 1) - negative);
}
|
||||
|
||||
/* Inverse zig-zag map: 0->0, 1->-1, 2->1, 3->-2, ... (odd = negative). */
static inline int8_t unsign_to_sign(uint8_t val)
{
  int magnitude = (((int)val) + 1) >> 1;
  if (val & 0x1)
    magnitude = -magnitude;
  return (uint8_t)magnitude;
}
|
||||
|
||||
/* Split bf16 values (1s|8e|7f) into an exponent byte and a sign|fraction
 * byte per element, for independent compression of the two planes. */
static inline void dispatch_bf16_data(const uint16_t *bf16_in, uint8_t *exp, uint8_t *frac, size_t isz)
{
  for (size_t idx = 0; idx < isz; idx++)
  {
    uint16_t word = bf16_in[idx];
    exp[idx] = (uint8_t)((word >> 7) & 0xFF);
    /* sign bit goes to bit 7, the 7 fraction bits to bits 6..0 */
    frac[idx] = (uint8_t)(((word >> 15) << 7) | (word & 0x7F));
  }
}
|
||||
|
||||
/* Reassemble bf16 values from the split exponent and sign|fraction planes
 * (inverse of dispatch_bf16_data).
 * Fix: removed the old `memset(bf16_out, 0, sizeof(uint16_t))`, which cleared
 * only the first element and was redundant — every element in [0, isz) is
 * fully assigned by the loop below. */
static inline void merge_bf16_data(const uint8_t *exp_in, const uint8_t *frac_in, uint16_t *bf16_out, size_t isz)
{
  for (size_t i = 0; i < isz; i++)
  {
    /* sign from frac bit 7, exponent into bits 14..7, fraction into 6..0 */
    bf16_out[i] = ((frac_in[i] >> 7) << 15) | (exp_in[i] << 7) | (frac_in[i] & 0x7F);
  }
}
|
||||
|
||||
// -- streaming operation handler --
|
||||
static inline void init_stream(StreamBuffer *bs, const uint8_t *buf, int buf_size, uint8_t read_only)
|
||||
{
|
||||
bs->bit_pos = 0;
|
||||
bs->stream = (uint8_t *)buf;
|
||||
bs->buf_size = buf_size;
|
||||
if (!read_only)
|
||||
memset((uint8_t *)buf, 0, sizeof(uint8_t) * buf_size);
|
||||
}
|
||||
|
||||
/* Append the low bit_len bits of src (LSB-first within each byte) at the
 * stream cursor, then advance the cursor.  The destination must be
 * pre-zeroed (init_stream with read_only == false). */
static inline void write_stream(StreamBuffer *bs, uint8_t *src, int bit_len)
{
  for (int n = 0; n < bit_len; n++)
  {
    uint8_t bit = get_bit_val(src, n / 8, n % 8);
    int out_pos = bs->bit_pos + n;
    bs->stream[out_pos / 8] |= (bit << (out_pos % 8));
  }
  bs->bit_pos += bit_len;
}
|
||||
|
||||
/* Advance the stream cursor by bit_len bits without reading or writing. */
static inline void move_stream_ptr(StreamBuffer *bs, int bit_len)
{
  bs->bit_pos = bs->bit_pos + bit_len;
}
|
||||
|
||||
/* Read bit_len bits from the stream cursor into dest (LSB-first within each
 * byte), then advance the cursor.  dest is zeroed first so bits can be
 * OR-ed in. */
static inline void parse_stream(StreamBuffer *bs, uint8_t *dest, int bit_len)
{
  /* Fix: the old expression `sizeof(uint8_t) * (bit_len + 7) >> 3` relied on
   * `*` binding tighter than `>>` and on sizeof(uint8_t) == 1; make the
   * intended byte-count rounding explicit. */
  memset(dest, 0, sizeof(uint8_t) * ((bit_len + 7) >> 3));
  for (int bit = 0; bit < bit_len; bit++)
  {
    int dest_byte_i = bit / 8;
    int dest_bit_i = bit % 8;
    int bs_byte_i = (bs->bit_pos + bit) / 8;
    int bs_bit_i = (bs->bit_pos + bit) % 8;
    dest[dest_byte_i] |= (get_bit_val(bs->stream, bs_byte_i, bs_bit_i) << dest_bit_i);
  }
  bs->bit_pos += bit_len;
}
|
||||
|
||||
// -- header read/write operation handler --
/* Serialize cmd_info plus the compressed payload size into the stream header.
 * The write order below defines the on-wire bit layout; do not reorder. */
static inline void vlc_enc_header(StreamBuffer *bs_header, CommandInfo *cmd_info, size_t blk_bs_size)
{
  write_stream(bs_header, (uint8_t *)&blk_bs_size, 24);           // bit[23:0] compressed block stream size
  move_stream_ptr(bs_header, 4);                                  // bit[27:24] reserved
  write_stream(bs_header, (uint8_t *)&cmd_info->signedness, 1);   // bit[28] signedness
  write_stream(bs_header, (uint8_t *)&cmd_info->is_bfloat16, 1);  // bit[29] data type
  move_stream_ptr(bs_header, 2);                                  // bit[31:30] bit depth
  write_stream(bs_header, (uint8_t *)&cmd_info->bias0, 8);        // bit[39:32] bias0 for symbol remapping
  write_stream(bs_header, (uint8_t *)&cmd_info->bias1, 7);        // bit[46:40] bias1 for symbol remapping
  write_stream(bs_header, (uint8_t *)&cmd_info->zero_guard_en, 1); // bit[47] zero guard
}
|
||||
|
||||
/* Parse the stream header written by vlc_enc_header; field order must match
 * the encoder exactly.  blk_bs_size receives the compressed payload size. */
static inline void vlc_dec_header_ext(StreamBuffer *bs_header, CommandInfo *cmd_info, size_t *blk_bs_size)
{
  parse_stream(bs_header, (uint8_t *)blk_bs_size, 24);            // bit[23:0] compressed block stream size
  move_stream_ptr(bs_header, 4);                                  // bit[27:24] reserved
  parse_stream(bs_header, (uint8_t *)&cmd_info->signedness, 1);   // bit[28] signedness
  parse_stream(bs_header, (uint8_t *)&cmd_info->is_bfloat16, 1);  // bit[29] data type
  move_stream_ptr(bs_header, 2);                                  // bit[31:30] bit depth (skipped)
  parse_stream(bs_header, (uint8_t *)&cmd_info->bias0, 8);        // bit[39:32] bias0 for symbol remapping
  parse_stream(bs_header, (uint8_t *)&cmd_info->bias1, 7);        // bit[46:40] bias1 for symbol remapping
  parse_stream(bs_header, (uint8_t *)&cmd_info->zero_guard_en, 1); // bit[47] zero guard
}
|
||||
|
||||
/* Parse the stream header, discarding the payload-size field. */
static inline void vlc_dec_header(StreamBuffer *bs_header, CommandInfo *cmd_info)
{
  size_t ignored_size;
  vlc_dec_header_ext(bs_header, cmd_info, &ignored_size);
}
|
||||
|
||||
// -- symbol remapping handler --
/* Remap a bf16 exponent symbol so values close to `bias` become small
 * unsigned codes (zig-zag around the bias).  Symbols outside the symmetric
 * window around the bias pass through (or fold, for bias > 128).
 * When zero_guard is set, 0 is a reserved symbol and maps to itself,
 * and remapped codes are shifted up by one to make room for it. */
static inline uint8_t center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard)
{
  if (val == 0 && zero_guard)
    return 0;

  int16_t shift_data_i = val - bias;
  // Half-width of the symmetric window around the bias.
  uint8_t range = (bias <= 128) ? bias : 255 - bias;
  if (bias <= 128)
  {
    // In-window symbols are zig-zagged; the rest keep their value.
    return (val >= (range << 1)) ? val : sign_to_unsign(shift_data_i) + zero_guard;
  }
  else
  {
    // Bias in the upper half: symbols below the window are folded.
    return (val < (bias - range)) ? (range + bias - val + zero_guard) : (sign_to_unsign(shift_data_i) + zero_guard);
  }
}
|
||||
|
||||
/* Inverse of center_shift: recover the original bf16 exponent symbol from
 * its remapped code, given the same bias and zero_guard settings. */
static inline uint8_t inv_center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard)
{
  // Reserved zero symbol maps back to zero.
  if (val == 0 && zero_guard)
    return 0;

  // Undo the +zero_guard offset applied by the forward mapping.
  uint8_t unsign_data_i = val - zero_guard;
  uint8_t range = (bias <= 128) ? bias : 255 - bias;
  if (bias <= 128)
  {
    // In-window codes are un-zig-zagged back around the bias.
    return (val >= (range << 1)) ? val : unsign_to_sign(unsign_data_i) + bias;
  }
  else
  {
    // Bias in the upper half: undo the fold for out-of-window codes.
    return (unsign_data_i > (range << 1)) ? (range + bias - val + zero_guard) : unsign_to_sign(unsign_data_i) + bias;
  }
}
|
||||
|
||||
/* Shift a signed int8 symbol circularly toward zero: the positive side moves
 * down by bias0, the negative side by bias1, each wrapping within its own
 * range (1..127 positive, -1..-128 negative).  Zero is a fixed point. */
static inline int8_t two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1)
{
  if (val == 0)
    return 0;

  uint8_t negative = (val < 0) ? true : false;
  int32_t magnitude = abs(val);
  magnitude -= negative ? bias1 : bias0;
  /* Wrap when the shift crosses zero; the negative side has one extra slot. */
  if (magnitude <= 0)
    magnitude += 127 + negative;
  return negative ? -magnitude : magnitude;
}
|
||||
|
||||
/* Inverse of two_side_circular_shift: move each side back up by its bias,
 * wrapping within the same per-side ranges.  Zero is a fixed point. */
static inline int8_t inv_two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1)
{
  if (val == 0)
    return 0;

  uint8_t negative = (val < 0) ? true : false;
  uint32_t magnitude = (uint32_t)abs(val);
  magnitude += negative ? bias1 : bias0;
  /* Candidate after wrapping past the top of the side's range. */
  int32_t wrapped = magnitude - (127 + negative);
  uint8_t low_byte;
  if (wrapped <= 0)
    low_byte = (uint8_t)magnitude;
  else
    low_byte = (uint8_t)wrapped;
  return negative ? -low_byte : low_byte;
}
|
||||
|
||||
/* Remap one 16-symbol block so frequent symbols get small unsigned codes.
 * bf16 exponents use a center shift around bias0; signed int8 uses the
 * two-side circular shift followed by zig-zag; unsigned int8 passes through. */
static inline void symbol_remapping(uint8_t *blk_in, uint8_t *blk_out, uint8_t bias0, uint8_t bias1, uint8_t signedness, uint8_t is_bf16_exp, uint8_t zero_guard)
{
  /* Unsigned int8 needs no remapping: copy the block through. */
  if (!is_bf16_exp && !signedness)
  {
    memcpy(blk_out, blk_in, sizeof(uint8_t) * 16);
    return;
  }

  if (is_bf16_exp)
  {
    /* bf16 exponent plane: center circular shift around bias0. */
    for (int idx = 0; idx < 16; idx++)
      blk_out[idx] = center_shift(blk_in[idx], bias0, zero_guard);
  }
  else
  {
    /* Signed int8: two-side circular shift, then zig-zag to unsigned. */
    for (int idx = 0; idx < 16; idx++)
      blk_out[idx] = sign_to_unsign(two_side_circular_shift((int8_t)blk_in[idx], bias0, bias1));
  }
}
|
||||
|
||||
/* Undo symbol_remapping for one 16-symbol block: inverse center shift for
 * bf16 exponents, un-zig-zag plus inverse two-side shift for signed int8,
 * straight copy for unsigned int8. */
static inline void inv_symbol_remapping(uint8_t *blk_in, uint8_t *blk_out, uint8_t bias0, uint8_t bias1, uint8_t signedness, uint8_t is_bf16_exp, uint8_t zero_guard)
{
  /* Unsigned int8 was never remapped: copy the block through. */
  if (!is_bf16_exp && !signedness)
  {
    memcpy(blk_out, blk_in, sizeof(uint8_t) * 16);
    return;
  }

  if (is_bf16_exp)
  {
    /* bf16 exponent plane: inverse center circular shift. */
    for (int idx = 0; idx < 16; idx++)
      blk_out[idx] = inv_center_shift(blk_in[idx], bias0, zero_guard);
  }
  else
  {
    /* Signed int8: un-zig-zag, then inverse two-side circular shift. */
    for (int idx = 0; idx < 16; idx++)
    {
      int8_t signed_sym = unsign_to_sign(blk_in[idx]);
      blk_out[idx] = (uint8_t)inv_two_side_circular_shift(signed_sym, bias0, bias1);
    }
  }
}
|
||||
|
||||
static inline int vlc_estimate_block_order(uint8_t *blk_in, uint8_t bf16_zvc_en)
|
||||
{
|
||||
int best_k = 0;
|
||||
int best_bs_size = 0x7FFFFFFF;
|
||||
|
||||
for (int k = 0; k <= (int)MAX_ORDER_K; k++)
|
||||
{
|
||||
uint8_t remain_field_size = k << 4;
|
||||
int unary_field_len = 0;
|
||||
for (int i = 0; i < 16; i++)
|
||||
{
|
||||
uint8_t group_idx = blk_in[i] >> k;
|
||||
unary_field_len += (group_idx + 1);
|
||||
}
|
||||
int znum_bit = (bf16_zvc_en && k > 0) ? 4 : 0;
|
||||
int blk_size = (unary_field_len <= MAX_UNARY_FIELD_SIZE)
|
||||
? remain_field_size + unary_field_len + znum_bit
|
||||
: 255;
|
||||
if (blk_size < best_bs_size)
|
||||
{
|
||||
best_k = k;
|
||||
best_bs_size = blk_size;
|
||||
}
|
||||
}
|
||||
|
||||
best_k = (best_bs_size > 128) ? -1 : best_k;
|
||||
return best_k;
|
||||
}
|
||||
// -- vlc block parallel GR encode/decode --
/* Golomb-Rice encode one 16-symbol block into the bitstream.
 *   blk_in      - 16 remapped symbols
 *   bs          - output bitstream
 *   order_k     - GR order from vlc_estimate_block_order; -1 = store raw
 *   bf16_zvc_en - emit a 4-bit zero count for bf16 zero-value compression
 * Returns the 5-bit unary-length code stored in the kmap (128 for raw mode,
 * 0 on the should-not-happen overflow paths). */
static inline uint8_t vlc_gr_enc_block_data(uint8_t *blk_in, StreamBuffer *bs, int order_k, uint8_t bf16_zvc_en)
{
  // uncompressed mode: copy the 128-bit block verbatim
  if (order_k == -1)
  {
    write_stream(bs, blk_in, 128);
    return 128;
  }

  // remain field
  uint8_t remain_field[16] = {0};
  uint8_t unary_field[8] = {0};
  uint8_t sym_end_pos[16] = {0};
  uint8_t unary_field_len = 0;
  int sym_end_pos_accum = -1;

  // bit plane encode for remain field: bit k of symbols 0..7 forms one byte,
  // bit k of symbols 8..15 the next, for each of the order_k low bit planes
  for (int k = 0; k < order_k; k++)
  {
    uint8_t bit_plane0 = 0, bit_plane1 = 0;
    for (int i = 0; i < 8; i++)
    {
      bit_plane0 |= (get_bit_val(blk_in, i, k) << i);
      bit_plane1 |= (get_bit_val(blk_in, i + 8, k) << i);
    }
    remain_field[k << 1] = bit_plane0;
    remain_field[(k << 1) + 1] = bit_plane1;
  }
  write_stream(bs, remain_field, order_k << 4);

  // optional 4-bit zero count (bf16 zero-value compression)
  if (bf16_zvc_en && order_k > 0)
  {
    int zero_num = 0;
    for (int i = 0; i < 16; i++)
    {
      if (blk_in[i] == 0)
        zero_num++;
    }
    // assert(zero_num < 16);
    if (zero_num >= 16)
      return 0;

    write_stream(bs, (uint8_t *)&zero_num, 4);
  }

  // unary encode: set a terminator bit at each symbol's cumulative end position
  for (int i = 0; i < 16; i++)
  {
    int group_idx = blk_in[i] >> order_k;
    sym_end_pos_accum += (group_idx + 1);
    sym_end_pos[i] = sym_end_pos_accum;
    int byte_idx = sym_end_pos[i] / 8;
    int bit_idx = sym_end_pos[i] % 8;
    unary_field[byte_idx] |= (1 << (bit_idx));
  }
  unary_field_len = sym_end_pos[15] + 1;

  //assert(unary_field_len <= MAX_UNARY_FIELD_SIZE);
  if (unary_field_len > MAX_UNARY_FIELD_SIZE)
    return 0;

  // kmap stores (length - 16) in 5 bits; minimum unary length is 16
  uint8_t ulen = (unary_field_len - 16) & 0x1F;
  write_stream(bs, unary_field, unary_field_len);

  return ulen;
}
|
||||
|
||||
/* Golomb-Rice decode one 16-symbol block from the bitstream (inverse of
 * vlc_gr_enc_block_data).
 *   bs          - input bitstream
 *   bs_size     - total encoded bits for this block (from the kmap entry)
 *   rec         - out: 16 reconstructed (still remapped) symbols
 *   order_k     - GR order; -1 = block stored raw
 *   bf16_zvc_en - a 4-bit zero count precedes the unary field when k > 0 */
static inline void vlc_gr_dec_block_data(StreamBuffer *bs, uint8_t bs_size, uint8_t *rec, int order_k, uint8_t bf16_zvc_en)
{
  // assert(bs_size <= 128);
  if (bs_size > 128)
    return;

  // uncompressed mode: copy the 128-bit block verbatim
  if (order_k == -1)
  {
    parse_stream(bs, rec, 128);
    return;
  }

  // remain field
  uint8_t remain_data[16] = {0};
  uint8_t remain_bs[16] = {0};
  uint8_t unary_field[8] = {0};
  uint8_t sym_end_pos[16] = {0};
  uint8_t unary_sym[16] = {0};
  uint8_t remain_field_size = order_k << 4;

  parse_stream(bs, remain_bs, remain_field_size);
  // undo the bit-plane packing: plane k bytes hold bit k of symbols 0..7 / 8..15
  for (int k = 0; k < order_k; k++)
  {
    for (int i = 0; i < 8; i++)
    {
      remain_data[i] |= (get_bit_val(remain_bs, k << 1, i) << k);
      remain_data[i + 8] |= (get_bit_val(remain_bs, (k << 1) + 1, i) << k);
    }
  }

  // zero number info (parsed only to keep the cursor aligned)
  int znum_bit = (bf16_zvc_en && order_k > 0) ? 4 : 0;
  uint8_t znum = 0;
  parse_stream(bs, &znum, znum_bit);

  // unary field: each set bit terminates one symbol
  uint8_t unary_field_len = bs_size - remain_field_size - znum_bit;
  parse_stream(bs, unary_field, unary_field_len);

  int sym_cnt = 0;
  for (uint8_t ubit_i = 0; ubit_i < unary_field_len; ubit_i++)
  {
    int byte_idx = ubit_i / 8;
    int bit_idx = ubit_i % 8;
    if (get_bit_val(unary_field, byte_idx, bit_idx) == 1)
    {
      sym_end_pos[sym_cnt] = ubit_i;
      sym_cnt++;
    }
  }
  // gaps between terminators give each symbol's unary quotient
  unary_sym[0] = sym_end_pos[0];
  for (int i = 1; i < 16; i++)
  {
    unary_sym[i] = sym_end_pos[i] - sym_end_pos[i - 1] - 1;
  }
  // symbol = (quotient << k) + remainder
  for (int i = 0; i < 16; i++)
  {
    rec[i] = (unary_sym[i] << order_k) + remain_data[i];
  }
}
|
||||
|
||||
// -- vlc encode int8 entry function --
|
||||
static inline void cvk_vlc_enc_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info)
|
||||
{
|
||||
StreamBuffer bs_header, bs_kmap, bs_data;
|
||||
size_t blk_num = (isz + 15) >> 4;
|
||||
size_t header_size = 16;
|
||||
size_t kmap_size = divide_ceil(blk_num, 16) << 4;
|
||||
size_t bs_buf_size = header_size + kmap_size + (blk_num << 4);
|
||||
uint8_t *bsbuf = (uint8_t *)calloc(bs_buf_size, sizeof(uint8_t));
|
||||
|
||||
// block encode
|
||||
init_stream(&bs_kmap, bsbuf + header_size, kmap_size, false);
|
||||
init_stream(&bs_data, bsbuf + header_size + kmap_size, blk_num << 4, false);
|
||||
|
||||
for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++)
|
||||
{
|
||||
uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0};
|
||||
size_t in_size = (blk_idx == (blk_num - 1)) ? isz - (blk_idx << 4) : 16;
|
||||
memcpy(blk_data, &ibuf[blk_idx << 4], sizeof(uint8_t) * in_size);
|
||||
|
||||
symbol_remapping(blk_data, blk_sr_data, cmd_info->bias0, cmd_info->bias1, cmd_info->signedness, false, false);
|
||||
|
||||
int k = vlc_estimate_block_order(blk_sr_data, false);
|
||||
uint8_t ulen = vlc_gr_enc_block_data(blk_sr_data, &bs_data, k, false);
|
||||
uint8_t k_info = (k == -1) ? 0xE0 : (k << 5) + ulen;
|
||||
write_stream(&bs_kmap, &k_info, 8);
|
||||
}
|
||||
|
||||
int blk_bs_size = divide_ceil(((bs_data.bit_pos + 7) >> 3), 16) << 4; // 16 byte align
|
||||
*osz = header_size + kmap_size + blk_bs_size;
|
||||
|
||||
// write header
|
||||
init_stream(&bs_header, bsbuf, header_size, false);
|
||||
vlc_enc_header(&bs_header, cmd_info, blk_bs_size);
|
||||
|
||||
memcpy(obuf, bsbuf, (*osz) * sizeof(uint8_t));
|
||||
free(bsbuf);
|
||||
}
|
||||
|
||||
// -- vlc decode int8 entry function --
/* Decompress a stream produced by cvk_vlc_enc_int8 into obuf (isz output
 * bytes).  *bs_size receives the payload size parsed from the header; the
 * function returns early (leaving obuf unwritten) when the header fails its
 * sanity checks. */
static inline void cvk_vlc_dec_int8_ext(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *bs_size)
{
  StreamBuffer bs_header, bs_kmap, bs_data;
  CommandInfo cmd_info;
  memset(&cmd_info, 0, sizeof(CommandInfo));

  size_t blk_num = (isz + 15) >> 4;
  int header_size = 16;
  int kmap_size = divide_ceil(blk_num, 16) << 4;
  *bs_size = 0;

  // parse header
  init_stream(&bs_header, ibuf, header_size, true);
  vlc_dec_header_ext(&bs_header, &cmd_info, bs_size);

  // Check whether valid header
  size_t bs_buf_size = get_out_bs_buf_size(isz, 0); // int8

  //ASSERT(*bs_size <= bs_buf_size);
  //ASSERT(cmd_info.is_bfloat16 == 0);
  if (*bs_size > bs_buf_size || cmd_info.is_bfloat16)
    return;

  // block decode
  init_stream(&bs_kmap, ibuf + header_size, kmap_size, true);
  init_stream(&bs_data, ibuf + header_size + kmap_size, blk_num << 4, true);

  for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++)
  {
    uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0};
    uint8_t k_info = 0;
    // kmap entry: order in bits 7..5 (7 = raw), unary length code in 4..0
    parse_stream(&bs_kmap, &k_info, 8);
    uint8_t ulen = k_info & 0x1F;
    int k = (k_info >> 5 == 7) ? -1 : k_info >> 5;
    // encoded bits: remain field (k*16) + unary field (ulen + 16); raw = 128
    int blk_bs_size = (k == -1) ? 128 : (k << 4) + ulen + 16;
    vlc_gr_dec_block_data(&bs_data, blk_bs_size, blk_data, k, false);

    inv_symbol_remapping(blk_data, blk_sr_data, cmd_info.bias0, cmd_info.bias1, cmd_info.signedness, false, false);

    // the final block may be partial
    int out_size = (blk_idx == (blk_num - 1)) ? isz - (blk_idx << 4) : 16;
    memcpy(&obuf[blk_idx << 4], blk_sr_data, sizeof(uint8_t) * out_size);
  }
}
|
||||
|
||||
/* Decompress an int8 stream, discarding the parsed payload size. */
static inline void cvk_vlc_dec_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf)
{
  size_t ignored_bs_size;
  cvk_vlc_dec_int8_ext(ibuf, isz, obuf, &ignored_bs_size);
}
|
||||
|
||||
// -- vlc encode bfloat16 entry function --
|
||||
static inline void cvk_vlc_enc_bf16(const uint16_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info)
|
||||
{
|
||||
StreamBuffer bs_header, bs_kmap, bs_data;
|
||||
size_t blk_num = (isz + 31) >> 5; // 32 bytes per blok
|
||||
size_t header_size = 16;
|
||||
size_t kmap_size = divide_ceil(blk_num, 16) << 4;
|
||||
size_t bs_buf_size = header_size + kmap_size + (blk_num << 5);
|
||||
uint8_t *bsbuf = (uint8_t *)calloc(bs_buf_size, sizeof(uint8_t));
|
||||
|
||||
// block encode
|
||||
init_stream(&bs_kmap, bsbuf + header_size, kmap_size, false);
|
||||
init_stream(&bs_data, bsbuf + header_size + kmap_size, blk_num << 5, false);
|
||||
|
||||
for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++)
|
||||
{
|
||||
uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0}, blk_data_frac[16] = {0};
|
||||
size_t in_num = (blk_idx == (blk_num - 1)) ? ((isz >> 1) - (blk_idx << 4)) : 16;
|
||||
dispatch_bf16_data(&ibuf[blk_idx << 4], blk_data, blk_data_frac, in_num);
|
||||
|
||||
// exp: BGR encode
|
||||
symbol_remapping(blk_data, blk_sr_data, cmd_info->bias0, cmd_info->bias1, false, true, cmd_info->zero_guard_en);
|
||||
|
||||
int k = vlc_estimate_block_order(blk_sr_data, cmd_info->zero_guard_en);
|
||||
uint8_t ulen = vlc_gr_enc_block_data(blk_sr_data, &bs_data, k, cmd_info->zero_guard_en);
|
||||
uint8_t k_info = (k == -1) ? 0xE0 : (k << 5) + ulen;
|
||||
write_stream(&bs_kmap, &k_info, 8);
|
||||
|
||||
// frac: implicit zero compression
|
||||
for (size_t i = 0; i < 16; i++)
|
||||
{
|
||||
if (!cmd_info->zero_guard_en || blk_data[i] != 0)
|
||||
{
|
||||
write_stream(&bs_data, &blk_data_frac[i], 8);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int blk_bs_size = divide_ceil(((bs_data.bit_pos + 7) >> 3), 16) << 4; // 16 byte align
|
||||
*osz = header_size + kmap_size + blk_bs_size;
|
||||
|
||||
// write header
|
||||
init_stream(&bs_header, bsbuf, header_size, false);
|
||||
vlc_enc_header(&bs_header, cmd_info, blk_bs_size);
|
||||
|
||||
memcpy(obuf, bsbuf, (*osz) * sizeof(uint8_t));
|
||||
free(bsbuf);
|
||||
}
|
||||
|
||||
// -- vlc decode bfloat16 entry function --
/* Decompress a stream produced by cvk_vlc_enc_bf16 into obuf (isz/2 bf16
 * values).  *bs_size receives the payload size parsed from the header; the
 * function returns early (leaving obuf unwritten) when the header fails its
 * sanity checks. */
static inline void cvk_vlc_dec_bf16_ext(const uint8_t *ibuf, size_t isz, uint16_t *obuf, size_t *bs_size)
{
  StreamBuffer bs_header, bs_kmap, bs_data;
  CommandInfo cmd_info;
  memset(&cmd_info, 0, sizeof(CommandInfo));

  size_t blk_num = (isz + 31) >> 5; // 32 bytes per block
  int header_size = 16;
  int kmap_size = divide_ceil(blk_num, 16) << 4;
  *bs_size = 0;

  // parse header
  init_stream(&bs_header, ibuf, header_size, true);
  vlc_dec_header_ext(&bs_header, &cmd_info, bs_size);

  // Check whether valid header
  size_t bs_buf_size = get_out_bs_buf_size(isz, 1); // bf16

  //ASSERT(*bs_size <= bs_buf_size);
  //ASSERT(cmd_info.is_bfloat16 == 1);
  if (*bs_size > bs_buf_size || cmd_info.is_bfloat16 != 1)
    return;

  // block decode
  init_stream(&bs_kmap, ibuf + header_size, kmap_size, true);
  init_stream(&bs_data, ibuf + header_size + kmap_size, blk_num << 5, true);

  for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++)
  {
    uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0}, blk_data_frac[16] = {0};
    uint8_t k_info = 0;
    // kmap entry: order in bits 7..5 (7 = raw), unary length code in 4..0
    parse_stream(&bs_kmap, &k_info, 8);
    uint8_t ulen = k_info & 0x1F;
    int k = (k_info >> 5 == 7) ? -1 : k_info >> 5;
    // a 4-bit zero count was encoded when zero_guard_en and k > 0
    int znum_bit = (cmd_info.zero_guard_en && k > 0) ? 4 : 0;
    uint8_t blk_bs_size = (k == -1) ? 128 : (k << 4) + ulen + 16 + znum_bit;

    // exp: BGR decode
    vlc_gr_dec_block_data(&bs_data, blk_bs_size, blk_data, k, cmd_info.zero_guard_en);

    inv_symbol_remapping(blk_data, blk_sr_data, cmd_info.bias0, cmd_info.bias1, false, true, cmd_info.zero_guard_en);

    // the final block may hold fewer than 16 values
    size_t out_num = (blk_idx == (blk_num - 1)) ? ((isz >> 1) - (blk_idx << 4)) : 16;

    // frac: implicit zero compression — zero exponents carry no fraction byte
    for (size_t i = 0; i < out_num; i++)
    {
      if (!cmd_info.zero_guard_en || blk_sr_data[i] != 0)
      {
        parse_stream(&bs_data, &blk_data_frac[i], 8);
      }
    }
    merge_bf16_data(blk_sr_data, blk_data_frac, &obuf[blk_idx << 4], out_num);
  }
}
|
||||
|
||||
/* Decompress a bf16 stream, discarding the parsed payload size. */
static inline void cvk_vlc_dec_bf16(const uint8_t *ibuf, size_t isz, uint16_t *obuf)
{
  size_t ignored_bs_size;
  cvk_vlc_dec_bf16_ext(ibuf, isz, obuf, &ignored_bs_size);
}
|
||||
|
||||
// -- offline estimate model weight params --
|
||||
static inline void cvk_vlc_est_weight_bias(const uint8_t *ibuf, size_t isz, uint8_t signedness, uint8_t isBfloat16, CommandInfo *cmd_info)
|
||||
{
|
||||
//assert(!(isBfloat16 && signedness)); // WARNING: signedness MUST be 0 as isBfloat16==True
|
||||
|
||||
cmd_info->is_bfloat16 = isBfloat16;
|
||||
if (isBfloat16 == false && signedness == true)
|
||||
{
|
||||
// two-side circular shift
|
||||
int hist[256] = {0};
|
||||
for (size_t i = 0; i < isz; i++)
|
||||
{
|
||||
hist[ibuf[i]]++;
|
||||
}
|
||||
|
||||
int8_t pos_v = 1;
|
||||
//while (pos_v < 128)
|
||||
// comparison is always true due to limited range of data type [-Werror=type-limits]
|
||||
while (true)
|
||||
{
|
||||
if (hist[((uint8_t)pos_v)] == 0)
|
||||
{
|
||||
pos_v++;
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
//cmd_info->bias0 = (pos_v > 1 && pos_v < 128) ? (pos_v - 1) : 0;
|
||||
// comparison is always true due to limited range of data type [-Werror=type-limits]
|
||||
cmd_info->bias0 = (pos_v > 1) ? (pos_v - 1) : 0;
|
||||
int8_t neg_v = -1;
|
||||
//while (neg_v >= (-128)) // comparison is always true due to limited range of data type [-Werror=type-limits]
|
||||
while (true)
|
||||
{
|
||||
if (hist[(uint8_t)neg_v] == 0)
|
||||
{
|
||||
neg_v--;
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
//cmd_info->bias1 = (neg_v < -1 && neg_v >= -128) ? abs(neg_v + 1) : 0;
|
||||
// comparison is always true due to limited range of data type [-Werror=type-limits]
|
||||
cmd_info->bias1 = (neg_v < -1) ? abs(neg_v + 1) : 0;
|
||||
cmd_info->signedness = true;
|
||||
}
|
||||
|
||||
if (isBfloat16 == true)
|
||||
{
|
||||
// center shift
|
||||
int64_t exp_accum = 0;
|
||||
uint16_t *bf16_in = (uint16_t *)ibuf;
|
||||
size_t inum = (isz >> 1), cnt = 0;
|
||||
for (size_t i = 0; i < inum; i++)
|
||||
{
|
||||
uint8_t exp = ((bf16_in[i] >> 7) & 0xFF);
|
||||
if (exp != 0)
|
||||
{
|
||||
exp_accum += exp;
|
||||
cnt++;
|
||||
}
|
||||
}
|
||||
if (cnt > 0)
|
||||
{
|
||||
cmd_info->bias0 = (uint8_t)((exp_accum / (float)cnt) + 0.5);
|
||||
}
|
||||
cmd_info->zero_guard_en = (inum == cnt) ? false : true;
|
||||
cmd_info->signedness = false;
|
||||
}
|
||||
}
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __CVK_VLC_COMPRESS_H__ */
|
||||
423
cvikernel/src/bm1822/bm_dmabuf.c
Normal file
423
cvikernel/src/bm1822/bm_dmabuf.c
Normal file
@ -0,0 +1,423 @@
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <inttypes.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "kernel_1822.h"
|
||||
#include <bmkernel/bm1822/bmkernel_1822.h>
|
||||
#include <bmkernel/bm1822/bm1822_tiu_reg.h>
|
||||
#include <bmkernel/bm1822/bm1822_tdma_reg.h>
|
||||
#include <bmkernel/reg_tiu.h>
|
||||
#include <bmkernel/reg_tdma.h>
|
||||
#include <bmkernel/reg_bdcast.h>
|
||||
#include <bmkernel/bm_regcpu.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask))
|
||||
#define ALIGN(x,a) __ALIGN_MASK(x,(__typeof__(x))(a)-1)
|
||||
|
||||
#define BD_DESC_ALIGN_SIZE (1 << BDC_ENGINE_CMD_ALIGNED_BIT)
|
||||
#define GDMA_DESC_ALIGN_SIZE (1 << TDMA_DESCRIPTOR_ALIGNED_BIT)
|
||||
#define BD_EOD_PADDING_BYTES (128)
|
||||
#define TPU_DMABUF_HEADER_M 0xB5B5
|
||||
|
||||
/* Raw command descriptor: fixed header followed by a variable-length payload
 * (GNU zero-length-array idiom; length comes from cmd_hdr_len(&hdr)). */
typedef struct {
  cmd_hdr_t hdr;
  uint32_t body[0];  // command payload words
} DESC;
|
||||
|
||||
// CPU_OP_SYNC structure
/* CPU-engine descriptor embedded in the dmabuf after the dma header.
 * NOTE(review): field semantics inferred from names — offsets presumably
 * locate the TIU/TDMA command regions this sync point covers; confirm
 * against the dmabuf layout code. */
typedef struct {
  uint32_t op_type;             // CPU operation selector
  uint32_t num_tiu;             // number of TIU commands in this segment
  uint32_t num_tdma;            // number of TDMA commands in this segment
  uint32_t offset_tiu;          // offset of the TIU command region
  uint32_t offset_tdma;         // offset of the TDMA command region
  uint32_t offset_tiu_ori_bk;   // backup of the original TIU offset
  uint32_t offset_tdma_ori_bk;  // backup of the original TDMA offset
  char str[CPU_ENGINE_STR_LIMIT_BYTE];
} __attribute__((packed)) cvi_cpu_desc_t;
|
||||
|
||||
static DESC *traverse_start(uint8_t *cmdbuf)
|
||||
{
|
||||
ASSERT(cmdbuf);
|
||||
DESC *desc = (DESC *)cmdbuf;
|
||||
ASSERT(desc->hdr.magic == CMDBUF_HDR_MAGIC_1822);
|
||||
return desc;
|
||||
}
|
||||
|
||||
/* Advance from @desc to the following descriptor within cmdbuf[0..size).
 * Returns NULL once the end of the buffer is passed. */
static DESC *traverse_next(DESC *desc, uint8_t *cmdbuf, uint32_t size)
{
  uint8_t *cursor = (uint8_t *)desc;
  cursor += cmd_hdr_len(&desc->hdr) + sizeof(cmd_hdr_t);

  if (cursor >= cmdbuf + size)
    return NULL;

  DESC *next = (DESC *)cursor;
  ASSERT(next->hdr.magic == CMDBUF_HDR_MAGIC_1822);
  return next;
}
|
||||
|
||||
/* True when @desc is the final descriptor in cmdbuf[0..size). */
static bool is_last_desc(DESC *desc, uint8_t *cmdbuf, uint32_t size)
{
  return traverse_next(desc, cmdbuf, size) == NULL;
}
|
||||
|
||||
// Post-process one TIU (BD) register image into the beat order the
// hardware expects: tag every 128-bit group with its group index, then
// swap the first and last groups.  The bit layout is hardware-defined;
// do not reorder these steps.
static void reorder_bd_cmdbuf_reg(uint8_t *cmdbuf)
{
  int total_bits = BD_REG_BYTES * 8;

  // Write the group index (i/128) into the high nibble of the last byte
  // of each 128-bit group.
  for (int i = 0; i < total_bits; i += 128)
    cmdbuf[(i + 128 - 8) / 8] |= (i / 128) << 4;

  // Swap the first and the last 128-bit (16-byte) groups via a scratch copy.
  uint8_t tmp[128 / 8];
  uint8_t *last = &cmdbuf[(total_bits - 128) / 8];
  memcpy(tmp, last, sizeof(tmp));
  memcpy(last, cmdbuf, sizeof(tmp));
  memcpy(cmdbuf, tmp, sizeof(tmp));
}
|
||||
|
||||
static void adjust_desc_tdma(uint32_t *body, bool eod)
|
||||
{
|
||||
if (eod) {
|
||||
body[0] |= (1 << TDMA_ACCPI0_EOD_BIT);
|
||||
body[0] |= (1 << TDMA_ACCPI0_INTERRUPT_BIT); // interrupt
|
||||
}
|
||||
body[0] |= (1 << TDMA_ACCPI0_BARRIER_ENABLE_BIT);
|
||||
}
|
||||
|
||||
static void adjust_desc_bd(uint32_t *body, bool eod)
|
||||
{
|
||||
if (eod) {
|
||||
tiu_reg_t reg;
|
||||
parse_tiu_reg(®, body);
|
||||
reg.cmd_end = 1;
|
||||
reg.cmd_intr_en = 1;
|
||||
emit_tiu_reg(®, body);
|
||||
}
|
||||
reorder_bd_cmdbuf_reg((uint8_t *)body);
|
||||
}
|
||||
|
||||
void bmk1822_dmabuf_relocate(uint8_t *dmabuf, uint64_t dmabuf_devaddr, uint32_t original_size, uint32_t pmubuf_size)
|
||||
{
|
||||
dma_hdr_t *header = (dma_hdr_t *)dmabuf;
|
||||
uint64_t tmpAddress = 0;
|
||||
|
||||
ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M);
|
||||
cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t));
|
||||
|
||||
for (uint32_t i = 0; i < header->cpu_desc_count; i++, desc++) {
|
||||
uint32_t tiu_num = desc->num_tiu & 0xFFFF;
|
||||
uint32_t tdma_num = desc->num_tdma & 0xFFFF;
|
||||
|
||||
if (tiu_num) {
|
||||
tmpAddress = dmabuf_devaddr + desc->offset_tiu;
|
||||
//printf("bd tmpAddress = 0x%lu\n", tmpAddress);
|
||||
desc->offset_tiu_ori_bk = desc->offset_tiu;
|
||||
desc->offset_tiu = tmpAddress >> BDC_ENGINE_CMD_ALIGNED_BIT;
|
||||
}
|
||||
|
||||
if (tdma_num) {
|
||||
tmpAddress = dmabuf_devaddr + desc->offset_tdma;
|
||||
//printf("tdma tmpAddress = 0x%lu\n", tmpAddress);
|
||||
desc->offset_tdma_ori_bk = desc->offset_tdma;
|
||||
desc->offset_tdma = tmpAddress >> TDMA_DESCRIPTOR_ALIGNED_BIT;
|
||||
}
|
||||
|
||||
//set pmubuf_addr_p to enable pmu kick
|
||||
header->pmubuf_size = pmubuf_size;
|
||||
header->pmubuf_offset = original_size;
|
||||
}
|
||||
}
|
||||
|
||||
/* Extract the engine-local sync (command) id from a TIU or TDMA
 * descriptor.  Any other engine id is a programming error. */
static uint32_t desc_sync_id(DESC *desc)
{
  uint32_t engine = desc->hdr.engine_id;

  if (engine == BMK1822_TIU) {
    tiu_reg_t r;
    parse_tiu_reg(&r, desc->body);
    return r.cmd_id_tpu;
  }

  if (engine == BMK1822_TDMA) {
    tdma_reg_t r;
    parse_tdma_reg(&r, desc->body);
    return r.cmd_id;
  }

  ASSERT(0);
  return 1;
}
|
||||
|
||||
// First pass over the linear command buffer: build the dma header and the
// CPU ("arm") descriptor table at the front of @dmabuf, and compute where
// the TIU and TDMA segments will start (*tiu_offset / *tdma_offset).
//
// A new sync segment is closed either at an explicit CPU descriptor, at a
// TIU/TDMA command whose sync id is the 0xFFFF sentinel, or at the very
// last descriptor.  Per segment the TIU byte size is accumulated with
// end-of-descriptor padding and alignment; TDMA size is counted per
// descriptor slot.  Statement order here mirrors the second pass
// (fill_bd_and_tdma) and must not change.
static void fill_header_and_arm(uint8_t *cmdbuf, uint32_t sz, uint8_t *dmabuf, uint64_t *tiu_offset, uint64_t *tdma_offset)
{
  dma_hdr_t header = {0};
  header.dmabuf_magic_m = TPU_DMABUF_HEADER_M;
  header.dmabuf_magic_s = 0x1822;

  // CPU descriptor table lives right after the header.
  cvi_cpu_desc_t *segments = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t));
  DESC *desc = NULL;
  uint32_t desc_nums[BMK1822_ENGINE_NUM] = {0};   // totals per engine
  uint32_t counters[BMK1822_ENGINE_NUM] = {0};    // per-segment counts, reset at each sync
  uint32_t desc_size[BMK1822_ENGINE_NUM] = {0};   // byte sizes per engine

  ASSERT(segments);
  // fill arm descs
  desc = traverse_start(cmdbuf);

  while (desc != NULL) {
    uint32_t engine_id = (uint32_t)desc->hdr.engine_id;
    counters[engine_id]++;
    desc_nums[engine_id]++;
    if (engine_id != BMK1822_CPU) {
      // Segment boundary: synthesize a CPU sync descriptor.
      if (desc_sync_id(desc) == 0xFFFF || is_last_desc(desc, cmdbuf, sz)) {
        desc_nums[BMK1822_CPU]++;
        cvi_cpu_desc_t *arm = segments + desc_nums[BMK1822_CPU] - 1;
        memset(arm, 0, sizeof(cvi_cpu_desc_t));
        arm->op_type = CPU_OP_SYNC;
        arm->num_tiu = counters[BMK1822_TIU];
        arm->num_tdma = counters[BMK1822_TDMA];
        strncpy(arm->str, "layer_end", sizeof(arm->str) - 1);
        // Account for the segment's TIU bytes plus the post-EOD padding,
        // rounded up to the BD alignment.
        if (counters[BMK1822_TIU] != 0) {
          desc_size[BMK1822_TIU] =
          ALIGN(desc_size[BMK1822_TIU] + counters[BMK1822_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES,
          BD_DESC_ALIGN_SIZE);
        }
        counters[BMK1822_TIU] = 0;
        counters[BMK1822_TDMA] = 0;
      }
    } else {
      // Explicit CPU descriptor: copy its body into the table slot that
      // desc_nums[BMK1822_CPU] (already incremented above) points at, then
      // record this segment's engine counts.
      cvi_cpu_desc_t *arm = segments + desc_nums[BMK1822_CPU] - 1;
      memcpy(arm, &(desc->body), sizeof(cvi_cpu_desc_t));
      arm->num_tiu = counters[BMK1822_TIU];
      arm->num_tdma = counters[BMK1822_TDMA];
      if (counters[BMK1822_TIU] != 0) {
        desc_size[BMK1822_TIU] =
        ALIGN(desc_size[BMK1822_TIU] + counters[BMK1822_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES,
        BD_DESC_ALIGN_SIZE);
      }
      counters[BMK1822_TIU] = 0;
      counters[BMK1822_TDMA] = 0;
    }
    desc = traverse_next(desc, cmdbuf, sz);
  }
  desc_size[BMK1822_CPU] = desc_nums[BMK1822_CPU] * CPU_ENGINE_BYTES;
  desc_size[BMK1822_TDMA] = desc_nums[BMK1822_TDMA] * GDMA_DESC_ALIGN_SIZE;

  // Segment start offsets: TIU follows the CPU table, TDMA follows TIU,
  // each aligned to its engine's requirement.
  (*tiu_offset) = ALIGN(sizeof(header) + desc_size[BMK1822_CPU], BD_DESC_ALIGN_SIZE);
  (*tdma_offset) = ALIGN((*tiu_offset) + desc_size[BMK1822_TIU], GDMA_DESC_ALIGN_SIZE);

  // dma hdr + arm descs + bd descs + tdma descs
  header.dmabuf_size = (*tdma_offset) + desc_size[BMK1822_TDMA];
  header.cpu_desc_count = desc_nums[BMK1822_CPU];
  header.bd_desc_count = desc_nums[BMK1822_TIU];
  header.tdma_desc_count = desc_nums[BMK1822_TDMA];

  printf("header.cpu_desc_count = %d\n", header.cpu_desc_count);
  printf("header.bd_desc_count = %d\n", header.bd_desc_count);
  printf("header.tdma_desc_count = %d\n", header.tdma_desc_count);

  memcpy(dmabuf, &header, sizeof(header));
}
|
||||
|
||||
// Second pass: walk the command buffer again (in the same order as
// fill_header_and_arm) and copy every TIU/TDMA body into its engine
// segment of @dmabuf, patching each descriptor (barrier/EOD/interrupt
// bits, BD beat reorder) as it lands.  The per-segment counts recorded in
// the CPU table drive how many descriptors belong to each segment.
static void fill_bd_and_tdma(uint8_t *cmdbuf, uint32_t sz, uint8_t *dmabuf, uint64_t tiu_offset, uint64_t tdma_offset)
{
  dma_hdr_t *p_header = (dma_hdr_t *)dmabuf;
  cvi_cpu_desc_t *segments = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t));
  DESC *desc = traverse_start(cmdbuf);
  //uint64_t address_max = 0x0;

  for (uint32_t i = 0; i < p_header->cpu_desc_count; i++) {

    cvi_cpu_desc_t *arm = segments + i;

    // Only the low 16 bits of the counts are meaningful.
    uint32_t tiu_num = arm->num_tiu & 0xFFFF;
    uint32_t tdma_num = arm->num_tdma & 0xFFFF;

    // Record the (still buffer-relative) segment start offsets; they are
    // rebased to device addresses later by bmk1822_dmabuf_relocate().
    if (tiu_num) {
      tiu_offset = ALIGN(tiu_offset, 1 << BDC_ENGINE_CMD_ALIGNED_BIT);
      arm->offset_tiu = tiu_offset;
    }

    if (tdma_num) {
      tdma_offset = ALIGN(tdma_offset, 1 << TDMA_DESCRIPTOR_ALIGNED_BIT);
      arm->offset_tdma = tdma_offset;
    }

    // Copy this segment's descriptors; the "== 0" checks mark the last
    // descriptor of each engine so it is tagged end-of-descriptor.
    while (tiu_num || tdma_num) {
      uint32_t engine_id = (uint32_t)desc->hdr.engine_id;
      void *p_body = NULL;

      switch (engine_id) {
      case BMK1822_TIU:
        tiu_num--;
        p_body = (void *)(dmabuf + tiu_offset);
        tiu_offset += BD_REG_BYTES;
        memcpy(p_body, desc->body, desc->hdr.len);
        adjust_desc_bd((uint32_t *)p_body, tiu_num == 0);
        break;
      case BMK1822_TDMA:
        tdma_num--;
        tdma_offset = ALIGN(tdma_offset, GDMA_DESC_ALIGN_SIZE);
        p_body = (void *)(dmabuf + tdma_offset);
        tdma_offset += GDMA_DESC_ALIGN_SIZE;
        memcpy(p_body, desc->body, desc->hdr.len);

#if 0 //debug feature, for checking if neuron overshoot (needs address_max above)
      {
        tdma_reg_t reg_tdma = {0};
        uint64_t tdma_address = 0, tdma_address2 = 0;

        parse_tdma_reg(&reg_tdma, p_body);

        if (reg_tdma.src_base_reg_sel == 0) {
          // reg.trans_dir = 2; // 0:tg2l, 1:l2tg, 2:g2g, 3:l2l
          if (reg_tdma.trans_dir == 0) {
            printf ("src_base_addr_high=%x, src_base_addr_low=%x\n", reg_tdma.src_base_addr_high, reg_tdma.src_base_addr_low);
            tdma_address = ((uint64_t)reg_tdma.src_base_addr_high) << 32 | (uint64_t)reg_tdma.src_base_addr_low;
          } else if (reg_tdma.trans_dir == 1) {
            printf ("dst_base_addr_high=%x, dst_base_addr_low=%x\n", reg_tdma.dst_base_addr_high, reg_tdma.dst_base_addr_low);
            tdma_address = ((uint64_t)reg_tdma.dst_base_addr_high) << 32 | (uint64_t)reg_tdma.dst_base_addr_low;
          } else if (reg_tdma.trans_dir == 2) {
            printf ("dst_base_addr_high=%x, dst_base_addr_low=%x\n", reg_tdma.dst_base_addr_high, reg_tdma.dst_base_addr_low);
            tdma_address = ((uint64_t)reg_tdma.src_base_addr_high) << 32 | (uint64_t)reg_tdma.src_base_addr_low;
            tdma_address2 = ((uint64_t)reg_tdma.dst_base_addr_high) << 32 | (uint64_t)reg_tdma.dst_base_addr_low;

            if (tdma_address2 > tdma_address) {
              tdma_address = tdma_address2;
            }
          }

          if (tdma_address > address_max) {
            address_max = tdma_address;
            printf("address_max=%llx\n", address_max);
          }
        }
      }
#endif
        adjust_desc_tdma((uint32_t *)p_body, tdma_num == 0);
        break;
      default:
        break;
      }
      desc = traverse_next(desc, cmdbuf, sz);
    }

    // Pad zeros after the EOD descriptor to work around a hardware bug;
    // must match the BD_EOD_PADDING_BYTES accounted in the first pass.
    if (arm->num_tiu & 0xFFFF) {
      void *buf = (void *)(dmabuf + tiu_offset);
      memset(buf, 0, BD_EOD_PADDING_BYTES);
      tiu_offset += BD_EOD_PADDING_BYTES;
    }
  }

}
|
||||
|
||||
/* Convert a linear command buffer into the dmabuf layout consumed by the
 * runtime: header + CPU (arm) table first, then TIU and TDMA segments. */
void bmk1822_dmabuf_convert(uint8_t *cmdbuf, uint32_t sz, uint8_t *dmabuf)
{
  uint64_t tiu_off = 0;
  uint64_t tdma_off = 0;

  fill_header_and_arm(cmdbuf, sz, dmabuf, &tiu_off, &tdma_off);
  fill_bd_and_tdma(cmdbuf, sz, dmabuf, tiu_off, tdma_off);
}
|
||||
|
||||
#define PER_DES_SIZE 16            // bytes of PMU record per descriptor
#define PADDING_SIZE (1024 * 1024) // fixed PMU buffer headroom

// Dry-run of the two fill passes: walk @cmdbuf once and report the dmabuf
// size (*psize) and PMU buffer size (*pmu_size) that bmk1822_dmabuf_convert
// will need.  The segment-boundary logic must stay in lockstep with
// fill_header_and_arm.
void bmk1822_dmabuf_size(uint8_t *cmdbuf, uint32_t sz, uint32_t *psize, uint32_t *pmu_size)
{
  uint32_t tdma_desc_num = {0};
  uint32_t counters[BMK1822_ENGINE_NUM] = {0};  // per-segment counts
  uint32_t bd_size = 0;
  uint32_t dmabuf_size = 0;

  uint32_t tiu_cnt = 0;    // total TIU descriptors (for PMU sizing)
  uint32_t tdma_cnt = 0;   // total TDMA descriptors (for PMU sizing)

  // calculate desc numbers
  DESC *desc = traverse_start(cmdbuf);

  while (desc != NULL) {
    uint32_t engine_id = (uint32_t)desc->hdr.engine_id;
    counters[engine_id]++;
    if (engine_id != BMK1822_CPU) {
      // Segment boundary: an implicit sync descriptor will be inserted.
      if (desc_sync_id(desc) == 0xFFFF || is_last_desc(desc, cmdbuf, sz)) {
        counters[BMK1822_CPU]++;
        tdma_desc_num += counters[BMK1822_TDMA];
        // TIU bytes per segment include EOD padding, aligned to BD size.
        if (counters[BMK1822_TIU] != 0) {
          bd_size = ALIGN(bd_size + counters[BMK1822_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES,
          BD_DESC_ALIGN_SIZE);
        }
        tiu_cnt += counters[BMK1822_TIU] & 0xFFFF;
        tdma_cnt += counters[BMK1822_TDMA] & 0xFFFF;
        counters[BMK1822_TIU] = 0;
        counters[BMK1822_TDMA] = 0;
      }
    } else {
      // Explicit CPU descriptor also closes the current segment.
      tdma_desc_num += counters[BMK1822_TDMA];
      if (counters[BMK1822_TIU] != 0) {
        bd_size = ALIGN(bd_size + counters[BMK1822_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES,
        BD_DESC_ALIGN_SIZE);
      }
      tiu_cnt += counters[BMK1822_TIU] & 0xFFFF;
      tdma_cnt += counters[BMK1822_TDMA] & 0xFFFF;
      counters[BMK1822_TIU] = 0;
      counters[BMK1822_TDMA] = 0;
    }
    desc = traverse_next(desc, cmdbuf, sz);
  }
  // dma hdr + arm descs + bd descs + tdma descs
  dmabuf_size = sizeof(dma_hdr_t) + counters[BMK1822_CPU] * CPU_ENGINE_BYTES;
  dmabuf_size = ALIGN(dmabuf_size, BD_DESC_ALIGN_SIZE) + bd_size;
  dmabuf_size = ALIGN(dmabuf_size, GDMA_DESC_ALIGN_SIZE) + tdma_desc_num * GDMA_DESC_ALIGN_SIZE;

  *psize = dmabuf_size;

  // PMU buffer: one record per descriptor plus fixed headroom, page aligned.
  *pmu_size = ALIGN((tiu_cnt + tdma_cnt) * PER_DES_SIZE + PADDING_SIZE, 0x1000);
}
|
||||
|
||||
void bmk1822_arraybase_set(uint8_t *dmabuf, uint32_t arraybase0L, uint32_t arraybase1L, uint32_t arraybase0H, uint32_t arraybase1H)
|
||||
{
|
||||
ASSERT(dmabuf);
|
||||
dma_hdr_t *header = (dma_hdr_t *)dmabuf;
|
||||
|
||||
ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M);
|
||||
header->arraybase_0_L = arraybase0L;
|
||||
header->arraybase_1_L = arraybase1L;
|
||||
header->arraybase_0_H = arraybase0H;
|
||||
header->arraybase_1_H = arraybase1H;
|
||||
return;
|
||||
}
|
||||
|
||||
void bmk1822_dmabuf_dump(uint8_t *dmabuf)
|
||||
{
|
||||
ASSERT(dmabuf);
|
||||
dma_hdr_t *header = (dma_hdr_t *)dmabuf;
|
||||
//printf("bmk1822_dmabuf_dump header->arraybase_0_L = 0x%x\n", header->arraybase_0_L);
|
||||
//printf("bmk1822_dmabuf_dump header->arraybase_1_L = 0x%x\n", header->arraybase_1_L);
|
||||
//printf("bmk1822_dmabuf_dump header->arraybase_0_H = 0x%x\n", header->arraybase_0_H);
|
||||
//printf("bmk1822_dmabuf_dump header->arraybase_1_H = 0x%x\n", header->arraybase_1_H);
|
||||
//printf("bmk1822_dmabuf_dump header->pmubuf_offset = 0x%x\n", header->pmubuf_offset);
|
||||
|
||||
ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M);
|
||||
cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t));
|
||||
|
||||
for (u32 i = 0; i < header->cpu_desc_count; i++, desc++) {
|
||||
int bd_num = desc->num_tiu & 0xFFFF;
|
||||
int tdma_num = desc->num_tdma & 0xFFFF;
|
||||
u32 bd_offset = desc->offset_tiu;
|
||||
u32 tdma_offset = desc->offset_tdma;
|
||||
printf("bmk1822_dmabuf_dump num<bd:%d, tdma:%d>, offset<0x%08x, 0x%08x>\n", bd_num, tdma_num, bd_offset, tdma_offset);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
586
cvikernel/src/bm1822/bm_kernel.c
Normal file
586
cvikernel/src/bm1822/bm_kernel.c
Normal file
@ -0,0 +1,586 @@
|
||||
#include <bmkernel/bm_kernel.h>
|
||||
#include "kernel_1822.h"
|
||||
#include <bmkernel/bm1822/bm1822_tpu_cfg.h>
|
||||
|
||||
static void replace_cmd_id(uint32_t *desc, uint32_t eng_id, uint16_t ids[])
|
||||
{
|
||||
if (eng_id == BMK1822_TIU) {
|
||||
tiu_reg_t reg;
|
||||
parse_tiu_reg(®, desc);
|
||||
reg.cmd_id_en = 1;
|
||||
reg.cmd_id_tpu = ids[eng_id];
|
||||
reg.cmd_id_gdma = ids[BMK1822_TDMA];
|
||||
emit_tiu_reg(®, desc);
|
||||
} else if (eng_id == BMK1822_TDMA) {
|
||||
tdma_reg_t tdma_reg;
|
||||
parse_tdma_reg(&tdma_reg, desc);
|
||||
tdma_reg.cmd_id = ids[eng_id];
|
||||
tdma_reg.wait_id_tpu = ids[BMK1822_TIU];
|
||||
tdma_reg.bar_en = 1;
|
||||
emit_tdma_reg(&tdma_reg, desc);
|
||||
}
|
||||
}
|
||||
|
||||
static int bm1822_get_engine_desc_length(uint32_t engine_id)
|
||||
{
|
||||
switch (engine_id) {
|
||||
case BMK1822_TIU:
|
||||
return TIU_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t);
|
||||
case BMK1822_TDMA:
|
||||
return TDMA_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t);
|
||||
case BMK1822_CPU:
|
||||
return CPU_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t);
|
||||
default:
|
||||
ASSERT(0);
|
||||
}
|
||||
}
|
||||
|
||||
// Estimate the number of command descriptor based on buffer size provided
|
||||
// by the user.
|
||||
uint32_t bmk1822_estimate_nr_desc(ctx_t *k)
|
||||
{
|
||||
uint32_t tiu_desc_len = bm1822_get_engine_desc_length(BMK1822_TIU);
|
||||
uint32_t tdma_desc_len = bm1822_get_engine_desc_length(BMK1822_TDMA);
|
||||
uint32_t hdr_len = sizeof(cmd_hdr_t);
|
||||
|
||||
uint32_t desc_len =
|
||||
(tiu_desc_len > tdma_desc_len) ? tiu_desc_len : tdma_desc_len;
|
||||
|
||||
return k->info.cmdbuf_size / (desc_len + hdr_len);
|
||||
}
|
||||
|
||||
// Initialize a kernel context from caller-supplied @info (command buffer
// and chip version).  Sizes the descriptor-pair table from the command
// buffer capacity; ec/mode_manager must be initialized in this order
// since mode_manager holds a reference to &k->ec.
static void kernel_init(ctx_t *k, bmk_info_t *info)
{
  k->info = *info;
  // Only the BM1822 chip is supported by this backend.
  ASSERT(info->chip_version == BM1822_VER);
  k->chip_info = bmk1822_chip_info();

  uint32_t max_nr_desc = bmk1822_estimate_nr_desc(k);
  ec_init(&k->ec, BMK1822_ENGINE_NUM, max_nr_desc);
  mode_manager_init(&k->mode_manager, &k->ec, BMK1822_ENGINE_NUM);

  k->cmdbuf_ptr = 0;        // bump pointer into info->cmdbuf
  k->max_nr_desc = max_nr_desc;
  k->cur_nr_desc = 0;
  k->desc_pairs = xmalloc(max_nr_desc * sizeof(k->desc_pairs[0]));

  k->lmem_ptr = 0;          // local-memory bump allocator cursor
}
|
||||
|
||||
/* Release everything kernel_init() allocated (same teardown order kept:
 * desc_pairs, then ec, then the mode manager that references it). */
static void kernel_destroy(ctx_t *k)
{
  free(k->desc_pairs);
  ec_destroy(&k->ec);
  mode_manager_destroy(&k->mode_manager);
}
|
||||
|
||||
/* Rewind the context to an empty command buffer without reallocating. */
static void kernel_reset(ctx_t *k)
{
  k->cur_nr_desc = 0;
  k->cmdbuf_ptr = 0;

  ec_reset(&k->ec);
  mode_manager_reset(&k->mode_manager);
}
|
||||
|
||||
static cmd_hdr_t * kernel_alloc_cmd_hdr(
|
||||
ctx_t *k, uint8_t eng_id, uint32_t desc_len)
|
||||
{
|
||||
uint32_t free_len = k->info.cmdbuf_size - k->cmdbuf_ptr;
|
||||
uint32_t hdr_len = sizeof(cmd_hdr_t);
|
||||
uint32_t total_len = hdr_len + desc_len;
|
||||
ASSERT(total_len <= free_len);
|
||||
|
||||
cmd_hdr_t *hdr = (cmd_hdr_t *)&k->info.cmdbuf[k->cmdbuf_ptr];
|
||||
hdr->magic = CMDBUF_HDR_MAGIC_1822;
|
||||
hdr->len = desc_len;
|
||||
hdr->engine_id = eng_id;
|
||||
hdr->__deprecated = 0; // for valgrind
|
||||
hdr->flags = 0;
|
||||
hdr->mask = 0;
|
||||
|
||||
k->cmdbuf_ptr += total_len;
|
||||
return hdr;
|
||||
}
|
||||
|
||||
/* Allocate the next (cmd header, ec descriptor) pair for @eng_id and
 * register the ec descriptor with the mode manager. */
static desc_pair_t * kernel_alloc_desc_pair(ctx_t *k, uint8_t eng_id)
{
  ASSERT(eng_id < BMK1822_ENGINE_NUM);
  ASSERT(k->cur_nr_desc < k->max_nr_desc);

  desc_pair_t *pair = &k->desc_pairs[k->cur_nr_desc++];
  pair->cmd_hdr = kernel_alloc_cmd_hdr(k, eng_id, bm1822_get_engine_desc_length(eng_id));
  pair->ec_desc = ec_alloc_desc(&k->ec, eng_id);

  mode_manager_record_ec_desc(&k->mode_manager, pair->ec_desc);
  return pair;
}
|
||||
|
||||
/* Finalize inter-engine synchronization: compute the global sync ids,
 * then stamp them back into every recorded descriptor. */
static void kernel_update_sync_id(ctx_t *k)
{
  ec_compute_sync_ids(&k->ec);

  for (uint32_t i = 0; i < k->cur_nr_desc; i++) {
    desc_pair_t *pair = &k->desc_pairs[i];
    replace_cmd_id((uint32_t *)pair->cmd_hdr->cmd,
                   pair->ec_desc->engine_id,
                   pair->ec_desc->sync_ids);
  }
}
|
||||
|
||||
void bmk1822_add_dependency(
|
||||
ctx_t *ctx,
|
||||
bmk1822_op_t *before,
|
||||
bmk1822_op_t *after)
|
||||
{
|
||||
ec_add_dependency(&ctx->ec, before, after);
|
||||
}
|
||||
|
||||
/* Allocate the next descriptor pair for @eng_id.  A CPU descriptor closes
 * the current sync window first: ids are finalized, the in-flight pair
 * list is dropped, and the ec/sync-id state restarts. */
desc_pair_t * bm1822_get_desc_pair(ctx_t *k, uint8_t eng_id)
{
  if (eng_id == BMK1822_CPU) {
    kernel_update_sync_id(k);
    k->cur_nr_desc = 0;

    ec_reset(&k->ec);
    mode_manager_restart_sync_id(&k->mode_manager);
  }

  return kernel_alloc_desc_pair(k, eng_id);
}
|
||||
|
||||
/* Create a kernel context bound to the caller-provided command buffer.
 * The caller owns @info->cmdbuf; release the context with
 * bmk1822_cleanup(). */
ctx_t * bmk1822_register(bmk_info_t *info)
{
  ASSERT(info);
  ASSERT(info->cmdbuf);
  ASSERT(info->cmdbuf_size > 0);

  ctx_t *ctx = xmalloc(sizeof(*ctx));
  kernel_init(ctx, info);
  return ctx;
}
|
||||
|
||||
/* Destroy a context created by bmk1822_register() and free it. */
void bmk1822_cleanup(ctx_t *ctx)
{
  ASSERT(ctx);

  kernel_destroy(ctx);
  free(ctx);
}
|
||||
|
||||
/* Rewind the context to an empty command buffer. */
void bmk1822_reset(ctx_t *ctx)
{
  kernel_reset(ctx);
}
|
||||
|
||||
/* Finalize pending sync ids and hand back the command buffer; *size is
 * set to the number of bytes currently filled. */
uint8_t *bmk1822_acquire_cmdbuf(ctx_t *ctx, uint32_t *size)
{
  *size = ctx->cmdbuf_ptr;
  kernel_update_sync_id(ctx);
  return ctx->info.cmdbuf;
}
|
||||
|
||||
/* Enter parallel-issue mode (engines run concurrently). */
void bmk1822_parallel_enable(ctx_t *ctx)
{
  mode_manager_enable_parallel(&ctx->mode_manager);
}
|
||||
|
||||
/* Attach an opaque caller-owned pointer to the context. */
void bmk1822_set_op(ctx_t *ctx, void* op)
{
  ctx->op = op;
}
|
||||
|
||||
/* Retrieve the opaque pointer set by bmk1822_set_op(). */
void* bmk1822_get_op(ctx_t *ctx)
{
  return ctx->op;
}
|
||||
|
||||
/* Leave parallel-issue mode (engines serialize again). */
void bmk1822_parallel_disable(ctx_t *ctx)
{
  mode_manager_disable_parallel(&ctx->mode_manager);
}
|
||||
|
||||
/* Create @nr_streams independent command streams. */
void bmk1822_create_streams(ctx_t *ctx, int nr_streams)
{
  mode_manager_create_streams(&ctx->mode_manager, nr_streams);
}
|
||||
|
||||
/* Tag subsequent commands with the given network layer id. */
void bmk1822_set_layer_id(ctx_t *ctx, uint16_t layer_id)
{
  ctx->layer_id = layer_id;
}
|
||||
|
||||
/* Current network layer id tag (see bmk1822_set_layer_id). */
uint16_t bmk1822_layer_id(ctx_t *ctx)
{
  return ctx->layer_id;
}
|
||||
|
||||
/* Tear down all streams created by bmk1822_create_streams(). */
void bmk1822_destroy_streams(ctx_t *ctx)
{
  mode_manager_destroy_streams(&ctx->mode_manager);
}
|
||||
|
||||
/* Direct subsequent commands to stream @i. */
void bmk1822_set_stream(ctx_t *ctx, int i)
{
  mode_manager_set_stream(&ctx->mode_manager, i);
}
|
||||
|
||||
// Immutable hardware description of the BM1822 (CV1822) chip.  Returned
// by value from bmk1822_chip_info() so callers cannot mutate this table.
static bmk1822_chip_info_t bm1822_chip_info = {
  .version = BM1822_VER,
  .npu_num = BM1822_HW_NPU_NUM,
  .eu_num = BM1822_HW_EU_NUM,
  .lmem_size = BM1822_HW_LMEM_SIZE,
  .lmem_banks = BM1822_HW_LMEM_BANKS,
  .lmem_bank_size = BM1822_HW_LMEM_BANK_SIZE,
  .lmem_start = BM1822_HW_LMEM_START_ADDR,
  .gmem_start = BM1822_GLOBAL_MEM_START_ADDR,
  .gmem_size = BM1822_GLOBAL_MEM_SIZE,
};
|
||||
|
||||
// Return a copy of the static BM1822 hardware description.
bmk1822_chip_info_t bmk1822_chip_info(void)
{
  return bm1822_chip_info;
}
|
||||
|
||||
bmk1822_tensor_lmem_t * bmk1822_lmem_alloc_tensor(
|
||||
ctx_t *ctx,
|
||||
bmk1822_tensor_lmem_shape_t s,
|
||||
fmt_t fmt,
|
||||
int eu_align)
|
||||
{
|
||||
ctx_t *k = (typeof(k))ctx;
|
||||
uint32_t lmem_size = k->chip_info.lmem_size;
|
||||
uint32_t eu_num = k->chip_info.eu_num;
|
||||
|
||||
bmk1822_tensor_lmem_t *t = xmalloc(sizeof(*t));
|
||||
memset(t, 0, sizeof(*t));
|
||||
t->start_address = k->lmem_ptr;
|
||||
t->fmt = fmt;
|
||||
t->cmprs_fmt = fmt;
|
||||
t->shape = s;
|
||||
t->eu_align = eu_align;
|
||||
t->stride = bmk1822_tensor_lmem_default_stride(ctx, s, fmt, eu_align);
|
||||
|
||||
uint32_t needed = align_up(t->shape.n * t->stride.n, eu_num);
|
||||
if ((lmem_size - k->lmem_ptr < needed) || !needed) {
|
||||
free(t);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
k->lmem_ptr += needed;
|
||||
return t;
|
||||
}
|
||||
|
||||
void bmk1822_lmem_init_tensor(
|
||||
ctx_t *ctx,
|
||||
bmk1822_tensor_lmem_t *tl,
|
||||
bmk1822_tensor_lmem_shape_t shape,
|
||||
fmt_t fmt,
|
||||
int eu_align)
|
||||
{
|
||||
memset(tl, 0, sizeof(*tl));
|
||||
tl->fmt = fmt;
|
||||
tl->shape = shape;
|
||||
tl->eu_align = eu_align;
|
||||
tl->stride = bmk1822_tensor_lmem_default_stride(ctx, shape, fmt, eu_align);
|
||||
}
|
||||
|
||||
// Provide the unified api for tensor size calculation.
|
||||
// Must have the same logic as bmk1822_lmem_bf16_alloc_tensor.
|
||||
// The backed does not need to duplicate the related code.
|
||||
uint32_t bmk1822_lmem_tensor_to_size(
|
||||
ctx_t *ctx,
|
||||
bmk1822_tensor_lmem_shape_t s,
|
||||
fmt_t fmt, int eu_align)
|
||||
{
|
||||
ctx_t *k = (typeof(k))ctx;
|
||||
uint32_t eu_num = k->chip_info.eu_num;
|
||||
|
||||
bmk1822_tensor_lmem_stride_t stride;
|
||||
stride = bmk1822_tensor_lmem_default_stride(ctx, s, fmt, eu_align);
|
||||
|
||||
uint32_t needed = align_up(s.n * stride.n, eu_num);
|
||||
|
||||
return needed;
|
||||
}
|
||||
|
||||
// Allocate a tensor with extra room for a 32-bit partial sum: n is scaled
// by 32 / bits(fmt) for the allocation, then restored on the returned
// descriptor so its logical shape is unchanged.
// NOTE(review): if allocation fails and ASSERT is compiled out, the code
// below dereferences NULL — confirm whether callers ever rely on a NULL
// return here.
bmk1822_tensor_lmem_t * bmk1822_lmem_alloc_ps32_tensor(
    bmk1822_context_t *ctx,
    bmk1822_tensor_lmem_shape_t s,
    fmt_t fmt,
    int eu_align)
{
  /* Partial sums live in lmem in 32-bit form, so scale n by the ratio of
   * 32 bits to the element width to reserve space for them. */

  uint32_t prev_n;

  prev_n = s.n;
  s.n = s.n * (bitsize_of_fmt(FMT_I32) / bitsize_of_fmt(fmt));
  bmk1822_tensor_lmem_t *res = bmk1822_lmem_alloc_tensor(ctx, s, fmt, eu_align);
  if(res == NULL)
    ASSERT(0);
  res->shape.n = prev_n;
  return res;
}
|
||||
|
||||
void bmk1822_lmem_free_tensor(
|
||||
ctx_t *ctx, const bmk1822_tensor_lmem_t *t)
|
||||
{
|
||||
ASSERT(t->start_address < ctx->lmem_ptr);
|
||||
ctx->lmem_ptr = t->start_address;
|
||||
|
||||
free((void *)t);
|
||||
}
|
||||
|
||||
bmk1822_matrix_lmem_t * bmk1822_lmem_alloc_matrix(
|
||||
ctx_t *ctx,
|
||||
bmk1822_matrix_lmem_shape_t s,
|
||||
fmt_t fmt,
|
||||
int eu_align)
|
||||
{
|
||||
uint32_t lmem_size = ctx->chip_info.lmem_size;
|
||||
uint32_t npu_num = ctx->chip_info.npu_num;
|
||||
uint32_t eu_num = ctx->chip_info.eu_num;
|
||||
uint32_t val = (fmt == FMT_BF16) ? 2 : 1;
|
||||
|
||||
bmk1822_matrix_lmem_t *t = xmalloc(sizeof(*t));
|
||||
memset(t, 0, sizeof(*t));
|
||||
t->start_address = ctx->lmem_ptr;
|
||||
t->fmt = fmt;
|
||||
t->shape = s;
|
||||
t->stride.h = s.w * val;
|
||||
if (eu_align)
|
||||
t->stride.c = align_up(s.w * val, eu_num);
|
||||
else
|
||||
t->stride.c = s.w * val;
|
||||
t->stride.n = t->stride.c * ceiling_func(s.c, npu_num);
|
||||
t->eu_align = eu_align;
|
||||
|
||||
uint32_t needed = align_up(t->shape.n * t->stride.n, eu_num);
|
||||
if (lmem_size - ctx->lmem_ptr < needed) {
|
||||
free(t);
|
||||
return NULL;
|
||||
}
|
||||
ctx->lmem_ptr += needed;
|
||||
return t;
|
||||
}
|
||||
|
||||
void bmk1822_lmem_init_matrix(
|
||||
ctx_t *ctx,
|
||||
bmk1822_matrix_lmem_t *ml,
|
||||
bmk1822_matrix_lmem_shape_t shape,
|
||||
fmt_t fmt,
|
||||
int eu_align)
|
||||
{
|
||||
memset(ml, 0, sizeof(*ml));
|
||||
ml->fmt = fmt;
|
||||
ml->shape = shape;
|
||||
ml->stride = bmk1822_matrix_lmem_default_stride(ctx, shape, fmt, eu_align);
|
||||
ml->eu_align = eu_align;
|
||||
}
|
||||
|
||||
// Provide the unified api for matrix size calculation.
|
||||
// Must have the same logic as bmk1822_lmem_alloc_matrix.
|
||||
// The backed does not need to duplicate the related code.
|
||||
uint32_t bmk1822_lmem_matrix_to_size(
|
||||
ctx_t *ctx,
|
||||
bmk1822_matrix_lmem_shape_t s,
|
||||
fmt_t fmt,
|
||||
int eu_align) {
|
||||
uint32_t npu_num = ctx->chip_info.npu_num;
|
||||
uint32_t eu_num = ctx->chip_info.eu_num;
|
||||
uint32_t val = (fmt == FMT_BF16) ? 2 : 1;
|
||||
|
||||
bmk1822_matrix_lmem_t t;
|
||||
t.fmt = fmt;
|
||||
t.shape = s;
|
||||
t.stride.h = s.w * val;
|
||||
if (eu_align)
|
||||
t.stride.c = align_up(s.w * val, eu_num);
|
||||
else
|
||||
t.stride.c = s.w * val;
|
||||
t.stride.n = t.stride.c * ceiling_func(s.c, npu_num);
|
||||
|
||||
uint32_t needed = align_up(t.shape.n * t.stride.n, eu_num);
|
||||
|
||||
return needed;
|
||||
}
|
||||
|
||||
// Allocate a matrix with extra room for a 32-bit partial sum: n is scaled
// by 32 / bits(fmt) for the allocation, then restored on the returned
// descriptor so its logical shape is unchanged.
// NOTE(review): if allocation fails and ASSERT is compiled out, the code
// below dereferences NULL — confirm whether callers ever rely on a NULL
// return here.
bmk1822_matrix_lmem_t * bmk1822_lmem_alloc_ps32_matrix(
    bmk1822_context_t *ctx,
    bmk1822_matrix_lmem_shape_t s,
    fmt_t fmt,
    int eu_align)
{
  /* Partial sums live in lmem in 32-bit form, so scale n by the ratio of
   * 32 bits to the element width to reserve space for them. */

  uint32_t prev_n;

  prev_n = s.n;
  s.n = s.n * (bitsize_of_fmt(FMT_I32) / bitsize_of_fmt(fmt));
  bmk1822_matrix_lmem_t *res = bmk1822_lmem_alloc_matrix(ctx, s, fmt, eu_align);
  if(res == NULL)
    ASSERT(0);
  res->shape.n = prev_n;
  return res;
}
|
||||
|
||||
// Provide the unified api for matrix size calculation.
|
||||
// Must have the same logic as bmk1822_lmem_alloc_ps32_matrix.
|
||||
// The backed does not need to duplicate the related code.
|
||||
uint32_t bmk1822_lmem_ps32_matrix_to_size(
|
||||
bmk1822_context_t *ctx,
|
||||
bmk1822_matrix_lmem_shape_t s,
|
||||
fmt_t fmt,
|
||||
int eu_align)
|
||||
{
|
||||
/* Partial sum is located in lmem in 32-bit format, so we times n to 4 to
|
||||
* spare a sapce for it.
|
||||
*/
|
||||
|
||||
s.n = s.n * (bitsize_of_fmt(FMT_I32) / bitsize_of_fmt(fmt));
|
||||
|
||||
return bmk1822_lmem_matrix_to_size(ctx, s, fmt, eu_align);
|
||||
}
|
||||
|
||||
void bmk1822_lmem_free_matrix(
|
||||
ctx_t *ctx, const bmk1822_matrix_lmem_t *t)
|
||||
{
|
||||
ASSERT(t->start_address < ctx->lmem_ptr);
|
||||
ctx->lmem_ptr = t->start_address;
|
||||
free((void *)t);
|
||||
}
|
||||
|
||||
bmk1822_tensor_lmem_stride_t bmk1822_tensor_lmem_default_stride(
|
||||
ctx_t *ctx,
|
||||
bmk1822_tensor_lmem_shape_t s,
|
||||
fmt_t fmt_type,
|
||||
int eu_align)
|
||||
{
|
||||
bmk1822_tensor_lmem_stride_t stride;
|
||||
uint32_t eu_num = ctx->chip_info.eu_num;
|
||||
uint32_t npu_num = ctx->chip_info.npu_num;
|
||||
uint32_t fmt = (fmt_type == FMT_BF16) ? 2 : 1;
|
||||
stride.w = fmt;
|
||||
stride.h = s.w * fmt;
|
||||
if (eu_align)
|
||||
stride.c = align_up(s.h * s.w * fmt, eu_num);
|
||||
else
|
||||
stride.c = s.h * s.w * fmt;
|
||||
|
||||
stride.n = stride.c * ceiling_func(s.c, npu_num);
|
||||
// printf("bmk1822_tensor_lmem_default_stride stride n=%x c=%x h=%x w=%x\n", stride.n , stride.c , stride.h, stride.w);
|
||||
return stride;
|
||||
}
|
||||
|
||||
bmk1822_tensor_tgmem_stride_t bmk1822_tensor_tgmem_default_stride(
|
||||
bmk1822_tensor_tgmem_shape_t s, fmt_t fmt_type)
|
||||
{
|
||||
uint32_t data_type_size = (fmt_type == FMT_BF16) ? 2 : 1;
|
||||
bmk1822_tensor_tgmem_stride_t stride;
|
||||
stride.h = s.w * data_type_size;
|
||||
stride.c = s.h * stride.h;
|
||||
stride.n = s.c * stride.c;
|
||||
return stride;
|
||||
}
|
||||
|
||||
static void try_optimize_matrix_shape(ctx_t *ctx,
|
||||
bmk1822_matrix_lmem_shape_t *s,
|
||||
fmt_t fmt_type) {
|
||||
uint32_t eu_num = ctx->chip_info.eu_num;
|
||||
uint32_t npu_num = ctx->chip_info.npu_num;
|
||||
uint32_t col = s->col;
|
||||
bool isBf16 = (fmt_type == FMT_BF16);
|
||||
uint32_t workingNumber = isBf16 ? eu_num / 2 : eu_num;
|
||||
|
||||
if (col >= workingNumber) {
|
||||
int num_eu = ceiling_func(col, workingNumber * npu_num);
|
||||
s->w = workingNumber * num_eu;
|
||||
s->c = ceiling_func(col, s->w);
|
||||
} else {
|
||||
// col < EU_NUM
|
||||
// Only transfer needed data
|
||||
// We still change tensor shape in TIU mac op
|
||||
s->w = col;
|
||||
s->c = 1;
|
||||
}
|
||||
}
|
||||
|
||||
bmk1822_matrix_lmem_shape_t bmk1822_matrix_lmem_default_shape(
|
||||
ctx_t *ctx,
|
||||
uint32_t row,
|
||||
uint32_t col,
|
||||
fmt_t fmt_type)
|
||||
{
|
||||
bmk1822_matrix_lmem_shape_t s = {0};
|
||||
s.n = row;
|
||||
s.col = col;
|
||||
|
||||
try_optimize_matrix_shape(ctx, &s, fmt_type);
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
bmk1822_matrix_lmem_shape_t bmk1822_matrix_lmem_shape_t1(
|
||||
ctx_t *ctx,
|
||||
uint32_t len,
|
||||
fmt_t fmt_type)
|
||||
{
|
||||
uint32_t lmem_size = ctx->chip_info.lmem_size;
|
||||
bmk1822_matrix_lmem_shape_t s = {0};
|
||||
|
||||
uint32_t row = 1;
|
||||
uint32_t col = len;
|
||||
|
||||
while (col >= lmem_size) {
|
||||
ASSERT(col % 2 == 0);
|
||||
col /= 2;
|
||||
row *= 2;
|
||||
}
|
||||
|
||||
s.n = row;
|
||||
s.col = col;
|
||||
|
||||
try_optimize_matrix_shape(ctx, &s, fmt_type);
|
||||
return s;
|
||||
}
|
||||
|
||||
// This should be inside bmk1822_lmem_alloc_matrix
|
||||
bmk1822_matrix_lmem_stride_t bmk1822_matrix_lmem_default_stride(
|
||||
ctx_t *ctx,
|
||||
bmk1822_matrix_lmem_shape_t s,
|
||||
fmt_t fmt,
|
||||
int eu_align)
|
||||
{
|
||||
uint32_t npu_num = ctx->chip_info.npu_num;
|
||||
uint32_t eu_num = ctx->chip_info.eu_num;
|
||||
uint32_t val = (fmt == FMT_BF16) ? 2 : 1;
|
||||
|
||||
bmk1822_matrix_lmem_stride_t stride;
|
||||
stride.h = s.w * val;
|
||||
if (eu_align)
|
||||
stride.c = align_up(s.w * val, eu_num);
|
||||
else
|
||||
stride.c = s.w * val;
|
||||
stride.n = stride.c * ceiling_func(s.c, npu_num);
|
||||
|
||||
return stride;
|
||||
}
|
||||
374
cvikernel/src/bm1822/kernel_1822.h
Normal file
374
cvikernel/src/bm1822/kernel_1822.h
Normal file
@ -0,0 +1,374 @@
|
||||
#ifndef KERNEL_1822_H
#define KERNEL_1822_H

#include "kernel_internal.h"

#include <bmkernel/bm1822/bmkernel_1822.h>
#include <bmkernel/bm1822/bm1822_tiu_reg.h>
#include <bmkernel/bm1822/bm1822_tdma_reg.h>
#include <bmkernel/bm1822/bm1822_tpu_cfg.h>
#include <bmkernel/reg_tiu.h>
#include <bmkernel/reg_bdcast.h>
#include <bmkernel/reg_tdma.h>
#include "bmkernel_standard.h"

#include <cvikernel/cvikernel.h>

/* Sub-opcode selectors for the fixed-point 8-bit tensor-arithmetic unit. */
#define TENSOR_MUL_FIX8B 0
#define TENSOR_MAC_FIX8B 1
#define TENSOR_ADD_FIX8B 2
#define TENSOR_SUB_FIX8B 3
#define TENSOR_MAX_FIX8B 4
#define TENSOR_MIN_FIX8B 5
#define TENSOR_SHIFT_FIX8B 6
#define TENSOR_AND_FIX8B 7
#define TENSOR_OR_FIX8B 8
#define TENSOR_XOR_FIX8B 9
#define TENSOR_COPY_FIX8B 10
#define TENSOR_GE_FIX8B 11

/* Short aliases for the bm1822 tensor/matrix descriptor types used
 * throughout this backend.  "lmem" = TPU local memory, "tgmem" = global
 * (DDR) memory as seen by the TDMA engine. */
typedef bmk1822_tensor_lmem_shape_t tl_shape_t;
typedef bmk1822_matrix_lmem_shape_t ml_shape_t;
typedef bmk1822_tensor_tgmem_shape_t tg_shape_t;
typedef bmk1822_matrix_tgmem_shape_t mg_shape_t;

typedef bmk1822_tensor_lmem_stride_t tl_stride_t;

typedef bmk1822_tensor_lmem_t tl_t;
typedef bmk1822_matrix_lmem_t ml_t;
typedef bmk1822_tensor_tgmem_t tg_t;
typedef bmk1822_matrix_tgmem_t mg_t;
typedef bmk1822_compressed_tensor_tgmem_t compressed_tg_t;
typedef bmk1822_compressed_matrix_tgmem_t compressed_mg_t;

/* Reserve the next command/engine-control descriptor pair for engine
 * eng_id on context k (defined in the backend's command-buffer code). */
desc_pair_t * bm1822_get_desc_pair(ctx_t *k, uint8_t eng_id);
|
||||
|
||||
/* Assert that two local-memory tensors have identical n/c/h/w strides. */
static inline void assert_same_stride(const tl_t *a, const tl_t *b)
{
  ASSERT(a->stride.n == b->stride.n);
  ASSERT(a->stride.c == b->stride.c);
  ASSERT(a->stride.h == b->stride.h);
  ASSERT(a->stride.w == b->stride.w);
}
|
||||
|
||||
/* Assert that two local-memory tensors have identical n/c/h/w shapes. */
static inline void assert_same_shape(const tl_t *a, const tl_t *b)
{
  ASSERT(a->shape.n == b->shape.n);
  ASSERT(a->shape.c == b->shape.c);
  ASSERT(a->shape.h == b->shape.h);
  ASSERT(a->shape.w == b->shape.w);
}
|
||||
|
||||
static inline void assert_same_shape_3(
|
||||
const tl_t *a,
|
||||
const tl_t *b,
|
||||
const tl_t *c)
|
||||
{
|
||||
assert_same_shape(a, b);
|
||||
assert_same_shape(a, c);
|
||||
}
|
||||
|
||||
static inline void assert_same_shape_4(
|
||||
const tl_t *a,
|
||||
const tl_t *b,
|
||||
const tl_t *c,
|
||||
const tl_t *d)
|
||||
{
|
||||
assert_same_shape_3(a, b, c);
|
||||
assert_same_shape(a, d);
|
||||
}
|
||||
|
||||
static inline void assert_same_shape_5(
|
||||
const tl_t *t0,
|
||||
const tl_t *t1,
|
||||
const tl_t *t2,
|
||||
const tl_t *t3,
|
||||
const tl_t *t4)
|
||||
{
|
||||
assert_same_shape_3(t0, t1, t2);
|
||||
assert_same_shape_3(t0, t3, t4);
|
||||
}
|
||||
|
||||
static inline void assert_same_shape_6(
|
||||
const tl_t *t0,
|
||||
const tl_t *t1,
|
||||
const tl_t *t2,
|
||||
const tl_t *t3,
|
||||
const tl_t *t4,
|
||||
const tl_t *t5)
|
||||
{
|
||||
assert_same_shape_5(t0, t1, t2, t3, t4);
|
||||
assert_same_shape(t0, t5);
|
||||
}
|
||||
|
||||
|
||||
/* Validate that a tensor's dimensions fit the TIU descriptor fields:
 * all dims non-zero, n/c within 12 bits, h/w additionally reserving 32
 * (the lane count) below the 12-bit ceiling. */
static inline void assert_tiu_tensor_shape(const tl_t *t)
{
  ASSERT(t->shape.n > 0);
  ASSERT(t->shape.c > 0);
  ASSERT(t->shape.h > 0);
  ASSERT(t->shape.w > 0);

  ASSERT(t->shape.n < 0x1000);
  ASSERT(t->shape.c < 0x1000);
  ASSERT(t->shape.h <= (4095-32)); // 12bit, max 4095-32(lanes)
  ASSERT(t->shape.w <= (4095-32)); // 12bit, max 4095-32(lanes)
}
|
||||
|
||||
/* Validate a TIU operand: non-NULL, shape within descriptor limits,
 * and one of the formats the TIU accepts (i8/u8/bf16). */
static inline void check_tiu_tensor(const tl_t *t)
{
  ASSERT(t);
  assert_tiu_tensor_shape(t);
  ASSERT(t->fmt == FMT_I8 || t->fmt == FMT_U8 || t->fmt == FMT_BF16);
}
|
||||
|
||||
/* Validate two TIU operands (see check_tiu_tensor). */
static inline void check_tiu_tensor_2(
    const tl_t *t0,
    const tl_t *t1)
{
  check_tiu_tensor(t0);
  check_tiu_tensor(t1);
}
|
||||
|
||||
static inline void check_tiu_tensor_3(
|
||||
const tl_t *t0,
|
||||
const tl_t *t1,
|
||||
const tl_t *t2)
|
||||
{
|
||||
check_tiu_tensor(t0);
|
||||
check_tiu_tensor_2(t1, t2);
|
||||
}
|
||||
|
||||
static inline void check_tiu_tensor_4(
|
||||
const tl_t *t0,
|
||||
const tl_t *t1,
|
||||
const tl_t *t2,
|
||||
const tl_t *t3)
|
||||
{
|
||||
check_tiu_tensor_3(t0, t1, t2);
|
||||
check_tiu_tensor(t3);
|
||||
}
|
||||
|
||||
static inline void check_tiu_tensor_5(
|
||||
const tl_t *t0,
|
||||
const tl_t *t1,
|
||||
const tl_t *t2,
|
||||
const tl_t *t3,
|
||||
const tl_t *t4)
|
||||
{
|
||||
check_tiu_tensor_3(t0, t1, t2);
|
||||
check_tiu_tensor_2(t3, t4);
|
||||
}
|
||||
|
||||
static inline void check_tiu_tensor_6(
|
||||
const tl_t *t0,
|
||||
const tl_t *t1,
|
||||
const tl_t *t2,
|
||||
const tl_t *t3,
|
||||
const tl_t *t4,
|
||||
const tl_t *t5)
|
||||
{
|
||||
check_tiu_tensor_3(t0, t1, t2);
|
||||
check_tiu_tensor_3(t3, t4, t5);
|
||||
}
|
||||
|
||||
/* Validate a 16-bit value split across two 8-bit tensors: both halves
 * must be valid TIU operands with matching shape, stride and format,
 * and the low half must sit below the high half in local memory. */
static inline void check_16bit_tiu_tensor(const tl_t *low, const tl_t *high)
{
  check_tiu_tensor_2(low, high);
  assert_same_shape(low, high);
  assert_same_stride(low, high);
  ASSERT(low->fmt == high->fmt);
  ASSERT(low->start_address < high->start_address);
}
|
||||
|
||||
/* Assert a tensor uses "type 0" (default contiguous) strides: w-stride
 * equals the element size, h-stride equals one row of bytes, and
 * c-stride is that plane aligned up to the EU width. */
static inline void assert_stride_type_0(ctx_t *ctx, const tl_t *t)
{
  uint32_t eu_num = ctx->chip_info.eu_num;
  uint32_t fmt = (t->fmt == FMT_BF16) ? 2 : 1;  /* bytes per element */

  uint32_t h = t->shape.h;
  uint32_t w = t->shape.w * fmt;
  uint32_t c_stride = align_up(h * w, eu_num);

  ASSERT(t->stride.c == c_stride);
  ASSERT(t->stride.h == w);
  ASSERT(t->stride.w == fmt);
}
|
||||
|
||||
/* Relaxed "type 0" stride check used on the bf16 path: only requires the
 * c-stride to be EU-aligned and the w-stride to equal the element size
 * (no exact c/h stride values, unlike assert_stride_type_0). */
static inline void assert_bf16_stride_type_0(ctx_t *ctx, const tl_t *t)
{
  uint32_t eu_num = ctx->chip_info.eu_num;
  uint32_t fmt = (t->fmt == FMT_BF16) ? 2 : 1;  /* bytes per element */

  ASSERT(t->stride.c % eu_num == 0);
  ASSERT(t->stride.w == fmt);
}
|
||||
|
||||
|
||||
/* Assert a tensor uses "type 2" strides: a per-channel (1x1 spatial)
 * parameter layout where c/h/w strides are one element and the n-stride
 * spans the channels distributed over the NPU lanes. */
static inline void assert_stride_type_2(ctx_t *ctx, const tl_t *t)
{
  ASSERT(t->shape.h == 1);
  ASSERT(t->shape.w == 1);

  uint32_t fmt = (t->fmt == FMT_BF16) ? 2 : 1;  /* bytes per element */
  uint32_t c = t->shape.c;
  uint32_t npu_num = ctx->chip_info.npu_num;

  ASSERT(t->stride.n == fmt * align_up(c, npu_num) / npu_num);
  ASSERT(t->stride.c == 1 * fmt);
  ASSERT(t->stride.h == 1 * fmt);
  ASSERT(t->stride.w == 1 * fmt);
}
|
||||
|
||||
/* bf16-path variant of assert_stride_type_2.  NOTE(review): the body is
 * currently identical to assert_stride_type_2; kept separate so the two
 * paths can diverge independently. */
static inline void assert_bf16_stride_type_2(ctx_t *ctx, const tl_t *t)
{
  ASSERT(t->shape.h == 1);
  ASSERT(t->shape.w == 1);

  uint32_t fmt = (t->fmt == FMT_BF16) ? 2 : 1;  /* bytes per element */
  uint32_t c = t->shape.c;
  uint32_t npu_num = ctx->chip_info.npu_num;

  ASSERT(t->stride.n == fmt * align_up(c, npu_num) / npu_num);
  ASSERT(t->stride.c == 1 * fmt);
  ASSERT(t->stride.h == 1 * fmt);
  ASSERT(t->stride.w == 1 * fmt);
}
|
||||
|
||||
static inline int tensor_is_signed(const tl_t *t)
|
||||
{
|
||||
switch (t->fmt) {
|
||||
case FMT_I8:
|
||||
return 1;
|
||||
case FMT_U8:
|
||||
case FMT_BF16: //does not matter, so set to default 0
|
||||
return 0;
|
||||
default:
|
||||
ASSERT(0);
|
||||
}
|
||||
}
|
||||
|
||||
static inline int matrix_is_signed(const ml_t *t)
|
||||
{
|
||||
switch (t->fmt) {
|
||||
case FMT_I8:
|
||||
return 1;
|
||||
case FMT_U8:
|
||||
case FMT_BF16: //does not matter, so set to default 0
|
||||
return 0;
|
||||
default:
|
||||
ASSERT(0);
|
||||
}
|
||||
}
|
||||
|
||||
/* Broadcast one n/c/h/w shape into all four TIU descriptor slots
 * (operands 0-2 and result 0). */
static inline void fill_same_tensor_shape(tiu_reg_t *r, tl_shape_t s)
{
  r->opd0_n = s.n;
  r->opd0_c = s.c;
  r->opd0_h = s.h;
  r->opd0_w = s.w;

  r->opd1_n = s.n;
  r->opd1_c = s.c;
  r->opd1_h = s.h;
  r->opd1_w = s.w;

  r->opd2_n = s.n;
  r->opd2_c = s.c;
  r->opd2_h = s.h;
  r->opd2_w = s.w;

  r->res0_n = s.n;
  r->res0_c = s.c;
  r->res0_h = s.h;
  r->res0_w = s.w;
}
|
||||
|
||||
/* Assert n/c/h strides fit the 16-bit descriptor stride fields
 * (w-stride is not checked here). */
static inline void assert_stride_range(tl_stride_t s)
{
  ASSERT(s.n < 0x10000);
  ASSERT(s.c < 0x10000);
  ASSERT(s.h < 0x10000);
}
|
||||
|
||||
/* Broadcast one n/c/h stride set into all four TIU descriptor slots;
 * the w-stride is hardwired to 1 (element-contiguous). */
static inline void fill_same_tensor_stride(tiu_reg_t *r, tl_stride_t s)
{
  const uint32_t w_str = 1;

  r->opd0_n_str = s.n;
  r->opd0_c_str = s.c;
  r->opd0_h_str = s.h;
  r->opd0_w_str = w_str;

  r->opd1_n_str = s.n;
  r->opd1_c_str = s.c;
  r->opd1_h_str = s.h;
  r->opd1_w_str = w_str;

  r->opd2_n_str = s.n;
  r->opd2_c_str = s.c;
  r->opd2_h_str = s.h;
  r->opd2_w_str = w_str;

  r->res0_n_str = s.n;
  r->res0_c_str = s.c;
  r->res0_h_str = s.h;
  r->res0_w_str = w_str;
}
|
||||
|
||||
/* Copy a tl_stride_t into the n/c/h/w stride fields of descriptor slot
 * `op` (opd0/opd1/opd2/res0) via token pasting. */
#define fill_stride_code(r, op, str) \
  do { \
    r->op##_n_str = str->n; \
    r->op##_c_str = str->c; \
    r->op##_h_str = str->h; \
    r->op##_w_str = str->w; \
  } while (0)
|
||||
|
||||
/* Set operand-0 strides from str. */
static inline void fill_opd0_stride(tiu_reg_t *r, const tl_stride_t *str)
{
  fill_stride_code(r, opd0, str);
}
|
||||
|
||||
/* Set operand-1 strides from str. */
static inline void fill_opd1_stride(tiu_reg_t *r, const tl_stride_t *str)
{
  fill_stride_code(r, opd1, str);
}
|
||||
|
||||
/* Set operand-2 strides from str. */
static inline void fill_opd2_stride(tiu_reg_t *r, const tl_stride_t *str)
{
  fill_stride_code(r, opd2, str);
}
|
||||
|
||||
/* Set result-0 strides from str. */
static inline void fill_res0_stride(tiu_reg_t *r, const tl_stride_t *str)
{
  fill_stride_code(r, res0, str);
}
|
||||
|
||||
/* Apply one stride-mode code to all four descriptor slots; only the low
 * two bits of `type` are meaningful. */
static inline void fill_same_tensor_stride_type(tiu_reg_t *r, int type)
{
  int mode = type & 0b11;

  r->short_opd0_str = mode;
  r->short_opd1_str = mode;
  r->short_opd2_str = mode;
  r->short_res0_str = mode;
}
|
||||
|
||||
/* Serialize a TIU register image into the next command-buffer slot for
 * the TIU engine and return its engine-control descriptor. */
static inline ec_desc_t * emit_tiu_cmdbuf(ctx_t *k, tiu_reg_t *r)
{
  desc_pair_t *pair = bm1822_get_desc_pair(k, BMK1822_TIU);
  uint32_t *cmd_words = (uint32_t *)pair->cmd_hdr->cmd;

  emit_tiu_reg(r, cmd_words);
  return pair->ec_desc;
}
|
||||
|
||||
#endif /* KERNEL_1822_H */
|
||||
1977
cvikernel/src/bm1822/tdma.c
Normal file
1977
cvikernel/src/bm1822/tdma.c
Normal file
File diff suppressed because it is too large
Load Diff
90
cvikernel/src/bm1822/tiu_average_pooling.c
Normal file
90
cvikernel/src/bm1822/tiu_average_pooling.c
Normal file
@ -0,0 +1,90 @@
|
||||
#include "kernel_1822.h"
|
||||
#include <bmkernel/bm1822/1822_fp_convert.h>
|
||||
|
||||
/*
 * Program a TIU depthwise-pool descriptor that averages kh x kw windows
 * of ifmap into ofmap.  The hardware has no divider, so the averaging
 * factor is folded into the constant "weight" operand (opd1) on the
 * bf16 path; on the int8 path the caller pre-scales avg_pooling_const.
 * Returns the emitted engine-control descriptor.
 */
bmk1822_op_t * bmk1822_tiu_average_pooling(
    ctx_t *ctx,
    const bmk1822_tiu_average_pooling_param_t *p)
{
  int bf16_enable = (p->ifmap->fmt == FMT_BF16) ? 1 : 0;

  /* Pooling preserves n/c; pads, strides and insertions must fit the
   * descriptor bit-fields. */
  ASSERT(p->ifmap->shape.n == p->ofmap->shape.n);
  ASSERT(p->ifmap->shape.c == p->ofmap->shape.c);
  ASSERT(p->stride_h < 32 && p->stride_h > 0);
  ASSERT(p->stride_w < 32 && p->stride_w > 0);
  ASSERT(p->pad_top < 16);
  ASSERT(p->pad_bottom < 16);
  ASSERT(p->pad_left < 16);
  ASSERT(p->pad_right < 16);
  ASSERT(p->ins_h < 15);
  ASSERT(p->ins_last_h < 15);
  ASSERT(p->ins_w < 15);
  ASSERT(p->ins_last_w < 15);

  check_tiu_tensor_2(p->ifmap, p->ofmap);
  if (bf16_enable) {
    assert_bf16_stride_type_0(ctx, p->ifmap);
    assert_bf16_stride_type_0(ctx, p->ofmap);
  } else {
    assert_stride_type_0(ctx, p->ifmap);
    assert_stride_type_0(ctx, p->ofmap);
  }

  int opd0_sign = tensor_is_signed(p->ifmap);

  tiu_reg_t reg;
  reset_tiu_reg(&reg);
  reg.cmd_en = 1;
  reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B;
  reg.tsk_eu_typ = 1;  /* 1 = average pooling within the pool unit */
  reg.opt_shift_typ = opd0_sign;
  reg.opt_res_shift = p->rshift_bits;
  reg.opt_relu_typ = 0; /* hardware relu function not verified. */
  reg.tsk_opd_num = 2;
  reg.opd_typ = bf16_enable ? 1: 0;

  /* Result (ofmap). */
  reg.res0_addr = p->ofmap->start_address;
  reg.opt_res0_sign = opd0_sign;
  reg.opt_res0_seg = 1;
  reg.res0_n = p->ofmap->shape.n;
  reg.res0_c = p->ofmap->shape.c;
  reg.res0_h = p->ofmap->shape.h;
  reg.res0_w = p->ofmap->shape.w;

  /* Operand 0 (ifmap) with padding and insertion configuration. */
  reg.opd0_addr = p->ifmap->start_address;
  reg.opt_opd0_sign = opd0_sign;
  reg.opt_opd0_seg = 1;
  reg.opd0_n = p->ifmap->shape.n;
  reg.opd0_c = p->ifmap->shape.c;
  reg.opd0_h = p->ifmap->shape.h;
  reg.opd0_w = p->ifmap->shape.w;
  reg.conv_opd0_up_pad = p->pad_top;
  reg.conv_opd0_dn_pad = p->pad_bottom;
  reg.conv_opd0_lf_pad = p->pad_left;
  reg.conv_opd0_rt_pad = p->pad_right;
  reg.conv_opd0_x_ins0 = p->ins_w;
  reg.conv_opd0_y_ins0 = p->ins_h;
  reg.conv_opd0_x_ins0_last = p->ins_last_w;
  reg.conv_opd0_y_ins0_last = p->ins_last_h;

  reg.opt_opd1_const = 1;
  /* HW does not have divide; fold 1/(kh*kw) into the constant here. */
  if (bf16_enable)
    reg.opd1_addr =
      convert_fp32_bf16(
        (float)(convert_bf16_fp32(p->avg_pooling_const) / (p->kh * p->kw)));
  else
    reg.opd1_addr = p->avg_pooling_const;

  reg.opd1_h = p->kh;
  reg.opd1_w = p->kw;
  reg.opt_opd1_sign = 0;
  reg.opt_opd1_seg = 1;
  reg.conv_op_x_str = p->stride_w;
  reg.conv_op_y_str = p->stride_h;
  /* Value filled into inserted/padded positions. */
  reg.opd0_ins_val = bf16_enable ?
                      (uint32_t)p->ins_fp : (uint32_t)p->ins_val;

  /* [15:0] layer id */
  reg.layer_info = p->layer_id;
  return emit_tiu_cmdbuf(ctx, &reg);
}
|
||||
176
cvikernel/src/bm1822/tiu_convolution.c
Normal file
176
cvikernel/src/bm1822/tiu_convolution.c
Normal file
@ -0,0 +1,176 @@
|
||||
#include "kernel_1822.h"
|
||||
|
||||
typedef bmk1822_tiu_convolution_param_t param_t;
|
||||
|
||||
/* Decide whether the int8 double-conv fast path applies: requires an
 * even lmem offset for ifmap, an even channel count of at least 4, an
 * even weight address, and a non-bf16 ifmap.  Pure predicate — no side
 * effects. */
static int can_do_double_conv(ctx_t *ctx, const param_t *p)
{
  /* Double-conv is an int8-only fast path. */
  if (p->ifmap->fmt == FMT_BF16)
    return 0;
  if ((p->ifmap->start_address % ctx->chip_info.lmem_size) % 2 != 0)
    return 0;
  if (p->ifmap->shape.c % 2 != 0 || p->ifmap->shape.c < 4)
    return 0;
  if (p->weight->start_address % 2 != 0)
    return 0;
  return 1;
}
|
||||
|
||||
/* Validate every field of a convolution request before programming the
 * descriptor: operand formats/strides, alignment, shape agreement
 * between ifmap/weight/ofmap, double-conv preconditions, ps32
 * intermediate-mode restrictions, and descriptor bit-field ranges. */
static void check_conv_param(ctx_t *ctx, const param_t *p)
{
  uint32_t eu_num = ctx->chip_info.eu_num;
  uint8_t bf16_enable = (p->ifmap->fmt == FMT_BF16) ? 1 : 0;

  check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight);
  if (bf16_enable) {
    assert_bf16_stride_type_0(ctx, p->ifmap);
  } else {
    assert_stride_type_0(ctx, p->ifmap);
  }
  //assert_stride_type_1(ctx, p->weight);
  if (p->bias) {
    /* Bias travels as a per-channel (type 2) 16-bit parameter. */
    check_tiu_tensor(p->bias);
    if (bf16_enable)
      assert_bf16_stride_type_2(ctx, p->bias);
    else
      assert_stride_type_2(ctx, p->bias);
  }

  // n stride must align 16B
  ASSERT((p->ofmap->stride.n % 16) == 0);

  ASSERT(p->ifmap->start_address % eu_num == 0);
  ASSERT(p->ofmap->start_address % eu_num == 0);
  ASSERT(p->ifmap->shape.n == p->ofmap->shape.n);
  /* Row insertion is meaningless on a single-row input. */
  ASSERT(!(p->ifmap->shape.h == 1 && p->ins_h > 0));
  ASSERT(p->weight->shape.n == p->ifmap->shape.c);
  ASSERT(p->weight->shape.c == p->ofmap->shape.c);
  if (can_do_double_conv(ctx, p)) {
    uint32_t lmem_i = p->ifmap->start_address % ctx->chip_info.lmem_size;
    ASSERT(lmem_i % 2 == 0);
    ASSERT(p->ifmap->shape.c % 2 == 0);
    ASSERT(p->ifmap->shape.c >= 4); /* Otherwise performance will suffer */
    ASSERT(p->weight->start_address % 2 == 0);
  }
  /* ps32 intermediate mode (bit 1): no post-processing allowed. */
  if(p->ps32_mode & 0x2)
  {
    ASSERT(!p->relu_enable);
    ASSERT(!p->bias);
    ASSERT(!p->rshift_bits);

    ASSERT(p->cmd_pre_exe <= 1);
  }
  /* Descriptor bit-field ranges. */
  ASSERT(p->stride_h < 32 && p->stride_h > 0);
  ASSERT(p->stride_w < 32 && p->stride_w > 0);
  ASSERT(p->pad_top < 16);
  ASSERT(p->pad_bottom < 16);
  ASSERT(p->pad_left < 16);
  ASSERT(p->pad_right < 16);
  ASSERT(p->ins_h < 15);
  ASSERT(p->ins_last_h < 15);
  ASSERT(p->ins_w < 15);
  ASSERT(p->ins_last_w < 15);
  ASSERT(p->dilation_h >= 1);
  ASSERT(p->dilation_w >= 1);
  ASSERT(p->relu_enable == 0 || p->relu_enable == 1);
}
|
||||
|
||||
/*
 * Program a TIU convolution descriptor (ifmap * weight [+ bias] ->
 * ofmap) with optional relu, right-shift, ps32 accumulation and the
 * double-conv fast path.  Returns the emitted engine-control
 * descriptor.
 */
bmk1822_op_t * bmk1822_tiu_convolution(ctx_t *ctx, const param_t *p)
{
  check_conv_param(ctx, p);

  uint32_t npu_num = ctx->chip_info.npu_num;
  int opd0_sign = tensor_is_signed(p->ifmap);
  int opd1_sign = tensor_is_signed(p->weight);
  /* Bias absent => treated as signed for the shift-type decision. */
  int opd2_sign = p->bias? tensor_is_signed(p->bias): 1;
  int arith_shift = opd0_sign || opd1_sign || opd2_sign;
  int bf16_enable = (p->ifmap->fmt == FMT_BF16) ? 1 : 0;

  tiu_reg_t reg;
  reset_tiu_reg(&reg);
  reg.cmd_en = 1;
  reg.tsk_typ = DCR_TYPE_CONV_FIX8B;
  reg.opt_shift_typ = arith_shift;
  reg.opt_res_shift = p->rshift_bits;
  reg.opt_relu_typ = !!(p->relu_enable);
  reg.tsk_opd_num = 2;

  reg.opd_typ = bf16_enable;

  /* Always automatically enable double conv in those situations. */
  if (can_do_double_conv(ctx, p))
    reg.double_conv = 1;

  /* Result (ofmap) with explicit strides. */
  reg.res0_addr = p->ofmap->start_address;
  reg.opt_res0_sign = tensor_is_signed(p->ofmap);
  reg.opt_res0_seg = 1;
  reg.res0_n = p->ofmap->shape.n;
  reg.res0_c = p->ofmap->shape.c;
  reg.res0_h = p->ofmap->shape.h;
  reg.res0_w = p->ofmap->shape.w;
  reg.res0_n_str = p->ofmap->stride.n;
  reg.res0_c_str = p->ofmap->stride.c;
  reg.res0_h_str = p->ofmap->stride.h;
  reg.res0_w_str = p->ofmap->stride.w;
  reg.short_res0_str = 3; // Manual instead of h/w
  reg.ps32_md = p->ps32_mode;
  if (p->ps32_mode > 0)
    /* Batch stride for the 32-bit partial-sum buffer. */
    reg.res0_b_str = p->ofmap->shape.n * p->ofmap->stride.n;

  /* Operand 0 (ifmap) with pads and insertions. */
  reg.opd0_addr = p->ifmap->start_address;
  reg.opt_opd0_sign = opd0_sign;
  reg.opt_opd0_seg = 1;
  reg.opd0_n = p->ifmap->shape.n;
  reg.opd0_c = p->ifmap->shape.c;
  reg.opd0_h = p->ifmap->shape.h;
  reg.opd0_w = p->ifmap->shape.w;
  reg.short_opd0_str = 0;
  reg.conv_opd0_up_pad = p->pad_top;
  reg.conv_opd0_dn_pad = p->pad_bottom;
  reg.conv_opd0_lf_pad = p->pad_left;
  reg.conv_opd0_rt_pad = p->pad_right;
  reg.conv_opd0_x_ins0 = p->ins_w;
  reg.conv_opd0_y_ins0 = p->ins_h;
  reg.conv_opd0_x_ins0_last = p->ins_last_w;
  reg.conv_opd0_y_ins0_last = p->ins_last_h;

  /* Operand 1 (weight); dilation is expressed as weight insertion. */
  reg.opd1_addr = p->weight->start_address;
  reg.opt_opd1_sign = opd1_sign;
  reg.opt_opd1_seg = 1;
  reg.opt_opd1_const = p->w_is_const;
  reg.opd1_n = p->weight->shape.n;
  reg.opd1_c = p->weight->shape.c;
  reg.opd1_h = p->weight->shape.h;
  reg.opd1_w = p->weight->shape.w;
  reg.short_opd1_str = 1;
  reg.conv_opd1_x_ins0 = p->dilation_w - 1;
  reg.conv_opd1_y_ins0 = p->dilation_h - 1;
  reg.conv_op_x_str = p->stride_w;
  reg.conv_op_y_str = p->stride_h;
  reg.opd0_ins_val = bf16_enable ?
                      (uint32_t)p->ins_fp : (uint32_t)p->ins_val;

  /* Operand 2 (optional 16-bit per-channel bias, shape 2 x c x 1 x 1). */
  if (p->bias) {
    ASSERT(p->bias->shape.n == 2);
    ASSERT(p->bias->shape.c == p->ofmap->shape.c);
    ASSERT(p->bias->shape.h == 1);
    ASSERT(p->bias->shape.w == 1);

    reg.tsk_opd_num = 3;
    reg.opt_opd2_sign = opd2_sign;
    reg.opt_opd2_seg = 0;
    reg.opd2_addr = p->bias->start_address;
    reg.opd2_n = 1;
    reg.opd2_c = p->bias->shape.c;
    reg.opd2_h = 1;
    reg.opd2_w = 1;
    reg.short_opd2_str = 2;
    reg.opd2_b_str = ceiling_func(p->bias->shape.c, npu_num) * (bf16_enable ? 2 : 1);
  }

  reg.layer_info = p->layer_id;

  reg.cmd_pre_exe_typ = p->cmd_pre_exe_typ;
  reg.cmd_pre_exe = p->cmd_pre_exe;

  return emit_tiu_cmdbuf(ctx, &reg);
}
|
||||
166
cvikernel/src/bm1822/tiu_convolution_qdm.c
Normal file
166
cvikernel/src/bm1822/tiu_convolution_qdm.c
Normal file
@ -0,0 +1,166 @@
|
||||
#include "kernel_1822.h"
|
||||
|
||||
typedef bmk1822_tiu_convolution_qdm_param_t param_t;
|
||||
|
||||
/* Decide whether the double-conv fast path applies on the qdm
 * (per-channel quantization) path: even lmem offset for ifmap, even
 * channel count of at least 4, even weight address.  Pure predicate. */
static int can_do_double_conv(ctx_t *ctx, const param_t *p)
{
  if ((p->ifmap->start_address % ctx->chip_info.lmem_size) % 2 != 0)
    return 0;
  if (p->ifmap->shape.c % 2 != 0 || p->ifmap->shape.c < 4)
    return 0;
  if (p->weight->start_address % 2 != 0)
    return 0;
  return 1;
}
|
||||
|
||||
/* Validate a per-channel-quantized (qdm) convolution request:
 * operand shapes/strides/alignment, optional chl_quan_param layout,
 * double-conv preconditions, ps32 restrictions, and descriptor
 * bit-field ranges. */
static void check_conv_param(ctx_t *ctx, const param_t *p)
{
  uint32_t eu_num = ctx->chip_info.eu_num;

  check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight);
  assert_stride_type_0(ctx, p->ifmap);

  ASSERT((p->ofmap->stride.n % eu_num) == 0);
  ASSERT(p->ifmap->start_address % eu_num == 0);
  ASSERT(p->ofmap->start_address % eu_num == 0);
  ASSERT(p->ifmap->shape.n == p->ofmap->shape.n);
  /* Row insertion is meaningless on a single-row input. */
  ASSERT(!(p->ifmap->shape.h == 1 && p->ins_h > 0));
  ASSERT(p->weight->shape.n == p->ifmap->shape.c);
  ASSERT(p->weight->shape.c == p->ofmap->shape.c);

  /* Per-channel quantization parameter uses the type-2 (1x1) layout. */
  if (p->chl_quan_param) {
    check_tiu_tensor(p->chl_quan_param);
    assert_stride_type_2(ctx, p->chl_quan_param);
    ASSERT(p->chl_quan_param->start_address % eu_num == 0);
  }
  if (can_do_double_conv(ctx, p)) {
    uint32_t lmem_i = p->ifmap->start_address % ctx->chip_info.lmem_size;
    ASSERT(lmem_i % 2 == 0);
    ASSERT(p->ifmap->shape.c % 2 == 0);
    ASSERT(p->ifmap->shape.c >= 4); /* Otherwise performance will suffer */
    ASSERT(p->weight->start_address % 2 == 0);
  }
  /* ps32 intermediate mode (bit 1): no post-processing allowed. */
  if(p->ps32_mode & 0x2)
  {
    ASSERT(!p->relu_enable);
    ASSERT(!p->has_bias);

    ASSERT(p->cmd_pre_exe <= 1);
  }
  /* Descriptor bit-field ranges. */
  ASSERT(p->stride_h < 32 && p->stride_h > 0);
  ASSERT(p->stride_w < 32 && p->stride_w > 0);
  ASSERT(p->pad_top < 16);
  ASSERT(p->pad_bottom < 16);
  ASSERT(p->pad_left < 16);
  ASSERT(p->pad_right < 16);
  ASSERT(p->ins_h < 15);
  ASSERT(p->ins_last_h < 15);
  ASSERT(p->ins_w < 15);
  ASSERT(p->ins_last_w < 15);
  ASSERT(p->dilation_h >= 1);
  ASSERT(p->dilation_w >= 1);
  ASSERT(p->relu_enable == 0 || p->relu_enable == 1);
}
|
||||
|
||||
/*
 * Program a TIU convolution descriptor with per-channel quantization
 * (multiplier/shift supplied via chl_quan_param as operand 2) instead
 * of a single global right-shift.  Returns the emitted engine-control
 * descriptor.
 */
bmk1822_op_t * bmk1822_tiu_convolution_qdm(ctx_t *ctx, const param_t *p)
{
  check_conv_param(ctx, p);

  int opd0_sign = tensor_is_signed(p->ifmap);
  int opd1_sign = tensor_is_signed(p->weight);
  int arith_shift = opd0_sign || opd1_sign;

  tiu_reg_t reg;
  reset_tiu_reg(&reg);
  reg.cmd_en = 1;
  reg.tsk_typ = DCR_TYPE_CONV_FIX8B;
  reg.opt_shift_typ = arith_shift;
  reg.opt_relu_typ = !!(p->relu_enable);
  reg.tsk_opd_num = 2;

  /* Always automatically enable double conv in those situations. */
  if (can_do_double_conv(ctx, p))
    reg.double_conv = 1;

  /* Result (ofmap) with explicit strides. */
  reg.res0_addr = p->ofmap->start_address;
  reg.opt_res0_sign = tensor_is_signed(p->ofmap);
  reg.opt_res0_seg = 1;
  reg.res0_n = p->ofmap->shape.n;
  reg.res0_c = p->ofmap->shape.c;
  reg.res0_h = p->ofmap->shape.h;
  reg.res0_w = p->ofmap->shape.w;
  reg.res0_n_str = p->ofmap->stride.n;
  reg.res0_c_str = p->ofmap->stride.c;
  reg.res0_h_str = p->ofmap->stride.h;
  reg.res0_w_str = p->ofmap->stride.w;
  reg.short_res0_str = 3; // Manual instead of h/w
  reg.ps32_md = p->ps32_mode;
  if (p->ps32_mode > 0) {
    /* Batch stride for the 32-bit partial-sum buffer. */
    reg.res0_b_str = p->ofmap->shape.n * p->ofmap->stride.n;

    // The per-channel parameter does not carry a right shift
    // (default is 10).  Set zero.
    reg.opt_res_shift = 0;
  }

  /* Operand 0 (ifmap) with pads and insertions. */
  reg.opd0_addr = p->ifmap->start_address;
  reg.opt_opd0_sign = opd0_sign;
  reg.opt_opd0_seg = 1;
  reg.opd0_n = p->ifmap->shape.n;
  reg.opd0_c = p->ifmap->shape.c;
  reg.opd0_h = p->ifmap->shape.h;
  reg.opd0_w = p->ifmap->shape.w;
  reg.opd0_ins_val = (uint32_t)p->ins_val;
  reg.short_opd0_str = 0;
  reg.conv_opd0_up_pad = p->pad_top;
  reg.conv_opd0_dn_pad = p->pad_bottom;
  reg.conv_opd0_lf_pad = p->pad_left;
  reg.conv_opd0_rt_pad = p->pad_right;
  reg.conv_opd0_x_ins0 = p->ins_w;
  reg.conv_opd0_y_ins0 = p->ins_h;
  reg.conv_opd0_x_ins0_last = p->ins_last_w;
  reg.conv_opd0_y_ins0_last = p->ins_last_h;

  /* Operand 1 (weight); dilation is expressed as weight insertion. */
  reg.opd1_addr = p->weight->start_address;
  reg.opt_opd1_sign = opd1_sign;
  reg.opt_opd1_seg = 1;
  reg.opt_opd1_const = p->w_is_const;
  reg.opd1_n = p->weight->shape.n;
  reg.opd1_c = p->weight->shape.c;
  reg.opd1_h = p->weight->shape.h;
  reg.opd1_w = p->weight->shape.w;
  reg.short_opd1_str = 1;
  reg.conv_opd1_x_ins0 = p->dilation_w - 1;
  reg.conv_opd1_y_ins0 = p->dilation_h - 1;
  reg.conv_op_x_str = p->stride_w;
  reg.conv_op_y_str = p->stride_h;

  /* Operand 2: per-channel quantization table (1 x c x 1 x 1). */
  if (p->chl_quan_param) {
    ASSERT(p->chl_quan_param->shape.n == 1);
    ASSERT(p->chl_quan_param->shape.c == p->ofmap->shape.c);
    ASSERT(p->chl_quan_param->shape.h == 1);
    ASSERT(p->chl_quan_param->shape.w == 1);
    reg.opt_chl_quan = 1;
    reg.opt_res_shift = 0; // useless
    reg.opd2_addr = p->chl_quan_param->start_address;
    reg.opd2_n = p->chl_quan_param->shape.n;
    reg.opd2_c = p->chl_quan_param->shape.c;
    reg.opd2_h = p->chl_quan_param->shape.h;
    reg.opd2_w = p->chl_quan_param->shape.w;
  }
  reg.opt_opd2_seg = 1; // useless, force to 1 to skip b_stride check
  reg.short_opd2_str = 2; // useless
  reg.opd2_b_str = 0; // useless

  /* Bias (when present) rides inside the per-channel table. */
  if (p->has_bias) {
    reg.tsk_opd_num = 3;
    reg.opt_opd2_sign = 1;
  }

  reg.layer_info = p->layer_id;

  reg.cmd_pre_exe_typ = p->cmd_pre_exe_typ;
  reg.cmd_pre_exe = p->cmd_pre_exe;

  return emit_tiu_cmdbuf(ctx, &reg);
}
|
||||
152
cvikernel/src/bm1822/tiu_depthwise_convolution.c
Normal file
152
cvikernel/src/bm1822/tiu_depthwise_convolution.c
Normal file
@ -0,0 +1,152 @@
|
||||
#include "kernel_1822.h"
|
||||
|
||||
/*
 * Program a TIU depthwise convolution descriptor: each channel of ifmap
 * is convolved with its own kh x kw filter (or with a scalar constant
 * when weight_is_const is set), with optional 16-bit bias, relu and
 * right-shift.  Returns the emitted engine-control descriptor.
 */
bmk1822_op_t * bmk1822_tiu_depthwise_convolution(
    ctx_t *ctx,
    const bmk1822_tiu_depthwise_convolution_param_t *p)
{
  int bf16_enable = (p->ifmap->fmt == FMT_BF16) ? 1 : 0;

  bool isMulConst = (p->weight_is_const == 1) ? 1 : 0;

  if(isMulConst) {
    check_tiu_tensor_2(p->ifmap, p->ofmap);
  } else {
    check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight);
  }
  if (bf16_enable) {
    assert_bf16_stride_type_0(ctx, p->ifmap);
    if(!isMulConst)
      assert_bf16_stride_type_0(ctx, p->weight);
    if (p->bias) {
      check_tiu_tensor(p->bias);
      assert_bf16_stride_type_2(ctx, p->bias);
    }
  } else {
    assert_stride_type_0(ctx, p->ifmap);
    if(!isMulConst)
      assert_stride_type_0(ctx, p->weight);
    if (p->bias) {
      check_tiu_tensor(p->bias);
      assert_stride_type_2(ctx, p->bias);
    }
  }

  // n stride must align 16B
  ASSERT((p->ofmap->stride.n % 16) == 0);

  ASSERT(p->ifmap->shape.n == p->ofmap->shape.n);
  ASSERT(p->ifmap->shape.c == p->ofmap->shape.c);
  if(!isMulConst){
    ASSERT(p->ifmap->shape.c == p->weight->shape.c);
    ASSERT(p->weight->shape.n == 1);
  }
  /* Descriptor bit-field ranges. */
  ASSERT(p->relu_enable == 0 || p->relu_enable == 1);
  ASSERT(p->stride_h < 32 && p->stride_h > 0);
  ASSERT(p->stride_w < 32 && p->stride_w > 0);
  ASSERT(p->pad_top < 16);
  ASSERT(p->pad_bottom < 16);
  ASSERT(p->pad_left < 16);
  ASSERT(p->pad_right < 16);
  ASSERT(p->ins_h < 15);
  ASSERT(p->ins_last_h < 15);
  ASSERT(p->ins_w < 15);
  ASSERT(p->ins_last_w < 15);
  ASSERT(p->dilation_h >= 1);
  ASSERT(p->dilation_w >= 1);

  int opd0_sign = tensor_is_signed(p->ifmap);

  tiu_reg_t reg;
  reset_tiu_reg(&reg);
  reg.cmd_en = 1;
  reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B;
  reg.tsk_eu_typ = 2;  /* 2 = depthwise convolution within the pool unit */
  reg.opt_relu_typ = p->relu_enable;
  reg.opt_shift_typ = 1;
  reg.opt_res_shift = p->rshift_bits;
  reg.tsk_opd_num = 2;
  reg.opd_typ = bf16_enable ? 1: 0;

  /* Result (ofmap) with explicit strides. */
  int res0_sign = tensor_is_signed(p->ofmap);
  reg.res0_addr = p->ofmap->start_address;
  reg.opt_res0_sign = res0_sign;
  reg.opt_res0_seg = 1;
  reg.res0_n = p->ofmap->shape.n;
  reg.res0_c = p->ofmap->shape.c;
  reg.res0_h = p->ofmap->shape.h;
  reg.res0_w = p->ofmap->shape.w;
  reg.res0_n_str = p->ofmap->stride.n;
  reg.res0_c_str = p->ofmap->stride.c;
  reg.res0_h_str = p->ofmap->stride.h;
  reg.res0_w_str = p->ofmap->stride.w;
  reg.short_res0_str = 3; // Manual instead of h/w

  /* Operand 0 (ifmap) with explicit strides, pads and insertions. */
  reg.opd0_addr = p->ifmap->start_address;
  reg.opt_opd0_sign = opd0_sign;
  reg.opt_opd0_seg = 1;
  reg.opd0_n = p->ifmap->shape.n;
  reg.opd0_c = p->ifmap->shape.c;
  reg.opd0_h = p->ifmap->shape.h;
  reg.opd0_w = p->ifmap->shape.w;
  reg.opd0_n_str = p->ifmap->stride.n;
  reg.opd0_c_str = p->ifmap->stride.c;
  reg.opd0_h_str = p->ifmap->stride.h;
  reg.opd0_w_str = p->ifmap->stride.w;
  reg.short_opd0_str = 3; // Manual instead of h/w
  reg.conv_opd0_up_pad = p->pad_top;
  reg.conv_opd0_dn_pad = p->pad_bottom;
  reg.conv_opd0_lf_pad = p->pad_left;
  reg.conv_opd0_rt_pad = p->pad_right;
  reg.conv_opd0_x_ins0 = p->ins_w;
  reg.conv_opd0_y_ins0 = p->ins_h;
  reg.conv_opd0_x_ins0_last = p->ins_last_w;
  reg.conv_opd0_y_ins0_last = p->ins_last_h;

  /* Operand 1 (weight or scalar constant); dilation via insertion. */
  reg.opt_opd1_sign = 1;
  reg.opt_opd1_seg = 1;
  reg.conv_opd1_x_ins0 = p->dilation_w - 1;
  reg.conv_opd1_y_ins0 = p->dilation_h - 1;
  if (isMulConst) {
    /* NOTE(review): this branch still reads p->weight->shape for the
     * window size even though the value comes from weight_const —
     * p->weight must be non-NULL in const mode; confirm with callers. */
    reg.opt_opd1_const = 1;
    reg.opt_opd1_sign = p->weight_const.is_signed;
    reg.opd1_addr = p->weight_const.val;
    reg.opd1_n = p->weight->shape.n;
    reg.opd1_c = p->weight->shape.c;
    reg.opd1_h = p->weight->shape.h;
    reg.opd1_w = p->weight->shape.w;
  } else {
    reg.opd1_addr = p->weight->start_address;
    reg.opd1_n = p->weight->shape.n;
    reg.opd1_c = p->weight->shape.c;
    reg.opd1_h = p->weight->shape.h;
    reg.opd1_w = p->weight->shape.w;
  }
  reg.conv_op_x_str = p->stride_w;
  reg.conv_op_y_str = p->stride_h;
  reg.opd0_ins_val = bf16_enable ?
                      (uint32_t)p->ins_fp : (uint32_t)p->ins_val;

  /* Operand 2 (optional 16-bit per-channel bias, shape 2 x c x 1 x 1). */
  if (p->bias) {
    ASSERT(p->bias->shape.n == 2);
    ASSERT(p->bias->shape.c == p->ofmap->shape.c);
    ASSERT(p->bias->shape.h == 1);
    ASSERT(p->bias->shape.w == 1);

    reg.tsk_opd_num = 3;
    reg.opd2_addr = p->bias->start_address;
    reg.opt_opd2_seg = 0;
    reg.opd2_n = 1;
    reg.opd2_c = p->bias->shape.c;
    reg.opd2_h = 1;
    reg.opd2_w = 1;
    reg.short_opd2_str = 2;
    reg.opd2_b_str = p->bias->stride.n;
  }

  reg.layer_info = p->layer_id;

  reg.cmd_pre_exe_typ = p->cmd_pre_exe_typ;
  reg.cmd_pre_exe = p->cmd_pre_exe;

  return emit_tiu_cmdbuf(ctx, &reg);
}
|
||||
142
cvikernel/src/bm1822/tiu_depthwise_convolution_qdm.c
Normal file
142
cvikernel/src/bm1822/tiu_depthwise_convolution_qdm.c
Normal file
@ -0,0 +1,142 @@
|
||||
#include "kernel_1822.h"
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_depthwise_convolution_qdm(
|
||||
ctx_t *ctx,
|
||||
const bmk1822_tiu_depthwise_convolution_qdm_param_t *p)
|
||||
{
|
||||
uint32_t eu_num = ctx->chip_info.eu_num;
|
||||
|
||||
bool isMulConst = (p->weight_is_const == 1) ? 1 : 0;
|
||||
|
||||
if(isMulConst) {
|
||||
check_tiu_tensor_2(p->ifmap, p->ofmap);
|
||||
} else {
|
||||
check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight);
|
||||
}
|
||||
assert_stride_type_0(ctx, p->ifmap);
|
||||
if(!isMulConst){
|
||||
assert_stride_type_0(ctx, p->weight);
|
||||
}
|
||||
check_tiu_tensor(p->chl_quan_param);
|
||||
assert_stride_type_2(ctx, p->chl_quan_param);
|
||||
|
||||
ASSERT((p->ofmap->stride.n % eu_num) == 0);
|
||||
ASSERT(p->chl_quan_param->start_address %eu_num == 0);
|
||||
ASSERT(p->ifmap->shape.n == p->ofmap->shape.n);
|
||||
ASSERT(p->ifmap->shape.c == p->ofmap->shape.c);
|
||||
if (!isMulConst) {
|
||||
ASSERT(p->ifmap->shape.c == p->weight->shape.c);
|
||||
ASSERT(p->weight->shape.n == 1);
|
||||
}
|
||||
ASSERT(p->relu_enable == 0 || p->relu_enable == 1);
|
||||
ASSERT(p->stride_h < 32 && p->stride_h > 0);
|
||||
ASSERT(p->stride_w < 32 && p->stride_w > 0);
|
||||
ASSERT(p->pad_top < 16);
|
||||
ASSERT(p->pad_bottom < 16);
|
||||
ASSERT(p->pad_left < 16);
|
||||
ASSERT(p->pad_right < 16);
|
||||
ASSERT(p->ins_h < 15);
|
||||
ASSERT(p->ins_last_h < 15);
|
||||
ASSERT(p->ins_w < 15);
|
||||
ASSERT(p->ins_last_w < 15);
|
||||
ASSERT(p->dilation_h >= 1);
|
||||
ASSERT(p->dilation_w >= 1);
|
||||
|
||||
int opd0_sign = tensor_is_signed(p->ifmap);
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B;
|
||||
reg.tsk_eu_typ = 2;
|
||||
reg.opt_relu_typ = p->relu_enable;
|
||||
reg.opt_shift_typ = 1;
|
||||
reg.tsk_opd_num = 2;
|
||||
|
||||
int res0_sign = tensor_is_signed(p->ofmap);
|
||||
reg.res0_addr = p->ofmap->start_address;
|
||||
reg.opt_res0_sign = res0_sign;
|
||||
reg.opt_res0_seg = 1;
|
||||
reg.res0_n = p->ofmap->shape.n;
|
||||
reg.res0_c = p->ofmap->shape.c;
|
||||
reg.res0_h = p->ofmap->shape.h;
|
||||
reg.res0_w = p->ofmap->shape.w;
|
||||
reg.res0_n_str = p->ofmap->stride.n;
|
||||
reg.res0_c_str = p->ofmap->stride.c;
|
||||
reg.res0_h_str = p->ofmap->stride.h;
|
||||
reg.res0_w_str = p->ofmap->stride.w;
|
||||
reg.short_res0_str = 3; // Manual instead of h/w
|
||||
|
||||
reg.opd0_addr = p->ifmap->start_address;
|
||||
reg.opt_opd0_sign = opd0_sign;
|
||||
reg.opt_opd0_seg = 1;
|
||||
reg.opd0_n = p->ifmap->shape.n;
|
||||
reg.opd0_c = p->ifmap->shape.c;
|
||||
reg.opd0_h = p->ifmap->shape.h;
|
||||
reg.opd0_w = p->ifmap->shape.w;
|
||||
reg.opd0_n_str = p->ifmap->stride.n;
|
||||
reg.opd0_c_str = p->ifmap->stride.c;
|
||||
reg.opd0_h_str = p->ifmap->stride.h;
|
||||
reg.opd0_w_str = p->ifmap->stride.w;
|
||||
reg.opd0_ins_val = (uint32_t)p->ins_val;
|
||||
reg.short_opd0_str = 3; // Manual instead of h/w
|
||||
reg.conv_opd0_up_pad = p->pad_top;
|
||||
reg.conv_opd0_dn_pad = p->pad_bottom;
|
||||
reg.conv_opd0_lf_pad = p->pad_left;
|
||||
reg.conv_opd0_rt_pad = p->pad_right;
|
||||
reg.conv_opd0_x_ins0 = p->ins_w;
|
||||
reg.conv_opd0_y_ins0 = p->ins_h;
|
||||
reg.conv_opd0_x_ins0_last = p->ins_last_w;
|
||||
reg.conv_opd0_y_ins0_last = p->ins_last_h;
|
||||
|
||||
reg.opt_opd1_sign = 1;
|
||||
reg.opt_opd1_seg = 1;
|
||||
reg.conv_opd1_x_ins0 = p->dilation_w - 1;
|
||||
reg.conv_opd1_y_ins0 = p->dilation_h - 1;
|
||||
if (isMulConst) {
|
||||
reg.opt_opd1_const = 1;
|
||||
reg.opt_opd1_sign = p->weight_const.is_signed;
|
||||
reg.opd1_addr = p->weight_const.val;
|
||||
reg.opd1_n = p->weight->shape.n;
|
||||
reg.opd1_c = p->weight->shape.c;
|
||||
reg.opd1_h = p->weight->shape.h;
|
||||
reg.opd1_w = p->weight->shape.w;
|
||||
} else {
|
||||
reg.opd1_addr = p->weight->start_address;
|
||||
reg.opd1_n = p->weight->shape.n;
|
||||
reg.opd1_c = p->weight->shape.c;
|
||||
reg.opd1_h = p->weight->shape.h;
|
||||
reg.opd1_w = p->weight->shape.w;
|
||||
}
|
||||
reg.conv_op_x_str = p->stride_w;
|
||||
reg.conv_op_y_str = p->stride_h;
|
||||
reg.cmd_pre_exe_typ = p->cmd_pre_exe_typ;
|
||||
reg.cmd_pre_exe = p->cmd_pre_exe;
|
||||
|
||||
ASSERT(p->chl_quan_param->shape.n == 1);
|
||||
ASSERT(p->chl_quan_param->shape.c == p->ofmap->shape.c);
|
||||
ASSERT(p->chl_quan_param->shape.h == 1);
|
||||
ASSERT(p->chl_quan_param->shape.w == 1);
|
||||
reg.opt_chl_quan = 1;
|
||||
reg.opt_res_shift = 0; // useless
|
||||
reg.opd2_addr = p->chl_quan_param->start_address;
|
||||
reg.opd2_n = p->chl_quan_param->shape.n;
|
||||
reg.opd2_c = p->chl_quan_param->shape.c;
|
||||
reg.opd2_h = p->chl_quan_param->shape.h;
|
||||
reg.opd2_w = p->chl_quan_param->shape.w;
|
||||
reg.opt_opd2_seg = 1; // useless, force to 1 to skip b_stride check
|
||||
reg.short_opd2_str = 2; // useless
|
||||
reg.opd2_b_str = 0; // useless
|
||||
|
||||
if (p->has_bias) {
|
||||
reg.tsk_opd_num = 3;
|
||||
reg.opt_opd2_sign = 1;
|
||||
}
|
||||
|
||||
reg.layer_info = p->layer_id;
|
||||
|
||||
reg.cmd_pre_exe_typ = p->cmd_pre_exe_typ;
|
||||
reg.cmd_pre_exe = p->cmd_pre_exe;
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
81
cvikernel/src/bm1822/tiu_element_wise_add.c
Normal file
81
cvikernel/src/bm1822/tiu_element_wise_add.c
Normal file
@ -0,0 +1,81 @@
|
||||
#include "kernel_1822.h"
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_element_wise_add(
|
||||
ctx_t *k,
|
||||
const bmk1822_tiu_element_wise_add_param_t *p)
|
||||
{
|
||||
int bf16_enable = (p->a_low->fmt == FMT_BF16) ? 1 : 0;
|
||||
|
||||
if (bf16_enable) {
|
||||
/*bf16 only support 16 bit*/
|
||||
ASSERT(!p->a_high);
|
||||
ASSERT(!(p->b_high && !p->b_is_const));
|
||||
ASSERT(!p->res_high);
|
||||
check_tiu_tensor(p->a_low);
|
||||
check_tiu_tensor(p->res_low);
|
||||
assert_same_shape(p->res_low, p->a_low);
|
||||
if (!p->b_is_const) {
|
||||
check_tiu_tensor(p->b_low);
|
||||
assert_same_shape(p->res_low, p->b_low);
|
||||
}
|
||||
} else {
|
||||
check_16bit_tiu_tensor(p->a_low, p->a_high);
|
||||
check_tiu_tensor(p->res_low);
|
||||
assert_same_shape(p->res_low, p->a_low);
|
||||
if (!p->b_is_const) {
|
||||
check_16bit_tiu_tensor(p->b_low, p->b_high);
|
||||
assert_same_shape(p->res_low, p->b_low);
|
||||
}
|
||||
}
|
||||
if (p->res_high)
|
||||
check_16bit_tiu_tensor(p->res_low, p->res_high);
|
||||
ASSERT(p->relu_enable == 0 || p->relu_enable == 1);
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_ADD_FIX8B;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opd_typ = bf16_enable ? 1: 0;
|
||||
reg.opt_res_shift = 0;
|
||||
reg.opt_relu_typ = p->relu_enable;
|
||||
fill_same_tensor_shape(®, p->a_low->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
int arith_shift = tensor_is_signed(p->res_low);
|
||||
reg.opt_shift_typ = arith_shift;
|
||||
reg.opt_res_shift = p->rshift_bits;
|
||||
|
||||
reg.opd0_addr = p->a_low->start_address;
|
||||
reg.opt_opd0_sign = tensor_is_signed(p->a_low);
|
||||
reg.opt_opd0_seg = (p->a_high == NULL);
|
||||
reg.opd0_b_str = bf16_enable ? 0 : (p->a_high->start_address - p->a_low->start_address);
|
||||
fill_opd0_stride(®, &p->a_low->stride);
|
||||
|
||||
reg.opt_opd1_seg = bf16_enable ? 1 : 0; //(p->b_high == NULL); b_high is the same as b_val
|
||||
if (p->b_is_const) {
|
||||
reg.opt_opd1_const = 1;
|
||||
reg.opt_opd1_sign = !!p->b_const.is_signed;
|
||||
reg.opd1_addr = p->b_const.val;
|
||||
} else {
|
||||
reg.opt_opd1_const = 0;
|
||||
reg.opt_opd1_sign = tensor_is_signed(p->b_low);
|
||||
reg.opd1_addr = p->b_low->start_address;
|
||||
reg.opd1_b_str = bf16_enable ? 0 : (p->b_high->start_address - p->b_low->start_address);
|
||||
fill_opd1_stride(®, &p->b_low->stride);
|
||||
}
|
||||
|
||||
reg.res0_addr = p->res_low->start_address;
|
||||
reg.opt_res0_sign = tensor_is_signed(p->res_low);
|
||||
reg.opt_res0_seg = (p->res_high == NULL);
|
||||
fill_res0_stride(®, &p->res_low->stride);
|
||||
if (p->res_high)
|
||||
reg.res0_b_str = p->res_high->start_address - p->res_low->start_address;
|
||||
if (p->relu_enable)
|
||||
ASSERT(reg.opt_res0_seg);
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
return emit_tiu_cmdbuf(k, ®);
|
||||
}
|
||||
100
cvikernel/src/bm1822/tiu_element_wise_and.c
Normal file
100
cvikernel/src/bm1822/tiu_element_wise_and.c
Normal file
@ -0,0 +1,100 @@
|
||||
#include "kernel_1822.h"
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_element_wise_and_int8(
|
||||
ctx_t *ctx,
|
||||
const bmk1822_tiu_element_wise_and_int8_param_t *p)
|
||||
{
|
||||
check_tiu_tensor_3(p->res, p->a, p->b);
|
||||
assert_same_shape_3(p->res, p->a, p->b);
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_AND_FIX8B;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opt_res_shift = 0;
|
||||
reg.opt_shift_typ = 0;
|
||||
reg.opt_relu_typ = 0;
|
||||
fill_same_tensor_shape(®, p->a->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
reg.opd0_addr = p->a->start_address;
|
||||
reg.opt_opd0_sign = 0;
|
||||
reg.opt_opd0_seg = 1;
|
||||
fill_opd0_stride(®, &p->a->stride);
|
||||
|
||||
reg.opd1_addr = p->b->start_address;
|
||||
reg.opt_opd1_sign = 0;
|
||||
reg.opt_opd1_seg = 1;
|
||||
fill_opd1_stride(®, &p->b->stride);
|
||||
|
||||
reg.res0_addr = p->res->start_address;
|
||||
reg.opt_res0_sign = 0;
|
||||
reg.opt_res0_seg = 1;
|
||||
fill_res0_stride(®, &p->res->stride);
|
||||
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_element_wise_and_int16(
|
||||
ctx_t *ctx,
|
||||
const bmk1822_tiu_element_wise_and_int16_param_t *p)
|
||||
{
|
||||
check_16bit_tiu_tensor(p->a_low, p->a_high);
|
||||
check_16bit_tiu_tensor(p->b_low, p->b_high);
|
||||
check_16bit_tiu_tensor(p->res_low, p->res_high);
|
||||
assert_same_shape_3(p->res_low, p->a_low, p->b_low);
|
||||
|
||||
int res_high_addr = p->res_high->start_address;
|
||||
int res_low_addr = p->res_low->start_address;
|
||||
ASSERT(res_high_addr > res_low_addr);
|
||||
int res_b_stride = res_high_addr - res_low_addr;
|
||||
|
||||
int a_high_addr = p->a_high->start_address;
|
||||
int a_low_addr = p->a_low->start_address;
|
||||
ASSERT(a_high_addr > a_low_addr);
|
||||
int a_b_stride = a_high_addr - a_low_addr;
|
||||
|
||||
int b_high_addr = p->b_high->start_address;
|
||||
int b_low_addr = p->b_low->start_address;
|
||||
ASSERT(b_high_addr > b_low_addr);
|
||||
int b_b_stride = b_high_addr - b_low_addr;
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_AND_FIX8B;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opt_res_shift = 0;
|
||||
reg.opt_shift_typ = 0;
|
||||
reg.opt_relu_typ = 0;
|
||||
fill_same_tensor_shape(®, p->a_low->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
reg.opd0_addr = a_low_addr;
|
||||
reg.opt_opd0_sign = 0;
|
||||
reg.opt_opd0_seg = 0;
|
||||
reg.opd0_b_str = a_b_stride;
|
||||
fill_opd0_stride(®, &p->a_low->stride);
|
||||
|
||||
reg.opd1_addr = b_low_addr;
|
||||
reg.opt_opd1_sign = 0;
|
||||
reg.opt_opd1_seg = 0;
|
||||
reg.opd1_b_str = b_b_stride;
|
||||
fill_opd1_stride(®, &p->b_low->stride);
|
||||
|
||||
reg.res0_addr = res_low_addr;
|
||||
reg.opt_res0_sign = 0;
|
||||
reg.opt_res0_seg = 0;
|
||||
reg.res0_b_str = res_b_stride;
|
||||
fill_res0_stride(®, &p->res_low->stride);
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
42
cvikernel/src/bm1822/tiu_element_wise_copy.c
Normal file
42
cvikernel/src/bm1822/tiu_element_wise_copy.c
Normal file
@ -0,0 +1,42 @@
|
||||
#include "kernel_1822.h"
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_element_wise_copy(
|
||||
ctx_t *ctx,
|
||||
const bmk1822_tiu_element_wise_copy_param_t *p)
|
||||
{
|
||||
int bf16_enable = (p->src->fmt == FMT_BF16) ? 1 : 0;
|
||||
|
||||
check_tiu_tensor_2(p->dst, p->src);
|
||||
assert_same_shape(p->dst, p->src);
|
||||
assert_stride_range(p->dst->stride);
|
||||
assert_stride_range(p->src->stride);
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_COPY_FIX8B;
|
||||
reg.tsk_opd_num = 1;
|
||||
reg.opd_typ = bf16_enable ? 1: 0;
|
||||
reg.opt_res_shift = 0;
|
||||
reg.opt_shift_typ = 0;
|
||||
reg.opt_relu_typ = 0;
|
||||
fill_same_tensor_shape(®, p->dst->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
reg.opd0_addr = p->src->start_address;
|
||||
reg.opt_opd0_sign = 0;
|
||||
reg.opt_opd0_seg = 1;
|
||||
fill_opd0_stride(®, &p->src->stride);
|
||||
|
||||
reg.res0_addr = p->dst->start_address;
|
||||
reg.opt_res0_sign = 0;
|
||||
reg.opt_res0_seg = 1;
|
||||
fill_res0_stride(®, &p->dst->stride);
|
||||
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
110
cvikernel/src/bm1822/tiu_element_wise_ge.c
Normal file
110
cvikernel/src/bm1822/tiu_element_wise_ge.c
Normal file
@ -0,0 +1,110 @@
|
||||
#include "kernel_1822.h"
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_element_wise_ge(
|
||||
ctx_t *ctx,
|
||||
const bmk1822_tiu_element_wise_ge_param_t *p)
|
||||
{
|
||||
check_tiu_tensor_2(p->ge, p->a);
|
||||
assert_same_shape(p->ge, p->a);
|
||||
if (p->b_is_const) {
|
||||
if (tensor_is_signed(p->a))
|
||||
ASSERT(p->b_const.is_signed);
|
||||
else
|
||||
ASSERT(!p->b_const.is_signed);
|
||||
} else {
|
||||
check_tiu_tensor(p->b);
|
||||
assert_same_shape(p->ge, p->b);
|
||||
ASSERT(p->a->fmt == p->b->fmt);
|
||||
}
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_GE_FIX8B;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opt_res_shift = 0;
|
||||
reg.opt_relu_typ = 0;
|
||||
fill_same_tensor_shape(®, p->a->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
reg.opd0_addr = p->a->start_address;
|
||||
reg.opt_opd0_sign = tensor_is_signed(p->a);
|
||||
fill_opd0_stride(®, &p->a->stride);
|
||||
|
||||
if (p->b_is_const) {
|
||||
reg.opt_opd1_const = 1;
|
||||
reg.opd1_addr = p->b_const.val;
|
||||
reg.opt_opd1_sign = !!p->b_const.is_signed;
|
||||
} else {
|
||||
reg.opt_opd1_const = 0;
|
||||
reg.opd1_addr = p->b->start_address;
|
||||
reg.opt_opd1_sign = tensor_is_signed(p->b);
|
||||
fill_opd1_stride(®, &p->b->stride);
|
||||
}
|
||||
|
||||
reg.res0_addr = p->ge->start_address;
|
||||
reg.opt_res0_sign = tensor_is_signed(p->ge);
|
||||
fill_res0_stride(®, &p->ge->stride);
|
||||
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_bf16_element_wise_ge(
|
||||
ctx_t *ctx,
|
||||
const bmk1822_tiu_element_wise_ge_param_t *p)
|
||||
{
|
||||
int bf16_enable = (p->a->fmt == FMT_BF16) ? 1 : 0;
|
||||
|
||||
check_tiu_tensor_2(p->ge, p->a);
|
||||
assert_same_shape(p->ge, p->a);
|
||||
|
||||
if (p->b_is_const && !bf16_enable) {
|
||||
if (tensor_is_signed(p->a))
|
||||
ASSERT(p->b_const.is_signed);
|
||||
else
|
||||
ASSERT(!p->b_const.is_signed);
|
||||
} else if (!p->b_is_const) {
|
||||
check_tiu_tensor(p->b);
|
||||
assert_same_shape(p->ge, p->b);
|
||||
ASSERT(p->a->fmt == p->b->fmt);
|
||||
}
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_GE_FIX8B;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opd_typ = bf16_enable ? 1: 0;
|
||||
reg.opt_res_shift = 0;
|
||||
reg.opt_relu_typ = 0;
|
||||
fill_same_tensor_shape(®, p->a->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
reg.opd0_addr = p->a->start_address;
|
||||
reg.opt_opd0_sign = tensor_is_signed(p->a);
|
||||
fill_opd0_stride(®, &p->a->stride);
|
||||
|
||||
if (p->b_is_const) {
|
||||
reg.opt_opd1_const = 1;
|
||||
reg.opd1_addr = bf16_enable ? p->b_const.val : (p->b_const.val & 0xFF);
|
||||
reg.opt_opd1_sign = !!p->b_const.is_signed;
|
||||
} else {
|
||||
reg.opt_opd1_const = 0;
|
||||
reg.opd1_addr = p->b->start_address;
|
||||
reg.opt_opd1_sign = tensor_is_signed(p->b);
|
||||
fill_opd1_stride(®, &p->b->stride);
|
||||
}
|
||||
|
||||
reg.res0_addr = p->ge->start_address;
|
||||
reg.opt_res0_sign = tensor_is_signed(p->ge);
|
||||
fill_res0_stride(®, &p->ge->stride);
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
68
cvikernel/src/bm1822/tiu_element_wise_mac.c
Normal file
68
cvikernel/src/bm1822/tiu_element_wise_mac.c
Normal file
@ -0,0 +1,68 @@
|
||||
#include "kernel_1822.h"
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_element_wise_mac(
|
||||
ctx_t *ctx,
|
||||
const bmk1822_tiu_element_wise_mac_param_t *p)
|
||||
{
|
||||
int bf16_enable = (p->a->fmt == FMT_BF16) ? 1 : 0;
|
||||
|
||||
check_tiu_tensor(p->a);
|
||||
assert_same_shape(p->res_low, p->a);
|
||||
if(!bf16_enable) {
|
||||
check_16bit_tiu_tensor(p->res_low, p->res_high);
|
||||
ASSERT(p->lshift_bits < 32);
|
||||
ASSERT(p->rshift_bits < 16);
|
||||
}
|
||||
if (!p->b_is_const) {
|
||||
check_tiu_tensor(p->b);
|
||||
assert_same_shape(p->res_low, p->b);
|
||||
}
|
||||
ASSERT(p->relu_enable == 0 || p->relu_enable == 1);
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_MAC_FIX8B;
|
||||
reg.opt_res_add = 1;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opd_typ = bf16_enable ? 1: 0;
|
||||
reg.opt_left_shift = p->lshift_bits;
|
||||
reg.opt_relu_typ = p->relu_enable;
|
||||
fill_same_tensor_shape(®, p->a->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
int arith_shift = tensor_is_signed(p->res_low);
|
||||
reg.opt_shift_typ = arith_shift;
|
||||
reg.opt_res_shift = p->rshift_bits;
|
||||
|
||||
reg.opd0_addr = p->a->start_address;
|
||||
reg.opt_opd0_sign = tensor_is_signed(p->a);
|
||||
fill_opd0_stride(®, &p->a->stride);
|
||||
|
||||
if (p->b_is_const) {
|
||||
reg.opt_opd1_const = 1;
|
||||
reg.opd1_addr = bf16_enable ? p->b_const.val : (p->b_const.val & 0xFF);
|
||||
reg.opt_opd1_sign = !!p->b_const.is_signed;
|
||||
} else {
|
||||
reg.opt_opd1_const = 0;
|
||||
reg.opd1_addr = p->b->start_address;
|
||||
reg.opt_opd1_sign = tensor_is_signed(p->b);
|
||||
fill_opd1_stride(®, &p->b->stride);
|
||||
}
|
||||
|
||||
reg.res0_addr = p->res_low->start_address;
|
||||
reg.opt_res0_sign = tensor_is_signed(p->res_low);
|
||||
reg.opt_res0_seg = bf16_enable ? 1 : !!p->res_is_int8;
|
||||
fill_res0_stride(®, &p->res_low->stride);
|
||||
reg.res0_b_str = bf16_enable ? 0 : (p->res_high->start_address - p->res_low->start_address);
|
||||
|
||||
if (p->relu_enable)
|
||||
ASSERT(reg.opt_res0_seg);
|
||||
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
56
cvikernel/src/bm1822/tiu_element_wise_max.c
Normal file
56
cvikernel/src/bm1822/tiu_element_wise_max.c
Normal file
@ -0,0 +1,56 @@
|
||||
#include "kernel_1822.h"
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_element_wise_max(
|
||||
ctx_t *ctx,
|
||||
const bmk1822_tiu_element_wise_max_param_t *p)
|
||||
{
|
||||
int bf16_enable = (p->a->fmt == FMT_BF16) ? 1 : 0;
|
||||
|
||||
check_tiu_tensor_2(p->max, p->a);
|
||||
assert_same_shape(p->max, p->a);
|
||||
|
||||
if (p->b_is_const && !bf16_enable) {
|
||||
if (tensor_is_signed(p->a))
|
||||
ASSERT(p->b_const.is_signed);
|
||||
else
|
||||
ASSERT(!p->b_const.is_signed);
|
||||
} else if (!p->b_is_const) {
|
||||
check_tiu_tensor(p->b);
|
||||
assert_same_shape(p->max, p->b);
|
||||
ASSERT(p->a->fmt == p->b->fmt);
|
||||
}
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_MAX_FIX8B;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opd_typ = bf16_enable ? 1: 0;
|
||||
reg.opt_res_shift = 0;
|
||||
reg.opt_relu_typ = 0;
|
||||
fill_same_tensor_shape(®, p->a->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
reg.opd0_addr = p->a->start_address;
|
||||
reg.opt_opd0_sign = tensor_is_signed(p->a);
|
||||
fill_opd0_stride(®, &p->a->stride);
|
||||
|
||||
if (p->b_is_const) {
|
||||
reg.opt_opd1_const = 1;
|
||||
reg.opd1_addr = bf16_enable ? p->b_const.val : (p->b_const.val & 0xFF);
|
||||
reg.opt_opd1_sign = !!p->b_const.is_signed;
|
||||
} else {
|
||||
reg.opt_opd1_const = 0;
|
||||
reg.opd1_addr = p->b->start_address;
|
||||
reg.opt_opd1_sign = tensor_is_signed(p->b);
|
||||
fill_opd1_stride(®, &p->b->stride);
|
||||
}
|
||||
|
||||
reg.res0_addr = p->max->start_address;
|
||||
reg.opt_res0_sign = tensor_is_signed(p->max);
|
||||
fill_res0_stride(®, &p->max->stride);
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
58
cvikernel/src/bm1822/tiu_element_wise_min.c
Normal file
58
cvikernel/src/bm1822/tiu_element_wise_min.c
Normal file
@ -0,0 +1,58 @@
|
||||
#include "kernel_1822.h"
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_element_wise_min(
|
||||
ctx_t *ctx,
|
||||
const bmk1822_tiu_element_wise_min_param_t *p)
|
||||
{
|
||||
int bf16_enable = (p->a->fmt == FMT_BF16) ? 1 : 0;
|
||||
|
||||
check_tiu_tensor_2(p->min, p->a);
|
||||
assert_same_shape(p->min, p->a);
|
||||
if (p->b_is_const && !bf16_enable) {
|
||||
if (tensor_is_signed(p->a))
|
||||
ASSERT(p->b_const.is_signed);
|
||||
else
|
||||
ASSERT(!p->b_const.is_signed);
|
||||
} else if (!p->b_is_const) {
|
||||
check_tiu_tensor(p->b);
|
||||
assert_same_shape(p->min, p->b);
|
||||
ASSERT(p->a->fmt == p->b->fmt);
|
||||
}
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_MIN_FIX8B;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opd_typ = bf16_enable ? 1: 0;
|
||||
reg.opt_res_shift = 0;
|
||||
reg.opt_relu_typ = 0;
|
||||
fill_same_tensor_shape(®, p->a->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
reg.opd0_addr = p->a->start_address;
|
||||
reg.opt_opd0_sign = tensor_is_signed(p->a);
|
||||
fill_opd0_stride(®, &p->a->stride);
|
||||
|
||||
if (p->b_is_const) {
|
||||
reg.opt_opd1_const = 1;
|
||||
reg.opd1_addr = bf16_enable ? p->b_const.val : (p->b_const.val & 0xFF);
|
||||
reg.opt_opd1_sign = !!p->b_const.is_signed;
|
||||
} else {
|
||||
reg.opt_opd1_const = 0;
|
||||
reg.opd1_addr = p->b->start_address;
|
||||
reg.opt_opd1_sign = tensor_is_signed(p->b);
|
||||
fill_opd1_stride(®, &p->b->stride);
|
||||
}
|
||||
|
||||
reg.res0_addr = p->min->start_address;
|
||||
reg.opt_res0_sign = tensor_is_signed(p->min);
|
||||
fill_res0_stride(®, &p->min->stride);
|
||||
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
67
cvikernel/src/bm1822/tiu_element_wise_mul.c
Normal file
67
cvikernel/src/bm1822/tiu_element_wise_mul.c
Normal file
@ -0,0 +1,67 @@
|
||||
#include "kernel_1822.h"
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_element_wise_mul(
|
||||
ctx_t *ctx,
|
||||
const bmk1822_tiu_element_wise_mul_param_t *p)
|
||||
{
|
||||
int bf16_enable = (p->res_low->fmt == FMT_BF16) ? 1 : 0;
|
||||
|
||||
check_tiu_tensor_2(p->res_low, p->a);
|
||||
assert_same_shape(p->res_low, p->a);
|
||||
if (!p->b_is_const) {
|
||||
check_tiu_tensor(p->b);
|
||||
assert_same_shape(p->res_low, p->b);
|
||||
}
|
||||
if (p->res_high)
|
||||
check_16bit_tiu_tensor(p->res_low, p->res_high);
|
||||
ASSERT(p->relu_enable == 0 || p->relu_enable == 1);
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_MUL_FIX8B;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opd_typ = bf16_enable ? 1: 0;
|
||||
int arith_shift = tensor_is_signed(p->res_low);
|
||||
reg.opt_shift_typ = arith_shift;
|
||||
reg.opt_res_shift = p->rshift_bits;
|
||||
reg.opt_relu_typ = p->relu_enable;
|
||||
fill_same_tensor_shape(®, p->a->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
reg.opd0_addr = p->a->start_address;
|
||||
reg.opt_opd0_sign = tensor_is_signed(p->a);
|
||||
fill_opd0_stride(®, &p->a->stride);
|
||||
|
||||
if (p->b_is_const) {
|
||||
reg.opt_opd1_const = 1;
|
||||
reg.opd1_addr = bf16_enable ? p->b_const.val : (p->b_const.val & 0xFF);
|
||||
reg.opt_opd1_sign = !!p->b_const.is_signed;
|
||||
} else {
|
||||
reg.opt_opd1_const = 0;
|
||||
reg.opd1_addr = p->b->start_address;
|
||||
reg.opt_opd1_sign = tensor_is_signed(p->b);
|
||||
fill_opd1_stride(®, &p->b->stride);
|
||||
}
|
||||
|
||||
reg.res0_addr = p->res_low->start_address;
|
||||
reg.opt_res0_sign = tensor_is_signed(p->res_low);
|
||||
reg.opt_res0_seg = (p->res_high == NULL);
|
||||
fill_res0_stride(®, &p->res_low->stride);
|
||||
if (p->res_high)
|
||||
reg.res0_b_str = (p->res_high->start_address - p->res_low->start_address);
|
||||
if (p->relu_enable)
|
||||
ASSERT(reg.opt_res0_seg);
|
||||
|
||||
ASSERT((
|
||||
p->b_is_const || (!reg.opt_opd1_sign && !reg.opt_opd0_sign && !reg.opt_shift_typ) ||
|
||||
((reg.opt_opd1_sign || reg.opt_opd0_sign) && reg.opt_shift_typ)
|
||||
));
|
||||
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
67
cvikernel/src/bm1822/tiu_element_wise_mul_qdm.c
Normal file
67
cvikernel/src/bm1822/tiu_element_wise_mul_qdm.c
Normal file
@ -0,0 +1,67 @@
|
||||
#include "kernel_1822.h"
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_element_wise_mul_qdm(
|
||||
ctx_t *ctx,
|
||||
const bmk1822_tiu_element_wise_mul_qdm_param_t *p)
|
||||
{
|
||||
check_tiu_tensor_2(p->res_low, p->a);
|
||||
assert_same_shape(p->res_low, p->a);
|
||||
if (!p->b_is_const) {
|
||||
check_tiu_tensor(p->b);
|
||||
assert_same_shape(p->res_low, p->b);
|
||||
}
|
||||
if (p->res_high)
|
||||
check_16bit_tiu_tensor(p->res_low, p->res_high);
|
||||
ASSERT(p->relu_enable == 0 || p->relu_enable == 1);
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_MUL_FIX8B;
|
||||
reg.tsk_opd_num = 2;
|
||||
int arith_shift = tensor_is_signed(p->res_low);
|
||||
reg.opt_shift_typ = arith_shift;
|
||||
reg.opt_res_shift = p->rshift_bits;
|
||||
reg.opt_relu_typ = p->relu_enable;
|
||||
fill_same_tensor_shape(®, p->a->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
reg.opd0_addr = p->a->start_address;
|
||||
reg.opt_opd0_sign = tensor_is_signed(p->a);
|
||||
fill_opd0_stride(®, &p->a->stride);
|
||||
|
||||
if (p->b_is_const) {
|
||||
reg.opt_opd1_const = 1;
|
||||
reg.opd1_addr = p->b_const.val;
|
||||
reg.opt_opd1_sign = !!p->b_const.is_signed;
|
||||
} else {
|
||||
reg.opt_opd1_const = 0;
|
||||
reg.opd1_addr = p->b->start_address;
|
||||
reg.opt_opd1_sign = tensor_is_signed(p->b);
|
||||
fill_opd1_stride(®, &p->b->stride);
|
||||
}
|
||||
|
||||
reg.res0_addr = p->res_low->start_address;
|
||||
reg.opt_res0_sign = tensor_is_signed(p->res_low);
|
||||
reg.opt_res0_seg = (p->res_high == NULL);
|
||||
fill_res0_stride(®, &p->res_low->stride);
|
||||
if (p->res_high)
|
||||
reg.res0_b_str = p->res_high->start_address - p->res_low->start_address;
|
||||
if (p->relu_enable)
|
||||
ASSERT(reg.opt_res0_seg);
|
||||
|
||||
ASSERT((
|
||||
(!reg.opt_opd1_sign && !reg.opt_opd0_sign && !reg.opt_shift_typ) ||
|
||||
((reg.opt_opd1_sign || reg.opt_opd0_sign) && reg.opt_shift_typ)
|
||||
));
|
||||
|
||||
reg.opt_chl_quan = 1;
|
||||
reg.quan_m = p->multiplier;
|
||||
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
100
cvikernel/src/bm1822/tiu_element_wise_or.c
Normal file
100
cvikernel/src/bm1822/tiu_element_wise_or.c
Normal file
@ -0,0 +1,100 @@
|
||||
#include "kernel_1822.h"
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_element_wise_or_int8(
|
||||
ctx_t *ctx,
|
||||
const bmk1822_tiu_element_wise_or_int8_param_t *p)
|
||||
{
|
||||
check_tiu_tensor_3(p->res, p->a, p->b);
|
||||
assert_same_shape_3(p->res, p->a, p->b);
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_OR_FIX8B;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opt_res_shift = 0;
|
||||
reg.opt_shift_typ = 0;
|
||||
reg.opt_relu_typ = 0;
|
||||
fill_same_tensor_shape(®, p->a->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
reg.opd0_addr = p->a->start_address;
|
||||
reg.opt_opd0_sign = 0;
|
||||
reg.opt_opd0_seg = 1;
|
||||
fill_opd0_stride(®, &p->a->stride);
|
||||
|
||||
reg.opd1_addr = p->b->start_address;
|
||||
reg.opt_opd1_sign = 0;
|
||||
reg.opt_opd1_seg = 1;
|
||||
fill_opd1_stride(®, &p->b->stride);
|
||||
|
||||
reg.res0_addr = p->res->start_address;
|
||||
reg.opt_res0_sign = 0;
|
||||
reg.opt_res0_seg = 1;
|
||||
fill_res0_stride(®, &p->res->stride);
|
||||
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_element_wise_or_int16(
|
||||
ctx_t *ctx,
|
||||
const bmk1822_tiu_element_wise_or_int16_param_t *p)
|
||||
{
|
||||
check_16bit_tiu_tensor(p->a_low, p->a_high);
|
||||
check_16bit_tiu_tensor(p->b_low, p->b_high);
|
||||
check_16bit_tiu_tensor(p->res_low, p->res_high);
|
||||
assert_same_shape_3(p->res_low, p->a_low, p->b_low);
|
||||
|
||||
int res_high_addr = p->res_high->start_address;
|
||||
int res_low_addr = p->res_low->start_address;
|
||||
ASSERT(res_high_addr > res_low_addr);
|
||||
int res_b_stride = res_high_addr - res_low_addr;
|
||||
|
||||
int a_high_addr = p->a_high->start_address;
|
||||
int a_low_addr = p->a_low->start_address;
|
||||
ASSERT(a_high_addr > a_low_addr);
|
||||
int a_b_stride = a_high_addr - a_low_addr;
|
||||
|
||||
int b_high_addr = p->b_high->start_address;
|
||||
int b_low_addr = p->b_low->start_address;
|
||||
ASSERT(b_high_addr > b_low_addr);
|
||||
int b_b_stride = b_high_addr - b_low_addr;
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_OR_FIX8B;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opt_res_shift = 0;
|
||||
reg.opt_shift_typ = 0;
|
||||
reg.opt_relu_typ = 0;
|
||||
fill_same_tensor_shape(®, p->a_low->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
reg.opd0_addr = a_low_addr;
|
||||
reg.opt_opd0_sign = 0;
|
||||
reg.opt_opd0_seg = 0;
|
||||
reg.opd0_b_str = a_b_stride;
|
||||
fill_opd0_stride(®, &p->a_low->stride);
|
||||
|
||||
reg.opd1_addr = b_low_addr;
|
||||
reg.opt_opd1_sign = 0;
|
||||
reg.opt_opd1_seg = 0;
|
||||
reg.opd1_b_str = b_b_stride;
|
||||
fill_opd1_stride(®, &p->b_low->stride);
|
||||
|
||||
reg.res0_addr = res_low_addr;
|
||||
reg.opt_res0_sign = 0;
|
||||
reg.opt_res0_seg = 0;
|
||||
reg.res0_b_str = res_b_stride;
|
||||
fill_res0_stride(®, &p->res_low->stride);
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
58
cvikernel/src/bm1822/tiu_element_wise_shift.c
Normal file
58
cvikernel/src/bm1822/tiu_element_wise_shift.c
Normal file
@ -0,0 +1,58 @@
|
||||
#include "kernel_1822.h"
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_element_wise_arith_shift(
|
||||
ctx_t *ctx,
|
||||
const bmk1822_tiu_element_wise_arith_shift_param_t *p)
|
||||
{
|
||||
check_16bit_tiu_tensor(p->a_low, p->a_high);
|
||||
check_16bit_tiu_tensor(p->res_low, p->res_high);
|
||||
check_tiu_tensor(p->bits);
|
||||
assert_same_shape_3(p->res_low, p->a_low, p->bits);
|
||||
ASSERT(tensor_is_signed(p->a_low));
|
||||
ASSERT(tensor_is_signed(p->bits));
|
||||
|
||||
int res_high_addr = p->res_high->start_address;
|
||||
int res_low_addr = p->res_low->start_address;
|
||||
ASSERT(res_high_addr > res_low_addr);
|
||||
int res_b_stride = res_high_addr - res_low_addr;
|
||||
|
||||
int a_high_addr = p->a_high->start_address;
|
||||
int a_low_addr = p->a_low->start_address;
|
||||
ASSERT(a_high_addr > a_low_addr);
|
||||
int a_b_stride = a_high_addr - a_low_addr;
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_SHIFT_FIX8B;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opt_res_shift = 0;
|
||||
reg.opt_rshift_typ = 0;
|
||||
reg.opt_relu_typ = 0;
|
||||
fill_same_tensor_shape(®, p->a_low->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
reg.opd0_addr = a_low_addr;
|
||||
reg.opt_opd0_sign = 1;
|
||||
reg.opt_opd0_seg = 0;
|
||||
reg.opd0_b_str = a_b_stride;
|
||||
fill_opd0_stride(®, &p->a_low->stride);
|
||||
|
||||
reg.opd1_addr = p->bits->start_address;
|
||||
reg.opt_opd1_sign = 1;
|
||||
reg.opt_opd1_seg = 1;
|
||||
fill_opd1_stride(®, &p->bits->stride);
|
||||
|
||||
reg.res0_addr = res_low_addr;
|
||||
reg.opt_res0_sign = 1;
|
||||
reg.opt_res0_seg = 0;
|
||||
reg.res0_b_str = res_b_stride;
|
||||
fill_res0_stride(®, &p->res_low->stride);
|
||||
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
68
cvikernel/src/bm1822/tiu_element_wise_sub.c
Normal file
68
cvikernel/src/bm1822/tiu_element_wise_sub.c
Normal file
@ -0,0 +1,68 @@
|
||||
#include "kernel_1822.h"
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_element_wise_sub(
|
||||
ctx_t *ctx,
|
||||
const bmk1822_tiu_element_wise_sub_param_t *p)
|
||||
{
|
||||
int bf16_enable = (p->a_low->fmt == FMT_BF16) ? 1 : 0;
|
||||
|
||||
if (bf16_enable) {
|
||||
/*bf16 only support 16 bit*/
|
||||
ASSERT(!p->a_high);
|
||||
ASSERT(!p->b_high);
|
||||
ASSERT(!p->res_high);
|
||||
check_tiu_tensor(p->a_low);
|
||||
check_tiu_tensor(p->b_low);
|
||||
check_tiu_tensor(p->res_low);
|
||||
assert_same_shape_3(p->res_low, p->a_low, p->b_low);
|
||||
} else {
|
||||
check_16bit_tiu_tensor(p->a_low, p->a_high);
|
||||
check_16bit_tiu_tensor(p->b_low, p->b_high);
|
||||
check_tiu_tensor(p->res_low);
|
||||
assert_same_shape_3(p->res_low, p->a_low, p->b_low);
|
||||
ASSERT(tensor_is_signed(p->res_low));
|
||||
}
|
||||
if (p->res_high)
|
||||
check_16bit_tiu_tensor(p->res_low, p->res_high);
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_SUB_FIX8B;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opd_typ = bf16_enable ? 1: 0;
|
||||
reg.opt_res_shift = 0;
|
||||
reg.opt_relu_typ = 0;
|
||||
fill_same_tensor_shape(®, p->a_low->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
int arith_shift = tensor_is_signed(p->res_low);
|
||||
reg.opt_shift_typ = arith_shift;
|
||||
reg.opt_res_shift = p->rshift_bits;
|
||||
|
||||
reg.opd0_addr = p->a_low->start_address;
|
||||
reg.opt_opd0_sign = tensor_is_signed(p->a_low);
|
||||
reg.opt_opd0_seg = (p->a_high == NULL);
|
||||
reg.opd0_b_str = bf16_enable ? 0 : (p->a_high->start_address - p->a_low->start_address);
|
||||
fill_opd0_stride(®, &p->a_low->stride);
|
||||
|
||||
reg.opd1_addr = p->b_low->start_address;
|
||||
reg.opt_opd1_sign = tensor_is_signed(p->b_low);;
|
||||
reg.opt_opd1_seg = (p->b_high == NULL);
|
||||
reg.opd1_b_str = bf16_enable ? 0 : (p->b_high->start_address - p->b_low->start_address);
|
||||
fill_opd1_stride(®, &p->b_low->stride);
|
||||
|
||||
reg.res0_addr = p->res_low->start_address;
|
||||
reg.opt_res0_sign = 1;
|
||||
reg.opt_res0_seg = (p->res_high == NULL);
|
||||
fill_res0_stride(®, &p->res_low->stride);
|
||||
if (p->res_high)
|
||||
reg.res0_b_str = bf16_enable ? 0 : (p->res_high->start_address - p->res_low->start_address);
|
||||
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
100
cvikernel/src/bm1822/tiu_element_wise_xor.c
Normal file
100
cvikernel/src/bm1822/tiu_element_wise_xor.c
Normal file
@ -0,0 +1,100 @@
|
||||
#include "kernel_1822.h"
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_element_wise_xor_int8(
|
||||
ctx_t *ctx,
|
||||
const bmk1822_tiu_element_wise_xor_int8_param_t *p)
|
||||
{
|
||||
check_tiu_tensor_3(p->res, p->a, p->b);
|
||||
assert_same_shape_3(p->res, p->a, p->b);
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_XOR_FIX8B;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opt_res_shift = 0;
|
||||
reg.opt_shift_typ = 0;
|
||||
reg.opt_relu_typ = 0;
|
||||
fill_same_tensor_shape(®, p->a->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
reg.opd0_addr = p->a->start_address;
|
||||
reg.opt_opd0_sign = 0;
|
||||
reg.opt_opd0_seg = 1;
|
||||
fill_opd0_stride(®, &p->a->stride);
|
||||
|
||||
reg.opd1_addr = p->b->start_address;
|
||||
reg.opt_opd1_sign = 0;
|
||||
reg.opt_opd1_seg = 1;
|
||||
fill_opd1_stride(®, &p->b->stride);
|
||||
|
||||
reg.res0_addr = p->res->start_address;
|
||||
reg.opt_res0_sign = 0;
|
||||
reg.opt_res0_seg = 1;
|
||||
fill_res0_stride(®, &p->res->stride);
|
||||
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_element_wise_xor_int16(
|
||||
ctx_t *ctx,
|
||||
const bmk1822_tiu_element_wise_xor_int16_param_t *p)
|
||||
{
|
||||
check_16bit_tiu_tensor(p->a_low, p->a_high);
|
||||
check_16bit_tiu_tensor(p->b_low, p->b_high);
|
||||
check_16bit_tiu_tensor(p->res_low, p->res_high);
|
||||
assert_same_shape_3(p->res_low, p->a_low, p->b_low);
|
||||
|
||||
int res_high_addr = p->res_high->start_address;
|
||||
int res_low_addr = p->res_low->start_address;
|
||||
ASSERT(res_high_addr > res_low_addr);
|
||||
int res_b_stride = res_high_addr - res_low_addr;
|
||||
|
||||
int a_high_addr = p->a_high->start_address;
|
||||
int a_low_addr = p->a_low->start_address;
|
||||
ASSERT(a_high_addr > a_low_addr);
|
||||
int a_b_stride = a_high_addr - a_low_addr;
|
||||
|
||||
int b_high_addr = p->b_high->start_address;
|
||||
int b_low_addr = p->b_low->start_address;
|
||||
ASSERT(b_high_addr > b_low_addr);
|
||||
int b_b_stride = b_high_addr - b_low_addr;
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_XOR_FIX8B;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opt_res_shift = 0;
|
||||
reg.opt_shift_typ = 0;
|
||||
reg.opt_relu_typ = 0;
|
||||
fill_same_tensor_shape(®, p->a_low->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
reg.opd0_addr = a_low_addr;
|
||||
reg.opt_opd0_sign = 0;
|
||||
reg.opt_opd0_seg = 0;
|
||||
reg.opd0_b_str = a_b_stride;
|
||||
fill_opd0_stride(®, &p->a_low->stride);
|
||||
|
||||
reg.opd1_addr = b_low_addr;
|
||||
reg.opt_opd1_sign = 0;
|
||||
reg.opt_opd1_seg = 0;
|
||||
reg.opd1_b_str = b_b_stride;
|
||||
fill_opd1_stride(®, &p->b_low->stride);
|
||||
|
||||
reg.res0_addr = res_low_addr;
|
||||
reg.opt_res0_sign = 0;
|
||||
reg.opt_res0_seg = 0;
|
||||
reg.res0_b_str = res_b_stride;
|
||||
fill_res0_stride(®, &p->res_low->stride);
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
112
cvikernel/src/bm1822/tiu_lookup_table.c
Normal file
112
cvikernel/src/bm1822/tiu_lookup_table.c
Normal file
@ -0,0 +1,112 @@
|
||||
#include "kernel_1822.h"
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_lookup_table(
|
||||
ctx_t *ctx,
|
||||
const bmk1822_tiu_lookup_table_param_t *p)
|
||||
{
|
||||
uint32_t eu_num = ctx->chip_info.eu_num;
|
||||
uint32_t npu_num = ctx->chip_info.npu_num;
|
||||
|
||||
check_tiu_tensor_3(p->ofmap, p->ifmap, p->table);
|
||||
assert_stride_type_0(ctx, p->ofmap);
|
||||
assert_stride_type_0(ctx, p->ifmap);
|
||||
assert_stride_type_0(ctx, p->table);
|
||||
|
||||
uint8_t is_bf16 = (p->ofmap->fmt == FMT_BF16 && p->ifmap->fmt == FMT_BF16);
|
||||
|
||||
ASSERT(p->table->shape.n == 1);
|
||||
ASSERT(p->table->shape.c == npu_num);
|
||||
|
||||
if (is_bf16) {
|
||||
ASSERT(p->table->shape.h == 32);
|
||||
ASSERT(p->table->shape.w == 8);
|
||||
}
|
||||
else {
|
||||
ASSERT(p->table->shape.h == 16);
|
||||
ASSERT(p->table->shape.w == 16);
|
||||
}
|
||||
|
||||
ASSERT(p->ifmap->start_address % eu_num == 0);
|
||||
ASSERT(p->ofmap->start_address % eu_num == 0);
|
||||
ASSERT(p->table->start_address % eu_num == 0);
|
||||
|
||||
// fmt MUST be same under bf16
|
||||
if (p->ofmap->fmt == FMT_BF16) {
|
||||
ASSERT(p->ifmap->fmt == FMT_BF16);
|
||||
}
|
||||
ASSERT(p->ofmap->fmt == FMT_I8 || p->ofmap->fmt == FMT_U8 || p->ofmap->fmt == FMT_BF16);
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
//reg.tens_lookup = 1;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opt_shift_typ = 0;
|
||||
reg.opt_res_shift = 0;
|
||||
reg.opt_relu_typ = 0;
|
||||
reg.opd_typ = is_bf16;
|
||||
|
||||
reg.res0_addr = p->ofmap->start_address;
|
||||
if (is_bf16) {
|
||||
reg.opt_res0_sign = 1;
|
||||
reg.opt_res0_seg = 1;
|
||||
}
|
||||
else {
|
||||
reg.opt_res0_sign = 0;
|
||||
reg.opt_res0_seg = 1;
|
||||
}
|
||||
|
||||
// <! input / output shape SHOULD be same
|
||||
ASSERT(p->ifmap->shape.n == p->ofmap->shape.n);
|
||||
ASSERT(p->ifmap->shape.c == p->ofmap->shape.c);
|
||||
ASSERT(p->ifmap->shape.h == p->ofmap->shape.h);
|
||||
ASSERT(p->ifmap->shape.w == p->ofmap->shape.w);
|
||||
|
||||
reg.res0_n = p->ifmap->shape.n;
|
||||
reg.res0_c = p->ifmap->shape.c;
|
||||
reg.res0_h = p->ifmap->shape.h;
|
||||
reg.res0_w = p->ifmap->shape.w;
|
||||
reg.short_res0_str = 0;
|
||||
|
||||
reg.opd0_addr = p->ifmap->start_address;
|
||||
if (is_bf16) {
|
||||
reg.opt_opd0_sign = 1;
|
||||
reg.opt_opd0_seg = 1;
|
||||
}
|
||||
else {
|
||||
reg.opt_opd0_sign = 0;
|
||||
reg.opt_opd0_seg = 1;
|
||||
}
|
||||
reg.opd0_n = p->ifmap->shape.n;
|
||||
reg.opd0_c = p->ifmap->shape.c;
|
||||
reg.opd0_h = p->ifmap->shape.h;
|
||||
reg.opd0_w = p->ifmap->shape.w;
|
||||
reg.short_opd0_str = 0;
|
||||
|
||||
reg.opd1_addr = p->table->start_address;
|
||||
if (is_bf16) {
|
||||
reg.opt_opd1_sign = 1;
|
||||
reg.opt_opd1_seg = 1;
|
||||
}
|
||||
else {
|
||||
reg.opt_opd1_sign = 0;
|
||||
reg.opt_opd1_seg = 1;
|
||||
}
|
||||
reg.opd1_n = p->table->shape.n;
|
||||
reg.opd1_c = p->table->shape.c;
|
||||
reg.opd1_h = p->table->shape.h;
|
||||
reg.opd1_w = p->table->shape.w;
|
||||
reg.short_opd1_str = 0;
|
||||
reg.tsk_eu_typ = 12; // 12 means lut
|
||||
if (is_bf16) {
|
||||
reg.opt_opd2_seg = 1; // hw check
|
||||
// dont care once short_xxx_str set to 0
|
||||
}
|
||||
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
//trace_tiu_reg(®, __FUNCTION__);
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
151
cvikernel/src/bm1822/tiu_matrix_multiplication.c
Normal file
151
cvikernel/src/bm1822/tiu_matrix_multiplication.c
Normal file
@ -0,0 +1,151 @@
|
||||
#include "kernel_1822.h"
|
||||
|
||||
typedef bmk1822_tiu_matrix_multiplication_param_t param_t;
|
||||
|
||||
static void check_matrix(ctx_t *ctx, const ml_t *m)
{
  /* Validate a local-memory matrix by viewing it as an (n, c, 1, w) tensor
   * and reusing the tensor checks. */
  bmk1822_tensor_lmem_t t;
  t.start_address = m->start_address;
  t.fmt = m->fmt;
  t.shape.n = m->shape.n;
  t.shape.c = m->shape.c;
  t.shape.h = 1;
  t.shape.w = m->shape.w;
  t.stride.n = m->stride.n;
  t.stride.c = m->stride.c;
  t.stride.h = m->stride.h;
  t.stride.w = (m->fmt == FMT_BF16) ? 2 : 1; /* bytes per element */

  check_tiu_tensor(&t);
  assert_stride_type_0(ctx, &t);

  /* Matrices must start on an EU boundary. */
  uint32_t eu_num = ctx->chip_info.eu_num;
  ASSERT(m->start_address % eu_num == 0);
}
|
||||
|
||||
static int is_arith_shift(const param_t *p)
|
||||
{
|
||||
if (p->left->fmt == FMT_I8)
|
||||
return 1;
|
||||
if (p->right->fmt == FMT_I8)
|
||||
return 1;
|
||||
if (p->bias && p->bias->fmt == FMT_I8)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_matrix_multiplication(ctx_t *ctx, const param_t *p)
{
  /* res = left * right (+ bias), issued as an FC-type TIU command.
   * Supports int8 and bf16 operands, partial-sum (ps32) staging, optional
   * relu and result accumulation. */
  const bmk1822_matrix_lmem_t *res = p->res;
  const bmk1822_matrix_lmem_t *left = p->left;
  const bmk1822_matrix_lmem_t *right = p->right;
  const bmk1822_matrix_lmem_t *bias = p->bias;
  int bf16_enable = (res->fmt == FMT_BF16) ? 1 : 0;

  check_matrix(ctx, res);
  check_matrix(ctx, left);
  check_matrix(ctx, right);
  if (bias)
    check_matrix(ctx, bias);

  ASSERT(p->lshift_bits < 32);
  if (bf16_enable) /* bf16 does not support add_result */
    ASSERT(!p->add_result);
  else
    ASSERT(!(p->relu_enable && p->add_result));

  /* Intermediate ps32 stages must not apply post-processing. */
  if (p->ps32_mode & 0x2) {
    ASSERT(!p->relu_enable);
    ASSERT(!p->bias);
    ASSERT(!p->rshift_bits);
  }
  ASSERT(p->relu_enable == 0 || p->relu_enable == 1);

  uint32_t left_row = left->shape.n;
  uint32_t left_col = left->shape.col;
  uint32_t right_row = right->shape.n;
  uint32_t right_col = right->shape.col;
  uint32_t res_row = res->shape.n;
  uint32_t res_col = res->shape.col;
  ASSERT(left_col == right_row);
  ASSERT(res_col == right_col);

  if (p->ps32_mode) {
    ASSERT(!p->add_result);
  } else if ((p->add_result || !p->res_is_int8) && !bf16_enable) {
    /* 16-bit result storage doubles the row count in local memory. */
    ASSERT(res_row == left_row * 2);
    res_row = left_row;
  } else {
    ASSERT(res_row == left_row);
  }

  tiu_reg_t r;
  reset_tiu_reg(&r);

  r.cmd_en = 1;
  r.tsk_typ = DCR_TYPE_FC_FIX8B;
  r.tsk_opd_num = bias ? 3 : 2;
  r.opd_typ = bf16_enable ? 1 : 0;
  r.opt_shift_typ = is_arith_shift(p);
  r.opt_res_shift = p->rshift_bits;
  r.opt_left_shift = p->lshift_bits;
  r.opt_relu_typ = p->relu_enable;
  r.opt_res_add = p->add_result;

  r.res0_addr = res->start_address;
  r.opt_res0_seg = (bf16_enable ? 1 : p->res_is_int8);
  r.opt_res0_sign = matrix_is_signed(res);
  r.res0_n = res_row;
  r.res0_c = res->shape.c;
  r.res0_h = 1;
  r.res0_w = res->shape.w;
  r.short_res0_str = 0; /* stride, b_stride calculated by H/W */

  r.opd0_addr = left->start_address;
  r.opt_opd0_seg = 1;
  r.opt_opd0_sign = (left->fmt == FMT_I8);
  r.opd0_n = left_row;
  r.opd0_c = left->shape.c;
  r.opd0_h = 1;
  r.opd0_w = left->shape.w;
  r.short_opd0_str = 0;

  r.opd1_addr = right->start_address;
  r.opt_opd1_seg = 1;
  r.opt_opd1_sign = (right->fmt == FMT_I8);
  r.opd1_n = right_row;
  r.opd1_c = right->shape.c;
  r.opd1_h = 1;
  r.opd1_w = left_col - left->shape.w * (left->shape.c - 1);
  r.short_opd1_str = 0;

  r.ps32_md = p->ps32_mode;
  if (p->ps32_mode > 0)
    r.res0_b_str = p->res->shape.n * p->res->stride.n;
  if (r.opd0_c == 1)
    ASSERT(r.opd0_w == r.opd1_w);

  /* Bias is stored across two n-rows (shape.n == 2). */
  if (bias) {
    ASSERT(bias->shape.n == 2);
    ASSERT(bias->shape.c == right->shape.c);
    ASSERT(bias->shape.w == right->shape.w);
    ASSERT(bias->shape.col == right->shape.col);

    r.opd2_addr = bias->start_address;
    r.opt_opd2_seg = 0;
    r.opt_opd2_sign = (bias->fmt == FMT_I8);
    r.opd2_n = 1;
    r.opd2_c = bias->shape.c;
    r.opd2_h = 1;
    r.opd2_w = bias->shape.w;
    r.short_opd2_str = 0;
  }

  r.layer_info = p->layer_id;

  return emit_tiu_cmdbuf(ctx, &r);
}
|
||||
151
cvikernel/src/bm1822/tiu_matrix_multiplication_qdm.c
Normal file
151
cvikernel/src/bm1822/tiu_matrix_multiplication_qdm.c
Normal file
@ -0,0 +1,151 @@
|
||||
#include "kernel_1822.h"
|
||||
|
||||
typedef bmk1822_tiu_matrix_multiplication_qdm_param_t param_t;
|
||||
|
||||
static void check_matrix(ctx_t *ctx, const ml_t *m)
{
  /* Validate a local-memory matrix by viewing it as an (n, c, 1, w) tensor
   * with byte-wide elements and reusing the tensor checks. */
  bmk1822_tensor_lmem_t t;
  t.start_address = m->start_address;
  t.fmt = m->fmt;
  t.shape.n = m->shape.n;
  t.shape.c = m->shape.c;
  t.shape.h = 1;
  t.shape.w = m->shape.w;
  t.stride.n = m->stride.n;
  t.stride.c = m->stride.c;
  t.stride.h = m->stride.h;
  t.stride.w = 1;

  check_tiu_tensor(&t);
  assert_stride_type_0(ctx, &t);

  /* Matrices must start on an EU boundary. */
  uint32_t eu_num = ctx->chip_info.eu_num;
  ASSERT(m->start_address % eu_num == 0);
}
|
||||
|
||||
static int is_arith_shift(const param_t *p)
|
||||
{
|
||||
if (p->left->fmt == FMT_I8)
|
||||
return 1;
|
||||
if (p->right->fmt == FMT_I8)
|
||||
return 1;
|
||||
if (p->bias && p->bias->fmt == FMT_I8)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_matrix_multiplication_qdm(ctx_t *ctx, const param_t *p)
{
  /* FC matmul with 32-bit multiplier post-scaling (quan_m) applied only at
   * the final post-processing stage; result is always int8. */
  const bmk1822_matrix_lmem_t *res = p->res;
  const bmk1822_matrix_lmem_t *left = p->left;
  const bmk1822_matrix_lmem_t *right = p->right;
  const bmk1822_matrix_lmem_t *bias = p->bias;

  check_matrix(ctx, res);
  check_matrix(ctx, left);
  check_matrix(ctx, right);
  if (bias)
    check_matrix(ctx, bias);

  ASSERT(p->lshift_bits < 32);
  ASSERT(!(p->relu_enable && p->add_result));
  /* Intermediate ps32 stages must not apply post-processing. */
  if (p->ps32_mode & 0x2) {
    ASSERT(!p->relu_enable);
    ASSERT(!p->bias);
    ASSERT(!p->rshift_bits);
  }
  ASSERT(p->relu_enable == 0 || p->relu_enable == 1);

  uint32_t left_row = left->shape.n;
  uint32_t left_col = left->shape.col;
  uint32_t right_row = right->shape.n;
  uint32_t right_col = right->shape.col;
  uint32_t res_row = res->shape.n;
  uint32_t res_col = res->shape.col;
  ASSERT(left_col == right_row);
  ASSERT(res_col == right_col);
  ASSERT(p->res_is_int8 == 1);

  if (p->ps32_mode) {
    ASSERT(!p->add_result);
  } else if (p->add_result) {
    /* 16-bit accumulation doubles the stored row count. */
    ASSERT(res_row == left_row * 2);
    res_row = left_row;
  } else {
    ASSERT(res_row == left_row);
  }

  tiu_reg_t r;
  reset_tiu_reg(&r);

  r.cmd_en = 1;
  r.tsk_typ = DCR_TYPE_FC_FIX8B;
  r.tsk_opd_num = bias ? 3 : 2;
  r.opt_shift_typ = is_arith_shift(p);
  r.opt_res_shift = p->rshift_bits;
  r.opt_left_shift = p->lshift_bits;
  r.opt_relu_typ = p->relu_enable;
  r.opt_res_add = p->add_result;

  r.res0_addr = res->start_address;
  r.opt_res0_seg = 1;
  r.opt_res0_sign = matrix_is_signed(res);
  r.res0_n = res_row;
  r.res0_c = res->shape.c;
  r.res0_h = 1;
  r.res0_w = res->shape.w;
  r.short_res0_str = 0; /* stride, b_stride calculated by H/W */

  r.opd0_addr = left->start_address;
  r.opt_opd0_seg = 1;
  r.opt_opd0_sign = (left->fmt == FMT_I8);
  r.opd0_n = left_row;
  r.opd0_c = left->shape.c;
  r.opd0_h = 1;
  r.opd0_w = left->shape.w;
  r.short_opd0_str = 0;

  r.opd1_addr = right->start_address;
  r.opt_opd1_seg = 1;
  r.opt_opd1_sign = (right->fmt == FMT_I8);
  r.opd1_n = right_row;
  r.opd1_c = right->shape.c;
  r.opd1_h = 1;
  r.opd1_w = left_col - left->shape.w * (left->shape.c - 1);
  r.short_opd1_str = 0;

  r.ps32_md = p->ps32_mode;
  if (p->ps32_mode > 0)
    r.res0_b_str = p->res->shape.n * p->res->stride.n;
  if (r.opd0_c == 1)
    ASSERT(r.opd0_w == r.opd1_w);

  /* Only enable the 32-bit multiplier at the final post-processing stage. */
  r.opt_chl_quan = ((p->ps32_mode == 0) || (p->ps32_mode == 1)) ? 1 : 0;
  r.quan_m = p->quan_m;

  /* 32-bit bias, determined by b_stride: stored across four n-rows. */
  if (bias) {
    ASSERT(bias->shape.n == 4);
    ASSERT(bias->shape.c == right->shape.c);
    ASSERT(bias->shape.w == right->shape.w);
    ASSERT(bias->shape.col == right->shape.col);

    r.opd2_addr = bias->start_address;
    r.opt_opd2_seg = 0;
    r.opt_opd2_sign = (bias->fmt == FMT_I8);
    r.opd2_n = 1;
    r.opd2_c = bias->shape.c;
    r.opd2_h = 1;
    r.opd2_w = bias->shape.w;
    r.short_opd2_str = 0;
  }

  r.layer_info = p->layer_id;

  return emit_tiu_cmdbuf(ctx, &r);
}
|
||||
69
cvikernel/src/bm1822/tiu_max_pooling.c
Normal file
69
cvikernel/src/bm1822/tiu_max_pooling.c
Normal file
@ -0,0 +1,69 @@
|
||||
#include "kernel_1822.h"
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_max_pooling(
|
||||
ctx_t *ctx,
|
||||
const bmk1822_tiu_max_pooling_param_t *p)
|
||||
{
|
||||
int bf16_enable = (p->ifmap->fmt == FMT_BF16);
|
||||
|
||||
check_tiu_tensor_2(p->ifmap, p->ofmap);
|
||||
ASSERT(p->kh * p->kw >= 1);
|
||||
ASSERT(p->ifmap->shape.n == p->ofmap->shape.n);
|
||||
ASSERT(p->ifmap->shape.c == p->ofmap->shape.c);
|
||||
ASSERT(p->stride_h < 32 && p->stride_h > 0 && "stride_h should be in [1, 31] range");
|
||||
ASSERT(p->stride_w < 32 && p->stride_w > 0 && "stride_w should be in [1, 31] range");
|
||||
if (bf16_enable) {
|
||||
assert_bf16_stride_type_0(ctx, p->ifmap);
|
||||
assert_bf16_stride_type_0(ctx, p->ofmap);
|
||||
} else {
|
||||
assert_stride_type_0(ctx, p->ifmap);
|
||||
assert_stride_type_0(ctx, p->ofmap);
|
||||
}
|
||||
int opd0_sign = tensor_is_signed(p->ifmap);
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B;
|
||||
reg.tsk_eu_typ = 0;
|
||||
reg.opt_relu_typ = 0; /* Hardware relu function not validated. */
|
||||
reg.opt_res_shift = 0;
|
||||
reg.opt_shift_typ = opd0_sign;
|
||||
reg.tsk_opd_num = 1;
|
||||
reg.opd_typ = bf16_enable ? 1: 0;
|
||||
|
||||
reg.res0_addr = p->ofmap->start_address;
|
||||
reg.opt_res0_sign = opd0_sign;
|
||||
reg.opt_res0_seg = 1;
|
||||
reg.res0_n = p->ofmap->shape.n;
|
||||
reg.res0_c = p->ofmap->shape.c;
|
||||
reg.res0_h = p->ofmap->shape.h;
|
||||
reg.res0_w = p->ofmap->shape.w;
|
||||
reg.opd0_addr = p->ifmap->start_address;
|
||||
reg.opt_opd0_sign = opd0_sign;
|
||||
reg.opt_opd0_seg = 1;
|
||||
reg.opd0_n = p->ifmap->shape.n;
|
||||
reg.opd0_c = p->ifmap->shape.c;
|
||||
reg.opd0_h = p->ifmap->shape.h;
|
||||
reg.opd0_w = p->ifmap->shape.w;
|
||||
reg.conv_opd0_up_pad = p->pad_top;
|
||||
reg.conv_opd0_dn_pad = p->pad_bottom;
|
||||
reg.conv_opd0_lf_pad = p->pad_left;
|
||||
reg.conv_opd0_rt_pad = p->pad_right;
|
||||
if (bf16_enable) {
|
||||
reg.opd0_ins_val = p->ins_fp;
|
||||
} else {
|
||||
//reg.opd0_ins_val = bf16_enable ? 0 : (uint32_t)p->ins_val;
|
||||
reg.opd0_ins_val = (!p->ins_val && opd0_sign) ? -128 : p->ins_val; // backend not set yet
|
||||
}
|
||||
|
||||
reg.opt_opd1_seg = 1;
|
||||
reg.opd1_h = p->kh;
|
||||
reg.opd1_w = p->kw;
|
||||
reg.conv_op_x_str = p->stride_w;
|
||||
reg.conv_op_y_str = p->stride_h;
|
||||
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
120
cvikernel/src/bm1822/tiu_min_pooling.c
Normal file
120
cvikernel/src/bm1822/tiu_min_pooling.c
Normal file
@ -0,0 +1,120 @@
|
||||
#include "kernel_1822.h"
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_min_pooling(
|
||||
ctx_t *ctx,
|
||||
const bmk1822_tiu_min_pooling_param_t *p)
|
||||
{
|
||||
check_tiu_tensor_2(p->ifmap, p->ofmap);
|
||||
ASSERT(p->kh * p->kw > 1);
|
||||
ASSERT(p->ifmap->shape.n == p->ofmap->shape.n);
|
||||
ASSERT(p->ifmap->shape.c == p->ofmap->shape.c);
|
||||
assert_stride_type_0(ctx, p->ifmap);
|
||||
assert_stride_type_0(ctx, p->ofmap);
|
||||
|
||||
int opd0_sign = tensor_is_signed(p->ifmap);
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B;
|
||||
reg.tsk_eu_typ = 3;
|
||||
reg.opt_relu_typ = 0; /* Hardware relu function not validated. */
|
||||
reg.opt_res_shift = 0;
|
||||
reg.opt_shift_typ = opd0_sign;
|
||||
reg.tsk_opd_num = 1;
|
||||
|
||||
reg.res0_addr = p->ofmap->start_address;
|
||||
reg.opt_res0_sign = opd0_sign;
|
||||
reg.opt_res0_seg = 1;
|
||||
reg.res0_n = p->ofmap->shape.n;
|
||||
reg.res0_c = p->ofmap->shape.c;
|
||||
reg.res0_h = p->ofmap->shape.h;
|
||||
reg.res0_w = p->ofmap->shape.w;
|
||||
|
||||
reg.opd0_addr = p->ifmap->start_address;
|
||||
reg.opt_opd0_sign = opd0_sign;
|
||||
reg.opt_opd0_seg = 1;
|
||||
reg.opd0_n = p->ifmap->shape.n;
|
||||
reg.opd0_c = p->ifmap->shape.c;
|
||||
reg.opd0_h = p->ifmap->shape.h;
|
||||
reg.opd0_w = p->ifmap->shape.w;
|
||||
reg.conv_opd0_up_pad = p->pad_top;
|
||||
reg.conv_opd0_dn_pad = p->pad_bottom;
|
||||
reg.conv_opd0_lf_pad = p->pad_left;
|
||||
reg.conv_opd0_rt_pad = p->pad_right;
|
||||
if (opd0_sign)
|
||||
reg.opd0_ins_val = (uint16_t)127;
|
||||
else
|
||||
reg.opd0_ins_val = (uint16_t)255;
|
||||
reg.opt_opd1_seg = 1;
|
||||
reg.opd1_h = p->kh;
|
||||
reg.opd1_w = p->kw;
|
||||
reg.conv_op_x_str = p->stride_w;
|
||||
reg.conv_op_y_str = p->stride_h;
|
||||
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
|
||||
bmk1822_op_t * bmk1822_tiu_bf16_min_pooling(
|
||||
ctx_t *ctx,
|
||||
const bmk1822_tiu_min_pooling_param_t *p)
|
||||
{
|
||||
int bf16_enable = (p->ifmap->fmt == FMT_BF16) ? 1 : 0;
|
||||
|
||||
check_tiu_tensor_2(p->ifmap, p->ofmap);
|
||||
ASSERT(p->kh * p->kw > 1);
|
||||
ASSERT(p->ifmap->shape.n == p->ofmap->shape.n);
|
||||
ASSERT(p->ifmap->shape.c == p->ofmap->shape.c);
|
||||
if (bf16_enable) {
|
||||
assert_bf16_stride_type_0(ctx, p->ifmap);
|
||||
assert_bf16_stride_type_0(ctx, p->ofmap);
|
||||
} else {
|
||||
assert_stride_type_0(ctx, p->ifmap);
|
||||
assert_stride_type_0(ctx, p->ofmap);
|
||||
}
|
||||
int opd0_sign = tensor_is_signed(p->ifmap);
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B;
|
||||
reg.tsk_eu_typ = 3;
|
||||
reg.opt_relu_typ = 0; /* Hardware relu function not validated. */
|
||||
reg.opt_res_shift = 0;
|
||||
reg.opt_shift_typ = opd0_sign;
|
||||
reg.tsk_opd_num = 1;
|
||||
reg.opd_typ = bf16_enable ? 1: 0;
|
||||
|
||||
reg.res0_addr = p->ofmap->start_address;
|
||||
reg.opt_res0_sign = opd0_sign;
|
||||
reg.opt_res0_seg = 1;
|
||||
reg.res0_n = p->ofmap->shape.n;
|
||||
reg.res0_c = p->ofmap->shape.c;
|
||||
reg.res0_h = p->ofmap->shape.h;
|
||||
reg.res0_w = p->ofmap->shape.w;
|
||||
reg.opd0_ins_val = p->ins_fp;
|
||||
reg.opd0_addr = p->ifmap->start_address;
|
||||
reg.opt_opd0_sign = opd0_sign;
|
||||
reg.opt_opd0_seg = 1;
|
||||
reg.opd0_n = p->ifmap->shape.n;
|
||||
reg.opd0_c = p->ifmap->shape.c;
|
||||
reg.opd0_h = p->ifmap->shape.h;
|
||||
reg.opd0_w = p->ifmap->shape.w;
|
||||
reg.conv_opd0_up_pad = p->pad_top;
|
||||
reg.conv_opd0_dn_pad = p->pad_bottom;
|
||||
reg.conv_opd0_lf_pad = p->pad_left;
|
||||
reg.conv_opd0_rt_pad = p->pad_right;
|
||||
|
||||
reg.opt_opd1_seg = 1;
|
||||
reg.opd1_h = p->kh;
|
||||
reg.opd1_w = p->kw;
|
||||
reg.conv_op_x_str = p->stride_w;
|
||||
reg.conv_op_y_str = p->stride_h;
|
||||
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
410
cvikernel/src/bm1880v2/bm_dmabuf.c
Normal file
410
cvikernel/src/bm1880v2/bm_dmabuf.c
Normal file
@ -0,0 +1,410 @@
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <inttypes.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "kernel_1880v2.h"
|
||||
#include <bmkernel/bm1880v2/bmkernel_1880v2.h>
|
||||
#include <bmkernel/bm1880v2/bm1880v2_tiu_reg.h>
|
||||
#include <bmkernel/bm1880v2/bm1880v2_tdma_reg.h>
|
||||
#include <bmkernel/reg_tiu.h>
|
||||
#include <bmkernel/reg_tdma.h>
|
||||
#include <bmkernel/reg_bdcast.h>
|
||||
#include <bmkernel/bm_regcpu.h>
|
||||
|
||||
#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask))
|
||||
#define ALIGN(x,a) __ALIGN_MASK(x,(__typeof__(x))(a)-1)
|
||||
|
||||
#define BD_DESC_ALIGN_SIZE (1 << BDC_ENGINE_CMD_ALIGNED_BIT)
|
||||
#define GDMA_DESC_ALIGN_SIZE (1 << TDMA_DESCRIPTOR_ALIGNED_BIT)
|
||||
#define BD_EOD_PADDING_BYTES (128)
|
||||
#define TPU_DMABUF_HEADER_M 0xB5B5
|
||||
|
||||
typedef struct {
|
||||
cmd_hdr_t hdr;
|
||||
uint32_t body[0];
|
||||
} DESC;
|
||||
|
||||
// CPU_OP_SYNC structure
|
||||
typedef struct {
|
||||
uint32_t op_type;
|
||||
uint32_t num_tiu;
|
||||
uint32_t num_tdma;
|
||||
uint32_t offset_tiu;
|
||||
uint32_t offset_tdma;
|
||||
uint32_t offset_tiu_ori_bk;
|
||||
uint32_t offset_tdma_ori_bk;
|
||||
char str[CPU_ENGINE_STR_LIMIT_BYTE];
|
||||
} __attribute__((packed)) cvi_cpu_desc_t;
|
||||
|
||||
static DESC *traverse_start(uint8_t *cmdbuf)
|
||||
{
|
||||
ASSERT(cmdbuf);
|
||||
DESC *desc = (DESC *)cmdbuf;
|
||||
ASSERT(desc->hdr.magic == CMDBUF_HDR_MAGIC_1880v2);
|
||||
return desc;
|
||||
}
|
||||
|
||||
static DESC *traverse_next(DESC *desc, uint8_t *cmdbuf, uint32_t size)
{
  /* Step to the descriptor following `desc`; NULL once past cmdbuf+size. */
  uint8_t *pos = (uint8_t *)desc + cmd_hdr_len(&desc->hdr) + sizeof(cmd_hdr_t);
  if (pos >= cmdbuf + size)
    return NULL;
  DESC *next = (DESC *)pos;
  ASSERT(next->hdr.magic == CMDBUF_HDR_MAGIC_1880v2);
  return next;
}
|
||||
|
||||
static bool is_last_desc(DESC *desc, uint8_t *cmdbuf, uint32_t size)
{
  /* True when no descriptor follows `desc` within the buffer. */
  return traverse_next(desc, cmdbuf, size) == NULL;
}
|
||||
|
||||
static void reorder_bd_cmdbuf_reg(uint8_t *cmdbuf)
|
||||
{
|
||||
int total_bits = BD_REG_BYTES * 8;
|
||||
|
||||
for (int i = 0; i < total_bits; i += 128)
|
||||
cmdbuf[(i + 128 - 8) / 8] |= (i / 128) << 4;
|
||||
|
||||
uint8_t tmp[128 / 8];
|
||||
uint8_t *last = &cmdbuf[(total_bits - 128) / 8];
|
||||
memcpy(tmp, last, sizeof(tmp));
|
||||
memcpy(last, cmdbuf, sizeof(tmp));
|
||||
memcpy(cmdbuf, tmp, sizeof(tmp));
|
||||
}
|
||||
|
||||
static void adjust_desc_tdma(uint32_t *body, bool eod)
|
||||
{
|
||||
if (eod) {
|
||||
body[0] |= (1 << TDMA_ACCPI0_EOD_BIT);
|
||||
body[0] |= (1 << TDMA_ACCPI0_INTERRUPT_BIT); // interrupt
|
||||
}
|
||||
body[0] |= (1 << TDMA_ACCPI0_BARRIER_ENABLE_BIT);
|
||||
}
|
||||
|
||||
static void adjust_desc_bd(uint32_t *body, bool eod)
|
||||
{
|
||||
if (eod) {
|
||||
tiu_reg_t reg;
|
||||
parse_tiu_reg(®, body);
|
||||
reg.cmd_end = 1;
|
||||
reg.cmd_intr_en = 1;
|
||||
emit_tiu_reg(®, body);
|
||||
}
|
||||
reorder_bd_cmdbuf_reg((uint8_t *)body);
|
||||
}
|
||||
|
||||
|
||||
static uint32_t desc_sync_id(DESC *desc)
{
  /* Extract the engine-local command id from a descriptor body; aborts on
   * any engine other than TIU or TDMA. */
  if (desc->hdr.engine_id == BMK1880v2_TIU) {
    tiu_reg_t r;
    parse_tiu_reg(&r, desc->body);
    return r.cmd_id_tpu;
  }
  if (desc->hdr.engine_id == BMK1880v2_TDMA) {
    tdma_reg_t r;
    parse_tdma_reg(&r, desc->body);
    return r.cmd_id;
  }
  ASSERT(0);
  return 1;
}
|
||||
|
||||
/* First pass of cmdbuf -> dmabuf conversion.
 *
 * Walks every descriptor in `cmdbuf` (length `sz`), splits the stream into
 * "segments" and writes one ARM (CPU) descriptor per segment into the
 * region right after the dma header in `dmabuf`.  A segment ends either at
 * an explicit CPU descriptor, at a descriptor whose sync id is the 0xFFFF
 * sentinel, or at the very last descriptor.  Each ARM descriptor records
 * how many TIU/TDMA descriptors its segment contains.
 *
 * Also computes the byte offsets, relative to the start of dmabuf, where
 * the second pass (fill_bd_and_tdma) will place the TIU and TDMA
 * descriptor regions, returned through *tiu_offset / *tdma_offset, and
 * writes the finished dma_hdr_t to the front of dmabuf.
 *
 * Layout produced: dma hdr | arm descs | bd (TIU) descs | tdma descs. */
static void fill_header_and_arm(uint8_t *cmdbuf, uint32_t sz, uint8_t *dmabuf, uint64_t *tiu_offset, uint64_t *tdma_offset)
{
  dma_hdr_t header = {0};
  header.dmabuf_magic_m = TPU_DMABUF_HEADER_M;
  header.dmabuf_magic_s = 0x1835;

  cvi_cpu_desc_t *segments = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t));
  DESC *desc = NULL;
  /* Per engine: total descriptor counts, counts within the current
   * segment, and accumulated byte sizes of each output region. */
  uint32_t desc_nums[BMK1880v2_ENGINE_NUM] = {0};
  uint32_t counters[BMK1880v2_ENGINE_NUM] = {0};
  uint32_t desc_size[BMK1880v2_ENGINE_NUM] = {0};

  ASSERT(segments);
  // fill arm descs
  desc = traverse_start(cmdbuf);

  while (desc != NULL) {
    uint32_t engine_id = (uint32_t)desc->hdr.engine_id;
    counters[engine_id]++;
    desc_nums[engine_id]++;
    if (engine_id != BMK1880v2_CPU) {
      // a new arm desc inserted to do sync operation
      if (desc_sync_id(desc) == 0xFFFF || is_last_desc(desc, cmdbuf, sz)) {
        desc_nums[BMK1880v2_CPU]++;
        cvi_cpu_desc_t *arm = segments + desc_nums[BMK1880v2_CPU] - 1;
        memset(arm, 0, sizeof(cvi_cpu_desc_t));
        arm->op_type = CPU_OP_SYNC;
        arm->num_tiu = counters[BMK1880v2_TIU];
        arm->num_tdma = counters[BMK1880v2_TDMA];
        strncpy(arm->str, "layer_end", sizeof(arm->str) - 1);
        /* Each segment's TIU block is padded (see fill_bd_and_tdma) and
         * rounded up to BD_DESC_ALIGN_SIZE; accumulate that here so the
         * TDMA region offset below accounts for it. */
        if (counters[BMK1880v2_TIU] != 0) {
          desc_size[BMK1880v2_TIU] =
            ALIGN(desc_size[BMK1880v2_TIU] + counters[BMK1880v2_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES,
                BD_DESC_ALIGN_SIZE);
        }
        counters[BMK1880v2_TIU] = 0;
        counters[BMK1880v2_TDMA] = 0;
      }
    } else {
      /* Explicit CPU descriptor: copy it from the cmdbuf body and stamp
       * the segment counts onto it.  Same size bookkeeping as above. */
      cvi_cpu_desc_t *arm = segments + desc_nums[BMK1880v2_CPU] - 1;
      memcpy(arm, &(desc->body), sizeof(cvi_cpu_desc_t));
      arm->num_tiu = counters[BMK1880v2_TIU];
      arm->num_tdma = counters[BMK1880v2_TDMA];
      if (counters[BMK1880v2_TIU] != 0) {
        desc_size[BMK1880v2_TIU] =
          ALIGN(desc_size[BMK1880v2_TIU] + counters[BMK1880v2_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES,
              BD_DESC_ALIGN_SIZE);
      }
      counters[BMK1880v2_TIU] = 0;
      counters[BMK1880v2_TDMA] = 0;
    }
    desc = traverse_next(desc, cmdbuf, sz);
  }
  desc_size[BMK1880v2_CPU] = desc_nums[BMK1880v2_CPU] * CPU_ENGINE_BYTES;
  desc_size[BMK1880v2_TDMA] = desc_nums[BMK1880v2_TDMA] * GDMA_DESC_ALIGN_SIZE;

  (*tiu_offset) = ALIGN(sizeof(header) + desc_size[BMK1880v2_CPU], BD_DESC_ALIGN_SIZE);
  (*tdma_offset) = ALIGN((*tiu_offset) + desc_size[BMK1880v2_TIU], GDMA_DESC_ALIGN_SIZE);

  // dma hdr + arm descs + bd descs + tdma descs
  header.dmabuf_size = (*tdma_offset) + desc_size[BMK1880v2_TDMA];
  header.cpu_desc_count = desc_nums[BMK1880v2_CPU];
  header.bd_desc_count = desc_nums[BMK1880v2_TIU];
  header.tdma_desc_count = desc_nums[BMK1880v2_TDMA];

  //printf("header.dmabuf_size = %d\n", header.dmabuf_size);
  printf("header.cpu_desc_count = %d\n", header.cpu_desc_count);
  printf("header.bd_desc_count = %d\n", header.bd_desc_count);
  printf("header.tdma_desc_count = %d\n", header.tdma_desc_count);
  memcpy(dmabuf, &header, sizeof(header));
}
|
||||
|
||||
/* Second pass of cmdbuf -> dmabuf conversion.
 *
 * Requires fill_header_and_arm to have run first: reads the segment
 * (ARM/CPU) descriptors it wrote after the dma header, then re-walks the
 * cmdbuf, copying each segment's TIU descriptors to the BD region
 * (starting at tiu_offset) and TDMA descriptors to the TDMA region
 * (starting at tdma_offset) of dmabuf.  Records the final aligned offsets
 * back into each ARM descriptor (offset_tiu / offset_tdma) and marks the
 * last descriptor of each engine per segment as end-of-descriptor. */
static void fill_bd_and_tdma(uint8_t *cmdbuf, uint32_t sz, uint8_t *dmabuf, uint64_t tiu_offset, uint64_t tdma_offset)
{
  dma_hdr_t *p_header = (dma_hdr_t *)dmabuf;
  cvi_cpu_desc_t *segments = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t));
  DESC *desc = traverse_start(cmdbuf);
  //uint64_t address_max = 0x100000000;

  for (uint32_t i = 0; i < p_header->cpu_desc_count; i++) {

    cvi_cpu_desc_t *arm = segments + i;

    /* Per-segment engine counts, low 16 bits only (upper bits appear to
     * be reserved -- see the masks in bmk1880v2_dmabuf_relocate too). */
    uint32_t tiu_num = arm->num_tiu & 0xFFFF;
    uint32_t tdma_num = arm->num_tdma & 0xFFFF;

    if (tiu_num) {
      tiu_offset = ALIGN(tiu_offset, 1 << BDC_ENGINE_CMD_ALIGNED_BIT);
      arm->offset_tiu = tiu_offset;
      //printf("arm->offset_tiu = 0x%x \n", arm->offset_tiu);
    }

    if (tdma_num) {
      tdma_offset = ALIGN(tdma_offset, 1 << TDMA_DESCRIPTOR_ALIGNED_BIT);
      arm->offset_tdma = tdma_offset;
      //printf("arm->offset_tdma = 0x%x \n", arm->offset_tdma);
    }

    /* Consume cmdbuf descriptors until this segment's TIU and TDMA
     * quotas are exhausted; CPU descriptors are skipped (default case)
     * but still advance the traversal. */
    while (tiu_num || tdma_num) {
      uint32_t engine_id = (uint32_t)desc->hdr.engine_id;
      void *p_body = NULL;

      switch (engine_id) {
        case BMK1880v2_TIU:
          tiu_num--;
          p_body = (void *)(dmabuf + tiu_offset);
          tiu_offset += BD_REG_BYTES;
          memcpy(p_body, desc->body, desc->hdr.len);
          /* tiu_num == 0 -> last TIU descriptor of this segment. */
          adjust_desc_bd((uint32_t *)p_body, tiu_num == 0);
          break;
        case BMK1880v2_TDMA:
          tdma_num--;
          tdma_offset = ALIGN(tdma_offset, GDMA_DESC_ALIGN_SIZE);
          p_body = (void *)(dmabuf + tdma_offset);
          tdma_offset += GDMA_DESC_ALIGN_SIZE;
          memcpy(p_body, desc->body, desc->hdr.len);

#if 0 //debug feature, for checking if neuron overshoot
          {
            tdma_reg_t reg_tdma = {0};
            uint64_t tdma_address = 0, tdma_address2 = 0;

            parse_tdma_reg(&reg_tdma, p_body);

            if (reg_tdma.src_base_reg_sel == 0) {
              // reg.trans_dir = 2; // 0:tg2l, 1:l2tg, 2:g2g, 3:l2l
              if (reg_tdma.trans_dir == 0) {
                printf ("src_base_addr_high=%x, src_base_addr_low=%x\n", reg_tdma.src_base_addr_high, reg_tdma.src_base_addr_low);
                tdma_address = ((uint64_t)reg_tdma.src_base_addr_high) << 32 | (uint64_t)reg_tdma.src_base_addr_low;
              } else if (reg_tdma.trans_dir == 1) {
                printf ("dst_base_addr_high=%x, dst_base_addr_low=%x\n", reg_tdma.dst_base_addr_high, reg_tdma.dst_base_addr_low);
                tdma_address = ((uint64_t)reg_tdma.dst_base_addr_high) << 32 | (uint64_t)reg_tdma.dst_base_addr_low;
              } else if (reg_tdma.trans_dir == 2) {
                printf ("dst_base_addr_high=%x, dst_base_addr_low=%x\n", reg_tdma.dst_base_addr_high, reg_tdma.dst_base_addr_low);
                tdma_address = ((uint64_t)reg_tdma.src_base_addr_high) << 32 | (uint64_t)reg_tdma.src_base_addr_low;
                tdma_address2 = ((uint64_t)reg_tdma.dst_base_addr_high) << 32 | (uint64_t)reg_tdma.dst_base_addr_low;

                if (tdma_address2 > tdma_address) {
                  tdma_address = tdma_address2;
                }
              }

              if (tdma_address > address_max) {
                address_max = tdma_address;
                printf("address_max=%llx\n", address_max);
              }
            }
          }
#endif
          adjust_desc_tdma((uint32_t *)p_body, tdma_num == 0);
          break;
        default:
          break;
      }
      desc = traverse_next(desc, cmdbuf, sz);
    }

    // padding zero after eod to workaroud hardware bug
    if (arm->num_tiu & 0xFFFF) {
      void *buf = (void *)(dmabuf + tiu_offset);
      memset(buf, 0, BD_EOD_PADDING_BYTES);
      tiu_offset += BD_EOD_PADDING_BYTES;
    }
  }
}
|
||||
|
||||
/* Convert a software cmdbuf into a hardware-consumable dmabuf in two
 * passes: first lay out the header and ARM segment descriptors (which
 * also yields the region offsets), then copy in the BD and TDMA bodies. */
void bmk1880v2_dmabuf_convert(uint8_t *cmdbuf, uint32_t sz, uint8_t *dmabuf)
{
  uint64_t bd_region_off = 0;
  uint64_t tdma_region_off = 0;

  fill_header_and_arm(cmdbuf, sz, dmabuf, &bd_region_off, &tdma_region_off);
  fill_bd_and_tdma(cmdbuf, sz, dmabuf, bd_region_off, tdma_region_off);
}
|
||||
|
||||
#define PER_DES_SIZE 16
|
||||
#define PADDING_SIZE (1024 * 1024)
|
||||
/* Compute, without writing anything, the dmabuf size (*psize) and PMU
 * buffer size (*pmu_size) that bmk1880v2_dmabuf_convert will need for
 * the given cmdbuf.  Mirrors the segment-splitting logic of
 * fill_header_and_arm, so the two must be kept in sync. */
void bmk1880v2_dmabuf_size(uint8_t *cmdbuf, uint32_t sz, uint32_t *psize, uint32_t *pmu_size)
{
  uint32_t tdma_desc_num = {0};
  uint32_t counters[BMK1880v2_ENGINE_NUM] = {0};
  uint32_t bd_size = 0;
  uint32_t dmabuf_size = 0;

  /* Totals across all segments, used only for the PMU buffer sizing. */
  uint32_t tiu_cnt = 0;
  uint32_t tdma_cnt = 0;

  // calculate desc numbers
  DESC *desc = traverse_start(cmdbuf);

  while (desc != NULL) {
    uint32_t engine_id = (uint32_t)desc->hdr.engine_id;
    counters[engine_id]++;
    if (engine_id != BMK1880v2_CPU) {
      // a new arm desc inserted to do sync operation
      if (desc_sync_id(desc) == 0xFFFF || is_last_desc(desc, cmdbuf, sz)) {
        counters[BMK1880v2_CPU]++;
        tdma_desc_num += counters[BMK1880v2_TDMA];
        /* Per-segment TIU block: descriptors + EOD padding, rounded up,
         * exactly as fill_header_and_arm accumulates it. */
        if (counters[BMK1880v2_TIU] != 0) {
          bd_size = ALIGN(bd_size + counters[BMK1880v2_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES,
              BD_DESC_ALIGN_SIZE);
        }
        tiu_cnt += counters[BMK1880v2_TIU] & 0xFFFF;
        tdma_cnt += counters[BMK1880v2_TDMA] & 0xFFFF;
        counters[BMK1880v2_TIU] = 0;
        counters[BMK1880v2_TDMA] = 0;
      }
    } else {
      /* Explicit CPU descriptor also ends a segment. */
      tdma_desc_num += counters[BMK1880v2_TDMA];
      if (counters[BMK1880v2_TIU] != 0) {
        bd_size = ALIGN(bd_size + counters[BMK1880v2_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES,
            BD_DESC_ALIGN_SIZE);
      }
      tiu_cnt += counters[BMK1880v2_TIU] & 0xFFFF;
      tdma_cnt += counters[BMK1880v2_TDMA] & 0xFFFF;
      counters[BMK1880v2_TIU] = 0;
      counters[BMK1880v2_TDMA] = 0;
    }
    desc = traverse_next(desc, cmdbuf, sz);
  }
  // dma hdr + arm descs + bd descs + tdma descs
  dmabuf_size = sizeof(dma_hdr_t) + counters[BMK1880v2_CPU] * CPU_ENGINE_BYTES;
  dmabuf_size = ALIGN(dmabuf_size, BD_DESC_ALIGN_SIZE) + bd_size;
  dmabuf_size = ALIGN(dmabuf_size, GDMA_DESC_ALIGN_SIZE) + tdma_desc_num * GDMA_DESC_ALIGN_SIZE;

  /* PMU buffer: PER_DES_SIZE bytes of profiling data per descriptor plus
   * fixed PADDING_SIZE slack, page-aligned. */
  *pmu_size = ALIGN((tiu_cnt + tdma_cnt) * PER_DES_SIZE + PADDING_SIZE, 0x1000);
  *psize = dmabuf_size;
}
|
||||
|
||||
/* Record the four array-base register values in the dmabuf header.
 * The dmabuf must already carry a valid header (magic is checked). */
void bmk1880v2_arraybase_set(uint8_t *dmabuf, uint32_t arraybase0L, uint32_t arraybase1L, uint32_t arraybase0H, uint32_t arraybase1H)
{
  ASSERT(dmabuf);
  dma_hdr_t *header = (dma_hdr_t *)dmabuf;

  ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M);
  header->arraybase_0_L = arraybase0L;
  header->arraybase_1_L = arraybase1L;
  header->arraybase_0_H = arraybase0H;
  header->arraybase_1_H = arraybase1H;
}
|
||||
|
||||
void bmk1880v2_dmabuf_relocate(uint8_t *dmabuf, uint64_t dmabuf_devaddr, uint32_t original_size, uint32_t pmubuf_size)
|
||||
{
|
||||
dma_hdr_t *header = (dma_hdr_t *)dmabuf;
|
||||
uint64_t tmpAddress = 0;
|
||||
|
||||
ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M);
|
||||
cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t));
|
||||
|
||||
for (uint32_t i = 0; i < header->cpu_desc_count; i++, desc++) {
|
||||
uint32_t tiu_num = desc->num_tiu & 0xFFFF;
|
||||
uint32_t tdma_num = desc->num_tdma & 0xFFFF;
|
||||
|
||||
if (tiu_num) {
|
||||
tmpAddress = dmabuf_devaddr + desc->offset_tiu;
|
||||
//printf("bd tmpAddress = 0x%lu\n", tmpAddress);
|
||||
desc->offset_tiu_ori_bk = desc->offset_tiu;
|
||||
desc->offset_tiu = tmpAddress >> BDC_ENGINE_CMD_ALIGNED_BIT;
|
||||
}
|
||||
|
||||
if (tdma_num) {
|
||||
tmpAddress = dmabuf_devaddr + desc->offset_tdma;
|
||||
//printf("tdma tmpAddress = 0x%lu\n", tmpAddress);
|
||||
desc->offset_tdma_ori_bk = desc->offset_tdma;
|
||||
desc->offset_tdma = tmpAddress >> TDMA_DESCRIPTOR_ALIGNED_BIT;
|
||||
}
|
||||
|
||||
//set pmubuf_addr_p to enable pmu kick
|
||||
header->pmubuf_size = pmubuf_size;
|
||||
header->pmubuf_offset = original_size;
|
||||
}
|
||||
}
|
||||
|
||||
/* Debug helper: print the per-segment TIU/TDMA descriptor counts and
 * offsets of every CPU descriptor in a converted dmabuf. */
void bmk1880v2_dmabuf_dump(uint8_t *dmabuf)
{
  ASSERT(dmabuf);
  dma_hdr_t *header = (dma_hdr_t *)dmabuf;
  // printf("bmk1880v2_dmabuf_dump header->arraybase_0_L = 0x%x\n", header->arraybase_0_L);
  // printf("bmk1880v2_dmabuf_dump header->arraybase_1_L = 0x%x\n", header->arraybase_1_L);
  // printf("bmk1880v2_dmabuf_dump header->arraybase_0_H = 0x%x\n", header->arraybase_0_H);
  // printf("bmk1880v2_dmabuf_dump header->arraybase_1_H = 0x%x\n", header->arraybase_1_H);
  // printf("bmk1880v2_dmabuf_dump header->pmubuf_offset = 0x%x\n", header->pmubuf_offset);

  ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M);
  cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t));

  for (uint32_t i = 0; i < header->cpu_desc_count; i++, desc++) {
    /* Counts use the low 16 bits only, matching the rest of this file. */
    int tiu_num = desc->num_tiu & 0xFFFF;
    int tdma_num = desc->num_tdma & 0xFFFF;
    uint32_t tiu_offset = desc->offset_tiu;
    uint32_t tdma_offset = desc->offset_tdma;
    printf("bmk1880v2_dmabuf_dump num<tiu:%d, tdma:%d>, offset<0x%08x, 0x%08x>\n", tiu_num, tdma_num, tiu_offset, tdma_offset);
  }
}
|
||||
594
cvikernel/src/bm1880v2/bm_kernel.c
Normal file
594
cvikernel/src/bm1880v2/bm_kernel.c
Normal file
@ -0,0 +1,594 @@
|
||||
#include "kernel_1880v2.h"
|
||||
#include <bmkernel/bm1880v2/bm1880v2_tpu_cfg.h>
|
||||
|
||||
/* Patch the synchronization ids into a raw descriptor after the engine
 * controller has computed them.  For a TIU descriptor: enable id checking
 * and write its own id plus the TDMA id it must wait for.  For a TDMA
 * descriptor: write its own id, the TIU id it waits for, and enable the
 * barrier.  Descriptors of other engines are left untouched. */
static void replace_cmd_id(uint32_t *desc, uint32_t eng_id, uint16_t ids[])
{
  if (eng_id == BMK1880v2_TIU) {
    tiu_reg_t reg;
    parse_tiu_reg(&reg, desc);
    reg.cmd_id_en = 1;
    reg.cmd_id_tpu = ids[eng_id];
    reg.cmd_id_gdma = ids[BMK1880v2_TDMA];
    emit_tiu_reg(&reg, desc);

    // printf(" %s: TIU eng_id %d, [wait_tdma_id=%d|tiu_id=%d] dst shape(%d, %d, %d, %d)\n",
    //        __FUNCTION__, eng_id, reg.cmd_id_gdma, reg.cmd_id_tpu,
    //        reg.res0_n, reg.res0_c, reg.res0_h, reg.res0_w);

  } else if (eng_id == BMK1880v2_TDMA) {
    tdma_reg_t tdma_reg;
    parse_tdma_reg(&tdma_reg, desc);
    tdma_reg.cmd_id = ids[eng_id];
    tdma_reg.wait_id_tpu = ids[BMK1880v2_TIU];
    tdma_reg.bar_en = 1;

    // printf(" %s: TDMA eng_id %d, [tdma_id=%d|wait_tiu_id=%d], dst shape(%d, %d, %d, %d)\n",
    //        __FUNCTION__, eng_id, tdma_reg.cmd_id, tdma_reg.wait_id_tpu,
    //        tdma_reg.src_n, tdma_reg.dst_c, tdma_reg.dst_h, tdma_reg.dst_w);

    emit_tdma_reg(&tdma_reg, desc);
  }
}
|
||||
|
||||
static int bm1880v2_get_engine_desc_length(uint32_t engine_id)
|
||||
{
|
||||
switch (engine_id) {
|
||||
case BMK1880v2_TIU:
|
||||
return TIU_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t);
|
||||
case BMK1880v2_TDMA:
|
||||
return TDMA_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t);
|
||||
case BMK1880v2_CPU:
|
||||
return CPU_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t);
|
||||
default:
|
||||
ASSERT(0);
|
||||
}
|
||||
}
|
||||
|
||||
// Estimate the number of command descriptor based on buffer size provided
|
||||
// by the user.
|
||||
// Estimate the number of command descriptors the user-provided cmdbuf
// can hold, assuming the worst case: every descriptor occupies the
// larger of the two engine register blocks plus one command header.
uint32_t bmk1880v2_estimate_nr_desc(ctx_t *k)
{
  uint32_t bd_len = bm1880v2_get_engine_desc_length(BMK1880v2_TIU);
  uint32_t dma_len = bm1880v2_get_engine_desc_length(BMK1880v2_TDMA);
  uint32_t hdr_len = sizeof(cmd_hdr_t);
  uint32_t worst_len = hdr_len + ((bd_len > dma_len) ? bd_len : dma_len);

  return k->info.cmdbuf_size / worst_len;
}
|
||||
|
||||
/* Initialize a kernel context: copy the caller's info, verify the chip
 * version, size the descriptor-pair table from the cmdbuf capacity, and
 * bring up the engine controller and mode manager.  Local memory
 * allocation starts at offset 0. */
static void kernel_init(ctx_t *k, bmk_info_t *info)
{
  k->info = *info;
  //1880v2->18802
  ASSERT(info->chip_version == BM1880V2_VER);
  k->chip_info = bmk1880v2_chip_info();

  uint32_t max_nr_desc = bmk1880v2_estimate_nr_desc(k);
  ec_init(&k->ec, BMK1880v2_ENGINE_NUM, max_nr_desc);
  mode_manager_init(&k->mode_manager, &k->ec, BMK1880v2_ENGINE_NUM);

  k->cmdbuf_ptr = 0;
  k->max_nr_desc = max_nr_desc;
  k->cur_nr_desc = 0;
  // xmalloc aborts on failure, so no NULL check here.
  k->desc_pairs = xmalloc(max_nr_desc * sizeof(k->desc_pairs[0]));

  k->lmem_ptr = 0;
}
|
||||
|
||||
/* Release everything kernel_init allocated: the descriptor-pair table,
 * the engine controller, and the mode manager.  The context struct
 * itself is freed by the caller (see bmk1880v2_cleanup). */
static void kernel_destroy(ctx_t *k)
{
  free(k->desc_pairs);
  ec_destroy(&k->ec);
  mode_manager_destroy(&k->mode_manager);
}
|
||||
|
||||
/* Rewind the context for a fresh command sequence: discard recorded
 * descriptors, reset the cmdbuf write pointer, and reset the engine
 * controller and mode manager.  Does not free any memory. */
static void kernel_reset(ctx_t *k)
{
  k->cur_nr_desc = 0;
  k->cmdbuf_ptr = 0;

  ec_reset(&k->ec);
  mode_manager_reset(&k->mode_manager);
}
|
||||
|
||||
/* Carve a command header plus desc_len bytes of descriptor body out of
 * the user-supplied cmdbuf, initialize the header fields, and advance
 * the write pointer.  Asserts if the cmdbuf has insufficient space. */
static cmd_hdr_t * kernel_alloc_cmd_hdr(
    ctx_t *k, uint8_t eng_id, uint32_t desc_len)
{
  uint32_t free_len = k->info.cmdbuf_size - k->cmdbuf_ptr;
  uint32_t hdr_len = sizeof(cmd_hdr_t);
  uint32_t total_len = hdr_len + desc_len;
  ASSERT(total_len <= free_len);

  cmd_hdr_t *hdr = (cmd_hdr_t *)&k->info.cmdbuf[k->cmdbuf_ptr];
  hdr->magic = CMDBUF_HDR_MAGIC_1880v2;
  hdr->len = desc_len;
  hdr->engine_id = eng_id;
  hdr->__deprecated = 0; // for valgrind
  hdr->flags = 0;
  hdr->mask = 0;

  k->cmdbuf_ptr += total_len;
  return hdr;
}
|
||||
|
||||
/* Allocate the next descriptor pair for an engine: a command header in
 * the cmdbuf plus a matching engine-controller descriptor, and register
 * the latter with the mode manager for sync-id tracking. */
static desc_pair_t * kernel_alloc_desc_pair(ctx_t *k, uint8_t eng_id)
{
  ASSERT(eng_id < BMK1880v2_ENGINE_NUM);
  ASSERT(k->cur_nr_desc < k->max_nr_desc);

  uint32_t desc_len = bm1880v2_get_engine_desc_length(eng_id);
  desc_pair_t *dp = &k->desc_pairs[k->cur_nr_desc++];
  dp->cmd_hdr = kernel_alloc_cmd_hdr(k, eng_id, desc_len);
  dp->ec_desc = ec_alloc_desc(&k->ec, eng_id);

  mode_manager_record_ec_desc(&k->mode_manager, dp->ec_desc);
  return dp;
}
|
||||
|
||||
/* Resolve the engine-controller sync graph, then patch the resulting
 * sync ids into every descriptor recorded so far. */
static void kernel_update_sync_id(ctx_t *k)
{
  ec_compute_sync_ids(&k->ec);

  for (uint32_t i = 0; i < k->cur_nr_desc; i++) {
    desc_pair_t *pair = &k->desc_pairs[i];
    replace_cmd_id((uint32_t *)pair->cmd_hdr->cmd,
                   pair->ec_desc->engine_id,
                   pair->ec_desc->sync_ids);
  }
}
|
||||
|
||||
/* Declare an explicit ordering constraint between two recorded
 * operations: `before` must complete before `after` may start.
 * Delegates to the engine controller. */
void bmk1880v2_add_dependency(
    ctx_t *ctx,
    bmk1880v2_op_t *before,
    bmk1880v2_op_t *after)
{
  ec_add_dependency(&ctx->ec, before, after);
}
|
||||
|
||||
/* Allocate a descriptor pair for the given engine.  A CPU descriptor
 * acts as a segment boundary: the pending descriptors are finalized
 * (sync ids computed and patched in) and the per-segment state is
 * restarted before the CPU descriptor itself is allocated. */
desc_pair_t * bm1880v2_get_desc_pair(ctx_t *k, uint8_t eng_id)
{
  if (eng_id == BMK1880v2_CPU) {
    kernel_update_sync_id(k);
    k->cur_nr_desc = 0;

    ec_reset(&k->ec);
    mode_manager_restart_sync_id(&k->mode_manager);
  }

  return kernel_alloc_desc_pair(k, eng_id);
}
|
||||
|
||||
/* Create a kernel context bound to the caller-provided cmdbuf described
 * by `info`.  The returned context must be released with
 * bmk1880v2_cleanup().  xmalloc aborts on allocation failure. */
ctx_t * bmk1880v2_register(bmk_info_t *info)
{
  ASSERT(info);
  ASSERT(info->cmdbuf);
  ASSERT(info->cmdbuf_size > 0);
  ctx_t *k = xmalloc(sizeof(*k));
  kernel_init(k, info);
  return k;
}
|
||||
|
||||
/* Tear down a context created by bmk1880v2_register: release its
 * internal resources, then free the context itself. */
void bmk1880v2_cleanup(ctx_t *ctx)
{
  ASSERT(ctx);

  kernel_destroy(ctx);
  free(ctx);
}
|
||||
|
||||
/* Rewind the context for a new command sequence (no memory is freed). */
void bmk1880v2_reset(ctx_t *ctx)
{
  kernel_reset(ctx);
}
|
||||
|
||||
/* Finalize the recorded commands (patch sync ids) and hand back the
 * cmdbuf.  *size receives the number of bytes written so far. */
uint8_t *bmk1880v2_acquire_cmdbuf(ctx_t *ctx, uint32_t *size)
{
  *size = ctx->cmdbuf_ptr;
  kernel_update_sync_id(ctx);
  return ctx->info.cmdbuf;
}
|
||||
|
||||
/* Enter parallel recording mode (delegates to the mode manager). */
void bmk1880v2_parallel_enable(ctx_t *ctx)
{
  mode_manager_enable_parallel(&ctx->mode_manager);
}
|
||||
|
||||
/* Attach an opaque caller-owned pointer to the context; retrieved with
 * bmk1880v2_get_op().  The kernel never dereferences it. */
void bmk1880v2_set_op(ctx_t *ctx, void* op)
{
  ctx->op = op;
}
|
||||
|
||||
/* Return the opaque pointer previously stored with bmk1880v2_set_op(). */
void* bmk1880v2_get_op(ctx_t *ctx)
{
  return ctx->op;
}
|
||||
|
||||
/* Leave parallel recording mode (delegates to the mode manager). */
void bmk1880v2_parallel_disable(ctx_t *ctx)
{
  mode_manager_disable_parallel(&ctx->mode_manager);
}
|
||||
|
||||
/* Create nr_streams recording streams in the mode manager. */
void bmk1880v2_create_streams(ctx_t *ctx, int nr_streams)
{
  mode_manager_create_streams(&ctx->mode_manager, nr_streams);
}
|
||||
|
||||
/* Record the current layer id on the context; read back via
 * bmk1880v2_layer_id(). */
void bmk1880v2_set_layer_id(ctx_t *ctx, uint16_t layer_id)
{
  ctx->layer_id = layer_id;
}
|
||||
|
||||
/* Return the layer id last set with bmk1880v2_set_layer_id(). */
uint16_t bmk1880v2_layer_id(ctx_t *ctx)
{
  return ctx->layer_id;
}
|
||||
|
||||
/* Destroy all recording streams in the mode manager. */
void bmk1880v2_destroy_streams(ctx_t *ctx)
{
  mode_manager_destroy_streams(&ctx->mode_manager);
}
|
||||
|
||||
/* Select stream i as the active recording stream. */
void bmk1880v2_set_stream(ctx_t *ctx, int i)
{
  mode_manager_set_stream(&ctx->mode_manager, i);
}
|
||||
|
||||
/* Static hardware description for the BM1880v2 TPU, taken from the
 * bm1880v2_tpu_cfg.h constants; returned by value from
 * bmk1880v2_chip_info(). */
static bmk1880v2_chip_info_t bm1880v2_chip_info = {
  .version = BM1880V2_VER,
  .npu_num = BM1880V2_HW_NPU_NUM,
  .eu_num = BM1880V2_HW_EU_NUM,
  .lmem_size = BM1880V2_HW_LMEM_SIZE,
  .lmem_banks = BM1880V2_HW_LMEM_BANKS,
  .lmem_bank_size = BM1880V2_HW_LMEM_BANK_SIZE,
  .gmem_start = BM1880V2_GLOBAL_MEM_START_ADDR,
  .gmem_size = BM1880V2_GLOBAL_MEM_SIZE,
};
|
||||
|
||||
/* Return a copy of the static BM1880v2 hardware description. */
bmk1880v2_chip_info_t bmk1880v2_chip_info(void)
{
  return bm1880v2_chip_info;
}
|
||||
|
||||
/* Allocate a local-memory tensor with the default stride for the given
 * shape/format/alignment.  Local memory is managed as a simple bump
 * allocator (k->lmem_ptr); the matching free must happen in reverse
 * allocation order (see bmk1880v2_lmem_free_tensor).
 * Returns NULL when local memory is exhausted or the size is zero. */
bmk1880v2_tensor_lmem_t * bmk1880v2_lmem_alloc_tensor(
    ctx_t *ctx,
    bmk1880v2_tensor_lmem_shape_t s,
    fmt_t fmt, int eu_align)
{
  ctx_t *k = (typeof(k))ctx;
  uint32_t lmem_size = k->chip_info.lmem_size;
  uint32_t eu_num = k->chip_info.eu_num;

  bmk1880v2_tensor_lmem_t *t = xmalloc(sizeof(*t));
  memset(t, 0, sizeof(*t));
  t->start_address = k->lmem_ptr;
  t->fmt = fmt;
  t->cmprs_fmt = fmt;
  t->shape = s;
  t->eu_align = eu_align;
  t->stride = bmk1880v2_tensor_lmem_default_stride(ctx, s, fmt, eu_align);

  /* Occupy n batches of stride.n bytes, rounded up to a whole EU row. */
  uint32_t needed = align_up(t->shape.n * t->stride.n, eu_num);
  if ((lmem_size - k->lmem_ptr < needed) || !needed) {
    free(t);
    return NULL;
  }

  k->lmem_ptr += needed;
  return t;
}
|
||||
|
||||
/* Initialize a caller-provided tensor descriptor (no local-memory
 * allocation): zero it and fill shape, format, default stride and
 * alignment.  start_address stays 0 and must be set by the caller. */
void bmk1880v2_lmem_init_tensor(
    ctx_t *ctx,
    bmk1880v2_tensor_lmem_t *tl,
    bmk1880v2_tensor_lmem_shape_t shape,
    fmt_t fmt,
    int eu_align)
{
  memset(tl, 0, sizeof(*tl));
  tl->fmt = fmt;
  tl->shape = shape;
  tl->stride = bmk1880v2_tensor_lmem_default_stride(ctx, shape, fmt, eu_align);
  tl->eu_align = eu_align;
}
|
||||
|
||||
// Provide the unified api for tensor size calculation.
|
||||
// Must have the same logic as bmk1880v2_lmem_bf16_alloc_tensor.
|
||||
// The backed does not need to duplicate the related code.
|
||||
uint32_t bmk1880v2_lmem_tensor_to_size(
|
||||
ctx_t *ctx,
|
||||
bmk1880v2_tensor_lmem_shape_t s,
|
||||
fmt_t fmt, int eu_align)
|
||||
{
|
||||
ctx_t *k = (typeof(k))ctx;
|
||||
uint32_t eu_num = k->chip_info.eu_num;
|
||||
|
||||
bmk1880v2_tensor_lmem_stride_t stride;
|
||||
stride = bmk1880v2_tensor_lmem_default_stride(ctx, s, fmt, eu_align);
|
||||
|
||||
uint32_t needed = align_up(s.n * stride.n, eu_num);
|
||||
|
||||
return needed;
|
||||
}
|
||||
|
||||
/* Allocate a tensor with extra room for a 32-bit partial sum: n is
 * temporarily scaled by 32 / bitsize(fmt) (4x for 8-bit formats, 2x for
 * BF16) so the allocation covers the ps32 data, then restored on the
 * returned descriptor.  Asserts (rather than returning NULL) when local
 * memory is exhausted. */
bmk1880v2_tensor_lmem_t * bmk1880v2_lmem_alloc_ps32_tensor(
    bmk1880v2_context_t *ctx,
    bmk1880v2_tensor_lmem_shape_t s,
    fmt_t fmt,
    int eu_align)
{
  /* Partial sum is stored in lmem in 32-bit format, so scale n by
   * 32 / bitsize(fmt) to spare space for it. */

  uint32_t prev_n;

  prev_n = s.n;
  s.n = s.n * (bitsize_of_fmt(FMT_I32) / bitsize_of_fmt(fmt));
  bmk1880v2_tensor_lmem_t *res = bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, eu_align);
  if(res == NULL)
    ASSERT(0);
  res->shape.n = prev_n;
  return res;
}
|
||||
|
||||
/* Free a tensor allocated by bmk1880v2_lmem_alloc_tensor.  The bump
 * allocator only supports LIFO frees: the pointer is rewound to the
 * tensor's start address, releasing it and anything allocated after it.
 * The assert catches out-of-order frees. */
void bmk1880v2_lmem_free_tensor(
    ctx_t *ctx, const bmk1880v2_tensor_lmem_t *t)
{
  ASSERT(t->start_address < ctx->lmem_ptr);
  ctx->lmem_ptr = t->start_address;

  free((void *)t);
}
|
||||
|
||||
/* Allocate a local-memory matrix; stride computation matches
 * bmk1880v2_matrix_lmem_default_stride.  Same bump allocator as
 * bmk1880v2_lmem_alloc_tensor; returns NULL on exhaustion.
 * NOTE(review): unlike the tensor variant, a zero-sized request
 * (needed == 0) is not rejected here — confirm whether that is
 * intentional. */
bmk1880v2_matrix_lmem_t * bmk1880v2_lmem_alloc_matrix(
    ctx_t *ctx,
    bmk1880v2_matrix_lmem_shape_t s,
    fmt_t fmt,
    int eu_align)
{
  uint32_t lmem_size = ctx->chip_info.lmem_size;
  uint32_t npu_num = ctx->chip_info.npu_num;
  uint32_t eu_num = ctx->chip_info.eu_num;
  uint32_t val = (fmt == FMT_BF16) ? 2 : 1;  // bytes per element

  bmk1880v2_matrix_lmem_t *t = xmalloc(sizeof(*t));
  memset(t, 0, sizeof(*t));
  t->start_address = ctx->lmem_ptr;
  t->fmt = fmt;
  t->shape = s;
  t->stride.h = s.w * val;
  if (eu_align)
    t->stride.c = align_up(s.w * val, eu_num);
  else
    t->stride.c = s.w * val;
  /* Channels are distributed round-robin over the NPU lanes. */
  t->stride.n = t->stride.c * ceiling_func(s.c, npu_num);
  t->eu_align = eu_align;

  uint32_t needed = align_up(t->shape.n * t->stride.n, eu_num);
  if (lmem_size - ctx->lmem_ptr < needed) {
    free(t);
    return NULL;
  }
  ctx->lmem_ptr += needed;
  return t;
}
|
||||
|
||||
/* Initialize a caller-provided matrix descriptor (no local-memory
 * allocation): zero it and fill shape, format, default stride and
 * alignment.  start_address stays 0 and must be set by the caller. */
void bmk1880v2_lmem_init_matrix(
    ctx_t *ctx,
    bmk1880v2_matrix_lmem_t *ml,
    bmk1880v2_matrix_lmem_shape_t shape,
    fmt_t fmt,
    int eu_align)
{
  memset(ml, 0, sizeof(*ml));
  ml->fmt = fmt;
  ml->shape = shape;
  ml->stride = bmk1880v2_matrix_lmem_default_stride(ctx, shape, fmt, eu_align);
  ml->eu_align = eu_align;
}
|
||||
|
||||
// Provide the unified api for matrix size calculation.
|
||||
// Must have the same logic as bmk1880v2_lmem_alloc_matrix.
|
||||
// The backed does not need to duplicate the related code.
|
||||
uint32_t bmk1880v2_lmem_matrix_to_size(
|
||||
ctx_t *ctx,
|
||||
bmk1880v2_matrix_lmem_shape_t s,
|
||||
fmt_t fmt,
|
||||
int eu_align) {
|
||||
uint32_t npu_num = ctx->chip_info.npu_num;
|
||||
uint32_t eu_num = ctx->chip_info.eu_num;
|
||||
uint32_t val = (fmt == FMT_BF16) ? 2 : 1;
|
||||
|
||||
bmk1880v2_matrix_lmem_t t;
|
||||
t.fmt = fmt;
|
||||
t.shape = s;
|
||||
t.stride.h = s.w * val;
|
||||
if (eu_align)
|
||||
t.stride.c = align_up(s.w * val, eu_num);
|
||||
else
|
||||
t.stride.c = s.w * val;
|
||||
t.stride.n = t.stride.c * ceiling_func(s.c, npu_num);
|
||||
|
||||
uint32_t needed = align_up(t.shape.n * t.stride.n, eu_num);
|
||||
|
||||
return needed;
|
||||
}
|
||||
|
||||
/* Allocate a matrix with extra room for a 32-bit partial sum: n is
 * temporarily scaled by 32 / bitsize(fmt) (4x for 8-bit formats, 2x for
 * BF16) so the allocation covers the ps32 data, then restored on the
 * returned descriptor.  Asserts (rather than returning NULL) when local
 * memory is exhausted. */
bmk1880v2_matrix_lmem_t * bmk1880v2_lmem_alloc_ps32_matrix(
    bmk1880v2_context_t *ctx,
    bmk1880v2_matrix_lmem_shape_t s,
    fmt_t fmt,
    int eu_align)
{
  /* Partial sum is stored in lmem in 32-bit format, so scale n by
   * 32 / bitsize(fmt) to spare space for it. */

  uint32_t prev_n;

  prev_n = s.n;
  s.n = s.n * (bitsize_of_fmt(FMT_I32) / bitsize_of_fmt(fmt));
  bmk1880v2_matrix_lmem_t *res = bmk1880v2_lmem_alloc_matrix(ctx, s, fmt, eu_align);
  if(res == NULL)
    ASSERT(0);
  res->shape.n = prev_n;
  return res;
}
|
||||
|
||||
// Provide the unified api for matrix size calculation.
|
||||
// Must have the same logic as bmk1880v2_lmem_alloc_ps32_bf16_matrix.
|
||||
// The backed does not need to duplicate the related code.
|
||||
// Provide the unified api for ps32 matrix size calculation.
// Must have the same logic as bmk1880v2_lmem_alloc_ps32_matrix.
// The backend does not need to duplicate the related code.
uint32_t bmk1880v2_lmem_ps32_matrix_to_size(
    bmk1880v2_context_t *ctx,
    bmk1880v2_matrix_lmem_shape_t s,
    fmt_t fmt,
    int eu_align)
{
  /* Partial sum is stored in lmem in 32-bit format, so scale n by
   * 32 / bitsize(fmt) to spare space for it. */

  s.n = s.n * (bitsize_of_fmt(FMT_I32) / bitsize_of_fmt(fmt));

  return bmk1880v2_lmem_matrix_to_size(ctx, s, fmt, eu_align);
}
|
||||
|
||||
/* Free a matrix allocated by bmk1880v2_lmem_alloc_matrix.  LIFO only:
 * rewinds the bump pointer to the matrix's start address, releasing it
 * and anything allocated after it.  The assert catches out-of-order
 * frees. */
void bmk1880v2_lmem_free_matrix(
    ctx_t *ctx, const bmk1880v2_matrix_lmem_t *t)
{
  ASSERT(t->start_address < ctx->lmem_ptr);
  ctx->lmem_ptr = t->start_address;
  free((void *)t);
}
|
||||
|
||||
/* Compute the default local-memory strides (in bytes) for a tensor:
 * elements are 2 bytes for BF16 and 1 otherwise; the channel stride is
 * optionally rounded up to a whole EU row; the batch stride accounts
 * for channels being distributed round-robin over the NPU lanes. */
bmk1880v2_tensor_lmem_stride_t bmk1880v2_tensor_lmem_default_stride(
    ctx_t *ctx,
    bmk1880v2_tensor_lmem_shape_t s,
    fmt_t fmt_type,
    int eu_align)
{
  bmk1880v2_tensor_lmem_stride_t stride;
  uint32_t eu_num = ctx->chip_info.eu_num;
  uint32_t npu_num = ctx->chip_info.npu_num;
  uint32_t fmt = (fmt_type == FMT_BF16) ? 2 : 1;  // bytes per element
  stride.w = fmt;
  stride.h = s.w * fmt;
  if (eu_align)
    stride.c = align_up(s.h * s.w * fmt, eu_num);
  else
    stride.c = s.h * s.w * fmt;

  stride.n = stride.c * ceiling_func(s.c, npu_num);
  // printf("bmk1880v2_tensor_lmem_default_stride stride n=%x c=%x h=%x w=%x\n", stride.n , stride.c , stride.h, stride.w);
  return stride;
}
|
||||
|
||||
bmk1880v2_tensor_tgmem_stride_t bmk1880v2_tensor_tgmem_default_stride(
|
||||
bmk1880v2_tensor_tgmem_shape_t s, fmt_t fmt)
|
||||
{
|
||||
uint32_t data_type_size = (fmt == FMT_BF16) ? 2 : 1;
|
||||
bmk1880v2_tensor_tgmem_stride_t stride;
|
||||
stride.h = s.w * data_type_size;
|
||||
stride.c = s.h * stride.h;
|
||||
stride.n = s.c * stride.c;
|
||||
return stride;
|
||||
}
|
||||
|
||||
/* Given a matrix shape with n and col set, pick w and c so the columns
 * map efficiently onto the hardware: each lane processes `workingNumber`
 * elements per EU pass (half as many for BF16 since elements are twice
 * as wide).  Wide matrices are spread over lanes in multiples of
 * workingNumber; narrow ones keep a single channel. */
static void try_optimize_matrix_shape(ctx_t *ctx,
    bmk1880v2_matrix_lmem_shape_t *s,
    fmt_t fmt_type) {
  uint32_t eu_num = ctx->chip_info.eu_num;
  uint32_t npu_num = ctx->chip_info.npu_num;
  uint32_t col = s->col;
  bool isBf16 = (fmt_type == FMT_BF16);
  uint32_t workingNumber = isBf16 ? eu_num / 2 : eu_num;

  if (col >= workingNumber) {
    int num_eu = ceiling_func(col, workingNumber * npu_num);
    s->w = workingNumber * num_eu;
    s->c = ceiling_func(col, s->w);
  } else {
    // col < EU_NUM
    // Only transfer needed data
    // We still change tensor shape in TIU mac op
    s->w = col;
    s->c = 1;
  }
}
|
||||
|
||||
bmk1880v2_matrix_lmem_shape_t bmk1880v2_matrix_lmem_default_shape(
|
||||
ctx_t *ctx,
|
||||
uint32_t row,
|
||||
uint32_t col,
|
||||
fmt_t fmt_type)
|
||||
{
|
||||
bmk1880v2_matrix_lmem_shape_t s = {0};
|
||||
s.n = row;
|
||||
s.col = col;
|
||||
|
||||
try_optimize_matrix_shape(ctx, &s, fmt_type);
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
/* Shape a flat vector of `len` elements as a matrix that fits local
 * memory: start as a 1 x len row and repeatedly halve the column count
 * (doubling the row count) until one row fits below lmem_size.  Asserts
 * that len is evenly divisible at each halving step.  w and c are then
 * chosen by try_optimize_matrix_shape. */
bmk1880v2_matrix_lmem_shape_t bmk1880v2_matrix_lmem_shape_t1(
    ctx_t *ctx,
    uint32_t len,
    fmt_t fmt_type)
{
  uint32_t lmem_size = ctx->chip_info.lmem_size;
  bmk1880v2_matrix_lmem_shape_t s = {0};

  uint32_t row = 1;
  uint32_t col = len;

  while (col >= lmem_size) {
    ASSERT(col % 2 == 0);
    col /= 2;
    row *= 2;
  }

  s.n = row;
  s.col = col;

  try_optimize_matrix_shape(ctx, &s, fmt_type);
  return s;
}
|
||||
|
||||
// This should be inside bmk1880v2_lmem_alloc_matrix
|
||||
// This should be inside bmk1880v2_lmem_alloc_matrix
/* Default local-memory strides (bytes) for a matrix: 2-byte elements
 * for BF16, 1-byte otherwise; channel stride optionally rounded up to a
 * whole EU row; row stride accounts for channels distributed over the
 * NPU lanes.  Must stay in sync with bmk1880v2_lmem_alloc_matrix. */
bmk1880v2_matrix_lmem_stride_t bmk1880v2_matrix_lmem_default_stride(
    ctx_t *ctx,
    bmk1880v2_matrix_lmem_shape_t s,
    fmt_t fmt,
    int eu_align)
{
  uint32_t npu_num = ctx->chip_info.npu_num;
  uint32_t eu_num = ctx->chip_info.eu_num;
  uint32_t val = (fmt == FMT_BF16) ? 2 : 1;  // bytes per element

  bmk1880v2_matrix_lmem_stride_t stride;
  stride.h = s.w * val;
  if (eu_align)
    stride.c = align_up(s.w * val, eu_num);
  else
    stride.c = s.w * val;
  stride.n = stride.c * ceiling_func(s.c, npu_num);

  return stride;
}
|
||||
372
cvikernel/src/bm1880v2/kernel_1880v2.h
Normal file
372
cvikernel/src/bm1880v2/kernel_1880v2.h
Normal file
@ -0,0 +1,372 @@
|
||||
#ifndef KERNEL_1880v2_H
|
||||
#define KERNEL_1880v2_H
|
||||
|
||||
#include "kernel_internal.h"
|
||||
|
||||
#include <bmkernel/bm1880v2/bmkernel_1880v2.h>
|
||||
#include <bmkernel/bm1880v2/bm1880v2_tiu_reg.h>
|
||||
#include <bmkernel/bm1880v2/bm1880v2_tdma_reg.h>
|
||||
#include <bmkernel/bm1880v2/bm1880v2_tpu_cfg.h>
|
||||
#include <bmkernel/reg_tiu.h>
|
||||
#include <bmkernel/reg_bdcast.h>
|
||||
#include <bmkernel/reg_tdma.h>
|
||||
#include "bmkernel_standard.h"
|
||||
|
||||
#include <cvikernel/cvikernel.h>
|
||||
|
||||
#define TENSOR_MUL_FIX8B 0
|
||||
#define TENSOR_MAC_FIX8B 1
|
||||
#define TENSOR_ADD_FIX8B 2
|
||||
#define TENSOR_SUB_FIX8B 3
|
||||
#define TENSOR_MAX_FIX8B 4
|
||||
#define TENSOR_MIN_FIX8B 5
|
||||
#define TENSOR_SHIFT_FIX8B 6
|
||||
#define TENSOR_AND_FIX8B 7
|
||||
#define TENSOR_OR_FIX8B 8
|
||||
#define TENSOR_XOR_FIX8B 9
|
||||
#define TENSOR_COPY_FIX8B 10
|
||||
|
||||
typedef bmk1880v2_tensor_lmem_shape_t tl_shape_t;
|
||||
typedef bmk1880v2_matrix_lmem_shape_t ml_shape_t;
|
||||
typedef bmk1880v2_tensor_tgmem_shape_t tg_shape_t;
|
||||
typedef bmk1880v2_matrix_tgmem_shape_t mg_shape_t;
|
||||
|
||||
typedef bmk1880v2_tensor_lmem_stride_t tl_stride_t;
|
||||
|
||||
typedef bmk1880v2_tensor_lmem_t tl_t;
|
||||
typedef bmk1880v2_matrix_lmem_t ml_t;
|
||||
typedef bmk1880v2_tensor_tgmem_t tg_t;
|
||||
typedef bmk1880v2_matrix_tgmem_t mg_t;
|
||||
typedef bmk1880v2_compressed_tensor_tgmem_t compressed_tg_t;
|
||||
typedef bmk1880v2_compressed_matrix_tgmem_t compressed_mg_t;
|
||||
|
||||
desc_pair_t * bm1880v2_get_desc_pair(ctx_t *k, uint8_t eng_id);
|
||||
|
||||
/* Assert that two local-memory tensors have identical strides on all
 * four dimensions. */
static inline void assert_same_stride(const tl_t *a, const tl_t *b)
{
  ASSERT(a->stride.n == b->stride.n);
  ASSERT(a->stride.c == b->stride.c);
  ASSERT(a->stride.h == b->stride.h);
  ASSERT(a->stride.w == b->stride.w);
}
|
||||
|
||||
/* Assert that two local-memory tensors have identical shapes on all
 * four dimensions. */
static inline void assert_same_shape(const tl_t *a, const tl_t *b)
{
  ASSERT(a->shape.n == b->shape.n);
  ASSERT(a->shape.c == b->shape.c);
  ASSERT(a->shape.h == b->shape.h);
  ASSERT(a->shape.w == b->shape.w);
}
|
||||
|
||||
static inline void assert_same_shape_3(
|
||||
const tl_t *a,
|
||||
const tl_t *b,
|
||||
const tl_t *c)
|
||||
{
|
||||
assert_same_shape(a, b);
|
||||
assert_same_shape(a, c);
|
||||
}
|
||||
|
||||
static inline void assert_same_shape_4(
|
||||
const tl_t *a,
|
||||
const tl_t *b,
|
||||
const tl_t *c,
|
||||
const tl_t *d)
|
||||
{
|
||||
assert_same_shape_3(a, b, c);
|
||||
assert_same_shape(a, d);
|
||||
}
|
||||
|
||||
static inline void assert_same_shape_5(
|
||||
const tl_t *t0,
|
||||
const tl_t *t1,
|
||||
const tl_t *t2,
|
||||
const tl_t *t3,
|
||||
const tl_t *t4)
|
||||
{
|
||||
assert_same_shape_3(t0, t1, t2);
|
||||
assert_same_shape_3(t0, t3, t4);
|
||||
}
|
||||
|
||||
static inline void assert_same_shape_6(
|
||||
const tl_t *t0,
|
||||
const tl_t *t1,
|
||||
const tl_t *t2,
|
||||
const tl_t *t3,
|
||||
const tl_t *t4,
|
||||
const tl_t *t5)
|
||||
{
|
||||
assert_same_shape_5(t0, t1, t2, t3, t4);
|
||||
assert_same_shape(t0, t5);
|
||||
}
|
||||
|
||||
static inline void assert_tiu_tensor_shape(const tl_t *t)
{
  /* Every dimension of a TIU operand must be non-zero... */
  ASSERT(t->shape.n > 0);
  ASSERT(t->shape.c > 0);
  ASSERT(t->shape.h > 0);
  ASSERT(t->shape.w > 0);

  /* ...and must fit the hardware's 12-bit shape fields; h/w additionally
   * reserve 32 entries for the lanes. */
  ASSERT(t->shape.n < 0x1000);
  ASSERT(t->shape.c < 0x1000);
  ASSERT(t->shape.h <= (4095-32)); // 12bit, max 4095-32(lanes)
  ASSERT(t->shape.w <= (4095-32)); // 12bit, max 4095-32(lanes)
}
|
||||
|
||||
static inline void check_tiu_tensor(const tl_t *t)
{
  /* A TIU operand must be non-NULL, have a hardware-legal shape, and use
   * one of the element formats the TIU accepts. */
  ASSERT(t);
  assert_tiu_tensor_shape(t);
  ASSERT(t->fmt == FMT_I8 || t->fmt == FMT_U8 || t->fmt == FMT_BF16);
}
|
||||
|
||||
static inline void check_tiu_tensor_2(
|
||||
const tl_t *t0,
|
||||
const tl_t *t1)
|
||||
{
|
||||
check_tiu_tensor(t0);
|
||||
check_tiu_tensor(t1);
|
||||
}
|
||||
|
||||
static inline void check_tiu_tensor_3(
|
||||
const tl_t *t0,
|
||||
const tl_t *t1,
|
||||
const tl_t *t2)
|
||||
{
|
||||
check_tiu_tensor(t0);
|
||||
check_tiu_tensor_2(t1, t2);
|
||||
}
|
||||
|
||||
static inline void check_tiu_tensor_4(
|
||||
const tl_t *t0,
|
||||
const tl_t *t1,
|
||||
const tl_t *t2,
|
||||
const tl_t *t3)
|
||||
{
|
||||
check_tiu_tensor_3(t0, t1, t2);
|
||||
check_tiu_tensor(t3);
|
||||
}
|
||||
|
||||
static inline void check_tiu_tensor_5(
|
||||
const tl_t *t0,
|
||||
const tl_t *t1,
|
||||
const tl_t *t2,
|
||||
const tl_t *t3,
|
||||
const tl_t *t4)
|
||||
{
|
||||
check_tiu_tensor_3(t0, t1, t2);
|
||||
check_tiu_tensor_2(t3, t4);
|
||||
}
|
||||
|
||||
static inline void check_tiu_tensor_6(
|
||||
const tl_t *t0,
|
||||
const tl_t *t1,
|
||||
const tl_t *t2,
|
||||
const tl_t *t3,
|
||||
const tl_t *t4,
|
||||
const tl_t *t5)
|
||||
{
|
||||
check_tiu_tensor_3(t0, t1, t2);
|
||||
check_tiu_tensor_3(t3, t4, t5);
|
||||
}
|
||||
|
||||
static inline void check_16bit_tiu_tensor(const tl_t *low, const tl_t *high)
{
  /*
   * A 16-bit quantity is represented as two 8-bit tensors (low/high bytes).
   * Both halves must be TIU-legal, identical in shape, stride and format,
   * and the low half must sit at a lower local-memory address.
   */
  check_tiu_tensor_2(low, high);
  assert_same_shape(low, high);
  assert_same_stride(low, high);
  ASSERT(low->fmt == high->fmt);
  ASSERT(low->start_address < high->start_address);
}
|
||||
|
||||
static inline void assert_stride_type_0(ctx_t *ctx, const tl_t *t)
{
  /*
   * Stride type 0: contiguous, EU-aligned layout.
   * w-stride equals the element size, h-stride equals one row in bytes,
   * and c-stride is the h*w plane rounded up to the EU count.
   */
  uint32_t eu_num = ctx->chip_info.eu_num;
  uint32_t fmt = (t->fmt == FMT_BF16) ? 2 : 1;  /* element size in bytes */

  uint32_t h = t->shape.h;
  uint32_t w = t->shape.w * fmt;  /* row size in bytes */
  uint32_t c_stride = align_up(h * w, eu_num);

  ASSERT(t->stride.c == c_stride);
  ASSERT(t->stride.h == w);
  ASSERT(t->stride.w == fmt);
}
|
||||
|
||||
static inline void assert_bf16_stride_type_0(ctx_t *ctx, const tl_t *t)
{
  /* Relaxed type-0 check: c-stride must be EU-aligned and w-stride must
   * equal the element size (2 bytes for bf16, 1 otherwise). */
  uint32_t elt_size = (t->fmt == FMT_BF16) ? 2 : 1;

  ASSERT(t->stride.c % ctx->chip_info.eu_num == 0);
  ASSERT(t->stride.w == elt_size);
}
|
||||
|
||||
|
||||
static inline void assert_stride_type_2(ctx_t *ctx, const tl_t *t)
{
  /* Stride type 2 is only defined for tensors with h == w == 1. */
  ASSERT(t->shape.h == 1);
  ASSERT(t->shape.w == 1);

  uint32_t fmt = (t->fmt == FMT_BF16) ? 2 : 1;  /* element size in bytes */
  uint32_t c = t->shape.c;
  uint32_t npu_num = ctx->chip_info.npu_num;

  /* n-stride spans the lane-aligned channel count; everything else is
   * one element apart. */
  ASSERT(t->stride.n == fmt * align_up(c, npu_num) / npu_num);
  ASSERT(t->stride.c == 1 * fmt);
  ASSERT(t->stride.h == 1 * fmt);
  ASSERT(t->stride.w == 1 * fmt);
}
|
||||
|
||||
static inline void assert_bf16_stride_type_2(ctx_t *ctx, const tl_t *t)
{
  /* Stride type 2 is only defined for tensors with h == w == 1.
   * NOTE(review): currently identical to assert_stride_type_2(). */
  ASSERT(t->shape.h == 1);
  ASSERT(t->shape.w == 1);

  uint32_t elt = (t->fmt == FMT_BF16) ? 2 : 1;
  uint32_t lanes = ctx->chip_info.npu_num;

  /* n-stride spans the lane-aligned channel count; the remaining strides
   * are one element apart. */
  ASSERT(t->stride.n == elt * align_up(t->shape.c, lanes) / lanes);
  ASSERT(t->stride.c == elt);
  ASSERT(t->stride.h == elt);
  ASSERT(t->stride.w == elt);
}
|
||||
|
||||
static inline int tensor_is_signed(const tl_t *t)
|
||||
{
|
||||
switch (t->fmt) {
|
||||
case FMT_I8:
|
||||
return 1;
|
||||
case FMT_U8:
|
||||
case FMT_BF16: //does not matter, so set to default 0
|
||||
return 0;
|
||||
default:
|
||||
ASSERT(0);
|
||||
}
|
||||
}
|
||||
|
||||
static inline int matrix_is_signed(const ml_t *t)
|
||||
{
|
||||
switch (t->fmt) {
|
||||
case FMT_I8:
|
||||
return 1;
|
||||
case FMT_U8:
|
||||
case FMT_BF16: //does not matter, so set to default 0
|
||||
return 0;
|
||||
default:
|
||||
ASSERT(0);
|
||||
}
|
||||
}
|
||||
|
||||
/* Broadcast one n/c/h/w shape to all three operands and the result of a
 * TIU register image (used by element-wise ops where every tensor has the
 * same shape). */
static inline void fill_same_tensor_shape(tiu_reg_t *r, tl_shape_t s)
{
  uint32_t n = s.n;
  uint32_t c = s.c;
  uint32_t h = s.h;
  uint32_t w = s.w;

  r->opd0_n = n;
  r->opd0_c = c;
  r->opd0_h = h;
  r->opd0_w = w;

  r->opd1_n = n;
  r->opd1_c = c;
  r->opd1_h = h;
  r->opd1_w = w;

  r->opd2_n = n;
  r->opd2_c = c;
  r->opd2_h = h;
  r->opd2_w = w;

  r->res0_n = n;
  r->res0_c = c;
  r->res0_h = h;
  r->res0_w = w;
}
|
||||
|
||||
static inline void assert_stride_range(tl_stride_t s)
{
  /* n/c/h strides must fit 16-bit register fields; the w stride is not
   * checked here. */
  ASSERT(s.n < 0x10000);
  ASSERT(s.c < 0x10000);
  ASSERT(s.h < 0x10000);
}
|
||||
|
||||
/* Broadcast one stride tuple to all three operands and the result of a TIU
 * register image. The w stride is hard-coded to 1 (element-contiguous). */
static inline void fill_same_tensor_stride(tiu_reg_t *r, tl_stride_t s)
{
  uint32_t n = s.n;
  uint32_t c = s.c;
  uint32_t h = s.h;
  uint32_t w = 1;  /* innermost dimension is always contiguous here */

  r->opd0_n_str = n;
  r->opd0_c_str = c;
  r->opd0_h_str = h;
  r->opd0_w_str = w;

  r->opd1_n_str = n;
  r->opd1_c_str = c;
  r->opd1_h_str = h;
  r->opd1_w_str = w;

  r->opd2_n_str = n;
  r->opd2_c_str = c;
  r->opd2_h_str = h;
  r->opd2_w_str = w;

  r->res0_n_str = n;
  r->res0_c_str = c;
  r->res0_h_str = h;
  r->res0_w_str = w;
}
|
||||
|
||||
/*
 * Copy an n/c/h/w stride tuple into the register fields of operand `op`.
 * `op` is token-pasted into the field names (opd0/opd1/opd2/res0).
 */
#define fill_stride_code(r, op, str) \
  do { \
    r->op##_n_str = str->n; \
    r->op##_c_str = str->c; \
    r->op##_h_str = str->h; \
    r->op##_w_str = str->w; \
  } while (0)

/* Per-operand convenience wrappers around fill_stride_code(). */

static inline void fill_opd0_stride(tiu_reg_t *r, const tl_stride_t *str)
{
  fill_stride_code(r, opd0, str);
}

static inline void fill_opd1_stride(tiu_reg_t *r, const tl_stride_t *str)
{
  fill_stride_code(r, opd1, str);
}

static inline void fill_opd2_stride(tiu_reg_t *r, const tl_stride_t *str)
{
  fill_stride_code(r, opd2, str);
}

static inline void fill_res0_stride(tiu_reg_t *r, const tl_stride_t *str)
{
  fill_stride_code(r, res0, str);
}
|
||||
|
||||
static inline void fill_same_tensor_stride_type(tiu_reg_t *r, int type)
{
  /* Broadcast the same 2-bit stride-type code to all operands and the
   * result. */
  int code = type & 0b11;

  r->short_opd0_str = code;
  r->short_opd1_str = code;
  r->short_opd2_str = code;
  r->short_res0_str = code;
}
|
||||
|
||||
/*
 * Allocate the next TIU command/engine descriptor pair, serialize the
 * register image into its command buffer, and return the engine descriptor.
 */
static inline ec_desc_t * emit_tiu_cmdbuf(ctx_t *k, tiu_reg_t *r)
{
  int engine_id = BMK1880v2_TIU;

  desc_pair_t *dp = bm1880v2_get_desc_pair(k, engine_id);
  uint32_t *cmdbuf = (uint32_t *)dp->cmd_hdr->cmd;
  emit_tiu_reg(r, cmdbuf);

  return dp->ec_desc;
}
|
||||
|
||||
#endif /* KERNEL_1880v2_H */
|
||||
1201
cvikernel/src/bm1880v2/non_atomic/common.c
Normal file
1201
cvikernel/src/bm1880v2/non_atomic/common.c
Normal file
File diff suppressed because it is too large
Load Diff
49
cvikernel/src/bm1880v2/non_atomic/fp32_bf16_kernel.c
Normal file
49
cvikernel/src/bm1880v2/non_atomic/fp32_bf16_kernel.c
Normal file
@ -0,0 +1,49 @@
|
||||
#include "../kernel_1880v2.h"
|
||||
|
||||
// only fill base_reg_index/int8_rnd_mode
|
||||
static void init_tgmem(bmk1880v2_tensor_tgmem_t* t) {
  /* Only base_reg_index/int8_rnd_mode are defaulted; fmt, shape, stride
   * and start_address are set by the caller. */
  t->base_reg_index = 0;
  t->int8_rnd_mode = 0;
}
|
||||
|
||||
/*
 * System-to-system fp32 -> bf16 conversion by truncation: each fp32 value's
 * upper two bytes ARE its bf16 representation, so a strided TG-to-TG TDMA
 * copy that reads every other 2-byte element (offset by +2) produces the
 * bf16 output without any arithmetic.
 *
 * gaddr_fp32/fp32_shape : source fp32 tensor (w must be even; the fp32
 *                         data is viewed as pairs of bf16 halves)
 * gaddr_bf16/bf16_shape : destination bf16 tensor
 * fmt                   : must be FMT_BF16
 * Returns 0.
 */
int bf16_s2s_fp32_bf16(bmk1880v2_context_t* ctx, uint64_t gaddr_fp32,
    bmk1880v2_tensor_tgmem_shape_t fp32_shape, uint64_t gaddr_bf16,
    bmk1880v2_tensor_tgmem_shape_t bf16_shape, fmt_t fmt) {
  int ret = 0;
  ASSERT(fmt == FMT_BF16 && "only support FMT_BF16");
  ASSERT(fp32_shape.w % 2 == 0 && "fp32's w MUST align with 2");

  bmk1880v2_tdma_tg2tg_tensor_copy_param_t p;

  bmk1880v2_tensor_tgmem_t src, dst;

  init_tgmem(&src);
  init_tgmem(&dst);

  /* One fp32 element spans two bf16-sized slots; we keep only the high one. */
  int fp32_w = 2;
  src.fmt = fmt;
  src.start_address = gaddr_fp32 + fp32_w; // copy from high part
  /* Flatten h*w into h with w == 1 so the h stride can skip the low halves. */
  src.shape = fp32_shape;
  src.shape.h = fp32_shape.w * fp32_shape.h / fp32_w;
  src.shape.w = 1;

  int fmt_sz = ceiling_bytesize_of(bitsize_of_fmt(fmt));
  src.stride.n = fp32_shape.w * fp32_shape.h * fp32_shape.c * fmt_sz;
  src.stride.c = fp32_shape.w * fp32_shape.h * fmt_sz;
  src.stride.h = fp32_w * fmt_sz;  /* step over one whole fp32 per row */

  /* Destination is densely packed bf16 with the same flattening. */
  dst.fmt = fmt;
  dst.start_address = gaddr_bf16;
  dst.shape = bf16_shape;
  dst.shape.h = bf16_shape.w * bf16_shape.h / fp32_w;
  dst.shape.w = 1;
  dst.stride = bmk1880v2_tensor_tgmem_default_stride(dst.shape, fmt);

  memset(&p, 0, sizeof(p));
  p.src = &src;
  p.dst = &dst;

  bmk1880v2_tdma_tg2tg_bf16_tensor_copy(ctx, &p);

  return ret;
}
|
||||
182
cvikernel/src/bm1880v2/non_atomic/gen_lut.h
Normal file
182
cvikernel/src/bm1880v2/non_atomic/gen_lut.h
Normal file
@ -0,0 +1,182 @@
|
||||
#ifndef GEN_LUT_1880v2_H
|
||||
#define GEN_LUT_1880v2_H
|
||||
|
||||
#include "../kernel_1880v2.h"
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
#define IN
|
||||
#define OUT
|
||||
/* Geometry of the bf16 lookup tables used by the non-atomic kernels. */

static inline int bf16_exp_start()
{
  /* Smallest bf16 exponent the exponent tables cover. */
  return -62;
}
static inline int bf16_exp_end()
{
  /* Largest bf16 exponent the exponent tables cover. */
  return 63;
}
static inline int bf16_table_h()
{
  /* Table tile height in entries. */
  return 32;
}
static inline int bf16_table_w()
{
  /* Table tile width in entries. */
  return 8;
}
static inline int bf16_table_hw()
{
  /* Entries per table tile (32 * 8 = 256). */
  return bf16_table_h() * bf16_table_w();
}
static inline int half_h_table()
{
  /* Half a table tile (128 entries). */
  return bf16_table_h() * bf16_table_w() / 2;
}
|
||||
/* Return non-zero when `s` has the fixed 32x8 lookup-table tile shape.
 * Note the assert fires first, so in debug builds a wrong shape aborts
 * rather than returning 0. */
static inline uint8_t is_1880v2_tbl_shape(bmk1880v2_tensor_lmem_shape_t *s)
{
  // FIXME: h could be reduce less than 32
  assert(s->h == (uint32_t)bf16_table_h() && s->w == (uint32_t)bf16_table_w() &&
         "table h/w should be 32/8");

  return s->h == (uint32_t)bf16_table_h() && s->w == (uint32_t)bf16_table_w();
}
|
||||
|
||||
// <! end copy from bmkernel/src/kernel_internal.h
|
||||
static inline int bytesize_of_fmt(fmt_t fmt)
{
  /* Convert the format's bit width into whole bytes. */
  int bits = bitsize_of_fmt(fmt);
  return bits / 8;
}
|
||||
|
||||
// duplicate from 1880v2_test_util.h
|
||||
static inline uint64_t tl_shape_size(const bmk1880v2_tensor_lmem_shape_t *s)
|
||||
{
|
||||
return (uint64_t)s->n * s->c * s->h * s->w;
|
||||
}
|
||||
|
||||
// copy bmk1880v2_tensor_lmem_t structure
|
||||
/* Shallow, field-by-field copy of a local-memory tensor descriptor. */
static inline void bmk1880v2_tensor_lmem_s_copy(bmk1880v2_tensor_lmem_t *dst,
                                                bmk1880v2_tensor_lmem_t *src)
{
  dst->fmt           = src->fmt;
  dst->start_address = src->start_address;
  dst->shape         = src->shape;
  dst->stride        = src->stride;
  dst->int8_rnd_mode = src->int8_rnd_mode;
}
|
||||
|
||||
/*
 * Reinterpret a bf16 tensor descriptor as an 8-bit one covering the same
 * bytes: w is doubled (each bf16 element is two bytes) and the stride is
 * recomputed as the default layout for the 8-bit shape.
 */
static inline void
bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx_t *ctx, bmk1880v2_tensor_lmem_t *dst,
                                    bmk1880v2_tensor_lmem_t *src, fmt_t fmt)
{
  assert(src->fmt == FMT_BF16 && (fmt == FMT_I8 || fmt == FMT_U8) &&
         "only support bf16->i8/uint8_t, plz check fmt\n");

  dst->start_address = src->start_address;
  dst->fmt = fmt;
  dst->shape = src->shape;
  dst->shape.w *= 2;  /* one bf16 element = two 8-bit elements */
  dst->stride = bmk1880v2_tensor_lmem_default_stride(ctx, dst->shape,
                                                     fmt, CTRL_NULL);
  /* (kept) earlier experiment doubling h instead of w: */
  // dst->shape.h *= 2;
  // dst->stride = bmk1880v2_tensor_lmem_default_stride(ctx, dst->shape,
  //                                                    /*eu_align*/ 1,
  //                                                    fmt);
  // dst->shape.h = src->shape.h;
  dst->int8_rnd_mode = src->int8_rnd_mode;
}
|
||||
|
||||
// l2l means we keep the same shape between bf16/(u)int8
|
||||
/*
 * Reinterpret a bf16 tensor descriptor as an 8-bit one with the SAME
 * logical shape ("l2l": element count is preserved, not byte coverage);
 * only the format and the default stride for that format change.
 */
static inline void
bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(ctx_t *ctx,
                                        bmk1880v2_tensor_lmem_t *dst,
                                        bmk1880v2_tensor_lmem_t *src, fmt_t fmt)
{
  assert(src->fmt == FMT_BF16 && (fmt == FMT_I8 || fmt == FMT_U8) &&
         "only support bf16->i8/uint8_t, plz check fmt\n");

  dst->start_address = src->start_address;
  dst->fmt = fmt;
  dst->shape = src->shape;
  dst->stride = bmk1880v2_tensor_lmem_default_stride(ctx, dst->shape,
                                                     fmt, CTRL_NULL);
  dst->int8_rnd_mode = src->int8_rnd_mode;
}
|
||||
|
||||
int bf16_emit_square(ctx_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16, fmt_t fmt);
|
||||
|
||||
void bf16_table_check(bmk1880v2_tensor_lmem_t *IN tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tbl_answer,
|
||||
bmk1880v2_tensor_lmem_t *tbl_answer_mantissa,
|
||||
bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16);
|
||||
|
||||
int bf16_lut_exp_mantissa(ctx_t *ctx, bmk1880v2_tensor_lmem_t *IN tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *IN tl_buf,
|
||||
bmk1880v2_tensor_lmem_t *tbl_answer,
|
||||
bmk1880v2_tensor_lmem_t *tbl_answer_mantissa,
|
||||
bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16);
|
||||
|
||||
void bf16_get_u8_tbl_idx(ctx_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16);
|
||||
|
||||
void bf16_get_dec(ctx_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf,
|
||||
bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16);
|
||||
|
||||
void bf16_get_dec_fractions(ctx_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *OUT buf,
|
||||
bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16);
|
||||
|
||||
int bf16_emit_abs(ctx_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16, fmt_t fmt);
|
||||
|
||||
int _bf16_lut_exp_mantissa(ctx_t *ctx, bmk1880v2_tensor_lmem_t *IN tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *IN tl_buf,
|
||||
bmk1880v2_tensor_lmem_t *tbl_answer,
|
||||
bmk1880v2_tensor_lmem_t *tbl_answer_mantissa,
|
||||
bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16,
|
||||
uint8_t is_dirty_ifmap);
|
||||
|
||||
int _bf16_atan_fast_emit(ctx_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf2,
|
||||
bmk1880v2_tensor_lmem_t *tl_y0_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_invert_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_pos_neg_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_table_answer,
|
||||
bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa,
|
||||
bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16, fmt_t fmt,
|
||||
float b, uint8_t is_dirty_ifmap);
|
||||
|
||||
int bf16_emit_x_over_y(ctx_t *ctx, bmk1880v2_tensor_lmem_t *IN x,
|
||||
bmk1880v2_tensor_lmem_t *IN y,
|
||||
bmk1880v2_tensor_lmem_t *IN tl_buf,
|
||||
bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16,
|
||||
bmk1880v2_tensor_lmem_t *tl_table_answer,
|
||||
bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa,
|
||||
fmt_t fmt, uint8_t is_dirty_ifmap);
|
||||
|
||||
int _bf16_emit_mask(ctx_t *ctx, bmk1880v2_tensor_lmem_t *IN tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf2,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf3,
|
||||
bmk1880v2_tensor_lmem_t *tl_pos_neg_table,
|
||||
bmk1880v2_tensor_lmem_t *tl_0_idx_table,
|
||||
bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16, fmt_t fmt,
|
||||
enum BF16_MASK_TYPE mask, uint8_t is_dirty_ifmap);
|
||||
|
||||
void _bf16_get_tbl_idx(ctx_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16,
|
||||
fmt_t src_fmt, int int8_rnd_mode);
|
||||
int __bf16_atan_fast_emit(ctx_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_buf2,
|
||||
bmk1880v2_tensor_lmem_t *tl_y0_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_invert_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_pos_neg_buf,
|
||||
bmk1880v2_tensor_lmem_t *tl_table_answer,
|
||||
bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa,
|
||||
bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16,
|
||||
fmt_t fmt);
|
||||
|
||||
#endif /* GEN_LUT_1880v2_H */
|
||||
929
cvikernel/src/bm1880v2/non_atomic/hists_svm_kernel.c
Normal file
929
cvikernel/src/bm1880v2/non_atomic/hists_svm_kernel.c
Normal file
@ -0,0 +1,929 @@
|
||||
#include "../kernel_1880v2.h"
|
||||
#define LLVM_DEBUG(...)
|
||||
#define SPLIT_FAILED 0xFFFF
|
||||
|
||||
// only fill base_reg_index/int8_rnd_mode
|
||||
static void init_tgmem(bmk1880v2_tensor_tgmem_t* t) {
  /* Only base_reg_index/int8_rnd_mode are defaulted; the caller fills in
   * fmt, shape, stride and start_address. */
  t->base_reg_index = 0;
  t->int8_rnd_mode = 0;
}
|
||||
|
||||
static void copy_tg_tl_tensor_shape(bmk1880v2_tensor_lmem_shape_t* dst,
|
||||
const bmk1880v2_tensor_tgmem_shape_t* src) {
|
||||
dst->n = src->n;
|
||||
dst->c = src->c;
|
||||
dst->h = src->h;
|
||||
dst->w = src->w;
|
||||
}
|
||||
|
||||
static void copy_tl_tg_tensor_shape(bmk1880v2_tensor_tgmem_shape_t* dst,
|
||||
const bmk1880v2_tensor_lmem_shape_t* src) {
|
||||
dst->n = src->n;
|
||||
dst->c = src->c;
|
||||
dst->h = src->h;
|
||||
dst->w = src->w;
|
||||
}
|
||||
|
||||
/*
 * Standard convolution output-extent formula for one axis:
 * out = (in_ext - kernel_ext) / stride + 1, where both extents are already
 * padded/dilated.
 *
 * Fix: the original wrote `in - k / stride + 1`, which by C precedence
 * divides only the kernel extent by the stride. All current callers pass
 * stride == 1 (where both forms agree), but the corrected grouping is the
 * standard formula and is required for any stride > 1.
 */
static int conv_out(int conv_in_ext, int conv_kernel_ext, int stride) {
  return (conv_in_ext - conv_kernel_ext) / stride + 1;
}
|
||||
|
||||
/*
 * Compute the output shape of correlating the `svm_shape` kernels over
 * `image_shape`. All insertion/padding parameters are fixed to 0 and
 * stride/dilation to 1, so each spatial extent is simply in - k + 1.
 * The output channel count is the number of kernels (svm_shape->n).
 */
static void conv_output(int on, bmk1880v2_tensor_lmem_shape_t* out_shape,
                        const bmk1880v2_tensor_tgmem_shape_t* image_shape,
                        const bmk1880v2_tensor_tgmem_shape_t* svm_shape) {
  /* Height axis: extended input / kernel extents (all knobs disabled). */
  int ins_h = 0, ins_h_last = 0, pad_top = 0, pad_bot = 0, dh = 1;
  int conv_ih_ext = (image_shape->h - 1) * (ins_h + 1) + ins_h_last + 1 + pad_top + pad_bot;
  int conv_kh_ext = (svm_shape->h - 1) * dh + 1;
  int stride_h = 1;

  /* Width axis: same, with unit stride. */
  int ins_w = 0, ins_w_last = 0, pad_left = 0, pad_right = 0, dw = 1;
  int conv_kw_ext = (svm_shape->w - 1) * dw + 1;
  int conv_iw_ext = (image_shape->w - 1) * (ins_w + 1) + ins_w_last + 1 + pad_left + pad_right;
  int stride_w = 1;

  int oh = conv_out(conv_ih_ext, conv_kh_ext, stride_h);
  int ow = conv_out(conv_iw_ext, conv_kw_ext, stride_w);

  out_shape->n = on;
  out_shape->c = svm_shape->n;  /* one output channel per kernel */
  out_shape->h = oh;
  out_shape->w = ow;
}
|
||||
|
||||
/* Tiling plan for one convolution: how many slices along each axis and the
 * per-slice step sizes chosen by _split(). */
typedef struct {
  int n;        /* slice count along batch */
  int oc;       /* slice count along output channels */
  int ic;       /* slice count along input channels */
  int h;        /* slice count along output height */
  int w;        /* slice count along output width */
  int ic_step;  /* input channels per slice */
  int oc_step;  /* output channels per slice */
  int oh_step;  /* output rows per slice */
  int ow_step;  /* output columns per slice */
  int ih_step;  /* input rows needed per slice */
  int iw_step;  /* input columns needed per slice */
} SLICES;

/* Single shared plan: filled by _split(), consumed by ConvReuseWeight(). */
SLICES slices;

/* Fixed split-policy knobs for this kernel. */
static int is_split_ic() { return 0; }
static int is_split_oc() { return 1; }
static int is_reuse_weight() { return 1; }
|
||||
|
||||
static bmk1880v2_tensor_lmem_shape_t _shape_t4(int n, int c, int h, int w) {
|
||||
bmk1880v2_tensor_lmem_shape_t s;
|
||||
s.n = n;
|
||||
s.c = c;
|
||||
s.h = h;
|
||||
s.w = w;
|
||||
return s;
|
||||
}
|
||||
|
||||
static bmk1880v2_tensor_tgmem_shape_t _tg_shape_t4(int n, int c, int h, int w) {
|
||||
bmk1880v2_tensor_tgmem_shape_t s;
|
||||
s.n = n;
|
||||
s.c = c;
|
||||
s.h = h;
|
||||
s.w = w;
|
||||
return s;
|
||||
}
|
||||
|
||||
/* log2 of the lane count; requires `ctx` in scope at the use site. */
#define NPU_SHIFT (get_num_shift(ctx->chip_info.npu_num))

/*
 * Search for a tiling of the convolution that fits local memory.
 *
 * Iterates over candidate slice counts along width, height, output
 * channels and batch (in that nesting order) and returns the local-memory
 * footprint of the first combination whose double-buffered
 * ifmap + ofmap + coefficients fit in BM1880V2_HW_LMEM_SIZE.
 * On success the chosen plan is stored in the global `slices`;
 * on failure SPLIT_FAILED is returned.
 */
static int _split(bmk1880v2_context_t* ctx, int input_n, int input_c, int input_h, int input_w,
                  int groups, int output_c, uint16_t kh, uint16_t kw, uint16_t dilation_h, uint16_t dilation_w,
                  uint8_t pad_top, uint8_t pad_bottom, uint8_t pad_left, uint8_t pad_right, uint8_t stride_h, uint8_t stride_w) {
  int do_bias = 0;
  int duplicate_weights = 2; // force duplicate weight to speed up

  /* Per-group channel counts and dilated-kernel output geometry. */
  int ic = input_c / groups;
  int oc = output_c / groups;
  int kh_extent = dilation_h * (kh - 1) + 1;
  int kw_extent = dilation_w * (kw - 1) + 1;
  int oh = (input_h + pad_top + pad_bottom - kh_extent) / stride_h + 1;
  int ow = (input_w + pad_left + pad_right - kw_extent) / stride_w + 1;
  int ih = input_h;
  int iw = input_w;
  int n = input_n;

  // Depthwise
  uint8_t isDepthWise = (input_c == groups && output_c == groups && 1 != groups) ? true : false;
  if (isDepthWise) {
    ic = input_c;
    oc = output_c;
  }

  LLVM_DEBUG(llvm::errs() << llvm::format(
      "BM1880v2ConvBF16::split =>\n"
      "  groups %d, ifmap (%d, %d, %d, %d), ofmap(%d, %d, %d, %d)\n"
      "  kernel (%d, %d), pad (top=%d, bot=%d, left=%d, right=%d)\n"
      "  stride (%d, %d), dilation (%d, %d)\n",
      groups, input_n, input_c, input_h, input_w, input_n, oc, oh, ow, kh, kw, pad_top,
      pad_bottom, pad_left, pad_right, stride_h, stride_w, dilation_h, dilation_w));

  /* Initial plan; h/w start from the hardware 12-bit field limit. */
  slices.n = 1;
  slices.oc = oc / ctx->chip_info.npu_num;  // lane parallelism
  // slices.ic = isDepthWise ? ic : 1;
  slices.ic = 1;
  slices.h = (ih + (4095 - 32 - 1)) / (4095 - 32);  // 12bit, max 4095-32(lanes)
  slices.w = (iw + (4095 - 32 - 1)) / (4095 - 32);  // 12bit, max 4095-32(lanes)

  // int oc_step = (oc >= (int)ctx->chip_info.npu_num) ? (int)ctx->chip_info.npu_num : oc; // use
  // all lanes int ic_step = isDepthWise ? 1 : ic;
  int ic_step = ic;
  int num_oc_step = 1;

  //
  // Slices may not be a good way to find size
  // We may try to increase or decrease width in aligned with 4, 8, 16 ...
  // or specific height/width (8, 8), (16, 16) ...
  //
  // Split ow
  if (is_split_ic()) {
    LLVM_DEBUG(llvm::errs() << "<= slice ic(" << ic << ")\n";);
    ASSERT(0);  /* input-channel splitting is not implemented */
    // return split_ic(ctx);
  }

  if (is_split_oc()) {
    LLVM_DEBUG(llvm::errs() << "<= slice oc\n";);
    num_oc_step = (oc + ctx->chip_info.npu_num - 1) / ctx->chip_info.npu_num;
  }

  // TODO: suppot slice kernel
  // 'iw / slices.w >= kw_extent' means we CANT slice kernel
  for (slices.w = 1; slices.w <= ow && iw / slices.w >= kw_extent; ++slices.w) {
    int ow_step = ceiling_func(ow, slices.w);
    int iw_step = math_min((ow_step - 1) * stride_w + kw_extent, iw);

    if ((slices.w == 1) && (stride_w > 1)) {
      // For better DMA transfer efficiency, use whole width.
      // E.g.
      //  ifmap (1, 512, 28, 28), kernel (1, 1), stride 2
      //
      //  input (27, 27) needed, but (27, 28) is better
      iw_step = math_min(iw_step + stride_w - 1, iw);
      slices.iw_step = iw_step;
    }

    // Split oh
    // TODO: support slice kernel
    for (slices.h = 1; slices.h <= oh && ih / slices.h >= kh_extent; ++slices.h) {
      // Split oc
      // TODO: config not split it
      for (int slice_oc = 0; slice_oc < num_oc_step; ++slice_oc) {
        // Downward, align lanes
        // E.g. oc = 48, oc_step: 48, 32
        int oc_step = math_min((num_oc_step - slice_oc) * (int)ctx->chip_info.npu_num, oc);
        if (num_oc_step == 1) {
          // FIXME: not check every loop
          oc_step = oc;
          slices.oc = 1;
        }

        uint32_t coeff_oc_step_size = 0;

        if (do_bias) {
          // 2x 16bit
          coeff_oc_step_size += bmk1880v2_lmem_tensor_to_size(ctx, _shape_t4(2, oc_step, 1, 1),
                                                              FMT_BF16, /*eu_align=*/0);
        }

        // TODO: handle prelu

        // Add weight size
        coeff_oc_step_size += bmk1880v2_lmem_tensor_to_size(
            ctx, _shape_t4(ic_step, oc_step, kh, kw), FMT_BF16, /*eu_align=*/0);

        // split n
        for (slices.n = 1; slices.n <= n; ++slices.n) {
          int n_step = ceiling_func(n, slices.n);

          int oh_step = ceiling_func(oh, slices.h);
          int ih_step = math_min((oh_step - 1) * stride_h + kh_extent, ih);

          uint32_t total_needed = 0;

          uint32_t ofmap_size = bmk1880v2_lmem_tensor_to_size(
              ctx, _shape_t4(n_step, oc_step, oh_step, ow_step), FMT_BF16, /*eu_align=*/1);

          total_needed += ofmap_size;

          uint32_t ifmap_size = bmk1880v2_lmem_tensor_to_size(
              ctx, _shape_t4(n_step, ic_step, ih_step, iw_step), FMT_BF16, /*eu_align=*/1);
          total_needed += ifmap_size;

          total_needed += coeff_oc_step_size;

          // Double buffers so that TDMA load and store can run during TIU executes.
          total_needed *= duplicate_weights;

          // TODO: handle prelu, leaky relu
          // Both prelu and leaky relu need tl_neg, tl_relu.
          // tl_relu, tl_neg are not from tmda and not final output.
          // One copy is enough.
          // if (do_activation && ((activation_method == PRELU) ||
          //     (activation_method == RELU && activation_arg && activation_arg[0]
          //     != 0.0f))) {
          //   total_needed += 2 * ofmap_size; // tl_relu + tl_neg
          // }

          /* First combination that fits wins: record the plan and return. */
          if (total_needed < BM1880V2_HW_LMEM_SIZE) {
            slices.ic_step = ic_step;
            slices.oc_step = oc_step;
            slices.oh_step = oh_step;
            slices.ow_step = ow_step;
            slices.ih_step = ih_step;
            slices.iw_step = iw_step;

            LLVM_DEBUG(
                llvm::errs() << llvm::format(
                    "  Slices(n=%d, oc=%d, ic=%d, h=%d, w=%d), n_step %d, oh_step %d, ih_step %d"
                    ", coeff_oc_step_size %d, total_needed %d\n",
                    slices.n, slices.oc, slices.ic, slices.h, slices.w, n_step, oh_step, ih_step,
                    coeff_oc_step_size, total_needed););
            LLVM_DEBUG(llvm::errs() << "<= BM1880v2ConvFixedParallelv2_qdm::split succeed"
                                    << "/n");
            return total_needed;
          }

        }  // for (slices.n = 1; slices.n < n; ++slices.n)

      }  // for (int slice_oc = 0; slice_oc < num_oc_step; ++slice_oc)

    }  // for (slices.h = 1; slices.h <= oh; ++slices.h)

  }  // for (slices.w = 1; slices.w <= ow; ++slices.w)

  LLVM_DEBUG(llvm::errs() << "<= BM1880v2ConvBF16::split fail"
                          << "\n");

  return SPLIT_FAILED;
}
||||
|
||||
/*
 * TDMA-load a tensor from global memory (ga_src, with explicit strides
 * ts_stride) into local memory `tlp`. The global-memory shape is taken
 * from the local tensor's shape. CTRL_TP in `ctrl` selects the
 * NC-transposed copy variant.
 */
void tdma_load_stride_bf16(bmk1880v2_context_t* ctx, bmk1880v2_tensor_lmem_t* tlp, uint64_t ga_src,
                           bmk1880v2_tensor_tgmem_stride_t ts_stride, ctrl_t ctrl) {
  ASSERT(tlp != NULL);

  uint8_t DoTranspose = (ctrl & CTRL_TP) ? true : false;

  // tensor in system memory
  // Global shape use local shape
  bmk1880v2_tensor_tgmem_t ts_data;
  ts_data.base_reg_index = 0;
  ts_data.fmt = tlp->fmt;
  ts_data.start_address = ga_src;
  ts_data.shape = _tg_shape_t4(tlp->shape.n, tlp->shape.c, tlp->shape.h, tlp->shape.w);
  ts_data.stride = ts_stride;

  if (DoTranspose) {
    bmk1880v2_tdma_tg2l_tensor_copy_nc_transposed_param_t p1;
    memset(&p1, 0, sizeof(p1));
    p1.src = &ts_data;
    p1.dst = tlp;
    bmk1880v2_tdma_g2l_bf16_tensor_copy_nc_transposed(ctx, &p1);
  } else {
    bmk1880v2_tdma_tg2l_tensor_copy_param_t p1;
    memset(&p1, 0, sizeof(p1));
    p1.src = &ts_data;
    p1.dst = tlp;
    bmk1880v2_tdma_g2l_bf16_tensor_copy(ctx, &p1);
  }
}
|
||||
|
||||
/*
 * TDMA-store a tensor from local memory `tlp` to global memory (ga_dst,
 * with explicit strides ts_stride). The global-memory shape is taken from
 * the local tensor's shape. CTRL_TP in `ctrl` selects the NC-transposed
 * copy variant. Mirror of tdma_load_stride_bf16().
 */
void tdma_store_stride_bf16(bmk1880v2_context_t* ctx, bmk1880v2_tensor_lmem_t* tlp, uint64_t ga_dst,
                            bmk1880v2_tensor_tgmem_stride_t ts_stride, ctrl_t ctrl) {
  ASSERT(tlp != NULL);

  uint8_t DoTranspose = (ctrl & CTRL_TP) ? true : false;

  // tensor in system memory
  // Global shape use local shape
  // Global shape used for stride calculation
  bmk1880v2_tensor_tgmem_t ts_data;
  ts_data.base_reg_index = 0;
  ts_data.fmt = tlp->fmt;
  ts_data.start_address = ga_dst;
  ts_data.shape = _tg_shape_t4(tlp->shape.n, tlp->shape.c, tlp->shape.h, tlp->shape.w);
  ts_data.stride = ts_stride;

  if (DoTranspose) {
    bmk1880v2_tdma_l2tg_tensor_copy_nc_transposed_param_t p1;
    memset(&p1, 0, sizeof(p1));
    p1.src = tlp;
    p1.dst = &ts_data;
    bmk1880v2_tdma_l2g_bf16_tensor_copy_nc_transposed(ctx, &p1);
  } else {
    bmk1880v2_tdma_l2tg_tensor_copy_param_t p1;
    memset(&p1, 0, sizeof(p1));
    p1.src = tlp;
    p1.dst = &ts_data;
    bmk1880v2_tdma_l2g_bf16_tensor_copy(ctx, &p1);
  }
}
|
||||
|
||||
static void ConvReuseWeight(bmk1880v2_context_t* ctx, gaddr_t ga_ifmap, gaddr_t ga_ofmap,
|
||||
gaddr_t ga_weight, int input_n, int input_c, int input_h, int input_w,
|
||||
int groups, int output_c, uint16_t kh, uint16_t kw, uint16_t dilation_h,
|
||||
uint16_t dilation_w, uint8_t pad_top, uint8_t pad_bottom, uint8_t pad_left, uint8_t pad_right,
|
||||
uint8_t stride_h, uint8_t stride_w) {
|
||||
#define RELU (0)
|
||||
int do_scale = 0;
|
||||
int do_bn = 0;
|
||||
int do_activation = 0;
|
||||
int activation_method = 0;
|
||||
int* activation_arg = NULL;
|
||||
int do_bias = 0;
|
||||
int ga_bias = -1; // not support
|
||||
int layer_id = 2; // debug
|
||||
|
||||
int ic = input_c / groups;
|
||||
int oc = output_c / groups;
|
||||
int kh_ext = dilation_h * (kh - 1) + 1;
|
||||
int kw_ext = dilation_w * (kw - 1) + 1;
|
||||
int oh = (input_h + pad_top + pad_bottom - kh_ext) / stride_h + 1;
|
||||
int ow = (input_w + pad_left + pad_right - kw_ext) / stride_w + 1;
|
||||
|
||||
int n_step = ceiling_func(input_n, slices.n);
|
||||
// int ic_step = ceiling_func(ic, slices.ic);
|
||||
// ic_step = slices.ic_step;
|
||||
int oh_step = slices.oh_step;
|
||||
int ow_step = slices.ow_step;
|
||||
int ih_step = slices.ih_step;
|
||||
int iw_step = slices.iw_step;
|
||||
int oc_step = slices.oc_step;
|
||||
|
||||
// Always use all lanes.
|
||||
// Not divided by slices.oc.
|
||||
// E.g. mtcnn_det2_cic oc = 48, slices.oc = 2
|
||||
// It is better to store step.
|
||||
if (slices.oc > 1) {
|
||||
ASSERT(oc > (int)ctx->chip_info.npu_num);
|
||||
oc_step = ctx->chip_info.npu_num;
|
||||
}
|
||||
|
||||
if (slices.h > 1) {
|
||||
// max input height inside feature map
|
||||
ih_step = (oh_step - 1) * stride_h + kh_ext;
|
||||
}
|
||||
if (slices.w > 1) {
|
||||
// max input width inside feature map
|
||||
iw_step = (ow_step - 1) * stride_w + kw_ext;
|
||||
}
|
||||
|
||||
LLVM_DEBUG(llvm::errs() << llvm::format(
|
||||
"ConvReuseWeight =>\n"
|
||||
" groups %d, ifmap (%d, %d, %d, %d), ofmap(%d, %d, %d, %d)\n"
|
||||
" kernel (%d, %d), pad (top=%d, bot=%d, left=%d, right=%d)\n"
|
||||
" stride (%d, %d), dilation (%d, %d)\n"
|
||||
" Slices (n=%d, oc=%d, ic=%d, h=%d, w=%d)\n",
|
||||
groups, input_n, input_c, input_h, input_w, input_n, oc, oh, ow, kh, kw, pad_top,
|
||||
pad_bottom, pad_left, pad_right, stride_h, stride_w, dilation_h, dilation_w,
|
||||
slices.n, slices.oc, slices.ic, slices.h, slices.w));
|
||||
|
||||
uint8_t fused_conv_relu = (!do_scale && !do_bn &&
|
||||
(do_activation && activation_method == RELU &&
|
||||
(!activation_arg || (activation_arg[0] == 0.0f))))
|
||||
? true
|
||||
: false;
|
||||
|
||||
// uint8_t fused_conv_bn_relu =
|
||||
// (!do_scale && do_bn &&
|
||||
// (do_activation && activation_method == RELU && (!activation_arg || (activation_arg[0] ==
|
||||
// 0.0f))))
|
||||
// ? true
|
||||
// : false;
|
||||
|
||||
// bmk1880v2_tensor_lmem_shape_t oc_shape_ = _shape_t4(1, oc_step, 1, 1);
|
||||
// bmk1880v2_tensor_lmem_shape_t ifmap_shape_ = _shape_t4(n_step, ic_step, ih_step, input_w);
|
||||
// bmk1880v2_tensor_lmem_shape_t ofmap_shape_ = _shape_t4(n_step, oc_step, oh_step, ow);
|
||||
|
||||
bmk1880v2_tensor_lmem_t *tl_weight[2] = {NULL, NULL}, *tl_bias[2] = {NULL, NULL};
|
||||
bmk1880v2_tensor_lmem_t* tl_ifmap[2] = {NULL};
|
||||
bmk1880v2_tensor_lmem_t* tl_ofmap[2] = {NULL};
|
||||
|
||||
// Global memory stride from global memory shape
|
||||
// input_c, output_c, not ic, oc
|
||||
// bmk1880v2_tensor_tgmem_stride_t ofmap_gstride = {static_cast<uint32_t>(output_c) * oh * ow,
|
||||
// static_cast<uint32_t>(oh) * ow,
|
||||
// static_cast<uint32_t>(ow)};
|
||||
// bmk1880v2_tensor_tgmem_stride_t ifmap_gstride = {static_cast<uint32_t>(input_c) * input_h * input_w,
|
||||
// static_cast<uint32_t>(input_h) * input_w,
|
||||
// static_cast<uint32_t>(input_w)};
|
||||
// bmk1880v2_tensor_tgmem_stride_t bias_gstride = {static_cast<uint32_t>(output_c), 1, 1};
|
||||
// bmk1880v2_tensor_tgmem_stride_t weight_gstride = {
|
||||
// static_cast<uint32_t>(oc) * kh * kw * ic, static_cast<uint32_t>(kh) * kw * ic, static_cast<uint32_t>(ic)};
|
||||
bmk1880v2_tensor_tgmem_stride_t ofmap_gstride =
|
||||
bmk1880v2_tensor_tgmem_default_stride(_tg_shape_t4(1, output_c, oh, ow), FMT_BF16);
|
||||
bmk1880v2_tensor_tgmem_stride_t ifmap_gstride = bmk1880v2_tensor_tgmem_default_stride(
|
||||
_tg_shape_t4(1, input_c, input_h, input_w), FMT_BF16);
|
||||
bmk1880v2_tensor_tgmem_stride_t bias_gstride =
|
||||
bmk1880v2_tensor_tgmem_default_stride(_tg_shape_t4(1, output_c, 1, 1), FMT_BF16);
|
||||
bmk1880v2_tensor_tgmem_stride_t weight_gstride =
|
||||
bmk1880v2_tensor_tgmem_default_stride(_tg_shape_t4(1, oc, kh * kw, ic), FMT_BF16);
|
||||
|
||||
//
|
||||
// Pre-alloc maximum one-step size
|
||||
//
|
||||
// Need vector to track the order of local memory.
|
||||
// The local memory release must be in reverse order.
|
||||
//
|
||||
tl_weight[0] =
|
||||
bmk1880v2_lmem_alloc_tensor(ctx, _shape_t4(ic, oc_step, kh, kw), FMT_BF16, CTRL_NULL);
|
||||
if (is_reuse_weight()) {
|
||||
tl_weight[1] =
|
||||
bmk1880v2_lmem_alloc_tensor(ctx, _shape_t4(ic, oc_step, kh, kw), FMT_BF16, CTRL_NULL);
|
||||
} else {
|
||||
// tl_weight[1] = tl_weight[0];
|
||||
}
|
||||
|
||||
tl_ifmap[0] = bmk1880v2_lmem_alloc_tensor(ctx, _shape_t4(n_step, ic, ih_step, iw_step),
|
||||
FMT_BF16, CTRL_AL);
|
||||
|
||||
if (is_reuse_weight()) {
|
||||
tl_ifmap[1] = bmk1880v2_lmem_alloc_tensor(ctx, _shape_t4(n_step, ic, ih_step, iw_step),
|
||||
FMT_BF16, CTRL_AL);
|
||||
} else {
|
||||
// tl_ifmap[1] = tl_ifmap[0];
|
||||
}
|
||||
|
||||
tl_ofmap[0] = bmk1880v2_lmem_alloc_tensor(ctx, _shape_t4(n_step, oc_step, oh_step, ow_step),
|
||||
FMT_BF16, CTRL_AL);
|
||||
|
||||
if (is_reuse_weight()) {
|
||||
tl_ofmap[1] = bmk1880v2_lmem_alloc_tensor(
|
||||
ctx, _shape_t4(n_step, oc_step, oh_step, ow_step), FMT_BF16, CTRL_AL);
|
||||
} else {
|
||||
// tl_ofmap[1] = tl_ofmap[0];
|
||||
}
|
||||
|
||||
ASSERT(tl_weight[0] && tl_ifmap[0] && tl_ofmap[0]);
|
||||
|
||||
if (is_reuse_weight()) {
|
||||
ASSERT(tl_weight[1] && tl_ifmap[1] && tl_ofmap[1]);
|
||||
}
|
||||
|
||||
if (do_bias) {
|
||||
// 16 bit
|
||||
tl_bias[0] = bmk1880v2_lmem_alloc_tensor(ctx, _shape_t4(2, oc_step, 1, 1), FMT_BF16,
|
||||
/*eu_align=*/0);
|
||||
if (is_reuse_weight()) {
|
||||
tl_bias[1] = bmk1880v2_lmem_alloc_tensor(ctx, _shape_t4(2, oc_step, 1, 1), FMT_BF16,
|
||||
/*eu_align=*/0);
|
||||
} else {
|
||||
// tl_bias[1] = tl_bias[0];
|
||||
}
|
||||
ASSERT(tl_bias[0]);
|
||||
if (is_reuse_weight()) {
|
||||
ASSERT(tl_bias[1]);
|
||||
}
|
||||
}
|
||||
|
||||
// split groups
|
||||
for (int ig = 0; ig < groups; ++ig) {
|
||||
int first = 1;
|
||||
int flip = 0;
|
||||
int coeff_flip = 0;
|
||||
gaddr_t ga_ofmap_cur[2] = {0};
|
||||
|
||||
bmk1880v2_parallel_disable(ctx);
|
||||
|
||||
// split oc
|
||||
for (int oc_pos = 0; oc_pos < oc; oc_pos += oc_step) {
|
||||
int cur_oc = math_min(oc - oc_pos, oc_step);
|
||||
|
||||
uint64_t coeff_offset = (ig * oc + oc_pos) * sizeof(uint16_t);
|
||||
|
||||
if (do_bias) {
|
||||
// 2x 16 bit
|
||||
// bmk does not keep eu-align info, user need to update stride if shape changed
|
||||
tl_bias[coeff_flip]->shape = _shape_t4(2, cur_oc, 1, 1);
|
||||
tl_bias[coeff_flip]->stride = bmk1880v2_tensor_lmem_default_stride(
|
||||
ctx, tl_bias[coeff_flip]->shape, FMT_BF16, /*eu_align=*/0);
|
||||
|
||||
LLVM_DEBUG(llvm::errs() << llvm::format(
|
||||
" [ig=%d][oc_pos=%d] tdma_load_stride_bf16:\n"
|
||||
" tl_bias gaddr 0x%lx, laddr 0x%x, shape (%d, %d, "
|
||||
"%d, %d), stride (%d, %d, %d)\n",
|
||||
ig, oc_pos, ga_bias + coeff_offset, tl_bias[coeff_flip]->start_address,
|
||||
tl_bias[coeff_flip]->shape.n, tl_bias[coeff_flip]->shape.c,
|
||||
tl_bias[coeff_flip]->shape.h, tl_bias[coeff_flip]->shape.w, bias_gstride.n,
|
||||
bias_gstride.c, bias_gstride.h));
|
||||
tdma_load_stride_bf16(ctx, tl_bias[coeff_flip], ga_bias + coeff_offset, bias_gstride,
|
||||
CTRL_WEIGHT);
|
||||
}
|
||||
|
||||
// Weight shape for load != shape for tiu
|
||||
// bmk does not keep eu-align info, user need to update stride if shape changed
|
||||
tl_weight[coeff_flip]->shape = _shape_t4(ic, cur_oc, kh, kw);
|
||||
tl_weight[coeff_flip]->stride = bmk1880v2_tensor_lmem_default_stride(
|
||||
ctx, tl_weight[coeff_flip]->shape, FMT_BF16, /*eu_align*/ 0);
|
||||
|
||||
uint64_t weight_offset = (ig * oc * ic * kh * kw + oc_pos * ic * kh * kw) * sizeof(uint16_t);
|
||||
{
|
||||
// Same local address, different shape, stride
|
||||
bmk1880v2_tensor_lmem_t tl_tmp;
|
||||
tl_tmp.start_address = tl_weight[coeff_flip]->start_address;
|
||||
tl_tmp.fmt = FMT_BF16;
|
||||
tl_tmp.shape = _shape_t4(1, cur_oc, kh * kw, ic);
|
||||
tl_tmp.stride =
|
||||
bmk1880v2_tensor_lmem_default_stride(ctx, tl_tmp.shape, FMT_BF16, /*eu_align=*/0);
|
||||
|
||||
LLVM_DEBUG(llvm::errs() << llvm::format(
|
||||
" [ig=%d][oc_pos=%d] tdma_load_stride_bf16:\n"
|
||||
" tl_weight gaddr 0x%lx, laddr 0x%x, shape (%d, %d, "
|
||||
"%d, %d), stride (%d, %d, %d)\n",
|
||||
ig, oc_pos, weight_offset, tl_tmp.start_address, tl_tmp.shape.n,
|
||||
tl_tmp.shape.c, tl_tmp.shape.h, tl_tmp.shape.w, tl_tmp.stride.n,
|
||||
tl_tmp.stride.c, tl_tmp.stride.h, tl_tmp.stride.w));
|
||||
tdma_load_stride_bf16(ctx, &tl_tmp, ga_weight + weight_offset, weight_gstride, CTRL_WEIGHT);
|
||||
}
|
||||
|
||||
// bmk1880v2_tensor_lmem_shape_t ifmap_shape[2] = {0};
|
||||
// bmk1880v2_tensor_lmem_shape_t ofmap_shape[2] = {0};
|
||||
// gaddr_t ga_ifmap_cur[2] = {0};
|
||||
|
||||
// split n
|
||||
for (int n_pos = 0; n_pos < input_n; n_pos += n_step) {
|
||||
int cur_n = math_min(input_n - n_pos, n_step);
|
||||
|
||||
// split h
|
||||
for (int oh_pos = 0; oh_pos < oh; oh_pos += oh_step) {
|
||||
int cur_oh = math_min(oh - oh_pos, oh_step);
|
||||
|
||||
int oh_top = oh_pos;
|
||||
int oh_bot = oh_top + cur_oh;
|
||||
int ih_top = math_max(oh_top * stride_h - pad_top, 0);
|
||||
int ih_bot = math_min((oh_bot - 1) * stride_h + kh_ext - pad_top, input_h);
|
||||
int cur_ih = ih_bot - ih_top;
|
||||
|
||||
int ph_top = 0;
|
||||
if (ih_top == 0) {
|
||||
ph_top = pad_top - oh_top * stride_h;
|
||||
}
|
||||
|
||||
int ph_bot = 0;
|
||||
if (ih_bot == input_h) {
|
||||
ph_bot = (oh_bot - 1) * stride_h + kh_ext - pad_top - input_h;
|
||||
}
|
||||
|
||||
// split w
|
||||
for (int ow_pos = 0; ow_pos < ow; ow_pos += ow_step) {
|
||||
int cur_ow = math_min(ow - ow_pos, ow_step);
|
||||
|
||||
int ow_left = ow_pos;
|
||||
int ow_right = ow_left + cur_ow;
|
||||
int iw_left = math_max(ow_left * stride_w - pad_left, 0);
|
||||
int iw_right = math_min((ow_right - 1) * stride_w + kw_ext - pad_left, input_w);
|
||||
int cur_iw = iw_right - iw_left;
|
||||
|
||||
int pw_left = 0;
|
||||
if (iw_left == 0) {
|
||||
pw_left = pad_left - ow_left * stride_w;
|
||||
}
|
||||
|
||||
int pw_right = 0;
|
||||
if (iw_right == input_w) {
|
||||
pw_right = (ow_right - 1) * stride_w + kw_ext - pad_left - input_w;
|
||||
}
|
||||
|
||||
LLVM_DEBUG(llvm::errs()
|
||||
<< llvm::format(" [ig=%d][oc_pos=%d][n_pos=%d][oh_pos=%d][ow_pos=%d]"
|
||||
" cur_oh %d, cur_ih %d, ih_top %d, ih_bot %d"
|
||||
", cur_ow %d, cur_iw %d, iw_left %d, iw_right %d\n",
|
||||
ig, oc_pos, n_pos, oh_pos, ow_pos, cur_oh, cur_ih, ih_top,
|
||||
ih_bot, cur_ow, cur_iw, iw_left, iw_right));
|
||||
|
||||
// Adjust current shape and stride
|
||||
// bmk does not keep eu-align info, user need to update stride if shape changed
|
||||
tl_ofmap[flip]->shape = _shape_t4(cur_n, cur_oc, cur_oh, cur_ow);
|
||||
tl_ofmap[flip]->stride = bmk1880v2_tensor_lmem_default_stride(
|
||||
ctx, tl_ofmap[flip]->shape, FMT_BF16, /*eu_align=*/1);
|
||||
|
||||
// bmk does not keep eu-align info, user need to update stride if shape changed
|
||||
tl_ifmap[flip]->shape = _shape_t4(cur_n, ic, cur_ih, cur_iw);
|
||||
tl_ifmap[flip]->stride = bmk1880v2_tensor_lmem_default_stride(
|
||||
ctx, tl_ifmap[flip]->shape, FMT_BF16, /*eu_align=*/1);
|
||||
|
||||
uint64_t ifmap_offset = (ig * ic * input_h * input_w + n_pos * input_c * input_h * input_w +
|
||||
ih_top * input_w + iw_left) *
|
||||
sizeof(uint16_t);
|
||||
|
||||
LLVM_DEBUG(
|
||||
llvm::errs() << llvm::format(
|
||||
" [ig=%d][oc_pos=%d][n_pos=%d][oh_pos=%d][ow_pos=%d] tdma_load_stride_bf16:\n"
|
||||
" tl_ifmap gaddr 0x%lx, laddr 0x%x, shape (%d, %d, "
|
||||
"%d, %d), stride (%d, %d, %d)\n",
|
||||
ig, oc_pos, n_pos, oh_pos, ow_pos, ifmap_offset, tl_ifmap[flip]->start_address,
|
||||
tl_ifmap[flip]->shape.n, tl_ifmap[flip]->shape.c, tl_ifmap[flip]->shape.h,
|
||||
tl_ifmap[flip]->shape.w, tl_ifmap[flip]->stride.n, tl_ifmap[flip]->stride.c,
|
||||
tl_ifmap[flip]->stride.h, tl_ifmap[flip]->stride.w));
|
||||
|
||||
tdma_load_stride_bf16(ctx, tl_ifmap[flip], ga_ifmap + ifmap_offset, ifmap_gstride,
|
||||
CTRL_NEURON);
|
||||
|
||||
bmk1880v2_parallel_disable(ctx);
|
||||
bmk1880v2_parallel_enable(ctx);
|
||||
|
||||
{
|
||||
bmk1880v2_tiu_convolution_param_t param;
|
||||
memset(¶m, 0, sizeof(param));
|
||||
param.ofmap = tl_ofmap[flip];
|
||||
param.ifmap = tl_ifmap[flip];
|
||||
param.weight = tl_weight[coeff_flip];
|
||||
param.bias = tl_bias[coeff_flip];
|
||||
param.ins_h = param.ins_last_h = 0;
|
||||
param.ins_w = param.ins_last_w = 0;
|
||||
param.pad_top = ph_top;
|
||||
param.pad_bottom = ph_bot;
|
||||
param.pad_left = pw_left;
|
||||
param.pad_right = pw_right;
|
||||
param.stride_h = stride_h;
|
||||
param.stride_w = stride_w;
|
||||
param.dilation_h = dilation_h;
|
||||
param.dilation_w = dilation_w;
|
||||
param.relu_enable = fused_conv_relu;
|
||||
param.ps32_mode = 0;
|
||||
param.w_is_const = 0;
|
||||
param.layer_id = layer_id;
|
||||
|
||||
LLVM_DEBUG(llvm::errs() << llvm::format(
|
||||
" [ig=%d][oc_pos=%d][n_pos=%d][oh_pos=%d][ow_pos=%d] conv:\n"
|
||||
" ifmap la_addr 0x%x, shape (%d, %d, %d, %d)\n"
|
||||
" weight la_addr 0x%x, shape (%d, %d, %d, %d)\n"
|
||||
" ofmap la_addr 0x%x, shape (%d, %d, %d, %d)\n",
|
||||
ig, oc_pos, n_pos, oh_pos, ow_pos, param.ifmap->start_address,
|
||||
param.ifmap->shape.n, param.ifmap->shape.c, param.ifmap->shape.h,
|
||||
param.ifmap->shape.w, param.weight->start_address,
|
||||
param.weight->shape.n, param.weight->shape.c, param.weight->shape.h,
|
||||
param.weight->shape.w, param.ofmap->start_address,
|
||||
param.ofmap->shape.n, param.ofmap->shape.c, param.ofmap->shape.h,
|
||||
param.ofmap->shape.w));
|
||||
|
||||
bmk1880v2_tiu_convolution(ctx, ¶m);
|
||||
}
|
||||
|
||||
ga_ofmap_cur[flip] = ga_ofmap + (ig * oc * oh * ow + n_pos * output_c * oh * ow +
|
||||
oc_pos * oh * ow + oh_top * ow + ow_left) *
|
||||
sizeof(uint16_t);
|
||||
|
||||
if (!is_reuse_weight()) {
|
||||
flip = 1;
|
||||
first = 0;
|
||||
}
|
||||
|
||||
if (first) {
|
||||
// postpone first result to next loop
|
||||
// loop0: LD0 TIU0
|
||||
// loop1: LD1 TIU1 SD0
|
||||
// loop2: LD2 TIU2 SD1
|
||||
first = 0;
|
||||
} else {
|
||||
int flip_back = 1 - flip;
|
||||
|
||||
// Store back to global memory
|
||||
LLVM_DEBUG(llvm::errs() << llvm::format(
|
||||
" [ig=%d][oc_pos=%d][n_pos=%d][oh_pos=%d][ow_pos=%d] "
|
||||
"tdma_store_stride_bf16:\n"
|
||||
" tl_ofmap gaddr 0x%lx, laddr 0x%x, shape (%d, %d, "
|
||||
"%d, %d), stride (%d, %d, %d)\n",
|
||||
ig, oc_pos, n_pos, oh_pos, ow_pos, ga_ofmap_cur[flip_back],
|
||||
tl_ofmap[flip_back]->start_address, tl_ofmap[flip_back]->shape.n,
|
||||
tl_ofmap[flip_back]->shape.c, tl_ofmap[flip_back]->shape.h,
|
||||
tl_ofmap[flip_back]->shape.w, tl_ofmap[flip_back]->stride.n,
|
||||
tl_ofmap[flip_back]->stride.c, tl_ofmap[flip_back]->stride.h,
|
||||
tl_ofmap[flip_back]->stride.w));
|
||||
|
||||
tdma_store_stride_bf16(ctx, tl_ofmap[flip_back], ga_ofmap_cur[flip_back],
|
||||
ofmap_gstride, CTRL_NEURON);
|
||||
}
|
||||
|
||||
flip = 1 - flip;
|
||||
|
||||
} // for (int ow_pos = 0; ow_pos < ow; ow_pos += ow_step)
|
||||
|
||||
} // for (int oh_i = 0; oh_i < oh; oh_i += oh_step)
|
||||
|
||||
} // for (int n_i = 0; n_i < n; ni += n_step)
|
||||
|
||||
if (!is_reuse_weight()) {
|
||||
coeff_flip = 1;
|
||||
}
|
||||
|
||||
coeff_flip = 1 - coeff_flip;
|
||||
|
||||
} // for (int oc_i = 0; oc_i < oc; oc_i += oc_step
|
||||
|
||||
bmk1880v2_parallel_disable(ctx);
|
||||
|
||||
// the last iteration stored the other side, leave the last side not stored
|
||||
if (!is_reuse_weight()) {
|
||||
// TODO: no need to store last one cuz we store every loop
|
||||
flip = 1;
|
||||
} else {
|
||||
int flip_back = 1 - flip;
|
||||
|
||||
// Store back to global memory
|
||||
LLVM_DEBUG(llvm::errs() << llvm::format(
|
||||
" [ig=%d] tdma_store_stride_bf16:\n"
|
||||
" tl_ofmap gaddr 0x%lx, laddr 0x%x, shape (%d, %d, "
|
||||
"%d, %d), stride (%d, %d, %d)\n",
|
||||
ig, ga_ofmap_cur[flip_back], tl_ofmap[flip_back]->start_address,
|
||||
tl_ofmap[flip_back]->shape.n, tl_ofmap[flip_back]->shape.c,
|
||||
tl_ofmap[flip_back]->shape.h, tl_ofmap[flip_back]->shape.w,
|
||||
tl_ofmap[flip_back]->stride.n, tl_ofmap[flip_back]->stride.c,
|
||||
tl_ofmap[flip_back]->stride.h, tl_ofmap[flip_back]->stride.w));
|
||||
|
||||
tdma_store_stride_bf16(ctx, tl_ofmap[flip_back], ga_ofmap_cur[flip_back], ofmap_gstride,
|
||||
CTRL_NEURON);
|
||||
}
|
||||
|
||||
} // for (int group_i = 0; group_i < groups; ++groups)
|
||||
|
||||
//
|
||||
// Release resource in reverse order
|
||||
//
|
||||
if (do_bias) {
|
||||
if (is_reuse_weight()) bmk1880v2_lmem_free_tensor(ctx, tl_bias[1]);
|
||||
|
||||
bmk1880v2_lmem_free_tensor(ctx, tl_bias[0]);
|
||||
}
|
||||
if (is_reuse_weight()) bmk1880v2_lmem_free_tensor(ctx, tl_ofmap[1]);
|
||||
|
||||
bmk1880v2_lmem_free_tensor(ctx, tl_ofmap[0]);
|
||||
|
||||
if (is_reuse_weight()) bmk1880v2_lmem_free_tensor(ctx, tl_ifmap[1]);
|
||||
|
||||
bmk1880v2_lmem_free_tensor(ctx, tl_ifmap[0]);
|
||||
|
||||
if (is_reuse_weight()) bmk1880v2_lmem_free_tensor(ctx, tl_weight[1]);
|
||||
|
||||
bmk1880v2_lmem_free_tensor(ctx, tl_weight[0]);
|
||||
|
||||
LLVM_DEBUG(llvm::errs() << "<=ConvReuseWeight"
|
||||
<< "/n");
|
||||
}
|
||||
|
||||
// Histogram-of-gradients style SVM scoring on bf16 data.
// Pipeline: (1) nc-transpose-load the image so each "unit" becomes a batch
// entry, (2) store the transposed copy back to global memory at
// gaddr_nc_image, then (3..5) run it through a 1x-stride convolution against
// the re-ordered SVM weights, tiled via _split/ConvReuseWeight.
// NOTE(review): the else-branch (single-shot, untiled path) is currently dead
// code behind `if (1)`; kept presumably as a reference implementation.
// Returns 0 unconditionally (ret is never modified).
int bf16_hists_svm(bmk1880v2_context_t* ctx, uint64_t gaddr_image, uint64_t gaddr_nc_image,
                   bmk1880v2_tensor_tgmem_shape_t image_shape, uint64_t re_order_gaddr_svm,
                   bmk1880v2_tensor_tgmem_shape_t svm_shape, // (oc, ic, kh, kw)
                   uint64_t gaddr_output, int unit_size, fmt_t fmt) {
  int ret = 0;
  // image is treated as a 2-D (h, w) plane; n/c must be singleton
  ASSERT(image_shape.n == 1 && image_shape.c == 1 && "image_shape should 2 dims");
  // ASSERT(svm_shape.n == unit_size && "svm_shape channel MUST eq unit_size");
  ASSERT(fmt == FMT_BF16 && "only support FMT_BF16");
  // 1. nc load transpose, for split unit
  // 2. store back for load to channel
  // 3. load c by unit
  // 4. weight MUST re-order for step 2
  // 5. conv

  bmk1880v2_tensor_tgmem_t src;
  init_tgmem(&src);

  // 1. nc load transpose, for split unit
  // View the (h, w) image as (h*w, unit_size, 1, 1) so the n<->c transpose
  // groups one "unit" per local-memory channel.
  bmk1880v2_tdma_tg2l_tensor_copy_nc_transposed_param_t p1;
  bmk1880v2_tensor_tgmem_shape_t image_shape_expend_unit;
  image_shape_expend_unit.n = image_shape.h * image_shape.w;
  image_shape_expend_unit.c = unit_size;
  image_shape_expend_unit.h = 1;
  image_shape_expend_unit.w = 1;

  src.fmt = fmt;
  src.start_address = gaddr_image;
  src.shape = image_shape_expend_unit;
  src.stride = bmk1880v2_tensor_tgmem_default_stride(src.shape, src.fmt);

  // Local destination gets the transposed shape (n and c swapped).
  bmk1880v2_tensor_lmem_shape_t l_image_shape_expend_unit;
  l_image_shape_expend_unit.n = image_shape_expend_unit.c;
  l_image_shape_expend_unit.c = image_shape_expend_unit.n;
  l_image_shape_expend_unit.h = 1;
  l_image_shape_expend_unit.w = 1;

  bmk1880v2_tensor_lmem_t* dst =
      bmk1880v2_lmem_alloc_tensor(ctx, l_image_shape_expend_unit, fmt, CTRL_NULL);

  memset(&p1, 0, sizeof(p1));
  p1.src = &src;
  p1.dst = dst;
  bmk1880v2_tdma_g2l_bf16_tensor_copy_nc_transposed(ctx, &p1);

  // 2. store back for load to channel
  // The transposed tensor is flushed to gaddr_nc_image so the conv path can
  // re-load it with a channel-major layout.
  bmk1880v2_tdma_l2tg_tensor_copy_param_t p2;
  memset(&p2, 0, sizeof(p2));
  copy_tl_tg_tensor_shape(&image_shape_expend_unit, &l_image_shape_expend_unit);

  src.start_address = gaddr_nc_image;
  src.shape = image_shape_expend_unit;
  src.stride = bmk1880v2_tensor_tgmem_default_stride(src.shape, src.fmt);

  p2.src = dst;
  p2.dst = &src;
  bmk1880v2_tdma_l2g_bf16_tensor_copy(ctx, &p2);

  // local buffer no longer needed once the store is issued
  bmk1880v2_lmem_free_tensor(ctx, dst);

  // tiling conv, copy from backend
  if (1) {
    // Tiled path: 1x1-stride, unpadded convolution of the nc-transposed image
    // against the SVM weights; _split computes the slicing plan that
    // ConvReuseWeight consumes.
    int input_n = 1;
    int groups = 1;
    uint16_t dilation_h = 1, dilation_w = 1;
    uint8_t pad_top = 0;
    uint8_t pad_bottom = 0, pad_left = 0, pad_right = 0, stride_h = 1, stride_w = 1;

    _split(ctx, input_n, unit_size, image_shape.h, image_shape.w, groups, svm_shape.n, svm_shape.h,
           svm_shape.w, dilation_h, dilation_w, pad_top, pad_bottom, pad_left, pad_right, stride_h,
           stride_w);

    ConvReuseWeight(ctx, gaddr_nc_image, gaddr_output, re_order_gaddr_svm, input_n, unit_size,
                    image_shape.h, image_shape.w, groups, svm_shape.n, svm_shape.h, svm_shape.w,
                    dilation_h, dilation_w, pad_top, pad_bottom, pad_left, pad_right, stride_h,
                    stride_w);
  } else {
    // 3. load c by unit
    bmk1880v2_tdma_tg2l_tensor_copy_param_t p3;
    memset(&p3, 0, sizeof(p3));
    image_shape_expend_unit.n = 1;
    image_shape_expend_unit.c = unit_size;
    image_shape_expend_unit.h = image_shape.h;
    image_shape_expend_unit.w = image_shape.w;

    copy_tg_tl_tensor_shape(&l_image_shape_expend_unit, &image_shape_expend_unit);

    bmk1880v2_tensor_lmem_t* tl_ifmap =
        bmk1880v2_lmem_alloc_tensor(ctx, l_image_shape_expend_unit, fmt, CTRL_AL);

    // src still points at gaddr_nc_image from step 2
    p3.src = &src;
    p3.dst = tl_ifmap;
    bmk1880v2_tdma_g2l_bf16_tensor_copy(ctx, &p3);

    // 4. weight MUST re-order for step 2
    // bmk1880v2_tensor_lmem_t bmk1880v2_lmem_alloc_tensor(ctx, _shape_t4(ic, oc_step, kh, kw),
    // FMT_BF16, CTRL_NULL);
    // weight from origin layout (oc, ic, kh, kw) transform to (1, oc, kh*kw, ic)
    bmk1880v2_tensor_tgmem_shape_t transpose_svm_shape;
    transpose_svm_shape.n = 1;
    transpose_svm_shape.c = svm_shape.n;
    transpose_svm_shape.h = svm_shape.h * svm_shape.w;
    transpose_svm_shape.w = svm_shape.c;

    src.start_address = re_order_gaddr_svm;
    // NOTE(review): this assigns image_shape_expend_unit, while the local
    // destination uses transpose_svm_shape — looks like transpose_svm_shape
    // was intended here; confirm before enabling this branch.
    src.shape = image_shape_expend_unit;
    src.base_reg_index = 1;

    bmk1880v2_tensor_lmem_shape_t l_transpose_svm_shape;
    copy_tg_tl_tensor_shape(&l_transpose_svm_shape, &transpose_svm_shape);
    bmk1880v2_tensor_lmem_t* tl_weight =
        bmk1880v2_lmem_alloc_tensor(ctx, l_transpose_svm_shape, fmt, CTRL_NULL);

    p3.src = &src;
    p3.dst = tl_weight;
    bmk1880v2_tdma_g2l_bf16_tensor_copy(ctx, &p3);

    // 5. conv
    // alloc output
    bmk1880v2_tensor_lmem_shape_t l_out_shape;
    conv_output(1, &l_out_shape, &image_shape, &svm_shape);
    bmk1880v2_tensor_lmem_t* tl_ofmap =
        bmk1880v2_lmem_alloc_tensor(ctx, l_out_shape, fmt, CTRL_AL);

    // plain convolution: no padding, unit stride/dilation, no bias/relu
    bmk1880v2_tiu_convolution_param_t param;
    memset(&param, 0, sizeof(param));
    param.ofmap = tl_ofmap;
    param.ifmap = tl_ifmap;
    param.weight = tl_weight;
    param.bias = NULL;
    param.ins_h = param.ins_last_h = 0;
    param.ins_w = param.ins_last_w = 0;
    param.pad_top = 0;
    param.pad_bottom = 0;
    param.pad_left = 0;
    param.pad_right = 0;
    param.stride_h = 1;
    param.stride_w = 1;
    param.dilation_h = 1;
    param.dilation_w = 1;
    param.relu_enable = 0;
    param.ps32_mode = 0;
    param.w_is_const = 0;
    param.layer_id = 0;

    bmk1880v2_tiu_convolution(ctx, &param);

    // store the result back to gaddr_output (base register restored to 0)
    bmk1880v2_tensor_tgmem_shape_t out_shape;
    copy_tl_tg_tensor_shape(&out_shape, &l_out_shape);

    src.start_address = gaddr_output;
    src.shape = out_shape;
    src.base_reg_index = 0;

    p2.src = tl_ofmap;
    p2.dst = &src;
    bmk1880v2_tdma_l2g_bf16_tensor_copy(ctx, &p2);

    // release in reverse order of allocation
    bmk1880v2_lmem_free_tensor(ctx, tl_ofmap);
    bmk1880v2_lmem_free_tensor(ctx, tl_weight);
    bmk1880v2_lmem_free_tensor(ctx, tl_ifmap);
  }

  return ret;
}
|
||||
1105
cvikernel/src/bm1880v2/non_atomic/tiu_lut_atan.c
Normal file
1105
cvikernel/src/bm1880v2/non_atomic/tiu_lut_atan.c
Normal file
File diff suppressed because it is too large
Load Diff
1015
cvikernel/src/bm1880v2/non_atomic/tiu_lut_atan2.c
Normal file
1015
cvikernel/src/bm1880v2/non_atomic/tiu_lut_atan2.c
Normal file
File diff suppressed because it is too large
Load Diff
167
cvikernel/src/bm1880v2/non_atomic/tiu_reciprocal.c
Normal file
167
cvikernel/src/bm1880v2/non_atomic/tiu_reciprocal.c
Normal file
@ -0,0 +1,167 @@
|
||||
/**
|
||||
*/
|
||||
#include "gen_lut.h"
|
||||
#include <bmkernel/bm1880v2/1880v2_fp_convert.h>
|
||||
|
||||
//#define DBG
|
||||
|
||||
/*
|
||||
* NOTICE: it could occupy 2 lookup table size which shape is <1,32,32,8> with bf16 data type
|
||||
*
|
||||
* \tl_buf tmp buffer, the shape MUST be same with \tl_ifmap
|
||||
* \tl_ofmap_bf16 result as bf16, MUST given for tmp buffer used
|
||||
*/
|
||||
int bf16_emit_reciprocal(ctx_t *ctx,
|
||||
bmk1880v2_tensor_lmem_t* IN tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t* IN tl_buf,
|
||||
bmk1880v2_tensor_lmem_t *tbl_answer,
|
||||
bmk1880v2_tensor_lmem_t *tbl_answer_mantissa,
|
||||
bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16
|
||||
) {
|
||||
|
||||
return bf16_lut_exp_mantissa(ctx,
|
||||
tl_ifmap,
|
||||
tl_buf,
|
||||
tbl_answer,
|
||||
tbl_answer_mantissa,
|
||||
tl_ofmap_bf16
|
||||
);
|
||||
}
|
||||
|
||||
// <! gen reciprocal f(x) = 1/x
|
||||
static double _gen_reciprocal(int base, int p) {
|
||||
// y = x ^ -1
|
||||
double f = (double) (pow(base, -1 * p));
|
||||
|
||||
if (isnan(f)) {
|
||||
assert(0);
|
||||
}
|
||||
return f;
|
||||
}
|
||||
|
||||
|
||||
// Fill the exponent half of the bf16 reciprocal lookup table.
// Layout of channel 0 (table_hw entries): slot 0 handles x == 0 (patched to
// +inf bit pattern 0x7F80), then `half - 1` positive-exponent entries, one
// more patched slot, then `half - 1` negative-base entries; channel 0 is then
// replicated across all remaining channels.
// NOTE(review): `exp` is a float passed to _gen_reciprocal's int parameter,
// so it is truncated; odd exponents are first rounded down to even —
// presumably the mantissa table compensates for the dropped factor of 2.
void bf16_gen_reciprocal(uint16_t *table_data, bmk1880v2_tensor_lmem_shape_t* table_shape) {

  assert(is_1880v2_tbl_shape(table_shape));

  int exp_start = bf16_exp_start();
  int half = half_h_table();
  int table_hw = bf16_table_hw();
  uint64_t idx = 0;

  // prepare channel 0
  double s = 0.0;
  // 0^-1 is invalid, use positive/negtive max value: 0x7F7F / 0xFF7F
  //table_data[idx] = 0xff7f; //<! convert to 0xff7f, mulitply slope[0](0.5) is feff
  table_data[idx] = 0x7F80; //<! convert to 0x7F7F
#ifdef DBG
  printf("t [%lu] is %f bf %x\n", idx,
      convert_bf16_fp32(table_data[idx]),
      table_data[idx]);
#endif
  idx++;

  // > 0, exp from 0 -62 -61 .. 62 63
  for (int i = 0; i < half - 1; i++) {
    int shift = (exp_start + i);
    // for negative shift, shift % 2 is 0 or -1; either way nonzero means odd
    uint8_t is_odd = (shift % 2);
    float exp = shift;
    if (is_odd) {
      // round odd exponents down to even (see NOTE above)
      exp = exp - 1;
    }

    // shadows the outer `s` intentionally-looking; entry is 2^(-exp)
    double s = _gen_reciprocal(2, exp);
    table_data[idx] = convert_fp32_bf16(s);
#ifdef DBG
    printf("t [%lu] is %f [idx:%f][2^%f] bf %x\n", idx,
        convert_bf16_fp32(table_data[idx]),
        (float)(exp_start + i), -1 * exp,
        table_data[idx]);
#endif
    idx++;
  }

  // boundary slot: computed value is immediately overwritten with the
  // +inf bit pattern below — the _gen_reciprocal result is dead here
  s = _gen_reciprocal(2, -0);
  table_data[idx] = convert_fp32_bf16(s);
  table_data[idx] = 0x7F80; //<! convert to 0x7F7F
#ifdef DBG
  printf("t [%lu] is %f[%d] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), 0, table_data[idx]);
#endif
  idx++;

  // < 0, exp from 0 -62 -61 .. 62 63
  for (int i = 0; i < half - 1; i++) {
    int shift = (exp_start + i);
    uint8_t is_odd = (shift % 2);
    float exp = shift;
    if (is_odd) {
      exp = exp - 1;
    }

    // negative-base branch: -(-2)^(-exp)
    double s = -1 * _gen_reciprocal(-2, exp);
    table_data[idx] = convert_fp32_bf16(s);
#ifdef DBG
    printf("t [%lu] is %f(%e - %.8lf)[(-2)^%f] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), s, s, exp, table_data[idx]);
#endif
    idx++;
  }

  // idx = 255 dont care
  //s = _gen_reciprocal(2, 0);
  //table_data[idx] = convert_fp32_bf16(s);
  //printf("t [%lu] is %f[%d]\n", idx, convert_bf16_fp32(table_data[idx]), 0);
  //idx++;

  // duplicate channel #1 to #31
  //TODO: tensor copy
  for (uint32_t i = 1; i < table_shape->c; i++) {
    memcpy(&table_data[i * table_hw], &table_data[0], sizeof(uint16_t) * table_hw);
  }
}
|
||||
|
||||
// Fill the mantissa half of the bf16 reciprocal lookup table.
// Entries [0, half) hold 1/(2m) and entries [128, 128+half) hold 1/m for
// mantissa samples m = 1 + i/128 in [1, 2); channel 0 is then replicated
// across the remaining channels of the table.
void bf16_gen_reciprocal_mantissa(uint16_t* OUT table_mantissa,
                                  bmk1880v2_tensor_lmem_shape_t* table_shape) {
  assert(is_1880v2_tbl_shape(table_shape));

  uint32_t half = half_h_table();
  int table_hw = bf16_table_hw();

  for (uint32_t i = 0; i < half; i++) {
    // sampled mantissa in [1, 2): m = 1 + i/128
    double m = 1 + i * 1 / 128.0;
    table_mantissa[128 + (int)i] = convert_fp32_bf16((double) pow(m, -1));

    //13=2^3x1.625=(2^2)x(2^1x1.625)
    double m2 = 2 * (1 + i * 1 / 128.0);
    table_mantissa[(int)i] = convert_fp32_bf16((double) pow(m2, -1));
  }

#ifdef DBG
  for (uint32_t i = 0; i < 2 * half; i++) {
    printf("mantissa [%u] is %lf, 0x%x\n", i, convert_bf16_fp32(table_mantissa[i]),
           table_mantissa[i]);
  }
#endif /* ifdef DBG */

  // broadcast channel 0 into channels 1..c-1
  //TODO: tensor copy
  for (uint64_t i = 1; i < table_shape->c; i++) {
    memcpy(&table_mantissa[table_hw * i], &table_mantissa[0], sizeof(uint16_t) * table_hw);
  }
}
|
||||
|
||||
void bf16_reciprocal_tbl(uint16_t *table_data, uint16_t* table_mantissa,
|
||||
bmk1880v2_tensor_lmem_shape_t* table_shape) {
|
||||
|
||||
assert(table_data);
|
||||
assert(table_mantissa);
|
||||
assert(table_shape);
|
||||
|
||||
bf16_gen_reciprocal(table_data, table_shape);
|
||||
bf16_gen_reciprocal_mantissa(table_mantissa, table_shape);
|
||||
}
|
||||
438
cvikernel/src/bm1880v2/non_atomic/tiu_reshape_c.c
Normal file
438
cvikernel/src/bm1880v2/non_atomic/tiu_reshape_c.c
Normal file
@ -0,0 +1,438 @@
|
||||
/**
|
||||
* reshape channel under depthwise
|
||||
*/
|
||||
//
|
||||
|
||||
#include "gen_lut.h"
|
||||
#include <bmkernel/bm1880v2/1880v2_fp_convert.h>
|
||||
|
||||
//#define DBG
|
||||
// copy from \1880v2_test_util.h
|
||||
// copy from \1880v2_test_util.h
// Length of a dimension after dilation and padding: (h - 1) interior gaps
// each widened by ins_h inserted zeros, plus ins_h_l trailing insertions,
// the final element itself, and padding on both sides.
static int calc_dilute_hw(int h, int ins_h, int ins_h_l, int pad_h_b, int pad_h_t)
{
  int dilated = (h - 1) * (ins_h + 1) + ins_h_l + 1;
  return dilated + pad_h_t + pad_h_b;
}
|
||||
|
||||
// get padding as 'SAME' mode in tensorflow
|
||||
// https://www.jianshu.com/p/05c4f1621c7e
|
||||
// Total padding needed so that output height is ceil(ih / sh),
// as in TensorFlow's 'SAME' convolution mode.
static int get_same_pad(int ih, int sh, int kh) {
  int out_h = (ih + sh - 1) / sh;     // ceil(ih / sh)
  return (out_h - 1) * sh + kh - ih;  // extent the kernel sweep needs beyond ih
}
|
||||
|
||||
// get real 'h' with pad/ins
|
||||
// Effective input height after row insertion and top/bottom padding:
// (ih - 1) gaps widened by ins_h, ins_last_h trailing insertions,
// the last real row, plus both pads.
static int pooling_ih_ext(int ins_h, int ins_last_h, int pad_top, int pad_bottom, int ih)
{
  return (ih - 1) * (ins_h + 1) + ins_last_h + 1 + pad_top + pad_bottom;
}
|
||||
|
||||
// get real 'w' with pad/ins
|
||||
// Effective input width after column insertion and left/right padding:
// (iw - 1) gaps widened by ins_w, ins_last_w trailing insertions,
// the last real column, plus both pads.
static int pooling_iw_ext(int ins_w, int ins_last_w, int pad_left, int pad_right, int iw)
{
  return (iw - 1) * (ins_w + 1) + ins_last_w + 1 + pad_left + pad_right;
}
|
||||
|
||||
// get output h with parameter
|
||||
// Output height of a pooling/conv window: extend the input with
// insertion + padding, subtract the dilated kernel extent, then stride.
static int pooling_oh(
    int ins_h, int ins_last_h, int pad_top, int pad_bottom,
    int stride_h, int ih, int kh, int dh)
{
  int extended_h = pooling_ih_ext(ins_h, ins_last_h, pad_top, pad_bottom, ih);
  int dilated_kh = (kh - 1) * dh + 1;
  return (extended_h - dilated_kh) / stride_h + 1;
}
|
||||
|
||||
// get output w with parameter
|
||||
// Output width of a pooling/conv window: extend the input with
// insertion + padding, subtract the dilated kernel extent, then stride.
static int pooling_ow(
    int ins_w, int ins_last_w, int pad_left, int pad_right,
    int stride_w, int iw, int kw, int dw)
{
  int extended_w = pooling_iw_ext(ins_w, ins_last_w, pad_left, pad_right, iw);
  int dilated_kw = (kw - 1) * dw + 1;
  return (extended_w - dilated_kw) / stride_w + 1;
}
|
||||
|
||||
/**
|
||||
* \brief get extended bias
|
||||
* \return allocated new bias
|
||||
*/
|
||||
uint32_t* bm1880v2_reshape_channel_bias(uint8_t* bias,
|
||||
int ni, int ci, int hi, int wi,
|
||||
int old_bias_c, fmt_t fmt
|
||||
) {
|
||||
|
||||
assert(bias);
|
||||
assert((ni == 2 || ni == 1) && "not support bias batch > 1");
|
||||
assert(ci / old_bias_c > 0 && ci % old_bias_c == 0);
|
||||
int sz = fmt == FMT_BF16 ? 4 : 2;
|
||||
|
||||
int d_c_bias_sz = ni * ci * hi * wi;
|
||||
uint8_t *new_bias = (uint8_t *)malloc(d_c_bias_sz * sz);
|
||||
int bias_hw = hi * wi;
|
||||
int duplicat_c = ci / old_bias_c;
|
||||
|
||||
for (int c = 0; c < old_bias_c; c++) {
|
||||
int shift = (c * bias_hw) * sz;
|
||||
for (int i = 0; i < duplicat_c; i++) {
|
||||
int new_bias_shift = (c * duplicat_c + i) * bias_hw * sz;
|
||||
memcpy(&new_bias[new_bias_shift], &bias[shift], bias_hw * sz);
|
||||
}
|
||||
}
|
||||
return (uint32_t* )new_bias;
|
||||
}
|
||||
|
||||
/*
|
||||
* \brief prepare load shape/stride
|
||||
* \return -1 means fail to reshape, 0 means success
|
||||
* \TODO check memory usage
|
||||
*/
|
||||
/*
 * \brief prepare load shape/stride
 * Searches for a factor of ih that lets the (ic, ih) pair be refolded into
 * (oc, oh) with oc*oh == ic*ih, preferring the split that uses the most NPU
 * lanes (largest divisor i of npu_num/ic first); oh must be a multiple of
 * stride_h and at least stride_h.
 * \return -1 means fail to reshape, 0 means success
 * \TODO check memory usage
 * NOTE(review): `in` and `d_kh` are only used in the entry assert (d_kh's
 * kernel check is commented out); tg_stride->w is never written — confirm
 * the tg stride type carries only n/c/h.
 */
static inline int _get_dup_shape(
    bmk1880v2_context_t *bk_ctx,
    int in, int ic, int ih, int iw,
    int d_kh, int stride_h, int npu_num,
    bmk1880v2_tensor_lmem_shape_t* tl_shape, bmk1880v2_tensor_lmem_stride_t* tl_load_stride,
    bmk1880v2_tensor_tgmem_shape_t* tg_shape, bmk1880v2_tensor_tgmem_stride_t* tg_stride,
    fmt_t src_tg_fmt, fmt_t dst_tl_fmt
    ) {

  assert(in > 0 && ic > 0 && ih > 0 && iw > 0 && d_kh > 0 && stride_h > 0);
  assert(tl_shape && tl_load_stride && tg_shape && tg_stride);

  // 1. reshape and extend c, h axis in order
  int ch = ic * ih;   // invariant: oc * oh == ch after a successful split
  int oc;
  int oh;

  // FIXME: check kernel setting
  oh = 0;   // 0 doubles as the "no valid split found" sentinel

  // try the largest channel-duplication factor first
  for (int i = npu_num/ic; i > 0; i--) {
#if 0
    int hw = ih * iw;
    int _oh = hw / i / iw;
    if (hw % i == 0 && (hw / i) % stride_h == 0 && _oh >= stride_h) {
      oh = _oh;
      break;
    }
#else
    int _oh = ih / i;
    // candidate accepted only if i divides ih exactly and the resulting
    // height stays stride-aligned
    if (ih % i == 0 && (_oh) % stride_h == 0 && _oh >= stride_h /*&& _oh >= d_kh*/) {
      oh = _oh;
      break;
    }
#endif
  }


  if (!oh) {
    // FIXME: check terminal condition
    return -1;
  }

  oc = ch / oh;

#ifdef DBG
  printf ("ic:ih is %d %d, oc:oh is %d:%d\n", ic, ih, oc, oh);
#endif

  // tg/tl MUST be same shape size
  tl_shape->n = tg_shape->n = 1;
  tl_shape->c = tg_shape->c = oc;
  tl_shape->h = tg_shape->h = oh;
  tl_shape->w = tg_shape->w = iw;

  // init tl
  bmk1880v2_tensor_lmem_stride_t s =
      bmk1880v2_tensor_lmem_default_stride(bk_ctx, *tl_shape, dst_tl_fmt, CTRL_NULL);
  tl_load_stride->n = s.n;
  tl_load_stride->c = s.c;
  tl_load_stride->h = s.h;
  tl_load_stride->w = s.w;

  // init tg
  bmk1880v2_tensor_tgmem_stride_t gs =
      bmk1880v2_tensor_tgmem_default_stride(*tg_shape, src_tg_fmt);

  tg_stride->n = gs.n;
  tg_stride->c = gs.c;
  tg_stride->h = gs.h;

  return 0;
}
|
||||
|
||||
|
||||
/**
 * \brief get proper reshape size for depthwise conv with 'same' mode in h direction
 * \return -1 means alloc fail
 * \NOTICE: not support batch/ins_x/dilated_x/pad_top/pad_bottom
 *
 * Outputs (all written in place):
 *   - tl_load_shape / new_tl_ifmap_stride: reshaped local ifmap (h padded)
 *   - new_tg_ifmap_shape / new_tg_ifmap_stride: matching global view
 *   - new_tl_weight_shape / new_tl_bias_shape: per-lane weight/bias shapes
 *   - new_tl_ofmap_shape: ofmap shape derived from pooling output formulas
 */
int bm1880v2_reshape_channel_same(
    bmk1880v2_context_t *bk_ctx,
    int ic, int ih, int iw, int kh, int kw,
    int pad_right, int pad_left, int stride_h, int stride_w,
    bmk1880v2_tensor_lmem_shape_t* tl_load_shape,
    bmk1880v2_tensor_lmem_stride_t* new_tl_ifmap_stride,
    bmk1880v2_tensor_tgmem_shape_t* new_tg_ifmap_shape,
    bmk1880v2_tensor_tgmem_stride_t* new_tg_ifmap_stride,
    bmk1880v2_tensor_lmem_shape_t* new_tl_weight_shape,
    bmk1880v2_tensor_lmem_shape_t* new_tl_bias_shape,
    bmk1880v2_tensor_lmem_shape_t* new_tl_ofmap_shape,
    fmt_t fmt, int eu_align) {

  assert(eu_align == 0 || eu_align == 1);

  bmk1880v2_chip_info_t chip_info = bmk1880v2_chip_info();
  // TODO: verify dilation_h/dilation_w
  int dilation_h = 1;
  int dilation_w = 1;
  // TODO: verify p->ins_h, p->ins_last_h
  // diluted kernel height (no insertion/pad here since all extras are 0)
  int d_kh = calc_dilute_hw(kh, dilation_h - 1, 0, 0, 0);
  int h_after = calc_dilute_hw(ih, 0, 0, 0, 0);
  int in = 1; // batch fixed to 1 (see NOTICE above)
  //int h_after = calc_dilute_hw(ih, p->ins_h, p->ins_last_h, p->pad_top, p->pad_bottom);
  //int w_after = calc_dilute_hw(iw, p->ins_w, p->ins_last_w, p->pad_left, p->pad_right);
  // fold h into c to spread work across NPU lanes
  int ret = _get_dup_shape(bk_ctx, in, ic, h_after, iw, d_kh, stride_h, chip_info.npu_num,
      tl_load_shape, new_tl_ifmap_stride, new_tg_ifmap_shape, new_tg_ifmap_stride,
      fmt, fmt);

  if (ret == -1) {
    return ret;
  }

  // weight replicated per reshaped channel; one kh*kw filter per lane
  new_tl_weight_shape->n = 1;
  new_tl_weight_shape->c = tl_load_shape->c;
  new_tl_weight_shape->h = kh;
  new_tl_weight_shape->w = kw;

  // n = 2: bias occupies two rows (high/low parts) per hardware convention
  // used elsewhere in this file — TODO confirm against bias layout docs
  new_tl_bias_shape->n = 2;
  new_tl_bias_shape->c = tl_load_shape->c;
  new_tl_bias_shape->h = 1;
  new_tl_bias_shape->w = 1;

  int pad_h = get_same_pad(tl_load_shape->h, stride_h, kh);
  //int no_pad_h = tl_load_shape->h;

  // reserve for padding: both local and global views grow by pad_h rows
  new_tg_ifmap_shape->h += pad_h;
  tl_load_shape->h += pad_h;

  // recompute the local stride now that h changed
  bmk1880v2_tensor_lmem_stride_t s =
    bmk1880v2_tensor_lmem_default_stride(bk_ctx, *tl_load_shape, fmt, eu_align);

  new_tl_ifmap_stride->n = s.n;
  new_tl_ifmap_stride->c = s.c;
  new_tl_ifmap_stride->h = s.h;
  new_tl_ifmap_stride->w = s.w;

  // TODO: verity ins_x
  int oh = pooling_oh(0, 0, 0, 0,
      stride_h, tl_load_shape->h, kh, dilation_h);
  int ow = pooling_ow(0, 0, pad_left, pad_right,
      stride_w, tl_load_shape->w, kw, dilation_w);

#ifdef DBG
  printf("new oh/ow pad_h is %d/%d %d\n", oh, ow, pad_h);
#endif
  new_tl_ofmap_shape->n = in;
  new_tl_ofmap_shape->c = tl_load_shape->c;
  new_tl_ofmap_shape->h = oh;
  new_tl_ofmap_shape->w = ow;

  return ret;
}
|
||||
|
||||
/*
|
||||
* \brief duplicate weight for reshaped c
|
||||
*/
|
||||
uint8_t* bm1880v2_reshape_channel_weight(uint8_t* weight,
|
||||
int ni, int ci, int hi, int wi,
|
||||
int old_weight_c,
|
||||
fmt_t fmt) {
|
||||
|
||||
assert(weight);
|
||||
assert(ci / old_weight_c > 0 && ci % old_weight_c == 0);
|
||||
|
||||
int sz = fmt == FMT_BF16 ? 2 : 1;
|
||||
|
||||
int new_weight_hw_shape_size = hi * wi;
|
||||
int new_weight_shape_size = ni * ci * hi * wi;
|
||||
int duplicat_c = ci / old_weight_c;
|
||||
uint8_t *new_weight = (uint8_t *)malloc(new_weight_shape_size * sz);
|
||||
|
||||
|
||||
for (int n = 0; n < ni; n++) {
|
||||
for (int c = 0; c < old_weight_c; c++) {
|
||||
int index = (n * old_weight_c + c) * new_weight_hw_shape_size * sz;
|
||||
for (int i = 0; i < duplicat_c; i++) {
|
||||
int new_weight_index = (n * old_weight_c * duplicat_c +
|
||||
c * duplicat_c + i) * new_weight_hw_shape_size * sz;
|
||||
memcpy(&new_weight[new_weight_index], &weight[index], new_weight_hw_shape_size * sz);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return new_weight;
|
||||
}
|
||||
|
||||
/*
 * \brief prepare load shape/stride with pad
 * \return -1 means fail to reshape, 0 means success
 * \TODO check memory usage
 *
 * Variant of _get_dup_shape for an ifmap whose h is already padded:
 * the total output height _oh is computed from the pooling formula first,
 * then split across c by a factor i that divides _oh evenly. The chosen
 * oh keeps one extra stride position for later padding.
 */
static inline int _get_dup_shape_same_pad(
    bmk1880v2_context_t *bk_ctx,
    int in, int ic, int ih, int iw,
    int d_kh, int stride_h, int npu_num,
    bmk1880v2_tensor_lmem_shape_t* tl_load_shape,
    bmk1880v2_tensor_lmem_stride_t* tl_load_stride,
    bmk1880v2_tensor_tgmem_shape_t* tg_shape,
    bmk1880v2_tensor_tgmem_stride_t* tg_stride,
    fmt_t src_tg_fmt, fmt_t dst_tl_fmt
    ) {

  assert(in > 0 && ic > 0 && ih > 0 && iw > 0 && d_kh > 0 && stride_h > 0);
  assert(tl_load_shape && tl_load_stride && tg_shape && tg_stride);

  // 1. reshape and extend c, h axis in order
  int oc;
  int oh;

  // FIXME: check kernel setting
  // 0 doubles as the "no valid split found" sentinel checked below
  oh = 0;

  // 2. get total output
  // 3. slice output
  // the padded height must land exactly on kernel/stride boundaries
  assert((ih - d_kh) % stride_h == 0);
  int ih_ext = pooling_ih_ext(0, 0, 0, 0, ih);
  // total output rows for the whole (padded) input
  int _oh = (ih_ext - d_kh) / stride_h + 1;

  // largest split factor first so oc uses as many NPU lanes as possible;
  // oc is only assigned together with oh inside this branch
  for (int i = npu_num/ic; i > 0; i--) {
    if (_oh % i == 0) {
      // add 1 for later padding
      oh = stride_h * (_oh / i - 1) + 1;
      oc = i * ic;
      break;
    }
  }

  if (!oh) {
    // FIXME: check terminal condition
    return -1;
  }

#ifdef DBG
  printf ("ic:ih is %d %d, oc:oh is %d:%d\n", ic, ih, oc, oh);
#endif

  // tg/tl MUST be same shape size
  tl_load_shape->n = tg_shape->n = 1;
  tl_load_shape->c = tg_shape->c = oc;
  tl_load_shape->h = tg_shape->h = oh;
  tl_load_shape->w = tg_shape->w = iw;

  // init tl: hardware default stride for the chosen local shape
  bmk1880v2_tensor_lmem_stride_t s =
    bmk1880v2_tensor_lmem_default_stride(bk_ctx, *tl_load_shape, dst_tl_fmt, CTRL_NULL);
  tl_load_stride->n = s.n;
  tl_load_stride->c = s.c;
  tl_load_stride->h = s.h;
  tl_load_stride->w = s.w;

  // init tg: default global-memory stride for the same shape
  bmk1880v2_tensor_tgmem_stride_t gs =
    bmk1880v2_tensor_tgmem_default_stride(*tg_shape, src_tg_fmt);

  tg_stride->n = gs.n;
  tg_stride->c = gs.c;
  tg_stride->h = gs.h;

  return 0;
}
|
||||
|
||||
/**
 * \brief get proper reshape size for depthwise conv with 'same' mode in h direction
 * 'pad' means \ih is padded
 * \return -1 means alloc fail
 * \NOTICE: not support batch/ins_x/dilated_x/pad_top/pad_bottom
 *
 * Same flow as bm1880v2_reshape_channel_same, but the channel split is
 * computed by _get_dup_shape_same_pad, which works from the pooling output
 * height of the already-padded input.
 */
int bm1880v2_reshape_channel_same_pad(
    bmk1880v2_context_t *bk_ctx,
    int ic, int ih, int iw, int kh, int kw,
    int pad_right, int pad_left, int stride_h, int stride_w,
    bmk1880v2_tensor_lmem_shape_t* tl_load_shape,
    bmk1880v2_tensor_lmem_stride_t* new_tl_ifmap_stride,
    bmk1880v2_tensor_tgmem_shape_t* new_tg_ifmap_shape,
    bmk1880v2_tensor_tgmem_stride_t* new_tg_ifmap_stride,
    bmk1880v2_tensor_lmem_shape_t* new_tl_weight_shape,
    bmk1880v2_tensor_lmem_shape_t* new_tl_bias_shape,
    bmk1880v2_tensor_lmem_shape_t* new_tl_ofmap_shape,
    fmt_t fmt, int eu_align) {

  assert(eu_align == 0 || eu_align == 1);

  bmk1880v2_chip_info_t chip_info = bmk1880v2_chip_info();
  // TODO: verify dilation_h/dilation_w
  int dilation_h = 1;
  int dilation_w = 1;
  // TODO: verify p->ins_h, p->ins_last_h
  // diluted kernel height (no insertion/pad here since all extras are 0)
  int d_kh = calc_dilute_hw(kh, dilation_h - 1, 0, 0, 0);
  int h_after = calc_dilute_hw(ih, 0, 0, 0, 0);
  int in = 1; // batch fixed to 1 (see NOTICE above)
  //int h_after = calc_dilute_hw(ih, p->ins_h, p->ins_last_h, p->pad_top, p->pad_bottom);
  //int w_after = calc_dilute_hw(iw, p->ins_w, p->ins_last_w, p->pad_left, p->pad_right);
  int ret = _get_dup_shape_same_pad(bk_ctx, in, ic,
      h_after, iw, d_kh, stride_h, chip_info.npu_num,
      tl_load_shape, new_tl_ifmap_stride, new_tg_ifmap_shape, new_tg_ifmap_stride,
      fmt, fmt);

  if (ret == -1) {
    return ret;
  }

  // weight replicated per reshaped channel; one kh*kw filter per lane
  new_tl_weight_shape->n = 1;
  new_tl_weight_shape->c = tl_load_shape->c;
  new_tl_weight_shape->h = kh;
  new_tl_weight_shape->w = kw;

  // n = 2: bias occupies two rows (high/low parts) per hardware convention
  // used elsewhere in this file — TODO confirm against bias layout docs
  new_tl_bias_shape->n = 2;
  new_tl_bias_shape->c = tl_load_shape->c;
  new_tl_bias_shape->h = 1;
  new_tl_bias_shape->w = 1;

  int pad_h = get_same_pad(tl_load_shape->h, stride_h, kh);
  //int no_pad_h = tl_load_shape->h;

  // reserve for padding: both local and global views grow by pad_h rows
  new_tg_ifmap_shape->h += pad_h;
  tl_load_shape->h += pad_h;

  // recompute the local stride now that h changed
  bmk1880v2_tensor_lmem_stride_t s =
    bmk1880v2_tensor_lmem_default_stride(bk_ctx, *tl_load_shape, fmt, eu_align);

  new_tl_ifmap_stride->n = s.n;
  new_tl_ifmap_stride->c = s.c;
  new_tl_ifmap_stride->h = s.h;
  new_tl_ifmap_stride->w = s.w;

  // TODO: verity ins_x
  int oh = pooling_oh(0, 0, 0, 0,
      stride_h, tl_load_shape->h, kh, dilation_h);
  int ow = pooling_ow(0, 0, pad_left, pad_right,
      stride_w, tl_load_shape->w, kw, dilation_w);

#ifdef DBG
  printf("new oh/ow pad_h is %d/%d %d\n", oh, ow, pad_h);
#endif
  new_tl_ofmap_shape->n = in;
  new_tl_ofmap_shape->c = tl_load_shape->c;
  new_tl_ofmap_shape->h = oh;
  new_tl_ofmap_shape->w = ow;

  return ret;
}
|
||||
277
cvikernel/src/bm1880v2/non_atomic/tiu_sigmoid.c
Normal file
277
cvikernel/src/bm1880v2/non_atomic/tiu_sigmoid.c
Normal file
@ -0,0 +1,277 @@
|
||||
/**
|
||||
* implement Linear interpolation search
|
||||
*
|
||||
 * we need to pass 2 tables, one is the answer (lut_answer), the other is the slope with the answer (lut_answer_slope),
|
||||
*
|
||||
* for example, we want to get x value
|
||||
* +------+----+
|
||||
* x0 x x1
|
||||
*
|
||||
 * [Linear interpolation](https://en.wikipedia.org/wiki/Linear_interpolation) is defined as follows:
|
||||
*
|
||||
* part C part A part B
|
||||
* +--+ +---+ +----------------------------------------+
|
||||
*
|
||||
* p(x) = f(x0) + ( (f(x1) - f(x0)) / (x1 - x0) ) * (x - x0)
|
||||
*
|
||||
* +---+ +-----------------------------+
|
||||
* lut_answer lut_answer_slope
|
||||
*/
|
||||
|
||||
#include "gen_lut.h"
|
||||
#include <bmkernel/bm1880v2/1880v2_fp_convert.h>
|
||||
|
||||
//#define DBG
|
||||
/*
 * NOTICE: it could occupy 2 lookup table size which shape is <1,32,32,8> with bf16 data type
 *
 * \tl_buf tmp buffer, the shape MUST be same with \tl_ifmap
 * \tl_ofmap_bf16 result as bf16, MUST given for tmp buffer used
 *
 * Emits the hardware op sequence for a piecewise-linear sigmoid:
 *   1. scale input so its range maps onto the int8 table index space
 *   2. round to int8 to obtain the table index x0
 *   3. compute the fraction (x - x0)
 *   4. look up f(x0) and slope(x0) from the two tables
 *   5. mac: result = f(x0) + slope(x0) * (x - x0)
 * tl_ifmap is clobbered (holds x - x0 at the end). Always returns 0.
 */
int bf16_emit_sigmoid(ctx_t *ctx,
    bmk1880v2_tensor_lmem_t* IN tl_ifmap,
    bmk1880v2_tensor_lmem_t* IN tl_buf,
    bmk1880v2_tensor_lmem_t *tl_table_answer,
    bmk1880v2_tensor_lmem_t *tl_table_answer_slope,
    bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16,
    float scale
    ) {
  bf16_table_check(tl_ifmap, tl_table_answer, tl_table_answer_slope, tl_buf);
  assert_same_shape_3(tl_ifmap, tl_buf, tl_ofmap_bf16);

  fmt_t fmt = FMT_BF16;

  // flatten h*w so the int8 index copy below can interleave by stride
  tl_shape_t tl_ofmap_A_idx_int8_shape = {1, tl_buf->shape.c, tl_buf->shape.h * tl_buf->shape.w, 1};

  bmk1880v2_tdma_l2l_tensor_copy_param_t p10;

  // scale input for remap its idx(-x~x) to (-127~127), dirty tl_ifmap
  bmk1880v2_tiu_element_wise_mul_param_t p1;
  memset(&p1, 0, sizeof(p1));
  p1.res_high = NULL;
  p1.res_low = tl_ifmap;
  p1.a = tl_ifmap;
  p1.b_is_const = 1;
  p1.b_const.val = convert_fp32_bf16(scale);
  p1.rshift_bits = 0;
  p1.relu_enable = 0;
  bmk1880v2_tiu_element_wise_mul(ctx, &p1);


  // <! get idx from bf16->int8
  // save by stride
  memset(&p10, 0x00, sizeof(bmk1880v2_tdma_l2l_tensor_copy_param_t));
  bmk1880v2_tensor_lmem_t dst;
  // dst is a re-typed view over tl_ofmap_bf16's memory (int8, flattened)
  memcpy(&dst, tl_ofmap_bf16, sizeof(bmk1880v2_tensor_lmem_t));
  dst.fmt = FMT_I8;
  dst.shape = tl_ofmap_A_idx_int8_shape;
  //dst.stride = bmk1880v2_tensor_lmem_default_stride(ctx, dst.shape, /*eu_align*/ 1, dst.fmt);
  dst.stride = bmk1880v2_tensor_lmem_default_stride(ctx, dst.shape, dst.fmt, CTRL_NULL);
  // double the h stride so int8 indices land at every other byte, matching
  // the bf16 element spacing of the underlying buffer
  dst.stride.h = dst.stride.h * 2;
  dst.int8_rnd_mode = 1; // round (not truncate) when narrowing bf16 -> int8
  p10.dst = &dst;
  p10.src = tl_ifmap;
  bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p10);
  dst.int8_rnd_mode = 0; // reset

  // <! int8 to fb16 format cus for sub use, sub MUST in the same format
  memset(&p10, 0x00, sizeof(bmk1880v2_tdma_l2l_tensor_copy_param_t));
  p10.dst = tl_buf; //<! bf16
  p10.src = &dst;
  bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p10);

  // <! sub, diff base , a - b
  // (x - x0)
  bmk1880v2_tiu_element_wise_sub_param_t p5;
  memset(&p5, 0, sizeof(p5));
  p5.res_high = 0;
  p5.res_low = tl_ifmap;
  p5.a_high = 0;
  p5.a_low = tl_ifmap;
  p5.b_high = 0;
  p5.b_low = tl_buf;
  p5.rshift_bits = 0;
  bmk1880v2_tiu_element_wise_sub(ctx, &p5);

  // get f(x0) and slope(x)
  // reshape, 16->16: reuse dst as a bf16 view with tl_buf's layout
  dst.fmt = fmt;
  dst.shape = tl_buf->shape;
  dst.stride = tl_buf->stride;

  // <! get slope by index
  // <! ( (f(x1) - f(x0)) / (x1 - x0) )
  // <! TIU MUST with same shape and stride, we leverage output map shape and stride
  bmk1880v2_tiu_lookup_table_param_t p12;
  memset(&p12, 0x0, sizeof(bmk1880v2_tiu_lookup_table_param_t));
  p12.ofmap = tl_buf;
  p12.ifmap = &dst;
  p12.table = tl_table_answer_slope;
  bmk1880v2_tiu_lookup_table(ctx, &p12);

  // base f(x0)
  memset(&p12, 0x0, sizeof(bmk1880v2_tiu_lookup_table_param_t));
  p12.ofmap = tl_ofmap_bf16;
  p12.ifmap = &dst;
  p12.table = tl_table_answer;
  bmk1880v2_tiu_lookup_table(ctx, &p12);

  // <! mac
  // <! part A + part B, a * b + res = res
  // result = f(x0) + (x - x0) * slope(x0)
  bmk1880v2_tiu_element_wise_mac_param_t p2;
  memset(&p2, 0, sizeof(p2));
  p2.res_high = 0;
  p2.res_low = tl_ofmap_bf16;
  p2.res_is_int8 = 0;
  p2.a = tl_ifmap;
  p2.b_is_const = 0;
  p2.b = tl_buf;
  p2.lshift_bits = 0;//lshift_bits;
  p2.rshift_bits = 0;//rshift_bits;
  p2.relu_enable = 0;
  bmk1880v2_tiu_element_wise_mac(ctx, &p2);
  return 0;
}
|
||||
|
||||
static double _gen_sigmoid(float x) {
|
||||
return 1.0 / (1.0 + exp(-(x)));
|
||||
}
|
||||
|
||||
/*
 * Allocate the double-precision scratch buffer that bf16_gen_sigmoid fills
 * and bf16_gen_sigmoid_slope consumes. Returns NULL if malloc fails; the
 * caller releases it with bf16_free_sigmoid_double().
 */
double* bf16_gen_sigmoid_double() {
  int table_hw = bf16_table_hw();
  // +1 element: bf16_gen_sigmoid_slope evaluates sigmode_hw[i + 1] with i
  // running up to table_hw - 1 (the value is discarded on the wrap-around
  // branches, but the read itself must stay in bounds).
  return (double*)malloc(sizeof(double) * (table_hw + 1));
}
|
||||
|
||||
/* Release a buffer obtained from bf16_gen_sigmoid_double(). */
void bf16_free_sigmoid_double(double *sigmode_hw) {
  // free(NULL) is a defined no-op, so no guard is required here
  free(sigmode_hw);
}
|
||||
|
||||
/*
 * Fill the sigmoid answer table (bf16) and its double-precision shadow.
 * Layout of channel 0 (table_hw entries, indexed by int8 two's complement):
 *   [0 .. half-1]       sigmoid(i / scale)          — non-negative inputs
 *   [half]              sigmoid(range_start)        — the -128 slot
 *   [half+1 .. end]     sigmoid(range_start + i/scale), i = 1..half-1
 * Channel 0 is then copied into every remaining channel.
 * sigmode_hw keeps the unrounded doubles for the slope generator.
 */
void bf16_gen_sigmoid(uint16_t *table_data,
    bmk1880v2_tensor_lmem_shape_t* table_shape,
    double *sigmode_hw, float scale,
    int range_start) {
  // S(x) = 1 / (1 + (e^-x))
  //<! 32*8 table, duplicate `channel` times;
  uint64_t idx = 0;
  assert(is_1880v2_tbl_shape(table_shape));

  int half = half_h_table();
  int table_hw = bf16_table_hw();


  // prepare channel 0
  // x [0, 127]
  // we re-scale [-8, 8] into 256
  for (int i = 0; i < half; i++) {
    float _idx = idx / scale;
    double s = _gen_sigmoid(_idx);
    sigmode_hw[idx] = s;
    table_data[idx] = convert_fp32_bf16((float)s);
#ifdef GDB
    printf("t [%lu] is %f[%d], 0x%x fp is %f d is %.8lf, input is %f\n", idx, convert_bf16_fp32(table_data[idx]), i, table_data[idx], (float)s, s, _idx);
#endif
    idx++;
  }

  // x = -128
  double s = _gen_sigmoid(range_start);
  sigmode_hw[idx] = s;
  table_data[idx] = convert_fp32_bf16((double)s);
#ifdef GDB
  printf("t [%lu] is %f[%d], 0x%x fp is %f d is %.8lf input is %d\n", idx, convert_bf16_fp32(table_data[idx]), -128, table_data[idx], (float)s, s, range_start);
#endif
  idx++;

  // x [-128~-1], 2's complement
  // entries ascend from range_start; index half+i maps to input -128+i
  for (int i = 1; i < half; i++) {
    float _idx = (i) / scale;
    double s = _gen_sigmoid(range_start + _idx);
    sigmode_hw[idx] = s;
    table_data[idx] = convert_fp32_bf16((double)s);
#ifdef GDB
    printf("t [%lu] is %f[%d], 0x%x fp is %f d is %.8lf input is %f\n", idx, convert_bf16_fp32(table_data[idx]), -127 + i, table_data[idx], (float)s, s, range_start + _idx);
#endif
    idx++;
  }

  // duplicate channel #1 to #31

  //TODO: tensor copy
  for (uint32_t i = 1; i < table_shape->c; i++) {
    memcpy(&table_data[i * table_hw], &table_data[0], sizeof(uint16_t) * table_hw);
  }
}
|
||||
|
||||
/*
 * Index scale mapping the input range [range_start, range_end] onto the
 * lookup-table width, e.g. 256 / 16 = 16.
 */
float bf16_sigmoid_scale(int range_start, int range_end) {
  int width = abs(range_start - range_end);
  int table_hw = bf16_table_hw();
  return table_hw / (1.0 * width);
}
|
||||
|
||||
/*
 * Fill the sigmoid slope table from the double-precision answers in
 * sigmode_hw (produced by bf16_gen_sigmoid). Entry i holds
 * (f(x_{i+1}) - f(x_i)) for the positive half and the mirrored difference
 * for the negative (two's-complement) half, with special handling at the
 * positive end (i == half-1) and at the -128 slot (i == half).
 * Channel 0 is then copied into every remaining channel.
 */
void bf16_gen_sigmoid_slope(uint16_t* OUT table_slope,
    bmk1880v2_tensor_lmem_shape_t* table_shape,
    double *sigmode_hw, float scale,
    int range_start, int range_end) {

  assert(is_1880v2_tbl_shape(table_shape));

  int half = half_h_table();
  int table_hw = bf16_table_hw();

  for (int i = 0; i < table_hw; i++) {
    double x0 = sigmode_hw[i];
    // NOTE(review): at i == table_hw - 1 this reads sigmode_hw[table_hw],
    // one past a table_hw-sized buffer, before the i > half branch discards
    // it — confirm the buffer is allocated with at least table_hw + 1 slots.
    double x1 = sigmode_hw[i+1];
    double delta = 1.0;
    if (i == half - 1) {
      //<! slope[127] means f(127)~f(128)
      double f = _gen_sigmoid(range_end);
      //uint16_t bf16 = convert_fp32_bf16(f);
      //x1 = convert_bf16_fp32(bf16);
      x1 = f;
    }
    else if (i == half) {
      // 128 index mean x1 is -129 and x0 is -128
      x1 = _gen_sigmoid(range_start - 1/scale);
      delta = -1.0;
    }
    else if (i > half) {
      // negative half: neighbours are stored in descending input order,
      // so take the previous entry and flip the sign via delta
      x0 = sigmode_hw[i];
      x1 = sigmode_hw[i-1];
      delta = -1.0;
    }
    double s = (x1 - x0) / delta; // x1 already scale up
    table_slope[i] = convert_fp32_bf16((float)s);
#ifdef GDB
    printf ("slope table [%u] = (bf16 %f double %.8lf float %f), 0x%x, %.8lf - %.8lf(%.8lf)\n",
        i, convert_bf16_fp32(table_slope[i]), s, (float)s, table_slope[i], x1, x0, x1-x0);
#endif
  }

  // duplicate channel #1 to #31

  //TODO: tensor copy
  for (uint64_t i = 1; i < table_shape->c; i++) {
    memcpy(&table_slope[table_hw * i], &table_slope[0], sizeof(uint16_t) * table_hw);
  }
}
|
||||
|
||||
void bf16_sigmoid_tbl(uint16_t *sigmoid_table_data, uint16_t* sigmoid_table_data_slope,
|
||||
bmk1880v2_tensor_lmem_shape_t* table_shape,
|
||||
int range_start, int range_end
|
||||
) {
|
||||
|
||||
assert(sigmoid_table_data);
|
||||
assert(sigmoid_table_data_slope);
|
||||
assert(table_shape);
|
||||
|
||||
double* sigmode_hw = bf16_gen_sigmoid_double();
|
||||
|
||||
float scale = bf16_sigmoid_scale(range_start, range_end);
|
||||
|
||||
bf16_gen_sigmoid(sigmoid_table_data, table_shape, sigmode_hw, scale, range_start);
|
||||
|
||||
bf16_gen_sigmoid_slope(sigmoid_table_data_slope,
|
||||
table_shape, sigmode_hw, scale,
|
||||
range_start, range_end);
|
||||
|
||||
bf16_free_sigmoid_double(sigmode_hw);
|
||||
}
|
||||
138
cvikernel/src/bm1880v2/non_atomic/tiu_sqrt.c
Normal file
138
cvikernel/src/bm1880v2/non_atomic/tiu_sqrt.c
Normal file
@ -0,0 +1,138 @@
|
||||
/**
|
||||
*/
|
||||
#include "gen_lut.h"
|
||||
#include <bmkernel/bm1880v2/1880v2_fp_convert.h>
|
||||
|
||||
//#define DBG
|
||||
/*
|
||||
* NOTICE: it could occupy 2 lookup table size which shape is <1,32,32,8> with bf16 data type
|
||||
*
|
||||
* \tl_buf tmp buffer, the shape MUST be same with \tl_ifmap
|
||||
* \tl_ofmap_bf16 result as bf16, MUST given for tmp buffer used
|
||||
*/
|
||||
int bf16_emit_sqrt(ctx_t *ctx,
|
||||
bmk1880v2_tensor_lmem_t* IN tl_ifmap,
|
||||
bmk1880v2_tensor_lmem_t* IN tl_buf,
|
||||
bmk1880v2_tensor_lmem_t *tbl_answer,
|
||||
bmk1880v2_tensor_lmem_t *tbl_answer_mantissa,
|
||||
bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16
|
||||
) {
|
||||
|
||||
return bf16_lut_exp_mantissa(ctx,
|
||||
tl_ifmap,
|
||||
tl_buf,
|
||||
tbl_answer,
|
||||
tbl_answer_mantissa,
|
||||
tl_ofmap_bf16
|
||||
);
|
||||
}
|
||||
|
||||
static double _gen_sqrt(int base, int p) {
|
||||
// y = x ^ 0.5
|
||||
double f = (double) (pow(base, p * 0.5));
|
||||
|
||||
if (isnan(f)) {
|
||||
assert(0);
|
||||
}
|
||||
return f;
|
||||
}
|
||||
|
||||
/*
 * Fill the sqrt exponent table (bf16). Entry 0 is 0 (sqrt(0)); the next
 * `half` entries hold sqrt(2^exp) for exponents starting at bf16_exp_start(),
 * rounded down to an even exponent so the result is an exact power of two.
 * Channel 0 is then copied into every remaining channel.
 */
void bf16_gen_sqrt(uint16_t *table_data, bmk1880v2_tensor_lmem_shape_t* table_shape) {

  assert(is_1880v2_tbl_shape(table_shape));

  int exp_start = bf16_exp_start();
  int half = half_h_table();
  int table_hw = bf16_table_hw();
  uint64_t idx = 0;

  // prepare channel 0
  double s = 0.0;
  table_data[idx] = convert_fp32_bf16(s); // 0^0.5 = 0
#ifdef DBG
  printf("t [%lu] is %f(%.8lf)[idx:%f][2^%f] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), s, (float)exp_start, (float)(exp_start/2), table_data[idx]);
#endif
  idx++;

  // > 0, exp from 0 -62 -61 .. 62 63
  for (int i = 0; i < half; i++) {
    int shift = (exp_start + i);
    // NOTE(review): shift may be negative here; `shift % 2` is then -1 and
    // the uint8_t conversion makes is_odd 255 (still truthy) — behaves as
    // "odd" but worth confirming the intent for negative exponents.
    uint8_t is_odd = (shift % 2);
    float exp = shift;
    if (is_odd) {
      // round down to an even exponent so 2^(exp/2) is exact
      exp = exp - 1;
    }

    double s = _gen_sqrt(2, exp);
    table_data[idx] = convert_fp32_bf16(s);
#ifdef DBG
    printf("t [%lu] is %f [idx:%f][2^%f(%f)] bf %x\n", idx,
        convert_bf16_fp32(table_data[idx]),
        (float)(exp_start + i), exp/2, (exp_start + i) / 2.0,
        table_data[idx]);
#endif
    idx++;
  }

  //// idx = 127 dont care
  // duplicate channel #1 to #channel
  //TODO: tensor copy

  for (uint32_t i = 1; i < table_shape->c; i++) {
    memcpy(&table_data[i * table_hw], &table_data[0], sizeof(uint16_t) * table_hw);
  }
}
|
||||
|
||||
/*
 * Fill the sqrt mantissa table (bf16). For each mantissa step m = 1 + i/128:
 *   table[i]       = sqrt(2 * m)  — odd-exponent correction entries
 *   table[128 + i] = sqrt(m)      — even-exponent entries
 * (e.g. 13 = 2^3 * 1.625 = (2^2) * (2^1 * 1.625)).
 * NOTE(review): the 128 offset presumably equals half_h_table() — confirm.
 * Channel 0 is then copied into every remaining channel.
 */
void bf16_gen_sqrt_mantissa(uint16_t* OUT table_mantissa,
    bmk1880v2_tensor_lmem_shape_t* table_shape) {

  assert(is_1880v2_tbl_shape(table_shape));

  uint32_t half = half_h_table();
  int table_hw = bf16_table_hw();

  int idx = 0;
  double d;
  for (uint32_t i = 0; i < half; i++) {
    // even-exponent half: sqrt of the bare mantissa
    d = 1 + i * 1 / 128.0;
    d = (double) pow(d, 0.5);
    table_mantissa[128+idx] = convert_fp32_bf16(d);
#ifdef DBG
    //printf(", [%u] is %lf\n", i+128, d);
#endif /* ifdef DBG */

    //13=2^3x1.625=(2^2)x(2^1x1.625)
    // odd-exponent half: fold the leftover factor of 2 into the mantissa
    d = 2 * (1 + i * 1 / 128.0);
    d = (double) pow(d, 0.5);
    table_mantissa[idx] = convert_fp32_bf16(d);
#ifdef DBG
    //printf("mantissa [%u] is %lf", i, d);
#endif /* ifdef DBG */
    idx++;
  }
#ifdef DBG
  for (uint32_t i = 0; i < 2 * half; i++) {
    printf("mantissa [%u] is %lf, 0x%x\n", i, convert_bf16_fp32(table_mantissa[i]),
        table_mantissa[i]);
  }
#endif /* ifdef DBG */

  // duplicate channel #1 to #31
  //TODO: tensor copy
  for (uint64_t i = 1; i < table_shape->c; i++) {
    memcpy(&table_mantissa[table_hw * i], &table_mantissa[0], sizeof(uint16_t) * table_hw);
  }
}
|
||||
|
||||
|
||||
void bf16_sqrt_tbl(uint16_t *sqrt_table_data, uint16_t* sqrt_table_data_mantissa,
|
||||
bmk1880v2_tensor_lmem_shape_t* table_shape
|
||||
) {
|
||||
|
||||
assert(sqrt_table_data);
|
||||
assert(sqrt_table_data_mantissa);
|
||||
assert(table_shape);
|
||||
|
||||
bf16_gen_sqrt (sqrt_table_data, table_shape);
|
||||
bf16_gen_sqrt_mantissa(sqrt_table_data_mantissa, table_shape);
|
||||
}
|
||||
1960
cvikernel/src/bm1880v2/tdma.c
Normal file
1960
cvikernel/src/bm1880v2/tdma.c
Normal file
File diff suppressed because it is too large
Load Diff
79
cvikernel/src/bm1880v2/tiu_average_pooling.c
Normal file
79
cvikernel/src/bm1880v2/tiu_average_pooling.c
Normal file
@ -0,0 +1,79 @@
|
||||
#include "kernel_1880v2.h"
|
||||
#include <bmkernel/bm1880v2/1880v2_fp_convert.h>
|
||||
|
||||
/*
 * Emit a TIU average-pooling command (depthwise-pool unit, tsk_eu_typ 1).
 * Validates shapes/strides, programs the tiu register block from the
 * parameter struct and pushes it to the command buffer.
 * For bf16, the per-window divide is folded into the constant operand since
 * the hardware has no divider.
 */
bmk1880v2_op_t * bmk1880v2_tiu_average_pooling(
    ctx_t *ctx,
    const bmk1880v2_tiu_average_pooling_param_t *p)
{
  int bf16_enable = (p->ifmap->fmt == FMT_BF16) ? 1 : 0;

  // pooling never changes n or c
  ASSERT(p->ifmap->shape.n == p->ofmap->shape.n);
  ASSERT(p->ifmap->shape.c == p->ofmap->shape.c);
  check_tiu_tensor_2(p->ifmap, p->ofmap);
  if (bf16_enable) {
    assert_bf16_stride_type_0(ctx, p->ifmap);
    assert_bf16_stride_type_0(ctx, p->ofmap);
  } else {
    assert_stride_type_0(ctx, p->ifmap);
    assert_stride_type_0(ctx, p->ofmap);
  }

  int opd0_sign = tensor_is_signed(p->ifmap);

  tiu_reg_t reg;
  reset_tiu_reg(&reg);
  reg.cmd_en = 1;
  reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B;
  reg.tsk_eu_typ = 1; // average-pooling sub-op of the depthwise/pool unit
  reg.opt_shift_typ = opd0_sign;
  reg.opt_right_shift = p->rshift_bits;
  reg.opt_relu = 0; /* hardware relu function not verified. */
  reg.tsk_opd_num = 2;
  reg.opd_typ = bf16_enable ? 1: 0;

  // result (res0) descriptor <- ofmap
  reg.res0_addr = p->ofmap->start_address;
  reg.opt_res0_sign = opd0_sign;
  reg.opt_res0_int8 = 1;
  reg.res0_n = p->ofmap->shape.n;
  reg.res0_c = p->ofmap->shape.c;
  reg.res0_h = p->ofmap->shape.h;
  reg.res0_w = p->ofmap->shape.w;

  // operand 0 descriptor <- ifmap, plus padding / insertion settings
  reg.opd0_addr = p->ifmap->start_address;
  reg.opt_opd0_sign = opd0_sign;
  reg.opt_opd0_int8 = 1;
  reg.opd0_n = p->ifmap->shape.n;
  reg.opd0_c = p->ifmap->shape.c;
  reg.opd0_h = p->ifmap->shape.h;
  reg.opd0_w = p->ifmap->shape.w;
  // insertion fill value: int8 path uses ins_val, bf16 path uses ins_fp
  reg.opd0_ins_val = bf16_enable ? 0 : (uint32_t)p->ins_val;
  reg.opd0_ins_fp = bf16_enable ? (uint32_t)p->ins_fp : 0;
  reg.conv_opd0_up_pad = p->pad_top;
  reg.conv_opd0_dn_pad = p->pad_bottom;
  reg.conv_opd0_lf_pad = p->pad_left;
  reg.conv_opd0_rt_pad = p->pad_right;
  reg.conv_opd0_x_ins0 = p->ins_w;
  reg.conv_opd0_y_ins0 = p->ins_h;
  reg.conv_opd0_x_ins0_last = p->ins_last_w;
  reg.conv_opd0_y_ins0_last = p->ins_last_h;

  // operand 1 is the per-element multiply constant (the averaging weight)
  reg.opt_opd1_const = 1;
  /* HW does not have divide, we need to calculate the value here */
  if (bf16_enable)
    reg.opd1_addr =
      convert_fp32_bf16(
          (float)(convert_bf16_fp32(p->avg_pooling_const) / (p->kh * p->kw)));
  else
    reg.opd1_addr = p->avg_pooling_const;

  // kernel window and strides
  reg.opd1_h = p->kh;
  reg.opd1_w = p->kw;
  reg.opt_opd1_sign = 0;
  reg.opt_opd1_int8 = 1;
  reg.conv_op_x_str = p->stride_w;
  reg.conv_op_y_str = p->stride_h;

  /* [15:0] layer id */
  reg.layer_info = p->layer_id;
  return emit_tiu_cmdbuf(ctx, &reg);
}
|
||||
170
cvikernel/src/bm1880v2/tiu_convolution.c
Normal file
170
cvikernel/src/bm1880v2/tiu_convolution.c
Normal file
@ -0,0 +1,170 @@
|
||||
#include "kernel_1880v2.h"
|
||||
|
||||
typedef bmk1880v2_tiu_convolution_param_t param_t;
|
||||
|
||||
/*
 * Decide whether the hardware double-conv fast path can be used.
 * Requires int8 data (not bf16), an even local-memory ifmap offset, an even
 * channel count of at least 4, and an even weight start address.
 * Returns 1 when all conditions hold, 0 otherwise.
 */
static int can_do_double_conv(ctx_t *ctx, const param_t *p)
{
  uint8_t bf16_enable = (p->ifmap->fmt == FMT_BF16) ? 1 : 0;

  if (bf16_enable)
    return 0;

  int ifmap_addr_even = ((p->ifmap->start_address % ctx->chip_info.lmem_size) % 2 == 0);
  int channels_ok = (p->ifmap->shape.c % 2 == 0) && (p->ifmap->shape.c >= 4);
  int weight_addr_even = (p->weight->start_address % 2 == 0);

  return (ifmap_addr_even && channels_ok && weight_addr_even) ? 1 : 0;
}
|
||||
|
||||
/*
 * Validate a convolution parameter struct before register emission.
 * Checks tensor/stride types, alignment of start addresses to the EU count,
 * shape consistency between ifmap/weight/ofmap, double-conv preconditions,
 * ps32 write-mode restrictions, and the 4-bit hardware field limits on
 * strides, pads and insertions. All failures abort via ASSERT.
 */
static void check_conv_param(ctx_t *ctx, const param_t *p)
{
  uint32_t eu_num = ctx->chip_info.eu_num;
  uint8_t bf16_enable = (p->ifmap->fmt == FMT_BF16) ? 1 : 0;

  check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight);
  if (bf16_enable) {
    assert_bf16_stride_type_0(ctx, p->ifmap);
  } else {
    assert_stride_type_0(ctx, p->ifmap);
  }
  //assert_stride_type_1(ctx, p->weight);
  if (p->bias) {
    check_tiu_tensor(p->bias);
    if (bf16_enable)
      assert_bf16_stride_type_2(ctx, p->bias);
    else
      assert_stride_type_2(ctx, p->bias);
  }

  // n stride must align 16B
  ASSERT((p->ofmap->stride.n % 16) == 0);

  // ifmap/ofmap must start on an EU boundary
  ASSERT(p->ifmap->start_address % eu_num == 0);
  ASSERT(p->ofmap->start_address % eu_num == 0);
  ASSERT(p->ifmap->shape.n == p->ofmap->shape.n);
  // row insertion is meaningless on a single-row ifmap
  ASSERT(!(p->ifmap->shape.h == 1 && p->ins_h > 0));
  // weight layout: n = input channels, c = output channels
  ASSERT(p->weight->shape.n == p->ifmap->shape.c);
  ASSERT(p->weight->shape.c == p->ofmap->shape.c);
  if (can_do_double_conv(ctx, p)) {
    uint32_t lmem_i = p->ifmap->start_address % ctx->chip_info.lmem_size;
    ASSERT(lmem_i % 2 == 0);
    ASSERT(p->ifmap->shape.c % 2 == 0);
    ASSERT(p->ifmap->shape.c >= 4); /* Otherwise performance will suffer */
    ASSERT(p->weight->start_address % 2 == 0);
  }
  // ps32 intermediate-write mode (bit 1): partial sums only, so no
  // activation, bias or shift is permitted
  if(p->ps32_mode & 0x2)
  {
    ASSERT(!p->relu_enable);
    ASSERT(!p->bias);
    ASSERT(!p->rshift_bits);
  }
  // hardware register fields for these are 4 bits wide (< 16)
  ASSERT(p->stride_h < 16);
  ASSERT(p->stride_w < 16);
  ASSERT(p->pad_top < 16);
  ASSERT(p->pad_bottom < 16);
  ASSERT(p->pad_left < 16);
  ASSERT(p->pad_right < 16);
  ASSERT(p->ins_h < 16);
  ASSERT(p->ins_last_h < 16);
  ASSERT(p->ins_w < 16);
  ASSERT(p->ins_last_w < 16);
  ASSERT(p->dilation_h >= 1);
  ASSERT(p->dilation_w >= 1);
}
|
||||
|
||||
/*
 * Emit a TIU convolution command (int8 path, with optional bf16 operand
 * typing and optional packed 16-bit bias).
 *
 * ctx: kernel context carrying chip geometry (npu_num, lmem_size).
 * p:   convolution descriptor; p->bias may be NULL.
 * Returns the op handle produced by emit_tiu_cmdbuf().
 */
bmk1880v2_op_t * bmk1880v2_tiu_convolution(ctx_t *ctx, const param_t *p)
{
  check_conv_param(ctx, p);

  uint32_t npu_num = ctx->chip_info.npu_num;
  int opd0_sign = tensor_is_signed(p->ifmap);
  int opd1_sign = tensor_is_signed(p->weight);
  /* A missing bias is treated as signed. */
  int opd2_sign = p->bias ? tensor_is_signed(p->bias) : 1;
  int arith_shift = opd0_sign || opd1_sign || opd2_sign;
  int bf16_enable = (p->ifmap->fmt == FMT_BF16) ? 1 : 0;

  tiu_reg_t reg;
  reset_tiu_reg(&reg);
  reg.cmd_en = 1;
  reg.tsk_typ = DCR_TYPE_CONV_FIX8B;
  reg.opt_shift_typ = arith_shift;
  reg.opt_right_shift = p->rshift_bits;
  reg.opt_relu = !!(p->relu_enable);
  reg.tsk_opd_num = 2;

  reg.opd_typ = bf16_enable ? 1 : 0;

  /* Always enable double conv automatically whenever the layout allows. */
  if (can_do_double_conv(ctx, p))
    reg.double_conv = 1;

  /* res0: output feature map. */
  reg.res0_addr = p->ofmap->start_address;
  reg.opt_res0_sign = tensor_is_signed(p->ofmap);
  reg.opt_res0_int8 = 1;
  reg.res0_n = p->ofmap->shape.n;
  reg.res0_c = p->ofmap->shape.c;
  reg.res0_h = p->ofmap->shape.h;
  reg.res0_w = p->ofmap->shape.w;
  reg.res0_n_str = p->ofmap->stride.n;
  reg.res0_c_str = p->ofmap->stride.c;
  reg.res0_h_str = p->ofmap->stride.h;
  reg.res0_w_str = p->ofmap->stride.w;
  reg.short_res0_str = 3; // Manual strides instead of h/w defaults
  reg.ps32_md = p->ps32_mode;
  if (p->ps32_mode > 0)
    reg.res0_b_str = p->ofmap->shape.n * p->ofmap->stride.n;

  /* opd0: input feature map, with padding and insertion settings. */
  reg.opd0_addr = p->ifmap->start_address;
  reg.opt_opd0_sign = opd0_sign;
  reg.opt_opd0_int8 = 1;
  reg.opd0_n = p->ifmap->shape.n;
  reg.opd0_c = p->ifmap->shape.c;
  reg.opd0_h = p->ifmap->shape.h;
  reg.opd0_w = p->ifmap->shape.w;
  reg.opd0_ins_val = bf16_enable ? 0 : (uint32_t)p->ins_val;
  reg.opd0_ins_fp = bf16_enable ? (uint32_t)p->ins_fp : 0;
  reg.short_opd0_str = 0;
  reg.conv_opd0_up_pad = p->pad_top;
  reg.conv_opd0_dn_pad = p->pad_bottom;
  reg.conv_opd0_lf_pad = p->pad_left;
  reg.conv_opd0_rt_pad = p->pad_right;
  reg.conv_opd0_x_ins0 = p->ins_w;
  reg.conv_opd0_y_ins0 = p->ins_h;
  reg.conv_opd0_x_ins0_last = p->ins_last_w;
  reg.conv_opd0_y_ins0_last = p->ins_last_h;

  /* opd1: weight. Dilation is encoded as (dilation - 1) inserted zeros. */
  reg.opd1_addr = p->weight->start_address;
  reg.opt_opd1_sign = opd1_sign;
  reg.opt_opd1_int8 = 1;
  reg.opt_opd1_const = p->w_is_const;
  reg.opd1_n = p->weight->shape.n;
  reg.opd1_c = p->weight->shape.c;
  reg.opd1_h = p->weight->shape.h;
  reg.opd1_w = p->weight->shape.w;
  reg.short_opd1_str = 1;
  reg.conv_opd1_x_ins0 = p->dilation_w - 1;
  reg.conv_opd1_y_ins0 = p->dilation_h - 1;
  reg.conv_op_x_str = p->stride_w;
  reg.conv_op_y_str = p->stride_h;

  /* opd2: optional 16-bit bias, supplied as a (2, c, 1, 1) tensor. */
  if (p->bias) {
    ASSERT(p->bias->shape.n == 2);
    ASSERT(p->bias->shape.c == p->ofmap->shape.c);
    ASSERT(p->bias->shape.h == 1);
    ASSERT(p->bias->shape.w == 1);

    reg.tsk_opd_num = 3;
    reg.opt_opd2_sign = opd2_sign;
    reg.opt_opd2_int8 = 0;
    reg.opd2_addr = p->bias->start_address;
    reg.opd2_n = 1;
    reg.opd2_c = p->bias->shape.c;
    reg.opd2_h = 1;
    reg.opd2_w = 1;
    reg.short_opd2_str = 2;
    reg.opd2_b_str = ceiling_func(p->bias->shape.c, npu_num) * (bf16_enable ? 2 : 1);
  }

  /* [15:0] layer id */
  reg.layer_info = p->layer_id;
  return emit_tiu_cmdbuf(ctx, &reg);
}
|
||||
160
cvikernel/src/bm1880v2/tiu_convolution_qdm.c
Normal file
160
cvikernel/src/bm1880v2/tiu_convolution_qdm.c
Normal file
@ -0,0 +1,160 @@
|
||||
#include "kernel_1880v2.h"
|
||||
|
||||
typedef bmk1880v2_tiu_convolution_qdm_param_t param_t;
|
||||
|
||||
/*
 * Return 1 when the operands are laid out so the hardware double-conv
 * mode can be used, 0 otherwise. All four conditions must hold:
 * even local-memory offset and even weight address, an even input
 * channel count, and at least 4 input channels.
 */
static int can_do_double_conv(ctx_t *ctx, const param_t *p)
{
  int lmem_offset_even = ((p->ifmap->start_address % ctx->chip_info.lmem_size) % 2 == 0);
  int channels_even = (p->ifmap->shape.c % 2 == 0);
  int enough_channels = (p->ifmap->shape.c >= 4);
  int weight_addr_even = (p->weight->start_address % 2 == 0);

  if (lmem_offset_even && channels_even && enough_channels && weight_addr_even)
    return 1;

  return 0;
}
|
||||
|
||||
/*
 * Validate a per-channel-quantized convolution descriptor before any
 * register is written. Checks alignment against the EU count, shape
 * consistency between ifmap/ofmap/weight, the optional chl_quan_param
 * tensor, double-conv layout preconditions, ps32 restrictions, and the
 * hardware encoding limits on strides/padding/insertion (all < 16).
 */
static void check_conv_param(ctx_t *ctx, const param_t *p)
{
  uint32_t eu_num = ctx->chip_info.eu_num;

  check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight);
  assert_stride_type_0(ctx, p->ifmap);

  /* Alignment and shape consistency. */
  ASSERT((p->ofmap->stride.n % eu_num) == 0);
  ASSERT(p->ifmap->start_address % eu_num == 0);
  ASSERT(p->ofmap->start_address % eu_num == 0);
  ASSERT(p->ifmap->shape.n == p->ofmap->shape.n);
  ASSERT(!(p->ifmap->shape.h == 1 && p->ins_h > 0));
  ASSERT(p->weight->shape.n == p->ifmap->shape.c);
  ASSERT(p->weight->shape.c == p->ofmap->shape.c);

  /* Optional per-channel quantization parameter tensor. */
  if (p->chl_quan_param) {
    check_tiu_tensor(p->chl_quan_param);
    assert_stride_type_2(ctx, p->chl_quan_param);
    ASSERT(p->chl_quan_param->start_address % eu_num == 0);
  }

  /* Re-check double-conv layout preconditions explicitly. */
  if (can_do_double_conv(ctx, p)) {
    uint32_t lmem_i = p->ifmap->start_address % ctx->chip_info.lmem_size;
    ASSERT(lmem_i % 2 == 0);
    ASSERT(p->ifmap->shape.c % 2 == 0);
    ASSERT(p->ifmap->shape.c >= 4); /* Otherwise performance will suffer */
    ASSERT(p->weight->start_address % 2 == 0);
  }

  /* ps32 write phase (bit 1) forbids relu and bias. */
  if (p->ps32_mode & 0x2)
  {
    ASSERT(!p->relu_enable);
    ASSERT(!p->has_bias);
  }

  /* Hardware encoding limits (values checked here as < 16). */
  ASSERT(p->stride_h < 16);
  ASSERT(p->stride_w < 16);
  ASSERT(p->pad_top < 16);
  ASSERT(p->pad_bottom < 16);
  ASSERT(p->pad_left < 16);
  ASSERT(p->pad_right < 16);
  ASSERT(p->ins_h < 16);
  ASSERT(p->ins_last_h < 16);
  ASSERT(p->ins_w < 16);
  ASSERT(p->ins_last_w < 16);
  ASSERT(p->dilation_h >= 1);
  ASSERT(p->dilation_w >= 1);
}
|
||||
|
||||
/*
 * Emit a TIU convolution command using per-channel quantization
 * (quantize-down-multiplier). Scaling comes from p->chl_quan_param
 * rather than a global right shift, so opt_right_shift is forced to 0
 * on the paths that would otherwise use it.
 */
bmk1880v2_op_t * bmk1880v2_tiu_convolution_qdm(ctx_t *ctx, const param_t *p)
{
  check_conv_param(ctx, p);

  int opd0_sign = tensor_is_signed(p->ifmap);
  int opd1_sign = tensor_is_signed(p->weight);
  int arith_shift = opd0_sign || opd1_sign;

  tiu_reg_t reg;
  reset_tiu_reg(&reg);
  reg.cmd_en = 1;
  reg.tsk_typ = DCR_TYPE_CONV_FIX8B;
  reg.opt_shift_typ = arith_shift;
  reg.opt_relu = !!(p->relu_enable);
  reg.tsk_opd_num = 2;

  /* Always enable double conv automatically whenever the layout allows. */
  if (can_do_double_conv(ctx, p))
    reg.double_conv = 1;

  /* res0: output feature map. */
  reg.res0_addr = p->ofmap->start_address;
  reg.opt_res0_sign = tensor_is_signed(p->ofmap);
  reg.opt_res0_int8 = 1;
  reg.res0_n = p->ofmap->shape.n;
  reg.res0_c = p->ofmap->shape.c;
  reg.res0_h = p->ofmap->shape.h;
  reg.res0_w = p->ofmap->shape.w;
  reg.res0_n_str = p->ofmap->stride.n;
  reg.res0_c_str = p->ofmap->stride.c;
  reg.res0_h_str = p->ofmap->stride.h;
  reg.res0_w_str = p->ofmap->stride.w;
  reg.short_res0_str = 3; // Manual strides instead of h/w defaults
  reg.ps32_md = p->ps32_mode;
  if (p->ps32_mode > 0) {
    reg.res0_b_str = p->ofmap->shape.n * p->ofmap->stride.n;

    // Per-channel parameter does not have a right shift (default is 10).
    // Set zero.
    reg.opt_right_shift = 0;
  }

  /* opd0: input feature map. */
  reg.opd0_addr = p->ifmap->start_address;
  reg.opt_opd0_sign = opd0_sign;
  reg.opt_opd0_int8 = 1;
  reg.opd0_n = p->ifmap->shape.n;
  reg.opd0_c = p->ifmap->shape.c;
  reg.opd0_h = p->ifmap->shape.h;
  reg.opd0_w = p->ifmap->shape.w;
  reg.opd0_ins_val = (uint32_t)p->ins_val;
  reg.short_opd0_str = 0;
  reg.conv_opd0_up_pad = p->pad_top;
  reg.conv_opd0_dn_pad = p->pad_bottom;
  reg.conv_opd0_lf_pad = p->pad_left;
  reg.conv_opd0_rt_pad = p->pad_right;
  reg.conv_opd0_x_ins0 = p->ins_w;
  reg.conv_opd0_y_ins0 = p->ins_h;
  reg.conv_opd0_x_ins0_last = p->ins_last_w;
  reg.conv_opd0_y_ins0_last = p->ins_last_h;

  /* opd1: weight. Dilation is encoded as (dilation - 1) inserted zeros. */
  reg.opd1_addr = p->weight->start_address;
  reg.opt_opd1_sign = opd1_sign;
  reg.opt_opd1_int8 = 1;
  reg.opt_opd1_const = p->w_is_const;
  reg.opd1_n = p->weight->shape.n;
  reg.opd1_c = p->weight->shape.c;
  reg.opd1_h = p->weight->shape.h;
  reg.opd1_w = p->weight->shape.w;
  reg.short_opd1_str = 1;
  reg.conv_opd1_x_ins0 = p->dilation_w - 1;
  reg.conv_opd1_y_ins0 = p->dilation_h - 1;
  reg.conv_op_x_str = p->stride_w;
  reg.conv_op_y_str = p->stride_h;

  /* opd2: per-channel quantization parameters, shape (1, oc, 1, 1). */
  if (p->chl_quan_param) {
    ASSERT(p->chl_quan_param->shape.n == 1);
    ASSERT(p->chl_quan_param->shape.c == p->ofmap->shape.c);
    ASSERT(p->chl_quan_param->shape.h == 1);
    ASSERT(p->chl_quan_param->shape.w == 1);
    reg.opt_chl_quan = 1;
    reg.opt_right_shift = 0; // useless
    reg.opd2_addr = p->chl_quan_param->start_address;
    reg.opd2_n = p->chl_quan_param->shape.n;
    reg.opd2_c = p->chl_quan_param->shape.c;
    reg.opd2_h = p->chl_quan_param->shape.h;
    reg.opd2_w = p->chl_quan_param->shape.w;
  }
  reg.opt_opd2_int8 = 1; // useless, force to 1 to skip b_stride check
  reg.short_opd2_str = 2; // useless
  reg.opd2_b_str = 0; // useless

  /* Bias is folded into the per-channel parameter blob. */
  if (p->has_bias) {
    reg.tsk_opd_num = 3;
    reg.opt_opd2_sign = 1;
  }

  /* [15:0] layer id */
  reg.layer_info = p->layer_id;

  return emit_tiu_cmdbuf(ctx, &reg);
}
|
||||
148
cvikernel/src/bm1880v2/tiu_depthwise_convolution.c
Normal file
148
cvikernel/src/bm1880v2/tiu_depthwise_convolution.c
Normal file
@ -0,0 +1,148 @@
|
||||
#include "kernel_1880v2.h"
|
||||
|
||||
bmk1880v2_op_t * bmk1880v2_tiu_depthwise_convolution(
|
||||
ctx_t *ctx,
|
||||
const bmk1880v2_tiu_depthwise_convolution_param_t *p)
|
||||
{
|
||||
int bf16_enable = (p->ifmap->fmt == FMT_BF16) ? 1 : 0;
|
||||
bool isMulConst = (p->weight_is_const == 1) ? 1 : 0;
|
||||
|
||||
if(isMulConst) {
|
||||
check_tiu_tensor_2(p->ifmap, p->ofmap);
|
||||
} else {
|
||||
check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight);
|
||||
}
|
||||
if (bf16_enable) {
|
||||
assert_bf16_stride_type_0(ctx, p->ifmap);
|
||||
if(!isMulConst)
|
||||
assert_bf16_stride_type_0(ctx, p->weight);
|
||||
if (p->bias) {
|
||||
check_tiu_tensor(p->bias);
|
||||
assert_bf16_stride_type_2(ctx, p->bias);
|
||||
}
|
||||
} else {
|
||||
assert_stride_type_0(ctx, p->ifmap);
|
||||
if(!isMulConst)
|
||||
assert_stride_type_0(ctx, p->weight);
|
||||
if (p->bias) {
|
||||
check_tiu_tensor(p->bias);
|
||||
assert_stride_type_2(ctx, p->bias);
|
||||
}
|
||||
}
|
||||
|
||||
// n stride must align 16B
|
||||
ASSERT((p->ofmap->stride.n % 16) == 0);
|
||||
|
||||
// Support fp32 result in bf16
|
||||
uint32_t res0_n = p->ofmap->shape.n;
|
||||
int ps32_mode = 0;
|
||||
if (bf16_enable && (p->ifmap->shape.n != p->ofmap->shape.n)) {
|
||||
ASSERT((2 * p->ifmap->shape.n) == p->ofmap->shape.n);
|
||||
ASSERT(p->ps32_mode == 2); // bit[1]: write
|
||||
ps32_mode = 2;
|
||||
res0_n = p->ifmap->shape.n;
|
||||
} else {
|
||||
ASSERT(p->ifmap->shape.n == p->ofmap->shape.n);
|
||||
}
|
||||
|
||||
ASSERT(p->ifmap->shape.c == p->ofmap->shape.c);
|
||||
if(!isMulConst){
|
||||
ASSERT(p->ifmap->shape.c == p->weight->shape.c);
|
||||
ASSERT(p->weight->shape.n == 1);
|
||||
}
|
||||
|
||||
int opd0_sign = tensor_is_signed(p->ifmap);
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B;
|
||||
reg.tsk_eu_typ = 2;
|
||||
reg.opt_relu = p->relu_enable;
|
||||
reg.opt_shift_typ = 1;
|
||||
reg.opt_right_shift = p->rshift_bits;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opd_typ = bf16_enable ? 1: 0;
|
||||
|
||||
int res0_sign = tensor_is_signed(p->ofmap);
|
||||
reg.res0_addr = p->ofmap->start_address;
|
||||
reg.opt_res0_sign = res0_sign;
|
||||
reg.opt_res0_int8 = 1;
|
||||
reg.res0_n = res0_n;
|
||||
reg.res0_c = p->ofmap->shape.c;
|
||||
reg.res0_h = p->ofmap->shape.h;
|
||||
reg.res0_w = p->ofmap->shape.w;
|
||||
reg.res0_n_str = p->ofmap->stride.n;
|
||||
reg.res0_c_str = p->ofmap->stride.c;
|
||||
reg.res0_h_str = p->ofmap->stride.h;
|
||||
reg.res0_w_str = p->ofmap->stride.w;
|
||||
reg.res0_b_str = reg.res0_n_str * reg.res0_n;
|
||||
reg.short_res0_str = 3; // Manual instead of h/w
|
||||
reg.ps32_md = ps32_mode;
|
||||
|
||||
reg.opd0_addr = p->ifmap->start_address;
|
||||
reg.opt_opd0_sign = opd0_sign;
|
||||
reg.opt_opd0_int8 = 1;
|
||||
reg.opd0_n = p->ifmap->shape.n;
|
||||
reg.opd0_c = p->ifmap->shape.c;
|
||||
reg.opd0_h = p->ifmap->shape.h;
|
||||
reg.opd0_w = p->ifmap->shape.w;
|
||||
reg.opd0_n_str = p->ifmap->stride.n;
|
||||
reg.opd0_c_str = p->ifmap->stride.c;
|
||||
reg.opd0_h_str = p->ifmap->stride.h;
|
||||
reg.opd0_w_str = p->ifmap->stride.w;
|
||||
reg.opd0_ins_val = bf16_enable ? 0 : (uint32_t)p->ins_val;
|
||||
reg.opd0_ins_fp = bf16_enable ? (uint32_t)p->ins_fp : 0;
|
||||
reg.short_opd0_str = 3; // Manual instead of h/w
|
||||
reg.conv_opd0_up_pad = p->pad_top;
|
||||
reg.conv_opd0_dn_pad = p->pad_bottom;
|
||||
reg.conv_opd0_lf_pad = p->pad_left;
|
||||
reg.conv_opd0_rt_pad = p->pad_right;
|
||||
reg.conv_opd0_x_ins0 = p->ins_w;
|
||||
reg.conv_opd0_y_ins0 = p->ins_h;
|
||||
reg.conv_opd0_x_ins0_last = p->ins_last_w;
|
||||
reg.conv_opd0_y_ins0_last = p->ins_last_h;
|
||||
|
||||
reg.opt_opd1_sign = 1;
|
||||
reg.opt_opd1_int8 = 1;
|
||||
reg.conv_opd1_x_ins0 = p->dilation_w - 1;
|
||||
reg.conv_opd1_y_ins0 = p->dilation_h - 1;
|
||||
if (isMulConst) {
|
||||
reg.opt_opd1_const = 1;
|
||||
reg.opt_opd1_sign = p->weight_const.is_signed;
|
||||
reg.opd1_addr = p->weight_const.val;
|
||||
reg.opd1_n = p->weight->shape.n;
|
||||
reg.opd1_c = p->weight->shape.c;
|
||||
reg.opd1_h = p->weight->shape.h;
|
||||
reg.opd1_w = p->weight->shape.w;
|
||||
} else {
|
||||
reg.opd1_addr = p->weight->start_address;
|
||||
reg.opd1_n = p->weight->shape.n;
|
||||
reg.opd1_c = p->weight->shape.c;
|
||||
reg.opd1_h = p->weight->shape.h;
|
||||
reg.opd1_w = p->weight->shape.w;
|
||||
}
|
||||
reg.conv_op_x_str = p->stride_w;
|
||||
reg.conv_op_y_str = p->stride_h;
|
||||
|
||||
if (p->bias) {
|
||||
ASSERT(p->bias->shape.n == 2);
|
||||
ASSERT(p->bias->shape.c == p->ofmap->shape.c);
|
||||
ASSERT(p->bias->shape.h == 1);
|
||||
ASSERT(p->bias->shape.w == 1);
|
||||
|
||||
reg.tsk_opd_num = 3;
|
||||
reg.opd2_addr = p->bias->start_address;
|
||||
reg.opt_opd2_int8 = 0;
|
||||
reg.opd2_n = 1;
|
||||
reg.opd2_c = p->bias->shape.c;
|
||||
reg.opd2_h = 1;
|
||||
reg.opd2_w = 1;
|
||||
reg.short_opd2_str = 2;
|
||||
reg.opd2_b_str = p->bias->stride.n;
|
||||
}
|
||||
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
122
cvikernel/src/bm1880v2/tiu_depthwise_convolution_qdm.c
Normal file
122
cvikernel/src/bm1880v2/tiu_depthwise_convolution_qdm.c
Normal file
@ -0,0 +1,122 @@
|
||||
#include "kernel_1880v2.h"
|
||||
|
||||
bmk1880v2_op_t * bmk1880v2_tiu_depthwise_convolution_qdm(
|
||||
ctx_t *ctx,
|
||||
const bmk1880v2_tiu_depthwise_convolution_qdm_param_t *p)
|
||||
{
|
||||
uint32_t eu_num = ctx->chip_info.eu_num;
|
||||
bool isMulConst = (p->weight_is_const == 1) ? 1 : 0;
|
||||
|
||||
if(isMulConst) {
|
||||
check_tiu_tensor_2(p->ifmap, p->ofmap);
|
||||
} else {
|
||||
check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight);
|
||||
}
|
||||
assert_stride_type_0(ctx, p->ifmap);
|
||||
if(!isMulConst)
|
||||
assert_stride_type_0(ctx, p->weight);
|
||||
check_tiu_tensor(p->chl_quan_param);
|
||||
assert_stride_type_2(ctx, p->chl_quan_param);
|
||||
|
||||
ASSERT((p->ofmap->stride.n % eu_num) == 0);
|
||||
ASSERT(p->chl_quan_param->start_address % eu_num == 0);
|
||||
ASSERT(p->ifmap->shape.n == p->ofmap->shape.n);
|
||||
ASSERT(p->ifmap->shape.c == p->ofmap->shape.c);
|
||||
if(!isMulConst){
|
||||
ASSERT(p->ifmap->shape.c == p->weight->shape.c);
|
||||
ASSERT(p->weight->shape.n == 1);
|
||||
}
|
||||
|
||||
int opd0_sign = tensor_is_signed(p->ifmap);
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B;
|
||||
reg.tsk_eu_typ = 2;
|
||||
reg.opt_relu = p->relu_enable;
|
||||
reg.opt_shift_typ = 1;
|
||||
reg.tsk_opd_num = 2;
|
||||
|
||||
int res0_sign = tensor_is_signed(p->ofmap);
|
||||
reg.res0_addr = p->ofmap->start_address;
|
||||
reg.opt_res0_sign = res0_sign;
|
||||
reg.opt_res0_int8 = 1;
|
||||
reg.res0_n = p->ofmap->shape.n;
|
||||
reg.res0_c = p->ofmap->shape.c;
|
||||
reg.res0_h = p->ofmap->shape.h;
|
||||
reg.res0_w = p->ofmap->shape.w;
|
||||
reg.res0_n_str = p->ofmap->stride.n;
|
||||
reg.res0_c_str = p->ofmap->stride.c;
|
||||
reg.res0_h_str = p->ofmap->stride.h;
|
||||
reg.res0_w_str = p->ofmap->stride.w;
|
||||
reg.short_res0_str = 3; // Manual instead of h/w
|
||||
|
||||
reg.opd0_addr = p->ifmap->start_address;
|
||||
reg.opt_opd0_sign = opd0_sign;
|
||||
reg.opt_opd0_int8 = 1;
|
||||
reg.opd0_n = p->ifmap->shape.n;
|
||||
reg.opd0_c = p->ifmap->shape.c;
|
||||
reg.opd0_h = p->ifmap->shape.h;
|
||||
reg.opd0_w = p->ifmap->shape.w;
|
||||
reg.opd0_ins_val = (uint32_t)p->ins_val;
|
||||
reg.opd0_n_str = p->ifmap->stride.n;
|
||||
reg.opd0_c_str = p->ifmap->stride.c;
|
||||
reg.opd0_h_str = p->ifmap->stride.h;
|
||||
reg.opd0_w_str = p->ifmap->stride.w;
|
||||
reg.short_opd0_str = 3; // Manual instead of h/w
|
||||
reg.conv_opd0_up_pad = p->pad_top;
|
||||
reg.conv_opd0_dn_pad = p->pad_bottom;
|
||||
reg.conv_opd0_lf_pad = p->pad_left;
|
||||
reg.conv_opd0_rt_pad = p->pad_right;
|
||||
reg.conv_opd0_x_ins0 = p->ins_w;
|
||||
reg.conv_opd0_y_ins0 = p->ins_h;
|
||||
reg.conv_opd0_x_ins0_last = p->ins_last_w;
|
||||
reg.conv_opd0_y_ins0_last = p->ins_last_h;
|
||||
|
||||
reg.opt_opd1_sign = 1;
|
||||
reg.opt_opd1_int8 = 1;
|
||||
reg.conv_opd1_x_ins0 = p->dilation_w - 1;
|
||||
reg.conv_opd1_y_ins0 = p->dilation_h - 1;
|
||||
if (isMulConst) {
|
||||
reg.opt_opd1_const = 1;
|
||||
reg.opt_opd1_sign = p->weight_const.is_signed;
|
||||
reg.opd1_addr = p->weight_const.val;
|
||||
reg.opd1_n = p->weight->shape.n;
|
||||
reg.opd1_c = p->weight->shape.c;
|
||||
reg.opd1_h = p->weight->shape.h;
|
||||
reg.opd1_w = p->weight->shape.w;
|
||||
} else {
|
||||
reg.opd1_addr = p->weight->start_address;
|
||||
reg.opd1_n = p->weight->shape.n;
|
||||
reg.opd1_c = p->weight->shape.c;
|
||||
reg.opd1_h = p->weight->shape.h;
|
||||
reg.opd1_w = p->weight->shape.w;
|
||||
}
|
||||
reg.conv_op_x_str = p->stride_w;
|
||||
reg.conv_op_y_str = p->stride_h;
|
||||
|
||||
ASSERT(p->chl_quan_param->shape.n == 1);
|
||||
ASSERT(p->chl_quan_param->shape.c == p->ofmap->shape.c);
|
||||
ASSERT(p->chl_quan_param->shape.h == 1);
|
||||
ASSERT(p->chl_quan_param->shape.w == 1);
|
||||
reg.opt_chl_quan = 1;
|
||||
reg.opt_right_shift = 0; // useless
|
||||
reg.opd2_addr = p->chl_quan_param->start_address;
|
||||
reg.opd2_n = p->chl_quan_param->shape.n;
|
||||
reg.opd2_c = p->chl_quan_param->shape.c;
|
||||
reg.opd2_h = p->chl_quan_param->shape.h;
|
||||
reg.opd2_w = p->chl_quan_param->shape.w;
|
||||
reg.opt_opd2_int8 = 1; // useless, force to 1 to skip b_stride check
|
||||
reg.short_opd2_str = 2; // useless
|
||||
reg.opd2_b_str = 0; // useless
|
||||
|
||||
if (p->has_bias) {
|
||||
reg.tsk_opd_num = 3;
|
||||
reg.opt_opd2_sign = 1;
|
||||
}
|
||||
|
||||
reg.layer_info = p->layer_id;
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
79
cvikernel/src/bm1880v2/tiu_element_wise_add.c
Normal file
79
cvikernel/src/bm1880v2/tiu_element_wise_add.c
Normal file
@ -0,0 +1,79 @@
|
||||
#include "kernel_1880v2.h"
|
||||
|
||||
bmk1880v2_op_t * bmk1880v2_tiu_element_wise_add(
|
||||
ctx_t *k,
|
||||
const bmk1880v2_tiu_element_wise_add_param_t *p)
|
||||
{
|
||||
int bf16_enable = (p->a_low->fmt == FMT_BF16) ? 1 : 0;
|
||||
|
||||
if (bf16_enable) {
|
||||
/*bf16 only support 16 bit*/
|
||||
ASSERT(!p->a_high);
|
||||
ASSERT(!(p->b_high && !p->b_is_const));
|
||||
ASSERT(!p->res_high);
|
||||
check_tiu_tensor(p->a_low);
|
||||
check_tiu_tensor(p->res_low);
|
||||
assert_same_shape(p->res_low, p->a_low);
|
||||
if (!p->b_is_const) {
|
||||
check_tiu_tensor(p->b_low);
|
||||
assert_same_shape(p->res_low, p->b_low);
|
||||
}
|
||||
} else {
|
||||
check_16bit_tiu_tensor(p->a_low, p->a_high);
|
||||
check_tiu_tensor(p->res_low);
|
||||
assert_same_shape(p->res_low, p->a_low);
|
||||
if (!p->b_is_const) {
|
||||
check_16bit_tiu_tensor(p->b_low, p->b_high);
|
||||
assert_same_shape(p->res_low, p->b_low);
|
||||
}
|
||||
}
|
||||
if (p->res_high)
|
||||
check_16bit_tiu_tensor(p->res_low, p->res_high);
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_ADD_FIX8B;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opd_typ = bf16_enable ? 1: 0;
|
||||
reg.opt_right_shift = 0;
|
||||
reg.opt_relu = p->relu_enable;
|
||||
fill_same_tensor_shape(®, p->a_low->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
int arith_shift = tensor_is_signed(p->res_low);
|
||||
reg.opt_shift_typ = arith_shift;
|
||||
reg.opt_right_shift = p->rshift_bits;
|
||||
|
||||
reg.opd0_addr = p->a_low->start_address;
|
||||
reg.opt_opd0_sign = tensor_is_signed(p->a_low);
|
||||
reg.opt_opd0_int8 = (p->a_high == NULL);
|
||||
reg.opd0_b_str = bf16_enable ? 0 : (p->a_high->start_address - p->a_low->start_address);
|
||||
fill_opd0_stride(®, &p->a_low->stride);
|
||||
|
||||
reg.opt_opd1_int8 = bf16_enable ? 1 : 0; //(p->b_high == NULL); b_high is the same as b_val
|
||||
if (p->b_is_const) {
|
||||
reg.opt_opd1_const = 1;
|
||||
reg.opt_opd1_sign = !!p->b_const.is_signed;
|
||||
reg.opd1_addr = p->b_const.val;
|
||||
} else {
|
||||
reg.opt_opd1_const = 0;
|
||||
reg.opt_opd1_sign = tensor_is_signed(p->b_low);
|
||||
reg.opd1_addr = p->b_low->start_address;
|
||||
reg.opd1_b_str = bf16_enable ? 0 : (p->b_high->start_address - p->b_low->start_address);
|
||||
fill_opd1_stride(®, &p->b_low->stride);
|
||||
}
|
||||
|
||||
reg.res0_addr = p->res_low->start_address;
|
||||
reg.opt_res0_sign = tensor_is_signed(p->res_low);
|
||||
reg.opt_res0_int8 = (p->res_high == NULL);
|
||||
fill_res0_stride(®, &p->res_low->stride);
|
||||
if (p->res_high)
|
||||
reg.res0_b_str = p->res_high->start_address - p->res_low->start_address;
|
||||
if (p->relu_enable)
|
||||
ASSERT(reg.opt_res0_int8);
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
return emit_tiu_cmdbuf(k, ®);
|
||||
}
|
||||
100
cvikernel/src/bm1880v2/tiu_element_wise_and.c
Normal file
100
cvikernel/src/bm1880v2/tiu_element_wise_and.c
Normal file
@ -0,0 +1,100 @@
|
||||
#include "kernel_1880v2.h"
|
||||
|
||||
bmk1880v2_op_t * bmk1880v2_tiu_element_wise_and_int8(
|
||||
ctx_t *ctx,
|
||||
const bmk1880v2_tiu_element_wise_and_int8_param_t *p)
|
||||
{
|
||||
check_tiu_tensor_3(p->res, p->a, p->b);
|
||||
assert_same_shape_3(p->res, p->a, p->b);
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_AND_FIX8B;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opt_right_shift = 0;
|
||||
reg.opt_shift_typ = 0;
|
||||
reg.opt_relu = 0;
|
||||
fill_same_tensor_shape(®, p->a->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
reg.opd0_addr = p->a->start_address;
|
||||
reg.opt_opd0_sign = 0;
|
||||
reg.opt_opd0_int8 = 1;
|
||||
fill_opd0_stride(®, &p->a->stride);
|
||||
|
||||
reg.opd1_addr = p->b->start_address;
|
||||
reg.opt_opd1_sign = 0;
|
||||
reg.opt_opd1_int8 = 1;
|
||||
fill_opd1_stride(®, &p->b->stride);
|
||||
|
||||
reg.res0_addr = p->res->start_address;
|
||||
reg.opt_res0_sign = 0;
|
||||
reg.opt_res0_int8 = 1;
|
||||
fill_res0_stride(®, &p->res->stride);
|
||||
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
|
||||
bmk1880v2_op_t * bmk1880v2_tiu_element_wise_and_int16(
|
||||
ctx_t *ctx,
|
||||
const bmk1880v2_tiu_element_wise_and_int16_param_t *p)
|
||||
{
|
||||
check_16bit_tiu_tensor(p->a_low, p->a_high);
|
||||
check_16bit_tiu_tensor(p->b_low, p->b_high);
|
||||
check_16bit_tiu_tensor(p->res_low, p->res_high);
|
||||
assert_same_shape_3(p->res_low, p->a_low, p->b_low);
|
||||
|
||||
int res_high_addr = p->res_high->start_address;
|
||||
int res_low_addr = p->res_low->start_address;
|
||||
ASSERT(res_high_addr > res_low_addr);
|
||||
int res_b_stride = res_high_addr - res_low_addr;
|
||||
|
||||
int a_high_addr = p->a_high->start_address;
|
||||
int a_low_addr = p->a_low->start_address;
|
||||
ASSERT(a_high_addr > a_low_addr);
|
||||
int a_b_stride = a_high_addr - a_low_addr;
|
||||
|
||||
int b_high_addr = p->b_high->start_address;
|
||||
int b_low_addr = p->b_low->start_address;
|
||||
ASSERT(b_high_addr > b_low_addr);
|
||||
int b_b_stride = b_high_addr - b_low_addr;
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_AND_FIX8B;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opt_right_shift = 0;
|
||||
reg.opt_shift_typ = 0;
|
||||
reg.opt_relu = 0;
|
||||
fill_same_tensor_shape(®, p->a_low->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
reg.opd0_addr = a_low_addr;
|
||||
reg.opt_opd0_sign = 0;
|
||||
reg.opt_opd0_int8 = 0;
|
||||
reg.opd0_b_str = a_b_stride;
|
||||
fill_opd0_stride(®, &p->a_low->stride);
|
||||
|
||||
reg.opd1_addr = b_low_addr;
|
||||
reg.opt_opd1_sign = 0;
|
||||
reg.opt_opd1_int8 = 0;
|
||||
reg.opd1_b_str = b_b_stride;
|
||||
fill_opd1_stride(®, &p->b_low->stride);
|
||||
|
||||
reg.res0_addr = res_low_addr;
|
||||
reg.opt_res0_sign = 0;
|
||||
reg.opt_res0_int8 = 0;
|
||||
reg.res0_b_str = res_b_stride;
|
||||
fill_res0_stride(®, &p->res_low->stride);
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
42
cvikernel/src/bm1880v2/tiu_element_wise_copy.c
Normal file
42
cvikernel/src/bm1880v2/tiu_element_wise_copy.c
Normal file
@ -0,0 +1,42 @@
|
||||
#include "kernel_1880v2.h"
|
||||
|
||||
bmk1880v2_op_t * bmk1880v2_tiu_element_wise_copy(
|
||||
ctx_t *ctx,
|
||||
const bmk1880v2_tiu_element_wise_copy_param_t *p)
|
||||
{
|
||||
int bf16_enable = (p->src->fmt == FMT_BF16) ? 1 : 0;
|
||||
|
||||
check_tiu_tensor_2(p->dst, p->src);
|
||||
assert_same_shape(p->dst, p->src);
|
||||
assert_stride_range(p->dst->stride);
|
||||
assert_stride_range(p->src->stride);
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_COPY_FIX8B;
|
||||
reg.tsk_opd_num = 1;
|
||||
reg.opd_typ = bf16_enable ? 1: 0;
|
||||
reg.opt_right_shift = 0;
|
||||
reg.opt_shift_typ = 0;
|
||||
reg.opt_relu = 0;
|
||||
fill_same_tensor_shape(®, p->dst->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
reg.opd0_addr = p->src->start_address;
|
||||
reg.opt_opd0_sign = 0;
|
||||
reg.opt_opd0_int8 = 1;
|
||||
fill_opd0_stride(®, &p->src->stride);
|
||||
|
||||
reg.res0_addr = p->dst->start_address;
|
||||
reg.opt_res0_sign = 0;
|
||||
reg.opt_res0_int8 = 1;
|
||||
fill_res0_stride(®, &p->dst->stride);
|
||||
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
67
cvikernel/src/bm1880v2/tiu_element_wise_mac.c
Normal file
67
cvikernel/src/bm1880v2/tiu_element_wise_mac.c
Normal file
@ -0,0 +1,67 @@
|
||||
#include "kernel_1880v2.h"
|
||||
|
||||
bmk1880v2_op_t * bmk1880v2_tiu_element_wise_mac(
|
||||
ctx_t *ctx,
|
||||
const bmk1880v2_tiu_element_wise_mac_param_t *p)
|
||||
{
|
||||
int bf16_enable = (p->a->fmt == FMT_BF16) ? 1 : 0;
|
||||
|
||||
check_tiu_tensor(p->a);
|
||||
assert_same_shape(p->res_low, p->a);
|
||||
if (!bf16_enable) {
|
||||
check_16bit_tiu_tensor(p->res_low, p->res_high);
|
||||
ASSERT(p->lshift_bits < 32);
|
||||
ASSERT(p->rshift_bits < 16);
|
||||
}
|
||||
if (!p->b_is_const) {
|
||||
check_tiu_tensor(p->b);
|
||||
assert_same_shape(p->res_low, p->b);
|
||||
}
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_MAC_FIX8B;
|
||||
reg.opt_res_add = 1;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opd_typ = bf16_enable ? 1: 0;
|
||||
reg.opt_left_shift = p->lshift_bits;
|
||||
reg.opt_relu = p->relu_enable;
|
||||
fill_same_tensor_shape(®, p->a->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
int arith_shift = tensor_is_signed(p->res_low);
|
||||
reg.opt_shift_typ = arith_shift;
|
||||
reg.opt_right_shift = p->rshift_bits;
|
||||
|
||||
reg.opd0_addr = p->a->start_address;
|
||||
reg.opt_opd0_sign = tensor_is_signed(p->a);
|
||||
fill_opd0_stride(®, &p->a->stride);
|
||||
|
||||
if (p->b_is_const) {
|
||||
reg.opt_opd1_const = 1;
|
||||
reg.opd1_addr = bf16_enable ? p->b_const.val : (p->b_const.val & 0xFF);
|
||||
reg.opt_opd1_sign = !!p->b_const.is_signed;
|
||||
} else {
|
||||
reg.opt_opd1_const = 0;
|
||||
reg.opd1_addr = p->b->start_address;
|
||||
reg.opt_opd1_sign = tensor_is_signed(p->b);
|
||||
fill_opd1_stride(®, &p->b->stride);
|
||||
}
|
||||
|
||||
reg.res0_addr = p->res_low->start_address;
|
||||
reg.opt_res0_sign = tensor_is_signed(p->res_low);
|
||||
reg.opt_res0_int8 = bf16_enable ? 1 : !!p->res_is_int8;
|
||||
fill_res0_stride(®, &p->res_low->stride);
|
||||
reg.res0_b_str = bf16_enable ? 0 : (p->res_high->start_address - p->res_low->start_address);
|
||||
|
||||
if (p->relu_enable)
|
||||
ASSERT(reg.opt_res0_int8);
|
||||
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
56
cvikernel/src/bm1880v2/tiu_element_wise_max.c
Normal file
56
cvikernel/src/bm1880v2/tiu_element_wise_max.c
Normal file
@ -0,0 +1,56 @@
|
||||
#include "kernel_1880v2.h"
|
||||
|
||||
bmk1880v2_op_t * bmk1880v2_tiu_element_wise_max(
|
||||
ctx_t *ctx,
|
||||
const bmk1880v2_tiu_element_wise_max_param_t *p)
|
||||
{
|
||||
int bf16_enable = (p->a->fmt == FMT_BF16) ? 1 : 0;
|
||||
|
||||
check_tiu_tensor_2(p->max, p->a);
|
||||
assert_same_shape(p->max, p->a);
|
||||
|
||||
if (p->b_is_const && !bf16_enable) {
|
||||
if (tensor_is_signed(p->a))
|
||||
ASSERT(p->b_const.is_signed);
|
||||
else
|
||||
ASSERT(!p->b_const.is_signed);
|
||||
} else if (!p->b_is_const) {
|
||||
check_tiu_tensor(p->b);
|
||||
assert_same_shape(p->max, p->b);
|
||||
ASSERT(p->a->fmt == p->b->fmt);
|
||||
}
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_MAX_FIX8B;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opd_typ = bf16_enable ? 1: 0;
|
||||
reg.opt_right_shift = 0;
|
||||
reg.opt_relu = 0;
|
||||
fill_same_tensor_shape(®, p->a->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
reg.opd0_addr = p->a->start_address;
|
||||
reg.opt_opd0_sign = tensor_is_signed(p->a);
|
||||
fill_opd0_stride(®, &p->a->stride);
|
||||
|
||||
if (p->b_is_const) {
|
||||
reg.opt_opd1_const = 1;
|
||||
reg.opd1_addr = bf16_enable ? p->b_const.val : (p->b_const.val & 0xFF);
|
||||
reg.opt_opd1_sign = !!p->b_const.is_signed;
|
||||
} else {
|
||||
reg.opt_opd1_const = 0;
|
||||
reg.opd1_addr = p->b->start_address;
|
||||
reg.opt_opd1_sign = tensor_is_signed(p->b);
|
||||
fill_opd1_stride(®, &p->b->stride);
|
||||
}
|
||||
|
||||
reg.res0_addr = p->max->start_address;
|
||||
reg.opt_res0_sign = tensor_is_signed(p->max);
|
||||
fill_res0_stride(®, &p->max->stride);
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
58
cvikernel/src/bm1880v2/tiu_element_wise_min.c
Normal file
58
cvikernel/src/bm1880v2/tiu_element_wise_min.c
Normal file
@ -0,0 +1,58 @@
|
||||
#include "kernel_1880v2.h"
|
||||
|
||||
/*
 * Emit a TIU element-wise minimum command (opcode TENSOR_MIN_FIX8B):
 * min = min(a, b), where b is either a tensor or a per-command constant.
 * Returns the op handle produced by emit_tiu_cmdbuf().
 */
bmk1880v2_op_t * bmk1880v2_tiu_element_wise_min(
    ctx_t *ctx,
    const bmk1880v2_tiu_element_wise_min_param_t *p)
{
  /* bf16 mode is selected purely by the format of operand a. */
  int bf16_enable = (p->a->fmt == FMT_BF16) ? 1 : 0;

  check_tiu_tensor_2(p->min, p->a);
  assert_same_shape(p->min, p->a);
  if (p->b_is_const && !bf16_enable) {
    /* In fixed-point mode the constant's signedness must match operand a. */
    if (tensor_is_signed(p->a))
      ASSERT(p->b_const.is_signed);
    else
      ASSERT(!p->b_const.is_signed);
  } else if (!p->b_is_const) {
    check_tiu_tensor(p->b);
    assert_same_shape(p->min, p->b);
    ASSERT(p->a->fmt == p->b->fmt);
  }

  tiu_reg_t reg;
  reset_tiu_reg(&reg);

  reg.cmd_en = 1;
  reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
  reg.tsk_eu_typ = TENSOR_MIN_FIX8B;
  reg.tsk_opd_num = 2;
  reg.opd_typ = bf16_enable ? 1: 0;
  /* min has no post-shift / activation stage. */
  reg.opt_right_shift = 0;
  reg.opt_relu = 0;
  fill_same_tensor_shape(&reg, p->a->shape);
  /* 0b11: all operands share the same stride type. */
  fill_same_tensor_stride_type(&reg, 0b11);

  /* Operand 0: tensor a. */
  reg.opd0_addr = p->a->start_address;
  reg.opt_opd0_sign = tensor_is_signed(p->a);
  fill_opd0_stride(&reg, &p->a->stride);

  /* Operand 1: either an immediate constant or tensor b. */
  if (p->b_is_const) {
    reg.opt_opd1_const = 1;
    /* Fixed-point constants are truncated to 8 bits; bf16 uses the raw value. */
    reg.opd1_addr = bf16_enable ? p->b_const.val : (p->b_const.val & 0xFF);
    reg.opt_opd1_sign = !!p->b_const.is_signed;
  } else {
    reg.opt_opd1_const = 0;
    reg.opd1_addr = p->b->start_address;
    reg.opt_opd1_sign = tensor_is_signed(p->b);
    fill_opd1_stride(&reg, &p->b->stride);
  }

  /* Result tensor. */
  reg.res0_addr = p->min->start_address;
  reg.opt_res0_sign = tensor_is_signed(p->min);
  fill_res0_stride(&reg, &p->min->stride);

  /* [15:0] layer id */
  reg.layer_info = p->layer_id;

  return emit_tiu_cmdbuf(ctx, &reg);
}
|
||||
66
cvikernel/src/bm1880v2/tiu_element_wise_mul.c
Normal file
66
cvikernel/src/bm1880v2/tiu_element_wise_mul.c
Normal file
@ -0,0 +1,66 @@
|
||||
#include "kernel_1880v2.h"
|
||||
|
||||
/*
 * Emit a TIU element-wise multiply command (opcode TENSOR_MUL_FIX8B):
 * res = (a * b) >> rshift_bits, with optional ReLU and optional 16-bit
 * result (res_high holding the high byte plane). b may be a tensor or a
 * per-command constant. Returns the op handle from emit_tiu_cmdbuf().
 */
bmk1880v2_op_t * bmk1880v2_tiu_element_wise_mul(
    ctx_t *ctx,
    const bmk1880v2_tiu_element_wise_mul_param_t *p)
{
  /* bf16 mode is selected purely by the format of operand a. */
  int bf16_enable = (p->a->fmt == FMT_BF16) ? 1 : 0;

  check_tiu_tensor_2(p->res_low, p->a);
  assert_same_shape(p->res_low, p->a);
  if (!p->b_is_const) {
    check_tiu_tensor(p->b);
    assert_same_shape(p->res_low, p->b);
  }
  /* res_high != NULL requests a 16-bit (two-plane) result. */
  if (p->res_high)
    check_16bit_tiu_tensor(p->res_low, p->res_high);

  tiu_reg_t reg;
  reset_tiu_reg(&reg);

  reg.cmd_en = 1;
  reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
  reg.tsk_eu_typ = TENSOR_MUL_FIX8B;
  reg.tsk_opd_num = 2;
  reg.opd_typ = bf16_enable ? 1: 0;
  /* Signed result => arithmetic right shift; unsigned => logical. */
  int arith_shift = tensor_is_signed(p->res_low);
  reg.opt_shift_typ = arith_shift;
  reg.opt_right_shift = p->rshift_bits;
  reg.opt_relu = p->relu_enable;
  fill_same_tensor_shape(&reg, p->a->shape);
  /* 0b11: all operands share the same stride type. */
  fill_same_tensor_stride_type(&reg, 0b11);

  /* Operand 0: tensor a. */
  reg.opd0_addr = p->a->start_address;
  reg.opt_opd0_sign = tensor_is_signed(p->a);
  fill_opd0_stride(&reg, &p->a->stride);

  /* Operand 1: either an immediate constant or tensor b. */
  if (p->b_is_const) {
    reg.opt_opd1_const = 1;
    /* Fixed-point constants are truncated to 8 bits; bf16 uses the raw value. */
    reg.opd1_addr = bf16_enable ? p->b_const.val : (p->b_const.val & 0xFF);
    reg.opt_opd1_sign = !!p->b_const.is_signed;
  } else {
    reg.opt_opd1_const = 0;
    reg.opd1_addr = p->b->start_address;
    reg.opt_opd1_sign = tensor_is_signed(p->b);
    fill_opd1_stride(&reg, &p->b->stride);
  }

  /* Result tensor; opt_res0_int8 means 8-bit (single-plane) result. */
  reg.res0_addr = p->res_low->start_address;
  reg.opt_res0_sign = tensor_is_signed(p->res_low);
  reg.opt_res0_int8 = (p->res_high == NULL);
  fill_res0_stride(&reg, &p->res_low->stride);
  if (p->res_high)
    /* Distance between low and high byte planes of the 16-bit result. */
    reg.res0_b_str = (p->res_high->start_address - p->res_low->start_address);
  if (p->relu_enable)
    /* ReLU is only valid with an 8-bit result. */
    ASSERT(reg.opt_res0_int8);

  /* Signedness and shift type must be consistent: unsigned*unsigned uses a
   * logical shift; any signed operand requires an arithmetic shift. */
  ASSERT((
    p->b_is_const || (!reg.opt_opd1_sign && !reg.opt_opd0_sign && !reg.opt_shift_typ) ||
    ((reg.opt_opd1_sign || reg.opt_opd0_sign) && reg.opt_shift_typ)
  ));

  /* [15:0] layer id */
  reg.layer_info = p->layer_id;

  return emit_tiu_cmdbuf(ctx, &reg);
}
|
||||
66
cvikernel/src/bm1880v2/tiu_element_wise_mul_qdm.c
Normal file
66
cvikernel/src/bm1880v2/tiu_element_wise_mul_qdm.c
Normal file
@ -0,0 +1,66 @@
|
||||
#include "kernel_1880v2.h"
|
||||
|
||||
/*
 * Emit a TIU element-wise multiply with per-channel (QDM-style) 32-bit
 * quantization multiplier: res = ((a * b) * quan_m) >> rshift_bits.
 * Fixed-point only — unlike bmk1880v2_tiu_element_wise_mul there is no
 * bf16 path here. Returns the op handle from emit_tiu_cmdbuf().
 */
bmk1880v2_op_t * bmk1880v2_tiu_element_wise_mul_qdm(
    ctx_t *ctx,
    const bmk1880v2_tiu_element_wise_mul_qdm_param_t *p)
{
  check_tiu_tensor_2(p->res_low, p->a);
  assert_same_shape(p->res_low, p->a);
  if (!p->b_is_const) {
    check_tiu_tensor(p->b);
    assert_same_shape(p->res_low, p->b);
  }
  /* res_high != NULL requests a 16-bit (two-plane) result. */
  if (p->res_high)
    check_16bit_tiu_tensor(p->res_low, p->res_high);

  tiu_reg_t reg;
  reset_tiu_reg(&reg);

  reg.cmd_en = 1;
  reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
  reg.tsk_eu_typ = TENSOR_MUL_FIX8B;
  reg.tsk_opd_num = 2;
  /* Signed result => arithmetic right shift; unsigned => logical. */
  int arith_shift = tensor_is_signed(p->res_low);
  reg.opt_shift_typ = arith_shift;
  reg.opt_right_shift = p->rshift_bits;
  reg.opt_relu = p->relu_enable;
  fill_same_tensor_shape(&reg, p->a->shape);
  /* 0b11: all operands share the same stride type. */
  fill_same_tensor_stride_type(&reg, 0b11);

  /* Operand 0: tensor a. */
  reg.opd0_addr = p->a->start_address;
  reg.opt_opd0_sign = tensor_is_signed(p->a);
  fill_opd0_stride(&reg, &p->a->stride);

  /* Operand 1: either an immediate constant or tensor b.
   * NOTE(review): unlike the non-QDM variant, the constant is not masked
   * to 8 bits here — confirm whether that is intentional. */
  if (p->b_is_const) {
    reg.opt_opd1_const = 1;
    reg.opd1_addr = p->b_const.val;
    reg.opt_opd1_sign = !!p->b_const.is_signed;
  } else {
    reg.opt_opd1_const = 0;
    reg.opd1_addr = p->b->start_address;
    reg.opt_opd1_sign = tensor_is_signed(p->b);
    fill_opd1_stride(&reg, &p->b->stride);
  }

  /* Result tensor; opt_res0_int8 means 8-bit (single-plane) result. */
  reg.res0_addr = p->res_low->start_address;
  reg.opt_res0_sign = tensor_is_signed(p->res_low);
  reg.opt_res0_int8 = (p->res_high == NULL);
  fill_res0_stride(&reg, &p->res_low->stride);
  if (p->res_high)
    /* Distance between low and high byte planes of the 16-bit result. */
    reg.res0_b_str = p->res_high->start_address - p->res_low->start_address;
  if (p->relu_enable)
    /* ReLU is only valid with an 8-bit result. */
    ASSERT(reg.opt_res0_int8);

  /* Signedness and shift type must be consistent: unsigned*unsigned uses a
   * logical shift; any signed operand requires an arithmetic shift. */
  ASSERT((
    (!reg.opt_opd1_sign && !reg.opt_opd0_sign && !reg.opt_shift_typ) ||
    ((reg.opt_opd1_sign || reg.opt_opd0_sign) && reg.opt_shift_typ)
  ));

  /* Enable the 32-bit per-channel quantization multiplier. */
  reg.opt_chl_quan = 1;
  reg.quan_m = p->multiplier;

  /* [15:0] layer id */
  reg.layer_info = p->layer_id;

  return emit_tiu_cmdbuf(ctx, &reg);
}
|
||||
100
cvikernel/src/bm1880v2/tiu_element_wise_or.c
Normal file
100
cvikernel/src/bm1880v2/tiu_element_wise_or.c
Normal file
@ -0,0 +1,100 @@
|
||||
#include "kernel_1880v2.h"
|
||||
|
||||
/*
 * Emit a TIU element-wise bitwise OR on 8-bit tensors (TENSOR_OR_FIX8B):
 * res = a | b. All operands are treated as unsigned 8-bit data (sign bits
 * are forced to 0 — bitwise ops ignore signedness).
 * Returns the op handle from emit_tiu_cmdbuf().
 */
bmk1880v2_op_t * bmk1880v2_tiu_element_wise_or_int8(
    ctx_t *ctx,
    const bmk1880v2_tiu_element_wise_or_int8_param_t *p)
{
  check_tiu_tensor_3(p->res, p->a, p->b);
  assert_same_shape_3(p->res, p->a, p->b);

  tiu_reg_t reg;
  reset_tiu_reg(&reg);

  reg.cmd_en = 1;
  reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
  reg.tsk_eu_typ = TENSOR_OR_FIX8B;
  reg.tsk_opd_num = 2;
  /* No shift / activation stage for bitwise ops. */
  reg.opt_right_shift = 0;
  reg.opt_shift_typ = 0;
  reg.opt_relu = 0;
  fill_same_tensor_shape(&reg, p->a->shape);
  /* 0b11: all operands share the same stride type. */
  fill_same_tensor_stride_type(&reg, 0b11);

  /* Operand 0: tensor a (unsigned, 8-bit). */
  reg.opd0_addr = p->a->start_address;
  reg.opt_opd0_sign = 0;
  reg.opt_opd0_int8 = 1;
  fill_opd0_stride(&reg, &p->a->stride);

  /* Operand 1: tensor b (unsigned, 8-bit). */
  reg.opd1_addr = p->b->start_address;
  reg.opt_opd1_sign = 0;
  reg.opt_opd1_int8 = 1;
  fill_opd1_stride(&reg, &p->b->stride);

  /* Result tensor (unsigned, 8-bit). */
  reg.res0_addr = p->res->start_address;
  reg.opt_res0_sign = 0;
  reg.opt_res0_int8 = 1;
  fill_res0_stride(&reg, &p->res->stride);

  /* [15:0] layer id */
  reg.layer_info = p->layer_id;

  return emit_tiu_cmdbuf(ctx, &reg);
}
|
||||
|
||||
/*
 * Emit a TIU element-wise bitwise OR on 16-bit tensors (TENSOR_OR_FIX8B
 * with 16-bit operands): res = a | b. Each 16-bit tensor is stored as two
 * 8-bit planes (low/high); the plane distance is programmed as the operand
 * "b stride". The high plane must live above the low plane in local memory.
 * NOTE(review): this function does not program layer_info, unlike its
 * siblings — confirm whether that is intentional.
 * Returns the op handle from emit_tiu_cmdbuf().
 */
bmk1880v2_op_t * bmk1880v2_tiu_element_wise_or_int16(
    ctx_t *ctx,
    const bmk1880v2_tiu_element_wise_or_int16_param_t *p)
{
  check_16bit_tiu_tensor(p->a_low, p->a_high);
  check_16bit_tiu_tensor(p->b_low, p->b_high);
  check_16bit_tiu_tensor(p->res_low, p->res_high);
  assert_same_shape_3(p->res_low, p->a_low, p->b_low);

  /* Plane distances (high plane must be above the low plane). */
  int res_high_addr = p->res_high->start_address;
  int res_low_addr = p->res_low->start_address;
  ASSERT(res_high_addr > res_low_addr);
  int res_b_stride = res_high_addr - res_low_addr;

  int a_high_addr = p->a_high->start_address;
  int a_low_addr = p->a_low->start_address;
  ASSERT(a_high_addr > a_low_addr);
  int a_b_stride = a_high_addr - a_low_addr;

  int b_high_addr = p->b_high->start_address;
  int b_low_addr = p->b_low->start_address;
  ASSERT(b_high_addr > b_low_addr);
  int b_b_stride = b_high_addr - b_low_addr;

  tiu_reg_t reg;
  reset_tiu_reg(&reg);

  reg.cmd_en = 1;
  reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
  reg.tsk_eu_typ = TENSOR_OR_FIX8B;
  reg.tsk_opd_num = 2;
  /* No shift / activation stage for bitwise ops. */
  reg.opt_right_shift = 0;
  reg.opt_shift_typ = 0;
  reg.opt_relu = 0;
  fill_same_tensor_shape(&reg, p->a_low->shape);
  /* 0b11: all operands share the same stride type. */
  fill_same_tensor_stride_type(&reg, 0b11);

  /* Operand 0: 16-bit a (int8 flag cleared => two-plane operand). */
  reg.opd0_addr = a_low_addr;
  reg.opt_opd0_sign = 0;
  reg.opt_opd0_int8 = 0;
  reg.opd0_b_str = a_b_stride;
  fill_opd0_stride(&reg, &p->a_low->stride);

  /* Operand 1: 16-bit b. */
  reg.opd1_addr = b_low_addr;
  reg.opt_opd1_sign = 0;
  reg.opt_opd1_int8 = 0;
  reg.opd1_b_str = b_b_stride;
  fill_opd1_stride(&reg, &p->b_low->stride);

  /* Result: 16-bit res. */
  reg.res0_addr = res_low_addr;
  reg.opt_res0_sign = 0;
  reg.opt_res0_int8 = 0;
  reg.res0_b_str = res_b_stride;
  fill_res0_stride(&reg, &p->res_low->stride);

  return emit_tiu_cmdbuf(ctx, &reg);
}
|
||||
58
cvikernel/src/bm1880v2/tiu_element_wise_shift.c
Normal file
58
cvikernel/src/bm1880v2/tiu_element_wise_shift.c
Normal file
@ -0,0 +1,58 @@
|
||||
#include "kernel_1880v2.h"
|
||||
|
||||
/*
 * Emit a TIU element-wise arithmetic shift (TENSOR_SHIFT_FIX8B):
 * res = a shifted by the per-element signed amounts in `bits`.
 * a and res are 16-bit (two 8-bit planes, low/high); `bits` is a signed
 * 8-bit tensor. Both a and bits must be signed.
 * Returns the op handle from emit_tiu_cmdbuf().
 */
bmk1880v2_op_t * bmk1880v2_tiu_element_wise_arith_shift(
    ctx_t *ctx,
    const bmk1880v2_tiu_element_wise_arith_shift_param_t *p)
{
  check_16bit_tiu_tensor(p->a_low, p->a_high);
  check_16bit_tiu_tensor(p->res_low, p->res_high);
  check_tiu_tensor(p->bits);
  assert_same_shape_3(p->res_low, p->a_low, p->bits);
  ASSERT(tensor_is_signed(p->a_low));
  ASSERT(tensor_is_signed(p->bits));

  /* Plane distances (high plane must be above the low plane). */
  int res_high_addr = p->res_high->start_address;
  int res_low_addr = p->res_low->start_address;
  ASSERT(res_high_addr > res_low_addr);
  int res_b_stride = res_high_addr - res_low_addr;

  int a_high_addr = p->a_high->start_address;
  int a_low_addr = p->a_low->start_address;
  ASSERT(a_high_addr > a_low_addr);
  int a_b_stride = a_high_addr - a_low_addr;

  tiu_reg_t reg;
  reset_tiu_reg(&reg);

  reg.cmd_en = 1;
  reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
  reg.tsk_eu_typ = TENSOR_SHIFT_FIX8B;
  reg.tsk_opd_num = 2;
  reg.opt_right_shift = 0;
  reg.opt_rshift_typ = 0;
  reg.opt_relu = 0;
  fill_same_tensor_shape(&reg, p->a_low->shape);
  /* 0b11: all operands share the same stride type. */
  fill_same_tensor_stride_type(&reg, 0b11);

  /* Operand 0: 16-bit signed a (int8 flag cleared => two-plane operand). */
  reg.opd0_addr = a_low_addr;
  reg.opt_opd0_sign = 1;
  reg.opt_opd0_int8 = 0;
  reg.opd0_b_str = a_b_stride;
  fill_opd0_stride(&reg, &p->a_low->stride);

  /* Operand 1: signed 8-bit shift amounts. */
  reg.opd1_addr = p->bits->start_address;
  reg.opt_opd1_sign = 1;
  reg.opt_opd1_int8 = 1;
  fill_opd1_stride(&reg, &p->bits->stride);

  /* Result: 16-bit signed res. */
  reg.res0_addr = res_low_addr;
  reg.opt_res0_sign = 1;
  reg.opt_res0_int8 = 0;
  reg.res0_b_str = res_b_stride;
  fill_res0_stride(&reg, &p->res_low->stride);

  /* [15:0] layer id */
  reg.layer_info = p->layer_id;

  return emit_tiu_cmdbuf(ctx, &reg);
}
|
||||
68
cvikernel/src/bm1880v2/tiu_element_wise_sub.c
Normal file
68
cvikernel/src/bm1880v2/tiu_element_wise_sub.c
Normal file
@ -0,0 +1,68 @@
|
||||
#include "kernel_1880v2.h"
|
||||
|
||||
bmk1880v2_op_t * bmk1880v2_tiu_element_wise_sub(
|
||||
ctx_t *ctx,
|
||||
const bmk1880v2_tiu_element_wise_sub_param_t *p)
|
||||
{
|
||||
int bf16_enable = (p->a_low->fmt == FMT_BF16) ? 1 : 0;
|
||||
|
||||
if (bf16_enable) {
|
||||
/*bf16 only support 16 bit*/
|
||||
ASSERT(!p->a_high);
|
||||
ASSERT(!p->b_high);
|
||||
ASSERT(!p->res_high);
|
||||
check_tiu_tensor(p->a_low);
|
||||
check_tiu_tensor(p->b_low);
|
||||
check_tiu_tensor(p->res_low);
|
||||
assert_same_shape_3(p->res_low, p->a_low, p->b_low);
|
||||
} else {
|
||||
check_16bit_tiu_tensor(p->a_low, p->a_high);
|
||||
check_16bit_tiu_tensor(p->b_low, p->b_high);
|
||||
check_tiu_tensor(p->res_low);
|
||||
assert_same_shape_3(p->res_low, p->a_low, p->b_low);
|
||||
ASSERT(tensor_is_signed(p->res_low));
|
||||
}
|
||||
if (p->res_high)
|
||||
check_16bit_tiu_tensor(p->res_low, p->res_high);
|
||||
|
||||
tiu_reg_t reg;
|
||||
reset_tiu_reg(®);
|
||||
|
||||
reg.cmd_en = 1;
|
||||
reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
|
||||
reg.tsk_eu_typ = TENSOR_SUB_FIX8B;
|
||||
reg.tsk_opd_num = 2;
|
||||
reg.opd_typ = bf16_enable ? 1: 0;
|
||||
reg.opt_right_shift = 0;
|
||||
reg.opt_relu = 0;
|
||||
fill_same_tensor_shape(®, p->a_low->shape);
|
||||
fill_same_tensor_stride_type(®, 0b11);
|
||||
|
||||
int arith_shift = tensor_is_signed(p->res_low);
|
||||
reg.opt_shift_typ = arith_shift;
|
||||
reg.opt_right_shift = p->rshift_bits;
|
||||
|
||||
reg.opd0_addr = p->a_low->start_address;
|
||||
reg.opt_opd0_sign = tensor_is_signed(p->a_low);
|
||||
reg.opt_opd0_int8 = (p->a_high == NULL);
|
||||
reg.opd0_b_str = bf16_enable ? 0 : (p->a_high->start_address - p->a_low->start_address);
|
||||
fill_opd0_stride(®, &p->a_low->stride);
|
||||
|
||||
reg.opd1_addr = p->b_low->start_address;
|
||||
reg.opt_opd1_sign = tensor_is_signed(p->b_low);;
|
||||
reg.opt_opd1_int8 = (p->b_high == NULL);
|
||||
reg.opd1_b_str = bf16_enable ? 0 : (p->b_high->start_address - p->b_low->start_address);
|
||||
fill_opd1_stride(®, &p->b_low->stride);
|
||||
|
||||
reg.res0_addr = p->res_low->start_address;
|
||||
reg.opt_res0_sign = 1;
|
||||
reg.opt_res0_int8 = (p->res_high == NULL);
|
||||
fill_res0_stride(®, &p->res_low->stride);
|
||||
if (p->res_high)
|
||||
reg.res0_b_str = bf16_enable ? 0 : (p->res_high->start_address - p->res_low->start_address);
|
||||
|
||||
/* [15:0] layer id */
|
||||
reg.layer_info = p->layer_id;
|
||||
|
||||
return emit_tiu_cmdbuf(ctx, ®);
|
||||
}
|
||||
100
cvikernel/src/bm1880v2/tiu_element_wise_xor.c
Normal file
100
cvikernel/src/bm1880v2/tiu_element_wise_xor.c
Normal file
@ -0,0 +1,100 @@
|
||||
#include "kernel_1880v2.h"
|
||||
|
||||
/*
 * Emit a TIU element-wise bitwise XOR on 8-bit tensors (TENSOR_XOR_FIX8B):
 * res = a ^ b. All operands are treated as unsigned 8-bit data (sign bits
 * are forced to 0 — bitwise ops ignore signedness).
 * Returns the op handle from emit_tiu_cmdbuf().
 */
bmk1880v2_op_t * bmk1880v2_tiu_element_wise_xor_int8(
    ctx_t *ctx,
    const bmk1880v2_tiu_element_wise_xor_int8_param_t *p)
{
  check_tiu_tensor_3(p->res, p->a, p->b);
  assert_same_shape_3(p->res, p->a, p->b);

  tiu_reg_t reg;
  reset_tiu_reg(&reg);

  reg.cmd_en = 1;
  reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
  reg.tsk_eu_typ = TENSOR_XOR_FIX8B;
  reg.tsk_opd_num = 2;
  /* No shift / activation stage for bitwise ops. */
  reg.opt_right_shift = 0;
  reg.opt_shift_typ = 0;
  reg.opt_relu = 0;
  fill_same_tensor_shape(&reg, p->a->shape);
  /* 0b11: all operands share the same stride type. */
  fill_same_tensor_stride_type(&reg, 0b11);

  /* Operand 0: tensor a (unsigned, 8-bit). */
  reg.opd0_addr = p->a->start_address;
  reg.opt_opd0_sign = 0;
  reg.opt_opd0_int8 = 1;
  fill_opd0_stride(&reg, &p->a->stride);

  /* Operand 1: tensor b (unsigned, 8-bit). */
  reg.opd1_addr = p->b->start_address;
  reg.opt_opd1_sign = 0;
  reg.opt_opd1_int8 = 1;
  fill_opd1_stride(&reg, &p->b->stride);

  /* Result tensor (unsigned, 8-bit). */
  reg.res0_addr = p->res->start_address;
  reg.opt_res0_sign = 0;
  reg.opt_res0_int8 = 1;
  fill_res0_stride(&reg, &p->res->stride);

  /* [15:0] layer id */
  reg.layer_info = p->layer_id;

  return emit_tiu_cmdbuf(ctx, &reg);
}
|
||||
|
||||
/*
 * Emit a TIU element-wise bitwise XOR on 16-bit tensors (TENSOR_XOR_FIX8B
 * with 16-bit operands): res = a ^ b. Each 16-bit tensor is stored as two
 * 8-bit planes (low/high); the plane distance is programmed as the operand
 * "b stride". The high plane must live above the low plane in local memory.
 * NOTE(review): this function does not program layer_info, unlike its
 * siblings — confirm whether that is intentional.
 * Returns the op handle from emit_tiu_cmdbuf().
 */
bmk1880v2_op_t * bmk1880v2_tiu_element_wise_xor_int16(
    ctx_t *ctx,
    const bmk1880v2_tiu_element_wise_xor_int16_param_t *p)
{
  check_16bit_tiu_tensor(p->a_low, p->a_high);
  check_16bit_tiu_tensor(p->b_low, p->b_high);
  check_16bit_tiu_tensor(p->res_low, p->res_high);
  assert_same_shape_3(p->res_low, p->a_low, p->b_low);

  /* Plane distances (high plane must be above the low plane). */
  int res_high_addr = p->res_high->start_address;
  int res_low_addr = p->res_low->start_address;
  ASSERT(res_high_addr > res_low_addr);
  int res_b_stride = res_high_addr - res_low_addr;

  int a_high_addr = p->a_high->start_address;
  int a_low_addr = p->a_low->start_address;
  ASSERT(a_high_addr > a_low_addr);
  int a_b_stride = a_high_addr - a_low_addr;

  int b_high_addr = p->b_high->start_address;
  int b_low_addr = p->b_low->start_address;
  ASSERT(b_high_addr > b_low_addr);
  int b_b_stride = b_high_addr - b_low_addr;

  tiu_reg_t reg;
  reset_tiu_reg(&reg);

  reg.cmd_en = 1;
  reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
  reg.tsk_eu_typ = TENSOR_XOR_FIX8B;
  reg.tsk_opd_num = 2;
  /* No shift / activation stage for bitwise ops. */
  reg.opt_right_shift = 0;
  reg.opt_shift_typ = 0;
  reg.opt_relu = 0;
  fill_same_tensor_shape(&reg, p->a_low->shape);
  /* 0b11: all operands share the same stride type. */
  fill_same_tensor_stride_type(&reg, 0b11);

  /* Operand 0: 16-bit a (int8 flag cleared => two-plane operand). */
  reg.opd0_addr = a_low_addr;
  reg.opt_opd0_sign = 0;
  reg.opt_opd0_int8 = 0;
  reg.opd0_b_str = a_b_stride;
  fill_opd0_stride(&reg, &p->a_low->stride);

  /* Operand 1: 16-bit b. */
  reg.opd1_addr = b_low_addr;
  reg.opt_opd1_sign = 0;
  reg.opt_opd1_int8 = 0;
  reg.opd1_b_str = b_b_stride;
  fill_opd1_stride(&reg, &p->b_low->stride);

  /* Result: 16-bit res. */
  reg.res0_addr = res_low_addr;
  reg.opt_res0_sign = 0;
  reg.opt_res0_int8 = 0;
  reg.res0_b_str = res_b_stride;
  fill_res0_stride(&reg, &p->res_low->stride);

  return emit_tiu_cmdbuf(ctx, &reg);
}
|
||||
113
cvikernel/src/bm1880v2/tiu_lookup_table.c
Normal file
113
cvikernel/src/bm1880v2/tiu_lookup_table.c
Normal file
@ -0,0 +1,113 @@
|
||||
#include "kernel_1880v2.h"
|
||||
|
||||
/*
 * Emit a TIU per-element lookup-table command: each ifmap element indexes
 * into a per-NPU table to produce the corresponding ofmap element.
 * The table is replicated across all NPUs (shape.c == npu_num) and its
 * geometry depends on mode: 32x8 entries for bf16, 16x16 for fixed-point.
 * All operands must use stride type 0 and be EU-aligned.
 * Returns the op handle from emit_tiu_cmdbuf().
 */
bmk1880v2_op_t * bmk1880v2_tiu_lookup_table(
    ctx_t *ctx,
    const bmk1880v2_tiu_lookup_table_param_t *p)
{
  uint32_t eu_num = ctx->chip_info.eu_num;
  uint32_t npu_num = ctx->chip_info.npu_num;

  check_tiu_tensor_3(p->ofmap, p->ifmap, p->table);
  assert_stride_type_0(ctx, p->ofmap);
  assert_stride_type_0(ctx, p->ifmap);
  assert_stride_type_0(ctx, p->table);

  /* bf16 mode requires BOTH input and output to be bf16. */
  uint8_t is_bf16 = (p->ofmap->fmt == FMT_BF16 && p->ifmap->fmt == FMT_BF16);

  ASSERT(p->table->shape.n == 1);
  ASSERT(p->table->shape.c == npu_num);

  /* Fixed table geometry per mode (bf16: 32x8, fixed-point: 16x16). */
  if (is_bf16) {
    ASSERT(p->table->shape.h == 32);
    ASSERT(p->table->shape.w == 8);
  }
  else {
    ASSERT(p->table->shape.h == 16);
    ASSERT(p->table->shape.w == 16);
  }

  /* All operands must start on an EU boundary. */
  ASSERT(p->ifmap->start_address % eu_num == 0);
  ASSERT(p->ofmap->start_address % eu_num == 0);
  ASSERT(p->table->start_address % eu_num == 0);

  // fmt MUST be same under bf16
  if (p->ofmap->fmt == FMT_BF16) {
    ASSERT(p->ifmap->fmt == FMT_BF16);
  }
  ASSERT(p->ofmap->fmt == FMT_I8 || p->ofmap->fmt == FMT_U8 || p->ofmap->fmt == FMT_BF16);

  tiu_reg_t reg;
  reset_tiu_reg(&reg);
  reg.cmd_en = 1;
  reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
  reg.tens_lookup = 1;
  reg.tsk_opd_num = 2;
  reg.opt_shift_typ = 0;
  reg.opt_right_shift = 0;
  reg.opt_relu = 0;
  reg.opd_typ = is_bf16;

  /* Result: sign/int8 flags differ between modes (bf16 sets sign=1). */
  reg.res0_addr = p->ofmap->start_address;
  if (is_bf16) {
    reg.opt_res0_sign = 1;
    reg.opt_res0_int8 = 1;
  }
  else {
    reg.opt_res0_sign = 0;
    reg.opt_res0_int8 = 1;
  }

  // <! input / output shape SHOULD be same
  ASSERT(p->ifmap->shape.n == p->ofmap->shape.n);
  ASSERT(p->ifmap->shape.c == p->ofmap->shape.c);
  ASSERT(p->ifmap->shape.h == p->ofmap->shape.h);
  ASSERT(p->ifmap->shape.w == p->ofmap->shape.w);

  reg.res0_n = p->ifmap->shape.n;
  reg.res0_c = p->ifmap->shape.c;
  reg.res0_h = p->ifmap->shape.h;
  reg.res0_w = p->ifmap->shape.w;
  /* short_*_str = 0: strides are derived by hardware. */
  reg.short_res0_str = 0;

  /* Operand 0: the index input. */
  reg.opd0_addr = p->ifmap->start_address;
  if (is_bf16) {
    reg.opt_opd0_sign = 1;
    reg.opt_opd0_int8 = 1;
  }
  else {
    reg.opt_opd0_sign = 0;
    reg.opt_opd0_int8 = 1;
  }
  reg.opd0_n = p->ifmap->shape.n;
  reg.opd0_c = p->ifmap->shape.c;
  reg.opd0_h = p->ifmap->shape.h;
  reg.opd0_w = p->ifmap->shape.w;
  reg.short_opd0_str = 0;

  /* Operand 1: the lookup table itself. */
  reg.opd1_addr = p->table->start_address;
  if (is_bf16) {
    reg.opt_opd1_sign = 1;
    reg.opt_opd1_int8 = 1;
  }
  else {
    reg.opt_opd1_sign = 0;
    reg.opt_opd1_int8 = 1;
  }
  reg.opd1_n = p->table->shape.n;
  reg.opd1_c = p->table->shape.c;
  reg.opd1_h = p->table->shape.h;
  reg.opd1_w = p->table->shape.w;
  reg.short_opd1_str = 0;

  if (is_bf16) {
    reg.opt_opd2_int8 = 1; // hw check
    reg.tsk_eu_typ = 12; // 12 means lut
    // dont care once short_xxx_str set to 0
  }

  /* [15:0] layer id */
  reg.layer_info = p->layer_id;
  //trace_tiu_reg(&reg, __FUNCTION__);

  return emit_tiu_cmdbuf(ctx, &reg);
}
|
||||
149
cvikernel/src/bm1880v2/tiu_matrix_multiplication.c
Normal file
149
cvikernel/src/bm1880v2/tiu_matrix_multiplication.c
Normal file
@ -0,0 +1,149 @@
|
||||
#include "kernel_1880v2.h"
|
||||
|
||||
typedef bmk1880v2_tiu_matrix_multiplication_param_t param_t;
|
||||
|
||||
/*
 * Validate a matrix-in-local-memory operand by viewing it as a 4-D tensor
 * (h forced to 1, w stride set to the element size: 2 bytes for bf16,
 * 1 otherwise) and reusing the tensor checks. Also requires the matrix to
 * start on an EU boundary. Aborts via ASSERT on violation.
 */
static void check_matrix(ctx_t *ctx, const ml_t *m)
{
  bmk1880v2_tensor_lmem_t t;
  t.start_address = m->start_address;
  t.fmt = m->fmt;
  t.shape.n = m->shape.n;
  t.shape.c = m->shape.c;
  t.shape.h = 1;
  t.shape.w = m->shape.w;
  t.stride.n = m->stride.n;
  t.stride.c = m->stride.c;
  t.stride.h = m->stride.h;
  t.stride.w = 1 * (m->fmt == FMT_BF16 ? 2 : 1);

  check_tiu_tensor(&t);
  assert_stride_type_0(ctx, &t);

  uint32_t eu_num = ctx->chip_info.eu_num;
  ASSERT(m->start_address % eu_num == 0);
}
|
||||
|
||||
static int is_arith_shift(const param_t *p)
|
||||
{
|
||||
if (p->left->fmt == FMT_I8)
|
||||
return 1;
|
||||
if (p->right->fmt == FMT_I8)
|
||||
return 1;
|
||||
if (p->bias && p->bias->fmt == FMT_I8)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Emit a TIU fully-connected / matrix-multiplication command:
 * res = (left x right [+ bias]) with optional left/right shift, ReLU and
 * result accumulation, supporting both fixed-point and bf16 operands and
 * partial-sum 32-bit (ps32) modes.
 * Returns the op handle from emit_tiu_cmdbuf().
 */
bmk1880v2_op_t * bmk1880v2_tiu_matrix_multiplication(ctx_t *ctx, const param_t *p)
{
  const bmk1880v2_matrix_lmem_t *res = p->res;
  const bmk1880v2_matrix_lmem_t *left = p->left;
  const bmk1880v2_matrix_lmem_t *right = p->right;
  const bmk1880v2_matrix_lmem_t *bias = p->bias;
  int bf16_enable = (p->res->fmt == FMT_BF16) ? 1 : 0;

  check_matrix(ctx, res);
  check_matrix(ctx, left);
  check_matrix(ctx, right);
  if (bias)
    check_matrix(ctx, bias);

  ASSERT(p->lshift_bits < 32);
  if (bf16_enable) /* bf16 does not support add_result*/
    ASSERT(!p->add_result);
  else
    ASSERT(!(p->relu_enable && p->add_result));

  /* ps32_mode bit 1 set: intermediate partial-sum pass — no post
   * processing (relu/bias/shift) is allowed. */
  if(p->ps32_mode & 0x2)
  {
    ASSERT(!p->relu_enable);
    ASSERT(!p->bias);
    ASSERT(!p->rshift_bits);
  }

  /* Dimension check: (left_row x left_col) x (right_row x right_col). */
  uint32_t left_row = left->shape.n;
  uint32_t left_col = left->shape.col;
  uint32_t right_row = right->shape.n;
  uint32_t right_col = right->shape.col;
  uint32_t res_row = res->shape.n;
  uint32_t res_col = res->shape.col;
  ASSERT(left_col == right_row);
  ASSERT(res_col == right_col);

  if(p->ps32_mode)
  {
    ASSERT(!p->add_result);
  } else if ((p->add_result || !p->res_is_int8) && !bf16_enable) {
    /* 16-bit result is stored as two row-planes: res holds 2x the rows. */
    ASSERT(res_row == left_row * 2);
    res_row = left_row;
  } else {
    ASSERT(res_row == left_row);
  }

  tiu_reg_t reg;
  reset_tiu_reg(&reg);

  reg.cmd_en = 1;
  reg.tsk_typ = DCR_TYPE_FC_FIX8B;
  reg.tsk_opd_num = bias? 3: 2;
  reg.opd_typ = bf16_enable ? 1 : 0;
  /* Arithmetic shift whenever any operand is signed. */
  reg.opt_shift_typ = is_arith_shift(p);
  reg.opt_right_shift = p->rshift_bits;
  reg.opt_left_shift = p->lshift_bits;
  reg.opt_relu = p->relu_enable;
  reg.opt_res_add = p->add_result;

  reg.res0_addr = res->start_address;
  reg.opt_res0_int8 = (bf16_enable ? 1 : p->res_is_int8);

  reg.opt_res0_sign = matrix_is_signed(res);
  reg.res0_n = res_row;
  reg.res0_c = res->shape.c;
  reg.res0_h = 1;
  reg.res0_w = res->shape.w;
  reg.short_res0_str = 0; // stride, b_stride calculated by H/W

  /* Operand 0: left matrix (n rows spread over c channels of width w). */
  reg.opd0_addr = left->start_address;
  reg.opt_opd0_int8 = 1;
  reg.opt_opd0_sign = (left->fmt == FMT_I8);
  reg.opd0_n = left_row;
  reg.opd0_c = left->shape.c;
  reg.opd0_h = 1;
  reg.opd0_w = left->shape.w;
  reg.short_opd0_str = 0;

  /* Operand 1: right matrix. opd1_w is the width of the LAST channel of
   * the left matrix (columns may not divide evenly across channels). */
  reg.opd1_addr = right->start_address;
  reg.opt_opd1_int8 = 1;
  reg.opt_opd1_sign = (right->fmt == FMT_I8);
  reg.opd1_n = right_row;
  reg.opd1_c = right->shape.c;
  reg.opd1_h = 1;
  reg.opd1_w = left_col - left->shape.w * (left->shape.c - 1);
  reg.short_opd1_str = 0;

  /* Partial-sum (32-bit accumulator) mode: result plane stride spans the
   * whole result matrix. */
  reg.ps32_md = p->ps32_mode;
  if (p->ps32_mode > 0)
    reg.res0_b_str = p->res->shape.n * p->res->stride.n;
  if(reg.opd0_c == 1)
    ASSERT(reg.opd0_w == reg.opd1_w);

  /* Optional bias: 16-bit, stored as 2 row-planes (shape.n == 2). */
  if (bias) {
    ASSERT(bias->shape.n == 2);
    ASSERT(bias->shape.c == right->shape.c);
    ASSERT(bias->shape.w == right->shape.w);
    ASSERT(bias->shape.col == right->shape.col);

    reg.opd2_addr = bias->start_address;
    reg.opt_opd2_int8 = 0;
    reg.opt_opd2_sign = (bias->fmt == FMT_I8);
    reg.opd2_n = 1;
    reg.opd2_c = bias->shape.c;
    reg.opd2_h = 1;
    reg.opd2_w = bias->shape.w;
    reg.short_opd2_str = 0;
  }

  /* [15:0] layer id */
  reg.layer_info = p->layer_id;
  return emit_tiu_cmdbuf(ctx, &reg);
}
|
||||
150
cvikernel/src/bm1880v2/tiu_matrix_multiplication_qdm.c
Normal file
150
cvikernel/src/bm1880v2/tiu_matrix_multiplication_qdm.c
Normal file
@ -0,0 +1,150 @@
|
||||
#include "kernel_1880v2.h"
|
||||
|
||||
typedef bmk1880v2_tiu_matrix_multiplication_qdm_param_t param_t;
|
||||
|
||||
/*
 * Validate a matrix-in-local-memory operand by viewing it as a 4-D tensor
 * (h forced to 1, w stride fixed to 1 byte) and reusing the tensor checks.
 * Also requires the matrix to start on an EU boundary.
 * NOTE(review): unlike the non-QDM variant, stride.w is not doubled for
 * bf16 — presumably fine since the QDM path is fixed-point only; confirm.
 * Aborts via ASSERT on violation.
 */
static void check_matrix(ctx_t *ctx, const ml_t *m)
{
  bmk1880v2_tensor_lmem_t t;
  t.start_address = m->start_address;
  t.fmt = m->fmt;
  t.shape.n = m->shape.n;
  t.shape.c = m->shape.c;
  t.shape.h = 1;
  t.shape.w = m->shape.w;
  t.stride.n = m->stride.n;
  t.stride.c = m->stride.c;
  t.stride.h = m->stride.h;
  t.stride.w = 1;

  check_tiu_tensor(&t);
  assert_stride_type_0(ctx, &t);

  uint32_t eu_num = ctx->chip_info.eu_num;
  ASSERT(m->start_address % eu_num == 0);
}
|
||||
|
||||
static int is_arith_shift(const param_t *p)
|
||||
{
|
||||
if (p->left->fmt == FMT_I8)
|
||||
return 1;
|
||||
if (p->right->fmt == FMT_I8)
|
||||
return 1;
|
||||
if (p->bias && p->bias->fmt == FMT_I8)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Emit a TIU fully-connected / matrix-multiplication command with a
 * 32-bit quantization multiplier (QDM-style requantization):
 * res = saturate(((left x right [+ bias]) * quan_m) >> rshift_bits).
 * Fixed-point only; the result must be 8-bit (res_is_int8 == 1) and the
 * bias, when present, is 32-bit (stored as 4 row-planes).
 * Returns the op handle from emit_tiu_cmdbuf().
 */
bmk1880v2_op_t * bmk1880v2_tiu_matrix_multiplication_qdm(ctx_t *ctx, const param_t *p)
{
  const bmk1880v2_matrix_lmem_t *res = p->res;
  const bmk1880v2_matrix_lmem_t *left = p->left;
  const bmk1880v2_matrix_lmem_t *right = p->right;
  const bmk1880v2_matrix_lmem_t *bias = p->bias;

  check_matrix(ctx, res);
  check_matrix(ctx, left);
  check_matrix(ctx, right);
  if (bias)
    check_matrix(ctx, bias);

  ASSERT(p->lshift_bits < 32);
  ASSERT(!(p->relu_enable && p->add_result));
  /* ps32_mode bit 1 set: intermediate partial-sum pass — no post
   * processing (relu/bias/shift) is allowed. */
  if(p->ps32_mode & 0x2)
  {
    ASSERT(!p->relu_enable);
    ASSERT(!p->bias);
    ASSERT(!p->rshift_bits);
  }

  /* Dimension check: (left_row x left_col) x (right_row x right_col). */
  uint32_t left_row = left->shape.n;
  uint32_t left_col = left->shape.col;
  uint32_t right_row = right->shape.n;
  uint32_t right_col = right->shape.col;
  uint32_t res_row = res->shape.n;
  uint32_t res_col = res->shape.col;
  ASSERT(left_col == right_row);
  ASSERT(res_col == right_col);
  /* QDM requantization always produces an 8-bit result. */
  ASSERT(p->res_is_int8 == 1);

  if(p->ps32_mode)
  {
    ASSERT(!p->add_result);
  }
  else if (p->add_result) {
    /* 16-bit accumulate target stored as two row-planes. */
    ASSERT(res_row == left_row * 2);
    res_row = left_row;
  } else {
    ASSERT(res_row == left_row);
  }

  tiu_reg_t reg;
  reset_tiu_reg(&reg);

  reg.cmd_en = 1;
  reg.tsk_typ = DCR_TYPE_FC_FIX8B;
  reg.tsk_opd_num = bias? 3: 2;
  /* Arithmetic shift whenever any operand is signed. */
  reg.opt_shift_typ = is_arith_shift(p);
  reg.opt_right_shift = p->rshift_bits;
  reg.opt_left_shift = p->lshift_bits;
  reg.opt_relu = p->relu_enable;
  reg.opt_res_add = p->add_result;

  reg.res0_addr = res->start_address;
  reg.opt_res0_int8 = 1;
  reg.opt_res0_sign = matrix_is_signed(res);
  reg.res0_n = res_row;
  reg.res0_c = res->shape.c;
  reg.res0_h = 1;
  reg.res0_w = res->shape.w;
  reg.short_res0_str = 0; // stride, b_stride calculated by H/W

  /* Operand 0: left matrix (n rows spread over c channels of width w). */
  reg.opd0_addr = left->start_address;
  reg.opt_opd0_int8 = 1;
  reg.opt_opd0_sign = (left->fmt == FMT_I8);
  reg.opd0_n = left_row;
  reg.opd0_c = left->shape.c;
  reg.opd0_h = 1;
  reg.opd0_w = left->shape.w;
  reg.short_opd0_str = 0;

  /* Operand 1: right matrix. opd1_w is the width of the LAST channel of
   * the left matrix (columns may not divide evenly across channels). */
  reg.opd1_addr = right->start_address;
  reg.opt_opd1_int8 = 1;
  reg.opt_opd1_sign = (right->fmt == FMT_I8);
  reg.opd1_n = right_row;
  reg.opd1_c = right->shape.c;
  reg.opd1_h = 1;
  reg.opd1_w = left_col - left->shape.w * (left->shape.c - 1);
  reg.short_opd1_str = 0;

  /* Partial-sum (32-bit accumulator) mode: result plane stride spans the
   * whole result matrix. */
  reg.ps32_md = p->ps32_mode;
  if (p->ps32_mode > 0)
    reg.res0_b_str = p->res->shape.n * p->res->stride.n;
  if(reg.opd0_c == 1)
    ASSERT(reg.opd0_w == reg.opd1_w);

  // Only enable 32-bit multipler at the final post processing stage
  reg.opt_chl_quan = ((p->ps32_mode == 0) || (p->ps32_mode == 1)) ? 1 : 0;
  reg.quan_m = p->quan_m;

  // 32b bias, determined by b_stride
  if (bias) {
    ASSERT(bias->shape.n == 4);
    ASSERT(bias->shape.c == right->shape.c);
    ASSERT(bias->shape.w == right->shape.w);
    ASSERT(bias->shape.col == right->shape.col);

    reg.opd2_addr = bias->start_address;
    reg.opt_opd2_int8 = 0;
    reg.opt_opd2_sign = (bias->fmt == FMT_I8);
    reg.opd2_n = 1;
    reg.opd2_c = bias->shape.c;
    reg.opd2_h = 1;
    reg.opd2_w = bias->shape.w;
    reg.short_opd2_str = 0;
  }

  /* [15:0] layer id */
  reg.layer_info = p->layer_id;

  return emit_tiu_cmdbuf(ctx, &reg);
}
|
||||
64
cvikernel/src/bm1880v2/tiu_max_pooling.c
Normal file
64
cvikernel/src/bm1880v2/tiu_max_pooling.c
Normal file
@ -0,0 +1,64 @@
|
||||
#include "kernel_1880v2.h"
|
||||
|
||||
/*
 * Emit a TIU max-pooling command (depthwise/pooling engine, tsk_eu_typ 0):
 * ofmap = maxpool(ifmap, kh x kw, strides, padding). Supports fixed-point
 * and bf16 inputs; padded positions are filled with ins_val / ins_fp.
 * Returns the op handle from emit_tiu_cmdbuf().
 */
bmk1880v2_op_t * bmk1880v2_tiu_max_pooling(
    ctx_t *ctx,
    const bmk1880v2_tiu_max_pooling_param_t *p)
{
  /* bf16 mode is selected purely by the input format. */
  int bf16_enable = (p->ifmap->fmt == FMT_BF16) ? 1 : 0;

  check_tiu_tensor_2(p->ifmap, p->ofmap);
  ASSERT(p->kh * p->kw >= 1);
  /* Pooling preserves batch and channel dimensions. */
  ASSERT(p->ifmap->shape.n == p->ofmap->shape.n);
  ASSERT(p->ifmap->shape.c == p->ofmap->shape.c);
  if (bf16_enable) {
    assert_bf16_stride_type_0(ctx, p->ifmap);
    assert_bf16_stride_type_0(ctx, p->ofmap);
  } else {
    assert_stride_type_0(ctx, p->ifmap);
    assert_stride_type_0(ctx, p->ofmap);
  }
  /* Input signedness propagates to output and shift type. */
  int opd0_sign = tensor_is_signed(p->ifmap);

  tiu_reg_t reg;
  reset_tiu_reg(&reg);
  reg.cmd_en = 1;
  reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B;
  reg.tsk_eu_typ = 0;
  reg.opt_relu = 0; /* Hardware relu function not validated. */
  reg.opt_right_shift = 0;
  reg.opt_shift_typ = opd0_sign;
  reg.tsk_opd_num = 1;
  reg.opd_typ = bf16_enable ? 1: 0;

  /* Result tensor. */
  reg.res0_addr = p->ofmap->start_address;
  reg.opt_res0_sign = opd0_sign;
  reg.opt_res0_int8 = 1;
  reg.res0_n = p->ofmap->shape.n;
  reg.res0_c = p->ofmap->shape.c;
  reg.res0_h = p->ofmap->shape.h;
  reg.res0_w = p->ofmap->shape.w;
  /* Pad fill value: default to INT8_MIN for signed input when no
   * explicit value given, so padding never wins the max. */
  //reg.opd0_ins_val = bf16_enable ? 0 : (uint32_t)p->ins_val;
  reg.opd0_ins_val = (!p->ins_val && opd0_sign) ? -128 : p->ins_val; // backend not set yet
  reg.opd0_ins_fp = bf16_enable ? p->ins_fp : 0;
  reg.opd0_addr = p->ifmap->start_address;
  reg.opt_opd0_sign = opd0_sign;
  reg.opt_opd0_int8 = 1;
  reg.opd0_n = p->ifmap->shape.n;
  reg.opd0_c = p->ifmap->shape.c;
  reg.opd0_h = p->ifmap->shape.h;
  reg.opd0_w = p->ifmap->shape.w;
  reg.conv_opd0_up_pad = p->pad_top;
  reg.conv_opd0_dn_pad = p->pad_bottom;
  reg.conv_opd0_lf_pad = p->pad_left;
  reg.conv_opd0_rt_pad = p->pad_right;

  /* Operand 1 carries only the pooling window geometry and strides. */
  reg.opt_opd1_int8 = 1;
  reg.opd1_h = p->kh;
  reg.opd1_w = p->kw;
  reg.conv_op_x_str = p->stride_w;
  reg.conv_op_y_str = p->stride_h;

  /* [15:0] layer id */
  reg.layer_info = p->layer_id;
  return emit_tiu_cmdbuf(ctx, &reg);
}
|
||||
60
cvikernel/src/bm1880v2/tiu_mdsum.c
Normal file
60
cvikernel/src/bm1880v2/tiu_mdsum.c
Normal file
@ -0,0 +1,60 @@
|
||||
#include "kernel_1880v2.h"
|
||||
|
||||
/*
 * Emit a TIU multi-dimension sum (reduce over h*w per channel) command.
 *
 * The result tensor must be 1x1 spatially with the same channel count as
 * the input; its n is 1 for an int8 result or 2 for a 16-bit result
 * (high/low byte planes).
 *
 * ctx: command-buffer context.
 * p:   mdsum parameters (input/result tensors, rshift_bits, layer id).
 * Returns the op handle produced by emit_tiu_cmdbuf().
 */
bmk1880v2_op_t * bmk1880v2_tiu_mdsum(
  ctx_t *ctx,
  const bmk1880v2_tiu_mdsum_param_t *p)
{
  const bmk1880v2_tensor_lmem_t *res = p->res;
  const bmk1880v2_tensor_lmem_t *input = p->input;

  check_tiu_tensor_2(res, input);
  ASSERT(res->fmt == input->fmt);
  /* 8-bit results fit one plane; wider results need a high/low pair. */
  if (p->res_is_int8)
    ASSERT(res->shape.n == 1);
  else
    ASSERT(res->shape.n == 2);
  ASSERT(res->shape.c == input->shape.c);
  ASSERT(res->shape.h == 1);
  ASSERT(res->shape.w == 1);

  int res_addr = res->start_address;

  tiu_reg_t reg;
  reset_tiu_reg(&reg);

  reg.cmd_en = 1;
  /* mdsum is a mode of the fixed-point tensor-arithmetic datapath. */
  reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
  reg.tens_mdsum = 1;
  reg.tsk_opd_num = 1;
  reg.opt_relu = 0;

  /* Arithmetic (sign-extending) shift when the result is signed. */
  int arith_shift = tensor_is_signed(res);
  reg.opt_shift_typ = arith_shift;
  reg.opt_right_shift = p->rshift_bits;

  /* Operand 0: the full input tensor with its real strides; w stride is
   * forced to 1 (contiguous elements). */
  reg.opd0_addr = input->start_address;
  reg.opt_opd0_sign = tensor_is_signed(input);
  reg.opt_opd0_int8 = 1;
  reg.opd0_n = input->shape.n;
  reg.opd0_c = input->shape.c;
  reg.opd0_h = input->shape.h;
  reg.opd0_w = input->shape.w;
  reg.opd0_n_str = input->stride.n;
  reg.opd0_c_str = input->stride.c;
  reg.opd0_h_str = input->stride.h;
  reg.opd0_w_str = 1;

  /* Result: one value per channel. */
  reg.res0_addr = res_addr;
  reg.opt_res0_sign = tensor_is_signed(res);
  reg.opt_res0_int8 = p->res_is_int8;
  reg.res0_n = 1;
  reg.res0_c = res->shape.c;
  reg.res0_h = 1;
  reg.res0_w = 1;
  /* Short stride mode 0b01 - hardware-defined result layout. */
  reg.short_res0_str = 0b01;

  /* [15:0] layer id */
  reg.layer_info = p->layer_id;

  return emit_tiu_cmdbuf(ctx, &reg);
}
|
||||
67
cvikernel/src/bm_kernel.c
Normal file
67
cvikernel/src/bm_kernel.c
Normal file
@ -0,0 +1,67 @@
|
||||
#include "kernel_internal.h"
|
||||
|
||||
shape_t shape_t4(int n, int c, int h, int w)
|
||||
{
|
||||
shape_t s;
|
||||
s.n = n;
|
||||
s.c = c;
|
||||
s.h = h;
|
||||
s.w = w;
|
||||
s.dim = 4;
|
||||
return s;
|
||||
}
|
||||
|
||||
shape_t shape_t3(int c, int h, int w)
|
||||
{
|
||||
shape_t s;
|
||||
s.n = 1;
|
||||
s.c = c;
|
||||
s.h = h;
|
||||
s.w = w;
|
||||
s.dim = 3;
|
||||
return s;
|
||||
}
|
||||
|
||||
shape_t shape_t2(int row, int col)
|
||||
{
|
||||
shape_t s;
|
||||
s.n = 1;
|
||||
s.c = 1;
|
||||
s.h = row;
|
||||
s.w = col;
|
||||
s.dim = 2;
|
||||
return s;
|
||||
}
|
||||
|
||||
shape_t shape_t1(int len)
|
||||
{
|
||||
int row = 1, col = len;
|
||||
while (col >= 65536) {
|
||||
ASSERT(col % 2 == 0);
|
||||
col /= 2;
|
||||
row *= 2;
|
||||
}
|
||||
shape_t s = {
|
||||
.dim = 2,
|
||||
.n = 1,
|
||||
.c = 1,
|
||||
.h = row,
|
||||
.w = col,
|
||||
};
|
||||
return s;
|
||||
}
|
||||
|
||||
uint8_t shape_equal(shape_t s1, shape_t s2)
|
||||
{
|
||||
return (s1.dim == s2.dim) &&
|
||||
(s1.n == s2.n) &&
|
||||
(s1.c == s2.c) &&
|
||||
(s1.h == s2.h) &&
|
||||
(s1.w == s2.w);
|
||||
}
|
||||
|
||||
void tl_reshape(tensor_lmem *tlp, shape_t shape)
|
||||
{
|
||||
ASSERT(tlp);
|
||||
tlp->shape = shape;
|
||||
}
|
||||
24
cvikernel/src/bmkernel_standard.h
Normal file
24
cvikernel/src/bmkernel_standard.h
Normal file
@ -0,0 +1,24 @@
|
||||
#ifndef BMKERNEL_STANDARD_H
#define BMKERNEL_STANDARD_H
#include <bmkernel/bm_kernel.h>
#include "kernel_internal.h"
#include <cvikernel/cvikernel.h>

/*
 * Per-context state for the legacy bmkernel command-buffer builder.
 * Field meanings mirror the cvk_prv_data_t usage in the cv180x/cv181x
 * backends (bump allocators over cmdbuf and local memory).
 */
typedef struct bmk_context {
  bmk_info_t info;              /* user-supplied configuration */
  cvk_chip_info_t chip_info;    /* target chip capabilities */

  ec_t ec;                      /* engine-sync descriptor bookkeeping */
  mode_manager_t mode_manager;  /* parallel/serial issue-mode tracking */

  uint32_t cmdbuf_ptr;          /* byte offset of next free cmdbuf space */
  uint32_t max_nr_desc;         /* capacity of desc_pairs[] */
  uint32_t cur_nr_desc;         /* descriptors emitted so far */
  desc_pair_t *desc_pairs;      /* (cmd header, ec descriptor) pairs */

  uint32_t lmem_ptr;            /* bump-allocator offset into local memory */
  uint16_t layer_id;            /* layer id stamped into emitted commands */
  void* op; //<! compress used
} bmk_context_t, ctx_t;

#endif // BMKERNEL_STANDARD_H
|
||||
885
cvikernel/src/cv180x/cvkcv180x.c
Normal file
885
cvikernel/src/cv180x/cvkcv180x.c
Normal file
@ -0,0 +1,885 @@
|
||||
#include "cvkcv180x.h"
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/*
 * Width in bits of one element of the given format.
 * Unknown formats fall through to 32 bits (widest supported element).
 */
static inline int bitsize_of_fmt(cvk_fmt_t fmt)
{
  switch (fmt) {
  case CVK_FMT_I8:
  case CVK_FMT_U8:
    return 8;
  case CVK_FMT_F16:
  case CVK_FMT_I16:
  case CVK_FMT_U16:
  case CVK_FMT_BF16:
    return 16;
  case CVK_FMT_F32:
  case CVK_FMT_I32:
  default:
    return 32;
  }
}
|
||||
|
||||
/*
 * Patch engine-sync command ids into an already-emitted descriptor.
 *
 * desc points at the raw register words of a TIU or TDMA command; ids[]
 * holds per-engine sync ids (produced by ec_compute_sync_ids() in the
 * caller).  The descriptor is parsed, its id/wait fields rewritten, and
 * re-emitted in place.  Descriptors of other engines are left untouched.
 */
static void cvkcv180x_replace_cmd_id(uint32_t *desc, uint32_t eng_id, uint16_t ids[])
{
  if (eng_id == CV180X_TIU) {
    tiu_reg_t reg;
    parse_tiu_reg(&reg, desc);
    reg.cmd_id_en = 1;
    reg.cmd_id_tpu = ids[eng_id];
    /* TIU waits on the most recent TDMA transfer. */
    reg.cmd_id_gdma = ids[CV180X_TDMA];
    emit_tiu_reg(&reg, desc);
  } else if (eng_id == CV180X_TDMA) {
    tdma_reg_t tdma_reg;
    parse_tdma_reg(&tdma_reg, desc);
    tdma_reg.cmd_id = ids[eng_id];
    /* TDMA waits on the most recent TIU computation. */
    tdma_reg.wait_id_tpu = ids[CV180X_TIU];
    tdma_reg.bar_en = 1;
    emit_tdma_reg(&tdma_reg, desc);
  }
}
|
||||
|
||||
/*
 * Byte length of one command descriptor for the given engine.
 * Unknown engines yield 0 (the CPU engine is not supported on cv180x).
 */
static int cvkcv180x_get_engine_desc_length(uint32_t engine_id)
{
  if (engine_id == CV180X_TIU)
    return TIU_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t);
  if (engine_id == CV180X_TDMA)
    return TDMA_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t);
  return 0;
}

// Estimate the number of command descriptor based on buffer size provided
// by the user.  Each descriptor is preceded by a cmd_hdr_t, and the
// worst-case (largest) engine descriptor is assumed for every slot.
static uint32_t cvkcv180x_estimate_nr_desc(uint32_t cmdbuf_size)
{
  uint32_t tiu_len = cvkcv180x_get_engine_desc_length(CV180X_TIU);
  uint32_t tdma_len = cvkcv180x_get_engine_desc_length(CV180X_TDMA);
  uint32_t hdr_len = sizeof(cmd_hdr_t);
  uint32_t worst_len = (tdma_len > tiu_len) ? tdma_len : tiu_len;

  return cmdbuf_size / (worst_len + hdr_len);
}
|
||||
|
||||
/*
 * Carve the next command header + descriptor slot out of the user-supplied
 * command buffer.  Returns NULL when the buffer is full.
 */
static cmd_hdr_t *kernel_alloc_cmd_hdr(
    cvk_context_t *ctx, uint8_t eng_id, uint32_t desc_len)
{
  cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data;
  uint32_t free_len = prv_data->cmdbuf_size - prv_data->cmdbuf_ptr;
  uint32_t hdr_len = sizeof(cmd_hdr_t);
  uint32_t total_len = hdr_len + desc_len;

  if (total_len > free_len)
    return NULL;

  cmd_hdr_t *hdr = (cmd_hdr_t *)&prv_data->cmdbuf[prv_data->cmdbuf_ptr];
  hdr->magic = 0xA8; // CMDBUF_HDR_MAGIC_180X
  hdr->len = desc_len;
  hdr->engine_id = eng_id;
  hdr->__deprecated = 0; // for valgrind
  hdr->flags = 0;
  hdr->mask = 0;

  prv_data->cmdbuf_ptr += total_len;
  return hdr;
}

/*
 * Reserve the next (command header, ec descriptor) pair for an engine and
 * register it with the mode manager.  Returns NULL when eng_id is invalid
 * or the descriptor table is exhausted.
 *
 * NOTE(review): kernel_alloc_cmd_hdr() may return NULL (cmdbuf full) and
 * the result is stored unchecked in dp->cmd_hdr - confirm callers tolerate
 * a pair with a NULL header.
 */
static desc_pair_t *kernel_alloc_desc_pair(cvk_context_t *ctx, uint8_t eng_id)
{
  cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data;

  if (eng_id >= CV180X_ENGINE_NUM || prv_data->cur_nr_desc >= prv_data->max_nr_desc)
    return NULL;

  uint32_t desc_len = cvkcv180x_get_engine_desc_length(eng_id);
  desc_pair_t *dp = &prv_data->desc_pairs[prv_data->cur_nr_desc++];
  dp->cmd_hdr = kernel_alloc_cmd_hdr(ctx, eng_id, desc_len);
  dp->ec_desc = ec_alloc_desc(&prv_data->ec, eng_id);

  mode_manager_record_ec_desc(&prv_data->mode_manager, dp->ec_desc);
  return dp;
}

/*
 * Recompute inter-engine sync ids and patch them back into every emitted
 * descriptor.  Called once when the finished cmdbuf is acquired.
 */
static void cvkcv180x_update_sync_id(cvk_context_t *ctx)
{
  cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data;
  ec_compute_sync_ids(&prv_data->ec);

  for (uint32_t di = 0; di < prv_data->cur_nr_desc; di++) {
    desc_pair_t *dp = &prv_data->desc_pairs[di];
    uint8_t eng_id = dp->ec_desc->engine_id;
    uint32_t *desc = (uint32_t *)dp->cmd_hdr->cmd;
    cvkcv180x_replace_cmd_id(desc, eng_id, dp->ec_desc->sync_ids);
  }
}
|
||||
|
||||
/*
 * Reserve the next (command header, ec descriptor) pair for the given
 * engine.  Returns NULL when eng_id is out of range or the descriptor
 * table / command buffer is exhausted.
 *
 * A stale "#if 0" CPU-engine flush path inherited from the BMK1822 port
 * was removed here: it referenced symbols (BMK1822_CPU, kernel_update_
 * sync_id, k->...) that do not exist in this cv180x backend.
 */
desc_pair_t *cvkcv180x_get_desc_pair(cvk_context_t *ctx, uint8_t eng_id)
{
  return kernel_alloc_desc_pair(ctx, eng_id);
}
|
||||
|
||||
void cvkcv180x_cleanup(cvk_context_t *ctx)
|
||||
{
|
||||
cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data;
|
||||
|
||||
free(prv_data->desc_pairs);
|
||||
ec_destroy(&prv_data->ec);
|
||||
mode_manager_destroy(&prv_data->mode_manager);
|
||||
}
|
||||
|
||||
void cvkcv180x_reset(cvk_context_t *ctx)
|
||||
{
|
||||
cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data;
|
||||
|
||||
prv_data->cur_nr_desc = 0;
|
||||
prv_data->cmdbuf_ptr = 0;
|
||||
|
||||
ec_reset(&prv_data->ec);
|
||||
mode_manager_reset(&prv_data->mode_manager);
|
||||
}
|
||||
|
||||
static uint8_t *cvkcv180x_acquire_cmdbuf(cvk_context_t *ctx, uint32_t *size)
|
||||
{
|
||||
cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data;
|
||||
|
||||
*size = prv_data->cmdbuf_ptr;
|
||||
cvkcv180x_update_sync_id(ctx);
|
||||
return prv_data->cmdbuf;
|
||||
}
|
||||
|
||||
void cvkcv180x_set_layer_id(
|
||||
struct cvikernel_context *ctx,
|
||||
uint16_t layer_id)
|
||||
{
|
||||
cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data;
|
||||
|
||||
prv_data->layer_id = layer_id;
|
||||
}
|
||||
|
||||
void cvkcv180x_parallel_enable(struct cvikernel_context *ctx)
|
||||
{
|
||||
cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data;
|
||||
|
||||
mode_manager_enable_parallel(&prv_data->mode_manager);
|
||||
}
|
||||
|
||||
void cvkcv180x_parallel_disable(struct cvikernel_context *ctx)
|
||||
{
|
||||
cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data;
|
||||
|
||||
mode_manager_disable_parallel(&prv_data->mode_manager);
|
||||
}
|
||||
|
||||
cvk_tl_stride_t cvkcv180x_tl_default_stride(
|
||||
cvk_context_t *ctx,
|
||||
cvk_tl_shape_t s,
|
||||
cvk_fmt_t fmt_type,
|
||||
int eu_align)
|
||||
{
|
||||
cvk_tl_stride_t stride;
|
||||
uint32_t eu_num = ctx->info.eu_num;
|
||||
uint32_t npu_num = ctx->info.npu_num;
|
||||
uint32_t fmt = (fmt_type == CVK_FMT_BF16) ? 2 : 1;
|
||||
stride.w = fmt;
|
||||
stride.h = s.w * fmt;
|
||||
if (eu_align)
|
||||
stride.c = align_up(s.h * s.w * fmt, eu_num);
|
||||
else
|
||||
stride.c = s.h * s.w * fmt;
|
||||
|
||||
stride.n = stride.c * ceiling_func(s.c, npu_num);
|
||||
|
||||
return stride;
|
||||
}
|
||||
|
||||
void cvkcv180x_lmem_init_tensor(
|
||||
struct cvikernel_context *ctx,
|
||||
cvk_tl_t *tl,
|
||||
cvk_tl_shape_t shape,
|
||||
cvk_fmt_t fmt,
|
||||
int eu_align)
|
||||
{
|
||||
memset(tl, 0, sizeof(*tl));
|
||||
tl->fmt = fmt;
|
||||
tl->shape = shape;
|
||||
tl->eu_align = eu_align;
|
||||
tl->stride = cvkcv180x_tl_default_stride(ctx, shape, fmt, eu_align);
|
||||
}
|
||||
|
||||
uint32_t cvkcv180x_lmem_tensor_to_size(
|
||||
struct cvikernel_context *ctx,
|
||||
cvk_tl_shape_t shape,
|
||||
cvk_fmt_t fmt,
|
||||
int eu_align)
|
||||
{
|
||||
uint32_t eu_num = ctx->info.eu_num;
|
||||
|
||||
cvk_tl_stride_t stride;
|
||||
stride = cvkcv180x_tl_default_stride(ctx, shape, fmt, eu_align);
|
||||
|
||||
uint32_t needed = align_up(shape.n * stride.n, eu_num);
|
||||
|
||||
return needed;
|
||||
}
|
||||
|
||||
/*
 * Allocate a local-memory tensor from the context's bump allocator.
 *
 * Returns a heap-allocated descriptor whose start_address is the current
 * lmem offset, or NULL when malloc fails, local memory is exhausted, or
 * the computed size is zero.
 */
cvk_tl_t *cvkcv180x_lmem_alloc_tensor(
    cvk_context_t *ctx,
    cvk_tl_shape_t shape,
    cvk_fmt_t fmt,
    int eu_align)
{
  cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data;
  uint32_t lmem_size = ctx->info.lmem_size;
  uint32_t eu_num = ctx->info.eu_num;

  cvk_tl_t *t = malloc(sizeof(*t));
  if (!t)
    return NULL;

  memset(t, 0, sizeof(*t));
  t->start_address = prv_data->lmem_ptr;
  t->fmt = fmt;
  t->cmprs_fmt = fmt;
  t->shape = shape;
  t->eu_align = eu_align;
  t->stride = cvkcv180x_tl_default_stride(ctx, shape, fmt, eu_align);

  /* Whole-tensor footprint, padded to the EU count. */
  uint32_t needed = align_up(t->shape.n * t->stride.n, eu_num);
  if ((lmem_size - prv_data->lmem_ptr < needed) || !needed) {
    free(t);
    return NULL;
  }

  prv_data->lmem_ptr += needed;
  return t;
}

/*
 * Free a tensor previously returned by cvkcv180x_lmem_alloc_tensor.
 *
 * The bump allocator is rewound to the tensor's start address, so tensors
 * must be freed in reverse (LIFO) order of allocation; freeing out of
 * order only logs a warning and still rewinds.
 */
void cvkcv180x_lmem_free_tensor(
    struct cvikernel_context *ctx,
    const cvk_tl_t *tl)
{
  cvk_prv_data_t *prv_data;

  if (!ctx || !tl)
    return;

  prv_data = (cvk_prv_data_t *)ctx->priv_data;

  if (tl->start_address >= prv_data->lmem_ptr)
    printf("cvkcv180x lm free tensor: ptr out of range\n");

  prv_data->lmem_ptr = tl->start_address;

  /* Cast away const: the descriptor itself was heap-allocated by us. */
  free((void *)tl);
}
|
||||
|
||||
/*
 * Fold a matrix's column count into (c, w) so the columns spread across
 * NPUs/EUs.  For BF16 each EU lane holds half as many elements.  Small
 * matrices (col < usable EU width) get a single channel of exactly col
 * elements so only needed data is transferred.
 */
static void try_optimize_matrix_shape(cvk_context_t *ctx, cvk_ml_shape_t *s,
                                      cvk_fmt_t fmt_type) {
  uint32_t eu_num = ctx->info.eu_num;
  uint32_t npu_num = ctx->info.npu_num;
  uint32_t col = s->col;
  uint8_t isBf16 = (fmt_type == CVK_FMT_BF16);
  uint32_t workingNumber = isBf16 ? eu_num / 2 : eu_num;

  if (col >= workingNumber) {
    int num_eu = ceiling_func(col, workingNumber * npu_num);
    s->w = workingNumber * num_eu;
    s->c = ceiling_func(col, s->w);
  } else {
    // col < EU_NUM
    // Only transfer needed data
    // We still change tensor shape in TIU mac op
    s->w = col;
    s->c = 1;
  }
}

/* Default matrix shape for a row x col matrix: n = rows, then (c, w)
 * derived from col by try_optimize_matrix_shape. */
cvk_ml_shape_t cvkcv180x_ml_default_shape(
    struct cvikernel_context *ctx,
    uint32_t row,
    uint32_t col,
    cvk_fmt_t fmt_type)
{
  cvk_ml_shape_t shape = {0};
  shape.n = row;
  shape.col = col;

  try_optimize_matrix_shape(ctx, &shape, fmt_type);

  return shape;
}

/*
 * Default local-memory strides for a matrix; mirrors the tensor stride
 * rules (BF16 = 2 bytes/elt, optional EU alignment of the channel slice).
 * NOTE(review): no stride.w is assigned here - confirm cvk_ml_stride_t
 * has no w member (the tensor variant does set one).
 */
cvk_ml_stride_t cvkcv180x_ml_default_stride(
    struct cvikernel_context *ctx,
    cvk_ml_shape_t shape,
    cvk_fmt_t fmt,
    int eu_align)
{
  uint32_t npu_num = ctx->info.npu_num;
  uint32_t eu_num = ctx->info.eu_num;
  uint32_t val = (fmt == CVK_FMT_BF16) ? 2 : 1;

  cvk_ml_stride_t stride;
  stride.h = shape.w * val;
  if (eu_align)
    stride.c = align_up(shape.w * val, eu_num);
  else
    stride.c = shape.w * val;
  stride.n = stride.c * ceiling_func(shape.c, npu_num);

  return stride;
}

/*
 * Matrix shape for a flat vector of `len` elements: halve the column
 * count (doubling rows) until it fits below lmem_size, then optimize the
 * (c, w) split.  Returns an all-zero shape when len cannot be halved.
 */
cvk_ml_shape_t cvkcv180x_ml_shape_t1(
    struct cvikernel_context *ctx,
    uint32_t len,
    cvk_fmt_t fmt_type)
{
  uint32_t lmem_size = ctx->info.lmem_size;
  cvk_ml_shape_t shape = {0};

  uint32_t row = 1;
  uint32_t col = len;

  while (col >= lmem_size) {
    if (col % 2)
      return shape;

    col /= 2;
    row *= 2;
  }

  shape.n = row;
  shape.col = col;

  try_optimize_matrix_shape(ctx, &shape, fmt_type);
  return shape;
}

/* Initialize a matrix descriptor in place with default strides;
 * start_address stays 0 for the caller to fill in. */
void cvkcv180x_lmem_init_matrix(
    struct cvikernel_context *ctx,
    cvk_ml_t *ml,
    cvk_ml_shape_t shape,
    cvk_fmt_t fmt,
    int eu_align)
{
  memset(ml, 0, sizeof(*ml));
  ml->fmt = fmt;
  ml->shape = shape;
  ml->stride = cvkcv180x_ml_default_stride(ctx, shape, fmt, eu_align);
  ml->eu_align = eu_align;
}
|
||||
|
||||
|
||||
uint32_t cvkcv180x_lmem_matrix_to_size(
|
||||
struct cvikernel_context *ctx,
|
||||
cvk_ml_shape_t shape,
|
||||
cvk_fmt_t fmt,
|
||||
int eu_align)
|
||||
{
|
||||
uint32_t npu_num = ctx->info.npu_num;
|
||||
uint32_t eu_num = ctx->info.eu_num;
|
||||
uint32_t val = (fmt == CVK_FMT_BF16) ? 2 : 1;
|
||||
|
||||
cvk_ml_t t;
|
||||
t.fmt = fmt;
|
||||
t.shape = shape;
|
||||
t.stride.h = shape.w * val;
|
||||
if (eu_align)
|
||||
t.stride.c = align_up(shape.w * val, eu_num);
|
||||
else
|
||||
t.stride.c = shape.w * val;
|
||||
t.stride.n = t.stride.c * ceiling_func(shape.c, npu_num);
|
||||
|
||||
uint32_t needed = align_up(t.shape.n * t.stride.n, eu_num);
|
||||
|
||||
return needed;
|
||||
}
|
||||
|
||||
uint32_t cvkcv180x_lmem_ps32_matrix_to_size(
|
||||
struct cvikernel_context *ctx,
|
||||
cvk_ml_shape_t shape,
|
||||
cvk_fmt_t fmt,
|
||||
int eu_align)
|
||||
{
|
||||
/* Partial sum is located in lmem in 32-bit format, so we times n to 4 to
|
||||
* spare a sapce for it.
|
||||
*/
|
||||
|
||||
shape.n = shape.n * (bitsize_of_fmt(CVK_FMT_I32) / bitsize_of_fmt(fmt));
|
||||
|
||||
return cvkcv180x_lmem_matrix_to_size(ctx, shape, fmt, eu_align);
|
||||
|
||||
}
|
||||
|
||||
/*
 * Allocate a local-memory matrix from the context's bump allocator.
 * Strides are computed inline (same formulas as ml_default_stride).
 * Returns NULL on malloc failure or when local memory is exhausted.
 *
 * NOTE(review): unlike cvkcv180x_lmem_alloc_tensor there is no "|| !needed"
 * zero-size guard here - confirm whether a zero-sized matrix allocation is
 * intentionally allowed.
 */
cvk_ml_t *cvkcv180x_lmem_alloc_matrix(
    cvk_context_t *ctx,
    cvk_ml_shape_t s,
    cvk_fmt_t fmt,
    int eu_align)
{
  cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data;
  uint32_t lmem_size = ctx->info.lmem_size;
  uint32_t npu_num = ctx->info.npu_num;
  uint32_t eu_num = ctx->info.eu_num;
  uint32_t val = (fmt == CVK_FMT_BF16) ? 2 : 1;  /* bytes per element */

  cvk_ml_t *t = malloc(sizeof(*t));
  if (!t)
    return NULL;

  memset(t, 0, sizeof(*t));
  t->start_address = prv_data->lmem_ptr;
  t->fmt = fmt;
  t->shape = s;
  t->stride.h = s.w * val;
  if (eu_align)
    t->stride.c = align_up(s.w * val, eu_num);
  else
    t->stride.c = s.w * val;
  t->stride.n = t->stride.c * ceiling_func(s.c, npu_num);
  t->eu_align = eu_align;

  /* Whole-matrix footprint, padded to the EU count. */
  uint32_t needed = align_up(t->shape.n * t->stride.n, eu_num);
  if (lmem_size - prv_data->lmem_ptr < needed) {
    free(t);
    return NULL;
  }
  prv_data->lmem_ptr += needed;

  return t;
}

/*
 * Free a matrix allocated by cvkcv180x_lmem_alloc_matrix.  Rewinds the
 * bump allocator to the matrix's start address, so frees must happen in
 * reverse (LIFO) allocation order; out-of-order frees only log a warning.
 */
void cvkcv180x_lmem_free_matrix(
    struct cvikernel_context *ctx,
    const cvk_ml_t *ml)
{
  cvk_prv_data_t *prv_data;

  if (!ctx || !ml)
    return;

  prv_data = (cvk_prv_data_t *)ctx->priv_data;

  if (ml->start_address >= prv_data->lmem_ptr)
    printf("cvkcv180x lm free matrix: ptr out of range\n");

  prv_data->lmem_ptr = ml->start_address;
  /* Cast away const: the descriptor itself was heap-allocated by us. */
  free((void *)ml);
}

/*
 * Allocate a matrix sized for 32-bit partial sums: rows are scaled by
 * 32 / bitsize(fmt) for the allocation, then the descriptor's visible
 * row count is restored so TIU ops see the logical shape.
 */
cvk_ml_t *cvkcv180x_lmem_alloc_ps32_matrix(
    cvk_context_t *ctx,
    cvk_ml_shape_t shape,
    cvk_fmt_t fmt,
    int eu_align)
{
  /* Partial sum is located in lmem in 32-bit format, so we times n to 4 to
   * spare a space for it.
   */

  uint32_t prev_n;

  prev_n = shape.n;
  shape.n = shape.n * (bitsize_of_fmt(CVK_FMT_I32) / bitsize_of_fmt(fmt));
  cvk_ml_t *res = cvkcv180x_lmem_alloc_matrix(ctx, shape, fmt, eu_align);

  if(res == NULL) {
    printf("cvkcv180x: alloc ps32 matrix fail\n");
    return NULL;
  }

  /* Expose the logical (unscaled) row count to the caller. */
  res->shape.n = prev_n;
  return res;
}
|
||||
|
||||
cvk_tg_stride_t cvkcv180x_tg_default_stride(
|
||||
struct cvikernel_context *ctx,
|
||||
cvk_tg_shape_t shape,
|
||||
cvk_fmt_t fmt)
|
||||
{
|
||||
uint32_t data_type_size = (fmt == CVK_FMT_BF16) ? 2 : 1;
|
||||
cvk_tg_stride_t stride;
|
||||
stride.h = shape.w * data_type_size;
|
||||
stride.c = shape.h * stride.h;
|
||||
stride.n = shape.c * stride.c;
|
||||
stride.w = (fmt == CVK_FMT_BF16) ? 2 : 1;
|
||||
|
||||
(void)ctx;
|
||||
|
||||
return stride;
|
||||
}
|
||||
|
||||
/*
 * BF16 lookup-table evaluation with interpolation.
 *
 * Two modes:
 *  - is_scientific: decompose x into exponent/mantissa via two table
 *    lookups and multiply the partial results (used e.g. for rsqrt-style
 *    functions).
 *  - linear-interp (else branch): clamp x to [min, max], quantize to a
 *    256-entry index, then compute y = f(x0) + (x - x0) * slope(x0).
 *
 * The sequence of TDMA/TIU commands below is order-critical; each step
 * consumes the in-place result of the previous one.
 */
void cvkcv180x_tiu_bf16_lookup_interp_table(
  cvk_context_t *ctx,
  const cvk_tiu_bf16_lookup_interp_table_param_t *param)
{
  if (param->is_scientific) {
    // issue lut cmd
    cvk_tdma_l2l_tensor_copy_param_t p10;
    // remove low 8 bits by int8 copy with stride
    // get index(pow)
    memset(&p10, 0x00, sizeof(cvk_tdma_l2l_tensor_copy_param_t));
    p10.dst = param->ofmap;
    p10.src = param->ifmap;
    p10.mv_lut_base = 0; // MUST init by ifself in soc
    p10.mv_lut_idx = 1;
    p10.layer_id = param->layer_id;
    cvkcv180x_tdma_l2l_bf16_tensor_copy(ctx, &p10);
    p10.mv_lut_idx = 0;

    // get f(x0) = 2^(x0*-0.5)
    /* NOTE(review): p12 is not zero-initialized; any fields of
     * cvk_tiu_lookup_table_param_t beyond the four set here would be
     * read uninitialized - confirm the struct has only these fields. */
    cvk_tiu_lookup_table_param_t p12;
    p12.ofmap = param->ofmap;
    p12.ifmap = param->ofmap;
    p12.table = param->tbl_answer;
    p12.layer_id = param->layer_id;
    cvkcv180x_tiu_lookup_table(ctx, &p12);

    // get mantissa value
    p12.ofmap = param->buf;
    p12.ifmap = param->ifmap;
    p12.table = param->tbl_answer_mantissa;
    cvkcv180x_tiu_lookup_table(ctx, &p12);

    // (2^exp) * mantissa
    cvk_tiu_mul_param_t p1;
    p1.res_high = NULL;
    p1.res_low = param->ofmap;
    p1.a = param->ofmap;
    p1.b_is_const = 0;
    p1.b = param->buf;
    p1.rshift_bits = 0;
    p1.relu_enable = 0;
    p1.layer_id = param->layer_id;
    cvkcv180x_tiu_mul(ctx, &p1);
  }
  else {
    // duplicate from cvikernel_1880v2.c
    const cvk_tl_t *tl_ifmap = param->ifmap;
    const cvk_tl_t *tl_ofmap_slope = param->buf;
    const cvk_tl_t *tl_table_answer = param->tbl_answer;
    const cvk_tl_t *tl_table_answer_slope = param->tbl_answer_mantissa;
    const cvk_tl_t *tl_ofmap_y0 = param->ofmap;
    float min = param->min;
    float max = param->max;
    float scale = 256 / (max - min); // 256 means hw support lut index size
    uint8_t eu_align = param->eu_align;
    cvk_fmt_t fmt = CVK_FMT_BF16;

    /* Flattened (1, c, h*w, 1) view used for the int8 index tensor. */
    cvk_tl_shape_t tl_ofmap_x0_int8_shape = {
        1, tl_ifmap->shape.c, tl_ifmap->shape.h * tl_ifmap->shape.w, 1};

    // filter y = max(range_min, x)
    cvk_tiu_max_param_t p1 = {0};
    p1.max = tl_ifmap;
    p1.a = tl_ifmap;
    p1.b_is_const = 1;
    p1.b_const.is_signed = 1;
    p1.b_const.val = cvk_convert_fp32_bf16(min);
    p1.layer_id = param->layer_id;
    ctx->ops->tiu_max(ctx, &p1);

    // filter y = min(8, x)
    cvk_tiu_min_param_t p2 = {0};
    p2.min = tl_ifmap;
    p2.a = tl_ifmap;
    p2.b_is_const = 1;
    p2.b_const.val = cvk_convert_fp32_bf16(max - 1 / scale); // corner
    p2.b_const.is_signed = 1;
    p2.layer_id = param->layer_id;
    ctx->ops->tiu_min(ctx, &p2);

    cvk_tdma_l2l_tensor_copy_param_t p3 = {0};
    // scale input for remap its idx(-x~x) to (-127~127), dirty tl_ifmap
    cvk_tiu_mul_param_t p4 = {0};
    p4.res_high = NULL;
    p4.res_low = tl_ifmap;
    p4.a = tl_ifmap;
    p4.b_is_const = 1;
    p4.b_const.val = cvk_convert_fp32_bf16(scale);
    p4.rshift_bits = 0;
    p4.relu_enable = 0;
    p4.layer_id = param->layer_id;
    ctx->ops->tiu_mul(ctx, &p4);

    // <! get idx from bf16->int8
    memset(&p3, 0x00, sizeof(cvk_tdma_l2l_tensor_copy_param_t));
    cvk_tl_t dst;
    memcpy(&dst, tl_ofmap_y0, sizeof(cvk_tl_t));

    dst.shape = tl_ofmap_x0_int8_shape;
    dst.fmt = CVK_FMT_I8;
    dst.stride =
        ctx->ops->tl_default_stride(ctx, tl_ofmap_x0_int8_shape, CVK_FMT_I8, eu_align);
    /* Double the h stride so int8 indices land on bf16 element slots. */
    dst.stride.h = dst.stride.h * 2;
    dst.int8_rnd_mode = 1;
    p3.dst = &dst;
    p3.src = tl_ifmap;
    ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p3);
    dst.int8_rnd_mode = 0; // reset

    // <! int8 to bf16 format cus for sub use, sub MUST in the same format
    memset(&p3, 0x00, sizeof(cvk_tdma_l2l_tensor_copy_param_t));
    p3.dst = tl_ofmap_slope; //<! bf16
    p3.src = &dst;
    ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p3);

    // <! sub, diff base , a - b
    // (x - x0)
    /* NOTE(review): p5.layer_id is never set (stays 0 from the zero
     * initializer) - confirm this is intended for profiling output. */
    cvk_tiu_sub_param_t p5 = {0};
    p5.res_high = 0;
    p5.res_low = tl_ifmap;
    p5.a_high = 0;
    p5.a_low = tl_ifmap;
    p5.b_high = 0;
    p5.b_low = tl_ofmap_slope;
    p5.rshift_bits = 0;
    ctx->ops->tiu_sub(ctx, &p5);

    // get f(x0) and slope(x)
    // reshape, 16->16
    dst.fmt = fmt;
    dst.shape = tl_ofmap_slope->shape;
    dst.stride = tl_ofmap_slope->stride;

    // <! get slope by index
    cvk_tiu_lookup_table_param_t p6 = {0};
    memset(&p6, 0x0, sizeof(cvk_tiu_lookup_table_param_t));
    p6.ofmap = tl_ofmap_slope;
    p6.ifmap = &dst;
    p6.table = tl_table_answer_slope;
    p6.layer_id = param->layer_id;
    ctx->ops->tiu_lookup_table(ctx, &p6);

    // base f(x0)
    memset(&p6, 0x0, sizeof(cvk_tiu_lookup_table_param_t));
    p6.ofmap = tl_ofmap_y0;
    p6.ifmap = &dst;
    p6.table = tl_table_answer;
    p6.layer_id = param->layer_id;
    ctx->ops->tiu_lookup_table(ctx, &p6);

    // <! mac
    // <! part A + part B, a * b + res = res
    cvk_tiu_mac_param_t p7 = {0};
    p7.res_high = 0;
    p7.res_low = tl_ofmap_y0;
    p7.res_is_int8 = 0;
    p7.a = tl_ifmap;
    p7.b_is_const = 0;
    p7.b = tl_ofmap_slope;
    p7.lshift_bits = 0; // lshift_bits;
    p7.rshift_bits = 0; // rshift_bits;
    p7.relu_enable = 0;
    p7.layer_id = param->layer_id;
    ctx->ops->tiu_mac(ctx, &p7);
  }
}
|
||||
|
||||
void cvkcv180x_gmem_init_tensor(
|
||||
struct cvikernel_context *ctx,
|
||||
cvk_tg_t *tg,
|
||||
cvk_tg_shape_t shape,
|
||||
cvk_fmt_t fmt) {
|
||||
memset(tg, 0, sizeof(*tg));
|
||||
tg->fmt = fmt;
|
||||
tg->shape = shape;
|
||||
tg->stride = cvkcv180x_tg_default_stride(ctx, tg->shape, tg->fmt);
|
||||
}
|
||||
|
||||
static uint16_t cvkcv180x_float_to_bfloat16(
|
||||
cvk_context_t *ctx,
|
||||
float data)
|
||||
{
|
||||
(void)ctx;
|
||||
|
||||
return cvk_convert_fp32_bf16(data);
|
||||
}
|
||||
|
||||
static void cvkcv180x_bf16_table_shape(
|
||||
cvk_context_t *ctx,
|
||||
cvk_tl_shape_t *shape)
|
||||
{
|
||||
if (!ctx || !shape)
|
||||
return;
|
||||
|
||||
shape->n = 1;
|
||||
shape->c = ctx->info.npu_num;
|
||||
shape->h = 32; // hard-coded in cv180x
|
||||
shape->w = 8; // hard-coded in cv180x
|
||||
}
|
||||
|
||||
/*
 * cv180x operation dispatch table.
 *
 * Note: cv180x has no int8-specific TDMA entry points; the plain
 * tdma_* slots deliberately alias their bf16-capable implementations.
 */
static cvk_operations_t cvk_cv180x_ops = {
  /* Context lifecycle. */
  .cleanup = cvkcv180x_cleanup,
  .reset = cvkcv180x_reset,
  .acquire_cmdbuf = cvkcv180x_acquire_cmdbuf,
  .set_layer_id = cvkcv180x_set_layer_id,
  .parallel_enable = cvkcv180x_parallel_enable,
  .parallel_disable = cvkcv180x_parallel_disable,
  /* Local-memory allocation and layout helpers. */
  .lmem_alloc_tensor = cvkcv180x_lmem_alloc_tensor,
  .lmem_alloc_matrix = cvkcv180x_lmem_alloc_matrix,
  .lmem_alloc_ps32_matrix = cvkcv180x_lmem_alloc_ps32_matrix,
  .lmem_free_tensor = cvkcv180x_lmem_free_tensor,
  .lmem_free_matrix = cvkcv180x_lmem_free_matrix,
  .lmem_init_tensor = cvkcv180x_lmem_init_tensor,
  .lmem_init_matrix = cvkcv180x_lmem_init_matrix,
  .tl_default_stride = cvkcv180x_tl_default_stride,
  .tg_default_stride = cvkcv180x_tg_default_stride,
  .ml_default_shape = cvkcv180x_ml_default_shape,
  .ml_default_stride = cvkcv180x_ml_default_stride,
  .ml_shape_t1 = cvkcv180x_ml_shape_t1,
  .lmem_tensor_to_size = cvkcv180x_lmem_tensor_to_size,
  .lmem_matrix_to_size = cvkcv180x_lmem_matrix_to_size,
  .lmem_ps32_matrix_to_size = cvkcv180x_lmem_ps32_matrix_to_size,
  .gmem_init_tensor = cvkcv180x_gmem_init_tensor,
  /* TDMA local<->local / local<->global transfers. */
  .tdma_l2l_tensor_copy = cvkcv180x_tdma_l2l_bf16_tensor_copy,
  .tdma_l2l_bf16_tensor_copy = cvkcv180x_tdma_l2l_bf16_tensor_copy,
  .tdma_l2l_tensor_lrn_shift = cvkcv180x_tdma_l2l_tensor_lrn_shift,
  .tdma_l2g_tensor_copy = cvkcv180x_tdma_l2g_bf16_tensor_copy,
  .tdma_l2g_bf16_tensor_copy = cvkcv180x_tdma_l2g_bf16_tensor_copy,
  .tdma_l2g_tensor_copy_nc_transposed = cvkcv180x_tdma_l2g_bf16_tensor_copy_nc_transposed,
  .tdma_l2g_bf16_tensor_copy_nc_transposed = cvkcv180x_tdma_l2g_bf16_tensor_copy_nc_transposed,
  .tdma_l2g_tensor_copy_compressed = cvkcv180x_tdma_l2g_tensor_copy_compressed,
  .tdma_l2g_tensor_fill_constant = cvkcv180x_tdma_l2g_tensor_fill_constant,
  .tdma_l2g_tensor_copy_cw_transposed = cvkcv180x_tdma_l2g_bf16_tensor_copy_cw_transposed,
  .tdma_l2g_bf16_tensor_copy_cw_transposed = cvkcv180x_tdma_l2g_bf16_tensor_copy_cw_transposed,
  .tdma_l2g_matrix_copy = cvkcv180x_tdma_l2g_bf16_matrix_copy,
  .tdma_l2g_bf16_matrix_copy = cvkcv180x_tdma_l2g_bf16_matrix_copy,
  .tdma_l2g_matrix_copy_compressed = cvkcv180x_tdma_l2g_matrix_copy_compressed,
  .tdma_l2g_general_copy = cvkcv180x_tdma_l2g_general_copy,
  .tdma_l2g_bf16_general_copy = cvkcv180x_tdma_l2g_bf16_general_copy,
  .tdma_g2l_tensor_copy = cvkcv180x_tdma_g2l_bf16_tensor_copy,
  .tdma_g2l_bf16_tensor_copy = cvkcv180x_tdma_g2l_bf16_tensor_copy,
  .tdma_g2l_tensor_copy_nc_transposed = cvkcv180x_tdma_g2l_bf16_tensor_copy_nc_transposed,
  .tdma_g2l_bf16_tensor_copy_nc_transposed = cvkcv180x_tdma_g2l_bf16_tensor_copy_nc_transposed,
  .tdma_g2l_tensor_copy_chw_rotated = cvkcv180x_tdma_g2l_tensor_copy_chw_rotated,
  .tdma_g2l_tensor_copy_decompressed = cvkcv180x_tdma_g2l_tensor_copy_decompressed,
  .tdma_g2l_tensor_fill_constant = cvkcv180x_tdma_g2l_bf16_tensor_fill_constant,
  .tdma_g2l_bf16_tensor_fill_constant = cvkcv180x_tdma_g2l_bf16_tensor_fill_constant,
  .tdma_g2l_matrix_copy_decompressed = cvkcv180x_tdma_g2l_matrix_copy_decompressed,
  .tdma_g2l_matrix_copy = cvkcv180x_tdma_g2l_bf16_matrix_copy,
  .tdma_g2l_bf16_matrix_copy = cvkcv180x_tdma_g2l_bf16_matrix_copy,
  .tdma_g2l_matrix_copy_row_col_transposed = cvkcv180x_tdma_g2l_matrix_copy_row_col_transposed,
  .tdma_g2l_general_copy = cvkcv180x_tdma_g2l_general_copy,
  .tdma_g2l_bf16_general_copy = cvkcv180x_tdma_g2l_bf16_general_copy,
  .tdma_g2g_tensor_copy = cvkcv180x_tdma_g2g_tensor_copy,
  .tdma_g2g_general_copy = cvkcv180x_tdma_g2g_general_copy,
  .tdma_g2g_bf16_general_copy = cvkcv180x_tdma_g2g_bf16_general_copy,
  .tdma_g2g_bf16_tensor_copy = cvkcv180x_tdma_g2g_bf16_tensor_copy,
  /* TIU compute primitives. */
  .tiu_mul = cvkcv180x_tiu_mul,
  .tiu_mul_qm = cvkcv180x_tiu_mul_qm,
  .tiu_mac = cvkcv180x_tiu_mac,
  .tiu_add = cvkcv180x_tiu_add,
  .tiu_sub = cvkcv180x_tiu_sub,
  .tiu_max = cvkcv180x_tiu_max,
  .tiu_min = cvkcv180x_tiu_min,
  .tiu_and_int8 = cvkcv180x_tiu_and_int8,
  .tiu_arith_shift = cvkcv180x_tiu_arith_shift,
  .tiu_and_int16 = cvkcv180x_tiu_and_int16,
  .tiu_or_int8 = cvkcv180x_tiu_or_int8,
  .tiu_or_int16 = cvkcv180x_tiu_or_int16,
  .tiu_xor_int8 = cvkcv180x_tiu_xor_int8,
  .tiu_xor_int16 = cvkcv180x_tiu_xor_int16,
  .tiu_copy = cvkcv180x_tiu_copy,
  .tiu_lookup_table = cvkcv180x_tiu_lookup_table,
  .tiu_bf16_lookup_interp_table = cvkcv180x_tiu_bf16_lookup_interp_table,
  .tiu_pt_convolution = cvkcv180x_tiu_pt_convolution,
  .tiu_convolution = cvkcv180x_tiu_convolution,
  .tiu_max_pooling = cvkcv180x_tiu_max_pooling,
  .tiu_average_pooling = cvkcv180x_tiu_average_pooling,
  .tiu_pt_depthwise_convolution = cvkcv180x_tiu_pt_depthwise_convolution,
  .tiu_depthwise_convolution = cvkcv180x_tiu_depthwise_convolution,
  .tiu_matrix_multiplication = cvkcv180x_tiu_matrix_multiplication,
  .tiu_matrix_multiplication_qm = cvkcv180x_tiu_matrix_multiplication_qm,
  .tiu_ge = cvkcv180x_tiu_ge,
  .tiu_min_pooling = cvkcv180x_tiu_min_pooling,
};

/* Auxiliary host-side helpers (no command emission). */
static cvk_misc_operations_t cvk_cv180x_misc_ops = {
  .float_to_bfloat16 = cvkcv180x_float_to_bfloat16,
  .bf16_table_shape = cvkcv180x_bf16_table_shape,
};

/* Identification string for the cv180x backend.
 * NOTE(review): returns a macro-defined string as non-const char* -
 * callers must not modify it. */
char *cvikernel_get_chip_info_cv180x(void)
{
  return CVI_TPU_VERSION_180X;
}
|
||||
|
||||
void cvikernel_init_cv180x(
|
||||
cvk_reg_info_t *req_info,
|
||||
cvk_context_t *ctx)
|
||||
{
|
||||
uint32_t max_nr_desc = cvkcv180x_estimate_nr_desc(req_info->cmdbuf_size);
|
||||
cvk_prv_data_t *prv_data;
|
||||
desc_pair_t *desc_pairs;
|
||||
|
||||
prv_data = malloc(sizeof(cvk_prv_data_t));
|
||||
desc_pairs = malloc(max_nr_desc * sizeof(desc_pair_t));
|
||||
if (!req_info || !ctx || !prv_data || !desc_pairs) {
|
||||
if (prv_data)
|
||||
free(prv_data);
|
||||
if (desc_pairs)
|
||||
free(desc_pairs);
|
||||
return;
|
||||
}
|
||||
|
||||
ctx->info.version = CV180X_VER;
|
||||
ctx->info.node_num = CV180X_HW_NODE_CHIP_NUM;
|
||||
ctx->info.node_shift = CV180X_HW_NODE_CHIP_SHIFT;
|
||||
ctx->info.npu_num = CV180X_HW_NPU_NUM;
|
||||
ctx->info.npu_shift = CV180X_HW_NPU_SHIFT;
|
||||
ctx->info.eu_num = CV180X_HW_EU_NUM;
|
||||
ctx->info.eu_shift = CV180X_HW_EU_SHIFT;
|
||||
ctx->info.lmem_size = CV180X_HW_LMEM_SIZE;
|
||||
ctx->info.lmem_shift = CV180X_HW_LMEM_SHIFT;
|
||||
ctx->info.lmem_banks = CV180X_HW_LMEM_BANKS;
|
||||
ctx->info.lmem_bank_size = CV180X_HW_LMEM_BANK_SIZE;
|
||||
ctx->info.gmem_start = CV180X_GLOBAL_MEM_START_ADDR;
|
||||
ctx->info.features = CVK_HWF_FC_OP1_CONST | CVK_HWF_8B_ADD_SUB |
|
||||
CVK_HWF_MIN_POOL | CVK_HWF_M_BRADCAST |
|
||||
CVK_HWF_QM_LSHIFT | CVK_HWF_GE | CVK_HWF_CMD_PRE_EXE;
|
||||
ctx->info.gmem_size = CV180X_GLOBAL_MEM_SIZE;
|
||||
|
||||
ctx->ops = &cvk_cv180x_ops;
|
||||
ctx->misc_ops = &cvk_cv180x_misc_ops;
|
||||
|
||||
prv_data->cmdbuf_ptr = 0;
|
||||
prv_data->max_nr_desc = max_nr_desc;
|
||||
prv_data->cur_nr_desc = 0;
|
||||
prv_data->desc_pairs = desc_pairs;
|
||||
prv_data->lmem_ptr = 0;
|
||||
|
||||
if (!prv_data->desc_pairs) {
|
||||
printf("cvkcv180x init: fail to allocate internal data\n");
|
||||
free(prv_data);
|
||||
return;
|
||||
}
|
||||
|
||||
ec_init(&prv_data->ec, CV180X_ENGINE_NUM, max_nr_desc);
|
||||
mode_manager_init(&prv_data->mode_manager, &prv_data->ec, CV180X_ENGINE_NUM);
|
||||
|
||||
prv_data->cmdbuf = req_info->cmdbuf;
|
||||
prv_data->cmdbuf_size = req_info->cmdbuf_size;
|
||||
ctx->priv_data = prv_data;
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user