Files
SDK_SG200x_V2/cvimath/include/cvimath_internal.h
carbon 83dc4914fe add cvimath
commit ce8705f49da5e5f59c2ddb3253ef88323a0cd9c4
Author: sophgo-forum-service <forum_service@sophgo.com>
Date:   Mon May 13 14:04:10 2024 +0800

    [feat] cvimath opensource for cv18xx soc.

    - 9e8967
2024-05-31 11:54:07 +08:00

1067 lines
39 KiB
C

#ifndef CVIMATH_INTERNAL_H
#define CVIMATH_INTERNAL_H
#include <stdbool.h> //bool
#include <stddef.h> //size_t
#include "cvimath.h"
// copied over from legacy code
// TODO: move to properly header files
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
typedef int8_t s8;
typedef int16_t s16;
typedef int32_t s32;
typedef int64_t s64;
/**
 * @brief round \x up to the nearest multiple of \n
 *
 * Uses the remainder instead of first adding (n - 1), so an already-aligned
 * \x close to UINT64_MAX is returned unchanged instead of wrapping to 0.
 *
 * @param x value to align
 * @param n alignment granularity, MUST be non-zero (division by zero otherwise)
 *
 * @return smallest multiple of \n that is >= \x
 */
static inline uint64_t align_up(uint64_t x, uint64_t n) {
  uint64_t rem = x % n;
  return rem == 0 ? x : x + (n - rem);
}
/**
* please refer @example for more details
*/
#include <cvikernel/cvikernel.h>
#define CVK_MULTIPLIER_BIAS_PACKED_DATA_SIZE 9
#define CVK_MULTIPLIER_ONLY_PACKED_DATA_SIZE 5
// public function
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief get lookup table shape
*
* @param cvk_ctx kernel structure
* @param [out] shape the table shape
*/
void cvm_table_shape(cvk_context_t *cvk_ctx, cvk_tl_shape_t *shape);
/**
* @brief generate sqrt look up table for bf16 exponent part
*
* @param [out] table_data bf16 exponent part lookup table in host
* @param table_shape table shape
*/
void cvm_gen_sqrt(uint16_t *table_data, cvk_tl_shape_t *table_shape);
/**
* @brief syntactic sugar for cvm_gen_sqrt/cvm_gen_sqrt_mantissa
*
* @param [out] sqrt_table_data bf16 exponent part lookup table in host
* @param [out] sqrt_table_data_mantissa bf16 fraction part lookup table in host
* @param table_shape table shape
*/
void cvm_sqrt_tbl(uint16_t *sqrt_table_data, uint16_t *sqrt_table_data_mantissa,
cvk_tl_shape_t *table_shape);
/**
* @brief generate sqrt look up table for bf16 fraction part
*
* @param [out] table_mantissa bf16 fraction part lookup table in host
* @param table_shape table shape
*/
void cvm_gen_sqrt_mantissa(uint16_t *table_mantissa, cvk_tl_shape_t *table_shape);
/**
* @brief implement sqrt in tpu memory
*
* @param cvk_ctx kernel structure
* @param tl_ifmap input tensor in tpu memory
* @param tl_buf working buffer
* @param tbl_answer lookup table tensor for bf16 exponent part in tpu memory
* @param tbl_answer_mantissa lookup table tensor for fraction part in tpu memory
* @param [out] tl_ofmap_bf16 result in tpu memory
*
* @example
* // 1. alloc in tpu memory
* // 2. prepare table
* cvm_sqrt_tbl(table_data, table_data_mantissa, &table_shape);
* // 3. put host data to tpu memory
* // 4. prepare command buffer
* cvm_emit_sqrt(cvk_ctx, tl_ifmap, tl_buf, cvk_tl_table_answer, cvk_tl_table_answer_mantissa,
* tl_ofmap_bf16);
* // 5. submit it
* test_submit_comp(rt_ctx, cvk_ctx);
*
* // 6. get result from tpu memory
* uint16_t* result = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16,
* tl_ofmap_bf16->fmt);
*
* @return status, 0 means success, other means generates command fail
*/
int cvm_emit_sqrt(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
cvk_tl_t *tbl_answer, cvk_tl_t *tbl_answer_mantissa, cvk_tl_t *tl_ofmap_bf16);
/**
* @brief generate reciprocal look up table for bf16 exponent part
*
* @param [out] table_data bf16 exponent part lookup table in host
* @param table_shape table shape
*/
void cvm_gen_reciprocal(uint16_t *table_data, cvk_tl_shape_t *table_shape);
/**
* @brief generate reciprocal look up table for bf16 fraction part
*
* @param [out] table_mantissa bf16 fraction part lookup table in host
* @param table_shape table shape
*/
void cvm_gen_reciprocal_mantissa(uint16_t *table_mantissa, cvk_tl_shape_t *table_shape);
/**
* @brief syntactic sugar for cvm_gen_reciprocal/cvm_gen_reciprocal_mantissa
*
* @param [out] sqrt_table_data bf16 exponent part lookup table in host
* @param [out] sqrt_table_data_mantissa bf16 fraction part lookup table in host
* @param table_shape table shape
*/
void cvm_reciprocal_tbl(uint16_t *table_data, uint16_t *table_mantissa,
cvk_tl_shape_t *table_shape);
/**
* @brief implement reciprocal in tpu memory
*
* @param cvk_ctx kernel structure
* @param tl_ifmap input tensor in tpu memory
* @param tl_buf working buffer
* @param tbl_answer lookup table tensor for bf16 exponent part in tpu memory
* @param tbl_answer_mantissa lookup table tensor for fraction part in tpu memory
* @param [out] tl_ofmap_bf16 result in tpu memory
*
* @example
* int align = 1; // align eu (execution unit)
* // 1. alloc in tpu memory
* // 2. prepare table
* cvm_reciprocal_tbl(table_data, table_data_mantissa, &table_shape);
* // 3. put host data to tpu memory
* // 4. prepare command buffer
* cvm_emit_reciprocal(cvk_ctx, tl_ifmap, tl_buf, cvk_tl_table_answer,
* cvk_tl_table_answer_mantissa, tl_ofmap_bf16);
*
* // 5. submit it
* test_submit_comp(rt_ctx, cvk_ctx);
*
* // 6. get result from tpu memory
* uint16_t* result = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16,
* tl_ofmap_bf16->fmt);
*
* @return status, 0 means success, other means generates command fail
*/
int cvm_emit_reciprocal(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
cvk_tl_t *tbl_answer, cvk_tl_t *tbl_answer_mantissa,
cvk_tl_t *tl_ofmap_bf16);
/**
* @brief generate sigmoid lookup table in host,
* we leverage Linear interpolation fairly close to the original
* you can refer [wiki](https://en.wikipedia.org/wiki/Interpolation) for more details
*
* @param [out] sigmoid_table_data lookup table in host
* @param [out] sigmoid_table_data_slope slope table in host
* @param table_shape table shape
* @param range_start quantize range from,
* e.g: the original input range is -127 ~ 128, we quantize to -8 ~ 8
* than -8 is our \range_start and 8 is \range_end
* @param range_end quantize range end
*/
void cvm_sigmoid_tbl(uint16_t *sigmoid_table_data, uint16_t *sigmoid_table_data_slope,
cvk_tl_shape_t *table_shape, int range_start, int range_end);
/**
* @brief get scale factor from \range_start and \range_end
*
* @param range_start quantize range from
* @param range_end quantize range end
*
* @return scale factor
*/
float cvm_sigmoid_scale(int range_start, int range_end);
/**
* @brief get sigmoid value by linear interpolation
*
* @param cvk_ctx kernel structure
* @param tl_ifmap input tensor in tpu memory
* @param tl_buf working buffer
* @param tl_table_answer sigmoid table in tpu memory generated by \cvm_sigmoid_tbl
* @param tl_table_answer_slope sigmoid slope table in tpu memory generated by \cvm_sigmoid_tbl
* @param [out] tl_ofmap_bf16 result in tpu memory
* @param scale scale factor generated by \cvm_sigmoid_scale
*
* @example
* // 1. alloc in tpu memory
* // 2. prepare table
* cvm_sigmoid_tbl(table_data, table_data_slope, &table_shape, range_start, range_end);
* float scale = cvm_sigmoid_scale(range_start, range_end);
* // 3. put host data to tpu memory
* // 4. prepare command buffer
* cvm_emit_sigmoid(cvk_ctx, tl_ifmap, tl_buf, cvk_tl_table_answer, cvk_tl_table_answer_slope,
* tl_ofmap_bf16, scale);
* // 5. submit it
* test_submit_comp(rt_ctx, cvk_ctx);
*
* // 6. get result from tpu memory
* uint16_t* result = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16,
* tl_ofmap_bf16->fmt);
*
* @return status, 0 means success, other means generates command fail
*/
int cvm_emit_sigmoid(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
cvk_tl_t *tl_table_answer, cvk_tl_t *tl_table_answer_slope,
cvk_tl_t *tl_ofmap_bf16, float scale);
/**
* @brief General Matrix Multiplication
* that equal \lhs_gaddr * \rhs_gaddr = \dest_gaddr
*
* @param cvk_ctx kernel structure
* @param lhs_gaddr left hand side device memory address
* @param rhs_gaddr right hand side device memory address
* @param dest_gaddr destination device memory address
* @param in_row \lhs_gaddr matrix row
* @param in_col \lhs_gaddr matrix col
* @param out_col \dest_gaddr matrix col
* @param fmt the possible value is \CVK_FMT_BF16 or \CVK_FMT_I8 or \CVK_FMT_U8
* @example
*
* // 1. alloc host memory and put it to device memory
* // M=in_row K=in_col N=out_col
* cvk_mg_t *mg_A = _test_put_matrix_g(&ctx, M, K, CVK_FMT_BF16, (uint8_t *)bf16_A);
* cvk_mg_t *mg_B = _test_put_matrix_g(&ctx, K, N, CVK_FMT_BF16, (uint8_t *)bf16_B);
* cvk_mg_t *mg_R = _test_put_matrix_g(&ctx, M * 2, N, CVK_FMT_BF16, (uint8_t *)bf16_R);
*
* // 2. get device address for gemm
* gaddr_t gaddr_a = mg_A->start_address;
* gaddr_t gaddr_b = mg_B->start_address;
* gaddr_t gaddr_r = mg_R->start_address;
*
* // 3. prepare gemm descriptor
* cvm_gemm(cvk_ctx, gaddr_a, gaddr_b, gaddr_r, M, K, N);
*
* // 4. submit descriptor
* test_submit_comp(&ctx, cvk_ctx);
*
* // 5. get result from device to host
* uint16_t *bf16_ref = (uint16_t *)test_get_mg_mem_comp(&ctx, mg_R);
*
* @ return slice_num array of {M, N, K}
*/
size_t *cvm_gemm(cvk_context_t *cvk_ctx, uint64_t lhs_gaddr, uint64_t rhs_gaddr,
uint64_t dest_gaddr, int in_row, int in_col, int out_col, cvk_fmt_t fmt);
/**
* @brief combine \cvm_gemm int8 result to int32
* the raw output separates the 32-bit result into 4 parts with bstride
* and we need to 'combine' it into a human-readable form
* for instance, the following is the raw result
* lsb 31 msb
* 0x1 0x2 0x3 0x4 0x5 0x6 0x7 0x8
* 0x9 0xa 0xb 0xc 0xd 0xe 0xf 0x0
* 0x11 0x12 0x13 0x14 0x15 0x16 0x17 0x18
* 0x19 0x20 0x21 0x22 0x23 0x24 0x25 0x26
*
* the value by strategy could be column major:
* 1. 0x19110901
* 2. 0x20120a02
* 3. 0x21130b03
* and so on
*
* @param cvm_gemm_strategy return strategy value from \cvm_gemm
* @param cvm_output raw result from \cvm_gemm
* @param [out] i32_R int32 result
* @param M row of output matrix
* @param N column of output matrix
*
* @return status, 0 means success, other means generates command fail
*/
int cvm_combin_gemm_i8(size_t *cvm_gemm_strategy, uint8_t *cvm_output, uint32_t *i32_R, int M,
int N);
/**
* @brief fp32 to bf16 format in device memory
*
* @param cvk_ctx kernel structure
* @param gaddr_fp32 fp32 data with device memory address
* @param fp32_shape fp32 tensor shape
* @param [out] gaddr_bf16 bf16 data with device memory address
* @param bf16_shape bf16 tensor shape
* @param fmt tensor format such as \CVK_FMT_BF16
*
* @example
*
* cvk_tl_shape_t s = {1, 2, 3, 4}
* // 1. put fp32 to device memory
* test_put_tg_mem_comp(rt_ctx, tg_with_fp32, data)
* // 2. init bf16 tg
* // 3. prepare command buffer
* cvm_s2s_fp32_bf16(cvk_ctx, tg_with_fp32->start_address, tg_with_fp32->shape,
* tg_with_bf16->start_address, tg_with_bf16->shape, CVK_FMT_BF16);
* // 4. submit it
* test_submit_comp(rt_ctx, cvk_ctx);
* // 5. get result from device memory
* uint16_t *dst_data = (uint16_t *)test_get_tg_mem_comp(rt_ctx, tg_with_bf16);
*
* @return status, 0 means success, other means generates command fail
*/
int cvm_s2s_fp32_bf16(cvk_context_t *cvk_ctx, uint64_t gaddr_fp32, cvk_tg_shape_t fp32_shape,
uint64_t gaddr_bf16, cvk_tg_shape_t bf16_shape, cvk_fmt_t fmt);
/**
* @brief generate lookup table for check input is 0 or not
*
* @param [out] table_0 lookup table for 0 or not
* @param table_shape table shape
*/
void cvm_gen_0_tbl(uint16_t *table_0, cvk_tl_shape_t *table_shape);
// mask function
/**
* @brief get mask value that separates 0 from non-zero
* e.g: input = [0, 1, -1, 2] output [1, 0, 0, 0]
* please see \cvm_emit_mask for more details
*
* @param cvk_ctx kernel structure
* @param tl_ifmap input in tpu memory
* @param tl_buf working buffer
* @param tbl_answer lookup table for 0 or not in tpu memory, generate by \cvm_gen_0_tbl
* @param [out] tl_ofmap_bf16 mask result in tpu memory
* @param fmt tensor format such as \CVK_FMT_BF16
*
* @return status, 0 means success, other means generates command fail
*/
int cvm_emit_0_idx(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
cvk_tl_t *tbl_answer, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);
/**
* @brief get mask value that check < 0
* e.g: input = [0, 10, 6, -1, 0] output [0, 0, 0, 1, 0]
* please see \cvm_emit_mask for more details
*
* @param cvk_ctx kernel structure
* @param tl_ifmap input in tpu memory
* @param tl_buf working buffer
* @param tl_pos_neg_buf lookup table generate from \cvm_pos_neg_tbl
* @param [out] tl_ofmap_bf16 mask result in tpu memory
* @param fmt tensor format such as \CVK_FMT_BF16
*
* @return status, 0 means success, other means generates command fail
*/
int cvm_emit_neg_idx(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);
/**
* @brief get mask value that check >= 0
* e.g: input = [0, 10, 6, -1, 0] output [0, 1, 1, 0, 0]
* please see \cvm_emit_mask for more details
*
* @param cvk_ctx kernel structure
* @param tl_ifmap input in tpu memory
* @param tl_buf working buffer
* @param tl_pos_neg_buf lookup table generate from \cvm_pos_neg_tbl
* @param [out] tl_ofmap_bf16 mask result in tpu memory
* @param fmt tensor format such as \CVK_FMT_BF16
*
* @return status, 0 means success, other means generates command fail
*/
int cvm_emit_pos_idx(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);
/**
* @brief invert 0/1 input
* e.g: input = [0, 1, 1, 1, 0] output [1, 0, 0, 0, 1]
*
* @param cvk_ctx kernel structure
* @param tl_ifmap input in tpu memory
* @param tl_buf working buffer
* @param [out] tl_ofmap_bf16 mask result in tpu memory
* @param fmt
*
* @return status, 0 means success, other means generates command fail
*/
int cvm_emit_0_1_revert_input(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);
// mask comparison modes consumed by \cvm_emit_mask: each mode selects which
// input elements are kept (emitted as 1) in the output mask
enum CVM_MASK_TYPE {
CVM_MASK_TYPE_GT_0 = 0, // remain > 0
CVM_MASK_TYPE_GE_0, // remain >= 0
CVM_MASK_TYPE_EQ_0, // remain = 0
CVM_MASK_TYPE_LT_0, // remain < 0
CVM_MASK_TYPE_LE_0, // remain <= 0
CVM_MASK_MAX // number of valid modes, not a usable mask type
};
/**
* @brief get mask for \CVM_MASK_TYPE case
*
* @param cvk_ctx kernel structure
* @param tl_ifmap input in tpu memory
* @param tl_buf working buffer
* @param tl_buf2 working buffer
* @param tl_buf3 working buffer
* @param tl_pos_neg_table lookup table generate from \cvm_pos_neg_tbl
* @param tl_0_idx_table lookup table for 0 or not in tpu memory generated by \cvm_gen_0_tbl
* @param [out] tl_ofmap_bf16 mask result in tpu memory
* @param fmt tensor format such as \CVK_FMT_BF16
* @param mask \CVM_MASK_TYPE
*
* @example
* // 1. alloc in tpu memory
* // 2. prepare table
* cvm_gen_0_tbl(idx_0_table_data, &table_shape);
* cvm_pos_neg_tbl(table_data_atan_pos_neg, &table_shape);
* // 3. put host data to tpu memory
* // 4. prepare command buffer
* cvm_emit_mask(cvk_ctx,
* tl_ifmap, // input
* tl_buf, tl_buf2, tl_buf4, // tmp buffer
* tl_pos_neg_buf, tl_0_idx_table, // lookup table
* tl_ofmap_bf16, // output
* fmt, mode);
*
* // 5. submit it
* test_submit_comp(rt_ctx, cvk_ctx);
*
* // 6. get result from tpu memory
* uint16_t* result = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16,
* tl_ofmap_bf16->fmt);
*
*
* @return status, 0 means success, other means generates command fail
*/
int cvm_emit_mask(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2,
cvk_tl_t *tl_buf3, cvk_tl_t *tl_pos_neg_table, cvk_tl_t *tl_0_idx_table,
cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt, enum CVM_MASK_TYPE mask);
/**
* @brief generate lookup table for atan by degree
*
* @param [out] table_data_y0 atan by degree lookup table in host
* @param table_shape table shape
*/
void cvm_atan_fast_degree_y0(uint16_t *table_data_y0, cvk_tl_shape_t *table_shape);
/**
* @brief generate lookup table for check value of absolute in [0,1] or > 1
* atan2 used, [0-1] indicate 1, > 1 indicate with -1
*
* @param [out] table_invert lookup table in host
* @param table_shape table shape
*/
void cvm_atan_s_01(uint16_t *table_invert, cvk_tl_shape_t *table_shape);
/**
* @brief generate table to check whether the input value is positive(>=0) or negative(<0)
* by lookup table; 'pos_neg' means a positive(>=0) value maps to 1 and a negative(<0) value maps to -1
*
* @param [out] table_pos_neg lookup table in host
* @param table_shape table shape
*/
void cvm_pos_neg_tbl(uint16_t *table_pos_neg, cvk_tl_shape_t *table_shape);
// deprecated code from \cvm_pos_neg_tbl
void cvm_atan_pos_neg(uint16_t *table_pos_neg, cvk_tl_shape_t *table_shape);
/**
* @brief generate atan answer by lookup table,
* plz refer [git](https://github.com/xiezhq-hermann/atan_lookup) for more details
*
* @param [out] table_data_y0 atan answer lookup table in host
* @param table_shape table shape
*/
void cvm_atan_y0(uint16_t *table_data_y0, cvk_tl_shape_t *table_shape);
/**
* @brief generate atan slope data, for more accuracy
*
* @param [out] table_slope atan slope lookup table in host
* @param table_shape table shape
*/
void cvm_atan_slope(uint16_t *table_slope, cvk_tl_shape_t *table_shape);
/**
* @brief syntactic sugar for cvm_atan_y0/cvm_atan_slope/cvm_atan_s_01/cvm_pos_neg_tbl
*
* @param [out] table_data_atan_y0 atan answer lookup table in host
* @param [out] table_data_atan_slope atan slope lookup table in host
* @param [out] table_data_atan_invert lookup table in host
* @param [out] table_data_atan_pos_neg lookup table in host
* @param table_shape table shape
*/
void cvm_atan_tbl(uint16_t *table_data_atan_y0, uint16_t *table_data_atan_slope,
uint16_t *table_data_atan_invert, uint16_t *table_data_atan_pos_neg,
cvk_tl_shape_t *table_shape);
/**
* @brief implement atan in tpu memory
*
* @param cvk_ctx kernel structure
* @param tl_ifmap input tensor in tpu memory
* @param tl_buf working buffer
* @param tl_buf2 working buffer
* @param tl_buf3 working buffer
* @param tl_y0_buf atan lookup table in tpu memory
* @param tl_slope_buf atan slope lookup table in tpu memory
* @param tl_invert_buf lookup table in tpu memory
* @param tl_pos_neg_buf lookup table in memory
* @param tl_table_answer reciprocal for bf16 exponent part in tpu memory
* @param tl_table_answer_mantissa reciprocal for bf16 fraction part in tpu memory
* @param [out] tl_ofmap_bf16 result in tpu memory
* @param fmt tensor format such as \CVK_FMT_BF16
*
* @example
* // 1. alloc in tpu memory
* // 2.1. get reciprocal table in host
* cvm_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, &table_shape);
* // 2.2. get atan table in host
* cvm_atan_tbl(table_data_atan_y0, table_data_atan_slope, table_data_atan_invert,
* table_data_atan_pos_neg, &table_shape);
* // 3. put host data to tpu memory
* // 4. prepare command buffer
* cvm_atan_emit(cvk_ctx, tl_ifmap, tl_buf, tl_buf2, tl_buf4, tl_y0_buf,
* tl_slope_buf, tl_invert_buf, tl_pos_neg_buf, tl_reciprocal_table_answer,
* tl_reciprocal_table_answer_mantissa, tl_ofmap_bf16, fmt);
*
* // 5. submit it
* test_submit_comp(rt_ctx, cvk_ctx);
* // 6. get result from tpu memory
* uint16_t *ofmap_data = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16,
* tl_ofmap_bf16->fmt);
* @return status, 0 means success, other means generates command fail
*/
int cvm_atan_emit(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2,
cvk_tl_t *tl_buf3, cvk_tl_t *tl_y0_buf, cvk_tl_t *tl_slope_buf,
cvk_tl_t *tl_invert_buf, cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_table_answer,
cvk_tl_t *tl_table_answer_mantissa, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);
// atan2 function
/**
* @brief syntactic sugar for generate atan in degree lookup table in
* host/cvm_atan_s_01/cvm_pos_neg_tbl
*
* @param [out] table_data_atan_y0 atan answer lookup table in host
* @param [out] table_data_atan_invert lookup table in host
* @param [out] table_data_atan_pos_neg lookup table in host
* @param table_shape table shape
*/
void cvm_atan_fast_degree_tbl(uint16_t *table_data_atan_y0, uint16_t *table_data_atan_invert,
uint16_t *table_data_atan_pos_neg, cvk_tl_shape_t *table_shape);
/**
* @brief implement atan2 by degree in tpu memory, implemented by atan. you can refer
* [wiki](https://en.wikipedia.org/wiki/Atan2) for more details
*
* @param cvk_ctx kernel structure
* @param y input tensor in tpu memory
* @param x input tensor in tpu memory
* @param tl_buf working buffer
* @param tl_buf2 working buffer
* @param tl_buf3 working buffer
* @param tl_y0_buf atan2 lookup table in tpu memory
* @param tl_invert_buf lookup table in tpu memory
* @param tl_pos_neg_buf lookup table in memory
* @param tl_table_answer reciprocal for bf16 exponent part in tpu memory
* @param tl_table_answer_mantissa reciprocal for bf16 fraction part in tpu memory
* @param [out] tl_ofmap_bf16 result in tpu memory
* @param fmt tensor format such as \CVK_FMT_BF16
*
* @example
* // 1. alloc in tpu memory
* // 2.1. get reciprocal table in host
* cvm_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, &table_shape);
* // 2.2. get atan table in host
* cvm_atan_fast_degree_tbl(table_data_atan_y0, table_data_atan_slope, table_data_atan_invert,
* table_data_atan_pos_neg, &table_shape);
* // 3. put host data to tpu memory
* // 4. prepare command buffer
* cvm_atan2_fast_degree_emit(
* cvk_ctx, tl_ifmap2, tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_y0_buf,
* tl_invert_buf, tl_pos_neg_buf, tl_reciprocal_table_answer,
* tl_reciprocal_table_answer_mantissa, tl_ofmap_bf16, fmt);
*
* // 5. submit it
* test_submit_comp(rt_ctx, cvk_ctx);
* // 6. get result from tpu memory
* uint16_t *ofmap_data = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16,
* tl_ofmap_bf16->fmt);
*/
void cvm_atan2_fast_degree_emit(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *x, cvk_tl_t *tl_buf,
cvk_tl_t *tl_buf2, cvk_tl_t *tl_buf3, cvk_tl_t *tl_y0_buf,
cvk_tl_t *tl_invert_buf, cvk_tl_t *tl_pos_neg_buf,
cvk_tl_t *tl_table_answer, cvk_tl_t *tl_table_answer_mantissa,
cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);
/**
* @brief implement atan2 in tpu memory, implemented by atan. you can refer
* [wiki](https://en.wikipedia.org/wiki/Atan2) for more details
*
* @param cvk_ctx kernel structure
* @param y input tensor in tpu memory
* @param x input tensor in tpu memory
* @param tl_buf working buffer
* @param tl_buf2 working buffer
* @param tl_buf3 working buffer
* @param tl_y0_buf atan2 lookup table in tpu memory
* @param tl_invert_buf lookup table in tpu memory
* @param tl_pos_neg_buf lookup table in memory
* @param tl_table_answer reciprocal for bf16 exponent part in tpu memory
* @param tl_table_answer_mantissa reciprocal for bf16 fraction part in tpu memory
* @param [out] tl_ofmap_bf16 result in tpu memory
* @param fmt tensor format such as \CVK_FMT_BF16
*
* @example
* // 1. alloc in tpu memory
* // 2.1. get reciprocal table in host
* cvm_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, &table_shape);
* // 2.2. get atan table in host
* cvm_atan_fast_degree_tbl(table_data_atan_y0, table_data_atan_slope, table_data_atan_invert,
* table_data_atan_pos_neg, &table_shape);
* // 3. put host data to tpu memory
* // 4. prepare command buffer
* cvm_atan2_fast_degree_emit(
* cvk_ctx, tl_ifmap2, tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_y0_buf,
* tl_invert_buf, tl_pos_neg_buf, tl_reciprocal_table_answer,
* tl_reciprocal_table_answer_mantissa, tl_ofmap_bf16, fmt);
*
* // 5. submit it
* test_submit_comp(rt_ctx, cvk_ctx);
* // 6. get result from tpu memory
* uint16_t *ofmap_data = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16,
* tl_ofmap_bf16->fmt);
*/
void cvm_atan2_merge_emit(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *x, cvk_tl_t *tl_buf,
cvk_tl_t *tl_buf2, cvk_tl_t *tl_buf3, cvk_tl_t *tl_y0_buf,
cvk_tl_t *tl_invert_buf, cvk_tl_t *tl_pos_neg_buf,
cvk_tl_t *tl_table_answer, cvk_tl_t *tl_table_answer_mantissa,
cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);
/**
* @brief get lookup table size for host memory allocation
*
* @param cvk_ctx kernel structure
* @param table_shape table shape
* @param fmt tensor format such as \CVK_FMT_BF16
*
* @return table size in bytes
*/
uint64_t cvm_lut_tbl_bytesize(cvk_context_t *cvk_ctx, cvk_tl_shape_t *table_shape, cvk_fmt_t fmt);
/**
* @brief calculate new proper reshape channel for depthwise
* current only support batch = 1
*
* @param cvk_ctx kernel structure
* @param ic origin input shape of c
* @param ih origin input shape of h
* @param iw origin input shape of w
* @param kh origin kernel shape of h
* @param kw origin kernel shape of w
* @param pad_right padding right with input
* @param pad_left padding left with input
* @param stride_h stride h with input
* @param stride_w stride w with input
* @param [out] tl_load_shape shape structure for input in tpu memory
* @param [out] new_tl_ifmap_stride deprecated that stride for input in tpu memory
* @param [out] new_tg_ifmap_shape shape structure for input in device memory
* @param [out] new_tg_ifmap_stride stride structure for input in device memory
* @param [out] new_tl_weight_shape reshape weight in tpu memory
* @param [out] new_tl_bias_shape reshape bias in tpu memory
* @param [out] new_tl_ofmap_shape reshape output in tpu memory
* @param fmt the possible value is \CVK_FMT_BF16 or \CVK_FMT_I8 or \CVK_FMT_U8
* @param eu_align currently MUST be set to 1 to force alignment with hardware
*
* @example
* int align = 1; // force align
* cvk_tiu_depthwise_pt_convolution_param_t *p;
* // 1. get reshaped shape
* int r = cvm_reshape_channel_same(
* cvk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, stride_h, stride_w,
* &tl_load_shape, &tl_load_stride, &tg_shape, &tg_stride, &tl_weight_shape,
* &tl_bias_shape, &tl_output_shape, fmt, align);
* // reshape fail
* if (r == -1) {
* return -1;
* }
*
* // 2.1 load input
* // load input into tpu memory
* int load_align = 0; // not align for pack
* tmp_tl_load = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_load_shape, fmt, load_align);
* tmp_tg = test_alloc_tg_mem_comp(&rt_ctx, cvk_ctx, tg_shape, fmt);
* tmp_tg->stride = tg_stride;
* // int8
* cvk_tdma_g2l_tensor_copy_param_t p1;
* cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, &p1);
* test_submit_comp(&rt_ctx, cvk_ctx);
* test_free_tg_mem_comp(&rt_ctx, tmp_tg);
* // fit for hw
* int align_in_tl = 1;
* tmp_tl_load->stride = bmk1880v2_tensor_lmem_default_stride(
* cvk_ctx, tmp_tl_load->shape, fmt, align_in_tl);
* p->ifmap = tmp_tl_load;
* // 2.2 prepare load bias, put to tg and load back
* if (has_bias) {
* // bias must i8
* int no_bias_align = 0;
* p->bias = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_bias_shape, fmt, no_bias_align);
*
* // duplicate bias and replace old
* uint32_t *new_bias = cvm_reshape_channel_weight(
* (uint8_t *)bias, tl_bias_shape.n, tl_bias_shape.c, tl_bias_shape.h,
* tl_bias_shape.w, org_oc, fmt);
*
* test_put_tensor_g2l_comp(&rt_ctx, cvk_ctx, p->bias, bias);
* }
*
* // 2.3 prepare load weight, put to tg and load back
* {
* int weight_align = 1;
* p->weight = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_weight_shape, fmt, weight_align);
* // duplicate kernel with c
* uint8_t *new_weight = cvm_reshape_channel_weight(
* (uint8_t *)weight, tl_weight_shape.n, tl_weight_shape.c, tl_weight_shape.h,
* tl_weight_shape.w, org_oc, fmt);
*
* test_put_tensor_g2l_comp(&rt_ctx, cvk_ctx, p->weight, (u16 *)weight);
* }
*
* // 2.4 prepard ofmap
* {
* // we allocate 'same' mode shape
* int output_align = 1; // hw need
* p->ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_output_shape, fmt, output_align);
* }
*
* // 3. prepare command buffer
* cvk_ctx->ops->tiu_pt_depthwise_convolution(cvk_ctx, p);
*
* // 4. submit it
* test_submit_comp(rt_ctx, cvk_ctx);
*
* // 5. get result from tpu memory
* output = test_get_tensor_l2g_comp(&rt_ctx, cvk_ctx, p->ofmap, fmt);
*
* @return status, -1 means fail, other means reshape slice success
*/
int cvm_reshape_channel_same(cvk_context_t *cvk_ctx, int ic, int ih, int iw, int kh, int kw,
int pad_right, int pad_left, int stride_h, int stride_w,
cvk_tl_shape_t *tl_load_shape, cvk_tl_stride_t *new_tl_ifmap_stride,
cvk_tg_shape_t *new_tg_ifmap_shape,
cvk_tg_stride_t *new_tg_ifmap_stride,
cvk_tl_shape_t *new_tl_weight_shape, cvk_tl_shape_t *new_tl_bias_shape,
cvk_tl_shape_t *new_tl_ofmap_shape, cvk_fmt_t fmt, int eu_align);
/**
* @brief re-construct bias content by reshape channel
*
* @param bias original bias in host memory
* @param ni reshape bias shape of n
* @param ci reshape bias shape of c
* @param hi reshape bias shape of h
* @param wi reshape bias shape of w
* @param old_bias_c origin bias shape of c
* @param fmt the possible value is \CVK_FMT_BF16 or \CVK_FMT_I8 or \CVK_FMT_U8
*
* @return bias host data
*/
uint32_t *cvm_reshape_channel_bias(uint8_t *bias, int ni, int ci, int hi, int wi, int old_bias_c,
cvk_fmt_t fmt);
/**
* @brief re-construct weight content by reshape channel
*
* @param weight original bias in host memory
* @param ni reshape weight shape of n
* @param ci reshape weight shape of c
* @param hi reshape weight shape of h
* @param wi reshape weight shape of w
* @param old_weight_c origin weight shape of c
* @param fmt the possible value is \CVK_FMT_BF16 or \CVK_FMT_I8 or \CVK_FMT_U8
*
* @return weight host data
*/
uint8_t *cvm_reshape_channel_weight(uint8_t *weight, int ni, int ci, int hi, int wi,
int old_weight_c, cvk_fmt_t fmt);
// argument pack for an atan2 emit call; all cvk_tl_t handles refer to
// tensors already allocated in tpu memory
typedef struct cvm_tiu_atan2_param {
cvk_tl_t *a; // input operand (NOTE(review): presumably y of atan2(y, x) -- confirm)
cvk_tl_t *b; // input operand (NOTE(review): presumably x of atan2(y, x) -- confirm)
cvk_tl_t *res; // result tensor
cvk_tl_t *buf1; // working buffer
cvk_tl_t *buf2; // working buffer
cvk_tl_t *buf3; // working buffer
cvk_tl_t *buf4; // working buffer
cvk_tl_t *buf5; // working buffer
cvk_tl_t *buf6; // working buffer
cvk_tl_t *y0; // atan answer lookup table, see \cvm_atan_y0
cvk_tl_t *slope; // atan slope lookup table, see \cvm_atan_slope
cvk_tl_t *invert; // abs-in-[0,1]-or-greater lookup table, see \cvm_atan_s_01
cvk_tl_t *pos_neg_table; // positive(>=0)/negative(<0) lookup table, see \cvm_pos_neg_tbl
cvk_tl_t *reciprocal_table_answer; // reciprocal bf16 exponent-part table
cvk_tl_t *reciprocal_table_answer_mantissa; // reciprocal bf16 fraction-part table
cvk_tl_t *sqrt_table_answer; // sqrt bf16 exponent-part table
cvk_tl_t *sqrt_table_answer_mantissa; // sqrt bf16 fraction-part table
cvk_tl_t *idx_0_table; // zero-or-not lookup table, see \cvm_gen_0_tbl
cvk_fmt_t fmt; // tensor format such as \CVK_FMT_BF16
bool output_degree; // presumably true emits degrees, false radians -- TODO confirm
} cvm_tiu_atan2_param_t;
// argument pack for \cvm_emit_mask
// NOTE(review): the struct tag is 'cvk_tiu_mask_param' while the typedef and
// the rest of this header use the 'cvm_' prefix -- looks like a typo, but
// renaming the tag could break external references, so it is left as-is
typedef struct cvk_tiu_mask_param {
cvk_tl_t *ifmap; // input tensor in tpu memory
cvk_tl_t *ofmap; // mask result in tpu memory
cvk_tl_t *buf; // working buffer
cvk_tl_t *buf2; // working buffer
cvk_tl_t *buf3; // working buffer
cvk_tl_t *pos_neg_table; // lookup table generated by \cvm_pos_neg_tbl
cvk_tl_t *idx_0_table; // lookup table generated by \cvm_gen_0_tbl
cvk_fmt_t fmt; // tensor format such as \CVK_FMT_BF16
} cvm_tiu_mask_param_t;
// argument pack for \cvm_emit_sigmoid
typedef struct cvm_tiu_sigmoid_param {
float scale; // scale factor obtained from \cvm_sigmoid_scale
cvk_tl_t *ifmap; // input tensor in tpu memory
cvk_tl_t *buf; // working buffer
cvk_tl_t *table_answer; // sigmoid lookup table generated by \cvm_sigmoid_tbl
cvk_tl_t *table_answer_slope; // sigmoid slope table generated by \cvm_sigmoid_tbl
cvk_tl_t *ofmap; // result tensor in tpu memory
} cvm_tiu_sigmoid_param_t;
// argument pack for \cvm_emit_sqrt
typedef struct cvm_tiu_sqrt_param {
cvk_tl_t *a; // input tensor in tpu memory
cvk_tl_t *res; // result tensor in tpu memory
cvk_tl_t *buf; // working buffer
cvk_tl_t *sqrt_table_answer; // bf16 exponent-part table generated by \cvm_gen_sqrt
cvk_tl_t *sqrt_table_answer_mantissa; // bf16 fraction-part table generated by \cvm_gen_sqrt_mantissa
} cvm_tiu_sqrt_param_t;
/**
* @brief get \quantized_multiplier and its \right_shift,
* please refer
* \https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/common.h:MultiplyByQuantizedMultiplier
* for more details
*
* @param real_multiplier
* @param quantized_multiplier
* @param right_shift
*/
void cvm_get_chl_quan(float real_multiplier, uint32_t *quantized_multiplier, int *right_shift);
/**
* @brief
*
* @param c
* @param quantized_multiplier
* @param right_shift
* @param cal_data
* @param bias_data
* @param has_bias
*/
void cvm_fill_chl_quan_data(const uint32_t c, const uint32_t quantized_multiplier,
const int right_shift, uint8_t *cal_data, int32_t *bias_data,
bool has_bias);
/**
* @brief
*
* @param c
* @param quantized_multiplier
* @param right_shift
* @param bias_data
* @param has_bias
*
* @return
*/
uint8_t *cvm_get_chl_quan_data(const uint32_t c, const uint32_t quantized_multiplier,
const int right_shift, int32_t *bias_data, bool has_bias);
/**
* @brief get byte size of input \fmt
*
* @param fmt \cvk_fmt_t structure
*
* @example
* int sz = cvm_bytesize_of_fmt(CVK_FMT_BF16);
* assert (sz == 2 && "bf16 takes 2 bytes")
*
* sz = cvm_bytesize_of_fmt(CVK_FMT_I8);
* assert (sz == 1 && "int8 takes 1 bytes")
* @return byte size of fmt
*/
int cvm_bytesize_of_fmt(cvk_fmt_t fmt);
/**
* @brief reduce multiplication for h,w
* the possible shape will be <1, c, 1, 1>
* you could refer [here](https://en.wikipedia.org/wiki/Reduction_Operator) for
* more details
*
* @param cvk_ctx kernel structure
* @param [out] mp_tl_mulsum input tensor in tpu memory, the shape should be <1, c, h, w>
*
* @return status, 0 means success, other means generates command fail
*/
int cvm_reduce_hw_mul(cvk_context_t *cvk_ctx, cvk_tl_t *mp_tl_mulsum);
/**
* @brief bf16 to fp32, ONLY move bf16 to fp32 high 16 bits part,
* the memory layout as following:
*
* bf16: 0x4300
* 0 16 (bit)
* -----
* 0x4300
*
* fp32: 0x43000000
* -----
* 0 16 32
* 0x 0x43
*
* @param cvk_ctx kernel structure
* @param tg_bf16 bf16 data in device memory
* @param [out] tg_fp32 fp32 data in device memory, the w shape SHOULD be double with
* \tg_bf16->shape.w
*/
void cvm_bf16_fp32(cvk_context_t *cvk_ctx, cvk_tg_t *tg_bf16, cvk_tg_t *tg_fp32);
/**
 * @brief set value by mask(0/1)
 *
 * @param ctx kernel structure
 * @param [in] tl_ifmap image input, MUST uint8
 * @param [in] tl_buf scratch buffer
 *             NOTE(review): size/format requirements not visible here — confirm
 *             against the implementation
 * @param [in] tl_mask mask value, it MUST be 0 or 1, it will DIRTY it
 * @param [in,out] tl_ofmap image output, MUST uint8, it will DIRTY it
 *
 * @return status, 0 means success, other means generates command fail
 */
int cvm_set_image_by_u8mask(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
                            cvk_tl_t *tl_mask, cvk_tl_t *tl_ofmap);
/**
 * @brief set value by mask(0/1) by depthwise convolution
 * 0 means keep the \tl_ofmap one
 * 1 means overwrite with \tl_ifmap
 *
 * @param ctx kernel structure
 * @param [in] tl_ifmap image input, MUST uint8
 * @param [in] tl_mask mask value, it MUST be 0 or 1, it will DIRTY it
 * @param [in] tl_kernel for mask reverting(0/1->1/0); the content MUST BE -1 with int8
 * and shape SHOULD BE <1, tl_ifmap->shape.c, 1, 1>
 * @param [in] tl_bias for mask reverting(0/1->1/0); the content MUST BE 1 with int8,
 * separate high/low part, and shape SHOULD BE <2, tl_ifmap->shape.c, 1, 1>
 * @param [in,out] tl_ofmap image output, MUST uint8, it will DIRTY it
 *
 * @return status, 0 means success, other means generates command fail
 */
int cvm_set_image_by_u8mask_dp(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_mask,
                               cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, cvk_tl_t *tl_ofmap);
/**
 * @brief set value by mask and threshold, set it
 * if \tl_mask && (int8_t)\tl_update_tbl < threshold
 *
 * @param ctx kernel structure
 * @param [in] tl_ifmap image input, MUST uint8
 * @param [in] tl_buf2 scratch buffer
 *             NOTE(review): size/format requirements not visible here — confirm
 *             against the implementation
 * @param [in] tl_mask mask value, it MUST be 0 or 1, it will DIRTY it
 * @param [in] tl_update_tbl the value range will under int8, it will DIRTY it
 * @param threshold threshold that (int8_t)\tl_update_tbl is compared against
 * @param [in,out] tl_ofmap image output, MUST uint8, it will DIRTY it
 *
 * @return status, 0 means success, other means generates command fail
 */
int cvm_set_image_by_two_info_i8(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf2,
                                 cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl, uint8_t threshold,
                                 cvk_tl_t *tl_ofmap);
/**
 * @brief set value by mask and threshold by depthwise convolution, set it
 * if \tl_mask && (int8_t)\tl_update_tbl < threshold
 *
 * @param ctx kernel structure
 * @param [in] tl_ifmap image input, MUST uint8
 * @param [in] tl_kernel set all to 1 for \tl_update_tbl * 1 - threshold
 * to test larger or smaller,
 * that MUST BE 1 with int8 and shape SHOULD BE <1, tl_ifmap->shape.c, 1, 1>
 * @param [in] tl_mask mask value, it MUST be 0 or 1, it will DIRTY it
 * @param [in] tl_update_tbl the value range will under int8, it will DIRTY it
 * @param [in] tl_threshold for broadcast \threshold to bias
 * the type MUST BE int8 with separate high/low part, and it will DIRTY it
 * @param [in,out] tl_ofmap image output, MUST uint8, it will DIRTY it
 *
 * @return status, 0 means success, other means generates command fail
 */
int cvm_set_image_by_two_info_i8_dp(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_kernel,
                                    cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl,
                                    cvk_tl_t *tl_threshold, cvk_tl_t *tl_ofmap);
/**
 * @brief get abs(\tl_ifmap - \tl_ifmap2)
 *
 * @param ctx kernel structure
 * @param [in] tl_ifmap image input, MUST uint8
 * @param [in] tl_ifmap2 image input, MUST uint8, it will DIRTY it
 * @param [in] tl_buf scratch buffer
 * @param [in] tl_buf2 scratch buffer
 *             NOTE(review): buffer size/format requirements not visible here —
 *             confirm against the implementation
 * @param [out] tl_ofmap image output, MUST uint8, it will DIRTY it
 *
 * @return status, 0 means success, other means generates command fail
 */
int cvm_gen_image_diff(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2,
                       cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2, cvk_tl_t *tl_ofmap);
/**
 * @brief update \tl_ofmap by \threshold_a, \threshold_b,
 * please refer to \sample_set_val_by_mask.cpp for more details
 *
 * @param ctx kernel structure
 * @param [out] tl_mask return 0/1 mask
 * @param [in] tl_buf scratch buffer
 * @param [in] tl_buf2 scratch buffer
 * @param [in] tl_buf3 scratch buffer
 *             NOTE(review): buffer size/format requirements not visible here —
 *             confirm against the implementation
 * @param [in] tl_update_tbl u8
 * @param threshold_a first threshold — see \sample_set_val_by_mask.cpp for semantics
 * @param threshold_b second threshold — see \sample_set_val_by_mask.cpp for semantics
 * @param [in,out] tl_ofmap image output, int8
 *
 * @return status, 0 means success, other means generates command fail
 */
int cvm_update_tbl_by_threshold(cvk_context_t *ctx, cvk_tl_t *tl_mask, cvk_tl_t *tl_buf,
                                cvk_tl_t *tl_buf2, cvk_tl_t *tl_buf3, cvk_tl_t *tl_update_tbl,
                                uint8_t threshold_a, uint8_t threshold_b, cvk_tl_t *tl_ofmap);
/**
 * @brief set value by mask, update \tl_ofmap once (uint8_t)tl_update_tbl >= threshold
 *
 * @param ctx kernel structure
 * @param [in] tl_ifmap image input, MUST uint8
 * @param [in] tl_buf scratch buffer
 * @param [in] tl_buf2 scratch buffer
 *             NOTE(review): buffer size/format requirements not visible here —
 *             confirm against the implementation
 * @param [in] tl_update_tbl the value range will under uint8
 * @param threshold threshold that (uint8_t)\tl_update_tbl is compared against
 * @param [in,out] tl_ofmap image output, MUST uint8, it will DIRTY it
 *
 * @return status, 0 means success, other means generates command fail
 */
int cvm_set_image_by_two_info_u8(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
                                 cvk_tl_t *tl_buf2, cvk_tl_t *tl_update_tbl, uint8_t threshold,
                                 cvk_tl_t *tl_ofmap);
/**
 * @brief set value by mask
 * if (int8_t)\tl_update_tbl > threshold
 *
 * @param ctx kernel structure
 * @param [in] tl_ifmap image input
 * @param [in] tl_buf scratch buffer
 * @param [in] tl_buf2 scratch buffer
 *             NOTE(review): buffer size/format requirements not visible here —
 *             confirm against the implementation
 * @param [in] tl_update_tbl int8, MUST uint8, it will DIRTY
 * @param threshold threshold that (int8_t)\tl_update_tbl is compared against
 * @param w1 blend weight — NOTE(review): presumably applied to \tl_ifmap; confirm
 * @param w2 blend weight — NOTE(review): presumably applied to \tl_ofmap; confirm
 * @param [in,out] tl_ofmap image output, MUST uint8, it will DIRTY it
 *
 * @return status, 0 means success, other means generates command fail
 */
int cvm_blend_image_by_tbl(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
                           cvk_tl_t *tl_buf2, cvk_tl_t *tl_update_tbl, uint8_t threshold,
                           uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap);
/**
 * @brief get upsample 2d with nearest mode
 *
 * @param ctx kernel structure
 * @param [in] tl_input image input
 * @param [in] tl_weight upsample used that fill with 1
 * @param [out] tl_output upsampled image output
 *
 * @return status, 0 means success, other means generates command fail
 */
int cvm_upsample2d(cvk_context_t *ctx, cvk_tl_t *tl_input, cvk_tl_t *tl_weight,
                   cvk_tl_t *tl_output);
#ifdef __cplusplus
}
#endif
#endif // CVIMATH_INTERNAL_H