#ifndef CVIMATH_INTERNAL_H
#define CVIMATH_INTERNAL_H

#include <stdbool.h>  // bool
#include <stddef.h>   // size_t

#include "cvimath.h"

// copied from legacy code
// TODO: move to proper header files
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
typedef int8_t s8;
typedef int16_t s16;
typedef int32_t s32;
typedef int64_t s64;

// round x up to the nearest multiple of n, e.g. align_up(10, 16) == 16
static inline uint64_t align_up(uint64_t x, uint64_t n) { return (x + n - 1) / n * n; }

/**
 * please refer to @example for more details
 */
#include

#define CVK_MULTIPLIER_BIAS_PACKED_DATA_SIZE 9
#define CVK_MULTIPLIER_ONLY_PACKED_DATA_SIZE 5

// public function
#ifdef __cplusplus
extern "C" {
#endif

/**
 * @brief get lookup table shape
 *
 * @param cvk_ctx kernel structure
 * @param [out] shape the table shape
 */
void cvm_table_shape(cvk_context_t *cvk_ctx, cvk_tl_shape_t *shape);

/**
 * @brief generate sqrt lookup table for bf16 exponent part
 *
 * @param [out] table_data bf16 exponent part lookup table in host
 * @param table_shape table shape
 */
void cvm_gen_sqrt(uint16_t *table_data, cvk_tl_shape_t *table_shape);

/**
 * @brief syntactic sugar for cvm_gen_sqrt/cvm_gen_sqrt_mantissa
 *
 * @param [out] sqrt_table_data bf16 exponent part lookup table in host
 * @param [out] sqrt_table_data_mantissa bf16 fraction part lookup table in host
 * @param table_shape table shape
 */
void cvm_sqrt_tbl(uint16_t *sqrt_table_data, uint16_t *sqrt_table_data_mantissa,
                  cvk_tl_shape_t *table_shape);

/**
 * @brief generate sqrt lookup table for bf16 fraction part
 *
 * @param [out] table_mantissa bf16 fraction part lookup table in host
 * @param table_shape table shape
 */
void cvm_gen_sqrt_mantissa(uint16_t *table_mantissa, cvk_tl_shape_t *table_shape);
/**
 * @brief implement sqrt in tpu memory
 *
 * @param cvk_ctx kernel structure
 * @param tl_ifmap input tensor in tpu memory
 * @param tl_buf working buffer
 * @param tbl_answer lookup table tensor for bf16 exponent part in tpu memory
 * @param tbl_answer_mantissa lookup table tensor for fraction part in tpu memory
 * @param [out] tl_ofmap_bf16 result in tpu memory
 *
 * @example
 * // 1. alloc in tpu memory
 * // 2. prepare table
 * cvm_sqrt_tbl(table_data, table_data_mantissa, &table_shape);
 * // 3. put host data to tpu memory
 * // 4. prepare command buffer
 * cvm_emit_sqrt(cvk_ctx, tl_ifmap, tl_buf, cvk_tl_table_answer, cvk_tl_table_answer_mantissa,
 *               tl_ofmap_bf16);
 * // 5. submit it
 * test_submit_comp(rt_ctx, cvk_ctx);
 *
 * // 6. get result from tpu memory
 * uint16_t* result = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16,
 *                                                    tl_ofmap_bf16->fmt);
 *
 * @return status, 0 means success, any other value means generating the command failed
 */
int cvm_emit_sqrt(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
                  cvk_tl_t *tbl_answer, cvk_tl_t *tbl_answer_mantissa, cvk_tl_t *tl_ofmap_bf16);
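/*
 * Background sketch (illustrative only, not part of the API): splitting sqrt into an exponent
 * table and a mantissa table follows the identity sqrt(m * 2^e) = sqrt(m) * 2^(e/2).
 * A plain host-side reference of the same split, for x > 0, using only <math.h>:
 *
 *   #include <math.h>
 *   static inline float sqrt_by_split(float x) {
 *     int e;
 *     float m = frexpf(x, &e);                 // x = m * 2^e, m in [0.5, 1)
 *     if (e & 1) { m *= 2.0f; e--; }           // make the exponent even
 *     return sqrtf(m) * ldexpf(1.0f, e / 2);   // sqrt(m) * 2^(e/2)
 *   }
 *
 * The bf16 tables above capture roughly the same decomposition, with the exponent and
 * fraction parts handled by separate lookups.
 */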
/**
 * @brief generate reciprocal lookup table for bf16 exponent part
 *
 * @param [out] table_data bf16 exponent part lookup table in host
 * @param table_shape table shape
 */
void cvm_gen_reciprocal(uint16_t *table_data, cvk_tl_shape_t *table_shape);

/**
 * @brief generate reciprocal lookup table for bf16 fraction part
 *
 * @param [out] table_mantissa bf16 fraction part lookup table in host
 * @param table_shape table shape
 */
void cvm_gen_reciprocal_mantissa(uint16_t *table_mantissa, cvk_tl_shape_t *table_shape);

/**
 * @brief syntactic sugar for cvm_gen_reciprocal/cvm_gen_reciprocal_mantissa
 *
 * @param [out] table_data bf16 exponent part lookup table in host
 * @param [out] table_mantissa bf16 fraction part lookup table in host
 * @param table_shape table shape
 */
void cvm_reciprocal_tbl(uint16_t *table_data, uint16_t *table_mantissa,
                        cvk_tl_shape_t *table_shape);

/**
 * @brief implement reciprocal in tpu memory
 *
 * @param cvk_ctx kernel structure
 * @param tl_ifmap input tensor in tpu memory
 * @param tl_buf working buffer
 * @param tbl_answer lookup table tensor for bf16 exponent part in tpu memory
 * @param tbl_answer_mantissa lookup table tensor for fraction part in tpu memory
 * @param [out] tl_ofmap_bf16 result in tpu memory
 *
 * @example
 * int align = 1; // align eu (execution unit)
 * // 1. alloc in tpu memory
 * // 2. prepare table
 * cvm_reciprocal_tbl(table_data, table_data_mantissa, &table_shape);
 * // 3. put host data to tpu memory
 * // 4. prepare command buffer
 * cvm_emit_reciprocal(cvk_ctx, tl_ifmap, tl_buf, cvk_tl_table_answer,
 *                     cvk_tl_table_answer_mantissa, tl_ofmap_bf16);
 *
 * // 5. submit it
 * test_submit_comp(rt_ctx, cvk_ctx);
 *
 * // 6. get result from tpu memory
 * uint16_t* result = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16,
 *                                                    tl_ofmap_bf16->fmt);
 *
 * @return status, 0 means success, any other value means generating the command failed
 */
int cvm_emit_reciprocal(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
                        cvk_tl_t *tbl_answer, cvk_tl_t *tbl_answer_mantissa,
                        cvk_tl_t *tl_ofmap_bf16);

/**
 * @brief generate sigmoid lookup table in host;
 * we use linear interpolation, which stays fairly close to the original function,
 * you can refer to [wiki](https://en.wikipedia.org/wiki/Interpolation) for more details
 *
 * @param [out] sigmoid_table_data lookup table in host
 * @param [out] sigmoid_table_data_slope slope table in host
 * @param table_shape table shape
 * @param range_start quantize range from,
 * e.g: the original input range is -127 ~ 128 and we quantize to -8 ~ 8,
 * then -8 is our \range_start and 8 is \range_end
 * @param range_end quantize range end
 */
void cvm_sigmoid_tbl(uint16_t *sigmoid_table_data, uint16_t *sigmoid_table_data_slope,
                     cvk_tl_shape_t *table_shape, int range_start, int range_end);

/**
 * @brief get scale factor from \range_start and \range_end
 *
 * @param range_start quantize range from
 * @param range_end quantize range end
 *
 * @return scale factor
 */
float cvm_sigmoid_scale(int range_start, int range_end);

/**
 * @brief get sigmoid value by linear interpolation
 *
 * @param cvk_ctx kernel structure
 * @param tl_ifmap input tensor in tpu memory
 * @param tl_buf working buffer
 * @param tl_table_answer sigmoid table in tpu memory generated by \cvm_sigmoid_tbl
 * @param tl_table_answer_slope sigmoid slope table in tpu memory generated by \cvm_sigmoid_tbl
 * @param [out] tl_ofmap_bf16 result in tpu memory
 * @param scale scale factor generated by \cvm_sigmoid_scale
 *
 * @example
 * // 1. alloc in tpu memory
 * // 2. prepare table
 * cvm_sigmoid_tbl(table_data, table_data_slope, &table_shape, range_start, range_end);
 * float scale = cvm_sigmoid_scale(range_start, range_end);
 * // 3. put host data to tpu memory
 * // 4. prepare command buffer
 * cvm_emit_sigmoid(cvk_ctx, tl_ifmap, tl_buf, cvk_tl_table_answer, cvk_tl_table_answer_slope,
 *                  tl_ofmap_bf16, scale);
 * // 5. submit it
 * test_submit_comp(rt_ctx, cvk_ctx);
 *
 * // 6. get result from tpu memory
 * uint16_t* result = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16,
 *                                                    tl_ofmap_bf16->fmt);
 *
 * @return status, 0 means success, any other value means generating the command failed
 */
int cvm_emit_sigmoid(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
                     cvk_tl_t *tl_table_answer, cvk_tl_t *tl_table_answer_slope,
                     cvk_tl_t *tl_ofmap_bf16, float scale);
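/*
 * Interpolation sketch (illustrative only, not part of the API): per table entry the sigmoid
 * 1/(1 + exp(-x)) is approximated as y0[i] + slope[i] * (x - x0[i]), i.e. plain linear
 * interpolation between sampled points. A host reference, where `step` is a hypothetical
 * sample spacing over [range_start, range_end] (the real spacing and the exact meaning of the
 * scale factor are internal to cvm_sigmoid_tbl/cvm_sigmoid_scale):
 *
 *   static float sigmoid_interp(float x, const float *y0, const float *slope,
 *                               float range_start, float step) {
 *     int i = (int)((x - range_start) / step);   // table index covering x
 *     float x0 = range_start + (float)i * step;  // left sample point of that entry
 *     return y0[i] + slope[i] * (x - x0);        // y0 + slope * dx
 *   }
 */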
/**
 * @brief General Matrix Multiplication
 * that computes \dest_gaddr = \lhs_gaddr * \rhs_gaddr
 *
 * @param cvk_ctx kernel structure
 * @param lhs_gaddr left hand side device memory address
 * @param rhs_gaddr right hand side device memory address
 * @param dest_gaddr destination device memory address
 * @param in_row \lhs_gaddr matrix row
 * @param in_col \lhs_gaddr matrix col
 * @param out_col \dest_gaddr matrix col
 * @param fmt the possible value is \CVK_FMT_BF16 or \CVK_FMT_I8 or \CVK_FMT_U8
 *
 * @example
 * // 1. alloc host memory and put it to device memory
 * // M=in_row K=in_col N=out_col
 * cvk_mg_t *mg_A = _test_put_matrix_g(&ctx, M, K, CVK_FMT_BF16, (uint8_t *)bf16_A);
 * cvk_mg_t *mg_B = _test_put_matrix_g(&ctx, K, N, CVK_FMT_BF16, (uint8_t *)bf16_B);
 * cvk_mg_t *mg_R = _test_put_matrix_g(&ctx, M * 2, N, CVK_FMT_BF16, (uint8_t *)bf16_R);
 *
 * // 2. get device address for gemm
 * gaddr_t gaddr_a = mg_A->start_address;
 * gaddr_t gaddr_b = mg_B->start_address;
 * gaddr_t gaddr_r = mg_R->start_address;
 *
 * // 3. prepare gemm descriptor
 * cvm_gemm(cvk_ctx, gaddr_a, gaddr_b, gaddr_r, M, K, N, CVK_FMT_BF16);
 *
 * // 4. submit descriptor
 * test_submit_comp(&ctx, cvk_ctx);
 *
 * // 5. get result from device to host
 * uint16_t *bf16_ref = (uint16_t *)test_get_mg_mem_comp(&ctx, mg_R);
 *
 * @return slice_num array of {M, N, K}
 */
size_t *cvm_gemm(cvk_context_t *cvk_ctx, uint64_t lhs_gaddr, uint64_t rhs_gaddr,
                 uint64_t dest_gaddr, int in_row, int in_col, int out_col, cvk_fmt_t fmt);

/**
 * @brief combine the \cvm_gemm int8 result into int32
 * the raw output separates each 32-bit result into 4 parts with bstride,
 * and we need to 'combine' them into a human-readable form;
 * for instance, the following is the raw result
 *
 *   lsb                       31                        msb
 *   0x1   0x2   0x3   0x4   0x5   0x6   0x7   0x8
 *   0x9   0xa   0xb   0xc   0xd   0xe   0xf   0x0
 *   0x11  0x12  0x13  0x14  0x15  0x16  0x17  0x18
 *   0x19  0x20  0x21  0x22  0x23  0x24  0x25  0x26
 *
 * the values combined by strategy could be column major:
 * 1. 0x19110901
 * 2. 0x20120a02
 * 3. 0x21130b03
 * and so on
 *
 * @param cvm_gemm_strategy strategy value returned from \cvm_gemm
 * @param cvm_output raw result from \cvm_gemm
 * @param [out] i32_R int32 result
 * @param M row of output matrix
 * @param N column of output matrix
 *
 * @return status, 0 means success, any other value means generating the command failed
 */
int cvm_combin_gemm_i8(size_t *cvm_gemm_strategy, uint8_t *cvm_output, uint32_t *i32_R, int M,
                       int N);
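/*
 * Combination sketch (illustrative only, simplified from the real strategy): if the raw output
 * is viewed as four byte planes, each holding one byte of every 32-bit element, element j is
 * rebuilt lsb-to-msb from the four planes:
 *
 *   #include <stdint.h>
 *   static void combine_i8_planes(const uint8_t *p0, const uint8_t *p1,
 *                                 const uint8_t *p2, const uint8_t *p3,
 *                                 uint32_t *out, int n) {
 *     for (int j = 0; j < n; j++)
 *       out[j] = (uint32_t)p0[j] | ((uint32_t)p1[j] << 8) |
 *                ((uint32_t)p2[j] << 16) | ((uint32_t)p3[j] << 24);
 *   }
 *
 * With the four rows above as p0..p3, out[0] == 0x19110901 and out[1] == 0x20120a02, matching
 * the column-major example. The actual layout depends on the slicing strategy returned by
 * \cvm_gemm, which is why the strategy array must be passed back into \cvm_combin_gemm_i8.
 */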
/**
 * @brief fp32 to bf16 format in device memory
 *
 * @param cvk_ctx kernel structure
 * @param gaddr_fp32 fp32 data with device memory address
 * @param fp32_shape fp32 tensor shape
 * @param [out] gaddr_bf16 bf16 data with device memory address
 * @param bf16_shape bf16 tensor shape
 * @param fmt tensor format such as \CVK_FMT_BF16
 *
 * @example
 * cvk_tl_shape_t s = {1, 2, 3, 4};
 * // 1. put fp32 to device memory
 * test_put_tg_mem_comp(rt_ctx, tg_with_fp32, data);
 * // 2. init bf16 tg
 * // 3. prepare command buffer
 * cvm_s2s_fp32_bf16(cvk_ctx, tg_with_fp32->start_address, tg_with_fp32->shape,
 *                   tg_with_bf16->start_address, tg_with_bf16->shape, CVK_FMT_BF16);
 * // 4. submit it
 * test_submit_comp(rt_ctx, cvk_ctx);
 * // 5. get result from device memory
 * uint16_t *dst_data = (uint16_t *)test_get_tg_mem_comp(rt_ctx, tg_with_bf16);
 *
 * @return status, 0 means success, any other value means generating the command failed
 */
int cvm_s2s_fp32_bf16(cvk_context_t *cvk_ctx, uint64_t gaddr_fp32, cvk_tg_shape_t fp32_shape,
                      uint64_t gaddr_bf16, cvk_tg_shape_t bf16_shape, cvk_fmt_t fmt);
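/*
 * Conversion sketch (illustrative only, not part of the API): bf16 is simply the upper 16 bits
 * of an IEEE-754 fp32 word, so a host-side truncating conversion looks like the snippet below.
 * This drops the low mantissa bits without rounding; the device path may round instead, so
 * treat it purely as a bit-layout illustration.
 *
 *   #include <stdint.h>
 *   #include <string.h>
 *   static inline uint16_t fp32_to_bf16_truncate(float f) {
 *     uint32_t u;
 *     memcpy(&u, &f, sizeof(u));     // reinterpret the float bits
 *     return (uint16_t)(u >> 16);    // keep sign, exponent and top 7 mantissa bits
 *   }
 */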
/**
 * @brief generate lookup table to check whether the input is 0 or not
 *
 * @param [out] table_0 lookup table for 0 or not
 * @param table_shape table shape
 */
void cvm_gen_0_tbl(uint16_t *table_0, cvk_tl_shape_t *table_shape);

// mask function
/**
 * @brief get mask value that separates 0 from non-zero
 * e.g: input = [0, 1, -1, 2] output [1, 0, 0, 0]
 * please see \cvm_emit_mask for more details
 *
 * @param cvk_ctx kernel structure
 * @param tl_ifmap input in tpu memory
 * @param tl_buf working buffer
 * @param tbl_answer lookup table for 0 or not in tpu memory, generated by \cvm_gen_0_tbl
 * @param [out] tl_ofmap_bf16 mask result in tpu memory
 * @param fmt tensor format such as \CVK_FMT_BF16
 *
 * @return status, 0 means success, any other value means generating the command failed
 */
int cvm_emit_0_idx(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
                   cvk_tl_t *tbl_answer, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);

/**
 * @brief get mask value that checks < 0
 * e.g: input = [0, 10, 6, -1, 0] output [0, 0, 0, 1, 0]
 * please see \cvm_emit_mask for more details
 *
 * @param cvk_ctx kernel structure
 * @param tl_ifmap input in tpu memory
 * @param tl_buf working buffer
 * @param tl_pos_neg_buf lookup table generated from \cvm_pos_neg_tbl
 * @param [out] tl_ofmap_bf16 mask result in tpu memory
 * @param fmt tensor format such as \CVK_FMT_BF16
 *
 * @return status, 0 means success, any other value means generating the command failed
 */
int cvm_emit_neg_idx(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
                     cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);

/**
 * @brief get mask value that checks >= 0
 * e.g: input = [0, 10, 6, -1, 0] output [0, 1, 1, 0, 0]
 * please see \cvm_emit_mask for more details
 *
 * @param cvk_ctx kernel structure
 * @param tl_ifmap input in tpu memory
 * @param tl_buf working buffer
 * @param tl_pos_neg_buf lookup table generated from \cvm_pos_neg_tbl
 * @param [out] tl_ofmap_bf16 mask result in tpu memory
 * @param fmt tensor format such as \CVK_FMT_BF16
 *
 * @return status, 0 means success, any other value means generating the command failed
 */
int cvm_emit_pos_idx(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
                     cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);

/**
 * @brief invert a 0/1 input
 * e.g: input = [0, 1, 1, 1, 0] output [1, 0, 0, 0, 1]
 *
 * @param cvk_ctx kernel structure
 * @param tl_ifmap input in tpu memory
 * @param tl_buf working buffer
 * @param [out] tl_ofmap_bf16 mask result in tpu memory
 * @param fmt tensor format such as \CVK_FMT_BF16
 *
 * @return status, 0 means success, any other value means generating the command failed
 */
int cvm_emit_0_1_revert_input(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
                              cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);

// mask enum define
enum CVM_MASK_TYPE {
  CVM_MASK_TYPE_GT_0 = 0,  // remain > 0
  CVM_MASK_TYPE_GE_0,      // remain >= 0
  CVM_MASK_TYPE_EQ_0,      // remain = 0
  CVM_MASK_TYPE_LT_0,      // remain < 0
  CVM_MASK_TYPE_LE_0,      // remain <= 0
  CVM_MASK_MAX
};

/**
 * @brief get mask for the given \CVM_MASK_TYPE case
 *
 * @param cvk_ctx kernel structure
 * @param tl_ifmap input in tpu memory
 * @param tl_buf working buffer
 * @param tl_buf2 working buffer
 * @param tl_buf3 working buffer
 * @param tl_pos_neg_table lookup table generated from \cvm_pos_neg_tbl
 * @param tl_0_idx_table lookup table for 0 or not in tpu memory generated by \cvm_gen_0_tbl
 * @param [out] tl_ofmap_bf16 mask result in tpu memory
 * @param fmt tensor format such as \CVK_FMT_BF16
 * @param mask \CVM_MASK_TYPE
 *
 * @example
 * // 1. alloc in tpu memory
 * // 2. prepare table
 * cvm_gen_0_tbl(idx_0_table_data, &table_shape);
 * cvm_pos_neg_tbl(table_data_atan_pos_neg, &table_shape);
 * // 3. put host data to tpu memory
 * // 4. prepare command buffer
 * cvm_emit_mask(cvk_ctx,
 *               tl_ifmap,                        // input
 *               tl_buf, tl_buf2, tl_buf4,        // tmp buffer
 *               tl_pos_neg_buf, tl_0_idx_table,  // lookup table
 *               tl_ofmap_bf16,                   // output
 *               fmt, mode);
 *
 * // 5. submit it
 * test_submit_comp(rt_ctx, cvk_ctx);
 *
 * // 6. get result from tpu memory
 * uint16_t* result = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16,
 *                                                    tl_ofmap_bf16->fmt);
 *
 * @return status, 0 means success, any other value means generating the command failed
 */
int cvm_emit_mask(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2,
                  cvk_tl_t *tl_buf3, cvk_tl_t *tl_pos_neg_table, cvk_tl_t *tl_0_idx_table,
                  cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt, enum CVM_MASK_TYPE mask);
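/*
 * Semantics sketch (illustrative only, not part of the API): a host-side reference of the
 * element-wise behaviour suggested by the enum comments and the examples above, i.e. each
 * output element is 1 where the predicate holds and 0 elsewhere:
 *
 *   static inline float mask_ref(float x, enum CVM_MASK_TYPE mode) {
 *     switch (mode) {
 *       case CVM_MASK_TYPE_GT_0: return x >  0 ? 1.0f : 0.0f;
 *       case CVM_MASK_TYPE_GE_0: return x >= 0 ? 1.0f : 0.0f;
 *       case CVM_MASK_TYPE_EQ_0: return x == 0 ? 1.0f : 0.0f;
 *       case CVM_MASK_TYPE_LT_0: return x <  0 ? 1.0f : 0.0f;
 *       case CVM_MASK_TYPE_LE_0: return x <= 0 ? 1.0f : 0.0f;
 *       default:                 return 0.0f;
 *     }
 *   }
 *
 * The bf16 implementation approximates this with lookup tables, so exact behaviour at the
 * boundaries may differ slightly.
 */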
/**
 * @brief generate lookup table for atan by degree
 *
 * @param [out] table_data_y0 atan-by-degree lookup table in host
 * @param table_shape table shape
 */
void cvm_atan_fast_degree_y0(uint16_t *table_data_y0, cvk_tl_shape_t *table_shape);

/**
 * @brief generate lookup table to check whether the absolute value is in [0,1] or > 1,
 * used by atan2; values in [0,1] map to 1 and values > 1 map to -1
 *
 * @param [out] table_invert lookup table in host
 * @param table_shape table shape
 */
void cvm_atan_s_01(uint16_t *table_invert, cvk_tl_shape_t *table_shape);

/**
 * @brief generate table to check whether the input value is positive (>=0) or negative (<0)
 * by lookup table; 'pos_neg' means positive (>=0) maps to 1 and negative (<0) maps to -1
 *
 * @param [out] table_pos_neg lookup table in host
 * @param table_shape table shape
 */
void cvm_pos_neg_tbl(uint16_t *table_pos_neg, cvk_tl_shape_t *table_shape);

// deprecated duplicate of \cvm_pos_neg_tbl
void cvm_atan_pos_neg(uint16_t *table_pos_neg, cvk_tl_shape_t *table_shape);

/**
 * @brief generate atan answer by lookup table,
 * please refer to [git](https://github.com/xiezhq-hermann/atan_lookup) for more details
 *
 * @param [out] table_data_y0 atan answer lookup table in host
 * @param table_shape table shape
 */
void cvm_atan_y0(uint16_t *table_data_y0, cvk_tl_shape_t *table_shape);

/**
 * @brief generate atan slope data, for better accuracy
 *
 * @param [out] table_slope atan slope lookup table in host
 * @param table_shape table shape
 */
void cvm_atan_slope(uint16_t *table_slope, cvk_tl_shape_t *table_shape);

/**
 * @brief syntactic sugar for cvm_atan_y0/cvm_atan_slope/cvm_atan_s_01/cvm_pos_neg_tbl
 *
 * @param [out] table_data_atan_y0 atan answer lookup table in host
 * @param [out] table_data_atan_slope atan slope lookup table in host
 * @param [out] table_data_atan_invert lookup table in host
 * @param [out] table_data_atan_pos_neg lookup table in host
 * @param table_shape table shape
 */
void cvm_atan_tbl(uint16_t *table_data_atan_y0, uint16_t *table_data_atan_slope,
                  uint16_t *table_data_atan_invert, uint16_t *table_data_atan_pos_neg,
                  cvk_tl_shape_t *table_shape);

/**
 * @brief implement atan in tpu memory
 *
 * @param cvk_ctx kernel structure
 * @param tl_ifmap input tensor in tpu memory
 * @param tl_buf working buffer
 * @param tl_buf2 working buffer
 * @param tl_buf3 working buffer
 * @param tl_y0_buf atan lookup table in tpu memory
 * @param tl_slope_buf atan slope lookup table in tpu memory
 * @param tl_invert_buf lookup table in tpu memory
 * @param tl_pos_neg_buf lookup table in tpu memory
 * @param tl_table_answer reciprocal table for bf16 exponent part in tpu memory
 * @param tl_table_answer_mantissa reciprocal table for bf16 fraction part in tpu memory
 * @param [out] tl_ofmap_bf16 result in tpu memory
 * @param fmt tensor format such as \CVK_FMT_BF16
 *
 * @example
 * // 1. alloc in tpu memory
 * // 2.1. get reciprocal table in host
 * cvm_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, &table_shape);
 * // 2.2. get atan table in host
 * cvm_atan_tbl(table_data_atan_y0, table_data_atan_slope, table_data_atan_invert,
 *              table_data_atan_pos_neg, &table_shape);
 * // 3. put host data to tpu memory
 * // 4. prepare command buffer
 * cvm_atan_emit(cvk_ctx, tl_ifmap, tl_buf, tl_buf2, tl_buf4, tl_y0_buf,
 *               tl_slope_buf, tl_invert_buf, tl_pos_neg_buf, tl_reciprocal_table_answer,
 *               tl_reciprocal_table_answer_mantissa, tl_ofmap_bf16, fmt);
 *
 * // 5. submit it
 * test_submit_comp(rt_ctx, cvk_ctx);
 * // 6. get result from tpu memory
 * uint16_t *ofmap_data = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16,
 *                                                        tl_ofmap_bf16->fmt);
 *
 * @return status, 0 means success, any other value means generating the command failed
 */
int cvm_atan_emit(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2,
                  cvk_tl_t *tl_buf3, cvk_tl_t *tl_y0_buf, cvk_tl_t *tl_slope_buf,
                  cvk_tl_t *tl_invert_buf, cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_table_answer,
                  cvk_tl_t *tl_table_answer_mantissa, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);
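/*
 * Range-reduction sketch (illustrative only, not part of the API): table-based atan such as
 * the atan_lookup project referenced above typically tabulates only [0, 1] and folds every
 * other input onto it, which would explain why reciprocal tables also appear in the parameter
 * list. A host reference of that fold:
 *
 *   #include <math.h>
 *   // atan_table() stands for the y0/slope interpolation on [0, 1]; it is a placeholder
 *   // passed in by the caller, NOT a function provided by this header.
 *   static float atan_by_reduction(float x, float (*atan_table)(float)) {
 *     float a = fabsf(x);
 *     float r = (a <= 1.0f)
 *                   ? atan_table(a)                             // direct lookup
 *                   : (float)M_PI / 2.0f - atan_table(1.0f / a); // atan(a) = pi/2 - atan(1/a)
 *     return (x < 0.0f) ? -r : r;                               // restore the sign
 *   }
 */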
// atan2 function
/**
 * @brief syntactic sugar for generating the atan-by-degree lookup table in host
 * (cvm_atan_fast_degree_y0), cvm_atan_s_01 and cvm_pos_neg_tbl
 *
 * @param [out] table_data_atan_y0 atan answer lookup table in host
 * @param [out] table_data_atan_invert lookup table in host
 * @param [out] table_data_atan_pos_neg lookup table in host
 * @param table_shape table shape
 */
void cvm_atan_fast_degree_tbl(uint16_t *table_data_atan_y0, uint16_t *table_data_atan_invert,
                              uint16_t *table_data_atan_pos_neg, cvk_tl_shape_t *table_shape);

/**
 * @brief implement atan2 by degree in tpu memory, implemented via atan; you can refer to
 * [wiki](https://en.wikipedia.org/wiki/Atan2) for more details
 *
 * @param cvk_ctx kernel structure
 * @param y input tensor in tpu memory
 * @param x input tensor in tpu memory
 * @param tl_buf working buffer
 * @param tl_buf2 working buffer
 * @param tl_buf3 working buffer
 * @param tl_y0_buf atan2 lookup table in tpu memory
 * @param tl_invert_buf lookup table in tpu memory
 * @param tl_pos_neg_buf lookup table in tpu memory
 * @param tl_table_answer reciprocal table for bf16 exponent part in tpu memory
 * @param tl_table_answer_mantissa reciprocal table for bf16 fraction part in tpu memory
 * @param [out] tl_ofmap_bf16 result in tpu memory
 * @param fmt tensor format such as \CVK_FMT_BF16
 *
 * @example
 * // 1. alloc in tpu memory
 * // 2.1. get reciprocal table in host
 * cvm_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, &table_shape);
 * // 2.2. get atan table in host
 * cvm_atan_fast_degree_tbl(table_data_atan_y0, table_data_atan_invert,
 *                          table_data_atan_pos_neg, &table_shape);
 * // 3. put host data to tpu memory
 * // 4. prepare command buffer
 * cvm_atan2_fast_degree_emit(
 *     cvk_ctx, tl_ifmap2, tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_y0_buf,
 *     tl_invert_buf, tl_pos_neg_buf, tl_reciprocal_table_answer,
 *     tl_reciprocal_table_answer_mantissa, tl_ofmap_bf16, fmt);
 *
 * // 5. submit it
 * test_submit_comp(rt_ctx, cvk_ctx);
 * // 6. get result from tpu memory
 * uint16_t *ofmap_data = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16,
 *                                                        tl_ofmap_bf16->fmt);
 */
void cvm_atan2_fast_degree_emit(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *x, cvk_tl_t *tl_buf,
                                cvk_tl_t *tl_buf2, cvk_tl_t *tl_buf3, cvk_tl_t *tl_y0_buf,
                                cvk_tl_t *tl_invert_buf, cvk_tl_t *tl_pos_neg_buf,
                                cvk_tl_t *tl_table_answer, cvk_tl_t *tl_table_answer_mantissa,
                                cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);

/**
 * @brief implement atan2 in tpu memory, implemented via atan; you can refer to
 * [wiki](https://en.wikipedia.org/wiki/Atan2) for more details
 *
 * @param cvk_ctx kernel structure
 * @param y input tensor in tpu memory
 * @param x input tensor in tpu memory
 * @param tl_buf working buffer
 * @param tl_buf2 working buffer
 * @param tl_buf3 working buffer
 * @param tl_y0_buf atan2 lookup table in tpu memory
 * @param tl_invert_buf lookup table in tpu memory
 * @param tl_pos_neg_buf lookup table in tpu memory
 * @param tl_table_answer reciprocal table for bf16 exponent part in tpu memory
 * @param tl_table_answer_mantissa reciprocal table for bf16 fraction part in tpu memory
 * @param [out] tl_ofmap_bf16 result in tpu memory
 * @param fmt tensor format such as \CVK_FMT_BF16
 *
 * @example
 * // 1. alloc in tpu memory
 * // 2.1. get reciprocal table in host
 * cvm_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, &table_shape);
 * // 2.2. get atan table in host
 * cvm_atan_fast_degree_tbl(table_data_atan_y0, table_data_atan_invert,
 *                          table_data_atan_pos_neg, &table_shape);
 * // 3. put host data to tpu memory
 * // 4. prepare command buffer
 * cvm_atan2_merge_emit(
 *     cvk_ctx, tl_ifmap2, tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_y0_buf,
 *     tl_invert_buf, tl_pos_neg_buf, tl_reciprocal_table_answer,
 *     tl_reciprocal_table_answer_mantissa, tl_ofmap_bf16, fmt);
 *
 * // 5. submit it
 * test_submit_comp(rt_ctx, cvk_ctx);
 * // 6. get result from tpu memory
 * uint16_t *ofmap_data = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16,
 *                                                        tl_ofmap_bf16->fmt);
 */
void cvm_atan2_merge_emit(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *x, cvk_tl_t *tl_buf,
                          cvk_tl_t *tl_buf2, cvk_tl_t *tl_buf3, cvk_tl_t *tl_y0_buf,
                          cvk_tl_t *tl_invert_buf, cvk_tl_t *tl_pos_neg_buf,
                          cvk_tl_t *tl_table_answer, cvk_tl_t *tl_table_answer_mantissa,
                          cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);
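/*
 * Quadrant sketch (illustrative only, not part of the API): atan2 built on top of atan just
 * corrects the quadrant from the signs of x and y; the "by degree" variants additionally scale
 * the result by 180/pi. A host reference:
 *
 *   #include <math.h>
 *   static float atan2_from_atan(float y, float x) {
 *     if (x > 0.0f) return atanf(y / x);
 *     if (x < 0.0f) return atanf(y / x) + (y >= 0.0f ? (float)M_PI : -(float)M_PI);
 *     // x == 0: fall back to +-pi/2 (or 0 when y is also 0)
 *     return (y > 0.0f) ? (float)M_PI / 2.0f : (y < 0.0f ? -(float)M_PI / 2.0f : 0.0f);
 *   }
 */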
/**
 * @brief get lookup table size for host memory allocation
 *
 * @param cvk_ctx kernel structure
 * @param table_shape table shape
 * @param fmt tensor format such as \CVK_FMT_BF16
 *
 * @return table size in bytes
 */
uint64_t cvm_lut_tbl_bytesize(cvk_context_t *cvk_ctx, cvk_tl_shape_t *table_shape, cvk_fmt_t fmt);
/**
 * @brief calculate a proper reshaped channel layout for depthwise convolution;
 * currently only batch = 1 is supported
 *
 * @param cvk_ctx kernel structure
 * @param ic origin input shape of c
 * @param ih origin input shape of h
 * @param iw origin input shape of w
 * @param kh origin kernel shape of h
 * @param kw origin kernel shape of w
 * @param pad_right padding right with input
 * @param pad_left padding left with input
 * @param stride_h stride h with input
 * @param stride_w stride w with input
 * @param [out] tl_load_shape shape structure for input in tpu memory
 * @param [out] new_tl_ifmap_stride deprecated; stride for input in tpu memory
 * @param [out] new_tg_ifmap_shape shape structure for input in device memory
 * @param [out] new_tg_ifmap_stride stride structure for input in device memory
 * @param [out] new_tl_weight_shape reshaped weight in tpu memory
 * @param [out] new_tl_bias_shape reshaped bias in tpu memory
 * @param [out] new_tl_ofmap_shape reshaped output in tpu memory
 * @param fmt the possible value is \CVK_FMT_BF16 or \CVK_FMT_I8 or \CVK_FMT_U8
 * @param eu_align currently MUST be set to 1 to force alignment with hardware
 *
 * @example
 * int align = 1; // force align
 * cvk_tiu_depthwise_pt_convolution_param_t *p;
 * // 1. get reshaped shape
 * int r = cvm_reshape_channel_same(
 *     cvk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, stride_h, stride_w,
 *     &tl_load_shape, &tl_load_stride, &tg_shape, &tg_stride, &tl_weight_shape,
 *     &tl_bias_shape, &tl_output_shape, fmt, align);
 * // reshape fail
 * if (r == -1) {
 *   return -1;
 * }
 *
 * // 2.1 load input
 * // load input into tpu memory
 * int load_align = 0; // not aligned, for pack
 * tmp_tl_load = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_load_shape, fmt, load_align);
 * tmp_tg = test_alloc_tg_mem_comp(&rt_ctx, cvk_ctx, tg_shape, fmt);
 * tmp_tg->stride = tg_stride;
 * // int8
 * cvk_tdma_g2l_tensor_copy_param_t p1;
 * cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, &p1);
 * test_submit_comp(&rt_ctx, cvk_ctx);
 * test_free_tg_mem_comp(&rt_ctx, tmp_tg);
 * // fit for hw
 * int align_in_tl = 1;
 * tmp_tl_load->stride = bmk1880v2_tensor_lmem_default_stride(
 *     cvk_ctx, tmp_tl_load->shape, fmt, align_in_tl);
 * p->ifmap = tmp_tl_load;
 * // 2.2 prepare load bias, put to tg and load back
 * if (has_bias) {
 *   // bias must be i8
 *   int no_bias_align = 0;
 *   p->bias = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_bias_shape, fmt, no_bias_align);
 *
 *   // duplicate bias and replace the old one
 *   uint32_t *new_bias = cvm_reshape_channel_bias(
 *       (uint8_t *)bias, tl_bias_shape.n, tl_bias_shape.c, tl_bias_shape.h,
 *       tl_bias_shape.w, org_oc, fmt);
 *
 *   test_put_tensor_g2l_comp(&rt_ctx, cvk_ctx, p->bias, (u16 *)new_bias);
 * }
 *
 * // 2.3 prepare load weight, put to tg and load back
 * {
 *   int weight_align = 1;
 *   p->weight = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_weight_shape, fmt, weight_align);
 *   // duplicate kernel with c
 *   uint8_t *new_weight = cvm_reshape_channel_weight(
 *       (uint8_t *)weight, tl_weight_shape.n, tl_weight_shape.c, tl_weight_shape.h,
 *       tl_weight_shape.w, org_oc, fmt);
 *
 *   test_put_tensor_g2l_comp(&rt_ctx, cvk_ctx, p->weight, (u16 *)new_weight);
 * }
 *
 * // 2.4 prepare ofmap
 * {
 *   // we allocate 'same' mode shape
 *   int output_align = 1; // hw need
 *   p->ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_output_shape, fmt, output_align);
 * }
 *
 * // 3. prepare command buffer
 * cvk_ctx->ops->tiu_pt_depthwise_convolution(cvk_ctx, p);
 *
 * // 4. submit it
 * test_submit_comp(rt_ctx, cvk_ctx);
 *
 * // 5. get result from tpu memory
 * output = test_get_tensor_l2g_comp(&rt_ctx, cvk_ctx, p->ofmap, fmt);
 *
 * @return status, -1 means failure, any other value means the reshape/slice succeeded
 */
int cvm_reshape_channel_same(cvk_context_t *cvk_ctx, int ic, int ih, int iw, int kh, int kw,
                             int pad_right, int pad_left, int stride_h, int stride_w,
                             cvk_tl_shape_t *tl_load_shape, cvk_tl_stride_t *new_tl_ifmap_stride,
                             cvk_tg_shape_t *new_tg_ifmap_shape,
                             cvk_tg_stride_t *new_tg_ifmap_stride,
                             cvk_tl_shape_t *new_tl_weight_shape,
                             cvk_tl_shape_t *new_tl_bias_shape,
                             cvk_tl_shape_t *new_tl_ofmap_shape, cvk_fmt_t fmt, int eu_align);
/**
 * @brief re-construct bias content for the reshaped channel layout
 *
 * @param bias original bias in host memory
 * @param ni reshaped bias shape of n
 * @param ci reshaped bias shape of c
 * @param hi reshaped bias shape of h
 * @param wi reshaped bias shape of w
 * @param old_bias_c origin bias shape of c
 * @param fmt the possible value is \CVK_FMT_BF16 or \CVK_FMT_I8 or \CVK_FMT_U8
 *
 * @return bias host data
 */
uint32_t *cvm_reshape_channel_bias(uint8_t *bias, int ni, int ci, int hi, int wi, int old_bias_c,
                                   cvk_fmt_t fmt);

/**
 * @brief re-construct weight content for the reshaped channel layout
 *
 * @param weight original weight in host memory
 * @param ni reshaped weight shape of n
 * @param ci reshaped weight shape of c
 * @param hi reshaped weight shape of h
 * @param wi reshaped weight shape of w
 * @param old_weight_c origin weight shape of c
 * @param fmt the possible value is \CVK_FMT_BF16 or \CVK_FMT_I8 or \CVK_FMT_U8
 *
 * @return weight host data
 */
uint8_t *cvm_reshape_channel_weight(uint8_t *weight, int ni, int ci, int hi, int wi,
                                    int old_weight_c, cvk_fmt_t fmt);

typedef struct cvm_tiu_atan2_param {
  cvk_tl_t *a;
  cvk_tl_t *b;
  cvk_tl_t *res;
  cvk_tl_t *buf1;
  cvk_tl_t *buf2;
  cvk_tl_t *buf3;
  cvk_tl_t *buf4;
  cvk_tl_t *buf5;
  cvk_tl_t *buf6;
  cvk_tl_t *y0;
  cvk_tl_t *slope;
  cvk_tl_t *invert;
  cvk_tl_t *pos_neg_table;
  cvk_tl_t *reciprocal_table_answer;
  cvk_tl_t *reciprocal_table_answer_mantissa;
  cvk_tl_t *sqrt_table_answer;
  cvk_tl_t *sqrt_table_answer_mantissa;
  cvk_tl_t *idx_0_table;
  cvk_fmt_t fmt;
  bool output_degree;
} cvm_tiu_atan2_param_t;

typedef struct cvk_tiu_mask_param {
  cvk_tl_t *ifmap;
  cvk_tl_t *ofmap;
  cvk_tl_t *buf;
  cvk_tl_t *buf2;
  cvk_tl_t *buf3;
  cvk_tl_t *pos_neg_table;
  cvk_tl_t *idx_0_table;
  cvk_fmt_t fmt;
} cvm_tiu_mask_param_t;

typedef struct cvm_tiu_sigmoid_param {
  float scale;
  cvk_tl_t *ifmap;
  cvk_tl_t *buf;
  cvk_tl_t *table_answer;
  cvk_tl_t *table_answer_slope;
  cvk_tl_t *ofmap;
} cvm_tiu_sigmoid_param_t;

typedef struct cvm_tiu_sqrt_param {
  cvk_tl_t *a;
  cvk_tl_t *res;
  cvk_tl_t *buf;
  cvk_tl_t *sqrt_table_answer;
  cvk_tl_t *sqrt_table_answer_mantissa;
} cvm_tiu_sqrt_param_t;

/**
 * @brief get \quantized_multiplier and its \right_shift,
 * please refer to
 * https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/common.h:MultiplyByQuantizedMultiplier
 * for more details
 *
 * @param real_multiplier
 * @param quantized_multiplier
 * @param right_shift
 */
void cvm_get_chl_quan(float real_multiplier, uint32_t *quantized_multiplier, int *right_shift);
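/*
 * Decomposition sketch (illustrative only, not part of the API): TFLite-style requantization
 * expresses a real-valued multiplier as a fixed-point significand plus a right shift, e.g.
 * real_multiplier ~= quantized_multiplier * 2^-31 * 2^-right_shift. A hypothetical host
 * reference of that decomposition (the exact convention cvm_get_chl_quan uses, including the
 * sign of the shift, may differ):
 *
 *   #include <math.h>
 *   #include <stdint.h>
 *   static void quantize_multiplier_ref(double real_multiplier,
 *                                       uint32_t *quantized_multiplier, int *right_shift) {
 *     int exp;
 *     double m = frexp(real_multiplier, &exp);     // real = m * 2^exp, m in [0.5, 1)
 *     long long q = llround(m * (1ll << 31));      // significand as a Q31 fixed-point value
 *     if (q == (1ll << 31)) { q /= 2; exp += 1; }  // rounding may push m up to 1.0
 *     *quantized_multiplier = (uint32_t)q;
 *     *right_shift = -exp;                         // real ~= q * 2^-31 * 2^-right_shift
 *   }
 */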
/**
 * @brief fill per-channel quantization data
 *
 * @param c
 * @param quantized_multiplier
 * @param right_shift
 * @param cal_data
 * @param bias_data
 * @param has_bias
 */
void cvm_fill_chl_quan_data(const uint32_t c, const uint32_t quantized_multiplier,
                            const int right_shift, uint8_t *cal_data, int32_t *bias_data,
                            bool has_bias);

/**
 * @brief get per-channel quantization data
 *
 * @param c
 * @param quantized_multiplier
 * @param right_shift
 * @param bias_data
 * @param has_bias
 *
 * @return
 */
uint8_t *cvm_get_chl_quan_data(const uint32_t c, const uint32_t quantized_multiplier,
                               const int right_shift, int32_t *bias_data, bool has_bias);

/**
 * @brief get byte size of input \fmt
 *
 * @param fmt \cvk_fmt_t structure
 *
 * @example
 * int sz = cvm_bytesize_of_fmt(CVK_FMT_BF16);
 * assert(sz == 2 && "bf16 takes 2 bytes");
 *
 * sz = cvm_bytesize_of_fmt(CVK_FMT_I8);
 * assert(sz == 1 && "int8 takes 1 byte");
 *
 * @return byte size of fmt
 */
int cvm_bytesize_of_fmt(cvk_fmt_t fmt);

/**
 * @brief reduce by multiplication over h,w;
 * the resulting shape will be <1, c, 1, 1>,
 * you could refer [here](https://en.wikipedia.org/wiki/Reduction_Operator) for
 * more details
 *
 * @param cvk_ctx kernel structure
 * @param [in,out] mp_tl_mulsum tensor in tpu memory; the input shape should be <1, c, h, w>
 *
 * @return status, 0 means success, any other value means generating the command failed
 */
int cvm_reduce_hw_mul(cvk_context_t *cvk_ctx, cvk_tl_t *mp_tl_mulsum);

/**
 * @brief bf16 to fp32; ONLY moves the bf16 value into the high 16 bits of the fp32,
 * the memory layout is as follows:
 *
 *   bf16: 0x4300
 *     bit  0        16
 *          +--------+
 *          | 0x4300 |
 *          +--------+
 *
 *   fp32: 0x43000000
 *     bit  0        16       32
 *          +--------+--------+
 *          | 0x0000 | 0x4300 |
 *          +--------+--------+
 *
 * @param cvk_ctx kernel structure
 * @param tg_bf16 bf16 data in device memory
 * @param [out] tg_fp32 fp32 data in device memory, its w shape SHOULD be double
 * \tg_bf16->shape.w
 */
void cvm_bf16_fp32(cvk_context_t *cvk_ctx, cvk_tg_t *tg_bf16, cvk_tg_t *tg_fp32);
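/*
 * Bit-layout sketch (illustrative only, not part of the API): the same relationship on the
 * host, matching the diagram above (0x4300 becomes 0x43000000):
 *
 *   #include <stdint.h>
 *   #include <string.h>
 *   static inline float bf16_bits_to_fp32(uint16_t b) {
 *     uint32_t u = (uint32_t)b << 16;   // bf16 occupies the high half of the fp32 word
 *     float f;
 *     memcpy(&f, &u, sizeof(f));        // reinterpret the bits as a float
 *     return f;
 *   }
 */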
/**
 * @brief set value by mask (0/1)
 *
 * @param [in] tl_ifmap image input, MUST be uint8
 * @param [in] tl_mask mask value, it MUST be 0 or 1, it will be DIRTIED
 * @param [in] tl_buf working buffer
 * @param [in,out] tl_ofmap image output, MUST be uint8, it will be DIRTIED
 *
 * @return status, 0 means success, any other value means generating the command failed
 */
int cvm_set_image_by_u8mask(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
                            cvk_tl_t *tl_mask, cvk_tl_t *tl_ofmap);

/**
 * @brief set value by mask (0/1) using depthwise convolution;
 * 0 means keep the \tl_ofmap value,
 * 1 means overwrite it with \tl_ifmap
 *
 * @param [in] tl_ifmap image input, MUST be uint8
 * @param [in] tl_mask mask value, it MUST be 0 or 1, it will be DIRTIED
 * @param [in] tl_kernel for mask inversion (0/1 -> 1/0); the content MUST BE -1 in int8
 * and the shape SHOULD BE <1, tl_ifmap->shape.c, 1, 1>
 * @param [in] tl_bias for mask inversion (0/1 -> 1/0); the content MUST BE 1 in int8,
 * separated into high/low parts, and the shape SHOULD BE <2, tl_ifmap->shape.c, 1, 1>
 * @param [in,out] tl_ofmap image output, MUST be uint8, it will be DIRTIED
 *
 * @return status, 0 means success, any other value means generating the command failed
 */
int cvm_set_image_by_u8mask_dp(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_mask,
                               cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, cvk_tl_t *tl_ofmap);

/**
 * @brief set value by mask and threshold; set it
 * if \tl_mask && (int8_t)\tl_update_tbl < threshold
 *
 * @param [in] tl_ifmap image input, MUST be uint8
 * @param [in] tl_mask mask value, it MUST be 0 or 1, it will be DIRTIED
 * @param [in] tl_update_tbl the value range is within int8, it will be DIRTIED
 * @param [in,out] tl_ofmap image output, MUST be uint8, it will be DIRTIED
 *
 * @return status, 0 means success, any other value means generating the command failed
 */
int cvm_set_image_by_two_info_i8(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf2,
                                 cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl, uint8_t threshold,
                                 cvk_tl_t *tl_ofmap);

/**
 * @brief set value by mask and threshold using depthwise convolution; set it
 * if \tl_mask && (int8_t)\tl_update_tbl < threshold
 *
 * @param [in] tl_ifmap image input, MUST be uint8
 * @param [in] tl_kernel all elements set to 1 so that \tl_update_tbl * 1 - threshold can be
 * computed to test larger or smaller; the content MUST BE 1 in int8 and the
 * shape SHOULD BE <1, tl_ifmap->shape.c, 1, 1>
 * @param [in] tl_mask mask value, it MUST be 0 or 1, it will be DIRTIED
 * @param [in] tl_update_tbl the value range is within int8, it will be DIRTIED
 * @param [in] tl_threshold for broadcasting \threshold to the bias;
 * the type MUST BE int8, separated into high/low parts, and it will be DIRTIED
 * @param [in,out] tl_ofmap image output, MUST be uint8, it will be DIRTIED
 *
 * @return status, 0 means success, any other value means generating the command failed
 */
int cvm_set_image_by_two_info_i8_dp(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_kernel,
                                    cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl,
                                    cvk_tl_t *tl_threshold, cvk_tl_t *tl_ofmap);

/**
 * @brief get abs(\tl_ifmap - \tl_ifmap2)
 *
 * @param [in] tl_ifmap image input, MUST be uint8
 * @param [in] tl_ifmap2 image input, MUST be uint8, it will be DIRTIED
 * @param [out] tl_ofmap image output, MUST be uint8, it will be DIRTIED
 *
 * @return status, 0 means success, any other value means generating the command failed
 */
int cvm_gen_image_diff(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2,
                       cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2, cvk_tl_t *tl_ofmap);

/**
 * @brief update \tl_ofmap by \threshold_a and \threshold_b;
 * please refer to \sample_set_val_by_mask.cpp for more details
 *
 * @param [out] tl_mask returned 0/1 mask
 * @param [in] tl_update_tbl u8
 * @param [in,out] tl_ofmap image output, int8
 *
 * @return status, 0 means success, any other value means generating the command failed
 */
int cvm_update_tbl_by_threshold(cvk_context_t *ctx, cvk_tl_t *tl_mask, cvk_tl_t *tl_buf,
                                cvk_tl_t *tl_buf2, cvk_tl_t *tl_buf3, cvk_tl_t *tl_update_tbl,
                                uint8_t threshold_a, uint8_t threshold_b, cvk_tl_t *tl_ofmap);

/**
 * @brief set value by mask; update \tl_ofmap once (uint8_t)\tl_update_tbl >= threshold
 *
 * @param [in] tl_ifmap image input, MUST be uint8
 * @param [in] tl_update_tbl the value range is within uint8
 * @param [in,out] tl_ofmap image output, MUST be uint8, it will be DIRTIED
 *
 * @return status, 0 means success, any other value means generating the command failed
 */
int cvm_set_image_by_two_info_u8(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
                                 cvk_tl_t *tl_buf2, cvk_tl_t *tl_update_tbl, uint8_t threshold,
                                 cvk_tl_t *tl_ofmap);

/**
 * @brief set value by mask
 * if (int8_t)\tl_update_tbl > threshold
 *
 * @param [in] tl_ifmap image input
 * @param [in] tl_update_tbl update table, interpreted as int8, it will be DIRTIED
 * @param [in,out] tl_ofmap image output, MUST be uint8, it will be DIRTIED
 *
 * @return status, 0 means success, any other value means generating the command failed
 */
int cvm_blend_image_by_tbl(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
                           cvk_tl_t *tl_buf2, cvk_tl_t *tl_update_tbl, uint8_t threshold,
                           uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap);

/**
 * @brief 2d upsample with nearest mode
 *
 * @param [in] tl_input
 * @param [in] tl_weight upsample kernel, filled with 1
 * @param [out] tl_output
 *
 * @return status, 0 means success, any other value means generating the command failed
 */
int cvm_upsample2d(cvk_context_t *ctx, cvk_tl_t *tl_input, cvk_tl_t *tl_weight,
                   cvk_tl_t *tl_output);

#ifdef __cplusplus
}
#endif
#endif  // CVIMATH_INTERNAL_H