#ifndef CVIMATH_TEST_UTIL_H #define CVIMATH_TEST_UTIL_H #include #include "cvikernel/cvikernel.h" #include "bmruntime.h" #include "bmruntime_bmkernel.h" #include #include // pow #include // uint8_t / uint16_t #include /* printf, scanf, NULL */ #include /* malloc, free, rand */ #include // strncpy // copy from lagency // TODO: move to properly header files #define __ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask)) #define ALIGN(x, a) __ALIGN_MASK(x, (__typeof__(x))(a)-1) typedef uint32_t laddr_t; typedef uint64_t gaddr_t; typedef uint32_t ctrl_t; #define CTRL_NULL 0 #define CTRL_AL (1 << 0) // alloc aligned with EU_NUM #define CTRL_TP (1 << 5) // transpose #define CTRL_NEURON (1 << 11) // mark neuron address in GDMA #define LADDR_INVALID (0xFFFFFFFF) #define GADDR_INVALID (0x000000FFFFFFFFFFULL) static inline int ceiling_func(int numerator, int denominator) { return (numerator + denominator - 1) / denominator; } static inline int ceiling_func_shift(int numerator, int shift) { return (numerator + (1 << shift) - 1) >> shift; } static inline int get_num_shift(uint64_t num) { int n = 0; while (!(num & 1)) { n++; num >>= 1; } return n; } #ifdef __cplusplus extern "C" { #endif /* * bm runtime binds with bm kernel. * cvi kernel still needs bm runtime. * * Need to create the separate function to combine bm runtime and cvi kernel. * Function with postfix _comp (compatible) for such combination. */ #define __FILENAME__ (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) /** * @brief submit command buffer * * @param rt_ctx runtime structure * @param cvk_ctx kernel structure */ void test_submit_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx); /** * @brief alloc tensor from device memory * * @param rt_ctx runtime structure * @param cvk_ctx kernel structure * @param shape tensor shape * @param fmt tensor format such as \CVK_FMT_U16 or \CVK_FMT_U8 * * @return cvk_tg_t structure */ cvk_tg_t *test_alloc_tg_mem_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, cvk_tg_shape_t shape, cvk_fmt_t fmt); /** * @brief alloc matrix from device memory * * @param rt_ctx runtime structure * @param shape matrix shape * @param fmt tensor format such as \CVK_FMT_U16 or \CVK_FMT_U8 * * @return cvk_mg_t structure */ cvk_mg_t *test_alloc_mg_mem_comp(CVI_RT_HANDLE *rt_ctx, cvk_mg_shape_t shape, cvk_fmt_t fmt); /** * @brief free tensor from device memory * * @param rt_ctx runtime structure * @param tg pointer of tg */ void test_free_tg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_tg_t *tg); /** * @brief free matrix from device memory * * @param rt_ctx runtime structure * @param mg pointer of mg */ void test_free_mg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_mg_t *mg); /** * @brief put host data to alloced tensor device memory * * @param rt_ctx runtime structure * @param tg pointer of tg * @param data[] host data */ void test_put_tg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_tg_t *tg, uint8_t data[]); /** * @brief put host data to alloced matrix device memory * * @param rt_ctx runtime structure * @param mg pointer of mg * @param data[] host data */ void test_put_mg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_mg_t *mg, uint8_t data[]); /** * @brief syntactic sugar for \test_alloc_mg_mem_comp -> \test_put_mg_mem_comp * * @param rt_ctx runtime structure * @param mg_data_format mg format such as \CVK_FMT_U16 or \CVK_FMT_U8 * @param data[] host data * * @return */ cvk_mg_t *test_put_matrix_g(CVI_RT_HANDLE *rt_ctx, const cvk_mg_shape_t shape, cvk_fmt_t mg_data_format, uint8_t data[]); /** * @brief get tensor data from device memory * * @param rt_ctx runtime structure * @param tg pointer of tg * * @return data in device memory */ uint8_t *test_get_tg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_tg_t *tg); /** * @brief get matrix data from device memory * * @param rt_ctx runtime structure * @param mg pointer of mg * * @return data in device memory */ uint8_t *test_get_mg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_mg_t *mg); /** * @brief get tensor data from tpu memory, * the data path should be tpu memory -> device memory -> host memory * * @param rt_ctx runtime structure * @param cvk_ctx kernel structure * @param tl pointer of tl * * @return data in tpu memory */ uint8_t *test_get_tensor_l2g_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, const cvk_tl_t *tl); /** * @brief get matrix data from tpu memory, * the data path should be tpu memory -> device memory -> host memory * * @param rt_ctx runtime structure * @param cvk_ctx kernel structure * @param ml pointer of ml * * @return data in tpu memory */ uint8_t *test_get_matrix_l2g_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, const cvk_ml_t *ml); /** * @brief put host data to tpu memory with tensor * the data path should be host memory -> device memory -> tpu memory * * @param rt_ctx runtime structure * @param cvk_ctx kernel structure * @param tl pointer of tl * @param data[] data in host memory */ void test_put_tensor_g2l_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, const cvk_tl_t *tl, uint8_t data[]); /** * @brief put host data to tpu memory with matrix * the data path should be host memory -> device memory -> tpu memory * * @param rt_ctx runtime structure * @param cvk_ctx kernel structure * @param ml pointer of ml * @param data[] data in host memory */ void test_put_matrix_g2l_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, const cvk_ml_t *ml, uint8_t data[]); /** * @brief alloc tensor from tpu memory * * @param cvk_ctx kernel structure * @param shape shape of tensor * @param fmt tensor format such as \CVK_FMT_U16 or \CVK_FMT_U8 * @param eu_align is align excution unit * * @return pointer of tl */ cvk_tl_t *test_alloc_tl(cvk_context_t *cvk_ctx, cvk_tl_shape_t shape, cvk_fmt_t fmt, int eu_align); /** * @brief free tpu memory with tensor * * @param cvk_ctx kernel structure * @param tl pointer of tl */ void test_free_tl(cvk_context_t *cvk_ctx, const cvk_tl_t *tl); /** * @brief a small structure for getting RT memory information */ typedef struct _AddrInfo { uint64_t phy_addr; uint64_t size_bytes; uint8_t *vir_addr; int mem; }AddrInfo; /** * @brief get tpu global memory and assign info to an structure * * @param[in] bm_ctx runtime structure * @param[out] pAddrInfo a structure for physical, virtual address */ uint8_t *test_get_vp_addr(bmctx_t *ctx, AddrInfo *pAddrInfo); /** * @brief free tpu global memory from an info structure * * @param[in] bm_ctx runtime structure * @param[in] pAddrInfo a structure for physical, virtual address */ void test_free_vp_addr(bmctx_t *ctx, AddrInfo *pAddrInfo); /** * @breif wrapper function */ // tensor in local functions // get tl size static inline uint64_t tl_shape_size(const cvk_tl_shape_t *s) { return (uint64_t)s->n * s->c * s->h * s->w; } static inline uint64_t tg_shape_size(const cvk_tg_shape_t *s) { return (uint64_t)s->n * s->c * s->h * s->w; } static inline uint64_t mg_shape_size(const cvk_mg_shape_t *s) { return (uint64_t)s->row * s->col; } static inline void free_tl(cvk_context_t *cvk_ctx, const cvk_tl_t *t) { return cvk_ctx->ops->lmem_free_tensor(cvk_ctx, t); } typedef struct { cvk_fmt_t src_fmt; cvk_fmt_t dst_fmt; } cvk_fmt_type; static inline int bitsize_of_fmt(cvk_fmt_t fmt) { switch (fmt) { case CVK_FMT_F32: case CVK_FMT_I32: return 32; case CVK_FMT_F16: case CVK_FMT_I16: case CVK_FMT_U16: case CVK_FMT_BF16: return 16; case CVK_FMT_I8: case CVK_FMT_U8: return 8; case CVK_FMT_I4: return 4; case CVK_FMT_I2: return 2; case CVK_FMT_I1: return 1; default: assert(0); return -1; } } static inline int bytesize_of_fmt(cvk_fmt_t fmt) { return bitsize_of_fmt(fmt) / 8; } static inline void tg_2_tl_shape(cvk_tl_shape_t *tl, cvk_tg_shape_t *tg) { tl->n = tg->n; tl->c = tg->c; tl->h = tg->h; tl->w = tg->w; } static inline void tl_2_tg_shape(cvk_tg_shape_t *tg, cvk_tl_shape_t *tl) { tg->n = tl->n; tg->c = tl->c; tg->h = tl->h; tg->w = tl->w; } /** * @brief init test case with runtime/kernel * * @param rt_ctx runtime structure * @param cvk_ctx kernel structure */ // static inline void _test_init(CVI_RT_HANDLE ctx, cvk_context_t **cvk_ctx) { // CVI_RT_HANDLE _ctx = (CVI_RT_HANDLE)ctx; // int ret = CVI_RT_Init(&_ctx); // if (ret != CVI_SUCCESS) { // fprintf(stderr, "init failed, err %d\n", ret); // exit(-1); // } // // int alloc_size = 0x10000; // *cvk_ctx = (cvk_context_t*) CVI_RT_RegisterKernel(_ctx, alloc_size); // printf("alloc command buffer %d bytes success\n", alloc_size); //} // static inline void _test_exit(CVI_RT_HANDLE ctx, cvk_context_t *cvk_ctx) { // CVI_RT_UnRegisterKernel(cvk_ctx); // CVI_RT_HANDLE _ctx = (CVI_RT_HANDLE)ctx; // CVI_RT_DeInit(_ctx); //} static inline void test_init(CVI_RT_HANDLE *ctx, cvk_context_t **cvk_ctx) { CVI_RT_HANDLE *_ctx = (CVI_RT_HANDLE *)ctx; int ret = CVI_RT_Init(_ctx); if (ret != CVI_SUCCESS) { fprintf(stderr, "init failed, err %d\n", ret); exit(-1); } int alloc_size = 0x100000; *cvk_ctx = (cvk_context_t *)CVI_RT_RegisterKernel(*_ctx, alloc_size); printf("alloc command buffer %d bytes success\n", alloc_size); } /** * @brief de-init with runtime/kernel * * @param rt_ctx runtime structure * @param cvk_ctx kernel structure */ static inline void test_exit(CVI_RT_HANDLE *ctx, cvk_context_t *cvk_ctx) { CVI_RT_UnRegisterKernel(cvk_ctx); CVI_RT_HANDLE *_ctx = (CVI_RT_HANDLE *)ctx; CVI_RT_DeInit(*_ctx); } // converter bf16<->int8 uint8_t convert_bf16_u8(uint16_t data); int8_t convert_bf16_s8(uint16_t data); uint16_t convert_int8_bf16(uint8_t data, uint8_t sign); uint32_t convert_fp32_u32(float fp32); float convert_hex_fp32(uint32_t hval); uint32_t convert_fp32_hex(float val); float convert_bf16_fp32(uint16_t bf16); uint16_t convert_fp32_bf16(float fp32); int set_store_feround(); void restore_feround(int round_mode); static inline void *xmalloc(size_t size) { void *p = malloc(size); if (!p) { return NULL; } return p; } #ifdef __cplusplus } #endif #endif // CVIMATH_TEST_UTIL_H