#ifndef __BMKERNEL_1880v2_H__
#define __BMKERNEL_1880v2_H__

#include

#ifdef __cplusplus
extern "C" {
#endif

#define BMK1880v2_TIU 0         // Tensor Instruction Unit
#define BMK1880v2_CPU 1         // CPU, reserved for common CPU ops
#define BMK1880v2_TDMA 2        // TPU DMA
#define BMK1880v2_ENGINE_NUM 3  // Number of engines

typedef struct bmk_context bmk1880v2_context_t;
typedef struct bmk_context bmk_context_t;
typedef cvk_chip_info_t bmk1880v2_chip_info_t;
typedef struct ec_desc bmk1880v2_op_t;

bmk1880v2_context_t *bmk1880v2_register(bmk_info_t *info);
void bmk1880v2_cleanup(bmk1880v2_context_t *ctx);
void bmk1880v2_reset(bmk1880v2_context_t *ctx);
uint8_t *bmk1880v2_acquire_cmdbuf(bmk1880v2_context_t *ctx, uint32_t *size);

void bmk1880v2_dmabuf_size(
    uint8_t *cmdbuf, uint32_t sz,
    uint32_t *psize, uint32_t *pmu_size);
void bmk1880v2_dmabuf_relocate(
    uint8_t *dmabuf, uint64_t dmabuf_devaddr,
    uint32_t original_size, uint32_t pmubuf_size);
void bmk1880v2_dmabuf_convert(uint8_t *cmdbuf, uint32_t sz, uint8_t *dmabuf);
void bmk1880v2_dmabuf_dump(uint8_t *dmabuf);
void bmk1880v2_arraybase_set(
    uint8_t *dmabuf,
    uint32_t arraybase0L, uint32_t arraybase1L,
    uint32_t arraybase0H, uint32_t arraybase1H);

void bmk1880v2_parallel_enable(bmk1880v2_context_t *ctx);
void bmk1880v2_set_op(bmk1880v2_context_t *ctx, void *op);
void *bmk1880v2_get_op(bmk1880v2_context_t *ctx);
void bmk1880v2_parallel_disable(bmk1880v2_context_t *ctx);
void bmk1880v2_set_layer_id(bmk1880v2_context_t *ctx, uint16_t layer_id);
uint16_t bmk1880v2_layer_id(bmk1880v2_context_t *ctx);

void bmk1880v2_create_streams(bmk1880v2_context_t *ctx, int nr_streams);
void bmk1880v2_destroy_streams(bmk1880v2_context_t *ctx);
void bmk1880v2_set_stream(bmk1880v2_context_t *ctx, int i);
void bmk1880v2_add_dependency(
    bmk1880v2_context_t *ctx,
    bmk1880v2_op_t *before,
    bmk1880v2_op_t *after);

void bmk1880v2_cpu_op(
    bmk1880v2_context_t *ctx,
    const char *op_name, char *params, int size);

/*
 * Fundamental structures for tensor and matrix
 */

typedef struct {
  uint32_t n, c, w, col;
} bmk1880v2_matrix_lmem_shape_t;

typedef struct {
  uint32_t row, col;
} bmk1880v2_matrix_tgmem_shape_t;

typedef struct {
  uint32_t n, c, h;
} bmk1880v2_matrix_lmem_stride_t;

typedef struct {
  uint32_t row;
} bmk1880v2_matrix_tgmem_stride_t;

typedef struct {
  uint32_t n, c, h, w;
} bmk1880v2_tensor_lmem_shape_t;

typedef struct {
  uint32_t n, c, h, w;
} bmk1880v2_tensor_tgmem_shape_t;

typedef struct {
  uint32_t n, c, h, w;
} bmk1880v2_tensor_lmem_stride_t;

typedef struct {
  uint32_t n, c, h;
} bmk1880v2_tensor_tgmem_stride_t;

typedef struct {
  uint32_t start_address;
  fmt_t fmt;
  fmt_t cmprs_fmt;
  bmk1880v2_tensor_lmem_shape_t shape;
  bmk1880v2_tensor_lmem_stride_t stride;
  uint8_t int8_rnd_mode;
} bmk1880v2_tensor_lmem_t;

/*
 * Convolution weight layout: (1, oc, kh*kw, ic)
 * TDMA load global (1, oc, kh*kw, ic) -> local (1, oc, kh*kw, ic)
 * TIU conv opd1 (ic, oc, kh, kw)
 */
typedef struct {
  const bmk1880v2_tensor_lmem_t *ofmap;
  const bmk1880v2_tensor_lmem_t *ifmap;
  const bmk1880v2_tensor_lmem_t *weight;
  const bmk1880v2_tensor_lmem_t *bias;
  uint8_t ins_h, ins_last_h;
  uint8_t ins_w, ins_last_w;
  uint8_t pad_top, pad_bottom;
  uint8_t pad_left, pad_right;
  uint8_t stride_h, stride_w;
  uint8_t dilation_h, dilation_w;
  int relu_enable;
  uint8_t rshift_bits;
  uint8_t ps32_mode;
  uint8_t w_is_const;
  uint16_t layer_id;
  uint8_t fp_round_typ;
  int8_t ins_val;   // padding value for int8
  uint16_t ins_fp;  // padding value for bf16
} bmk1880v2_tiu_convolution_param_t;

bmk1880v2_op_t *bmk1880v2_tiu_convolution(
    bmk1880v2_context_t *ctx,
    const bmk1880v2_tiu_convolution_param_t *p);
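/*
 * A minimal usage sketch (illustrative only, not part of this header):
 * issuing a TIU convolution on tensors already resident in local memory.
 * The handles tl_ofmap, tl_ifmap, tl_weight and tl_bias are assumed to have
 * been set up by the caller, e.g. with bmk1880v2_lmem_alloc_tensor(); the
 * field values shown are assumptions, not documented defaults.
 *
 *   bmk1880v2_tiu_convolution_param_t p = {0};  // zero all fields first
 *   p.ofmap = tl_ofmap;
 *   p.ifmap = tl_ifmap;
 *   p.weight = tl_weight;
 *   p.bias = tl_bias;
 *   p.stride_h = 1;
 *   p.stride_w = 1;
 *   p.dilation_h = 1;
 *   p.dilation_w = 1;
 *   p.relu_enable = 1;
 *   p.rshift_bits = 0;
 *   bmk1880v2_op_t *conv_op = bmk1880v2_tiu_convolution(ctx, &p);
 */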
typedef struct {
  const bmk1880v2_tensor_lmem_t *ofmap;
  const bmk1880v2_tensor_lmem_t *ifmap;
  const bmk1880v2_tensor_lmem_t *weight;
  const bmk1880v2_tensor_lmem_t *chl_quan_param;
  uint8_t ins_h, ins_last_h;
  uint8_t ins_w, ins_last_w;
  uint8_t pad_top, pad_bottom;
  uint8_t pad_left, pad_right;
  uint8_t stride_h, stride_w;
  uint8_t dilation_h, dilation_w;
  uint8_t has_bias;
  uint8_t relu_enable;
  uint8_t ps32_mode;
  uint8_t w_is_const;
  uint16_t layer_id;
  int8_t ins_val;   // padding value for int8
  uint16_t ins_fp;  // padding value for bf16
} bmk1880v2_tiu_convolution_qdm_param_t;

bmk1880v2_op_t *bmk1880v2_tiu_convolution_qdm(
    bmk1880v2_context_t *ctx,
    const bmk1880v2_tiu_convolution_qdm_param_t *p);

typedef struct {
  const bmk1880v2_tensor_lmem_t *ofmap;
  const bmk1880v2_tensor_lmem_t *ifmap;
  uint16_t kh, kw;
  uint8_t pad_top, pad_bottom;
  uint8_t pad_left, pad_right;
  uint8_t stride_h, stride_w;
  int8_t ins_val;   // padding value for int8
  uint16_t ins_fp;  // padding value for bf16
  uint16_t layer_id;
} bmk1880v2_tiu_max_pooling_param_t;

bmk1880v2_op_t *bmk1880v2_tiu_max_pooling(
    bmk1880v2_context_t *ctx,
    const bmk1880v2_tiu_max_pooling_param_t *p);

typedef struct {
  const bmk1880v2_tensor_lmem_t *ofmap;
  const bmk1880v2_tensor_lmem_t *ifmap;
  uint16_t kh, kw;
  uint8_t ins_h, ins_last_h;
  uint8_t ins_w, ins_last_w;
  uint8_t pad_top, pad_bottom;
  uint8_t pad_left, pad_right;
  uint8_t stride_h, stride_w;
  uint16_t avg_pooling_const;
  uint8_t rshift_bits;
  uint16_t layer_id;
  int8_t ins_val;   // padding value for int8
  uint16_t ins_fp;  // padding value for bf16
} bmk1880v2_tiu_average_pooling_param_t;

bmk1880v2_op_t *bmk1880v2_tiu_average_pooling(
    bmk1880v2_context_t *ctx,
    const bmk1880v2_tiu_average_pooling_param_t *p);

typedef struct {
  const bmk1880v2_tensor_lmem_t *ofmap;
  const bmk1880v2_tensor_lmem_t *ifmap;
  const bmk1880v2_tensor_lmem_t *weight;
  const bmk1880v2_tensor_lmem_t *bias;
  int weight_is_const;
  struct {
    int16_t val;
    int is_signed;
  } weight_const;
  uint8_t ins_h, ins_last_h;
  uint8_t ins_w, ins_last_w;
  uint8_t dilation_h, dilation_w;
  uint8_t pad_top, pad_bottom;
  uint8_t pad_left, pad_right;
  uint8_t stride_h, stride_w;
  uint8_t rshift_bits;
  int relu_enable;
  uint16_t layer_id;
  uint8_t ps32_mode;  // output fp32 result if ps32_mode == 2
  int8_t ins_val;   // padding value for int8
  uint16_t ins_fp;  // padding value for bf16
} bmk1880v2_tiu_depthwise_convolution_param_t;

bmk1880v2_op_t *bmk1880v2_tiu_depthwise_convolution(
    bmk1880v2_context_t *ctx,
    const bmk1880v2_tiu_depthwise_convolution_param_t *p);

typedef struct {
  const bmk1880v2_tensor_lmem_t *ofmap;
  const bmk1880v2_tensor_lmem_t *ifmap;
  const bmk1880v2_tensor_lmem_t *weight;
  const bmk1880v2_tensor_lmem_t *chl_quan_param;
  int weight_is_const;
  struct {
    int16_t val;
    int is_signed;
  } weight_const;
  uint8_t ins_h, ins_last_h;
  uint8_t ins_w, ins_last_w;
  uint8_t dilation_h, dilation_w;
  uint8_t pad_top, pad_bottom;
  uint8_t pad_left, pad_right;
  uint8_t stride_h, stride_w;
  uint8_t has_bias;
  uint8_t relu_enable;
  uint16_t layer_id;
  int8_t ins_val;   // padding value for int8
  uint16_t ins_fp;  // padding value for bf16
} bmk1880v2_tiu_depthwise_convolution_qdm_param_t;

bmk1880v2_op_t *bmk1880v2_tiu_depthwise_convolution_qdm(
    bmk1880v2_context_t *ctx,
    const bmk1880v2_tiu_depthwise_convolution_qdm_param_t *p);
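/*
 * A minimal usage sketch (illustrative only, not part of this header): a 2x2
 * max pooling with stride 2. tl_ofmap and tl_ifmap are assumed to be valid
 * bmk1880v2_tensor_lmem_t handles of matching shapes; the padding value is
 * an assumption for int8 data, chosen so padded cells never win the max.
 *
 *   bmk1880v2_tiu_max_pooling_param_t p = {0};
 *   p.ofmap = tl_ofmap;
 *   p.ifmap = tl_ifmap;
 *   p.kh = 2;
 *   p.kw = 2;
 *   p.stride_h = 2;
 *   p.stride_w = 2;
 *   p.ins_val = -128;  // assumed int8 padding value
 *   bmk1880v2_tiu_max_pooling(ctx, &p);
 */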
typedef struct {
  const bmk1880v2_matrix_lmem_t *res;
  const bmk1880v2_matrix_lmem_t *left;
  const bmk1880v2_matrix_lmem_t *right;
  const bmk1880v2_matrix_lmem_t *bias;
  uint8_t lshift_bits;
  uint8_t rshift_bits;
  int res_is_int8;
  int relu_enable;
  int add_result;
  uint8_t ps32_mode;
  uint16_t layer_id;
} bmk1880v2_tiu_matrix_multiplication_param_t;

bmk1880v2_op_t *bmk1880v2_tiu_matrix_multiplication(
    bmk1880v2_context_t *ctx,
    const bmk1880v2_tiu_matrix_multiplication_param_t *p);

typedef struct {
  const bmk1880v2_matrix_lmem_t *res;
  const bmk1880v2_matrix_lmem_t *left;
  const bmk1880v2_matrix_lmem_t *right;
  const bmk1880v2_matrix_lmem_t *bias;
  uint8_t lshift_bits;
  uint8_t rshift_bits;
  int res_is_int8;
  int relu_enable;
  int add_result;
  uint8_t ps32_mode;
  int32_t quan_m;
  uint16_t layer_id;
} bmk1880v2_tiu_matrix_multiplication_qdm_param_t;

bmk1880v2_op_t *bmk1880v2_tiu_matrix_multiplication_qdm(
    bmk1880v2_context_t *ctx,
    const bmk1880v2_tiu_matrix_multiplication_qdm_param_t *p);

/*
 * Helpers
 */

bmk1880v2_tensor_lmem_stride_t bmk1880v2_tensor_lmem_default_stride(
    bmk1880v2_context_t *ctx,
    bmk1880v2_tensor_lmem_shape_t s,
    fmt_t fmt_type,
    int eu_align);

bmk1880v2_tensor_tgmem_stride_t bmk1880v2_tensor_tgmem_default_stride(
    bmk1880v2_tensor_tgmem_shape_t s,
    fmt_t fmt_type);

bmk1880v2_matrix_lmem_shape_t bmk1880v2_matrix_lmem_default_shape(
    bmk1880v2_context_t *ctx,
    uint32_t row,
    uint32_t col,
    fmt_t fmt_type);

bmk1880v2_matrix_lmem_shape_t bmk1880v2_matrix_lmem_shape_t1(
    bmk1880v2_context_t *ctx,
    uint32_t len,
    fmt_t fmt_type);

bmk1880v2_matrix_lmem_stride_t bmk1880v2_matrix_lmem_default_stride(
    bmk1880v2_context_t *ctx,
    bmk1880v2_matrix_lmem_shape_t s,
    fmt_t fmt,
    int eu_align);

bmk1880v2_tensor_lmem_t *bmk1880v2_lmem_alloc_tensor(
    bmk1880v2_context_t *ctx,
    bmk1880v2_tensor_lmem_shape_t s,
    fmt_t fmt,
    int eu_align);

void bmk1880v2_lmem_init_tensor(
    bmk1880v2_context_t *ctx,
    bmk1880v2_tensor_lmem_t *tl,
    bmk1880v2_tensor_lmem_shape_t shape,
    fmt_t fmt,
    int eu_align);

bmk1880v2_tensor_lmem_t *bmk1880v2_lmem_alloc_ps32_tensor(
    bmk1880v2_context_t *ctx,
    bmk1880v2_tensor_lmem_shape_t s,
    fmt_t fmt,
    int eu_align);

void bmk1880v2_lmem_free_tensor(
    bmk1880v2_context_t *ctx,
    const bmk1880v2_tensor_lmem_t *t);

bmk1880v2_matrix_lmem_t *bmk1880v2_lmem_alloc_matrix(
    bmk1880v2_context_t *ctx,
    bmk1880v2_matrix_lmem_shape_t s,
    fmt_t fmt,
    int eu_align);

void bmk1880v2_lmem_init_matrix(
    bmk1880v2_context_t *ctx,
    bmk1880v2_matrix_lmem_t *ml,
    bmk1880v2_matrix_lmem_shape_t shape,
    fmt_t fmt,
    int eu_align);

bmk1880v2_matrix_lmem_t *bmk1880v2_lmem_alloc_ps32_matrix(
    bmk1880v2_context_t *ctx,
    bmk1880v2_matrix_lmem_shape_t s,
    fmt_t fmt,
    int eu_align);

void bmk1880v2_lmem_free_matrix(
    bmk1880v2_context_t *ctx,
    const bmk1880v2_matrix_lmem_t *t);

uint32_t bmk1880v2_lmem_tensor_to_size(
    bmk1880v2_context_t *ctx,
    bmk1880v2_tensor_lmem_shape_t s,
    fmt_t fmt,
    int eu_align);

uint32_t bmk1880v2_lmem_matrix_to_size(
    bmk1880v2_context_t *ctx,
    bmk1880v2_matrix_lmem_shape_t s,
    fmt_t fmt,
    int eu_align);

uint32_t bmk1880v2_lmem_ps32_matrix_to_size(
    bmk1880v2_context_t *ctx,
    bmk1880v2_matrix_lmem_shape_t s,
    fmt_t fmt,
    int eu_align);

#include "non_atomic.h"

#ifdef __cplusplus
}
#endif

#endif /* __BMKERNEL_1880V2_H__ */
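/*
 * A minimal usage sketch (illustrative only, not part of this header):
 * allocating an int8 tensor in local memory with the default eu-aligned
 * stride and releasing it afterwards. FMT_I8 stands in for whichever fmt_t
 * value the including project defines for int8 data; it is an assumption
 * here, since fmt_t comes from the elided include above.
 *
 *   bmk1880v2_tensor_lmem_shape_t s = { .n = 1, .c = 32, .h = 28, .w = 28 };
 *   int eu_align = 1;
 *   bmk1880v2_tensor_lmem_t *tl =
 *       bmk1880v2_lmem_alloc_tensor(ctx, s, FMT_I8, eu_align);
 *   // ... use tl as ifmap/ofmap/weight in the TIU operations above ...
 *   bmk1880v2_lmem_free_tensor(ctx, tl);
 */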