#include #include #include // calc_dilute_hw #define NPU_NUM (1 << 5) typedef cvk_tiu_depthwise_pt_convolution_param_t param_t; int random_seed; static void print_pooling_param(param_t *p) { int in = p->ifmap->shape.n; int ic = p->ifmap->shape.c; int ih = p->ifmap->shape.h; int iw = p->ifmap->shape.w; int kh = p->weight->shape.h; int kw = p->weight->shape.w; printf(" Pooling parameters:\n"); // printf(" random_seed : %d \n", random_seed); printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); printf(" opd0_sign = %d\n", p->ifmap->fmt == CVK_FMT_I8); printf(" weight = (%d, %d)\n", kh, kw); printf(" padding = (%d, %d, %d, %d)\n", p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); // printf(" ins0 = (%d, %d, %d, %d)\n", // p->ins_h, p->ins_last_h, p->ins_w, p->ins_last_w); // printf(" dilation = (%d, %d)\n",p->dilation_h, p->dilation_w); // printf(" rshift_bits = %d\n", p->rshift_bits); // printf(" relu_enable = %d\n", p->relu_enable); printf(" res0_sign = %d\n", p->ofmap->fmt == CVK_FMT_I8); } static uint16_t *alloc_input(int ic, int ih, int iw, cvk_fmt_t ifmt) { uint64_t size = ic * ih * iw; uint16_t *data = (uint16_t *)new uint16_t[(size)]; if (ifmt == CVK_FMT_BF16) { for (uint64_t i = 0; i < size; i++) { float val = 0; int RAND_MAX2 = RAND_MAX / 2; // 5 ~ -5 val = (float)(rand() - RAND_MAX2) * 5 / (float)RAND_MAX; val = i; data[i] = convert_fp32_bf16(val); } } else { uint8_t *d = (uint8_t *)data; for (uint64_t i = 0; i < size; i++) { d[i] = i % 10 * (i % 2 ? -1 : 1); } } return data; } static uint16_t *alloc_weight(int ic, int kh, int kw, cvk_fmt_t fmt) { int size = ic * kh * kw; uint16_t *data = (uint16_t *)malloc(size * sizeof(uint16_t)); // printf("weight size is %d\n", size * 2); if (fmt == CVK_FMT_BF16) { for (int i = 0; i < size; i++) { float val = 0; int RAND_MAX2 = RAND_MAX / 2; // 5 ~ -5 val = (float)(rand() - RAND_MAX2) * 5 / (float)RAND_MAX; val = i; data[i] = convert_fp32_bf16(val); } } else { uint8_t *d = (uint8_t *)data; for (int i = 0; i < size; i++) { d[i] = i % 5 * (i % 2 ? -1 : 1); } } return data; } static uint32_t *alloc_bias(int ic, cvk_fmt_t fmt) { int c = ic; uint64_t size = c; uint32_t *bias = (uint32_t *)malloc(sizeof(uint32_t) * c); if (fmt == CVK_FMT_BF16) { for (int i = 0; i < c; i++) { float val = 0; int RAND_MAX2 = RAND_MAX / 2; // 2 ~ -2 val = (float)(rand() - RAND_MAX2) * 2 / (float)RAND_MAX; val = i; bias[i] = convert_fp32_hex(val); } } else { uint16_t *d = (uint16_t *)bias; for (uint64_t i = 0; i < size; i++) { d[i] = i % 0xf * (i % 2 ? -1 : 1); } } return bias; } static uint16_t *alloc_output(int ic, int oh, int ow) { uint64_t size = ic * oh * ow; return (uint16_t *)new uint16_t[(size)]; } static inline void cvm_relu(uint16_t *buf, uint64_t size, cvk_fmt_t fmt) { if (fmt == CVK_FMT_BF16) { for (uint64_t i = 0; i < size; i++) if (convert_bf16_fp32(buf[i]) < 0) buf[i] = convert_fp32_bf16(0); } else { int8_t *buf_int8_t = (int8_t *)buf; for (uint64_t i = 0; i < size; i++) { if (buf_int8_t[i] < 0) buf_int8_t[i] = 0; } } } static int index_get(int h, int w1, int w2) { return h * w1 + w2; } int native_pooling_avg_bf16(const uint16_t *i_fmap, const void *weight, const uint32_t *bias, uint16_t *o_fmap, int input_n, int input_c, int input_h, int input_w, int kh, int kw, int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, int stride_h, int stride_w, int ins_h, int ins_w, int ins_h_last, int ins_w_last, int dh, int dw, int const_weight) { if (kh * kw <= 0) return BM_ERR_INVALID_ARGUMENT; uint16_t avg_const_weight = *(uint16_t *)weight; uint16_t *weight_arr = (uint16_t *)weight; int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); int d_kh = calc_dilute_hw(kh, dh - 1, 0, 0, 0); int d_kw = calc_dilute_hw(kw, dw - 1, 0, 0, 0); int output_h = calc_output_hw(h_after, d_kh, stride_h); int output_w = calc_output_hw(w_after, d_kw, stride_w); // printf("output_h/output_w is %d/%d\n", output_h, output_w); float *avg_pooling_mac_a = (float *)malloc(d_kh * d_kw * sizeof(float)); float *avg_pooling_mac_b = (float *)malloc(d_kh * d_kw * sizeof(float)); uint16_t *i_fmap_pad = NULL; uint16_t *i_kmap_pad = NULL; for (int n = 0; n < input_n; n++) { if (const_weight == 0) weight_arr = (uint16_t *)weight; for (int c = 0; c < input_c; ++c) { fill_pad_fmap_bf16(i_fmap, &i_fmap_pad, 0, pad_w_l, pad_w_r, pad_h_t, pad_h_b, ins_h, ins_w, ins_h_last, ins_w_last, input_h, input_w); // kernel_dilation( if (const_weight == 0) fill_pad_fmap_bf16((weight_arr), &i_kmap_pad, 0, 0, 0, 0, 0, // no padding dh - 1, dw - 1, 0, 0, kh, kw); float avg_pool_result; for (int ph = 0; ph < output_h; ++ph) { for (int pw = 0; pw < output_w; ++pw) { int hstart = ph * stride_h; int wstart = pw * stride_w; int pool_index = index_get(ph, output_w, pw); int mac_index = 0; float r = 0; for (int h = 0; h < d_kh; h++) { for (int w = 0; w < d_kw; w++) { int index = index_get((hstart + h), w_after, (w + wstart)); mac_index = h * d_kw + w; avg_pooling_mac_a[mac_index] = convert_bf16_fp32(i_fmap_pad[index]); avg_pooling_mac_b[h * d_kw + w] = const_weight ? convert_bf16_fp32(avg_const_weight) : convert_bf16_fp32(i_kmap_pad[mac_index]); #if 0 printf ("ref[ni %u][ci %u][oh/ow %u/%u][kh/kw %u/%u] o[%d]" " %.1f * %.1f + %.1f = %.1f\n", n, c, ph, pw, h, w, pool_index, avg_pooling_mac_a[mac_index], avg_pooling_mac_b[h*d_kw+w], r, r + avg_pooling_mac_a[mac_index] * avg_pooling_mac_b[h*d_kw+w]); #endif r += avg_pooling_mac_a[mac_index] * avg_pooling_mac_b[h * d_kw + w]; } } inner_float_product(avg_pooling_mac_a, avg_pooling_mac_b, d_kh * d_kw, &avg_pool_result); if (bias) { avg_pool_result += convert_hex_fp32(bias[c]); } *(o_fmap + pool_index) = convert_fp32_bf16(avg_pool_result); } } weight_arr += kh * kw; i_fmap += input_w * input_h; o_fmap += output_w * output_h; } } free(i_fmap_pad); free(i_kmap_pad); free(avg_pooling_mac_a); free(avg_pooling_mac_b); return BM_SUCCESS; } static int get_fsz(cvk_fmt_t fmt) { assert(fmt == CVK_FMT_BF16 || fmt == CVK_FMT_I8 || fmt == CVK_FMT_U8); return fmt == CVK_FMT_BF16 ? 2 : 1; } static void compare_results(param_t *p, uint16_t input[], uint16_t weight[], uint32_t bias[], uint16_t output[], uint16_t output_ref[], uint32_t org_o_shape_size, int is_valid_pack, int org_oc, int org_oh, int org_ow) { assert(input); assert(weight); (void)input; (void)weight; printf("bias at %p\n", bias); int f_sz = get_fsz(p->ofmap->fmt); if (p->relu_enable) { cvm_relu(output_ref, org_o_shape_size, p->ofmap->fmt); } int cmp_res = -1; if (!is_valid_pack) { // we reshape c with SAME mode padding with garbage // \is_valid_pack set to false means we skip garbage part int org_hw = org_oh * org_ow; int new_hw = p->ofmap->shape.h * p->ofmap->shape.w; int duplicated_c = p->ofmap->shape.c / org_oc; assert(new_hw >= org_hw / duplicated_c); int8_t *output_c = ((int8_t *)output); int8_t *output_ref_c = ((int8_t *)output_ref); for (int c = 0; c < org_oc; c++) { cmp_res = array_cmp_int8("Comparing results ...\n", output_c + c * duplicated_c * new_hw * f_sz, output_ref_c + org_hw * c * f_sz, org_hw * f_sz); if (cmp_res != 0) { break; } // printf("compare [%d] pass, org len is %u, new len is %u\n", c, // org_hw, duplicated_c * new_hw); } } else { cmp_res = array_cmp_int8("Comparing results ...\n", (int8_t *)output_ref, (int8_t *)output, org_o_shape_size * f_sz); } if (cmp_res != 0) { printf("Comparison FAILED!!!\n"); // print_pooling_param(p); exit(-1); } delete[] output_ref; } static int pooling_ih_ext(int ins_h, int ins_last_h, int pad_top, int pad_bottom, int ih) { int ins = ins_h; int ins_last = ins_last_h; int pad = pad_top + pad_bottom; return (ih - 1) * (ins + 1) + ins_last + 1 + pad; } static int pooling_iw_ext(int ins_w, int ins_last_w, int pad_left, int pad_right, int iw) { int ins = ins_w; int ins_last = ins_last_w; int pad = pad_left + pad_right; return (iw - 1) * (ins + 1) + ins_last + 1 + pad; } static int pooling_oh(int ins_h, int ins_last_h, int pad_top, int pad_bottom, int stride_h, int ih, int kh, int dh) { int ih_ext = pooling_ih_ext(ins_h, ins_last_h, pad_top, pad_bottom, ih); int d_h = (kh - 1) * dh + 1; return (ih_ext - d_h) / stride_h + 1; } static int pooling_ow(int ins_w, int ins_last_w, int pad_left, int pad_right, int stride_w, int iw, int kw, int dw) { int iw_ext = pooling_iw_ext(ins_w, ins_last_w, pad_left, pad_right, iw); int d_w = (kw - 1) * dw + 1; return (iw_ext - d_w) / stride_w + 1; } static void free_depthwise_struct(param_t *p) { free((void *)p->ofmap); free((void *)p->ifmap); free((void *)p->weight); if (p->bias) { free((void *)p->bias); } p->ofmap = NULL; p->ifmap = NULL; p->weight = NULL; p->bias = NULL; } static void free_depthwise_param(cvk_context_t *ctx, param_t *p) { if (p->ofmap) free_tl(ctx, p->ofmap); if (p->weight) free_tl(ctx, p->weight); if (p->bias) free_tl(ctx, p->bias); if (p->ifmap) free_tl(ctx, p->ifmap); } static param_t random_depthwise_param(cvk_context_t *ctx, int _ih, int _iw, int _stride_h, cvk_fmt_t _fmt) { param_t p; // retry: random_seed = clock(); srand(random_seed); int using_bias = rand() % 2; int n = rand() % 5 + 1; n = 1; int c = rand() % (3 * NPU_NUM) + 1; c = 3; int ih = rand() % 30 + 3; int iw = rand() % 30 + 6; int kh = rand() % 7 + 1; int kw = rand() % 7 + 1; p.ins_h = rand() % kh; p.ins_w = rand() % kw; p.ins_last_h = rand() % kh; p.ins_last_w = rand() % kw; p.stride_h = rand() % kh + 1; p.stride_w = rand() % kw + 1; p.pad_top = rand() % kh; p.pad_bottom = rand() % kh; p.pad_left = rand() % kw; p.pad_right = rand() % kw; p.rshift_bits = rand() % 32; p.dilation_h = rand() % 4 + 1; p.dilation_w = rand() % 4 + 1; // default cvk_fmt_t ifmt = CVK_FMT_BF16; cvk_fmt_t other_fmt = CVK_FMT_BF16; ih = 24; iw = 16; kw = 5; kh = 5; p.stride_h = 1; p.stride_w = 1; p.rshift_bits = 0; ih = _ih; p.stride_h = _stride_h; iw = _iw; ifmt = _fmt; other_fmt = CVK_FMT_I8; if (ifmt != CVK_FMT_BF16) { } else { other_fmt = CVK_FMT_BF16; } p.pad_left = 2; p.pad_right = 2; p.pad_top = 0; p.pad_bottom = 0; // TODO: pad / ins / dilation p.ins_h = 0; p.ins_last_h = 0; p.ins_w = 0; p.ins_last_w = 0; p.dilation_h = 1; p.dilation_w = 1; int oh = pooling_oh(p.ins_h, p.ins_last_h, p.pad_top, p.pad_bottom, p.stride_h, ih, kh, p.dilation_h); int ow = pooling_ow(p.ins_w, p.ins_last_w, p.pad_left, p.pad_right, p.stride_w, iw, kw, p.dilation_w); cvk_tl_shape_t ofmap_shape; ofmap_shape.n = n; ofmap_shape.c = c; ofmap_shape.h = oh; ofmap_shape.w = ow; cvk_tl_shape_t ifmap_shape; ifmap_shape.n = n; ifmap_shape.c = c; ifmap_shape.h = ih; ifmap_shape.w = iw; cvk_tl_shape_t weight_shape; weight_shape.n = 1; weight_shape.c = c; weight_shape.h = kh; weight_shape.w = kw; cvk_tl_shape_t bias_shape; bias_shape.n = 2; bias_shape.c = c; bias_shape.h = 1; bias_shape.w = 1; p.relu_enable = rand() % 2; // fake init for ref cvk_tl_t *bias, *weight, *ofmap, *ifmap; ifmap = (cvk_tl_t *)malloc(sizeof(cvk_tl_t)); if (using_bias) { bias = (cvk_tl_t *)malloc(sizeof(cvk_tl_t)); } weight = (cvk_tl_t *)malloc(sizeof(cvk_tl_t)); ofmap = (cvk_tl_t *)malloc(sizeof(cvk_tl_t)); p.bias = NULL; if (using_bias) { bias->start_address = -1; bias->fmt = other_fmt; bias->shape = bias_shape; bias->stride = ctx->ops->tl_default_stride(ctx, bias->shape, other_fmt, /*eu_align*/ 0); p.bias = bias; } weight->start_address = -1; weight->fmt = other_fmt; weight->shape = weight_shape; weight->stride = ctx->ops->tl_default_stride(ctx, weight->shape, other_fmt, /*align*/ 1); p.weight = weight; ofmap->start_address = -1; ofmap->fmt = other_fmt; ofmap->shape = ofmap_shape; ofmap->stride = ctx->ops->tl_default_stride(ctx, ofmap->shape, other_fmt, /*align*/ 1); p.ofmap = ofmap; ifmap->start_address = -1; ifmap->fmt = ifmt; ifmap->shape = ifmap_shape; ifmap->stride = ctx->ops->tl_default_stride(ctx, ifmap->shape, ifmt, /*align*/ 1); p.ifmap = ifmap; #if 0 int d_kh = calc_dilute_hw(kh, p.dilation_h - 1, 0, 0, 0); int d_kw = calc_dilute_hw(kw, p.dilation_w - 1, 0, 0, 0); if ((kh > pooling_ih_ext(&p, ih)) || (kw > pooling_iw_ext(&p, iw)) || (oh < d_kh) || (ow < d_kw) || (p.pad_top >= (1 << 4)) || (p.pad_bottom >= (1 << 4)) || (p.pad_left >= (1 << 4)) || (p.pad_right >= (1 << 4)) || !p.ofmap || !p.ifmap || !p.weight || (using_bias && !p.bias) ) { LOG(INFO) << "retry init_pooling_param"; assert(0 && "it MUST valid param pass"); goto retry; } #endif return p; } static void put_bias_tensor(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx, const cvk_tl_t *tl, uint32_t data[]) { int c = tl->shape.c; uint16_t *hi_lo = (uint16_t *)malloc(sizeof(uint16_t) * 2 * c); if (tl->fmt == CVK_FMT_BF16) { for (int i = 0; i < c; i++) { hi_lo[i] = (data[i] >> 16) & 0xffff; hi_lo[i + c] = (data[i] & 0xffff); } } else { uint8_t *hi_lo_uint8_t = (uint8_t *)hi_lo; uint16_t *data_uint16_t = (uint16_t *)data; for (int i = 0; i < c; i++) { hi_lo_uint8_t[i] = data_uint16_t[i] & 0xff; hi_lo_uint8_t[i + c] = (data_uint16_t[i] >> 8) & 0xff; } } put_bf16_tensor_g2l(ctx, bk_ctx, tl, (uint16_t *)hi_lo, tl->fmt); free(hi_lo); } /** * \brief */ static int reshape_valid_output(cvk_context_t *bk_ctx, const cvk_tl_t *ofmap, int org_oc, int org_oh, int org_ow, cvk_tl_shape_t *tl_shape, cvk_tl_stride_t *tl_load_stride, cvk_tg_shape_t *tg_shape, cvk_tg_stride_t *tg_stride, cvk_fmt_t fmt) { assert(fmt == CVK_FMT_BF16 || fmt == CVK_FMT_I8 || fmt == CVK_FMT_U8); // skip redundant one // store to sys and re-slice, maybe use next layer // sys->local skip redundant one tg_shape->n = tl_shape->n = 1; tg_shape->c = tl_shape->c = org_oc; tg_shape->h = tl_shape->h = org_oh; tg_shape->w = tl_shape->w = org_ow; cvk_tl_stride_t s = bk_ctx->ops->tl_default_stride(bk_ctx, *tl_shape, fmt, /*eu_align*/ 0); tl_load_stride->n = s.n; tl_load_stride->c = s.c; tl_load_stride->h = s.h; tl_load_stride->w = s.w; int duplicat_c = ofmap->shape.c / org_oc; tg_stride->n = tg_stride->c = duplicat_c * ofmap->shape.h * ofmap->shape.w * get_fsz(fmt); tg_stride->h = org_ow * get_fsz(fmt); return 0; } static bmerr_t init_ref(int ic, int ih, int iw, int kh, int kw, int pad_right, int pad_left, int stride_h, int stride_w, cvk_fmt_t fmt, uint16_t *input, uint16_t *weight, uint32_t *bias, uint16_t *output_ref) { bmerr_t ret; int in = 1; int ins_h = 0; int ins_w = 0; int ins_last_h = 0; int ins_last_w = 0; int dilation_h = 1; int dilation_w = 1; int pad_top = 0; int pad_bottom = 0; int rshift_bits = 0; if (fmt == CVK_FMT_BF16) { ret = native_pooling_avg_bf16(input, weight, bias ? bias : NULL, output_ref, in, ic, ih, iw, kh, kw, pad_top, pad_bottom, pad_left, pad_right, stride_h, stride_w, ins_h, ins_w, ins_last_h, ins_last_w, dilation_h, dilation_w, 0); } else { int opd0_sign = fmt == CVK_FMT_I8; int res0_sign = true; //(ofmap->fmt == CVK_FMT_I8); ret = native_pooling_ave_int8((int8_t *)input, (int8_t *)weight, bias ? (int16_t *)bias : NULL, (int8_t *)output_ref, in, ic, ih, iw, kh, kw, pad_top, pad_bottom, pad_left, pad_right, stride_h, stride_w, ins_h, ins_w, ins_last_h, ins_last_w, opd0_sign, res0_sign, rshift_bits, 0); } return ret; } static int test_depthwise(CVI_RT_HANDLE ctx, cvk_context_t *bk_ctx, int ic, int ih, int iw, int kh, int kw, int pad_right, int pad_left, int stride_h, int stride_w, bool has_bias, cvk_fmt_t ifmt) { // print_pooling_param(param); param_t param; param_t *p = ¶m; assert(ifmt == CVK_FMT_BF16 || ifmt == CVK_FMT_I8 || ifmt == CVK_FMT_U8); int in = 1; // TODO: verify dialate > 1 int dilation_h = 1; int dilation_w = 1; int relu_enable = 0; int rshift_bits = 0; // TODO: verity ins_x int org_oh = pooling_oh(0, 0, 0, 0, stride_h, ih, kh, dilation_h); int org_ow = pooling_ow(0, 0, pad_left, pad_right, stride_w, iw, kw, dilation_w); int org_oc = ic; int org_o_shape_size = in * org_oc * org_oh * org_ow; uint16_t *output; cvk_tdma_g2l_tensor_copy_param_t p1; cvk_tdma_l2g_tensor_copy_param_t p2; // weight / ofmap not support U8 format cvk_fmt_t other_fmt = ifmt == CVK_FMT_BF16 ? CVK_FMT_BF16 : CVK_FMT_I8; // alloc testbench, input/ref uint16_t *input = alloc_input(ic, ih, iw, ifmt); uint16_t *weight = alloc_weight(ic, kh, kw, ifmt); uint32_t *bias = NULL; if (has_bias) bias = alloc_bias(ic, ifmt); uint16_t *output_ref = alloc_output(ic, org_oh, org_ow); // init ref init_ref(ic, ih, iw, kh, kw, pad_right, pad_left, stride_h, stride_w, ifmt, input, weight, bias, output_ref); // assert(ret == BM_SUCCESS); // init param // TODO: verify pad_top/pad_bottom // TODO: verify ins_h_x p->pad_left = pad_left; p->pad_right = pad_right; p->pad_top = 0; p->pad_bottom = 0; p->ins_h = 0; p->ins_last_h = 0; p->ins_w = 0; p->ins_last_w = 0; p->dilation_h = dilation_h; p->dilation_w = dilation_w; p->stride_h = stride_h; p->stride_w = stride_w; p->relu_enable = relu_enable; p->rshift_bits = rshift_bits; p->bias = NULL; // prepard load / input / weight / bias / output new shape / stride cvk_tl_shape_t tl_load_shape; cvk_tl_stride_t tl_load_stride; cvk_tg_shape_t tg_shape; cvk_tg_stride_t tg_stride; cvk_tl_shape_t tl_weight_shape; cvk_tl_shape_t tl_bias_shape; cvk_tl_shape_t tl_output_shape; cvk_tl_t *tmp_tl_load; cvk_tg_t *tmp_tg; // get reshaped information int r = cvm_reshape_channel_same(bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, stride_h, stride_w, &tl_load_shape, &tl_load_stride, &tg_shape, &tg_stride, &tl_weight_shape, &tl_bias_shape, &tl_output_shape, ifmt, /*align*/ 1); if (r == -1) { printf("could not reshape it, 81\n"); free_depthwise_param(bk_ctx, p); delete[] input; free(weight); free(bias); return -1; } // prepare input tg { cvk_tg_shape_t put_tg_shape; put_tg_shape.n = in; put_tg_shape.c = ic; put_tg_shape.h = ih; put_tg_shape.w = iw; cvk_tg_t *put_tg = alloc_tg_bf16_gmem(&ctx, bk_ctx, put_tg_shape, ifmt); put_tg_bf16_gmem(&ctx, put_tg, (uint8_t *)input); free_tg_gmem(&ctx, put_tg); } // prepare load input, put to tg and load back { tmp_tl_load = alloc_tl_bf16(bk_ctx, tl_load_shape, ifmt, /*eu_align*/ 0); assert(tmp_tl_load); tmp_tg = alloc_tg_bf16_gmem(&ctx, bk_ctx, tg_shape, ifmt); tmp_tg->stride = tg_stride; p1.src = tmp_tg; p1.dst = tmp_tl_load; bk_ctx->ops->tdma_g2l_bf16_tensor_copy(bk_ctx, &p1); test_submit_comp(&ctx, bk_ctx); free_tg_gmem(&ctx, tmp_tg); // fit for hw tmp_tl_load->stride = bk_ctx->ops->tl_default_stride(bk_ctx, tmp_tl_load->shape, ifmt, /*align*/ 1); p->ifmap = tmp_tl_load; } // prepare load bias, put to tg and load back if (has_bias) { // bias must i8 cvk_fmt_t bias_fmt = ifmt == CVK_FMT_BF16 ? CVK_FMT_BF16 : CVK_FMT_I8; p->bias = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, tl_bias_shape, bias_fmt, 0); // duplicate bias and replace old uint32_t *new_bias = cvm_reshape_channel_bias((uint8_t *)bias, tl_bias_shape.n, tl_bias_shape.c, tl_bias_shape.h, tl_bias_shape.w, org_oc, ifmt); // free old one free(bias); bias = new_bias; put_bias_tensor(&ctx, bk_ctx, p->bias, bias); } // prepare load weight, put to tg and load back { p->weight = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, tl_weight_shape, other_fmt, /*align*/ 1); assert(p->weight); // duplicate kernel with c uint8_t *new_weight = cvm_reshape_channel_weight((uint8_t *)weight, tl_weight_shape.n, tl_weight_shape.c, tl_weight_shape.h, tl_weight_shape.w, org_oc, ifmt); // free old one free(weight); weight = (uint16_t *)new_weight; put_bf16_tensor_g2l(&ctx, bk_ctx, p->weight, (uint16_t *)weight, ifmt); } // prepard ofmap { // we allocate 'same' mode shape p->ofmap = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, tl_output_shape, other_fmt, /*align*/ 1); assert(p->ofmap); } // printf("p->ifmap at %p, c is %d\n", p->ifmap, tmp_tl_load->shape.c); // emit if (ifmt == CVK_FMT_BF16) { bk_ctx->ops->tiu_pt_depthwise_convolution(bk_ctx, p); } else { bk_ctx->ops->tiu_pt_depthwise_convolution(bk_ctx, p); } // output = (uint16_t *)get_bf16_tensor_l2g(&ctx, bk_ctx, p->ofmap, ifmt); // check with no pad if true int is_valid_pack = false; cvk_tl_shape_t r_ofmap_shape; cvk_tl_stride_t r_ofmap_stride; cvk_tg_shape_t r_tg_shape; cvk_tg_stride_t r_tg_stride; reshape_valid_output(bk_ctx, p->ofmap, org_oc, org_oh, org_ow, &r_ofmap_shape, &r_ofmap_stride, &r_tg_shape, &r_tg_stride, ifmt); p1.dst = p->ofmap; if (is_valid_pack) { cvk_tg_shape_t dst_shape; dst_shape.n = p->ofmap->shape.n; dst_shape.c = p->ofmap->shape.c; dst_shape.h = p->ofmap->shape.h; dst_shape.w = p->ofmap->shape.w; cvk_tg_t *cvk_tg_tmp = alloc_tg_bf16_gmem(&ctx, bk_ctx, dst_shape, ifmt); p2.src = p->ofmap; p2.dst = cvk_tg_tmp; // store for later reshape bk_ctx->ops->tdma_l2g_bf16_tensor_copy(bk_ctx, &p2); test_submit_comp(&ctx, bk_ctx); // free useless for later reallocate free_depthwise_param(bk_ctx, p); p->ofmap = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, r_ofmap_shape, ifmt, /*eu_align*/ 0); assert(p->ofmap); cvk_tg_tmp->shape = r_tg_shape; cvk_tg_tmp->stride = r_tg_stride; p1.src = cvk_tg_tmp; p1.dst = p->ofmap; bk_ctx->ops->tdma_g2l_bf16_tensor_copy(bk_ctx, &p1); free_tg_gmem(&ctx, cvk_tg_tmp); } cvk_fmt_t ofmap_fmt = ifmt == CVK_FMT_BF16 ? CVK_FMT_BF16 : CVK_FMT_I8; output = (uint16_t *)get_bf16_tensor_l2g(&ctx, bk_ctx, p1.dst, ofmap_fmt); compare_results(p, input, weight, bias, output, output_ref, org_o_shape_size, is_valid_pack, org_oc, org_oh, org_ow); // free resource if (is_valid_pack) { free_tl(bk_ctx, p->ofmap); } else { free_depthwise_param(bk_ctx, p); } delete[] input; free(weight); free(bias); free(output); return 1; } static void init_input(param_t *p, int *ic, int *ih, int *iw, int *kh, int *kw, int *pad_right, int *pad_left) { *ic = p->ifmap->shape.c; *ih = p->ifmap->shape.h; *iw = p->ifmap->shape.w; *kh = p->weight->shape.h; *kw = p->weight->shape.w; *pad_right = p->pad_right; *pad_left = p->pad_left; } static int test_depthwise_pooling(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx) { int loop = 1; int test_finished_num = 0; int ihs[] = {24, 96, 120, 480, 0}; int iws[] = {16, 17, 19, 23, 128, 256, 0}; int stride_hs[] = {3, 4, 0}; cvk_fmt_t formats[] = {CVK_FMT_I8, CVK_FMT_U8, CVK_FMT_BF16, CVK_FMT_F32}; int ic, ih, iw, kh, kw, pad_right, pad_left; cvk_fmt_t ifmt; param_t param; assert(print_pooling_param); ifmt = CVK_FMT_U8; param = random_depthwise_param(bk_ctx, 210, 640, 1, ifmt); init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, param.stride_h, param.stride_w, param.bias, ifmt); print_pooling_param(¶m); free_depthwise_struct(¶m); #if 1 param = random_depthwise_param(bk_ctx, 36, 11, 3, ifmt); init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, param.stride_h, param.stride_w, param.bias, ifmt); print_pooling_param(¶m); free_depthwise_struct(¶m); ifmt = CVK_FMT_U8; param = random_depthwise_param(bk_ctx, 24, 29, 3, ifmt); init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); free_depthwise_struct(¶m); test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, param.stride_h, param.stride_w, param.bias, ifmt); ifmt = CVK_FMT_BF16; param = random_depthwise_param(bk_ctx, 480, 53, 3, ifmt); init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); free_depthwise_struct(¶m); test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, param.stride_h, param.stride_w, param.bias, ifmt); ifmt = CVK_FMT_I8; param = random_depthwise_param(bk_ctx, 480, 61, 3, ifmt); init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); free_depthwise_struct(¶m); test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, param.stride_h, param.stride_w, param.bias, ifmt); ifmt = CVK_FMT_U8; param = random_depthwise_param(bk_ctx, 24, 17, 3, ifmt); init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); free_depthwise_struct(¶m); test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, param.stride_h, param.stride_w, param.bias, ifmt); ifmt = CVK_FMT_BF16; param = random_depthwise_param(bk_ctx, 48, 65, 3, ifmt); init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); free_depthwise_struct(¶m); test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, param.stride_h, param.stride_w, param.bias, ifmt); ifmt = CVK_FMT_I8; param = random_depthwise_param(bk_ctx, 48, 63, 3, ifmt); init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); free_depthwise_struct(¶m); test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, param.stride_h, param.stride_w, param.bias, ifmt); #endif for (int i = 0; i < loop; i++) { for (int i = 0; ihs[i] != 0; i++) { for (int j = 0; iws[j] != 0; j++) { for (int k = 0; stride_hs[k] != 0; k++) { for (int l = 0; formats[l] != 0; l++) { continue; if (ihs[i] >= 480 && formats[l] == CVK_FMT_BF16) { continue; } param = random_depthwise_param(bk_ctx, ihs[i], iws[j], stride_hs[k], formats[l]); ifmt = formats[l]; printf("test[%d] ih/iw/sh/fmt is {%d, %d, %d, %d}\n", test_finished_num, ihs[i], iws[j], stride_hs[k], formats[l]); init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); free_depthwise_struct(¶m); int r = test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, param.stride_h, param.stride_w, param.bias, ifmt); test_finished_num += r; } } } } } printf("Test finished %u\n", test_finished_num); return test_finished_num; } int main() { CVI_RT_HANDLE ctx; cvk_context_t *bk_ctx; test_init(&ctx, &bk_ctx); int round_mode; round_mode = set_store_feround(); int ret = test_depthwise_pooling(&ctx, bk_ctx); assert(ret >= 0); (void)ret; printf("pass\n"); test_exit(&ctx, bk_ctx); restore_feround(round_mode); return 0; }