Files
SDK_SG200x_V2/cvimath/tests/cvi1835/depthwise_reshape_same.cpp
carbon 83dc4914fe add cvimath
commit ce8705f49da5e5f59c2ddb3253ef88323a0cd9c4
Author: sophgo-forum-service <forum_service@sophgo.com>
Date:   Mon May 13 14:04:10 2024 +0800

    [feat] cvimath opensource for cv18xx soc.

    - 9e8967
2024-05-31 11:54:07 +08:00

908 lines
29 KiB
C++

#include <cvimath_internal.h>
#include <test_cvikernel_util.h>
#include <test_native_ref.h> // calc_dilute_hw
#define NPU_NUM (1 << 5)
typedef cvk_tiu_depthwise_pt_convolution_param_t param_t;
int random_seed;
static void print_pooling_param(param_t *p) {
int in = p->ifmap->shape.n;
int ic = p->ifmap->shape.c;
int ih = p->ifmap->shape.h;
int iw = p->ifmap->shape.w;
int kh = p->weight->shape.h;
int kw = p->weight->shape.w;
printf(" Pooling parameters:\n");
// printf(" random_seed : %d \n", random_seed);
printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw);
printf(" opd0_sign = %d\n", p->ifmap->fmt == CVK_FMT_I8);
printf(" weight = (%d, %d)\n", kh, kw);
printf(" padding = (%d, %d, %d, %d)\n", p->pad_top, p->pad_bottom, p->pad_left, p->pad_right);
printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w);
// printf(" ins0 = (%d, %d, %d, %d)\n",
// p->ins_h, p->ins_last_h, p->ins_w, p->ins_last_w);
// printf(" dilation = (%d, %d)\n",p->dilation_h, p->dilation_w);
// printf(" rshift_bits = %d\n", p->rshift_bits);
// printf(" relu_enable = %d\n", p->relu_enable);
printf(" res0_sign = %d\n", p->ofmap->fmt == CVK_FMT_I8);
}
static uint16_t *alloc_input(int ic, int ih, int iw, cvk_fmt_t ifmt) {
uint64_t size = ic * ih * iw;
uint16_t *data = (uint16_t *)new uint16_t[(size)];
if (ifmt == CVK_FMT_BF16) {
for (uint64_t i = 0; i < size; i++) {
float val = 0;
int RAND_MAX2 = RAND_MAX / 2; // 5 ~ -5
val = (float)(rand() - RAND_MAX2) * 5 / (float)RAND_MAX;
val = i;
data[i] = convert_fp32_bf16(val);
}
} else {
uint8_t *d = (uint8_t *)data;
for (uint64_t i = 0; i < size; i++) {
d[i] = i % 10 * (i % 2 ? -1 : 1);
}
}
return data;
}
static uint16_t *alloc_weight(int ic, int kh, int kw, cvk_fmt_t fmt) {
int size = ic * kh * kw;
uint16_t *data = (uint16_t *)malloc(size * sizeof(uint16_t));
// printf("weight size is %d\n", size * 2);
if (fmt == CVK_FMT_BF16) {
for (int i = 0; i < size; i++) {
float val = 0;
int RAND_MAX2 = RAND_MAX / 2; // 5 ~ -5
val = (float)(rand() - RAND_MAX2) * 5 / (float)RAND_MAX;
val = i;
data[i] = convert_fp32_bf16(val);
}
} else {
uint8_t *d = (uint8_t *)data;
for (int i = 0; i < size; i++) {
d[i] = i % 5 * (i % 2 ? -1 : 1);
}
}
return data;
}
static uint32_t *alloc_bias(int ic, cvk_fmt_t fmt) {
int c = ic;
uint64_t size = c;
uint32_t *bias = (uint32_t *)malloc(sizeof(uint32_t) * c);
if (fmt == CVK_FMT_BF16) {
for (int i = 0; i < c; i++) {
float val = 0;
int RAND_MAX2 = RAND_MAX / 2; // 2 ~ -2
val = (float)(rand() - RAND_MAX2) * 2 / (float)RAND_MAX;
val = i;
bias[i] = convert_fp32_hex(val);
}
} else {
uint16_t *d = (uint16_t *)bias;
for (uint64_t i = 0; i < size; i++) {
d[i] = i % 0xf * (i % 2 ? -1 : 1);
}
}
return bias;
}
static uint16_t *alloc_output(int ic, int oh, int ow) {
uint64_t size = ic * oh * ow;
return (uint16_t *)new uint16_t[(size)];
}
static inline void cvm_relu(uint16_t *buf, uint64_t size, cvk_fmt_t fmt) {
if (fmt == CVK_FMT_BF16) {
for (uint64_t i = 0; i < size; i++)
if (convert_bf16_fp32(buf[i]) < 0) buf[i] = convert_fp32_bf16(0);
} else {
int8_t *buf_int8_t = (int8_t *)buf;
for (uint64_t i = 0; i < size; i++) {
if (buf_int8_t[i] < 0) buf_int8_t[i] = 0;
}
}
}
static int index_get(int h, int w1, int w2) { return h * w1 + w2; }
int native_pooling_avg_bf16(const uint16_t *i_fmap, const void *weight, const uint32_t *bias,
uint16_t *o_fmap, int input_n, int input_c, int input_h, int input_w,
int kh, int kw, int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r,
int stride_h, int stride_w, int ins_h, int ins_w, int ins_h_last,
int ins_w_last, int dh, int dw, int const_weight) {
if (kh * kw <= 0) return BM_ERR_INVALID_ARGUMENT;
uint16_t avg_const_weight = *(uint16_t *)weight;
uint16_t *weight_arr = (uint16_t *)weight;
int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b);
int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r);
int d_kh = calc_dilute_hw(kh, dh - 1, 0, 0, 0);
int d_kw = calc_dilute_hw(kw, dw - 1, 0, 0, 0);
int output_h = calc_output_hw(h_after, d_kh, stride_h);
int output_w = calc_output_hw(w_after, d_kw, stride_w);
// printf("output_h/output_w is %d/%d\n", output_h, output_w);
float *avg_pooling_mac_a = (float *)malloc(d_kh * d_kw * sizeof(float));
float *avg_pooling_mac_b = (float *)malloc(d_kh * d_kw * sizeof(float));
uint16_t *i_fmap_pad = NULL;
uint16_t *i_kmap_pad = NULL;
for (int n = 0; n < input_n; n++) {
if (const_weight == 0) weight_arr = (uint16_t *)weight;
for (int c = 0; c < input_c; ++c) {
fill_pad_fmap_bf16(i_fmap, &i_fmap_pad, 0, pad_w_l, pad_w_r, pad_h_t, pad_h_b, ins_h, ins_w,
ins_h_last, ins_w_last, input_h, input_w);
// kernel_dilation(
if (const_weight == 0)
fill_pad_fmap_bf16((weight_arr), &i_kmap_pad, 0, 0, 0, 0,
0, // no padding
dh - 1, dw - 1, 0, 0, kh, kw);
float avg_pool_result;
for (int ph = 0; ph < output_h; ++ph) {
for (int pw = 0; pw < output_w; ++pw) {
int hstart = ph * stride_h;
int wstart = pw * stride_w;
int pool_index = index_get(ph, output_w, pw);
int mac_index = 0;
float r = 0;
for (int h = 0; h < d_kh; h++) {
for (int w = 0; w < d_kw; w++) {
int index = index_get((hstart + h), w_after, (w + wstart));
mac_index = h * d_kw + w;
avg_pooling_mac_a[mac_index] = convert_bf16_fp32(i_fmap_pad[index]);
avg_pooling_mac_b[h * d_kw + w] = const_weight
? convert_bf16_fp32(avg_const_weight)
: convert_bf16_fp32(i_kmap_pad[mac_index]);
#if 0
printf ("ref[ni %u][ci %u][oh/ow %u/%u][kh/kw %u/%u] o[%d]"
" %.1f * %.1f + %.1f = %.1f\n",
n, c, ph, pw, h, w, pool_index,
avg_pooling_mac_a[mac_index], avg_pooling_mac_b[h*d_kw+w],
r, r + avg_pooling_mac_a[mac_index] * avg_pooling_mac_b[h*d_kw+w]);
#endif
r += avg_pooling_mac_a[mac_index] * avg_pooling_mac_b[h * d_kw + w];
}
}
inner_float_product(avg_pooling_mac_a, avg_pooling_mac_b, d_kh * d_kw, &avg_pool_result);
if (bias) {
avg_pool_result += convert_hex_fp32(bias[c]);
}
*(o_fmap + pool_index) = convert_fp32_bf16(avg_pool_result);
}
}
weight_arr += kh * kw;
i_fmap += input_w * input_h;
o_fmap += output_w * output_h;
}
}
free(i_fmap_pad);
free(i_kmap_pad);
free(avg_pooling_mac_a);
free(avg_pooling_mac_b);
return BM_SUCCESS;
}
static int get_fsz(cvk_fmt_t fmt) {
assert(fmt == CVK_FMT_BF16 || fmt == CVK_FMT_I8 || fmt == CVK_FMT_U8);
return fmt == CVK_FMT_BF16 ? 2 : 1;
}
static void compare_results(param_t *p, uint16_t input[], uint16_t weight[], uint32_t bias[],
uint16_t output[], uint16_t output_ref[], uint32_t org_o_shape_size,
int is_valid_pack, int org_oc, int org_oh, int org_ow) {
assert(input);
assert(weight);
(void)input;
(void)weight;
printf("bias at %p\n", bias);
int f_sz = get_fsz(p->ofmap->fmt);
if (p->relu_enable) {
cvm_relu(output_ref, org_o_shape_size, p->ofmap->fmt);
}
int cmp_res = -1;
if (!is_valid_pack) {
// we reshape c with SAME mode padding with garbage
// \is_valid_pack set to false means we skip garbage part
int org_hw = org_oh * org_ow;
int new_hw = p->ofmap->shape.h * p->ofmap->shape.w;
int duplicated_c = p->ofmap->shape.c / org_oc;
assert(new_hw >= org_hw / duplicated_c);
int8_t *output_c = ((int8_t *)output);
int8_t *output_ref_c = ((int8_t *)output_ref);
for (int c = 0; c < org_oc; c++) {
cmp_res =
array_cmp_int8("Comparing results ...\n", output_c + c * duplicated_c * new_hw * f_sz,
output_ref_c + org_hw * c * f_sz, org_hw * f_sz);
if (cmp_res != 0) {
break;
}
// printf("compare [%d] pass, org len is %u, new len is %u\n", c,
// org_hw, duplicated_c * new_hw);
}
} else {
cmp_res = array_cmp_int8("Comparing results ...\n", (int8_t *)output_ref, (int8_t *)output,
org_o_shape_size * f_sz);
}
if (cmp_res != 0) {
printf("Comparison FAILED!!!\n");
// print_pooling_param(p);
exit(-1);
}
delete[] output_ref;
}
static int pooling_ih_ext(int ins_h, int ins_last_h, int pad_top, int pad_bottom, int ih) {
int ins = ins_h;
int ins_last = ins_last_h;
int pad = pad_top + pad_bottom;
return (ih - 1) * (ins + 1) + ins_last + 1 + pad;
}
static int pooling_iw_ext(int ins_w, int ins_last_w, int pad_left, int pad_right, int iw) {
int ins = ins_w;
int ins_last = ins_last_w;
int pad = pad_left + pad_right;
return (iw - 1) * (ins + 1) + ins_last + 1 + pad;
}
static int pooling_oh(int ins_h, int ins_last_h, int pad_top, int pad_bottom, int stride_h, int ih,
int kh, int dh) {
int ih_ext = pooling_ih_ext(ins_h, ins_last_h, pad_top, pad_bottom, ih);
int d_h = (kh - 1) * dh + 1;
return (ih_ext - d_h) / stride_h + 1;
}
static int pooling_ow(int ins_w, int ins_last_w, int pad_left, int pad_right, int stride_w, int iw,
int kw, int dw) {
int iw_ext = pooling_iw_ext(ins_w, ins_last_w, pad_left, pad_right, iw);
int d_w = (kw - 1) * dw + 1;
return (iw_ext - d_w) / stride_w + 1;
}
static void free_depthwise_struct(param_t *p) {
free((void *)p->ofmap);
free((void *)p->ifmap);
free((void *)p->weight);
if (p->bias) {
free((void *)p->bias);
}
p->ofmap = NULL;
p->ifmap = NULL;
p->weight = NULL;
p->bias = NULL;
}
static void free_depthwise_param(cvk_context_t *ctx, param_t *p) {
if (p->ofmap) free_tl(ctx, p->ofmap);
if (p->weight) free_tl(ctx, p->weight);
if (p->bias) free_tl(ctx, p->bias);
if (p->ifmap) free_tl(ctx, p->ifmap);
}
static param_t random_depthwise_param(cvk_context_t *ctx, int _ih, int _iw, int _stride_h,
cvk_fmt_t _fmt) {
param_t p;
// retry:
random_seed = clock();
srand(random_seed);
int using_bias = rand() % 2;
int n = rand() % 5 + 1;
n = 1;
int c = rand() % (3 * NPU_NUM) + 1;
c = 3;
int ih = rand() % 30 + 3;
int iw = rand() % 30 + 6;
int kh = rand() % 7 + 1;
int kw = rand() % 7 + 1;
p.ins_h = rand() % kh;
p.ins_w = rand() % kw;
p.ins_last_h = rand() % kh;
p.ins_last_w = rand() % kw;
p.stride_h = rand() % kh + 1;
p.stride_w = rand() % kw + 1;
p.pad_top = rand() % kh;
p.pad_bottom = rand() % kh;
p.pad_left = rand() % kw;
p.pad_right = rand() % kw;
p.rshift_bits = rand() % 32;
p.dilation_h = rand() % 4 + 1;
p.dilation_w = rand() % 4 + 1;
// default
cvk_fmt_t ifmt = CVK_FMT_BF16;
cvk_fmt_t other_fmt = CVK_FMT_BF16;
ih = 24;
iw = 16;
kw = 5;
kh = 5;
p.stride_h = 1;
p.stride_w = 1;
p.rshift_bits = 0;
ih = _ih;
p.stride_h = _stride_h;
iw = _iw;
ifmt = _fmt;
other_fmt = CVK_FMT_I8;
if (ifmt != CVK_FMT_BF16) {
} else {
other_fmt = CVK_FMT_BF16;
}
p.pad_left = 2;
p.pad_right = 2;
p.pad_top = 0;
p.pad_bottom = 0;
// TODO: pad / ins / dilation
p.ins_h = 0;
p.ins_last_h = 0;
p.ins_w = 0;
p.ins_last_w = 0;
p.dilation_h = 1;
p.dilation_w = 1;
int oh =
pooling_oh(p.ins_h, p.ins_last_h, p.pad_top, p.pad_bottom, p.stride_h, ih, kh, p.dilation_h);
int ow =
pooling_ow(p.ins_w, p.ins_last_w, p.pad_left, p.pad_right, p.stride_w, iw, kw, p.dilation_w);
cvk_tl_shape_t ofmap_shape;
ofmap_shape.n = n;
ofmap_shape.c = c;
ofmap_shape.h = oh;
ofmap_shape.w = ow;
cvk_tl_shape_t ifmap_shape;
ifmap_shape.n = n;
ifmap_shape.c = c;
ifmap_shape.h = ih;
ifmap_shape.w = iw;
cvk_tl_shape_t weight_shape;
weight_shape.n = 1;
weight_shape.c = c;
weight_shape.h = kh;
weight_shape.w = kw;
cvk_tl_shape_t bias_shape;
bias_shape.n = 2;
bias_shape.c = c;
bias_shape.h = 1;
bias_shape.w = 1;
p.relu_enable = rand() % 2;
// fake init for ref
cvk_tl_t *bias, *weight, *ofmap, *ifmap;
ifmap = (cvk_tl_t *)malloc(sizeof(cvk_tl_t));
if (using_bias) {
bias = (cvk_tl_t *)malloc(sizeof(cvk_tl_t));
}
weight = (cvk_tl_t *)malloc(sizeof(cvk_tl_t));
ofmap = (cvk_tl_t *)malloc(sizeof(cvk_tl_t));
p.bias = NULL;
if (using_bias) {
bias->start_address = -1;
bias->fmt = other_fmt;
bias->shape = bias_shape;
bias->stride = ctx->ops->tl_default_stride(ctx, bias->shape, other_fmt, /*eu_align*/ 0);
p.bias = bias;
}
weight->start_address = -1;
weight->fmt = other_fmt;
weight->shape = weight_shape;
weight->stride = ctx->ops->tl_default_stride(ctx, weight->shape, other_fmt, /*align*/ 1);
p.weight = weight;
ofmap->start_address = -1;
ofmap->fmt = other_fmt;
ofmap->shape = ofmap_shape;
ofmap->stride = ctx->ops->tl_default_stride(ctx, ofmap->shape, other_fmt, /*align*/ 1);
p.ofmap = ofmap;
ifmap->start_address = -1;
ifmap->fmt = ifmt;
ifmap->shape = ifmap_shape;
ifmap->stride = ctx->ops->tl_default_stride(ctx, ifmap->shape, ifmt, /*align*/ 1);
p.ifmap = ifmap;
#if 0
int d_kh = calc_dilute_hw(kh, p.dilation_h - 1, 0, 0, 0);
int d_kw = calc_dilute_hw(kw, p.dilation_w - 1, 0, 0, 0);
if ((kh > pooling_ih_ext(&p, ih))
|| (kw > pooling_iw_ext(&p, iw))
|| (oh < d_kh)
|| (ow < d_kw)
|| (p.pad_top >= (1 << 4))
|| (p.pad_bottom >= (1 << 4))
|| (p.pad_left >= (1 << 4))
|| (p.pad_right >= (1 << 4))
|| !p.ofmap
|| !p.ifmap
|| !p.weight
|| (using_bias && !p.bias)
) {
LOG(INFO) << "retry init_pooling_param";
assert(0 && "it MUST valid param pass");
goto retry;
}
#endif
return p;
}
static void put_bias_tensor(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx, const cvk_tl_t *tl,
uint32_t data[]) {
int c = tl->shape.c;
uint16_t *hi_lo = (uint16_t *)malloc(sizeof(uint16_t) * 2 * c);
if (tl->fmt == CVK_FMT_BF16) {
for (int i = 0; i < c; i++) {
hi_lo[i] = (data[i] >> 16) & 0xffff;
hi_lo[i + c] = (data[i] & 0xffff);
}
} else {
uint8_t *hi_lo_uint8_t = (uint8_t *)hi_lo;
uint16_t *data_uint16_t = (uint16_t *)data;
for (int i = 0; i < c; i++) {
hi_lo_uint8_t[i] = data_uint16_t[i] & 0xff;
hi_lo_uint8_t[i + c] = (data_uint16_t[i] >> 8) & 0xff;
}
}
put_bf16_tensor_g2l(ctx, bk_ctx, tl, (uint16_t *)hi_lo, tl->fmt);
free(hi_lo);
}
/**
* \brief
*/
static int reshape_valid_output(cvk_context_t *bk_ctx, const cvk_tl_t *ofmap, int org_oc,
int org_oh, int org_ow, cvk_tl_shape_t *tl_shape,
cvk_tl_stride_t *tl_load_stride, cvk_tg_shape_t *tg_shape,
cvk_tg_stride_t *tg_stride, cvk_fmt_t fmt) {
assert(fmt == CVK_FMT_BF16 || fmt == CVK_FMT_I8 || fmt == CVK_FMT_U8);
// skip redundant one
// store to sys and re-slice, maybe use next layer
// sys->local skip redundant one
tg_shape->n = tl_shape->n = 1;
tg_shape->c = tl_shape->c = org_oc;
tg_shape->h = tl_shape->h = org_oh;
tg_shape->w = tl_shape->w = org_ow;
cvk_tl_stride_t s = bk_ctx->ops->tl_default_stride(bk_ctx, *tl_shape, fmt, /*eu_align*/ 0);
tl_load_stride->n = s.n;
tl_load_stride->c = s.c;
tl_load_stride->h = s.h;
tl_load_stride->w = s.w;
int duplicat_c = ofmap->shape.c / org_oc;
tg_stride->n = tg_stride->c = duplicat_c * ofmap->shape.h * ofmap->shape.w * get_fsz(fmt);
tg_stride->h = org_ow * get_fsz(fmt);
return 0;
}
static bmerr_t init_ref(int ic, int ih, int iw, int kh, int kw, int pad_right, int pad_left,
int stride_h, int stride_w, cvk_fmt_t fmt, uint16_t *input,
uint16_t *weight, uint32_t *bias, uint16_t *output_ref) {
bmerr_t ret;
int in = 1;
int ins_h = 0;
int ins_w = 0;
int ins_last_h = 0;
int ins_last_w = 0;
int dilation_h = 1;
int dilation_w = 1;
int pad_top = 0;
int pad_bottom = 0;
int rshift_bits = 0;
if (fmt == CVK_FMT_BF16) {
ret = native_pooling_avg_bf16(input, weight, bias ? bias : NULL, output_ref, in, ic, ih, iw, kh,
kw, pad_top, pad_bottom, pad_left, pad_right, stride_h, stride_w,
ins_h, ins_w, ins_last_h, ins_last_w, dilation_h, dilation_w, 0);
} else {
int opd0_sign = fmt == CVK_FMT_I8;
int res0_sign = true; //(ofmap->fmt == CVK_FMT_I8);
ret = native_pooling_ave_int8((int8_t *)input, (int8_t *)weight, bias ? (int16_t *)bias : NULL,
(int8_t *)output_ref, in, ic, ih, iw, kh, kw, pad_top, pad_bottom,
pad_left, pad_right, stride_h, stride_w, ins_h, ins_w, ins_last_h,
ins_last_w, opd0_sign, res0_sign, rshift_bits, 0);
}
return ret;
}
static int test_depthwise(CVI_RT_HANDLE ctx, cvk_context_t *bk_ctx, int ic, int ih, int iw, int kh,
int kw, int pad_right, int pad_left, int stride_h, int stride_w,
bool has_bias, cvk_fmt_t ifmt) {
// print_pooling_param(param);
param_t param;
param_t *p = &param;
assert(ifmt == CVK_FMT_BF16 || ifmt == CVK_FMT_I8 || ifmt == CVK_FMT_U8);
int in = 1;
// TODO: verify dialate > 1
int dilation_h = 1;
int dilation_w = 1;
int relu_enable = 0;
int rshift_bits = 0;
// TODO: verity ins_x
int org_oh = pooling_oh(0, 0, 0, 0, stride_h, ih, kh, dilation_h);
int org_ow = pooling_ow(0, 0, pad_left, pad_right, stride_w, iw, kw, dilation_w);
int org_oc = ic;
int org_o_shape_size = in * org_oc * org_oh * org_ow;
uint16_t *output;
cvk_tdma_g2l_tensor_copy_param_t p1;
cvk_tdma_l2g_tensor_copy_param_t p2;
// weight / ofmap not support U8 format
cvk_fmt_t other_fmt = ifmt == CVK_FMT_BF16 ? CVK_FMT_BF16 : CVK_FMT_I8;
// alloc testbench, input/ref
uint16_t *input = alloc_input(ic, ih, iw, ifmt);
uint16_t *weight = alloc_weight(ic, kh, kw, ifmt);
uint32_t *bias = NULL;
if (has_bias) bias = alloc_bias(ic, ifmt);
uint16_t *output_ref = alloc_output(ic, org_oh, org_ow);
// init ref
init_ref(ic, ih, iw, kh, kw, pad_right, pad_left, stride_h, stride_w, ifmt, input, weight, bias,
output_ref);
// assert(ret == BM_SUCCESS);
// init param
// TODO: verify pad_top/pad_bottom
// TODO: verify ins_h_x
p->pad_left = pad_left;
p->pad_right = pad_right;
p->pad_top = 0;
p->pad_bottom = 0;
p->ins_h = 0;
p->ins_last_h = 0;
p->ins_w = 0;
p->ins_last_w = 0;
p->dilation_h = dilation_h;
p->dilation_w = dilation_w;
p->stride_h = stride_h;
p->stride_w = stride_w;
p->relu_enable = relu_enable;
p->rshift_bits = rshift_bits;
p->bias = NULL;
// prepard load / input / weight / bias / output new shape / stride
cvk_tl_shape_t tl_load_shape;
cvk_tl_stride_t tl_load_stride;
cvk_tg_shape_t tg_shape;
cvk_tg_stride_t tg_stride;
cvk_tl_shape_t tl_weight_shape;
cvk_tl_shape_t tl_bias_shape;
cvk_tl_shape_t tl_output_shape;
cvk_tl_t *tmp_tl_load;
cvk_tg_t *tmp_tg;
// get reshaped information
int r = cvm_reshape_channel_same(bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, stride_h,
stride_w, &tl_load_shape, &tl_load_stride, &tg_shape, &tg_stride,
&tl_weight_shape, &tl_bias_shape, &tl_output_shape, ifmt,
/*align*/ 1);
if (r == -1) {
printf("could not reshape it, 81\n");
free_depthwise_param(bk_ctx, p);
delete[] input;
free(weight);
free(bias);
return -1;
}
// prepare input tg
{
cvk_tg_shape_t put_tg_shape;
put_tg_shape.n = in;
put_tg_shape.c = ic;
put_tg_shape.h = ih;
put_tg_shape.w = iw;
cvk_tg_t *put_tg = alloc_tg_bf16_gmem(&ctx, bk_ctx, put_tg_shape, ifmt);
put_tg_bf16_gmem(&ctx, put_tg, (uint8_t *)input);
free_tg_gmem(&ctx, put_tg);
}
// prepare load input, put to tg and load back
{
tmp_tl_load = alloc_tl_bf16(bk_ctx, tl_load_shape, ifmt, /*eu_align*/ 0);
assert(tmp_tl_load);
tmp_tg = alloc_tg_bf16_gmem(&ctx, bk_ctx, tg_shape, ifmt);
tmp_tg->stride = tg_stride;
p1.src = tmp_tg;
p1.dst = tmp_tl_load;
bk_ctx->ops->tdma_g2l_bf16_tensor_copy(bk_ctx, &p1);
test_submit_comp(&ctx, bk_ctx);
free_tg_gmem(&ctx, tmp_tg);
// fit for hw
tmp_tl_load->stride =
bk_ctx->ops->tl_default_stride(bk_ctx, tmp_tl_load->shape, ifmt, /*align*/ 1);
p->ifmap = tmp_tl_load;
}
// prepare load bias, put to tg and load back
if (has_bias) {
// bias must i8
cvk_fmt_t bias_fmt = ifmt == CVK_FMT_BF16 ? CVK_FMT_BF16 : CVK_FMT_I8;
p->bias = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, tl_bias_shape, bias_fmt, 0);
// duplicate bias and replace old
uint32_t *new_bias = cvm_reshape_channel_bias((uint8_t *)bias, tl_bias_shape.n, tl_bias_shape.c,
tl_bias_shape.h, tl_bias_shape.w, org_oc, ifmt);
// free old one
free(bias);
bias = new_bias;
put_bias_tensor(&ctx, bk_ctx, p->bias, bias);
}
// prepare load weight, put to tg and load back
{
p->weight = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, tl_weight_shape, other_fmt, /*align*/ 1);
assert(p->weight);
// duplicate kernel with c
uint8_t *new_weight =
cvm_reshape_channel_weight((uint8_t *)weight, tl_weight_shape.n, tl_weight_shape.c,
tl_weight_shape.h, tl_weight_shape.w, org_oc, ifmt);
// free old one
free(weight);
weight = (uint16_t *)new_weight;
put_bf16_tensor_g2l(&ctx, bk_ctx, p->weight, (uint16_t *)weight, ifmt);
}
// prepard ofmap
{
// we allocate 'same' mode shape
p->ofmap = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, tl_output_shape, other_fmt, /*align*/ 1);
assert(p->ofmap);
}
// printf("p->ifmap at %p, c is %d\n", p->ifmap, tmp_tl_load->shape.c);
// emit
if (ifmt == CVK_FMT_BF16) {
bk_ctx->ops->tiu_pt_depthwise_convolution(bk_ctx, p);
} else {
bk_ctx->ops->tiu_pt_depthwise_convolution(bk_ctx, p);
}
// output = (uint16_t *)get_bf16_tensor_l2g(&ctx, bk_ctx, p->ofmap, ifmt);
// check with no pad if true
int is_valid_pack = false;
cvk_tl_shape_t r_ofmap_shape;
cvk_tl_stride_t r_ofmap_stride;
cvk_tg_shape_t r_tg_shape;
cvk_tg_stride_t r_tg_stride;
reshape_valid_output(bk_ctx, p->ofmap, org_oc, org_oh, org_ow, &r_ofmap_shape, &r_ofmap_stride,
&r_tg_shape, &r_tg_stride, ifmt);
p1.dst = p->ofmap;
if (is_valid_pack) {
cvk_tg_shape_t dst_shape;
dst_shape.n = p->ofmap->shape.n;
dst_shape.c = p->ofmap->shape.c;
dst_shape.h = p->ofmap->shape.h;
dst_shape.w = p->ofmap->shape.w;
cvk_tg_t *cvk_tg_tmp = alloc_tg_bf16_gmem(&ctx, bk_ctx, dst_shape, ifmt);
p2.src = p->ofmap;
p2.dst = cvk_tg_tmp;
// store for later reshape
bk_ctx->ops->tdma_l2g_bf16_tensor_copy(bk_ctx, &p2);
test_submit_comp(&ctx, bk_ctx);
// free useless for later reallocate
free_depthwise_param(bk_ctx, p);
p->ofmap = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, r_ofmap_shape, ifmt,
/*eu_align*/ 0);
assert(p->ofmap);
cvk_tg_tmp->shape = r_tg_shape;
cvk_tg_tmp->stride = r_tg_stride;
p1.src = cvk_tg_tmp;
p1.dst = p->ofmap;
bk_ctx->ops->tdma_g2l_bf16_tensor_copy(bk_ctx, &p1);
free_tg_gmem(&ctx, cvk_tg_tmp);
}
cvk_fmt_t ofmap_fmt = ifmt == CVK_FMT_BF16 ? CVK_FMT_BF16 : CVK_FMT_I8;
output = (uint16_t *)get_bf16_tensor_l2g(&ctx, bk_ctx, p1.dst, ofmap_fmt);
compare_results(p, input, weight, bias, output, output_ref, org_o_shape_size, is_valid_pack,
org_oc, org_oh, org_ow);
// free resource
if (is_valid_pack) {
free_tl(bk_ctx, p->ofmap);
} else {
free_depthwise_param(bk_ctx, p);
}
delete[] input;
free(weight);
free(bias);
free(output);
return 1;
}
static void init_input(param_t *p, int *ic, int *ih, int *iw, int *kh, int *kw, int *pad_right,
int *pad_left) {
*ic = p->ifmap->shape.c;
*ih = p->ifmap->shape.h;
*iw = p->ifmap->shape.w;
*kh = p->weight->shape.h;
*kw = p->weight->shape.w;
*pad_right = p->pad_right;
*pad_left = p->pad_left;
}
static int test_depthwise_pooling(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx) {
int loop = 1;
int test_finished_num = 0;
int ihs[] = {24, 96, 120, 480, 0};
int iws[] = {16, 17, 19, 23, 128, 256, 0};
int stride_hs[] = {3, 4, 0};
cvk_fmt_t formats[] = {CVK_FMT_I8, CVK_FMT_U8, CVK_FMT_BF16, CVK_FMT_F32};
int ic, ih, iw, kh, kw, pad_right, pad_left;
cvk_fmt_t ifmt;
param_t param;
assert(print_pooling_param);
ifmt = CVK_FMT_U8;
param = random_depthwise_param(bk_ctx, 210, 640, 1, ifmt);
init_input(&param, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
param.stride_h, param.stride_w, param.bias, ifmt);
print_pooling_param(&param);
free_depthwise_struct(&param);
#if 1
param = random_depthwise_param(bk_ctx, 36, 11, 3, ifmt);
init_input(&param, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
param.stride_h, param.stride_w, param.bias, ifmt);
print_pooling_param(&param);
free_depthwise_struct(&param);
ifmt = CVK_FMT_U8;
param = random_depthwise_param(bk_ctx, 24, 29, 3, ifmt);
init_input(&param, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
free_depthwise_struct(&param);
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
param.stride_h, param.stride_w, param.bias, ifmt);
ifmt = CVK_FMT_BF16;
param = random_depthwise_param(bk_ctx, 480, 53, 3, ifmt);
init_input(&param, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
free_depthwise_struct(&param);
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
param.stride_h, param.stride_w, param.bias, ifmt);
ifmt = CVK_FMT_I8;
param = random_depthwise_param(bk_ctx, 480, 61, 3, ifmt);
init_input(&param, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
free_depthwise_struct(&param);
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
param.stride_h, param.stride_w, param.bias, ifmt);
ifmt = CVK_FMT_U8;
param = random_depthwise_param(bk_ctx, 24, 17, 3, ifmt);
init_input(&param, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
free_depthwise_struct(&param);
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
param.stride_h, param.stride_w, param.bias, ifmt);
ifmt = CVK_FMT_BF16;
param = random_depthwise_param(bk_ctx, 48, 65, 3, ifmt);
init_input(&param, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
free_depthwise_struct(&param);
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
param.stride_h, param.stride_w, param.bias, ifmt);
ifmt = CVK_FMT_I8;
param = random_depthwise_param(bk_ctx, 48, 63, 3, ifmt);
init_input(&param, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
free_depthwise_struct(&param);
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
param.stride_h, param.stride_w, param.bias, ifmt);
#endif
for (int i = 0; i < loop; i++) {
for (int i = 0; ihs[i] != 0; i++) {
for (int j = 0; iws[j] != 0; j++) {
for (int k = 0; stride_hs[k] != 0; k++) {
for (int l = 0; formats[l] != 0; l++) {
continue;
if (ihs[i] >= 480 && formats[l] == CVK_FMT_BF16) {
continue;
}
param = random_depthwise_param(bk_ctx, ihs[i], iws[j], stride_hs[k], formats[l]);
ifmt = formats[l];
printf("test[%d] ih/iw/sh/fmt is {%d, %d, %d, %d}\n", test_finished_num, ihs[i], iws[j],
stride_hs[k], formats[l]);
init_input(&param, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
free_depthwise_struct(&param);
int r = test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
param.stride_h, param.stride_w, param.bias, ifmt);
test_finished_num += r;
}
}
}
}
}
printf("Test finished %u\n", test_finished_num);
return test_finished_num;
}
int main() {
CVI_RT_HANDLE ctx;
cvk_context_t *bk_ctx;
test_init(&ctx, &bk_ctx);
int round_mode;
round_mode = set_store_feround();
int ret = test_depthwise_pooling(&ctx, bk_ctx);
assert(ret >= 0);
(void)ret;
printf("pass\n");
test_exit(&ctx, bk_ctx);
restore_feround(round_mode);
return 0;
}