commit ce8705f49da5e5f59c2ddb3253ef88323a0cd9c4 Author: sophgo-forum-service <forum_service@sophgo.com> Date: Mon May 13 14:04:10 2024 +0800 [feat] cvimath opensource for cv18xx soc. - 9e8967
908 lines
29 KiB
C++
908 lines
29 KiB
C++
#include <cvimath_internal.h>
|
|
#include <test_cvikernel_util.h>
|
|
|
|
#include <test_native_ref.h> // calc_dilute_hw
|
|
|
|
#define NPU_NUM (1 << 5)
|
|
typedef cvk_tiu_depthwise_pt_convolution_param_t param_t;
|
|
|
|
int random_seed;
|
|
static void print_pooling_param(param_t *p) {
|
|
int in = p->ifmap->shape.n;
|
|
int ic = p->ifmap->shape.c;
|
|
int ih = p->ifmap->shape.h;
|
|
int iw = p->ifmap->shape.w;
|
|
int kh = p->weight->shape.h;
|
|
int kw = p->weight->shape.w;
|
|
|
|
printf(" Pooling parameters:\n");
|
|
// printf(" random_seed : %d \n", random_seed);
|
|
printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw);
|
|
printf(" opd0_sign = %d\n", p->ifmap->fmt == CVK_FMT_I8);
|
|
printf(" weight = (%d, %d)\n", kh, kw);
|
|
printf(" padding = (%d, %d, %d, %d)\n", p->pad_top, p->pad_bottom, p->pad_left, p->pad_right);
|
|
printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w);
|
|
// printf(" ins0 = (%d, %d, %d, %d)\n",
|
|
// p->ins_h, p->ins_last_h, p->ins_w, p->ins_last_w);
|
|
// printf(" dilation = (%d, %d)\n",p->dilation_h, p->dilation_w);
|
|
// printf(" rshift_bits = %d\n", p->rshift_bits);
|
|
// printf(" relu_enable = %d\n", p->relu_enable);
|
|
printf(" res0_sign = %d\n", p->ofmap->fmt == CVK_FMT_I8);
|
|
}
|
|
|
|
static uint16_t *alloc_input(int ic, int ih, int iw, cvk_fmt_t ifmt) {
|
|
uint64_t size = ic * ih * iw;
|
|
uint16_t *data = (uint16_t *)new uint16_t[(size)];
|
|
if (ifmt == CVK_FMT_BF16) {
|
|
for (uint64_t i = 0; i < size; i++) {
|
|
float val = 0;
|
|
int RAND_MAX2 = RAND_MAX / 2; // 5 ~ -5
|
|
val = (float)(rand() - RAND_MAX2) * 5 / (float)RAND_MAX;
|
|
val = i;
|
|
data[i] = convert_fp32_bf16(val);
|
|
}
|
|
} else {
|
|
uint8_t *d = (uint8_t *)data;
|
|
for (uint64_t i = 0; i < size; i++) {
|
|
d[i] = i % 10 * (i % 2 ? -1 : 1);
|
|
}
|
|
}
|
|
|
|
return data;
|
|
}
|
|
|
|
static uint16_t *alloc_weight(int ic, int kh, int kw, cvk_fmt_t fmt) {
|
|
int size = ic * kh * kw;
|
|
uint16_t *data = (uint16_t *)malloc(size * sizeof(uint16_t));
|
|
// printf("weight size is %d\n", size * 2);
|
|
if (fmt == CVK_FMT_BF16) {
|
|
for (int i = 0; i < size; i++) {
|
|
float val = 0;
|
|
int RAND_MAX2 = RAND_MAX / 2; // 5 ~ -5
|
|
val = (float)(rand() - RAND_MAX2) * 5 / (float)RAND_MAX;
|
|
val = i;
|
|
data[i] = convert_fp32_bf16(val);
|
|
}
|
|
} else {
|
|
uint8_t *d = (uint8_t *)data;
|
|
for (int i = 0; i < size; i++) {
|
|
d[i] = i % 5 * (i % 2 ? -1 : 1);
|
|
}
|
|
}
|
|
return data;
|
|
}
|
|
|
|
static uint32_t *alloc_bias(int ic, cvk_fmt_t fmt) {
|
|
int c = ic;
|
|
uint64_t size = c;
|
|
uint32_t *bias = (uint32_t *)malloc(sizeof(uint32_t) * c);
|
|
if (fmt == CVK_FMT_BF16) {
|
|
for (int i = 0; i < c; i++) {
|
|
float val = 0;
|
|
int RAND_MAX2 = RAND_MAX / 2; // 2 ~ -2
|
|
val = (float)(rand() - RAND_MAX2) * 2 / (float)RAND_MAX;
|
|
val = i;
|
|
bias[i] = convert_fp32_hex(val);
|
|
}
|
|
} else {
|
|
uint16_t *d = (uint16_t *)bias;
|
|
for (uint64_t i = 0; i < size; i++) {
|
|
d[i] = i % 0xf * (i % 2 ? -1 : 1);
|
|
}
|
|
}
|
|
return bias;
|
|
}
|
|
|
|
static uint16_t *alloc_output(int ic, int oh, int ow) {
|
|
uint64_t size = ic * oh * ow;
|
|
return (uint16_t *)new uint16_t[(size)];
|
|
}
|
|
|
|
static inline void cvm_relu(uint16_t *buf, uint64_t size, cvk_fmt_t fmt) {
|
|
if (fmt == CVK_FMT_BF16) {
|
|
for (uint64_t i = 0; i < size; i++)
|
|
if (convert_bf16_fp32(buf[i]) < 0) buf[i] = convert_fp32_bf16(0);
|
|
} else {
|
|
int8_t *buf_int8_t = (int8_t *)buf;
|
|
for (uint64_t i = 0; i < size; i++) {
|
|
if (buf_int8_t[i] < 0) buf_int8_t[i] = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
static int index_get(int h, int w1, int w2) { return h * w1 + w2; }
|
|
|
|
int native_pooling_avg_bf16(const uint16_t *i_fmap, const void *weight, const uint32_t *bias,
|
|
uint16_t *o_fmap, int input_n, int input_c, int input_h, int input_w,
|
|
int kh, int kw, int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r,
|
|
int stride_h, int stride_w, int ins_h, int ins_w, int ins_h_last,
|
|
int ins_w_last, int dh, int dw, int const_weight) {
|
|
if (kh * kw <= 0) return BM_ERR_INVALID_ARGUMENT;
|
|
|
|
uint16_t avg_const_weight = *(uint16_t *)weight;
|
|
uint16_t *weight_arr = (uint16_t *)weight;
|
|
int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b);
|
|
int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r);
|
|
int d_kh = calc_dilute_hw(kh, dh - 1, 0, 0, 0);
|
|
int d_kw = calc_dilute_hw(kw, dw - 1, 0, 0, 0);
|
|
|
|
int output_h = calc_output_hw(h_after, d_kh, stride_h);
|
|
int output_w = calc_output_hw(w_after, d_kw, stride_w);
|
|
// printf("output_h/output_w is %d/%d\n", output_h, output_w);
|
|
float *avg_pooling_mac_a = (float *)malloc(d_kh * d_kw * sizeof(float));
|
|
float *avg_pooling_mac_b = (float *)malloc(d_kh * d_kw * sizeof(float));
|
|
|
|
uint16_t *i_fmap_pad = NULL;
|
|
uint16_t *i_kmap_pad = NULL;
|
|
for (int n = 0; n < input_n; n++) {
|
|
if (const_weight == 0) weight_arr = (uint16_t *)weight;
|
|
|
|
for (int c = 0; c < input_c; ++c) {
|
|
fill_pad_fmap_bf16(i_fmap, &i_fmap_pad, 0, pad_w_l, pad_w_r, pad_h_t, pad_h_b, ins_h, ins_w,
|
|
ins_h_last, ins_w_last, input_h, input_w);
|
|
|
|
// kernel_dilation(
|
|
if (const_weight == 0)
|
|
fill_pad_fmap_bf16((weight_arr), &i_kmap_pad, 0, 0, 0, 0,
|
|
0, // no padding
|
|
dh - 1, dw - 1, 0, 0, kh, kw);
|
|
|
|
float avg_pool_result;
|
|
for (int ph = 0; ph < output_h; ++ph) {
|
|
for (int pw = 0; pw < output_w; ++pw) {
|
|
int hstart = ph * stride_h;
|
|
int wstart = pw * stride_w;
|
|
int pool_index = index_get(ph, output_w, pw);
|
|
int mac_index = 0;
|
|
|
|
float r = 0;
|
|
for (int h = 0; h < d_kh; h++) {
|
|
for (int w = 0; w < d_kw; w++) {
|
|
int index = index_get((hstart + h), w_after, (w + wstart));
|
|
mac_index = h * d_kw + w;
|
|
|
|
avg_pooling_mac_a[mac_index] = convert_bf16_fp32(i_fmap_pad[index]);
|
|
|
|
avg_pooling_mac_b[h * d_kw + w] = const_weight
|
|
? convert_bf16_fp32(avg_const_weight)
|
|
: convert_bf16_fp32(i_kmap_pad[mac_index]);
|
|
|
|
#if 0
|
|
printf ("ref[ni %u][ci %u][oh/ow %u/%u][kh/kw %u/%u] o[%d]"
|
|
" %.1f * %.1f + %.1f = %.1f\n",
|
|
n, c, ph, pw, h, w, pool_index,
|
|
avg_pooling_mac_a[mac_index], avg_pooling_mac_b[h*d_kw+w],
|
|
r, r + avg_pooling_mac_a[mac_index] * avg_pooling_mac_b[h*d_kw+w]);
|
|
#endif
|
|
|
|
r += avg_pooling_mac_a[mac_index] * avg_pooling_mac_b[h * d_kw + w];
|
|
}
|
|
}
|
|
|
|
inner_float_product(avg_pooling_mac_a, avg_pooling_mac_b, d_kh * d_kw, &avg_pool_result);
|
|
|
|
if (bias) {
|
|
avg_pool_result += convert_hex_fp32(bias[c]);
|
|
}
|
|
*(o_fmap + pool_index) = convert_fp32_bf16(avg_pool_result);
|
|
}
|
|
}
|
|
weight_arr += kh * kw;
|
|
i_fmap += input_w * input_h;
|
|
o_fmap += output_w * output_h;
|
|
}
|
|
}
|
|
free(i_fmap_pad);
|
|
free(i_kmap_pad);
|
|
free(avg_pooling_mac_a);
|
|
free(avg_pooling_mac_b);
|
|
|
|
return BM_SUCCESS;
|
|
}
|
|
|
|
static int get_fsz(cvk_fmt_t fmt) {
|
|
assert(fmt == CVK_FMT_BF16 || fmt == CVK_FMT_I8 || fmt == CVK_FMT_U8);
|
|
return fmt == CVK_FMT_BF16 ? 2 : 1;
|
|
}
|
|
|
|
static void compare_results(param_t *p, uint16_t input[], uint16_t weight[], uint32_t bias[],
|
|
uint16_t output[], uint16_t output_ref[], uint32_t org_o_shape_size,
|
|
int is_valid_pack, int org_oc, int org_oh, int org_ow) {
|
|
assert(input);
|
|
assert(weight);
|
|
(void)input;
|
|
(void)weight;
|
|
printf("bias at %p\n", bias);
|
|
int f_sz = get_fsz(p->ofmap->fmt);
|
|
|
|
if (p->relu_enable) {
|
|
cvm_relu(output_ref, org_o_shape_size, p->ofmap->fmt);
|
|
}
|
|
|
|
int cmp_res = -1;
|
|
if (!is_valid_pack) {
|
|
// we reshape c with SAME mode padding with garbage
|
|
// \is_valid_pack set to false means we skip garbage part
|
|
int org_hw = org_oh * org_ow;
|
|
int new_hw = p->ofmap->shape.h * p->ofmap->shape.w;
|
|
int duplicated_c = p->ofmap->shape.c / org_oc;
|
|
|
|
assert(new_hw >= org_hw / duplicated_c);
|
|
|
|
int8_t *output_c = ((int8_t *)output);
|
|
int8_t *output_ref_c = ((int8_t *)output_ref);
|
|
for (int c = 0; c < org_oc; c++) {
|
|
cmp_res =
|
|
array_cmp_int8("Comparing results ...\n", output_c + c * duplicated_c * new_hw * f_sz,
|
|
output_ref_c + org_hw * c * f_sz, org_hw * f_sz);
|
|
|
|
if (cmp_res != 0) {
|
|
break;
|
|
}
|
|
// printf("compare [%d] pass, org len is %u, new len is %u\n", c,
|
|
// org_hw, duplicated_c * new_hw);
|
|
}
|
|
} else {
|
|
cmp_res = array_cmp_int8("Comparing results ...\n", (int8_t *)output_ref, (int8_t *)output,
|
|
org_o_shape_size * f_sz);
|
|
}
|
|
if (cmp_res != 0) {
|
|
printf("Comparison FAILED!!!\n");
|
|
// print_pooling_param(p);
|
|
exit(-1);
|
|
}
|
|
|
|
delete[] output_ref;
|
|
}
|
|
|
|
static int pooling_ih_ext(int ins_h, int ins_last_h, int pad_top, int pad_bottom, int ih) {
|
|
int ins = ins_h;
|
|
int ins_last = ins_last_h;
|
|
int pad = pad_top + pad_bottom;
|
|
return (ih - 1) * (ins + 1) + ins_last + 1 + pad;
|
|
}
|
|
|
|
static int pooling_iw_ext(int ins_w, int ins_last_w, int pad_left, int pad_right, int iw) {
|
|
int ins = ins_w;
|
|
int ins_last = ins_last_w;
|
|
int pad = pad_left + pad_right;
|
|
return (iw - 1) * (ins + 1) + ins_last + 1 + pad;
|
|
}
|
|
|
|
static int pooling_oh(int ins_h, int ins_last_h, int pad_top, int pad_bottom, int stride_h, int ih,
|
|
int kh, int dh) {
|
|
int ih_ext = pooling_ih_ext(ins_h, ins_last_h, pad_top, pad_bottom, ih);
|
|
int d_h = (kh - 1) * dh + 1;
|
|
return (ih_ext - d_h) / stride_h + 1;
|
|
}
|
|
|
|
static int pooling_ow(int ins_w, int ins_last_w, int pad_left, int pad_right, int stride_w, int iw,
|
|
int kw, int dw) {
|
|
int iw_ext = pooling_iw_ext(ins_w, ins_last_w, pad_left, pad_right, iw);
|
|
int d_w = (kw - 1) * dw + 1;
|
|
return (iw_ext - d_w) / stride_w + 1;
|
|
}
|
|
|
|
static void free_depthwise_struct(param_t *p) {
|
|
free((void *)p->ofmap);
|
|
free((void *)p->ifmap);
|
|
free((void *)p->weight);
|
|
if (p->bias) {
|
|
free((void *)p->bias);
|
|
}
|
|
|
|
p->ofmap = NULL;
|
|
p->ifmap = NULL;
|
|
p->weight = NULL;
|
|
p->bias = NULL;
|
|
}
|
|
|
|
static void free_depthwise_param(cvk_context_t *ctx, param_t *p) {
|
|
if (p->ofmap) free_tl(ctx, p->ofmap);
|
|
|
|
if (p->weight) free_tl(ctx, p->weight);
|
|
|
|
if (p->bias) free_tl(ctx, p->bias);
|
|
|
|
if (p->ifmap) free_tl(ctx, p->ifmap);
|
|
}
|
|
|
|
static param_t random_depthwise_param(cvk_context_t *ctx, int _ih, int _iw, int _stride_h,
|
|
cvk_fmt_t _fmt) {
|
|
param_t p;
|
|
|
|
// retry:
|
|
random_seed = clock();
|
|
srand(random_seed);
|
|
int using_bias = rand() % 2;
|
|
int n = rand() % 5 + 1;
|
|
n = 1;
|
|
int c = rand() % (3 * NPU_NUM) + 1;
|
|
c = 3;
|
|
int ih = rand() % 30 + 3;
|
|
int iw = rand() % 30 + 6;
|
|
int kh = rand() % 7 + 1;
|
|
int kw = rand() % 7 + 1;
|
|
|
|
p.ins_h = rand() % kh;
|
|
p.ins_w = rand() % kw;
|
|
p.ins_last_h = rand() % kh;
|
|
p.ins_last_w = rand() % kw;
|
|
p.stride_h = rand() % kh + 1;
|
|
p.stride_w = rand() % kw + 1;
|
|
p.pad_top = rand() % kh;
|
|
p.pad_bottom = rand() % kh;
|
|
p.pad_left = rand() % kw;
|
|
p.pad_right = rand() % kw;
|
|
p.rshift_bits = rand() % 32;
|
|
p.dilation_h = rand() % 4 + 1;
|
|
p.dilation_w = rand() % 4 + 1;
|
|
|
|
// default
|
|
cvk_fmt_t ifmt = CVK_FMT_BF16;
|
|
cvk_fmt_t other_fmt = CVK_FMT_BF16;
|
|
ih = 24;
|
|
iw = 16;
|
|
kw = 5;
|
|
kh = 5;
|
|
p.stride_h = 1;
|
|
p.stride_w = 1;
|
|
|
|
p.rshift_bits = 0;
|
|
|
|
ih = _ih;
|
|
p.stride_h = _stride_h;
|
|
iw = _iw;
|
|
ifmt = _fmt;
|
|
other_fmt = CVK_FMT_I8;
|
|
if (ifmt != CVK_FMT_BF16) {
|
|
} else {
|
|
other_fmt = CVK_FMT_BF16;
|
|
}
|
|
|
|
p.pad_left = 2;
|
|
p.pad_right = 2;
|
|
p.pad_top = 0;
|
|
p.pad_bottom = 0;
|
|
// TODO: pad / ins / dilation
|
|
p.ins_h = 0;
|
|
p.ins_last_h = 0;
|
|
p.ins_w = 0;
|
|
p.ins_last_w = 0;
|
|
p.dilation_h = 1;
|
|
p.dilation_w = 1;
|
|
|
|
int oh =
|
|
pooling_oh(p.ins_h, p.ins_last_h, p.pad_top, p.pad_bottom, p.stride_h, ih, kh, p.dilation_h);
|
|
int ow =
|
|
pooling_ow(p.ins_w, p.ins_last_w, p.pad_left, p.pad_right, p.stride_w, iw, kw, p.dilation_w);
|
|
|
|
cvk_tl_shape_t ofmap_shape;
|
|
ofmap_shape.n = n;
|
|
ofmap_shape.c = c;
|
|
ofmap_shape.h = oh;
|
|
ofmap_shape.w = ow;
|
|
cvk_tl_shape_t ifmap_shape;
|
|
ifmap_shape.n = n;
|
|
ifmap_shape.c = c;
|
|
ifmap_shape.h = ih;
|
|
ifmap_shape.w = iw;
|
|
cvk_tl_shape_t weight_shape;
|
|
weight_shape.n = 1;
|
|
weight_shape.c = c;
|
|
weight_shape.h = kh;
|
|
weight_shape.w = kw;
|
|
cvk_tl_shape_t bias_shape;
|
|
bias_shape.n = 2;
|
|
bias_shape.c = c;
|
|
bias_shape.h = 1;
|
|
bias_shape.w = 1;
|
|
p.relu_enable = rand() % 2;
|
|
|
|
// fake init for ref
|
|
cvk_tl_t *bias, *weight, *ofmap, *ifmap;
|
|
ifmap = (cvk_tl_t *)malloc(sizeof(cvk_tl_t));
|
|
if (using_bias) {
|
|
bias = (cvk_tl_t *)malloc(sizeof(cvk_tl_t));
|
|
}
|
|
weight = (cvk_tl_t *)malloc(sizeof(cvk_tl_t));
|
|
ofmap = (cvk_tl_t *)malloc(sizeof(cvk_tl_t));
|
|
|
|
p.bias = NULL;
|
|
if (using_bias) {
|
|
bias->start_address = -1;
|
|
bias->fmt = other_fmt;
|
|
bias->shape = bias_shape;
|
|
bias->stride = ctx->ops->tl_default_stride(ctx, bias->shape, other_fmt, /*eu_align*/ 0);
|
|
p.bias = bias;
|
|
}
|
|
|
|
weight->start_address = -1;
|
|
weight->fmt = other_fmt;
|
|
weight->shape = weight_shape;
|
|
weight->stride = ctx->ops->tl_default_stride(ctx, weight->shape, other_fmt, /*align*/ 1);
|
|
p.weight = weight;
|
|
|
|
ofmap->start_address = -1;
|
|
ofmap->fmt = other_fmt;
|
|
ofmap->shape = ofmap_shape;
|
|
ofmap->stride = ctx->ops->tl_default_stride(ctx, ofmap->shape, other_fmt, /*align*/ 1);
|
|
p.ofmap = ofmap;
|
|
|
|
ifmap->start_address = -1;
|
|
ifmap->fmt = ifmt;
|
|
ifmap->shape = ifmap_shape;
|
|
ifmap->stride = ctx->ops->tl_default_stride(ctx, ifmap->shape, ifmt, /*align*/ 1);
|
|
p.ifmap = ifmap;
|
|
|
|
#if 0
|
|
int d_kh = calc_dilute_hw(kh, p.dilation_h - 1, 0, 0, 0);
|
|
int d_kw = calc_dilute_hw(kw, p.dilation_w - 1, 0, 0, 0);
|
|
if ((kh > pooling_ih_ext(&p, ih))
|
|
|| (kw > pooling_iw_ext(&p, iw))
|
|
|| (oh < d_kh)
|
|
|| (ow < d_kw)
|
|
|| (p.pad_top >= (1 << 4))
|
|
|| (p.pad_bottom >= (1 << 4))
|
|
|| (p.pad_left >= (1 << 4))
|
|
|| (p.pad_right >= (1 << 4))
|
|
|| !p.ofmap
|
|
|| !p.ifmap
|
|
|| !p.weight
|
|
|| (using_bias && !p.bias)
|
|
) {
|
|
LOG(INFO) << "retry init_pooling_param";
|
|
assert(0 && "it MUST valid param pass");
|
|
goto retry;
|
|
}
|
|
#endif
|
|
return p;
|
|
}
|
|
|
|
static void put_bias_tensor(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx, const cvk_tl_t *tl,
|
|
uint32_t data[]) {
|
|
int c = tl->shape.c;
|
|
|
|
uint16_t *hi_lo = (uint16_t *)malloc(sizeof(uint16_t) * 2 * c);
|
|
if (tl->fmt == CVK_FMT_BF16) {
|
|
for (int i = 0; i < c; i++) {
|
|
hi_lo[i] = (data[i] >> 16) & 0xffff;
|
|
hi_lo[i + c] = (data[i] & 0xffff);
|
|
}
|
|
} else {
|
|
uint8_t *hi_lo_uint8_t = (uint8_t *)hi_lo;
|
|
uint16_t *data_uint16_t = (uint16_t *)data;
|
|
for (int i = 0; i < c; i++) {
|
|
hi_lo_uint8_t[i] = data_uint16_t[i] & 0xff;
|
|
hi_lo_uint8_t[i + c] = (data_uint16_t[i] >> 8) & 0xff;
|
|
}
|
|
}
|
|
put_bf16_tensor_g2l(ctx, bk_ctx, tl, (uint16_t *)hi_lo, tl->fmt);
|
|
|
|
free(hi_lo);
|
|
}
|
|
|
|
/**
|
|
* \brief
|
|
*/
|
|
static int reshape_valid_output(cvk_context_t *bk_ctx, const cvk_tl_t *ofmap, int org_oc,
|
|
int org_oh, int org_ow, cvk_tl_shape_t *tl_shape,
|
|
cvk_tl_stride_t *tl_load_stride, cvk_tg_shape_t *tg_shape,
|
|
cvk_tg_stride_t *tg_stride, cvk_fmt_t fmt) {
|
|
assert(fmt == CVK_FMT_BF16 || fmt == CVK_FMT_I8 || fmt == CVK_FMT_U8);
|
|
|
|
// skip redundant one
|
|
// store to sys and re-slice, maybe use next layer
|
|
// sys->local skip redundant one
|
|
|
|
tg_shape->n = tl_shape->n = 1;
|
|
tg_shape->c = tl_shape->c = org_oc;
|
|
tg_shape->h = tl_shape->h = org_oh;
|
|
tg_shape->w = tl_shape->w = org_ow;
|
|
|
|
cvk_tl_stride_t s = bk_ctx->ops->tl_default_stride(bk_ctx, *tl_shape, fmt, /*eu_align*/ 0);
|
|
|
|
tl_load_stride->n = s.n;
|
|
tl_load_stride->c = s.c;
|
|
tl_load_stride->h = s.h;
|
|
tl_load_stride->w = s.w;
|
|
|
|
int duplicat_c = ofmap->shape.c / org_oc;
|
|
tg_stride->n = tg_stride->c = duplicat_c * ofmap->shape.h * ofmap->shape.w * get_fsz(fmt);
|
|
tg_stride->h = org_ow * get_fsz(fmt);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static bmerr_t init_ref(int ic, int ih, int iw, int kh, int kw, int pad_right, int pad_left,
|
|
int stride_h, int stride_w, cvk_fmt_t fmt, uint16_t *input,
|
|
uint16_t *weight, uint32_t *bias, uint16_t *output_ref) {
|
|
bmerr_t ret;
|
|
int in = 1;
|
|
int ins_h = 0;
|
|
int ins_w = 0;
|
|
int ins_last_h = 0;
|
|
int ins_last_w = 0;
|
|
int dilation_h = 1;
|
|
int dilation_w = 1;
|
|
int pad_top = 0;
|
|
int pad_bottom = 0;
|
|
int rshift_bits = 0;
|
|
|
|
if (fmt == CVK_FMT_BF16) {
|
|
ret = native_pooling_avg_bf16(input, weight, bias ? bias : NULL, output_ref, in, ic, ih, iw, kh,
|
|
kw, pad_top, pad_bottom, pad_left, pad_right, stride_h, stride_w,
|
|
ins_h, ins_w, ins_last_h, ins_last_w, dilation_h, dilation_w, 0);
|
|
} else {
|
|
int opd0_sign = fmt == CVK_FMT_I8;
|
|
int res0_sign = true; //(ofmap->fmt == CVK_FMT_I8);
|
|
ret = native_pooling_ave_int8((int8_t *)input, (int8_t *)weight, bias ? (int16_t *)bias : NULL,
|
|
(int8_t *)output_ref, in, ic, ih, iw, kh, kw, pad_top, pad_bottom,
|
|
pad_left, pad_right, stride_h, stride_w, ins_h, ins_w, ins_last_h,
|
|
ins_last_w, opd0_sign, res0_sign, rshift_bits, 0);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
static int test_depthwise(CVI_RT_HANDLE ctx, cvk_context_t *bk_ctx, int ic, int ih, int iw, int kh,
|
|
int kw, int pad_right, int pad_left, int stride_h, int stride_w,
|
|
bool has_bias, cvk_fmt_t ifmt) {
|
|
// print_pooling_param(param);
|
|
param_t param;
|
|
param_t *p = ¶m;
|
|
assert(ifmt == CVK_FMT_BF16 || ifmt == CVK_FMT_I8 || ifmt == CVK_FMT_U8);
|
|
|
|
int in = 1;
|
|
// TODO: verify dialate > 1
|
|
int dilation_h = 1;
|
|
int dilation_w = 1;
|
|
int relu_enable = 0;
|
|
int rshift_bits = 0;
|
|
|
|
// TODO: verity ins_x
|
|
int org_oh = pooling_oh(0, 0, 0, 0, stride_h, ih, kh, dilation_h);
|
|
int org_ow = pooling_ow(0, 0, pad_left, pad_right, stride_w, iw, kw, dilation_w);
|
|
int org_oc = ic;
|
|
int org_o_shape_size = in * org_oc * org_oh * org_ow;
|
|
uint16_t *output;
|
|
cvk_tdma_g2l_tensor_copy_param_t p1;
|
|
cvk_tdma_l2g_tensor_copy_param_t p2;
|
|
// weight / ofmap not support U8 format
|
|
cvk_fmt_t other_fmt = ifmt == CVK_FMT_BF16 ? CVK_FMT_BF16 : CVK_FMT_I8;
|
|
|
|
// alloc testbench, input/ref
|
|
uint16_t *input = alloc_input(ic, ih, iw, ifmt);
|
|
uint16_t *weight = alloc_weight(ic, kh, kw, ifmt);
|
|
uint32_t *bias = NULL;
|
|
if (has_bias) bias = alloc_bias(ic, ifmt);
|
|
|
|
uint16_t *output_ref = alloc_output(ic, org_oh, org_ow);
|
|
|
|
// init ref
|
|
init_ref(ic, ih, iw, kh, kw, pad_right, pad_left, stride_h, stride_w, ifmt, input, weight, bias,
|
|
output_ref);
|
|
// assert(ret == BM_SUCCESS);
|
|
|
|
// init param
|
|
// TODO: verify pad_top/pad_bottom
|
|
// TODO: verify ins_h_x
|
|
p->pad_left = pad_left;
|
|
p->pad_right = pad_right;
|
|
p->pad_top = 0;
|
|
p->pad_bottom = 0;
|
|
p->ins_h = 0;
|
|
p->ins_last_h = 0;
|
|
p->ins_w = 0;
|
|
p->ins_last_w = 0;
|
|
p->dilation_h = dilation_h;
|
|
p->dilation_w = dilation_w;
|
|
p->stride_h = stride_h;
|
|
p->stride_w = stride_w;
|
|
|
|
p->relu_enable = relu_enable;
|
|
p->rshift_bits = rshift_bits;
|
|
p->bias = NULL;
|
|
|
|
// prepard load / input / weight / bias / output new shape / stride
|
|
cvk_tl_shape_t tl_load_shape;
|
|
cvk_tl_stride_t tl_load_stride;
|
|
cvk_tg_shape_t tg_shape;
|
|
cvk_tg_stride_t tg_stride;
|
|
cvk_tl_shape_t tl_weight_shape;
|
|
cvk_tl_shape_t tl_bias_shape;
|
|
cvk_tl_shape_t tl_output_shape;
|
|
cvk_tl_t *tmp_tl_load;
|
|
cvk_tg_t *tmp_tg;
|
|
|
|
// get reshaped information
|
|
int r = cvm_reshape_channel_same(bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, stride_h,
|
|
stride_w, &tl_load_shape, &tl_load_stride, &tg_shape, &tg_stride,
|
|
&tl_weight_shape, &tl_bias_shape, &tl_output_shape, ifmt,
|
|
/*align*/ 1);
|
|
|
|
if (r == -1) {
|
|
printf("could not reshape it, 81\n");
|
|
free_depthwise_param(bk_ctx, p);
|
|
|
|
delete[] input;
|
|
free(weight);
|
|
free(bias);
|
|
return -1;
|
|
}
|
|
|
|
// prepare input tg
|
|
{
|
|
cvk_tg_shape_t put_tg_shape;
|
|
|
|
put_tg_shape.n = in;
|
|
put_tg_shape.c = ic;
|
|
put_tg_shape.h = ih;
|
|
put_tg_shape.w = iw;
|
|
cvk_tg_t *put_tg = alloc_tg_bf16_gmem(&ctx, bk_ctx, put_tg_shape, ifmt);
|
|
put_tg_bf16_gmem(&ctx, put_tg, (uint8_t *)input);
|
|
free_tg_gmem(&ctx, put_tg);
|
|
}
|
|
|
|
// prepare load input, put to tg and load back
|
|
{
|
|
tmp_tl_load = alloc_tl_bf16(bk_ctx, tl_load_shape, ifmt, /*eu_align*/ 0);
|
|
assert(tmp_tl_load);
|
|
|
|
tmp_tg = alloc_tg_bf16_gmem(&ctx, bk_ctx, tg_shape, ifmt);
|
|
tmp_tg->stride = tg_stride;
|
|
|
|
p1.src = tmp_tg;
|
|
p1.dst = tmp_tl_load;
|
|
|
|
bk_ctx->ops->tdma_g2l_bf16_tensor_copy(bk_ctx, &p1);
|
|
test_submit_comp(&ctx, bk_ctx);
|
|
free_tg_gmem(&ctx, tmp_tg);
|
|
|
|
// fit for hw
|
|
tmp_tl_load->stride =
|
|
bk_ctx->ops->tl_default_stride(bk_ctx, tmp_tl_load->shape, ifmt, /*align*/ 1);
|
|
p->ifmap = tmp_tl_load;
|
|
}
|
|
|
|
// prepare load bias, put to tg and load back
|
|
if (has_bias) {
|
|
// bias must i8
|
|
cvk_fmt_t bias_fmt = ifmt == CVK_FMT_BF16 ? CVK_FMT_BF16 : CVK_FMT_I8;
|
|
p->bias = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, tl_bias_shape, bias_fmt, 0);
|
|
|
|
// duplicate bias and replace old
|
|
uint32_t *new_bias = cvm_reshape_channel_bias((uint8_t *)bias, tl_bias_shape.n, tl_bias_shape.c,
|
|
tl_bias_shape.h, tl_bias_shape.w, org_oc, ifmt);
|
|
|
|
// free old one
|
|
free(bias);
|
|
bias = new_bias;
|
|
put_bias_tensor(&ctx, bk_ctx, p->bias, bias);
|
|
}
|
|
|
|
// prepare load weight, put to tg and load back
|
|
{
|
|
p->weight = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, tl_weight_shape, other_fmt, /*align*/ 1);
|
|
assert(p->weight);
|
|
|
|
// duplicate kernel with c
|
|
uint8_t *new_weight =
|
|
cvm_reshape_channel_weight((uint8_t *)weight, tl_weight_shape.n, tl_weight_shape.c,
|
|
tl_weight_shape.h, tl_weight_shape.w, org_oc, ifmt);
|
|
|
|
// free old one
|
|
free(weight);
|
|
weight = (uint16_t *)new_weight;
|
|
put_bf16_tensor_g2l(&ctx, bk_ctx, p->weight, (uint16_t *)weight, ifmt);
|
|
}
|
|
|
|
// prepard ofmap
|
|
{
|
|
// we allocate 'same' mode shape
|
|
p->ofmap = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, tl_output_shape, other_fmt, /*align*/ 1);
|
|
assert(p->ofmap);
|
|
}
|
|
|
|
// printf("p->ifmap at %p, c is %d\n", p->ifmap, tmp_tl_load->shape.c);
|
|
|
|
// emit
|
|
if (ifmt == CVK_FMT_BF16) {
|
|
bk_ctx->ops->tiu_pt_depthwise_convolution(bk_ctx, p);
|
|
} else {
|
|
bk_ctx->ops->tiu_pt_depthwise_convolution(bk_ctx, p);
|
|
}
|
|
|
|
// output = (uint16_t *)get_bf16_tensor_l2g(&ctx, bk_ctx, p->ofmap, ifmt);
|
|
|
|
// check with no pad if true
|
|
int is_valid_pack = false;
|
|
cvk_tl_shape_t r_ofmap_shape;
|
|
cvk_tl_stride_t r_ofmap_stride;
|
|
cvk_tg_shape_t r_tg_shape;
|
|
cvk_tg_stride_t r_tg_stride;
|
|
|
|
reshape_valid_output(bk_ctx, p->ofmap, org_oc, org_oh, org_ow, &r_ofmap_shape, &r_ofmap_stride,
|
|
&r_tg_shape, &r_tg_stride, ifmt);
|
|
|
|
p1.dst = p->ofmap;
|
|
|
|
if (is_valid_pack) {
|
|
cvk_tg_shape_t dst_shape;
|
|
dst_shape.n = p->ofmap->shape.n;
|
|
dst_shape.c = p->ofmap->shape.c;
|
|
dst_shape.h = p->ofmap->shape.h;
|
|
dst_shape.w = p->ofmap->shape.w;
|
|
cvk_tg_t *cvk_tg_tmp = alloc_tg_bf16_gmem(&ctx, bk_ctx, dst_shape, ifmt);
|
|
|
|
p2.src = p->ofmap;
|
|
p2.dst = cvk_tg_tmp;
|
|
|
|
// store for later reshape
|
|
bk_ctx->ops->tdma_l2g_bf16_tensor_copy(bk_ctx, &p2);
|
|
test_submit_comp(&ctx, bk_ctx);
|
|
|
|
// free useless for later reallocate
|
|
free_depthwise_param(bk_ctx, p);
|
|
|
|
p->ofmap = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, r_ofmap_shape, ifmt,
|
|
/*eu_align*/ 0);
|
|
assert(p->ofmap);
|
|
|
|
cvk_tg_tmp->shape = r_tg_shape;
|
|
cvk_tg_tmp->stride = r_tg_stride;
|
|
|
|
p1.src = cvk_tg_tmp;
|
|
p1.dst = p->ofmap;
|
|
bk_ctx->ops->tdma_g2l_bf16_tensor_copy(bk_ctx, &p1);
|
|
free_tg_gmem(&ctx, cvk_tg_tmp);
|
|
}
|
|
|
|
cvk_fmt_t ofmap_fmt = ifmt == CVK_FMT_BF16 ? CVK_FMT_BF16 : CVK_FMT_I8;
|
|
output = (uint16_t *)get_bf16_tensor_l2g(&ctx, bk_ctx, p1.dst, ofmap_fmt);
|
|
compare_results(p, input, weight, bias, output, output_ref, org_o_shape_size, is_valid_pack,
|
|
org_oc, org_oh, org_ow);
|
|
|
|
// free resource
|
|
if (is_valid_pack) {
|
|
free_tl(bk_ctx, p->ofmap);
|
|
} else {
|
|
free_depthwise_param(bk_ctx, p);
|
|
}
|
|
|
|
delete[] input;
|
|
free(weight);
|
|
free(bias);
|
|
free(output);
|
|
|
|
return 1;
|
|
}
|
|
|
|
static void init_input(param_t *p, int *ic, int *ih, int *iw, int *kh, int *kw, int *pad_right,
|
|
int *pad_left) {
|
|
*ic = p->ifmap->shape.c;
|
|
*ih = p->ifmap->shape.h;
|
|
*iw = p->ifmap->shape.w;
|
|
*kh = p->weight->shape.h;
|
|
*kw = p->weight->shape.w;
|
|
*pad_right = p->pad_right;
|
|
*pad_left = p->pad_left;
|
|
}
|
|
|
|
static int test_depthwise_pooling(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx) {
|
|
int loop = 1;
|
|
int test_finished_num = 0;
|
|
int ihs[] = {24, 96, 120, 480, 0};
|
|
int iws[] = {16, 17, 19, 23, 128, 256, 0};
|
|
int stride_hs[] = {3, 4, 0};
|
|
cvk_fmt_t formats[] = {CVK_FMT_I8, CVK_FMT_U8, CVK_FMT_BF16, CVK_FMT_F32};
|
|
int ic, ih, iw, kh, kw, pad_right, pad_left;
|
|
cvk_fmt_t ifmt;
|
|
param_t param;
|
|
assert(print_pooling_param);
|
|
|
|
ifmt = CVK_FMT_U8;
|
|
param = random_depthwise_param(bk_ctx, 210, 640, 1, ifmt);
|
|
init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
|
|
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
|
|
param.stride_h, param.stride_w, param.bias, ifmt);
|
|
print_pooling_param(¶m);
|
|
free_depthwise_struct(¶m);
|
|
|
|
#if 1
|
|
param = random_depthwise_param(bk_ctx, 36, 11, 3, ifmt);
|
|
init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
|
|
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
|
|
param.stride_h, param.stride_w, param.bias, ifmt);
|
|
print_pooling_param(¶m);
|
|
free_depthwise_struct(¶m);
|
|
|
|
ifmt = CVK_FMT_U8;
|
|
param = random_depthwise_param(bk_ctx, 24, 29, 3, ifmt);
|
|
init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
|
|
free_depthwise_struct(¶m);
|
|
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
|
|
param.stride_h, param.stride_w, param.bias, ifmt);
|
|
|
|
ifmt = CVK_FMT_BF16;
|
|
param = random_depthwise_param(bk_ctx, 480, 53, 3, ifmt);
|
|
init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
|
|
free_depthwise_struct(¶m);
|
|
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
|
|
param.stride_h, param.stride_w, param.bias, ifmt);
|
|
|
|
ifmt = CVK_FMT_I8;
|
|
param = random_depthwise_param(bk_ctx, 480, 61, 3, ifmt);
|
|
init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
|
|
free_depthwise_struct(¶m);
|
|
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
|
|
param.stride_h, param.stride_w, param.bias, ifmt);
|
|
|
|
ifmt = CVK_FMT_U8;
|
|
param = random_depthwise_param(bk_ctx, 24, 17, 3, ifmt);
|
|
init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
|
|
free_depthwise_struct(¶m);
|
|
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
|
|
param.stride_h, param.stride_w, param.bias, ifmt);
|
|
|
|
ifmt = CVK_FMT_BF16;
|
|
param = random_depthwise_param(bk_ctx, 48, 65, 3, ifmt);
|
|
init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
|
|
free_depthwise_struct(¶m);
|
|
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
|
|
param.stride_h, param.stride_w, param.bias, ifmt);
|
|
|
|
ifmt = CVK_FMT_I8;
|
|
param = random_depthwise_param(bk_ctx, 48, 63, 3, ifmt);
|
|
init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
|
|
free_depthwise_struct(¶m);
|
|
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
|
|
param.stride_h, param.stride_w, param.bias, ifmt);
|
|
#endif
|
|
|
|
for (int i = 0; i < loop; i++) {
|
|
for (int i = 0; ihs[i] != 0; i++) {
|
|
for (int j = 0; iws[j] != 0; j++) {
|
|
for (int k = 0; stride_hs[k] != 0; k++) {
|
|
for (int l = 0; formats[l] != 0; l++) {
|
|
continue;
|
|
if (ihs[i] >= 480 && formats[l] == CVK_FMT_BF16) {
|
|
continue;
|
|
}
|
|
param = random_depthwise_param(bk_ctx, ihs[i], iws[j], stride_hs[k], formats[l]);
|
|
ifmt = formats[l];
|
|
printf("test[%d] ih/iw/sh/fmt is {%d, %d, %d, %d}\n", test_finished_num, ihs[i], iws[j],
|
|
stride_hs[k], formats[l]);
|
|
|
|
init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
|
|
free_depthwise_struct(¶m);
|
|
int r = test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
|
|
param.stride_h, param.stride_w, param.bias, ifmt);
|
|
test_finished_num += r;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
printf("Test finished %u\n", test_finished_num);
|
|
|
|
return test_finished_num;
|
|
}
|
|
|
|
int main() {
|
|
CVI_RT_HANDLE ctx;
|
|
cvk_context_t *bk_ctx;
|
|
|
|
test_init(&ctx, &bk_ctx);
|
|
|
|
int round_mode;
|
|
round_mode = set_store_feround();
|
|
int ret = test_depthwise_pooling(&ctx, bk_ctx);
|
|
assert(ret >= 0);
|
|
(void)ret;
|
|
printf("pass\n");
|
|
|
|
test_exit(&ctx, bk_ctx);
|
|
restore_feround(round_mode);
|
|
return 0;
|
|
}
|