commit ce8705f49da5e5f59c2ddb3253ef88323a0cd9c4
Author: sophgo-forum-service <forum_service@sophgo.com>
Date:   Mon May 13 14:04:10 2024 +0800

    [feat] cvimath opensource for cv18xx soc. - 9e8967
// \file Sample for setting image values by mask; please refer to cvimath_internal.h for more details

// header includes
#include <assert.h>
#include <cmath>    // std::ceil
#include <stdio.h>  // printf / fprintf
#include <stdlib.h> // abs, exit
#include <string.h> // memcpy, strcmp

#include <cvimath_internal.h>    // math
#include <test_cvikernel_util.h> // kernel

#include <sys/time.h> // gettimeofday
#include <time.h>     /* clock_t, clock, CLOCKS_PER_SEC */

#define DEBUG 1 // set to 0 to disable debug output
#define debug_print(fmt, ...)                     \
  do {                                            \
    if (DEBUG) fprintf(stderr, fmt, __VA_ARGS__); \
  } while (0)

int flip = 0;
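
// A testbench pairs a TPU kernel entry point (cvm_run) with its CPU reference
// implementation (ref), plus the threshold/w1/w2 parameters the case is driven with;
// the TPU output is later compared byte by byte against the reference output.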
struct testbench {
  char *name;
  int (*cvm_run)(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2, cvk_tl_t *tl_buf,
                 cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl, cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias,
                 uint8_t threshold, uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap);
  void (*ref)(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask, uint8_t *pNewY, uint8_t *pY,
              uint8_t *g_update_tbl, uint8_t threshold, uint8_t w1, uint8_t w2);
  uint8_t threshold;
  uint8_t w1;
  uint8_t w2;
};

static void init_kernel(uint8_t *kernel_data, uint64_t kernel_size, int8_t val) {
  int8_t *kernel_data_i8 = (int8_t *)kernel_data;
  for (uint64_t i = 0; i < kernel_size; i++) {
    kernel_data_i8[i] = val;
  }
}
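
// The 16-bit per-channel bias below is stored as two byte planes: the low bytes of all
// channels first, followed by the high bytes (matching bias_shape.n = 2 used later).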

static void init_bias(uint8_t *bias_data, uint64_t bias_size, int16_t val) {
  int c = bias_size / 2;

  for (int i = 0; i < c; i++) {
    bias_data[i] = val & 0xff;
    bias_data[i + c] = (val >> 8) & 0xff;
  }
}

static void init_input_2(uint8_t *input_data, uint64_t ifmap_size) {
  for (uint64_t i = 0; i < ifmap_size; i++) {
    input_data[i] = i * 2 * (i % 3 ? -1 : 1);
  }
}

static void init_input_3(uint8_t *input_data, uint64_t ifmap_size) {
  for (uint64_t i = 0; i < ifmap_size; i++) {
    input_data[i] = i * 3;
  }
}

static void init_mask(uint8_t *mask, uint64_t ifmap_size) {
  for (uint64_t i = 0; i < ifmap_size; i++) {
    mask[i] = i % 2;
  }
}

static void init_update_tbl(uint8_t *update_tbl, uint64_t ifmap_size) {
  int8_t *update_tbl_i8 = (int8_t *)update_tbl;
  for (uint64_t i = 0; i < ifmap_size; i++) {
    update_tbl_i8[i] = i * (i % 2 ? -1 : 1);
  }
}

static void init_ref(uint8_t *ref_data, uint64_t ofmap_size) {
  for (uint64_t i = 0; i < ofmap_size; i++) {
    ref_data[i] = -1 * i;
    // ref_data[i] = 3 * i;
  }
}
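
// CPU reference implementations. In each case ref_data holds the current image Y (updated
// in place), pNewY/pY are the two input frames (input_data2/input_data3 in the driver),
// and g_update_tbl is the per-pixel update table.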

static void set_image_by_u8mask(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask,
                                uint8_t *pNewY, uint8_t *pY, uint8_t *g_update_tbl,
                                uint8_t threshold, uint8_t w1, uint8_t w2) {
  (void)pY;
  (void)g_update_tbl;
  (void)threshold;
  (void)w1;
  (void)w2;

  for (size_t i = 0; i < ifmap_size; i++) {
    if (mask[i]) {
      ref_data[i] = pNewY[i];
    }
  }
}

static void set_image_by_two_info_i8(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask,
                                     uint8_t *pNewY, uint8_t *pY, uint8_t *g_update_tbl,
                                     uint8_t threshold, uint8_t w1, uint8_t w2) {
  (void)pY;
  (void)w1;
  (void)w2;
  int8_t *g_update_tbl_i8 = (int8_t *)g_update_tbl;

  for (size_t i = 0; i < ifmap_size; i++) {
    if (mask[i] && (g_update_tbl_i8[i] < threshold)) {
      ref_data[i] = pNewY[i];
    }
  }
}

static void gen_image_diff(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask, uint8_t *pNewY,
                           uint8_t *pY, uint8_t *g_update_tbl, uint8_t threshold, uint8_t w1,
                           uint8_t w2) {
  (void)mask;
  (void)w1;
  (void)w2;
  (void)g_update_tbl;
  (void)threshold;

  for (size_t i = 0; i < ifmap_size; i++) {
    ref_data[i] = abs(pNewY[i] - pY[i]);
  }
}

static void update_tbl_by_threshold(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask,
                                    uint8_t *pNewY, uint8_t *pY, uint8_t *g_update_tbl,
                                    uint8_t threshold, uint8_t w1, uint8_t w2) {
  (void)pNewY;
  (void)pY;
  (void)g_update_tbl;
  (void)mask;
  (void)w2;
  int8_t *ref_data_i8 = (int8_t *)ref_data; // output is i8

  for (size_t i = 0; i < ifmap_size; i++) {
    mask[i] = 0;
  }

  for (size_t i = 0; i < ifmap_size; i++) {
    int8_t old = ref_data_i8[i];
    if (g_update_tbl[i] < threshold) {
      ref_data_i8[i] = (ref_data_i8[i] < w1) ? 0 : (ref_data_i8[i] - 1);
    } else {
      if (old != 127) {
        // saturate it
        ref_data_i8[i]++;
      }
      mask[i] = 1;
    }
  }
}

static void set_image_by_two_info_u8(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask,
                                     uint8_t *pNewY, uint8_t *pY, uint8_t *g_update_tbl,
                                     uint8_t threshold, uint8_t w1, uint8_t w2) {
  (void)pY;
  (void)mask;
  (void)w1;
  (void)w2;
  // int8_t* g_update_tbl_i8 = (int8_t*)g_update_tbl;

  for (size_t i = 0; i < ifmap_size; i++) {
    if (g_update_tbl[i] >= threshold) {
      ref_data[i] = pNewY[i];
    }
  }
}
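
// Q8 fixed-point blend: where the update table exceeds the threshold, the output becomes
// (w1 * old + w2 * new) >> 8, i.e. w1 and w2 are weights scaled by 256.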

static void blend_image_by_tbl(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask,
                               uint8_t *pNewY, uint8_t *pY, uint8_t *g_update_tbl,
                               uint8_t threshold, uint8_t w1, uint8_t w2) {
  (void)mask;
  (void)pY;
  int8_t *g_update_tbl_i8 = (int8_t *)g_update_tbl;
  for (size_t i = 0; i < ifmap_size; i++) {
    if (g_update_tbl_i8[i] > threshold) {
      ref_data[i] = (w1 * ref_data[i] + w2 * pNewY[i]) >> 8;
    }
  }
}
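
// Thin adapters: each _cvm_* wrapper maps the common testbench cvm_run signature onto the
// corresponding cvimath kernel call, discarding the arguments that particular kernel ignores.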

static int _cvm_set_image_by_u8mask(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2,
                                    cvk_tl_t *tl_buf, cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl,
                                    cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, uint8_t threshold,
                                    uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap) {
  (void)tl_ifmap2;
  (void)tl_update_tbl;
  (void)threshold;
  (void)w1;
  (void)w2;
  (void)tl_kernel;
  (void)tl_bias;
  (void)tl_buf;

  return cvm_set_image_by_u8mask(ctx, tl_ifmap, tl_buf, tl_mask, tl_ofmap);
}

static int _cvm_set_image_by_u8mask_dp(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2,
                                       cvk_tl_t *tl_buf, cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl,
                                       cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, uint8_t threshold,
                                       uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap) {
  (void)tl_ifmap2;
  (void)tl_update_tbl;
  (void)threshold;
  (void)w1;
  (void)w2;
  (void)tl_kernel;
  (void)tl_bias;
  (void)tl_buf;

  return cvm_set_image_by_u8mask_dp(ctx, tl_ifmap, tl_mask, tl_kernel, tl_bias, tl_ofmap);
}

static int _cvm_set_image_by_two_info_i8(cvk_context_t *ctx, cvk_tl_t *tl_ifmap,
                                         cvk_tl_t *tl_ifmap2, cvk_tl_t *tl_buf, cvk_tl_t *tl_mask,
                                         cvk_tl_t *tl_update_tbl, cvk_tl_t *tl_kernel,
                                         cvk_tl_t *tl_bias, uint8_t threshold, uint8_t w1,
                                         uint8_t w2, cvk_tl_t *tl_ofmap) {
  (void)tl_ifmap2;
  (void)threshold;
  (void)w1;
  (void)w2;
  (void)tl_kernel;
  (void)tl_bias;

  // tl_ifmap2 as buf
  return cvm_set_image_by_two_info_i8(ctx, tl_ifmap, tl_buf, tl_mask, tl_update_tbl, threshold,
                                      tl_ofmap);
}

static int _cvm_set_image_by_two_info_i8_dp(cvk_context_t *ctx, cvk_tl_t *tl_ifmap,
                                            cvk_tl_t *tl_ifmap2, cvk_tl_t *tl_buf,
                                            cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl,
                                            cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias,
                                            uint8_t threshold, uint8_t w1, uint8_t w2,
                                            cvk_tl_t *tl_ofmap) {
  (void)tl_ifmap2;
  (void)threshold;
  (void)w1;
  (void)w2;
  (void)tl_kernel;
  (void)tl_buf;

  return cvm_set_image_by_two_info_i8_dp(ctx, tl_ifmap, tl_kernel, tl_mask, tl_update_tbl, tl_bias,
                                         tl_ofmap);
}

static int _cvm_gen_image_diff(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2,
                               cvk_tl_t *tl_buf, cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl,
                               cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, uint8_t threshold,
                               uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap) {
  (void)tl_mask;
  (void)tl_buf;
  (void)tl_update_tbl;
  (void)threshold;
  (void)w1;
  (void)w2;
  (void)tl_kernel;
  (void)tl_bias;

  // tl_mask as buffer
  return cvm_gen_image_diff(ctx, tl_ifmap, tl_ifmap2, tl_mask, tl_buf, tl_ofmap);
}

static int _cvm_update_tbl_by_threshold(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2,
                                        cvk_tl_t *tl_buf, cvk_tl_t *tl_mask,
                                        cvk_tl_t *tl_update_tbl, cvk_tl_t *tl_kernel,
                                        cvk_tl_t *tl_bias, uint8_t threshold, uint8_t w1,
                                        uint8_t w2, cvk_tl_t *tl_ofmap) {
  (void)w2;
  (void)tl_kernel;
  (void)tl_bias;

  // w1 as threshold_b, tl_ifmap/tl_ifmap2 as buf
  return cvm_update_tbl_by_threshold(ctx, tl_mask, tl_ifmap, tl_ifmap2, tl_buf, tl_update_tbl,
                                     threshold, w1, tl_ofmap);
}

static int _cvm_set_image_by_two_info_u8(cvk_context_t *ctx, cvk_tl_t *tl_ifmap,
                                         cvk_tl_t *tl_ifmap2, cvk_tl_t *tl_buf, cvk_tl_t *tl_mask,
                                         cvk_tl_t *tl_update_tbl, cvk_tl_t *tl_kernel,
                                         cvk_tl_t *tl_bias, uint8_t threshold, uint8_t w1,
                                         uint8_t w2, cvk_tl_t *tl_ofmap) {
  (void)tl_ifmap2;
  (void)tl_mask;
  (void)w1;
  (void)w2;
  (void)tl_kernel;
  (void)tl_bias;

  // tl_ifmap2 as buf
  return cvm_set_image_by_two_info_u8(ctx, tl_ifmap, tl_ifmap2, tl_buf, tl_update_tbl, threshold,
                                      tl_ofmap);
}

static int _cvm_blend_image_by_tbl(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2,
                                   cvk_tl_t *tl_buf, cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl,
                                   cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, uint8_t threshold,
                                   uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap) {
  (void)tl_ifmap2;
  (void)tl_kernel;
  (void)tl_bias;
  // tl_mask as buf
  return cvm_blend_image_by_tbl(ctx, tl_ifmap, tl_mask, tl_buf, tl_update_tbl, threshold, w1, w2,
                                tl_ofmap);
}
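
// Test cases: {name, TPU entry point, CPU reference, threshold, w1, w2}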

struct testbench testbenchs[] = {
    {(char *)"set_image_by_two_info_i8_dp", _cvm_set_image_by_two_info_i8_dp,
     set_image_by_two_info_i8, 2, 2, 3},
    {(char *)"set_image_by_u8mask_dp", _cvm_set_image_by_u8mask_dp, set_image_by_u8mask, 10, 2, 3},

    {(char *)"set_image_by_u8mask", _cvm_set_image_by_u8mask, set_image_by_u8mask, 10, 2, 3},
    {(char *)"set_image_by_two_info_i8", _cvm_set_image_by_two_info_i8, set_image_by_two_info_i8, 2,
     2, 3},
    {(char *)"update_tbl_by_threshold", _cvm_update_tbl_by_threshold, update_tbl_by_threshold, 15,
     12, 3},
    {(char *)"gen_image_diff", _cvm_gen_image_diff, gen_image_diff, 10, 2, 3},
    {(char *)"set_image_by_two_info_u8", _cvm_set_image_by_two_info_u8, set_image_by_two_info_u8,
     40, 2, 3},
    {(char *)"blend_image_by_tbl", _cvm_blend_image_by_tbl, blend_image_by_tbl, 6, 2, 3},
};

static void load(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap2,
                 uint8_t *input_ifmap2, cvk_tl_t *tl_ifmap3, uint8_t *input_ifmap3,
                 cvk_tl_t *tl_ofmap, uint8_t *input_ofmap, cvk_tl_t *tl_mask, uint8_t *input_mask,
                 cvk_tl_t *tl_update_tbl, uint8_t *input_update_tbl) {
  // copy from device memory (DDR) into local SRAM
  test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_ifmap2, input_ifmap2);
  test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_ifmap3, input_ifmap3);
  test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_mask, input_mask);
  test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_update_tbl, input_update_tbl);
  test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_ofmap, input_ofmap);
}
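
// store() reads the result back from local memory; for "update_tbl_by_threshold" the mask
// tensor is exported as well, since that kernel produces it as a second output.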

static void store(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, char *name, cvk_tl_t *tl_ofmap,
                  uint8_t *output_ofmap, cvk_tl_t *tl_mask, uint8_t *output_mask, int sz) {
  uint8_t *ofmap_data = (uint8_t *)test_get_tensor_l2g_comp(rt_ctx, cvk_ctx, tl_ofmap);

  // NOTICE: heavy copy
  memcpy(output_ofmap, ofmap_data, sz);

  free(ofmap_data);

  if (!strcmp(name, "update_tbl_by_threshold")) {
    uint8_t *mask_data = (uint8_t *)test_get_tensor_l2g_comp(rt_ctx, cvk_ctx, tl_mask);
    memcpy(output_mask, mask_data, sz);
    free(mask_data);
  }
}

static void testbench(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, cvk_tg_shape_t *tg_shape,
                      int testcase_idx, bool is_pingpong = false) {
  // calculate the sizes we need on the host
  cvk_tl_shape_t ifmap_shape = {tg_shape->n, tg_shape->c, tg_shape->h, tg_shape->w};
  cvk_tl_shape_t ofmap_shape = ifmap_shape;

  uint64_t ifmap_size = tl_shape_size(&ifmap_shape);
  uint64_t ofmap_size = tl_shape_size(&ofmap_shape);

  // unit size is 1 byte
  int data_type_size = 1;

  // get input/output byte sizes
  uint64_t ifmap_bytesize = ifmap_size * data_type_size;
  uint64_t ofmap_bytesize = ofmap_size * data_type_size;

  // allocate on DDR
  // uint8_t *input_data = (uint8_t *)xmalloc(ifmap_bytesize);
  uint8_t *input_data2 = (uint8_t *)xmalloc(ifmap_bytesize);
  uint8_t *input_data3 = (uint8_t *)xmalloc(ifmap_bytesize);
  uint8_t *mask = (uint8_t *)xmalloc(ifmap_bytesize);
  uint8_t *update_tbl = (uint8_t *)xmalloc(ifmap_bytesize);
  uint8_t *_ref_data = (uint8_t *)xmalloc(ofmap_bytesize);
  uint8_t *ref_data = (uint8_t *)xmalloc(ofmap_bytesize);
  uint8_t *tpu_output_data = (uint8_t *)xmalloc(ofmap_bytesize);
  uint8_t *tpu_output_mask = (uint8_t *)xmalloc(ofmap_bytesize);

  // init input / output data in DDR
  uint8_t threshold, w1, w2;
  threshold = testbenchs[testcase_idx].threshold;
  w1 = testbenchs[testcase_idx].w1;
  w2 = testbenchs[testcase_idx].w2;
  init_input_2(input_data2, ifmap_size);
  init_input_3(input_data3, ifmap_size);
  // init_input(input_data2, ifmap_size);
  // init_input(input_data3, ifmap_size);
  init_mask(mask, ifmap_size);
  init_update_tbl(update_tbl, ifmap_size);
  init_ref(ref_data, ofmap_size);

  // keep the original output for loading into the TPU
  memcpy(_ref_data, ref_data, ofmap_bytesize);

  // compute the golden reference on the CPU
  testbenchs[testcase_idx].ref(ref_data, ofmap_size, mask, input_data2, input_data3, update_tbl,
                               threshold, w1, w2);
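
  // Tile along C so that each tile's channel count fits the number of NPU lanes;
  // each tile is then processed as an independent slice.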
  int tiles = std::ceil(ifmap_shape.c / (float)cvk_ctx->info.npu_num);

  ifmap_shape.c = ifmap_shape.c / tiles;

  cvk_tl_shape_t kernel_shape = ifmap_shape;
  kernel_shape.h = 1;
  kernel_shape.w = 1;

  cvk_tl_shape_t bias_shape = ifmap_shape;
  bias_shape.h = 1;
  bias_shape.w = 1;
  bias_shape.n = 2;

  uint64_t kernel_size = tl_shape_size(&kernel_shape);
  uint64_t bias_size = tl_shape_size(&bias_shape);
  uint64_t kernel_bytesize = kernel_size * data_type_size;
  uint64_t bias_bytesize = bias_size * data_type_size;
  uint8_t *kernel_data = (uint8_t *)xmalloc(kernel_bytesize);
  uint8_t *bias_data = (uint8_t *)xmalloc(bias_bytesize);

  // NOTICE: kernel and bias must be initialized
  init_kernel(kernel_data, kernel_size, -1);
  init_bias(bias_data, bias_size, 1);

  if (!strcmp(testbenchs[testcase_idx].name, "set_image_by_two_info_i8_dp")) {
    init_kernel(kernel_data, kernel_size, 1);
    init_bias(bias_data, bias_size, -1 * threshold);
  }

  if (is_pingpong) {
    // quirk: we tile h for an easy implementation
    ifmap_shape.h /= 2;
    tiles *= 2;
  }

  // sync input/output shapes
  ofmap_shape = ifmap_shape;

  // NOTICE: the batch dimension is ignored
  int shape_sz = ifmap_shape.c * ifmap_shape.h * ifmap_shape.w;

  // alloc on SRAM, just once
  cvk_fmt_t fmt = CVK_FMT_U8; // for 'mac' usage
  int eu_align = 1;           // don't care
  cvk_tl_t *tl_ifmap2[2] = {NULL, NULL};
  cvk_tl_t *tl_ifmap3[2] = {NULL, NULL};
  cvk_tl_t *tl_ofmap[2] = {NULL, NULL};
  cvk_tl_t *tl_mask[2] = {NULL, NULL};
  cvk_tl_t *tl_update_tbl[2] = {NULL, NULL};
  // must be placed last for the high part of 'mac'
  cvk_tl_t *tl_buf[2] = {NULL, NULL};
  cvk_tl_t *tl_kernel, *tl_bias;

  // alloc SRAM
  tl_kernel = test_alloc_tl(cvk_ctx, kernel_shape, CVK_FMT_I8, eu_align);
  tl_bias = test_alloc_tl(cvk_ctx, bias_shape, CVK_FMT_I8, /*eu_align=*/0);

  int alloc_nr = is_pingpong ? 2 : 1;
  for (int i = 0; i < alloc_nr; i++) {
    tl_ifmap2[i] = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align);
    tl_ifmap3[i] = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align);
    tl_ofmap[i] = test_alloc_tl(cvk_ctx, ofmap_shape, fmt, eu_align);
    tl_mask[i] = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align);
    tl_update_tbl[i] = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align);
    // must be placed last for the high part of 'mac'
    tl_buf[i] = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align);
  }

  // NOTICE: consider the residual tile
  int load_offset = 0;
  int store_offset = 0;
  int ret;
  int curr = flip;
  long elapsed;
  struct timeval t0, t1;
  gettimeofday(&t0, NULL);

  if (!is_pingpong) {
    int off = 0;
    for (int i = 0; i < tiles; i++) {
      // NOTICE: load on each iteration
      test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_kernel, kernel_data);
      test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_bias, bias_data);

      load(rt_ctx, cvk_ctx, tl_ifmap2[curr], input_data2 + off, tl_ifmap3[curr], input_data3 + off,
           tl_ofmap[curr], _ref_data + off, tl_mask[curr], mask + off, tl_update_tbl[curr],
           update_tbl + off);

      ret = testbenchs[testcase_idx].cvm_run(
          cvk_ctx, tl_ifmap2[curr], tl_ifmap3[curr], tl_buf[curr], tl_mask[curr],
          tl_update_tbl[curr], tl_kernel, tl_bias, threshold, w1, w2, tl_ofmap[curr]);

      if (ret) {
        fflush(stderr);
        printf("%s", "failed to generate commands, return\n");
        exit(-1);
      }

      store(rt_ctx, cvk_ctx, testbenchs[testcase_idx].name, tl_ofmap[curr], tpu_output_data + off,
            tl_mask[curr], tpu_output_mask + off, shape_sz);

      off += shape_sz;
    }
  } else {
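    // Ping-pong pipeline: with double-buffered local tensors the loop runs tiles + 2
    // iterations; iteration i loads tile i, runs the kernel on tile i - 1, and stores
    // tile i - 2, flipping the input/output buffers each step so the stages can overlap.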
    // TODO: not load at once
    int operand_num = 1;
    int input_flip = 0;
    int output_flip = 0;
    for (int i = 0; i < tiles + 2; i++) {
      cvk_ctx->ops->parallel_enable(cvk_ctx);
      // NOTICE: load on each iteration
      test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_kernel, kernel_data);
      test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_bias, bias_data);

      // store the finished tile from SRAM back to device memory
      if ((i - 2) >= 0 && (i - 2) % operand_num == operand_num - 1) {
        int curr = 1 - output_flip;
        store(rt_ctx, cvk_ctx, testbenchs[testcase_idx].name, tl_ofmap[curr],
              tpu_output_data + store_offset, tl_mask[curr], tpu_output_mask + store_offset,
              shape_sz);
        store_offset += shape_sz;
      }

      if (i - 1 >= 0 && i - 1 < tiles) {
        // compute on the previously loaded tile
        int curr = 1 - input_flip;
        // prepare command buffer
        ret = testbenchs[testcase_idx].cvm_run(
            cvk_ctx, tl_ifmap2[curr], tl_ifmap3[curr], tl_buf[curr], tl_mask[curr],
            tl_update_tbl[curr], tl_kernel, tl_bias, threshold, w1, w2, tl_ofmap[curr]);

        if (ret) {
          fflush(stderr);
          printf("%s", "failed to generate commands, return\n");
          exit(-1);
        }
        output_flip = 1 - output_flip;
      }

      if (i < tiles) {
        load(rt_ctx, cvk_ctx, tl_ifmap2[input_flip], input_data2 + load_offset,
             tl_ifmap3[input_flip], input_data3 + load_offset, tl_ofmap[input_flip],
             _ref_data + load_offset, tl_mask[input_flip], mask + load_offset,
             tl_update_tbl[input_flip], update_tbl + load_offset);
        load_offset += shape_sz;
        input_flip = 1 - input_flip;
      }
      cvk_ctx->ops->parallel_disable(cvk_ctx);
    }
  }

  // submit descriptor
  test_submit_comp(rt_ctx, cvk_ctx);

  gettimeofday(&t1, NULL);
  elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;

  // compare with the reference, byte by byte
  debug_print("%s comparing...", testbenchs[testcase_idx].name);
  for (uint32_t i = 0; i < (uint32_t)ofmap_bytesize; i++) {
    if (ref_data[i] != tpu_output_data[i]) {
      debug_print("comparing failed: output[%u] got %u, ref %u\n", i, tpu_output_data[i],
                  ref_data[i]);
      // fail case
      fflush(stderr);
      exit(-1);
    }
  }

  // also compare the additionally exported information (mask)
  if (!strcmp(testbenchs[testcase_idx].name, "update_tbl_by_threshold")) {
    for (uint32_t i = 0; i < (uint32_t)shape_sz; i++) {
      if (mask[i] != tpu_output_mask[i]) {
        debug_print("comparing mask failed: output[%u] got %u, ref %u\n", i, tpu_output_mask[i],
                    mask[i]);
        // fail case
        fflush(stderr);
        exit(-1);
      }
    }
  }

  if (tiles == 1) {
    debug_print("%s", " pass\n");
  } else {
    // report the elapsed time
    debug_print("(takes %ld us)\n", elapsed);
  }

  // free resources in TPU memory (reverse allocation order)
  for (int i = alloc_nr - 1; i >= 0; --i) {
    free_tl(cvk_ctx, tl_buf[i]);
    free_tl(cvk_ctx, tl_update_tbl[i]);
    free_tl(cvk_ctx, tl_mask[i]);
    free_tl(cvk_ctx, tl_ofmap[i]);
    free_tl(cvk_ctx, tl_ifmap3[i]);
    free_tl(cvk_ctx, tl_ifmap2[i]);
  }
  free_tl(cvk_ctx, tl_bias);
  free_tl(cvk_ctx, tl_kernel);

  // free resources in host memory
  // free(input_data);
  free(ref_data);
  free(tpu_output_data);
  free(tpu_output_mask);
  free(input_data2);
  free(input_data3);
  free(mask);
  free(update_tbl);
  free(_ref_data);
  free(kernel_data);
  free(bias_data);
}
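
// Entry point: run every test case on a small shape first, then on heavy data both
// without and with ping-pong double buffering.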
int main() {
  CVI_RT_HANDLE rt_ctx;
  cvk_context_t *cvk_ctx;

  // init the runtime / kernel structures
  test_init(&rt_ctx, &cvk_ctx);

  cvk_tg_shape_t tg_shape = {1, 20, 3, 4};

  // run all test cases on a small shape
  int testbench_nr = sizeof(testbenchs) / sizeof(testbenchs[0]);

  for (int i = 0; i < testbench_nr; i++) {
    testbench(&rt_ctx, cvk_ctx, &tg_shape, i);
  }
#if 1

  // run the tests on heavy data without ping-pong
  tg_shape = {1, 128, 340, 16};

  printf("[heavy data] w/o ping pong\n");

  // NOTICE: only the channel dimension is checked for tiling
  int tiles = std::ceil(tg_shape.c / (float)cvk_ctx->info.npu_num);
  if (tg_shape.c > cvk_ctx->info.npu_num) {
    debug_print("tile nr %d, channels tiled over %d NPUs\n", tiles, cvk_ctx->info.npu_num);
  }

  for (int i = 0; i < testbench_nr; i++) {
    testbench(&rt_ctx, cvk_ctx, &tg_shape, i);
  }

  // run the tests on heavy data with ping-pong
  tg_shape = {1, 128, 340, 16};
  printf("[heavy data] w/ ping pong\n");
  for (int i = 0; i < testbench_nr; i++) {
    testbench(&rt_ctx, cvk_ctx, &tg_shape, i, /*is_pingpong=*/true);
  }
#endif
  // de-init the runtime / kernel structures
  test_exit(&rt_ctx, cvk_ctx);

  printf("all pass\n");

  return 0;
}