// \file mask sample for gt(great than), ge(great equal), eq(equal), lt(less than), le(less equal) // header include #include #include // math #include // kerenl void init_input(uint8_t *input_data, uint64_t ifmap_bytesize, cvk_fmt_t fmt) { uint32_t fmt_size = cvm_bytesize_of_fmt(fmt); uint64_t sz = ifmap_bytesize / fmt_size; int round = 4; // random for (uint64_t i = 0; i < sz; i++) { uint8_t r[2]; r[0] = i % round; if (r[0] == 0) { r[0] = 1; // prevent mul to 0 } if (fmt_size == 2) { // bf16 uint16_t bf16 = convert_fp32_bf16((float)r[0]); memcpy(r, &bf16, fmt_size); } memcpy(&input_data[i * fmt_size], r, fmt_size); } } void init_ref(uint8_t *input_data, uint8_t *ref_data, cvk_tl_shape_t *ifmap_shape, cvk_fmt_t fmt) { uint32_t fmt_size = cvm_bytesize_of_fmt(fmt); int ref_idx = 0; // reduce ONLY hw for (uint32_t n = 0; n < ifmap_shape->n; n++) { for (uint32_t c = 0; c < ifmap_shape->c; c++) { float tmp = 1; for (uint32_t h = 0; h < ifmap_shape->h; h++) { for (uint32_t w = 0; w < ifmap_shape->w; w++) { uint32_t off = (n * ifmap_shape->c * ifmap_shape->h * ifmap_shape->w + c * ifmap_shape->h * ifmap_shape->w + h * ifmap_shape->w + w) * fmt_size; float v; if (fmt_size == 2) { // bf16 case uint16_t bf16; memcpy(&bf16, &input_data[off], fmt_size); v = convert_bf16_fp32(bf16); } else { v = input_data[off]; } tmp = v * tmp; } } uint8_t r[2]; if (fmt_size == 2) { // bf16 case uint16_t bf16 = convert_fp32_bf16(tmp); memcpy(r, (void *)&bf16, fmt_size); } else { r[0] = tmp; } memcpy(&ref_data[ref_idx * fmt_size], r, fmt_size); ref_idx++; } } } static void testbench(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, cvk_fmt_t fmt) { // alloc shape, align with \len uint32_t input_n = 1; uint32_t input_c = 3; uint32_t input_h = 2; uint32_t input_w = 2; cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w}; // NOTICE: ONLY reduce hw for performance cvk_tl_shape_t ofmap_shape = {input_n, input_c, 1, 1}; uint64_t ifmap_size = tl_shape_size(&ifmap_shape); uint64_t ofmap_size = tl_shape_size(&ofmap_shape); // unit size is 1 bytes, bf16 takes 2 bytes int data_type_size = 1; if (fmt == CVK_FMT_BF16) { data_type_size = 2; } uint64_t ifmap_bytesize = ifmap_size * data_type_size; uint64_t ofmap_bytesize = ofmap_size * data_type_size; // alloc input/output tl cvk_tl_t *tl_ifmap = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, CTRL_AL); // alloc data from ddr uint8_t *input_data = (uint8_t *)xmalloc(ifmap_bytesize); uint8_t *ref_data = (uint8_t *)xmalloc(ofmap_bytesize); // init input / output data in ddr init_input(input_data, ifmap_bytesize, fmt); init_ref(input_data, ref_data, &ifmap_shape, fmt); // send host memory->device memory->tpu_memory test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_ifmap, (uint8_t *)input_data); // prepare command buffer cvm_reduce_hw_mul(cvk_ctx, tl_ifmap); // submit descriptor test_submit_comp(rt_ctx, cvk_ctx); // reshape for reduce result tl_ifmap->shape = {tl_ifmap->shape.n, tl_ifmap->shape.c, 1, 1}; tl_ifmap->stride = cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_ifmap->shape, tl_ifmap->fmt, 1); // get data from tl uint8_t *ofmap_data = test_get_tensor_l2g_comp(rt_ctx, cvk_ctx, tl_ifmap); // compare with reference with byte for (uint32_t i = 0; i < ofmap_size; i++) { if (ref_data[i] != ofmap_data[i]) { fprintf(stderr, "comparing failed output[%u] got %u, ref %u\n", i, ofmap_data[i], ref_data[i]); // fail case exit(-1); } } // free resource from tpu memory free_tl(cvk_ctx, tl_ifmap); // free resource from host memory free(input_data); free(ref_data); free(ofmap_data); } int main() { CVI_RT_HANDLE rt_ctx; cvk_context_t *cvk_ctx; int round_mode; // align kerenl rounding mode round_mode = set_store_feround(); // init runtime / kerenl structure test_init(&rt_ctx, &cvk_ctx); printf("test reduce mul int8\n"); testbench(&rt_ctx, cvk_ctx, CVK_FMT_I8); printf("test reduce mul bf16\n"); testbench(&rt_ctx, cvk_ctx, CVK_FMT_BF16); // de-init runtime / kerenl structure test_exit(&rt_ctx, cvk_ctx); // restore rounding mode restore_feround(round_mode); return 0; }