// \file implement activation function(sigmoid) by interpolation lookup table, // please refer [here](https://en.wikipedia.org/wiki/Linear_interpolation) for more details // header include #include #include // math #include // kerenl // ========== user config ============ #define MAX_ERROR (0.004) // tolerance // for current example, we quauntize data to -8 ~ +8 // range depend on ur activation static int range_start = -8; static int range_end = 8; // ========== end of user config ============ // gen reference by cpu static double _gen_sigmoid(float x) { return 1.0 / (1.0 + exp(-(x))); } // gen reference static void gen_ref(uint16_t *ofmap, uint16_t *ifmap, cvk_tl_shape_t ifmap_shape) { for (uint64_t i = 0; i < tl_shape_size(&ifmap_shape); i++) { ofmap[i] = convert_fp32_bf16(_gen_sigmoid(convert_bf16_fp32(ifmap[i]))); } } // verify cpu data with tpu static bool verify(uint16_t *ofmap_data, uint16_t *ref_data, uint64_t ofmap_size) { int count = 0; uint64_t size = ofmap_size; for (uint64_t i = 0; i < size; i++) { float got = convert_bf16_fp32(ofmap_data[i]); float exp = convert_bf16_fp32(ref_data[i]); if (fabs(got - exp) > MAX_ERROR) { fprintf(stderr, "[%d] comparing failed at ofmap_data[%u], got %x, exp %x, " "diff(%f - %f) is %f\n", count, (uint32_t)i, ofmap_data[i], ref_data[i], got, exp, fabs(got - exp)); count++; } } // exit if fail if (count != 0) { printf("error count is %d\n", count); exit(-1); } return true; } // gen random input for test static void gen_input(uint16_t *ifmap, uint64_t ifmap_size) { int table_hw = 256; for (uint64_t i = 0; i < ifmap_size; i++) { // input range is -8 ~ +8 float input = ((int)i % 7) * (((int)i % 2) ? 1 : -1) + 0.03 + (i % table_hw) * 0.002; ifmap[i] = convert_fp32_bf16(input); } } // main code for test sigmoid interpolate implement by lookup table static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk) { // example for input tensor cvk_tl_shape_t ifmap_shape = {1, 32, 16, 16}; cvk_fmt_t fmt = CVK_FMT_BF16; // get table / input shape cvk_tl_shape_t table_shape; cvm_table_shape(bmk, &table_shape); cvk_tl_shape_t ofmap_shape = ifmap_shape; uint64_t ifmap_size = tl_shape_size(&ifmap_shape); uint64_t table_size = tl_shape_size(&table_shape); uint64_t ofmap_size = tl_shape_size(&ofmap_shape); // get table/input size int data_type_size = 1; if (fmt == CVK_FMT_BF16) { // bf16 takes 2 bytes data_type_size = 2; } uint64_t ifmap_bytesize = ifmap_size * data_type_size; uint64_t table_bytesize = table_size * data_type_size; uint64_t ofmap_bytesize = ofmap_size * data_type_size; // alloc host memory uint16_t *ifmap = (uint16_t *)xmalloc(ifmap_bytesize); uint16_t *table_data = (uint16_t *)xmalloc(table_bytesize); uint16_t *table_data_slope = (uint16_t *)xmalloc(table_bytesize); uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize); // gen input and assign data in host gen_input(ifmap, ifmap_size); // gen table, interpolation need 2 tables, one for lookup, another one is slope cvm_sigmoid_tbl(table_data, table_data_slope, &table_shape, range_start, range_end); // gen reference gen_ref(ref_data, ifmap, ofmap_shape); // alloc input / output / tmp / lookup table / slope table cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1); cvk_tl_t *cvk_tl_table_answer = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); cvk_tl_t *cvk_tl_table_answer_slope = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); cvk_tl_t *tl_buf = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); // device memory load to local memory test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)ifmap); test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer, (uint8_t *)table_data); test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer_slope, (uint8_t *)table_data_slope); // get quantize(scale) value float scale = cvm_sigmoid_scale(range_start, range_end); // emit core function cvm_emit_sigmoid(bmk, tl_ifmap, tl_buf, cvk_tl_table_answer, cvk_tl_table_answer_slope, tl_ofmap_bf16, scale); // get result from device to host uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, tl_ofmap_bf16); // verify data with tolerance verify(ofmap_data, ref_data, ofmap_size); // release device memory in revert order free_tl(bmk, tl_ofmap_bf16); free_tl(bmk, tl_buf); free_tl(bmk, cvk_tl_table_answer_slope); free_tl(bmk, cvk_tl_table_answer); free_tl(bmk, tl_ifmap); // release host memory free(ifmap); free(table_data); free(table_data_slope); free(ref_data); free(ofmap_data); } int main() { CVI_RT_HANDLE ctx; cvk_context_t *bmk; int round_mode; round_mode = set_store_feround(); // init runtime / kerenl structure test_init(&ctx, &bmk); // emit test case testbench(&ctx, bmk); // de-init runtime / kerenl structure test_exit(&ctx, bmk); // restore rounding restore_feround(round_mode); return 0; }