/*
 * Host-side helpers written in plain C: element-wise array comparison,
 * feature-map padding, fully-connected and pooling computations, and
 * saturating narrowing of int results to 8 or 16 bits.
 *
 * NOTE(review): this file reached review with its line structure collapsed
 * and with some text missing.
 *   - The targets of all eight #include directives were lost.  The six
 *     headers below are the ones the visible code needs; the remaining two
 *     could not be recovered and must be restored by hand.
 *   - fill_pad_fmap_fp32(), inner_product(), fill_int_with_int8() and
 *     fill_int_with_int16() are called below but are not defined in the
 *     visible text; presumably they live in the missing region or in one of
 *     the lost headers -- confirm.
 *   - A region in the middle of the file is truncated; see the fenced block
 *     further down.
 */
#include <float.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Both macros evaluate their arguments more than once: pass no side effects. */
#define math_min(x, y) ((x) < (y) ? (x) : (y))
#define math_max(x, y) ((x) > (y) ? (x) : (y))

/*
 * The original also carried self-typedefs such as "typedef uint8_t uint8_t;".
 * They add nothing (and are rejected before C11), so only the aliases that
 * introduce a new name are kept.
 */
typedef int64_t s64;

typedef uint32_t bmerr_t;
#define BM_SUCCESS 0               // The operation was successful
#define BM_ERR_AGAIN 1             // Not ready yet
#define BM_ERR_FAILURE 2           // General failure
#define BM_ERR_TIMEOUT 3           // Timeout
#define BM_ERR_UNINITIALIZED 4     // Uninitialized
#define BM_ERR_INVALID_ARGUMENT 5  // Arguments invalid
#define BM_ERR_NOMEM 6             // Not enough memory
#define BM_ERR_DATA 7              // Data error
#define BM_ERR_BUSY 8              // Busy
#define BM_ERR_NOT_SUPPORTED 9     // Not supported yet

typedef uint32_t BLOB_OP;
#define BLOB_ADD 0
#define BLOB_SUB 1
#define BLOB_MUL 2
#define BLOB_DIV 3
#define BLOB_INVALID 4

/*
 * Defined at the end of this file but used earlier; declared here so the
 * calls do not rely on an implicit declaration.
 */
int satu_2_8bit(const int *pBuff, int len, int8_t *pByteOut, int rshiftbits,
                int round_floor, int sign_unsign);
int satu_2_16bit(const int *pBuff, int len, short *pByteOut, int rshiftbits,
                 int round_floor, int sign_unsign);

/*
 * Flattens a 4-component index into a linear offset, using shape[1..3] as the
 * extents of the three inner dimensions (shape[0] is not read).
 */
static inline int calc_offset(int *shape, int *offset)
{
  return ((offset[0] * shape[1] + offset[1]) * shape[2] + offset[2]) *
             shape[3] +
         offset[3];
}

/* Linear index of (h, w2) in a row-major 2-D array whose rows hold w1 items. */
static int index_get(int h, int w1, int w2)
{
  return h * w1 + w2;
}

/*
 * Compares two float arrays with a mixed relative/absolute tolerance.
 *
 * For each element: if the larger magnitude exceeds 1.0 the difference must
 * not exceed delta times the smaller magnitude (and the smaller magnitude
 * must be at least 1e-20); otherwise the absolute difference must not exceed
 * delta.
 *
 * Returns 0 when every element passes, -1 at the first tolerance violation,
 * and -2 at the first element where p_got is NaN but p_exp is not.  A
 * diagnostic line is printed before each non-zero return.
 *
 * Every ordered comparison involving NaN is false, so a NaN element never
 * trips the tolerance checks.  The original carried "both exp and got are
 * NAN" branches inside those checks; they could never execute and have been
 * dropped.
 * NOTE(review): as a consequence an element where only p_exp is NaN passes
 * silently -- confirm that this is intended.
 */
int array_cmp_float_rel(const char *const info, float *p_exp, float *p_got,
                        int count, float delta)
{
  for (int idx = 0; idx < count; idx++) {
    const double abs_exp = fabs(p_exp[idx]);
    const double abs_got = fabs(p_got[idx]);
    const double abs_diff = fabs(p_exp[idx] - p_got[idx]);

    if (math_max(abs_exp, abs_got) > 1.0) {
      // compare rel
      const double smaller = math_min(abs_exp, abs_got);

      if (smaller < 1e-20 || abs_diff > delta * smaller) {
        printf("%s rel error at index %d exp %.20f got %.20f\n", info, idx,
               p_exp[idx], p_got[idx]);
        return -1;
      }
    } else if (abs_diff > delta) {
      printf("%s abs error at index %d exp %.20f got %.20f\n", info, idx,
             p_exp[idx], p_got[idx]);
      return -1;
    }

    if (isnan(p_got[idx]) && !isnan(p_exp[idx])) {
      uint32_t exp_bits;
      uint32_t got_bits;

      printf("%s, found nans idx %d\n", info, idx);
      printf("floating from exp %.10f got %.10f\n", p_exp[idx], p_got[idx]);
      /* memcpy is the well-defined way to look at a float's bit pattern. */
      memcpy(&exp_bits, &p_exp[idx], sizeof exp_bits);
      memcpy(&got_bits, &p_got[idx], sizeof got_bits);
      printf("hex form exp %8.8x got %8.8x\n", (unsigned int)exp_bits,
             (unsigned int)got_bits);
      return -2;
    }
  }
  return 0;
}

/*
 * Compares two float arrays.
 *
 * With delta == 0 the comparison is exact, except that an element which is
 * NaN on both sides is reported and accepted.  Any other delta defers to
 * array_cmp_float_rel().
 *
 * Returns 0 on match, otherwise the non-zero code of the first mismatch.
 */
int array_cmp_float(const char *const info, float *p_exp, float *p_got,
                    int count, float delta)
{
  if (delta != 0.0f)
    return array_cmp_float_rel(info, p_exp, p_got, count, delta);

  for (int idx = 0; idx < count; idx++) {
    if (p_exp[idx] == p_got[idx])
      continue;

    printf("%s error at index %d exp %.20f got %.20f\n", info, idx,
           p_exp[idx], p_got[idx]);
    if (isnan(p_exp[idx]) && isnan(p_got[idx])) {
      printf("both exp and got are NAN\n");
      /*
       * Keep scanning.  The original returned 0 here, which silently skipped
       * every element after the first NaN pair.
       */
      continue;
    }
    return -1;
  }
  return 0;
}

/* Exact comparison of two int arrays; returns 0 on match, -1 on mismatch. */
int array_cmp_int(const char *const info, int *p_exp, int *p_got, int count)
{
  for (int idx = 0; idx < count; idx++) {
    if (p_exp[idx] == p_got[idx])
      continue;
    printf("%s error at index %d exp %d got %d\n", info, idx, p_exp[idx],
           p_got[idx]);
    return -1;
  }
  return 0;
}

/* Exact comparison of two int8 arrays; returns 0 on match, -1 on mismatch. */
int array_cmp_int8(const char *const info, const int8_t *p_exp,
                   const int8_t *p_got, int count)
{
  for (int idx = 0; idx < count; idx++) {
    if (p_exp[idx] == p_got[idx])
      continue;
    printf("%s error at index %d exp %d got %d\n", info, idx, p_exp[idx],
           p_got[idx]);
    return -1;
  }
  return 0;
}

/*
 * Extent of one spatial dimension after dilution and padding: ins_h extra
 * cells between neighbouring source elements, ins_h_l extra cells after the
 * last one, plus both paddings.
 */
int calc_dilute_hw(int h, int ins_h, int ins_h_l, int pad_h_b, int pad_h_t)
{
  return (h - 1) * (ins_h + 1) + ins_h_l + 1 + pad_h_t + pad_h_b;
}

/*
 * Number of window positions along one dimension of extent hw for a window of
 * extent khw moved in steps of stride.  stride must be non-zero.
 */
int calc_output_hw(int hw, int khw, int stride)
{
  return (hw - khw) / stride + 1;
}

/*
 * Builds the padded/diluted version of one h_before x w_before int8 plane.
 *
 * Every cell of the result is first set to val (via memset, so only its low
 * byte is used); source element (h, w) is then copied to row
 * h * (ins_h + 1) + pad_t, column w * (ins_w + 1) + pad_l.
 *
 * If *pafter is NULL a buffer is allocated with malloc() and handed back
 * through *pafter; the caller owns it.  Otherwise *pafter is reused and must
 * already be large enough.
 *
 * Returns BM_SUCCESS, BM_ERR_INVALID_ARGUMENT (NULL before/pafter) or
 * BM_ERR_NOMEM.
 */
int fill_pad_fmap_int8(const int8_t *before, int8_t **pafter, int val,
                       int pad_l, int pad_r, int pad_t, int pad_b, int ins_h,
                       int ins_w, int ins_h_last, int ins_w_last, int h_before,
                       int w_before)
{
  /* Validate before touching *pafter; the original dereferenced it first. */
  if (!before || !pafter)
    return BM_ERR_INVALID_ARGUMENT;

  const int w_after =
      (w_before - 1) * (ins_w + 1) + ins_w_last + 1 + pad_l + pad_r;
  const int h_after =
      (h_before - 1) * (ins_h + 1) + ins_h_last + 1 + pad_t + pad_b;
  const size_t cells = (size_t)w_after * (size_t)h_after;

  int8_t *after = *pafter;
  if (!after) {
    after = malloc(sizeof *after * cells);
    if (!after)
      return BM_ERR_NOMEM;
  }

  memset(after, val, cells);
  for (int h = 0; h < h_before; h++) {
    const int dst_row = (h * (ins_h + 1) + pad_t) * w_after + pad_l;
    for (int w = 0; w < w_before; w++)
      after[dst_row + w * (ins_w + 1)] = before[h * w_before + w];
  }

  *pafter = after;
  return BM_SUCCESS;
}

/*
 * NOTE(review): SOURCE IS TRUNCATED IN THE REGION BELOW.
 *
 * Text is missing between "for(int i=0;i" and "= 0 && in_y".  What is left
 * is the beginning of fill_pad_fmap_bf16() -- cut inside a debug-print
 * section that opened its own, now unterminated, "#if 0" -- followed by the
 * tail of a convolution-style loop nest whose function header is gone.
 * Neither piece can compile, and the missing text cannot be derived from
 * what remains, so the remnant is kept verbatim (only re-wrapped) and fenced
 * off.  Restore the region from the original file and remove this fence.
 * The stray "#if 0" is deliberately kept in the middle of a line so that the
 * preprocessor does not treat it as a nested directive.
 */
#if 0
int fill_pad_fmap_bf16(const uint16_t *before, uint16_t **pafter, int val, int pad_l, int pad_r, int pad_t, int pad_b,
int ins_h, int ins_w, int ins_h_last, int ins_w_last, int h_before, int w_before) {
int w_after = (w_before - 1) * (ins_w + 1) + ins_w_last + 1 + pad_l + pad_r;
int h_after = (h_before - 1) * (ins_h + 1) + ins_h_last + 1 + pad_t + pad_b;
uint16_t *after = *pafter; if (!before || !pafter) return BM_ERR_INVALID_ARGUMENT;
if (!after) { after = malloc(sizeof(uint16_t) * w_after * h_after); if (!after) return BM_ERR_NOMEM; }
for (int i = 0; i < w_after * h_after; i++) after[i] = val;
for (int h = 0; h < h_before; h++) { for (int w = 0; w < w_before; w++) {
int i = (h * (ins_h + 1) + pad_t) * w_after + w * (ins_w + 1) + pad_l; after[i] = before[h * w_before + w];
} } #if 0 printf("bf16 padding:\n"); for(int i=0;i= 0 && in_y < input_h && in_x >= 0 && in_x < input_w) {
weight_offset[0] = o + o_head; weight_offset[1] = k;
if (flip) { weight_offset[2] = (kh - 1 - p); weight_offset[3] = (kw - 1 - q); }
else { weight_offset[2] = p; weight_offset[3] = q; }
in_offset[0] = n; in_offset[1] = k + k_head; in_offset[2] = in_y; in_offset[3] = in_x;
ofmap_f[calc_offset(o_shape, out_offset)] += ifmap_f[calc_offset(i_shape, in_offset)] * weight_f[calc_offset(k_shape, weight_offset)];
if (k_g == 1 && kh == 1 && kw == 1) {
ofmap_f[calc_offset(o_shape, out_offset)] = ifmap_f[calc_offset(i_shape, in_offset)] * weight_f[calc_offset(k_shape, weight_offset)];
} } } } }
if (using_bias) { ofmap_f[calc_offset(o_shape, out_offset)] += bias_f[o + o_head]; }
if (result_add) { ofmap_f[calc_offset(o_shape, out_offset)] += result_init; }
} } } } } }
#endif

/*
 * Decides which of the two partial sums in native_fc_int8() receives the
 * term with sequence number step: non-zero for steps 0, 1, 4, 6, 8, ...,
 * zero for steps 2, 3, 5, 7, ...
 */
static int native_fc_step_uses_first_sum(int step)
{
  return (step % 2 == 0 && step != 2) || step == 1;
}

/*
 * Fully-connected computation on 8-bit operands.
 *
 * For every row h of L (L_row_num x L_col_num, row-major) and column w of R
 * (L_col_num x R_col_num, row-major):
 *
 *   Y_ref[h][w] = sum_k L[h][k] * R[k][w]
 *                 + B[w]                          (only if B is non-NULL)
 *                 + Y[h][w] * 2^l_shift_width     (only if Y is non-NULL)
 *
 * L_sign / R_sign / B_sign select whether the corresponding operand is read
 * as signed or unsigned.  The terms are distributed over two partial sums
 * that are added at the end; with int arithmetic this does not change the
 * result.
 *
 * Afterwards Y_ref is narrowed with satu_2_8bit() (is_result_int8 != 0) or
 * satu_2_16bit(), using r_shift_width with rounding and a signed range
 * unless do_relu is set, and the narrowed values are passed together with
 * Y_ref to fill_int_with_int8() / fill_int_with_int16() (defined elsewhere;
 * presumably they store the narrowed values back into Y_ref -- confirm).
 *
 * Returns BM_SUCCESS, BM_ERR_NOMEM, or the error reported by the saturation
 * routine.
 */
int native_fc_int8(const int8_t *L, const int8_t *R, const int16_t *B,
                   const int16_t *Y, int *Y_ref, int L_row_num, int L_col_num,
                   int R_col_num, int L_sign, int R_sign, int B_sign,
                   int l_shift_width, int r_shift_width, int is_result_int8,
                   int do_relu)
{
  const uint8_t *uL = (const uint8_t *)L;
  const uint8_t *uR = (const uint8_t *)R;
  const uint16_t *uB = (const uint16_t *)B;

  for (int hidx = 0; hidx < L_row_num; hidx++) {
    for (int widx = 0; widx < R_col_num; widx++) {
      int sum_first = 0;
      int sum_second = 0;
      int step;

      for (step = 0; step < L_col_num; step++) {
        const int idx_L = index_get(hidx, L_col_num, step);
        const int idx_R = index_get(step, R_col_num, widx);
        const int lhs = L_sign ? L[idx_L] : uL[idx_L];
        const int rhs = R_sign ? R[idx_R] : uR[idx_R];

        if (native_fc_step_uses_first_sum(step))
          sum_first += lhs * rhs;
        else
          sum_second += lhs * rhs;
      }
      /* One sequence number is skipped between the products and the bias. */
      step++;

      if (B) {
        const int bias = B_sign ? (int)B[widx] : (int)uB[widx];

        if (native_fc_step_uses_first_sum(step))
          sum_first += bias;
        else
          sum_second += bias;
        step++;
      }

      const int idx_Y = index_get(hidx, R_col_num, widx);
      if (Y) {
        /*
         * Multiply instead of "<<": left-shifting a negative value is
         * undefined behaviour, and Y holds signed data.
         */
        const int addend = Y[idx_Y] * (1 << l_shift_width);

        if (native_fc_step_uses_first_sum(step))
          sum_first += addend;
        else
          sum_second += addend;
      }
      Y_ref[idx_Y] = sum_first + sum_second;
    }
  }

  /* Only the scratch buffer that is actually needed is allocated. */
  const int out_count = L_row_num * R_col_num;
  int ret;

  if (is_result_int8) {
    int8_t *narrow8 = malloc(sizeof *narrow8 * out_count);
    if (!narrow8 && out_count > 0)
      return BM_ERR_NOMEM;

    ret = satu_2_8bit(Y_ref, out_count, narrow8, r_shift_width, 1, !do_relu);
    if (ret == BM_SUCCESS)
      fill_int_with_int8(Y_ref, narrow8, out_count);
    free(narrow8);
  } else {
    int16_t *narrow16 = malloc(sizeof *narrow16 * out_count);
    if (!narrow16 && out_count > 0)
      return BM_ERR_NOMEM;

    ret = satu_2_16bit(Y_ref, out_count, narrow16, r_shift_width, 1,
                       !do_relu);
    if (ret == BM_SUCCESS)
      fill_int_with_int16(Y_ref, narrow16, out_count);
    free(narrow16);
  }
  return ret;
}

/*
 * Weighted (average) pooling on int8 planes.
 *
 * Each of the input_n * input_c planes of i_fmap (input_h x input_w) is
 * padded with zeros via fill_pad_fmap_int8().  For every output position the
 * kh x kw window and its weights are handed to inner_product() (defined
 * elsewhere; presumably the dot product of the two int arrays -- confirm),
 * bias[c] is added when bias is non-NULL, and the result is shifted right by
 * r_shift_width with rounding and saturated to 8 bits by satu_2_8bit().
 *
 * Weights: when const_weight is non-zero the single byte *weight (read as
 * unsigned) is used for every tap; otherwise weight points to one kh * kw
 * block of signed bytes per channel, restarted for every n.  weight is
 * dereferenced in both cases and must not be NULL.
 *
 * input_sign selects signed/unsigned interpretation of the input bytes and
 * satu_sign the signed/unsigned saturation range.
 *
 * Returns BM_SUCCESS; BM_ERR_INVALID_ARGUMENT when kh * kw <= 0 or when
 * saturation fails; BM_ERR_NOMEM or the padding routine's error otherwise.
 */
int native_pooling_ave_int8(const int8_t *i_fmap, const void *weight,
                            const int16_t *bias, int8_t *o_fmap, int input_n,
                            int input_c, int input_h, int input_w, int kh,
                            int kw, int pad_h_t, int pad_h_b, int pad_w_l,
                            int pad_w_r, int stride_h, int stride_w, int ins_h,
                            int ins_w, int ins_h_last, int ins_w_last,
                            int input_sign, int satu_sign, int r_shift_width,
                            int const_weight)
{
  if (kh * kw <= 0)
    return BM_ERR_INVALID_ARGUMENT;

  const int window = kh * kw;
  const int h_after =
      calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b);
  const int w_after =
      calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r);
  const int output_h = calc_output_hw(h_after, kh, stride_h);
  const int output_w = calc_output_hw(w_after, kw, stride_w);
  const int8_t *weight_arr = weight;
  uint8_t avg_const_weight = 0;
  int8_t *i_fmap_pad = NULL;  /* allocated by the first padding call, reused */
  int ret = BM_SUCCESS;

  int *window_in = malloc(sizeof *window_in * window);
  int *window_wt = malloc(sizeof *window_wt * window);
  if (!window_in || !window_wt) {
    ret = BM_ERR_NOMEM;
    goto out;
  }

  avg_const_weight = *(const uint8_t *)weight;

  for (int n = 0; n < input_n; n++) {
    if (const_weight == 0)
      weight_arr = weight;

    for (int c = 0; c < input_c; ++c) {
      ret = fill_pad_fmap_int8(i_fmap, &i_fmap_pad, 0, pad_w_l, pad_w_r,
                               pad_h_t, pad_h_b, ins_h, ins_w, ins_h_last,
                               ins_w_last, input_h, input_w);
      if (ret != BM_SUCCESS)
        goto out;

      for (int ph = 0; ph < output_h; ++ph) {
        for (int pw = 0; pw < output_w; ++pw) {
          const int hstart = ph * stride_h;
          const int wstart = pw * stride_w;
          const int pool_index = index_get(ph, output_w, pw);
          int avg_pool_result;

          for (int h = 0; h < kh; h++) {
            for (int w = 0; w < kw; w++) {
              const int src = index_get(hstart + h, w_after, w + wstart);
              const int tap = index_get(h, kw, w);

              window_in[tap] = input_sign ? i_fmap_pad[src]
                                          : (uint8_t)(i_fmap_pad[src]);
              window_wt[tap] =
                  const_weight ? avg_const_weight : weight_arr[tap];
            }
          }

          inner_product(window_in, window_wt, window, &avg_pool_result);
          if (bias)
            avg_pool_result += bias[c];

          if (satu_2_8bit(&avg_pool_result, 1, o_fmap + pool_index,
                          r_shift_width, 1, satu_sign) != BM_SUCCESS) {
            ret = BM_ERR_INVALID_ARGUMENT;
            goto out;
          }
        }
      }

      i_fmap += input_w * input_h;
      if (const_weight == 0)
        weight_arr += window;
      o_fmap += output_w * output_h;
    }
  }

out:
  free(i_fmap_pad);
  free(window_in);
  free(window_wt);
  return ret;
}

/*
 * Max pooling on int8 planes.
 *
 * Dilution is not supported: any non-zero ins_* argument yields
 * BM_ERR_INVALID_ARGUMENT.  Each of the input_n * input_c planes is padded
 * with the smallest representable value (-128 when input_sign is set, 0
 * otherwise) and the maximum of every kh x kw window is stored to o_fmap.
 *
 * Returns BM_SUCCESS, BM_ERR_INVALID_ARGUMENT, or the padding routine's
 * error.
 */
int native_pooling_max_int8(const int8_t *i_fmap, int8_t *o_fmap, int input_n,
                            int input_c, int input_h, int input_w, int kh,
                            int kw, int pad_h_t, int pad_h_b, int pad_w_l,
                            int pad_w_r, int stride_h, int stride_w, int ins_h,
                            int ins_w, int ins_h_last, int ins_w_last,
                            int input_sign)
{
  if (ins_h != 0 || ins_w != 0 || ins_h_last != 0 || ins_w_last != 0)
    return BM_ERR_INVALID_ARGUMENT;

  const int h_after =
      calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b);
  /* With all ins_* equal to zero this is input_w + pad_w_l + pad_w_r. */
  const int w_after =
      calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r);
  const int output_h = calc_output_hw(h_after, kh, stride_h);
  const int output_w = calc_output_hw(w_after, kw, stride_w);
  const int max_init = input_sign ? -128 : 0;
  int8_t *i_fmap_pad = NULL;  /* allocated by the first padding call, reused */

  for (int nc = 0; nc < input_n * input_c; nc++) {
    const int ret = fill_pad_fmap_int8(i_fmap, &i_fmap_pad, max_init, pad_w_l,
                                       pad_w_r, pad_h_t, pad_h_b, 0, 0, 0, 0,
                                       input_h, input_w);
    if (ret != BM_SUCCESS) {
      free(i_fmap_pad);
      return ret;
    }

    for (int ph = 0; ph < output_h; ++ph) {
      for (int pw = 0; pw < output_w; ++pw) {
        const int hstart = ph * stride_h;
        const int wstart = pw * stride_w;
        int best = max_init;

        for (int h = 0; h < kh; h++) {
          for (int w = 0; w < kw; w++) {
            const int src = index_get(hstart + h, w_after, w + wstart);
            const int val =
                input_sign ? i_fmap_pad[src] : (uint8_t)i_fmap_pad[src];

            if (val > best)
              best = val;
          }
        }
        o_fmap[index_get(ph, output_w, pw)] = best;
      }
    }

    i_fmap += input_w * input_h;
    o_fmap += output_w * output_h;
  }

  free(i_fmap_pad);
  return BM_SUCCESS;
}

/*
 * Max pooling on float planes.
 *
 * Each of the input_n * input_c planes is padded with -FLT_MAX through
 * fill_pad_fmap_fp32() (defined elsewhere) and the maximum of every window
 * is stored to ofmap.  Windows are clipped to the padded plane.
 *
 * NOTE(review): the paddings are passed as (top, bottom, left, right) here,
 * whereas the int8 paths pass (left, right, top, bottom) to
 * fill_pad_fmap_int8().  The fp32 signature is not visible -- confirm.
 *
 * Returns BM_SUCCESS, BM_ERR_NOMEM, or BM_ERR_FAILURE if padding fails.
 */
int native_pooling_max_fp32(const float *ifmap, float *ofmap, int input_n,
                            int input_c, int input_h, int input_w, int kh,
                            int kw, int pad_h_t, int pad_h_b, int pad_w_l,
                            int pad_w_r, int stride_h, int stride_w, int ins_h,
                            int ins_w, int ins_h_last, int ins_w_last)
{
  const int h_after =
      calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b);
  const int w_after =
      calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r);
  const int output_h = calc_output_hw(h_after, kh, stride_h);
  const int output_w = calc_output_hw(w_after, kw, stride_w);

  float *ifmap_after = malloc(sizeof(float) * h_after * w_after);
  if (ifmap_after == NULL) {
    /* %d, not %u: both arguments are int. */
    printf("No enough memory[h_after, w_after]: [%d, %d].\n", h_after,
           w_after);
    return BM_ERR_NOMEM;
  }

  for (int n = 0; n < input_n; n++) {
    for (int c = 0; c < input_c; c++) {
      const int ret = fill_pad_fmap_fp32(
          ifmap, &ifmap_after, -FLT_MAX, pad_h_t, pad_h_b, pad_w_l, pad_w_r,
          ins_h, ins_w, ins_h_last, ins_w_last, input_h, input_w);
      if (ret != BM_SUCCESS) {
        printf("Failed to pad input fmap.\n");
        free(ifmap_after);
        return BM_ERR_FAILURE;
      }

      for (int h = 0; h < output_h; h++) {
        for (int w = 0; w < output_w; w++) {
          const int rf_h = h * stride_h;
          const int rf_w = w * stride_w;
          const int kh_end = math_min(kh, h_after - rf_h);
          const int kw_end = math_min(kw, w_after - rf_w);
          const float *rf_addr = ifmap_after + rf_h * w_after + rf_w;
          float max_val = -FLT_MAX;

          for (int i = 0; i < kh_end; i++) {
            for (int j = 0; j < kw_end; j++)
              max_val = math_max(rf_addr[i * w_after + j], max_val);
          }
          ofmap[h * output_w + w] = max_val;
        }
      }

      ifmap += input_h * input_w;
      ofmap += output_h * output_w;
    }
  }

  free(ifmap_after);
  return BM_SUCCESS;
}

/*
 * Average pooling on float planes.
 *
 * Each of the input_n * input_c planes is padded with 0 through
 * fill_pad_fmap_fp32() (defined elsewhere).  Every window element is
 * multiplied by avg_pooling_const and accumulated into one of two partial
 * sums, chosen by the parity of its position within the clipped window; the
 * output is the sum of both.  The split fixes the float summation order and
 * must be kept to reproduce results bit for bit.
 *
 * NOTE(review): same padding-argument order question as in
 * native_pooling_max_fp32().
 *
 * Returns BM_SUCCESS, BM_ERR_NOMEM, or BM_ERR_FAILURE if padding fails.
 */
int native_pooling_avg_fp32(const float *ifmap, float *ofmap, int input_n,
                            int input_c, int input_h, int input_w, int kh,
                            int kw, int pad_h_t, int pad_h_b, int pad_w_l,
                            int pad_w_r, int stride_h, int stride_w, int ins_h,
                            int ins_w, int ins_h_last, int ins_w_last,
                            float avg_pooling_const)
{
  const int h_after =
      calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b);
  const int w_after =
      calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r);
  const int output_h = calc_output_hw(h_after, kh, stride_h);
  const int output_w = calc_output_hw(w_after, kw, stride_w);

  float *ifmap_after = malloc(sizeof(float) * h_after * w_after);
  if (ifmap_after == NULL) {
    /* %d, not %u: both arguments are int. */
    printf("No enough memory[h_after, w_after]: [%d, %d].\n", h_after,
           w_after);
    return BM_ERR_NOMEM;
  }

  for (int n = 0; n < input_n; n++) {
    for (int c = 0; c < input_c; c++) {
      const int ret = fill_pad_fmap_fp32(
          ifmap, &ifmap_after, 0, pad_h_t, pad_h_b, pad_w_l, pad_w_r, ins_h,
          ins_w, ins_h_last, ins_w_last, input_h, input_w);
      if (ret != BM_SUCCESS) {
        printf("Failed to pad input fmap.\n");
        free(ifmap_after);
        return BM_ERR_FAILURE;
      }

      for (int h = 0; h < output_h; h++) {
        for (int w = 0; w < output_w; w++) {
          const int rf_h = h * stride_h;
          const int rf_w = w * stride_w;
          const int kh_end = math_min(kh, h_after - rf_h);
          const int kw_end = math_min(kw, w_after - rf_w);
          const float *rf_addr = ifmap_after + rf_h * w_after + rf_w;
          float dot_product_even = 0.0;
          float dot_product_odd = 0.0;

          for (int i = 0; i < kh_end; i++) {
            for (int j = 0; j < kw_end; j++) {
              const float term = rf_addr[i * w_after + j] * avg_pooling_const;

              if ((i * kw_end + j) % 2)
                dot_product_odd += term;
              else
                dot_product_even += term;
            }
          }
          ofmap[h * output_w + w] = dot_product_even + dot_product_odd;
        }
      }

      ifmap += input_h * input_w;
      ofmap += output_h * output_w;
    }
  }

  free(ifmap_after);
  return BM_SUCCESS;
}

/*
 * Max pooling over float data indexed by flat output position.
 *
 * index runs over count outputs and is decomposed, innermost first, into
 * (pw, ph, c, n).  The window is clipped to the unpadded input, so padding
 * cells never take part.  top_data[index] receives the maximum and
 * mask_data[index] its offset inside the (n, c) input plane; an empty window
 * yields -FLT_MAX and -1.  num is unused.
 */
void native_pooling_forward_max(const float *bottom_data, float *top_data,
                                int *mask_data, const int count, const int num,
                                const int channels, const int height,
                                const int width, const int pooled_height,
                                const int pooled_width, const int kernel_h,
                                const int kernel_w, const int stride_h,
                                const int stride_w, const int pad_h,
                                const int pad_w)
{
  (void)num;

  for (int index = 0; index < count; ++index) {
    const int pw = index % pooled_width;
    const int ph = (index / pooled_width) % pooled_height;
    const int c = (index / pooled_width / pooled_height) % channels;
    const int n = index / pooled_width / pooled_height / channels;

    int hstart = ph * stride_h - pad_h;
    int wstart = pw * stride_w - pad_w;
    const int hend = math_min(hstart + kernel_h, height);
    const int wend = math_min(wstart + kernel_w, width);
    hstart = math_max(hstart, 0);
    wstart = math_max(wstart, 0);

    const float *const bottom_slice =
        bottom_data + (n * channels + c) * height * width;
    float maxval = -FLT_MAX;
    int maxidx = -1;

    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        const int pos = h * width + w;

        if (bottom_slice[pos] > maxval) {
          maxidx = pos;
          maxval = bottom_slice[pos];
        }
      }
    }
    top_data[index] = maxval;
    mask_data[index] = maxidx;
  }
}

/*
 * Average pooling over float data indexed by flat output position.
 *
 * Same index decomposition as native_pooling_forward_max().  The divisor is
 * the window area clipped to the padded extent (height + pad_h by
 * width + pad_w), while only cells inside the unpadded input are summed, so
 * padding cells count as zeros.  num is unused.
 */
void native_pooling_forward_ave(const float *bottom_data, float *top_data,
                                const int count, const int num,
                                const int channels, const int height,
                                const int width, const int pooled_height,
                                const int pooled_width, const int kernel_h,
                                const int kernel_w, const int stride_h,
                                const int stride_w, const int pad_h,
                                const int pad_w)
{
  (void)num;

  for (int index = 0; index < count; ++index) {
    const int pw = index % pooled_width;
    const int ph = (index / pooled_width) % pooled_height;
    const int c = (index / pooled_width / pooled_height) % channels;
    const int n = index / pooled_width / pooled_height / channels;

    int hstart = ph * stride_h - pad_h;
    int wstart = pw * stride_w - pad_w;
    int hend = math_min(hstart + kernel_h, height + pad_h);
    int wend = math_min(wstart + kernel_w, width + pad_w);
    /* Taken before clipping to the real input, so padding is included. */
    const int pool_size = (hend - hstart) * (wend - wstart);

    hstart = math_max(hstart, 0);
    wstart = math_max(wstart, 0);
    hend = math_min(hend, height);
    wend = math_min(wend, width);

    const float *const bottom_slice =
        bottom_data + (n * channels + c) * height * width;
    float aveval = 0;

    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w)
        aveval += bottom_slice[h * width + w];
    }
    top_data[index] = aveval / pool_size;
  }
}

/*
 * Shared core of satu_2_8bit() / satu_2_16bit(): optionally shifts value
 * right by rshiftbits (> 0), rounding to nearest when round_floor == 1 and
 * truncating otherwise, then clamps the result to [lo, hi].
 */
static int satu_shift_and_clamp(int value, int rshiftbits, int round_floor,
                                int lo, int hi)
{
  if (rshiftbits > 0) {
    if (round_floor == 1)
      value = ((value >> (rshiftbits - 1)) + 1) >> 1;
    else
      value = value >> rshiftbits;
  }

  if (value > hi)
    return hi;
  if (value < lo)
    return lo;
  return value;
}

/*
 * Shifts, rounds and saturates len ints to 8 bits.
 *
 * sign_unsign != 0 clamps to [-128, 127], otherwise to [0, 255]; in both
 * cases the low 8 bits of the clamped value are stored to pByteOut.
 *
 * Returns BM_SUCCESS, or BM_ERR_INVALID_ARGUMENT when rshiftbits < 0.
 */
int satu_2_8bit(const int *pBuff, int len, int8_t *pByteOut, int rshiftbits,
                int round_floor, int sign_unsign)
{
  if (rshiftbits < 0)
    return BM_ERR_INVALID_ARGUMENT;

  const int satu_max = sign_unsign ? 127 : 255;
  const int satu_min = sign_unsign ? -128 : 0;

  for (int ii = 0; ii < len; ii++) {
    /*
     * Narrow explicitly, then copy the byte.  The original copied the first
     * byte of the int, which is the low byte only on little-endian hosts.
     */
    const uint8_t low = (uint8_t)satu_shift_and_clamp(
        pBuff[ii], rshiftbits, round_floor, satu_min, satu_max);
    memcpy(pByteOut + ii, &low, sizeof low);
  }
  return BM_SUCCESS;
}

/*
 * Shifts, rounds and saturates len ints to 16 bits.
 *
 * sign_unsign != 0 clamps to [-32768, 32767], otherwise to [0, 65535]; in
 * both cases the low 16 bits of the clamped value are stored to pByteOut.
 *
 * Returns BM_SUCCESS, or BM_ERR_INVALID_ARGUMENT when rshiftbits < 0.
 */
int satu_2_16bit(const int *pBuff, int len, short *pByteOut, int rshiftbits,
                 int round_floor, int sign_unsign)
{
  if (rshiftbits < 0)
    return BM_ERR_INVALID_ARGUMENT;

  const int satu_max = sign_unsign ? 32767 : 65535;
  const int satu_min = sign_unsign ? -32768 : 0;

  for (int ii = 0; ii < len; ii++) {
    /* Same endianness fix as in satu_2_8bit(). */
    const uint16_t low = (uint16_t)satu_shift_and_clamp(
        pBuff[ii], rshiftbits, round_floor, satu_min, satu_max);
    memcpy(pByteOut + ii, &low, sizeof low);
  }
  return BM_SUCCESS;
}