Files
SDK_SG200x_V2/cvimath/tests/common/test_native_ref.c
carbon 83dc4914fe add cvimath
commit ce8705f49da5e5f59c2ddb3253ef88323a0cd9c4
Author: sophgo-forum-service <forum_service@sophgo.com>
Date:   Mon May 13 14:04:10 2024 +0800

    [feat] cvimath opensource for cv18xx soc.

    - 9e8967
2024-05-31 11:54:07 +08:00

981 lines
34 KiB
C

#include <assert.h>
#include <errno.h>
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <test_native_ref.h>
// Smaller of x and y. Function-like macro: the selected argument is evaluated
// twice, so do not pass expressions with side effects.
#define math_min(x, y) ((x) < (y) ? (x) : (y))
// Larger of x and y. Same double-evaluation caveat as math_min.
#define math_max(x, y) ((x) > (y) ? (x) : (y))
// The next seven typedefs re-declare a fixed-width integer name as itself, so
// they introduce no new type. They need those names to be declared already;
// this file does not include <stdint.h> directly.
// NOTE(review): presumably <stdint.h> arrives through test_native_ref.h - confirm.
typedef uint8_t uint8_t;
typedef uint16_t uint16_t;
typedef uint32_t uint32_t;
typedef uint64_t uint64_t;
typedef int8_t int8_t;
typedef int16_t int16_t;
typedef int32_t int32_t;
// Alias for the 64-bit signed integer type.
typedef int64_t s64;
// NOTE(review): presumably the carrier type for the BM_* status codes; the
// functions visible in this file return plain int instead - confirm usage.
typedef uint32_t bmerr_t;
// Status codes. The functions in this file return them as plain int.
#define BM_SUCCESS 0 // The operation was successful
#define BM_ERR_AGAIN 1 // Not ready yet
#define BM_ERR_FAILURE 2 // General failure
#define BM_ERR_TIMEOUT 3 // Timeout
#define BM_ERR_UNINITIALIZED 4 // Uninitialized
#define BM_ERR_INVALID_ARGUMENT 5 // Arguments invalid
#define BM_ERR_NOMEM 6 // Not enough memory
#define BM_ERR_DATA 7 // Data error
#define BM_ERR_BUSY 8 // Busy
#define BM_ERR_NOT_SUPPORTED 9 // Not supported yet
// Element-wise operation selectors; native_md_scalar and native_md_scalar_int8
// switch on BLOB_ADD, BLOB_SUB, BLOB_MUL and BLOB_DIV through their `op` argument.
typedef uint32_t BLOB_OP;
#define BLOB_ADD 0
#define BLOB_SUB 1
#define BLOB_MUL 2
#define BLOB_DIV 3
#define BLOB_INVALID 4
// Flatten a 4-element index into a linear element offset for a row-major
// array with the given 4-element shape. shape[0] is never read.
static inline int calc_offset(int *shape, int *offset) {
  int linear = offset[0];
  for (int dim = 1; dim < 4; dim++) {
    // Horner-style accumulation: scale by this dimension's extent, then add its index.
    linear = linear * shape[dim] + offset[dim];
  }
  return linear;
}
// Linear index of element (h, w2) in a row-major 2-D array whose rows hold w1 elements.
static int index_get(int h, int w1, int w2) {
  const int row_base = h * w1;
  return row_base + w2;
}
// Compare two float arrays with a mixed relative/absolute tolerance.
//
// For each element:
//   - if the larger magnitude exceeds 1.0, the pair fails when the smaller
//     magnitude is below 1e-20 or when |exp - got| > delta * smaller magnitude;
//   - otherwise the pair fails when |exp - got| > delta.
//
// info   label printed in front of every diagnostic
// p_exp  expected values, count elements
// p_got  actual values, count elements
// delta  tolerance
//
// Returns 0 when every element passes, -1 at the first tolerance failure, and
// -2 at the first element where p_got is NaN while p_exp is not.
//
// The former "both exp and got are NAN" branches were removed: they sat inside
// the mismatch branches, which require a comparison against the values to be
// true, and every ordered comparison involving NaN is false, so they could
// never execute.
// NOTE(review): an expected NaN paired with a non-NaN result is not reported
// by any of the checks below - confirm that this is intended.
int array_cmp_float_rel(const char *const info, float *p_exp, float *p_got, int count,
                        float delta) {
  for (int idx = 0; idx < count; idx++) {
    const float exp_val = p_exp[idx];
    const float got_val = p_got[idx];

    if (math_max(fabs(exp_val), fabs(got_val)) > 1.0) {
      // Large magnitude: relative comparison against the smaller magnitude.
      if (math_min(fabs(exp_val), fabs(got_val)) < 1e-20 ||
          fabs(exp_val - got_val) > delta * math_min(fabs(exp_val), fabs(got_val))) {
        printf("%s rel error at index %d exp %.20f got %.20f\n", info, idx, exp_val, got_val);
        return -1;
      }
    } else if (fabs(exp_val - got_val) > delta) {
      // Small magnitude: absolute comparison.
      printf("%s abs error at index %d exp %.20f got %.20f\n", info, idx, exp_val, got_val);
      return -1;
    }

    // A NaN result slips through the comparisons above, so catch it explicitly.
    if (isnan(got_val) && !isnan(exp_val)) {
      printf("%s, found nans idx %d\n", info, idx);
      printf("floating from exp %.10f got %.10f\n", exp_val, got_val);
      IF_VAL exp_bits, got_bits;
      exp_bits.fval = exp_val;
      got_bits.fval = got_val;
      printf("hex form exp %8.8x got %8.8x\n", exp_bits.ival, got_bits.ival);
      return -2;
    }
  }
  return 0;
}
// Compare two float arrays.
//
// With delta == 0 the comparison is exact, except that a pair in which both
// values are NaN is treated as equal. With any other delta the work is
// delegated to array_cmp_float_rel.
//
// Returns 0 when the arrays match, otherwise the failure code of the first
// mismatch (-1 for the exact comparison).
//
// Fix: a NaN/NaN pair used to return 0 immediately, so every element after it
// went unchecked. The loop now continues past such a pair instead.
int array_cmp_float(const char *const info, float *p_exp, float *p_got, int count, float delta) {
  if (delta != 0.0f) {
    return array_cmp_float_rel(info, p_exp, p_got, count, delta);
  }

  for (int idx = 0; idx < count; idx++) {
    if (p_exp[idx] == p_got[idx]) {
      continue;
    }
    if (isnan(p_exp[idx]) && isnan(p_got[idx])) {
      // NaN never compares equal to itself; accept the pair and keep checking.
      continue;
    }
    printf("%s error at index %d exp %.20f got %.20f\n", info, idx, p_exp[idx], p_got[idx]);
    return -1;
  }
  return 0;
}
// Compare two int arrays element by element.
// Prints the first mismatch (prefixed with info) and returns -1, or returns 0
// when the first count elements are all equal.
int array_cmp_int(const char *const info, int *p_exp, int *p_got, int count) {
  int idx = 0;
  // Advance past the common prefix.
  while (idx < count && p_exp[idx] == p_got[idx]) {
    idx++;
  }
  if (idx >= count) {
    return 0;
  }
  printf("%s error at index %d exp %d got %d\n", info, idx, p_exp[idx], p_got[idx]);
  return -1;
}
// Compare two int8 arrays element by element.
// Prints the first mismatch (prefixed with info) and returns -1, or returns 0
// when the first count elements are all equal.
int array_cmp_int8(const char *const info, const int8_t *p_exp, const int8_t *p_got, int count) {
  for (int idx = 0; idx < count; idx++) {
    const int expected = p_exp[idx];
    const int actual = p_got[idx];
    if (expected == actual) {
      continue;
    }
    printf("%s error at index %d exp %d got %d\n", info, idx, expected, actual);
    return -1;
  }
  return 0;
}
// Extent of one spatial dimension after insertion and padding:
//   (h - 1) * (ins_h + 1) + ins_h_l + 1 + pad_h_t + pad_h_b
// The two pad arguments are simply summed, so their order does not matter.
int calc_dilute_hw(int h, int ins_h, int ins_h_l, int pad_h_b, int pad_h_t) {
  const int interior = (h - 1) * (ins_h + 1) + ins_h_l + 1;
  return interior + pad_h_t + pad_h_b;
}
// Number of window positions along one dimension: (hw - khw) / stride + 1,
// using integer division.
int calc_output_hw(int hw, int khw, int stride) {
  const int span = hw - khw;
  return span / stride + 1;
}
// Build a padded and diluted copy of one int8 plane.
//
// The destination is w_after x h_after elements, with
//   w_after = (w_before - 1) * (ins_w + 1) + ins_w_last + 1 + pad_l + pad_r
//   h_after = (h_before - 1) * (ins_h + 1) + ins_h_last + 1 + pad_t + pad_b
// Every destination byte is first set to val; source element (h, w) is then
// written at row h * (ins_h + 1) + pad_t, column w * (ins_w + 1) + pad_l.
//
// before  source plane, h_before x w_before elements
// pafter  in/out: if *pafter is NULL a buffer is malloc'ed and stored there
//         (the caller frees it); otherwise the existing buffer is reused and
//         must already be large enough
//
// Returns BM_SUCCESS, BM_ERR_INVALID_ARGUMENT when before or pafter is NULL,
// or BM_ERR_NOMEM when the allocation fails.
//
// Fix: *pafter used to be read before pafter was checked against NULL.
int fill_pad_fmap_int8(const int8_t *before, int8_t **pafter, int val, int pad_l, int pad_r,
                       int pad_t, int pad_b, int ins_h, int ins_w, int ins_h_last, int ins_w_last,
                       int h_before, int w_before) {
  if (!before || !pafter) return BM_ERR_INVALID_ARGUMENT;

  int w_after = (w_before - 1) * (ins_w + 1) + ins_w_last + 1 + pad_l + pad_r;
  int h_after = (h_before - 1) * (ins_h + 1) + ins_h_last + 1 + pad_t + pad_b;

  int8_t *after = *pafter;
  if (!after) {
    after = malloc(sizeof(int8_t) * w_after * h_after);
    if (!after) return BM_ERR_NOMEM;
  }

  // Background fill: padding and inserted positions keep this value.
  memset(after, val, w_after * h_after);

  for (int h = 0; h < h_before; h++) {
    for (int w = 0; w < w_before; w++) {
      int i = (h * (ins_h + 1) + pad_t) * w_after + w * (ins_w + 1) + pad_l;
      after[i] = before[h * w_before + w];
    }
  }

  *pafter = after;
  return BM_SUCCESS;
}
// Build a padded and diluted copy of one plane of 16-bit elements.
//
// The destination is w_after x h_after elements, with
//   w_after = (w_before - 1) * (ins_w + 1) + ins_w_last + 1 + pad_l + pad_r
//   h_after = (h_before - 1) * (ins_h + 1) + ins_h_last + 1 + pad_t + pad_b
// Every destination element is first set to val (converted to uint16_t);
// source element (h, w) is then written at row h * (ins_h + 1) + pad_t,
// column w * (ins_w + 1) + pad_l.
//
// before  source plane, h_before x w_before elements
// pafter  in/out: if *pafter is NULL a buffer is malloc'ed and stored there
//         (the caller frees it); otherwise the existing buffer is reused and
//         must already be large enough
//
// Returns BM_SUCCESS, BM_ERR_INVALID_ARGUMENT when before or pafter is NULL,
// or BM_ERR_NOMEM when the allocation fails.
//
// Fix: *pafter used to be read before pafter was checked against NULL. The
// disabled (#if 0) debug dump was removed as well.
int fill_pad_fmap_bf16(const uint16_t *before, uint16_t **pafter, int val, int pad_l, int pad_r,
                       int pad_t, int pad_b, int ins_h, int ins_w, int ins_h_last, int ins_w_last,
                       int h_before, int w_before) {
  if (!before || !pafter) return BM_ERR_INVALID_ARGUMENT;

  int w_after = (w_before - 1) * (ins_w + 1) + ins_w_last + 1 + pad_l + pad_r;
  int h_after = (h_before - 1) * (ins_h + 1) + ins_h_last + 1 + pad_t + pad_b;

  uint16_t *after = *pafter;
  if (!after) {
    after = malloc(sizeof(uint16_t) * w_after * h_after);
    if (!after) return BM_ERR_NOMEM;
  }

  // Background fill: padding and inserted positions keep this value.
  for (int i = 0; i < w_after * h_after; i++) after[i] = val;

  for (int h = 0; h < h_before; h++) {
    for (int w = 0; w < w_before; w++) {
      int i = (h * (ins_h + 1) + pad_t) * w_after + w * (ins_w + 1) + pad_l;
      after[i] = before[h * w_before + w];
    }
  }

  *pafter = after;
  return BM_SUCCESS;
}
// Widen len signed 8-bit values from psrc into the int array pdest.
void fill_int_with_int8(int *pdest, int8_t *psrc, int len) {
  int *out = pdest;
  const int8_t *in = psrc;
  for (int remaining = len; remaining > 0; remaining--) {
    *out++ = (int)*in++;
  }
}
// Widen len unsigned 8-bit values from psrc into the int array pdest.
void fill_int_with_uint8(int *pdest, uint8_t *psrc, int len) {
  int pos = 0;
  while (pos < len) {
    pdest[pos] = psrc[pos];
    pos++;
  }
}
// Widen len signed 16-bit values from psrc into the int array pdest.
void fill_int_with_int16(int *pdest, int16_t *psrc, int len) {
  for (int pos = 0; pos < len; ++pos) {
    const int16_t value = psrc[pos];
    pdest[pos] = value;
  }
}
// Integer dot product of a and b over len elements, stored in *c.
// *c is reset to 0 first and accumulated in place.
void inner_product(const int *a, const int *b, int len, int *c) {
  *c = 0;
  int pos = 0;
  while (pos < len) {
    *c += a[pos] * b[pos];
    pos++;
  }
}
// Float dot product of a and b over len elements, stored in *c.
// *c is reset to 0 first and accumulated in place, in index order.
void inner_float_product(const float *a, const float *b, int len, float *c) {
  *c = 0;
  int pos = 0;
  while (pos < len) {
    *c += (a[pos] * b[pos]);
    pos++;
  }
}
// Build a padded and diluted copy of one fp32 plane.
//
// The destination extent comes from calc_dilute_hw for each dimension. The
// whole destination is first filled with pad_value. For each source row, the
// interior columns (everything between the left and right padding) of that row
// and of the rows inserted after it are cleared to zero bytes, then the source
// values are written at a column stride of ins_w + 1.
//
// before  source plane, h x w elements
// after   in/out: if *after is NULL a buffer is malloc'ed and stored there;
//         otherwise the existing buffer is reused
//
// Returns BM_SUCCESS, BM_ERR_INVALID_ARGUMENT when before or after is NULL,
// or BM_ERR_NOMEM when the allocation fails.
int fill_pad_fmap_fp32(const float *before, float **after, float pad_value, int pad_h_t,
                       int pad_h_b, int pad_w_l, int pad_w_r, int ins_h, int ins_w, int ins_h_l,
                       int ins_w_l, int h, int w) {
  if (before == NULL || after == NULL) {
    return BM_ERR_INVALID_ARGUMENT;
  }

  const int out_rows = calc_dilute_hw(h, ins_h, ins_h_l, pad_h_b, pad_h_t);
  const int out_cols = calc_dilute_hw(w, ins_w, ins_w_l, pad_w_l, pad_w_r);

  if (*after == NULL) {
    *after = malloc(sizeof(float) * out_rows * out_cols);
    if (*after == NULL) {
      printf("No enough memory: [h_after, w_after]=[%i, %i].\n", out_rows, out_cols);
      return BM_ERR_NOMEM;
    }
  }
  float *dst = *after;

  // Background fill with the padding value.
  for (int k = 0; k < out_rows * out_cols; k++) {
    dst[k] = pad_value;
  }

  const size_t interior_bytes = sizeof(float) * (out_cols - pad_w_l - pad_w_r);
  for (int row = 0; row < h; row++) {
    float *row_start = dst + (pad_h_t + row * (ins_h + 1)) * out_cols + pad_w_l;

    // The last source row is followed by ins_h_l inserted rows, the others by ins_h.
    const int inserted = (row == h - 1) ? ins_h_l : ins_h;
    for (int r = 0; r < inserted + 1; r++) {
      memset(row_start + r * out_cols, 0, interior_bytes);
    }

    const float *src_row = before + row * w;
    for (int col = 0; col < w; col++) {
      row_start[col * (ins_w + 1)] = src_row[col];
    }
  }
  return BM_SUCCESS;
}
// Element-wise fp32 arithmetic over N * C * H * W elements.
//
// op selects the operation:
//   BLOB_ADD  r = a + b
//   BLOB_SUB  r = a - b
//   BLOB_MUL  r = (result_add ? r : 0) + a * b
//   BLOB_DIV  r = a / b
// Any other op trips assert(0) on the first element processed.
void native_md_scalar(float *a, float *b, float *r, int N, int C, int H, int W, int op,
                      bool result_add) {
  const int total = N * C * H * W;
  for (int i = 0; i < total; i++) {
    if (op == BLOB_ADD) {
      r[i] = a[i] + b[i];
    } else if (op == BLOB_SUB) {
      r[i] = a[i] - b[i];
    } else if (op == BLOB_MUL) {
      // Without result_add the previous content of r is discarded first.
      if (!result_add) {
        r[i] = 0;
      }
      r[i] += a[i] * b[i];
    } else if (op == BLOB_DIV) {
      r[i] = a[i] / b[i];
    } else {
      assert(0);
    }
  }
}
// Element-wise int8 arithmetic over N * C * H * W elements.
//
// op selects the operation:
//   BLOB_ADD  r = a + b
//   BLOB_SUB  r = a - b
//   BLOB_MUL  r = (result_add ? r : 0) + a * b
//   BLOB_DIV  r = a / b
// Each result is computed in int and then stored back into an int8_t without
// saturation. Any other op trips assert(0) on the first element processed.
void native_md_scalar_int8(int8_t *a, int8_t *b, int8_t *r, int N, int C, int H, int W, int op,
                           bool result_add) {
  const int total = N * C * H * W;
  for (int i = 0; i < total; i++) {
    if (op == BLOB_ADD) {
      r[i] = a[i] + b[i];
    } else if (op == BLOB_SUB) {
      r[i] = a[i] - b[i];
    } else if (op == BLOB_MUL) {
      // Without result_add the previous content of r is discarded first.
      if (!result_add) {
        r[i] = 0;
      }
      r[i] += a[i] * b[i];
    } else if (op == BLOB_DIV) {
      r[i] = a[i] / b[i];
    } else {
      assert(0);
    }
  }
}
// Sum of element-wise products of two dim_n x dim_m int8 matrices.
// When opd0_sign is zero the elements of A are reinterpreted as unsigned
// 8-bit values; B is always read as signed.
static int matrix_dot_mult(int8_t *A, int8_t *B, int dim_n, int dim_m, int opd0_sign) {
  int acc = 0;
  for (int row = 0; row < dim_n; row++) {
    for (int col = 0; col < dim_m; col++) {
      const int at = index_get(row, dim_m, col);
      const int lhs = opd0_sign ? (int)A[at] : (int)((uint8_t)A[at]);
      acc += lhs * B[at];
    }
  }
  return acc;
}
int native_conv_int8(const int8_t *ifmap, const int8_t *weight, const int16_t *bias, int8_t *ofmap,
int in, int ic, int ih, int iw, int oc, int kh, int kw, int dh, int dw,
int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, int stride_h, int stride_w,
int ins_h, int ins_w, int ins_h_last, int ins_w_last, int input_sign,
int r_shift_width, int do_relu) {
int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_h_t, pad_h_b);
int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_w_l, pad_w_r);
int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0);
int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0);
int oh = calc_output_hw(ih_ext, kh_ext, stride_h);
int ow = calc_output_hw(iw_ext, kw_ext, stride_w);
int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow);
memset(result, 0, sizeof(int) * in * oc * oh * ow);
int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext);
int ret = BM_SUCCESS;
int8_t *i_fmap_pad = NULL;
int8_t *kernel_after = NULL;
for (int n = 0; n < in; ++n) {
for (int c = 0; c < oc; ++c) {
for (int cc = 0; cc < ic; ++cc) {
fill_pad_fmap_int8((int8_t *)ifmap + n * ic * ih * iw + cc * ih * iw, &i_fmap_pad, 0,
pad_w_l, pad_w_r, pad_h_t, pad_h_b, ins_h, ins_w, ins_h_last, ins_w_last,
ih, iw);
// kernel_dilation(
fill_pad_fmap_int8((weight + c * ic * kh * kw + cc * kh * kw), &kernel_after, 0, 0, 0, 0,
0, // no padding
dh - 1, dw - 1, 0, 0, kh, kw);
for (int ph = 0; ph < oh; ++ph) {
for (int pw = 0; pw < ow; ++pw) {
for (int idxh = 0; idxh < kh_ext; ++idxh)
for (int idxw = 0; idxw < kw_ext; ++idxw) {
i_fmap_pad_ker[idxh * kw_ext + idxw] =
i_fmap_pad[(idxh + ph * stride_h) * iw_ext + idxw + pw * stride_w];
}
result[n * oc * oh * ow + c * oh * ow + ph * ow + pw] +=
matrix_dot_mult(i_fmap_pad_ker, kernel_after, kh_ext, kw_ext, input_sign);
}
}
}
if (bias) {
for (int ph = 0; ph < oh; ++ph) {
for (int pw = 0; pw < ow; ++pw) {
result[n * oc * oh * ow + c * oh * ow + ph * ow + pw] += bias[c]; // bias+c ;
}
}
}
ret = satu_2_8bit(&result[n * oc * oh * ow + c * oh * ow], oh * ow,
&ofmap[n * oc * oh * ow + c * oh * ow], r_shift_width, 1, !do_relu);
if (ret != BM_SUCCESS) goto error_release;
} // end for (int c = 0; c < oc; ++c)
} // end for (int n = 0; n < in; n++)
error_release:
free(i_fmap_pad);
free(kernel_after);
free(i_fmap_pad_ker);
free(result);
return ret;
}
// Reference fp32 depthwise convolution: every channel is convolved with its
// own kh x kw kernel.
//
// For each (n, c) plane the input is padded/diluted, the kernel is dilated by
// (dh, dw), and each output element is the sum of window products plus
// bias[c] (0 when bias is NULL). Products are accumulated in two partial sums
// chosen by the parity of the flattened window position and added at the end.
//
// ifmap   input, walked plane by plane (ih * iw elements each)
// weight  kernels, kh * kw elements per channel
// bias    optional per-channel bias (may be NULL)
// ofmap   output, walked plane by plane (oh * ow elements each)
//
// Returns BM_SUCCESS, BM_ERR_NOMEM when a scratch buffer cannot be allocated,
// or BM_ERR_FAILURE when padding fails.
//
// Fix: the padding-failure path returned without freeing the two scratch
// buffers; every exit now goes through one release point.
int native_depthwise_fp32(const float *ifmap, const float *weight, const float *bias, float *ofmap,
                          int in, int ic, int ih, int iw, int kh, int kw, int dh, int dw,
                          int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, int stride_h,
                          int stride_w, int ins_h, int ins_w, int ins_h_last, int ins_w_last) {
  int h_after = calc_dilute_hw(ih, ins_h, ins_h_last, pad_h_t, pad_h_b);
  int w_after = calc_dilute_hw(iw, ins_w, ins_w_last, pad_w_l, pad_w_r);
  int kh_dilation = (kh - 1) * dh + 1;
  int kw_dilation = (kw - 1) * dw + 1;
  int oh = calc_output_hw(h_after, kh_dilation, stride_h);
  int ow = calc_output_hw(w_after, kw_dilation, stride_w);
  int ret = BM_SUCCESS;

  float *ifmap_after = malloc(sizeof(float) * h_after * w_after);
  float *weight_dilation = malloc(sizeof(float) * kh_dilation * kw_dilation);
  if (ifmap_after == NULL || weight_dilation == NULL) {
    printf("No enough memory.\n");
    ret = BM_ERR_NOMEM;
    goto release;
  }

  for (int n = 0; n < in; n++) {
    for (int c = 0; c < ic; c++, ifmap += ih * iw, ofmap += oh * ow) {
      float init_value = bias ? bias[c] : 0;
      int ret_ifmap = fill_pad_fmap_fp32(ifmap, &ifmap_after, 0, pad_h_t, pad_h_b, pad_w_l, pad_w_r,
                                         ins_h, ins_w, ins_h_last, ins_w_last, ih, iw);
      int ret_weight = fill_pad_fmap_fp32(weight + c * kh * kw, &weight_dilation, 0, 0, 0, 0, 0,
                                          dh - 1, dw - 1, 0, 0, kh, kw);
      if ((ret_ifmap != BM_SUCCESS) || (ret_weight != BM_SUCCESS)) {
        printf("failed to pad ifmap or weight.\n");
        ret = BM_ERR_FAILURE;
        goto release;
      }

      for (int h = 0; h < oh; h++) {
        for (int w = 0; w < ow; w++) {
          int rf_h = h * stride_h, rf_w = w * stride_w;
          // Clip the window to the padded plane.
          int kh_end = math_min(kh_dilation, h_after - rf_h);
          int kw_end = math_min(kw_dilation, w_after - rf_w);
          float *rf_addr = ifmap_after + rf_h * w_after + rf_w;
          float dot_product_even = 0.0, dot_product_odd = 0.0;
          for (int i = 0; i < kh_end; i++) {
            for (int j = 0; j < kw_end; j++) {
              if ((i * kw_end + j) % 2) {
                dot_product_odd += rf_addr[i * w_after + j] * weight_dilation[i * kw_dilation + j];
              } else {
                dot_product_even += rf_addr[i * w_after + j] * weight_dilation[i * kw_dilation + j];
              }
            }
          }
          ofmap[h * ow + w] = dot_product_even + dot_product_odd + init_value;
        }
      }
    }
  }

release:
  free(ifmap_after);
  free(weight_dilation);
  return ret;
}
// Direct (nested-loop) fp32 convolution reference.
//
// ifmap, ofmap, weight and bias are cast to float * and indexed through
// calc_offset with the shapes
//   input   [input_n][input_c][input_h][input_w]
//   output  [input_n][output_c][output_h][output_w]
//   weight  [output_c][input_c / groups][kh][kw]
// Channels are split into `groups` groups; pad_h / pad_w are applied on both
// sides of the respective dimension; `flip` mirrors the kernel in both
// directions; `using_bias` adds bias[output channel]; `result_add` adds the
// previous content of ofmap to the new result instead of replacing it.
// output_h / output_w must equal the sizes derived from the other arguments
// (checked with assert).
void native_conv_ref(const void *ifmap, void *ofmap, const void *weight, int input_n, int input_c,
int input_h, int input_w, int output_c, int output_h, int output_w, int groups,
int kh, int kw, int dilation_h, int dilation_w, int pad_h, int pad_w,
int stride_h, int stride_w, int flip, int using_bias, const void *bias,
int result_add) {
// Kernel extent after dilation.
int kh_extent = dilation_h * (kh - 1) + 1;
int kw_extent = dilation_w * (kw - 1) + 1;
int output_h_expect = (input_h + 2 * pad_h - kh_extent) / stride_h + 1;
int output_w_expect = (input_w + 2 * pad_w - kw_extent) / stride_w + 1;
// The casts keep the variables "used" when assert() compiles to nothing.
(void)output_h_expect;
(void)output_w_expect;
assert(output_h == output_h_expect && "Expect same output_h");
assert(output_w == output_w_expect && "Expect same output_w");
// Without accumulation the output starts from zero.
if (!result_add) {
memset(ofmap, 0, input_n * output_c * output_h * output_w * sizeof(float));
}
float *ifmap_f = (float *)ifmap;
float *ofmap_f = (float *)ofmap;
float *weight_f = (float *)weight;
float *bias_f = (float *)bias;
int i_shape[4];
i_shape[0] = input_n;
i_shape[1] = input_c;
i_shape[2] = input_h;
i_shape[3] = input_w;
int o_shape[4];
o_shape[0] = input_n;
o_shape[1] = output_c;
o_shape[2] = output_h;
o_shape[3] = output_w;
int k_shape[4];
k_shape[0] = output_c;
k_shape[1] = input_c / groups;
k_shape[2] = kh;
k_shape[3] = kw;
// Output and input channels per group.
int o_g = output_c / groups;
int k_g = input_c / groups;
// First output / input channel of the current group.
int o_head, k_head;
int weight_offset[4];
int in_offset[4];
int out_offset[4];
for (int n = 0; n < input_n; n++) {
for (int g = 0; g < groups; g++) {
o_head = o_g * g;
k_head = k_g * g;
for (int o = 0; o < o_g; o++) {
for (int y = 0; y < output_h; y++) {
for (int x = 0; x < output_w; x++) {
out_offset[0] = n;
out_offset[1] = o + o_head;
out_offset[2] = y;
out_offset[3] = x;
// Remember the previous output value, then accumulate from zero.
float result_init = ofmap_f[calc_offset(o_shape, out_offset)];
ofmap_f[calc_offset(o_shape, out_offset)] = 0.0f;
for (int k = 0; k < k_g; k++) {
for (int p = 0; p < kh; p++) {
for (int q = 0; q < kw; q++) {
int in_y = y * stride_h - pad_h + p * dilation_h;
int in_x = x * stride_w - pad_w + q * dilation_w;
// Taps that fall into the padding contribute nothing.
if (in_y >= 0 && in_y < input_h && in_x >= 0 && in_x < input_w) {
weight_offset[0] = o + o_head;
weight_offset[1] = k;
if (flip) {
weight_offset[2] = (kh - 1 - p);
weight_offset[3] = (kw - 1 - q);
} else {
weight_offset[2] = p;
weight_offset[3] = q;
}
in_offset[0] = n;
in_offset[1] = k + k_head;
in_offset[2] = in_y;
in_offset[3] = in_x;
ofmap_f[calc_offset(o_shape, out_offset)] +=
ifmap_f[calc_offset(i_shape, in_offset)] *
weight_f[calc_offset(k_shape, weight_offset)];
// NOTE(review): with a single tap the product is stored directly instead
// of being added to the zeroed accumulator; this appears to differ only
// in the sign of a zero result - confirm the intent.
if (k_g == 1 && kh == 1 && kw == 1) {
ofmap_f[calc_offset(o_shape, out_offset)] =
ifmap_f[calc_offset(i_shape, in_offset)] *
weight_f[calc_offset(k_shape, weight_offset)];
}
}
}
}
}
if (using_bias) {
ofmap_f[calc_offset(o_shape, out_offset)] += bias_f[o + o_head];
}
// Accumulation mode: add back the value that was in ofmap before.
if (result_add) {
ofmap_f[calc_offset(o_shape, out_offset)] += result_init;
}
}
}
}
}
}
}
int native_fc_int8(const int8_t *L, const int8_t *R, const int16_t *B, const int16_t *Y, int *Y_ref,
int L_row_num, int L_col_num, int R_col_num, int L_sign, int R_sign, int B_sign,
int l_shift_width, int r_shift_width, int is_result_int8, int do_relu) {
const uint8_t *uL = (const uint8_t *)L;
const uint8_t *uR = (const uint8_t *)R;
const uint16_t *uB = (const uint16_t *)B;
int opd0, opd1, opd2;
int ret = BM_SUCCESS;
for (int hidx = 0; hidx < L_row_num; hidx++) {
for (int widx = 0; widx < R_col_num; widx++) {
int Y1 = 0;
int Y2 = 0;
int sum_idx = 0;
for (sum_idx = 0; sum_idx < L_col_num; sum_idx++) {
int idx_L = index_get(hidx, L_col_num, sum_idx);
int idx_R = index_get(sum_idx, R_col_num, widx);
opd0 = (L_sign) ? L[idx_L] : uL[idx_L];
opd1 = (R_sign) ? R[idx_R] : uR[idx_R];
if ((sum_idx % 2 == 0 && sum_idx != 2) || sum_idx == 1) {
Y1 += opd0 * opd1;
} else {
Y2 += opd0 * opd1;
}
}
sum_idx++;
if (B) {
opd2 = (B_sign) ? (int)B[widx] : (int)uB[widx];
if ((sum_idx % 2 == 0 && sum_idx != 2) || sum_idx == 1) {
Y1 += opd2;
} else {
Y2 += opd2;
}
sum_idx++;
}
int idx_Y = index_get(hidx, R_col_num, widx);
if (Y) {
if ((sum_idx % 2 == 0 && sum_idx != 2) || sum_idx == 1) {
Y1 += (Y[idx_Y] << l_shift_width);
} else {
Y2 += (Y[idx_Y] << l_shift_width);
}
}
Y_ref[idx_Y] = Y1 + Y2;
}
}
uint8_t *Yout_int8 = malloc(sizeof(int8_t) * L_row_num * R_col_num);
uint16_t *Yout_int16 = malloc(sizeof(int16_t) * L_row_num * R_col_num);
if (is_result_int8) {
ret =
satu_2_8bit(Y_ref, L_row_num * R_col_num, (int8_t *)Yout_int8, r_shift_width, 1, !do_relu);
if (ret != BM_SUCCESS) goto error_release;
fill_int_with_int8(Y_ref, (int8_t *)Yout_int8, L_row_num * R_col_num);
} else {
ret = satu_2_16bit(Y_ref, L_row_num * R_col_num, (int16_t *)Yout_int16, r_shift_width, 1,
!do_relu);
if (ret != BM_SUCCESS) goto error_release;
fill_int_with_int16(Y_ref, (int16_t *)Yout_int16, L_row_num * R_col_num);
}
error_release:
free(Yout_int8);
free(Yout_int16);
return ret;
}
int native_pooling_ave_int8(const int8_t *i_fmap, const void *weight, const int16_t *bias,
int8_t *o_fmap, int input_n, int input_c, int input_h, int input_w,
int kh, int kw, int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r,
int stride_h, int stride_w, int ins_h, int ins_w, int ins_h_last,
int ins_w_last, int input_sign, int satu_sign, int r_shift_width,
int const_weight) {
if (kh * kw <= 0) return BM_ERR_INVALID_ARGUMENT;
int *avg_pooling_mac_a = (int *)malloc(kh * kw * sizeof(int));
int *avg_pooling_mac_b = (int *)malloc(kh * kw * sizeof(int));
uint8_t avg_const_weight = *(uint8_t *)weight;
const int8_t *weight_arr = weight;
int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b);
int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r);
int output_h = calc_output_hw(h_after, kh, stride_h);
int output_w = calc_output_hw(w_after, kw, stride_w);
int8_t *i_fmap_pad = NULL;
for (int n = 0; n < input_n; n++) {
if (const_weight == 0) weight_arr = weight;
for (int c = 0; c < input_c; ++c) {
fill_pad_fmap_int8(i_fmap, &i_fmap_pad, 0, pad_w_l, pad_w_r, pad_h_t, pad_h_b, ins_h, ins_w,
ins_h_last, ins_w_last, input_h, input_w);
for (int ph = 0; ph < output_h; ++ph) {
for (int pw = 0; pw < output_w; ++pw) {
int hstart = ph * stride_h;
int wstart = pw * stride_w;
int pool_index = index_get(ph, output_w, pw);
int mac_index = 0;
int avg_pool_result;
for (int h = 0; h < kh; h++) {
for (int w = 0; w < kw; w++) {
int index = index_get((hstart + h), w_after, (w + wstart));
mac_index = index_get(h, kw, w);
avg_pooling_mac_a[mac_index] =
input_sign ? i_fmap_pad[index] : (uint8_t)(i_fmap_pad[index]);
avg_pooling_mac_b[mac_index] =
const_weight ? avg_const_weight : weight_arr[mac_index];
}
}
inner_product(avg_pooling_mac_a, avg_pooling_mac_b, kh * kw, &avg_pool_result);
if (bias) {
avg_pool_result += bias[c];
}
int ret = satu_2_8bit(&avg_pool_result, sizeof(int8_t), o_fmap + pool_index,
r_shift_width, 1, satu_sign);
if (ret != BM_SUCCESS) {
free(i_fmap_pad);
free(avg_pooling_mac_a);
free(avg_pooling_mac_b);
return BM_ERR_INVALID_ARGUMENT;
}
}
}
i_fmap += input_w * input_h;
if (const_weight == 0) weight_arr += kh * kw;
o_fmap += output_w * output_h;
}
}
free(i_fmap_pad);
free(avg_pooling_mac_a);
free(avg_pooling_mac_b);
return BM_SUCCESS;
}
// Reference int8 max pooling.
//
// Each of the input_n * input_c planes is padded with the smallest value of
// the selected range (-128 when input_sign is nonzero, 0 otherwise) and every
// output element is the maximum over its kh x kw window.
//
// i_fmap  input, walked plane by plane (input_h * input_w elements each)
// o_fmap  output, walked plane by plane (output_h * output_w elements each)
// input_sign  nonzero: bytes are compared as signed; zero: as unsigned
//
// Insertion is not supported: any nonzero ins_* argument yields
// BM_ERR_INVALID_ARGUMENT. Otherwise returns BM_SUCCESS or the error code
// from fill_pad_fmap_int8.
//
// Fix: the return code of fill_pad_fmap_int8 was ignored, so a failed
// allocation led to a NULL dereference; it is now checked.
int native_pooling_max_int8(const int8_t *i_fmap, int8_t *o_fmap, int input_n, int input_c,
                            int input_h, int input_w, int kh, int kw, int pad_h_t, int pad_h_b,
                            int pad_w_l, int pad_w_r, int stride_h, int stride_w, int ins_h,
                            int ins_w, int ins_h_last, int ins_w_last, int input_sign) {
  if (ins_h != 0 || ins_w != 0 || ins_h_last != 0 || ins_w_last != 0)
    return BM_ERR_INVALID_ARGUMENT;

  int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b);
  int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r);
  int output_h = calc_output_hw(h_after, kh, stride_h);
  int output_w = calc_output_hw(w_after, kw, stride_w);
  const int max_init = input_sign ? -128 : 0;
  int ret = BM_SUCCESS;
  int8_t *i_fmap_pad = NULL;

  for (int nc = 0; nc < input_n * input_c; nc++) {
    ret = fill_pad_fmap_int8(i_fmap, &i_fmap_pad, max_init, pad_w_l, pad_w_r, pad_h_t, pad_h_b, 0,
                             0, 0, 0, input_h, input_w);
    if (ret != BM_SUCCESS) break;

    for (int ph = 0; ph < output_h; ++ph) {
      for (int pw = 0; pw < output_w; ++pw) {
        int hstart = ph * stride_h;
        int wstart = pw * stride_w;
        int pool_index = index_get(ph, output_w, pw);
        int max = max_init;
        for (int h = 0; h < kh; h++) {
          for (int w = 0; w < kw; w++) {
            // Row length of the padded plane (no insertion, see the guard above).
            int index = index_get((hstart + h), (input_w + pad_w_l + pad_w_r), (w + wstart));
            int val = input_sign ? i_fmap_pad[index] : (uint8_t)i_fmap_pad[index];
            max = (val > max) ? val : max;
          }
        }
        o_fmap[pool_index] = max;
      }
    }
    i_fmap += input_w * input_h;
    o_fmap += output_w * output_h;
  }

  free(i_fmap_pad);
  return ret;
}
// Reference fp32 max pooling.
//
// Each (n, c) plane is padded with -FLT_MAX through fill_pad_fmap_fp32 and
// every output element is the maximum over its window; the window is clipped
// to the padded plane.
//
// ifmap  input, walked plane by plane (input_h * input_w elements each)
// ofmap  output, walked plane by plane (output_h * output_w elements each)
//
// Returns BM_SUCCESS, BM_ERR_NOMEM when the scratch plane cannot be
// allocated, or BM_ERR_FAILURE when padding fails.
//
// Fix: the out-of-memory message printed int arguments with %u; it now uses
// %d, which matches the argument type.
int native_pooling_max_fp32(const float *ifmap, float *ofmap, int input_n, int input_c, int input_h,
                            int input_w, int kh, int kw, int pad_h_t, int pad_h_b, int pad_w_l,
                            int pad_w_r, int stride_h, int stride_w, int ins_h, int ins_w,
                            int ins_h_last, int ins_w_last) {
  int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b);
  int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r);
  int output_h = calc_output_hw(h_after, kh, stride_h);
  int output_w = calc_output_hw(w_after, kw, stride_w);

  float *ifmap_after = malloc(sizeof(float) * h_after * w_after);
  if (ifmap_after == NULL) {
    printf("No enough memory[h_after, w_after]: [%d, %d].\n", h_after, w_after);
    return BM_ERR_NOMEM;
  }

  for (int n = 0; n < input_n; n++) {
    for (int c = 0; c < input_c; c++) {
      int ret = fill_pad_fmap_fp32(ifmap, &ifmap_after, -FLT_MAX, pad_h_t, pad_h_b, pad_w_l,
                                   pad_w_r, ins_h, ins_w, ins_h_last, ins_w_last, input_h, input_w);
      if (ret != BM_SUCCESS) {
        printf("Failed to pad input fmap.\n");
        free(ifmap_after);
        return BM_ERR_FAILURE;
      }

      for (int h = 0; h < output_h; h++) {
        for (int w = 0; w < output_w; w++) {
          int rf_h = h * stride_h, rf_w = w * stride_w;
          // Clip the window to the padded plane.
          int kh_end = math_min(kh, h_after - rf_h);
          int kw_end = math_min(kw, w_after - rf_w);
          float *rf_addr = ifmap_after + rf_h * w_after + rf_w;
          float max_val = -FLT_MAX;
          for (int i = 0; i < kh_end; i++) {
            for (int j = 0; j < kw_end; j++) {
              max_val = math_max(rf_addr[i * w_after + j], max_val);
            }
          }
          ofmap[h * output_w + w] = max_val;
        }
      }
      ifmap += input_h * input_w;
      ofmap += output_h * output_w;
    }
  }

  free(ifmap_after);
  return BM_SUCCESS;
}
// Reference fp32 average pooling, computed as the sum over the window of
// value * avg_pooling_const.
//
// Each (n, c) plane is zero-padded through fill_pad_fmap_fp32; the window is
// clipped to the padded plane. Products are accumulated in two partial sums
// chosen by the parity of the flattened window position and added at the end.
//
// ifmap  input, walked plane by plane (input_h * input_w elements each)
// ofmap  output, walked plane by plane (output_h * output_w elements each)
// avg_pooling_const  factor applied to every window element
//
// Returns BM_SUCCESS, BM_ERR_NOMEM when the scratch plane cannot be
// allocated, or BM_ERR_FAILURE when padding fails.
//
// Fix: the out-of-memory message printed int arguments with %u; it now uses
// %d, which matches the argument type.
int native_pooling_avg_fp32(const float *ifmap, float *ofmap, int input_n, int input_c, int input_h,
                            int input_w, int kh, int kw, int pad_h_t, int pad_h_b, int pad_w_l,
                            int pad_w_r, int stride_h, int stride_w, int ins_h, int ins_w,
                            int ins_h_last, int ins_w_last, float avg_pooling_const) {
  int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b);
  int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r);
  int output_h = calc_output_hw(h_after, kh, stride_h);
  int output_w = calc_output_hw(w_after, kw, stride_w);

  float *ifmap_after = malloc(sizeof(float) * h_after * w_after);
  if (ifmap_after == NULL) {
    printf("No enough memory[h_after, w_after]: [%d, %d].\n", h_after, w_after);
    return BM_ERR_NOMEM;
  }

  for (int n = 0; n < input_n; n++) {
    for (int c = 0; c < input_c; c++) {
      int ret = fill_pad_fmap_fp32(ifmap, &ifmap_after, 0, pad_h_t, pad_h_b, pad_w_l, pad_w_r,
                                   ins_h, ins_w, ins_h_last, ins_w_last, input_h, input_w);
      if (ret != BM_SUCCESS) {
        printf("Failed to pad input fmap.\n");
        free(ifmap_after);
        return BM_ERR_FAILURE;
      }

      for (int h = 0; h < output_h; h++) {
        for (int w = 0; w < output_w; w++) {
          int rf_h = h * stride_h, rf_w = w * stride_w;
          // Clip the window to the padded plane.
          int kh_end = math_min(kh, h_after - rf_h);
          int kw_end = math_min(kw, w_after - rf_w);
          float *rf_addr = ifmap_after + rf_h * w_after + rf_w;
          float dot_product_even = 0.0, dot_product_odd = 0.0;
          for (int i = 0; i < kh_end; i++) {
            for (int j = 0; j < kw_end; j++) {
              if ((i * kw_end + j) % 2) {
                dot_product_odd += rf_addr[i * w_after + j] * avg_pooling_const;
              } else {
                dot_product_even += rf_addr[i * w_after + j] * avg_pooling_const;
              }
            }
          }
          ofmap[h * output_w + w] = dot_product_even + dot_product_odd;
        }
      }
      ifmap += input_h * input_w;
      ofmap += output_h * output_w;
    }
  }

  free(ifmap_after);
  return BM_SUCCESS;
}
// Max pooling over a flat list of `count` output elements.
//
// Each output index is decomposed into (n, c, ph, pw) using pooled_width,
// pooled_height and channels. The window starts at (ph * stride_h - pad_h,
// pw * stride_w - pad_w), spans kernel_h x kernel_w and is clipped to the
// input plane. top_data receives the maximum (-FLT_MAX for an empty window)
// and mask_data the position h * width + w of that maximum within the plane
// (-1 when nothing exceeded -FLT_MAX). num is unused.
void native_pooling_forward_max(const float *bottom_data, float *top_data, int *mask_data,
                                const int count, const int num, const int channels,
                                const int height, const int width, const int pooled_height,
                                const int pooled_width, const int kernel_h, const int kernel_w,
                                const int stride_h, const int stride_w, const int pad_h,
                                const int pad_w) {
  (void)num;
  for (int index = 0; index < count; ++index) {
    // Peel the output coordinates off the flat index, innermost first.
    int rest = index;
    const int pw = rest % pooled_width;
    rest = rest / pooled_width;
    const int ph = rest % pooled_height;
    rest = rest / pooled_height;
    const int c = rest % channels;
    const int n = rest / channels;

    const int top = ph * stride_h - pad_h;
    const int left = pw * stride_w - pad_w;
    const int hend = math_min(top + kernel_h, height);
    const int wend = math_min(left + kernel_w, width);
    const int hstart = math_max(top, 0);
    const int wstart = math_max(left, 0);

    const float *const bottom_slice = bottom_data + (n * channels + c) * height * width;
    float best = -FLT_MAX;
    int best_at = -1;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        const int at = h * width + w;
        if (bottom_slice[at] > best) {
          best_at = at;
          best = bottom_slice[at];
        }
      }
    }
    top_data[index] = best;
    mask_data[index] = best_at;
  }
}
// Average pooling over a flat list of `count` output elements.
//
// Each output index is decomposed into (n, c, ph, pw) using pooled_width,
// pooled_height and channels. The divisor is the window area after clipping
// to the plane extended by pad_h / pad_w, so padding positions count towards
// it; the sum itself only covers positions inside the input plane.
// num is unused.
void native_pooling_forward_ave(const float *bottom_data, float *top_data, const int count,
                                const int num, const int channels, const int height,
                                const int width, const int pooled_height, const int pooled_width,
                                const int kernel_h, const int kernel_w, const int stride_h,
                                const int stride_w, const int pad_h, const int pad_w) {
  (void)num;
  for (int index = 0; index < count; ++index) {
    // Peel the output coordinates off the flat index, innermost first.
    int rest = index;
    const int pw = rest % pooled_width;
    rest = rest / pooled_width;
    const int ph = rest % pooled_height;
    rest = rest / pooled_height;
    const int c = rest % channels;
    const int n = rest / channels;

    // Window in padded coordinates; this fixes the divisor.
    const int top = ph * stride_h - pad_h;
    const int left = pw * stride_w - pad_w;
    const int bottom = math_min(top + kernel_h, height + pad_h);
    const int right = math_min(left + kernel_w, width + pad_w);
    const int pool_size = (bottom - top) * (right - left);

    // Window clipped to the real input plane; this fixes what is summed.
    const int hstart = math_max(top, 0);
    const int wstart = math_max(left, 0);
    const int hend = math_min(bottom, height);
    const int wend = math_min(right, width);

    const float *const bottom_slice = bottom_data + (n * channels + c) * height * width;
    float aveval = 0;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        aveval += bottom_slice[h * width + w];
      }
    }
    top_data[index] = aveval / pool_size;
  }
}
// Right-shift and saturate 32-bit values into 8-bit outputs.
//
// pBuff        input values, len elements
// pByteOut     output bytes, len elements
// rshiftbits   right shift applied first; 0 means no shift
// round_floor  1: round via ((x >> (rshiftbits - 1)) + 1) >> 1;
//              anything else: plain x >> rshiftbits
// sign_unsign  nonzero: clamp to [-128, 127]; zero: clamp to [0, 255]
//
// Returns BM_SUCCESS, or BM_ERR_INVALID_ARGUMENT when rshiftbits is negative.
//
// Fix: the result byte used to be taken with memcpy from the first byte of
// the int, which is the low-order byte only on little-endian machines. The
// value is now narrowed to uint8_t explicitly, which is independent of byte
// order and gives the same byte as before on little-endian targets.
int satu_2_8bit(const int *pBuff, int len, int8_t *pByteOut, int rshiftbits, int round_floor,
                int sign_unsign) {
  if (rshiftbits < 0) return BM_ERR_INVALID_ARGUMENT;

  const int satu_max = sign_unsign ? 127 : 255;
  const int satu_min = sign_unsign ? -128 : 0;

  for (int ii = 0; ii < len; ii++) {
    int temp = pBuff[ii];
    if (rshiftbits > 0) {
      if (round_floor == 1)
        temp = ((temp >> (rshiftbits - 1)) + 1) >> 1;
      else
        temp = temp >> rshiftbits;
    }
    temp = (temp > satu_max) ? satu_max : ((temp < satu_min) ? satu_min : temp);

    // Keep the low 8 bits; the copy preserves the bit pattern for 128..255.
    const uint8_t low_byte = (uint8_t)temp;
    memcpy(pByteOut + ii, &low_byte, sizeof low_byte);
  }
  return BM_SUCCESS;
}
// Right-shift and saturate 32-bit values into 16-bit outputs.
//
// pBuff        input values, len elements
// pByteOut     output values, len elements
// rshiftbits   right shift applied first; 0 means no shift
// round_floor  1: round via ((x >> (rshiftbits - 1)) + 1) >> 1;
//              anything else: plain x >> rshiftbits
// sign_unsign  nonzero: clamp to [-32768, 32767]; zero: clamp to [0, 65535]
//
// Returns BM_SUCCESS, or BM_ERR_INVALID_ARGUMENT when rshiftbits is negative.
//
// Fix: the result used to be taken with memcpy from the first two bytes of
// the int, which are the low-order bytes only on little-endian machines. The
// value is now narrowed to uint16_t explicitly, which is independent of byte
// order and gives the same bytes as before on little-endian targets.
int satu_2_16bit(const int *pBuff, int len, short *pByteOut, int rshiftbits, int round_floor,
                 int sign_unsign) {
  if (rshiftbits < 0) return BM_ERR_INVALID_ARGUMENT;

  const int satu_max = sign_unsign ? 32767 : 65535;
  const int satu_min = sign_unsign ? -32768 : 0;

  for (int ii = 0; ii < len; ii++) {
    int temp = pBuff[ii];
    if (rshiftbits > 0) {
      if (round_floor == 1)
        temp = ((temp >> (rshiftbits - 1)) + 1) >> 1;
      else
        temp = temp >> rshiftbits;
    }
    temp = (temp > satu_max) ? satu_max : ((temp < satu_min) ? satu_min : temp);

    // Keep the low 16 bits; the copy preserves the bit pattern for 32768..65535.
    const uint16_t low_half = (uint16_t)temp;
    memcpy(pByteOut + ii, &low_half, sizeof low_half);
  }
  return BM_SUCCESS;
}