commit 3f4938648950a7f3bf9a19c320ca9fae7c52de20
Author: sophgo-forum-service <forum_service@sophgo.com>
Date:   Mon May 13 13:44:23 2024 +0800

    [feat] cviruntime opensource for cv18xx soc. - a4b6a3, add cumsum and gatherelements_pt.
/*
 * Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
 */
#include "LeakyReluOp.h"
#include "QuantHelper.h"
#include <cvikernel/cvikernel.h>

#define NPU_SHIFT 5
#define EU_SHIFT 4
#define NPU_NUM (1 << NPU_SHIFT)
#define EU_NUM (1 << EU_SHIFT)
#define LOCAL_MEM_SIZE (1 << 15)
#define NEURON_MEMORY 0
#define WEIGHT_MEMORY 1
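// Hardware geometry hard-coded for this kernel: 32 NPU lanes (1 << 5),
// 16 EUs per lane (1 << 4), and 32 KiB of local memory per lane (1 << 15);
// leakyrelu_codegen() below also assumes the 1880v2 12-bit shape limits.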

namespace cvi {

void LeakyReluOp::interpretFp32(
    std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
    std::vector<std::vector<int64_t>> &operand_shapes,
    std::shared_ptr<std::vector<float>> &result_tensor,
    std::vector<int64_t> &result_shape) {
  int n = operand_shapes[0][0];
  int c = operand_shapes[0][1];
  int h = operand_shapes[0][2];
  int w = operand_shapes[0][3];
  auto input = operand_tensors[0]->data();
  auto output = result_tensor->data();
  auto negative_slope = param.get<float>("negative_slope");
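
  // LeakyReLU in fp32: y = x for x >= 0, y = negative_slope * x otherwise.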
  for (int i = 0; i < (int)operand_tensors[0]->size(); ++i) {
    if (input[i] >= 0) {
      output[i] = input[i];
    } else {
      output[i] = negative_slope * input[i];
    }
  }
}

void LeakyReluOp::interpretInt8(
    std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
    std::vector<std::vector<int64_t>> &operand_shapes,
    std::shared_ptr<std::vector<float>> &result_tensor,
    std::vector<int64_t> &result_shape) {
  int n = operand_shapes[0][0];
  int c = operand_shapes[0][1];
  int h = operand_shapes[0][2];
  int w = operand_shapes[0][3];

  auto input = operand_tensors[0]->data();
  auto quant_pos_rshift =
      param.has("rshift_pos") ? (float)param.get<int8_t>("rshift_pos") : 0.0f;
  auto quant_pos_multiplier =
      param.has("m_i8_pos") ? (float)param.get<int8_t>("m_i8_pos") : 0.0f;
  auto quant_neg_rshift = (float)param.get<int8_t>("rshift_neg");
  auto quant_neg_multiplier = (float)param.get<int8_t>("m_i8_neg");

  auto output = result_tensor->data();

  // rshift and saturate on output
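  // Assuming applyMultiplierAndRShiftAndSaturateInt8() computes
  // saturate_i8((x * multiplier) >> rshift), as its name suggests:
  //   x > 0  -> rescaled with the positive multiplier/rshift, but only when
  //             quantizeInt8() emitted a positive scale (multiplier != 0);
  //   x <= 0 -> always rescaled with the negative multiplier/rshift, which
  //             also folds in negative_slope.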
  for (int i = 0; i < (int)operand_tensors[0]->size(); ++i) {
    if (input[i] > 0) {
      if (quant_pos_multiplier != 0.0f) {
        output[i] = (float)applyMultiplierAndRShiftAndSaturateInt8(
            input[i], (uint32_t)quant_pos_rshift, quant_pos_multiplier, false);
      } else {
        output[i] = input[i];
      }
    } else {
      output[i] = (float)applyMultiplierAndRShiftAndSaturateInt8(
          input[i], (uint32_t)quant_neg_rshift, quant_neg_multiplier, false);
    }
  }
}

void LeakyReluOp::quantizeInt8() {
  // support per-tensor only for now
  setOpQuantPerchannel(false);
  // use rshift and INT8 multiplier
  setOpQuantParamType("RSHIFT_AND_M_I8");

  float negative_slope = param.get<float>("negative_slope");
  std::cout << " negative_slope: " << std::to_string(negative_slope) << "\n";

  // create tensors for rshift and multiplier
  float rshift_pos = 0;
  float multiplier_pos = 0;
  float rshift_neg = 0;
  float multiplier_neg = 0;

  // quantization
  float threshold_x = getPrevOpThreshold();
  float threshold_y = getOpThreshold();
  std::cout << "threshold_y = " << std::to_string(threshold_y)
            << ", threshold_x = " << std::to_string(threshold_x) << "\n";

  // positive
  double qscale_pos = threshold_x / threshold_y;
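  // findRShiftAndMultiplierFromQScale() is expected to decompose a real-valued
  // scale into an (rshift, int8 multiplier) pair with
  //   qscale ~= multiplier / 2^rshift,
  // so the runtime can rescale with one integer multiply plus one shift.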
  if (fabs(threshold_x - threshold_y) < 1e-5 * std::min(threshold_x, threshold_y)) {
    // no positive scale
    rshift_pos = 0;
    multiplier_pos = 0;
    std::cout << " Positive: no_scale\n";
  } else {
    uint32_t uint_multiplier_pos;
    rshift_pos =
        (float)findRShiftAndMultiplierFromQScale(qscale_pos, &uint_multiplier_pos, false);
    multiplier_pos = (float)uint_multiplier_pos;
    std::cout << " Positive: ";
    std::cout << " [multiplier : rshift] = [" << std::to_string(multiplier_pos) << " : "
              << std::to_string(rshift_pos) << "]\n";
  }
  // negative
  float qscale_neg = fabs(qscale_pos * negative_slope);
  uint32_t uint_multiplier_neg = 0;
  rshift_neg =
      (float)findRShiftAndMultiplierFromQScale(qscale_neg, &uint_multiplier_neg, false);
  multiplier_neg = (float)uint_multiplier_neg;
  std::cout << " Negative: ";
  std::cout << " [multiplier : rshift] = [" << std::to_string(multiplier_neg) << " : "
            << std::to_string(rshift_neg) << "]\n";

  bool do_pos_scale = (multiplier_pos != 0.0f);
  if (do_pos_scale) {
    param.put<int8_t>("rshift_pos", static_cast<int8_t>(rshift_pos));
    param.put<int8_t>("m_i8_pos", static_cast<int8_t>(multiplier_pos));
  }
  param.put<int8_t>("rshift_neg", static_cast<int8_t>(rshift_neg));
  param.put<int8_t>("m_i8_neg", static_cast<int8_t>(multiplier_neg));
}

void LeakyReluOp::codeGenInt8(void *ctx,
                              std::vector<std::vector<int64_t>> &operand_shapes,
                              std::vector<uint64_t> &operand_gaddrs,
                              std::vector<int64_t> &result_shape, uint64_t result_gaddr,
                              int layer_id) {
  auto pos_rshift = param.has("rshift_pos") ? param.get<int8_t>("rshift_pos") : 0;
  auto pos_m_i8 = param.has("m_i8_pos") ? param.get<int8_t>("m_i8_pos") : 0;
  auto neg_rshift = param.has("rshift_neg") ? param.get<int8_t>("rshift_neg") : 0;
  auto neg_m_i8 = param.has("m_i8_neg") ? param.get<int8_t>("m_i8_neg") : 0;
  assert(neg_m_i8);
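  // If quantizeInt8() decided the positive branch needs no rescale, the
  // "*_pos" params are absent and pos_m_i8 stays 0; leakyrelu_kernel() treats
  // GT_scale == 0 as "pass the positive part through unscaled".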

  int n = operand_shapes[0][0];
  int c = operand_shapes[0][1];
  int h = operand_shapes[0][2];
  int w = operand_shapes[0][3];
  uint64_t operand_gaddr = operand_gaddrs[0];
  uint64_t ga_output = result_gaddr;

  leakyrelu_codegen((cvk_context_t *)ctx, // ctx
                    layer_id,             // layer_id
                    operand_gaddr,        // input_gaddr
                    ga_output,            // output_gaddr
                    n,                    // input_n
                    c,                    // input_c
                    h,                    // input_h
                    w,                    // input_w
                    pos_rshift,           // GT_right_shift_width
                    neg_rshift,           // LE_right_shift_width
                    pos_m_i8,             // GT_scale
                    neg_m_i8              // LE_scale
  );
}
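
// tdma_load()/tdma_store() copy one tile between global memory (NEURON region)
// and local memory with the default global stride, i.e. the tile is treated as
// a contiguous chunk in global memory.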
void LeakyReluOp::tdma_load(cvk_context_t *ctx, cvk_tl_t *tlp, uint64_t ga_src) {
  cvk_tg_t ts_data;
  ts_data.base_reg_index = NEURON_MEMORY;
  ts_data.fmt = tlp->fmt;
  ts_data.start_address = ga_src;
  ts_data.shape = {tlp->shape.n, tlp->shape.c, tlp->shape.h, tlp->shape.w};
  ts_data.stride = ctx->ops->tg_default_stride(ctx, ts_data.shape, ts_data.fmt);

  cvk_tdma_g2l_tensor_copy_param_t p1;
  p1.src = &ts_data;
  p1.dst = tlp;
  ctx->ops->tdma_g2l_tensor_copy(ctx, &p1);
}

void LeakyReluOp::tdma_store(cvk_context_t *ctx, cvk_tl_t *tlp, uint64_t ga_dst) {
  cvk_tg_t ts_data;
  ts_data.base_reg_index = NEURON_MEMORY;
  ts_data.fmt = tlp->fmt;
  ts_data.start_address = ga_dst;
  ts_data.shape = {tlp->shape.n, tlp->shape.c, tlp->shape.h, tlp->shape.w};
  ts_data.stride = ctx->ops->tg_default_stride(ctx, ts_data.shape, ts_data.fmt);

  cvk_tdma_l2g_tensor_copy_param_t p1;
  p1.src = tlp;
  p1.dst = &ts_data;
  ctx->ops->tdma_l2g_tensor_copy(ctx, &p1);
}
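
// leakyrelu_kernel() computes int8 LeakyReLU in place on `bottom`, using
// `relu` and `neg` as scratch tiles. When the positive branch needs no rescale
// (GT_scale == 0), a single scaled copy plus an element-wise max/min is
// enough; otherwise the positive and negative halves are scaled separately
// and merged at the end.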
void LeakyReluOp::leakyrelu_kernel(cvk_context_t *ctx, int layer_id, cvk_tl_t &bottom,
                                   cvk_tl_t &relu, cvk_tl_t &neg,
                                   int GT_right_shift_width, int LE_right_shift_width,
                                   int GT_scale, int LE_scale) {
  bool isIgnorePosPart = (GT_scale == 0);
  bool isSlopeSmallerThanOne = ((LE_scale >> LE_right_shift_width) == 0);
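
  // With an effective slope in (0, 1), slope * x <= x for x >= 0 and
  // slope * x >= x for x <= 0, so max(x, slope * x) picks the LeakyReLU
  // result element-wise; with a slope >= 1 the inequalities flip and min()
  // is used instead.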
  if (isIgnorePosPart) {
    cvk_tiu_mul_param_t p4;
    p4.res_high = nullptr;
    p4.res_low = &relu;
    p4.a = &bottom;
    p4.b_const.val = LE_scale;
    p4.b_const.is_signed = true;
    p4.b_is_const = 1;
    p4.rshift_bits = LE_right_shift_width;
    p4.layer_id = layer_id;
    p4.relu_enable = 0;
    ctx->ops->tiu_mul(ctx, &p4);

    if (isSlopeSmallerThanOne) {
      cvk_tiu_max_param_t p1;
      p1.max = &bottom;
      p1.a = &bottom;
      p1.b = &relu;
      p1.b_is_const = 0;
      p1.layer_id = layer_id;
      ctx->ops->tiu_max(ctx, &p1);
    } else {
      cvk_tiu_min_param_t p1;
      p1.min = &bottom;
      p1.a = &bottom;
      p1.b = &relu;
      p1.b_is_const = 0;
      p1.layer_id = layer_id;
      ctx->ops->tiu_min(ctx, &p1);
    }
  } else {
    // 0. relu = relu(bottom)
    cvk_tiu_max_param_t p13;
    p13.max = &relu;
    p13.a = &bottom;
    p13.b_is_const = 1;
    p13.b_const.is_signed = 1;
    p13.b_const.val = 0;
    p13.layer_id = layer_id;
    ctx->ops->tiu_max(ctx, &p13);

    // 1. relu = (relu * GT_scale) >> GT_right_shift_width
    cvk_tiu_mul_param_t p;
    p.res_high = nullptr;
    p.res_low = &relu;
    p.a = &relu;
    p.b_const.val = GT_scale;
    p.b_const.is_signed = true;
    p.b_is_const = 1;
    p.rshift_bits = GT_right_shift_width;
    p.layer_id = layer_id;
    p.relu_enable = 0;
    ctx->ops->tiu_mul(ctx, &p);

    // 2. neg = min(bottom, 0)
    cvk_tiu_min_param_t p7;
    p7.min = &neg;
    p7.a = &bottom;
    p7.b_is_const = 1;
    p7.b_const.val = 0;
    p7.b_const.is_signed = 1;
    p7.layer_id = layer_id;
    ctx->ops->tiu_min(ctx, &p7);

    // 3. neg (n,c,h,w) = (neg(n,c,h,w) * slope) >> LE_right_shift_width
    cvk_tiu_mul_param_t p8;
    p8.res_high = nullptr;
    p8.res_low = &neg;
    p8.a = &neg;
    p8.b_const.val = LE_scale;
    p8.b_const.is_signed = true;
    p8.b_is_const = 1;
    p8.rshift_bits = LE_right_shift_width;
    p8.layer_id = layer_id;
    p8.relu_enable = 0;
    ctx->ops->tiu_mul(ctx, &p8);
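
    // relu is zero wherever bottom <= 0 and neg is zero wherever bottom >= 0,
    // so at each element at most one of them is non-zero and a bitwise OR of
    // the int8 values reproduces the LeakyReLU result.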
    // 4. bottom = or relu, neg
    cvk_tiu_or_int8_param_t p9;
    p9.res = &bottom;
    p9.a = &relu;
    p9.b = &neg;
    p9.layer_id = layer_id;
    ctx->ops->tiu_or_int8(ctx, &p9);
  }
}
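
// leakyrelu_codegen() flattens the (n, c, h, w) input into count = n*c*h*w
// int8 elements and processes it in tiles shaped (tiled_N, NPU_NUM, 1, EU_NUM),
// keeping three tiles per lane (bottom/result, relu, neg) in local memory.
// Whatever does not fill a whole tile is handled by a second, smaller pass at
// the end.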
void LeakyReluOp::leakyrelu_codegen(cvk_context_t *ctx, uint32_t layer_id,
                                    uint64_t input_gaddr, uint64_t output_gaddr,
                                    int input_n, int input_c, int input_h, int input_w,
                                    int GT_right_shift_width, int LE_right_shift_width,
                                    int GT_scale, int LE_scale) {
  printf("leakyrelu_codegen:\n"
         " layer_id %d\n"
         " input_gaddr: %lx, output_gaddr: %lx\n"
         " input (%d, %d, %d, %d)\n"
         " GT_scale:%d, LE_scale:%d\n"
         " GT_right_shift_width:%d, LE_right_shift_width:%d\n",
         layer_id, input_gaddr, output_gaddr, input_n, input_c, input_h, input_w,
         GT_scale, LE_scale, GT_right_shift_width, LE_right_shift_width);

  // Split input based on local memory
  uint32_t total_eu = NPU_NUM * EU_NUM;
  uint32_t lane_size = LOCAL_MEM_SIZE;
  uint32_t total_mem_size = NPU_NUM * LOCAL_MEM_SIZE;
  uint32_t max_N = (1 << 12) - 1; // 1880v2: 12 bit
  uint32_t max_W = (1 << 12) - 1; // 1880v2: 12 bit
  uint32_t count = input_n * input_c * input_h * input_w;
  uint32_t tiled_N = count / total_eu / 3; // 3 blobs
  tiled_N = (tiled_N > max_N) ? max_N : tiled_N;

  // local tensor shape(tiled_N, npu_num, 1, eu_num)
  cvk_tl_shape_t tl_shape = {tiled_N, static_cast<uint32_t>(NPU_NUM), 1,
                             static_cast<uint32_t>(EU_NUM)};
  cvk_tl_stride_t tl_stride = ctx->ops->tl_default_stride(ctx, tl_shape, CVK_FMT_I8, 1);

  // Find max tiled_N
  uint32_t required_size = 0;
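  // Shrink tiled_N until three tiles of shape (tiled_N, NPU_NUM, 1, EU_NUM)
  // fit in one lane: required_size = 3 * n * stride.n is the per-lane
  // footprint compared against LOCAL_MEM_SIZE.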
  do {
    tl_shape.n = tiled_N;
    tl_stride = ctx->ops->tl_default_stride(ctx, tl_shape, CVK_FMT_I8, 1);
    required_size = 3 * tl_shape.n * tl_stride.n; // 3 blobs

    if (required_size <= lane_size) {
      break;
    }

  } while (--tiled_N);

  printf(" tiled_bottom shape (%d, %d, %d, %d), stride (%d, %d, %d, %d)\n"
         " required_size %d kB/lane\n",
         tl_shape.n, tl_shape.c, tl_shape.h, tl_shape.w, tl_stride.n, tl_stride.c,
         tl_stride.h, tl_stride.w, required_size / 1024);

  assert(tiled_N);
  if (!tiled_N) {
    return;
  }

  // Tiled local memory layout:
  //   tiled bottom/result
  //   tiled relu
  //   tiled neg

  // Tiled bottom
  required_size /= 3; // for 3 blobs
  cvk_tl_t tl_tiled_bottom;
  tl_tiled_bottom.start_address = 0;
  tl_tiled_bottom.fmt = CVK_FMT_I8;
  tl_tiled_bottom.shape = tl_shape;
  tl_tiled_bottom.stride = tl_stride;

  // Tiled relu
  cvk_tl_t tl_tiled_relu = tl_tiled_bottom;
  tl_tiled_relu.start_address = tl_tiled_bottom.start_address + required_size;

  // Tiled neg
  cvk_tl_t tl_tiled_neg = tl_tiled_bottom;
  tl_tiled_neg.start_address = tl_tiled_relu.start_address + required_size;

  // In unit of tiled_N * npu_num * eu_num
  uint32_t global_input_offset = 0;
  for (uint32_t i = 0; i < (count / total_eu / tiled_N); i++) {
    // Load as a chunk of contiguous global memory (no global shape/stride);
    // local memory uses the tensor shape to maximize EU utilization.
    tdma_load(ctx, &tl_tiled_bottom, input_gaddr + global_input_offset);
    leakyrelu_kernel(ctx, layer_id, tl_tiled_bottom, tl_tiled_relu, tl_tiled_neg,
                     GT_right_shift_width, LE_right_shift_width, GT_scale, LE_scale);
    // Store bottom as a chunk of contiguous memory, not using global shape/stride
    tdma_store(ctx, &tl_tiled_bottom, output_gaddr + global_input_offset);

    // Next input offset
    global_input_offset += tiled_N * total_eu;

  } // for (uint32_t i = 0; i < (count/total_eu/tiled_N); i++)

  // Remaining count, in unit of npu_num * eu_num
  if (global_input_offset < count) {
    uint32_t tiled_W = (count - global_input_offset) / NPU_NUM;
    tiled_N = 1;
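    // Lay the remaining (count - global_input_offset) elements out as
    // (tiled_N, NPU_NUM, 1, tiled_W); halve tiled_W and double tiled_N until
    // tiled_W fits the 12-bit hardware limit and three tiles fit in one lane.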
    do {
      tl_shape.n = tiled_N;
      tl_shape.w = tiled_W;
      tl_stride = ctx->ops->tl_default_stride(ctx, tl_shape, CVK_FMT_I8, 1);
      required_size = 3 * tl_shape.n * tl_stride.n; // 3 blobs

      if (required_size <= lane_size && (tiled_W <= max_W)) {
        break;
      } else {
        tiled_W /= 2;
        tiled_N *= 2;
      }
    } while (true);

    if ((count - global_input_offset) % NPU_NUM != 0) {
      std::cout << "Remaining size must be a multiple of NPU_NUM\n";
      assert(0);
    }

    // Update shape, stride
    tl_shape.n = tiled_N;
    tl_shape.w = tiled_W;
    tl_stride = ctx->ops->tl_default_stride(ctx, tl_shape, CVK_FMT_I8, 1);
    required_size = tl_shape.n * tl_stride.n;

    printf(" tiled_bottom shape (%d, %d, %d, %d), stride (%d, %d, %d, %d)\n"
           " required_size %d kB/lane\n",
           tl_shape.n, tl_shape.c, tl_shape.h, tl_shape.w, tl_stride.n, tl_stride.c,
           tl_stride.h, tl_stride.w, required_size / 1024);

    // Tiled bottom
    tl_tiled_bottom.shape = tl_shape;
    tl_tiled_bottom.stride = tl_stride;

    // Tiled bottom with precise stride
    cvk_tl_t tl_tiled_bottom_precise_stride = tl_tiled_bottom;
    tl_tiled_bottom_precise_stride.stride = {
        static_cast<uint32_t>(tl_shape.h * tl_shape.w * sizeof(uint8_t)),
        static_cast<uint32_t>(tl_shape.h * tl_shape.w * sizeof(uint8_t)),
        static_cast<uint32_t>(tl_shape.w * sizeof(uint8_t)), sizeof(uint8_t)};
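
    // tl_default_stride() with the trailing 1 presumably returns an EU-aligned
    // stride, so n * stride.n can exceed the actual element count; the packed
    // stride above is used below to advance global_input_offset by exactly the
    // number of int8 elements this tail tile consumed.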

    printf(" tiled_bottom_precise shape (%d, %d, %d, %d), stride (%d, %d, %d, %d)\n"
           " required_size %d kB/lane\n",
           tl_shape.n, tl_shape.c, tl_shape.h, tl_shape.w,
           tl_tiled_bottom_precise_stride.stride.n,
           tl_tiled_bottom_precise_stride.stride.c,
           tl_tiled_bottom_precise_stride.stride.h,
           tl_tiled_bottom_precise_stride.stride.w, required_size / 1024);

    // Tiled relu
    tl_tiled_relu = tl_tiled_bottom;
    tl_tiled_relu.start_address = tl_tiled_bottom.start_address + required_size;

    // Tiled neg
    tl_tiled_neg = tl_tiled_bottom;
    tl_tiled_neg.start_address = tl_tiled_relu.start_address + required_size;

    // Load as a chunk of contiguous global memory (no global shape/stride);
    // local memory uses the tensor shape to maximize EU utilization.
    tdma_load(ctx, &tl_tiled_bottom, input_gaddr + global_input_offset);

    leakyrelu_kernel(ctx, layer_id, tl_tiled_bottom, tl_tiled_relu, tl_tiled_neg,
                     GT_right_shift_width, LE_right_shift_width, GT_scale, LE_scale);

    // Store bottom as a chunk of contiguous memory, not using global shape/stride
    tdma_store(ctx, &tl_tiled_bottom, output_gaddr + global_input_offset);

    global_input_offset += tl_tiled_bottom_precise_stride.shape.n *
                           tl_tiled_bottom_precise_stride.stride.n * NPU_NUM;
  }

  // Sanity check: every element must have been covered
  if (global_input_offset != count) {
    printf("global_input_offset != count (%d != %d)\n", global_input_offset, count);
    assert(0);
  }
}

RegisterCustomOp(leaky_relu, LeakyReluOp);

} // namespace cvi