/*
 * Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
 */
#include "UnPoolingOp.h"
#include "QuantHelper.h"
#include <cvikernel/cvikernel.h>

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <string>

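// TPU geometry assumed by this kernel: 2^5 = 32 NPU lanes, 2^4 = 16 EUs per lane, and
// 32 KiB (1 << 15 bytes) of local memory per lane. NEURON_MEMORY / WEIGHT_MEMORY are the
// TDMA base-register indices for the activation and weight regions; only NEURON_MEMORY is
// used below.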
#define NPU_SHIFT 5
#define EU_SHIFT 4
#define NPU_NUM (1 << NPU_SHIFT)
#define EU_NUM (1 << EU_SHIFT)
#define LOCAL_MEM_SIZE (1 << 15)
#define NEURON_MEMORY 0
#define WEIGHT_MEMORY 1

namespace cvi {

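// Host-side interpretation: both the fp32 and the int8 paths fall through to the shared
// unpooling() reference implementation on float tensors.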
void UnPoolingOp::interpretFp32(
    std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
    std::vector<std::vector<int64_t>> &operand_shapes,
    std::shared_ptr<std::vector<float>> &result_tensor,
    std::vector<int64_t> &result_shape) {
  unpooling(operand_tensors, operand_shapes, result_tensor, result_shape);
}

void UnPoolingOp::interpretInt8(
    std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
    std::vector<std::vector<int64_t>> &operand_shapes,
    std::shared_ptr<std::vector<float>> &result_tensor,
    std::vector<int64_t> &result_shape) {
  unpooling(operand_tensors, operand_shapes, result_tensor, result_shape);
}

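// Quantization setup: per-tensor only, using the RSHIFT_AND_M_I8 (rshift + int8 multiplier)
// parameter type; the input/output thresholds are only logged here.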
void UnPoolingOp::quantizeInt8() {
  // support per-tensor only for now
  setOpQuantPerchannel(false);
  // use rshift and INT8 multiplier
  setOpQuantParamType("RSHIFT_AND_M_I8");

  // quantization
  float threshold_x = getPrevOpThreshold();
  float threshold_y = getOpThreshold();
  std::cout << "threshold_y = " << std::to_string(threshold_y)
            << ", threshold_x = " << std::to_string(threshold_x) << "\n";
}

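// Lowering entry point: unpack the input shape, the data/mask/output global addresses and the
// scale / unpool_h / unpool_w attributes, then emit the TPU commands.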
void UnPoolingOp::codeGenInt8(void *ctx,
                              std::vector<std::vector<int64_t>> &operand_shapes,
                              std::vector<uint64_t> &operand_gaddrs,
                              std::vector<int64_t> &result_shape,
                              uint64_t result_gaddr, int layer_id) {
  int n = operand_shapes[0][0];
  int c = operand_shapes[0][1];
  int h = operand_shapes[0][2];
  int w = operand_shapes[0][3];
  uint64_t data_gaddr = operand_gaddrs[0];
  uint64_t mask_gaddr = operand_gaddrs[1];
  uint64_t ga_output = result_gaddr;

  int scale = param.get<int>("scale");
  int unpool_h = param.get<int>("unpool_h");
  int unpool_w = param.get<int>("unpool_w");

  unpooling_codegen((cvk_context_t *)ctx, // ctx
                    layer_id,             // layer_id
                    data_gaddr,           // data_gaddr
                    mask_gaddr,           // mask_gaddr
                    ga_output,            // output_gaddr
                    n, c, h, w,           // input shape
                    scale, unpool_h, unpool_w);
}

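// Carve the four per-tile buffers (ifmap, widened working buffer, mask, ofmap) out of a single
// lane's local memory, back to back starting at offset 0. The mask and ofmap tiles use the
// upsampled height tiling_oh = tiling_h * (output_h / input_h).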
void UnPoolingOp::alloc_lmem(cvk_context_t *ctx, uint32_t tiling_c, uint32_t tiling_h,
                             uint32_t input_c, uint32_t input_h, uint32_t input_w,
                             uint32_t output_c, uint32_t output_h, uint32_t output_w,
                             cvk_fmt_t fmt, int eu_align, cvk_tl_t &tl_ifmap, cvk_tl_t &tl_working,
                             cvk_tl_t &tl_mask, cvk_tl_t &tl_ofmap) {
  uint32_t tl_offset = 0;
  ctx->ops->lmem_init_tensor(ctx, &tl_ifmap, {1, tiling_c, tiling_h, input_w}, fmt,
                             eu_align);
  tl_ifmap.start_address = tl_offset;
  tl_offset += ctx->ops->lmem_tensor_to_size(ctx, tl_ifmap.shape, tl_ifmap.fmt,
                                             tl_ifmap.eu_align);

  ctx->ops->lmem_init_tensor(ctx, &tl_working, {1, tiling_c, tiling_h, output_w}, fmt,
                             eu_align);
  tl_working.start_address = tl_offset;
  tl_offset += ctx->ops->lmem_tensor_to_size(ctx, tl_working.shape, tl_working.fmt,
                                             tl_working.eu_align);

  uint32_t tiling_oh = tiling_h * (output_h / input_h);
  ctx->ops->lmem_init_tensor(ctx, &tl_mask, {1, tiling_c, tiling_oh, output_w}, fmt,
                             eu_align);
  tl_mask.start_address = tl_offset;
  tl_offset += ctx->ops->lmem_tensor_to_size(ctx, tl_mask.shape, tl_mask.fmt,
                                             tl_mask.eu_align);

  ctx->ops->lmem_init_tensor(ctx, &tl_ofmap, {1, tiling_c, tiling_oh, output_w}, fmt,
                             eu_align);
  tl_ofmap.start_address = tl_offset;
}

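// DMA a (n_pos, c_pos, h_pos) slice of a global tensor into the given local-memory tile.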
void UnPoolingOp::tdma_load(cvk_context_t *ctx, cvk_tl_t *tlp, uint64_t ga_src,
                            cvk_tg_stride_t stride, int32_t n_pos, int32_t c_pos, int32_t h_pos) {
  cvk_tg_t ts_data;
  ts_data.base_reg_index = NEURON_MEMORY;
  ts_data.fmt = tlp->fmt;
  ts_data.start_address = ga_src + stride.n * n_pos + stride.c * c_pos + stride.h * h_pos;
  ts_data.shape = {tlp->shape.n, tlp->shape.c, tlp->shape.h, tlp->shape.w};
  ts_data.stride = stride;

  cvk_tdma_g2l_tensor_copy_param_t p1 = {0};
  p1.src = &ts_data;
  p1.dst = tlp;
  ctx->ops->tdma_g2l_tensor_copy(ctx, &p1);
}

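// unpooling_compute: nearest-neighbor upsample of the ifmap tile by (scale_h, scale_w) using
// two strided TIU copies, then an element-wise multiply with the mask tile so that only the
// positions selected by the mask (typically the argmax positions recorded by the paired
// max-pool) keep their values.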
void UnPoolingOp::unpooling_compute(
    cvk_context_t *ctx, uint32_t layer_id, int scale_h, int scale_w,
    cvk_tl_t *tl_ifmap, cvk_tl_t *tl_working, cvk_tl_t *tl_mask, cvk_tl_t *tl_ofmap) {

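  // Step 1: replicate every input pixel scale_w times along W. The source view repeats the
  // whole ifmap scale_w times in N (N-stride 0); the destination view places those N copies
  // at consecutive W positions (N-stride = one element, W-stride = scale_w elements).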
  cvk_tl_stride_t tl_ifmap_fake_stride = {0, tl_ifmap->stride.c, tl_ifmap->stride.h, tl_ifmap->stride.w};
  cvk_tl_t tl_ifmap_fake = {0};
  tl_ifmap_fake.start_address = tl_ifmap->start_address;
  tl_ifmap_fake.fmt = tl_ifmap->fmt;
  tl_ifmap_fake.shape = {scale_w, tl_ifmap->shape.c, tl_ifmap->shape.h, tl_ifmap->shape.w};
  tl_ifmap_fake.stride = tl_ifmap_fake_stride;
  tl_ifmap_fake.eu_align = tl_ifmap->eu_align;

  cvk_tl_stride_t tl_working_fake_stride = {
      tl_working->stride.w, tl_working->stride.c,
      tl_working->stride.h, tl_working->stride.w * scale_w};
  cvk_tl_t tl_working_fake = {0};
  tl_working_fake.start_address = tl_working->start_address;
  tl_working_fake.fmt = tl_working->fmt;
  tl_working_fake.shape = {scale_w, tl_ifmap->shape.c, tl_ifmap->shape.h, tl_ifmap->shape.w};
  tl_working_fake.stride = tl_working_fake_stride;
  tl_working_fake.eu_align = tl_working->eu_align;

  cvk_tiu_copy_param_t param = {0};
  param.dst = &tl_working_fake;
  param.src = &tl_ifmap_fake;
  param.layer_id = layer_id;
  ctx->ops->tiu_copy(ctx, &param);

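  // Step 2: replicate every row scale_h times along H, moving the widened rows from the
  // working buffer into the ofmap tile (N-stride 0 on the source, one output row per N copy
  // on the destination).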
  cvk_tl_stride_t tl_working_fake2_stride = {0, tl_working->stride.c, tl_working->stride.h, tl_working->stride.w};
  cvk_tl_t tl_working_fake2 = {0};
  tl_working_fake2.start_address = tl_working->start_address;
  tl_working_fake2.fmt = tl_working->fmt;
  tl_working_fake2.shape = {scale_h, tl_ofmap->shape.c, tl_ifmap->shape.h, tl_ofmap->shape.w};
  tl_working_fake2.stride = tl_working_fake2_stride;
  tl_working_fake2.eu_align = tl_working->eu_align;

  cvk_tl_stride_t tl_ofmap_fake_stride = {tl_ofmap->stride.h, tl_ofmap->stride.c, tl_ofmap->stride.h * scale_h, tl_ofmap->stride.w};
  cvk_tl_t tl_ofmap_fake = {0};
  tl_ofmap_fake.start_address = tl_ofmap->start_address;
  tl_ofmap_fake.fmt = tl_ofmap->fmt;
  tl_ofmap_fake.shape = {scale_h, tl_ofmap->shape.c, tl_ifmap->shape.h, tl_ofmap->shape.w};
  tl_ofmap_fake.stride = tl_ofmap_fake_stride;
  tl_ofmap_fake.eu_align = tl_ofmap->eu_align;

  cvk_tiu_copy_param_t param2 = {0};
  param2.dst = &tl_ofmap_fake;
  param2.src = &tl_working_fake2;
  param2.layer_id = layer_id;
  ctx->ops->tiu_copy(ctx, &param2);

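  // Step 3: keep only the masked positions: ofmap = ofmap * mask (element-wise, no shift, no ReLU).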
  cvk_tiu_mul_param_t param3 = {0};
  param3.res_high = nullptr;
  param3.res_low = tl_ofmap;
  param3.a = tl_ofmap;
  param3.b_is_const = 0;
  param3.b = tl_mask;
  param3.layer_id = layer_id;
  param3.rshift_bits = 0;
  param3.relu_enable = 0;
  ctx->ops->tiu_mul(ctx, &param3);
}

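// Store a computed tile back to global memory, cropping the bottom rows / right columns that
// fall outside the requested unpool_h x unpool_w output.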
void UnPoolingOp::tdma_store(cvk_context_t *ctx, cvk_tl_t *tlp,
                             uint64_t ga_dst, cvk_tg_stride_t stride,
                             uint32_t n_pos, uint32_t c_pos, uint32_t h_pos,
                             uint32_t crop_h, uint32_t crop_w) {
  cvk_tl_t tl_src;
  tl_src.start_address = tlp->start_address;
  tl_src.fmt = tlp->fmt;
  tl_src.shape = {tlp->shape.n, tlp->shape.c, tlp->shape.h - crop_h, tlp->shape.w - crop_w};
  tl_src.stride = tlp->stride;

  cvk_tg_t tg_dst;
  tg_dst.base_reg_index = NEURON_MEMORY;
  tg_dst.fmt = tlp->fmt;
  tg_dst.start_address = ga_dst + stride.n * n_pos + stride.c * c_pos + stride.h * h_pos;
  tg_dst.shape = {tlp->shape.n, tlp->shape.c, tlp->shape.h - crop_h, tlp->shape.w - crop_w};
  tg_dst.stride = stride;

  cvk_tdma_l2g_tensor_copy_param_t p1 = {0};
  p1.src = &tl_src;
  p1.dst = &tg_dst;
  ctx->ops->tdma_l2g_tensor_copy(ctx, &p1);
}

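// Top-level codegen: pick a (c_step, h_step) tiling that fits local memory, then stream tiles
// through load -> upsample + mask -> store.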
void UnPoolingOp::unpooling_codegen(cvk_context_t *ctx, uint32_t layer_id,
                                    uint64_t data_gaddr, uint64_t mask_gaddr, uint64_t output_gaddr,
                                    int input_n, int input_c, int input_h, int input_w,
                                    int scale, int unpool_h, int unpool_w) {
  printf("unpooling_codegen:\n"
         " layer_id %u\n"
         " data_gaddr: %lx, mask_gaddr: %lx, output_gaddr: %lx\n"
         " input (%d, %d, %d, %d)\n"
         " scale:%d, unpool_h:%d, unpool_w:%d\n",
         layer_id, data_gaddr, mask_gaddr, output_gaddr, input_n, input_c, input_h,
         input_w, scale, unpool_h, unpool_w);

  // Split input based on local memory
  uint32_t total_eu = NPU_NUM * EU_NUM;
  uint32_t lane_size = LOCAL_MEM_SIZE;
  uint32_t total_mem_size = NPU_NUM * LOCAL_MEM_SIZE;
  uint32_t max_N = (1 << 12) - 1; // 1880v2: 12 bit
  uint32_t max_W = (1 << 12) - 1; // 1880v2: 12 bit
  uint32_t count = input_n * input_c * input_h * input_w;

  uint32_t output_c = input_c;
  uint32_t output_h = input_h * scale;
  uint32_t output_w = input_w * scale;

  uint32_t n_step = 1;
  uint32_t c_step = 0;
  uint32_t h_step = 0;

  h_step = input_h;
  uint32_t h_factor = scale;

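  // Tiling search: try the full input height first, then for each h_step shrink c_step from
  // input_c down to NPU_NUM; if nothing fits, reduce h_step and try again. A tile needs the
  // ifmap, the widened working buffer, the mask and the ofmap to coexist in one lane's local
  // memory (mask and ofmap have the same size, hence tiled_ofmap_size * 2). total_size starts
  // at UINT32_MAX so the fit check fails if the inner loop never runs.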
  for (; h_step > 0; --h_step) {
    uint32_t total_size = UINT32_MAX;
    for (c_step = input_c; c_step >= (uint32_t)NPU_NUM; --c_step) {
      cvk_tl_shape_t tiled_ifmap_shape = {1, c_step, h_step, input_w};
      uint32_t tiled_ifmap_size =
          ctx->ops->lmem_tensor_to_size(ctx, tiled_ifmap_shape, CVK_FMT_I8, 0);

      cvk_tl_shape_t tiled_working_shape = {1, c_step, h_step, output_w};
      uint32_t tiled_working_size =
          ctx->ops->lmem_tensor_to_size(ctx, tiled_working_shape, CVK_FMT_I8, 0);

      cvk_tl_shape_t tiled_ofmap_shape = {1, c_step, h_step * h_factor, output_w};
      uint32_t tiled_ofmap_size =
          ctx->ops->lmem_tensor_to_size(ctx, tiled_ofmap_shape, CVK_FMT_I8, 0);

      total_size = tiled_ifmap_size + tiled_working_size + tiled_ofmap_size * 2;
      if (total_size <= static_cast<uint32_t>(LOCAL_MEM_SIZE))
        break;
    }
    if (total_size <= static_cast<uint32_t>(LOCAL_MEM_SIZE))
      break;
  }

printf("tiling: c_step %d, h_step %d\n", c_step, h_step);
|
|
assert(c_step && h_step && "Expect valid tiling");
|
|
|
|
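  // Global-memory strides (int8, one byte per element): the ifmap uses the input geometry,
  // the mask matches the upsampled (input * scale) geometry, and the output uses the final
  // unpool_h x unpool_w geometry.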
  cvk_tg_stride_t ifmap_stride = {
      input_c * input_h * input_w,
      input_h * input_w,
      input_w};
  cvk_tg_stride_t mask_stride = {
      output_c * output_h * output_w,
      output_h * output_w,
      output_w};
  cvk_tg_stride_t output_stride = {
      output_c * unpool_h * unpool_w,
      unpool_h * unpool_w,
      unpool_w};

  uint64_t output_offset = 0;
  uint32_t crop_h = 0;
  uint32_t crop_w = 0;
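  // Walk the tensor tile by tile: load the ifmap slice and the matching (already upsampled)
  // mask slice, upsample and mask it in local memory, then store the result, cropping any
  // overshoot beyond unpool_h / unpool_w.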
  for (uint32_t n_pos = 0; n_pos < input_n; n_pos += n_step) {
    for (uint32_t c_pos = 0; c_pos < input_c; c_pos += c_step) {
      uint32_t tiling_c = std::min(input_c - c_pos, c_step);
      for (uint32_t h_pos = 0; h_pos < input_h; h_pos += h_step) {
        uint32_t tiling_h = std::min(input_h - h_pos, h_step);

        cvk_tl_t tl_ifmap, tl_ofmap, tl_mask, tl_working;
        alloc_lmem(ctx, tiling_c, tiling_h, input_c, input_h, input_w, output_c,
                   output_h, output_w, CVK_FMT_I8, 0, tl_ifmap, tl_working,
                   tl_mask, tl_ofmap);

        tdma_load(ctx, &tl_ifmap, data_gaddr, ifmap_stride, n_pos, c_pos, h_pos);
        tdma_load(ctx, &tl_mask, mask_gaddr, mask_stride, n_pos, c_pos, h_pos * scale);

        unpooling_compute(ctx, layer_id, scale, scale, &tl_ifmap, &tl_working, &tl_mask, &tl_ofmap);

        uint32_t oh_pos = h_pos * scale;
        crop_w = output_w - unpool_w;
        if (oh_pos + tiling_h * scale > unpool_h) {
          crop_h = oh_pos + tiling_h * scale - unpool_h;
        } else {
          crop_h = 0;
        }
        tdma_store(ctx, &tl_ofmap, output_gaddr, output_stride, n_pos, c_pos, h_pos * scale, crop_h, crop_w);
      }
    }
  }
}

RegisterCustomOp(unpooling, UnPoolingOp);

} // namespace cvi