// SDK_SG200x_V2/cviruntime/custom_op/example/UnPoolingOp.cpp
/*
* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
*/
#include "UnPoolingOp.h"
#include "QuantHelper.h"
#include <cvikernel/cvikernel.h>
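// cv18xx TIU geometry used for tiling: 32 NPU lanes, 16 execution units per
// lane and 32 KiB of local memory per lane. NEURON_MEMORY / WEIGHT_MEMORY are
// the TDMA base-register indices for the two global-memory regions.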
#define NPU_SHIFT 5
#define EU_SHIFT 4
#define NPU_NUM (1 << NPU_SHIFT)
#define EU_NUM (1 << EU_SHIFT)
#define LOCAL_MEM_SIZE (1 << 15)
#define NEURON_MEMORY 0
#define WEIGHT_MEMORY 1
namespace cvi {
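// CPU reference paths: the fp32 and int8 interpreters both forward to the same
// unpooling() helper (presumably declared in UnPoolingOp.h), since the op does
// no dtype-specific arithmetic.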
void UnPoolingOp::interpretFp32(
std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
std::vector<std::vector<int64_t>> &operand_shapes,
std::shared_ptr<std::vector<float>> &result_tensor,
std::vector<int64_t> &result_shape) {
unpooling(operand_tensors, operand_shapes, result_tensor, result_shape);
}
void UnPoolingOp::interpretInt8(
std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
std::vector<std::vector<int64_t>> &operand_shapes,
std::shared_ptr<std::vector<float>> &result_tensor,
std::vector<int64_t> &result_shape) {
unpooling(operand_tensors, operand_shapes, result_tensor, result_shape);
}
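// INT8 calibration hook. Unpooling only scatters values and zeroes the rest,
// so this just selects per-tensor quantization with the RSHIFT_AND_M_I8
// parameter type and logs the input/output thresholds; no rshift or multiplier
// is actually computed here.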
void UnPoolingOp::quantizeInt8() {
// support per-tensor only for now
setOpQuantPerchannel(false);
// use rshift and INT8 multiplier
setOpQuantParamType("RSHIFT_AND_M_I8");
// quantization
float threshold_x = getPrevOpThreshold();
float threshold_y = getOpThreshold();
std::cout << "threshold_y = " << std::to_string(threshold_y)
<< ", threshold_x = " << std::to_string(threshold_x) << "\n";
}
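// TPU code-generation entry point: unpacks the NCHW input shape, the data/mask
// global addresses and the scale/unpool_h/unpool_w attributes, then delegates
// to unpooling_codegen() below.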
void UnPoolingOp::codeGenInt8(void *ctx,
std::vector<std::vector<int64_t>> &operand_shapes,
std::vector<uint64_t> &operand_gaddrs,
std::vector<int64_t> &result_shape,
uint64_t result_gaddr, int layer_id) {
int n = operand_shapes[0][0];
int c = operand_shapes[0][1];
int h = operand_shapes[0][2];
int w = operand_shapes[0][3];
uint64_t data_gaddr = operand_gaddrs[0];
uint64_t mask_gaddr = operand_gaddrs[1];
uint64_t ga_output = result_gaddr;
int scale = param.get<int>("scale");
int unpool_h = param.get<int>("unpool_h");
int unpool_w = param.get<int>("unpool_w");
unpooling_codegen((cvk_context_t *)ctx, // ctx
layer_id, // layer_id
data_gaddr, // data_gaddr
mask_gaddr, // mask_gaddr
ga_output, // output_gaddr
n, c, h, w, // input shape
scale, unpool_h, unpool_w);
}
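// Lays out the four per-tile buffers back to back in local memory, starting at
// offset 0: the ifmap (tiling_h x input_w), a width-expanded working buffer
// (tiling_h x output_w), then the mask and ofmap at the upsampled height
// (tiling_h * (output_h / input_h) rows x output_w). Sizes come from
// lmem_tensor_to_size(), so the caller's tiling search must guarantee the
// total fits in one lane.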
void UnPoolingOp::alloc_lmem(cvk_context_t *ctx, uint32_t tiling_c, uint32_t tiling_h,
uint32_t input_c, uint32_t input_h, uint32_t input_w,
uint32_t output_c, uint32_t output_h, uint32_t output_w,
cvk_fmt_t fmt, int eu_align, cvk_tl_t &tl_ifmap, cvk_tl_t &tl_working,
cvk_tl_t &tl_mask, cvk_tl_t &tl_ofmap) {
uint32_t tl_offset = 0;
ctx->ops->lmem_init_tensor(ctx, &tl_ifmap, {1, tiling_c, tiling_h, input_w}, fmt,
eu_align);
tl_ifmap.start_address = tl_offset;
tl_offset += ctx->ops->lmem_tensor_to_size(ctx, tl_ifmap.shape, tl_ifmap.fmt,
tl_ifmap.eu_align);
ctx->ops->lmem_init_tensor(ctx, &tl_working, {1, tiling_c, tiling_h, output_w}, fmt,
eu_align);
tl_working.start_address = tl_offset;
tl_offset += ctx->ops->lmem_tensor_to_size(ctx, tl_working.shape, tl_working.fmt,
tl_working.eu_align);
uint32_t tiling_oh = tiling_h * (output_h / input_h);
ctx->ops->lmem_init_tensor(ctx, &tl_mask, {1, tiling_c, tiling_oh, output_w}, fmt,
eu_align);
tl_mask.start_address = tl_offset;
tl_offset += ctx->ops->lmem_tensor_to_size(ctx, tl_mask.shape, tl_mask.fmt,
tl_mask.eu_align);
ctx->ops->lmem_init_tensor(ctx, &tl_ofmap, {1, tiling_c, tiling_oh, output_w}, fmt,
eu_align);
tl_ofmap.start_address = tl_offset;
}
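// Loads one tile from global to local memory: builds a global-tensor view of
// ga_src offset by the n/c/h tile position (scaled by the given global
// strides) and issues a TDMA g2l copy into the pre-initialized local tensor.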
void UnPoolingOp::tdma_load(cvk_context_t *ctx, cvk_tl_t *tlp, uint64_t ga_src,
cvk_tg_stride_t stride, int32_t n_pos, int32_t c_pos, int32_t h_pos) {
cvk_tg_t ts_data = {0};
ts_data.base_reg_index = NEURON_MEMORY;
ts_data.fmt = tlp->fmt;
ts_data.start_address = ga_src + stride.n * n_pos + stride.c * c_pos + stride.h * h_pos;
ts_data.shape = {tlp->shape.n, tlp->shape.c, tlp->shape.h, tlp->shape.w};
ts_data.stride = stride;
cvk_tdma_g2l_tensor_copy_param_t p1 = {0};
p1.src = &ts_data;
p1.dst = tlp;
ctx->ops->tdma_g2l_tensor_copy(ctx, &p1);
}
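// Unpools one tile as nearest-neighbour upsampling followed by masking:
//   1. width expansion: a fake view of the ifmap with N-stride 0 is read
//      scale_w times, and the destination view interleaves those copies with a
//      W-stride of scale_w, so each input element is repeated scale_w times
//      along W into the working buffer;
//   2. height expansion: the same N-stride-0 trick on the working buffer with
//      an H-stride of scale_h on the ofmap repeats each row scale_h times;
//   3. the upsampled tile is multiplied element-wise by the 0/1 mask so only
//      the positions recorded by the pooling argmax keep their values.
// Example with scale 2: the row [a b] becomes [a a b b], is duplicated into
// two rows, and a mask of [[0 1 0 0], [0 0 1 0]] leaves [[0 a 0 0], [0 0 b 0]].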
void UnPoolingOp::unpooling_compute(
cvk_context_t *ctx, uint32_t layer_id, int scale_h, int scale_w,
cvk_tl_t *tl_ifmap, cvk_tl_t *tl_working, cvk_tl_t *tl_mask, cvk_tl_t *tl_ofmap) {
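// Step 1: width expansion via strided views.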
cvk_tl_stride_t tl_ifmap_fake_stride = {0, tl_ifmap->stride.c, tl_ifmap->stride.h, tl_ifmap->stride.w};
cvk_tl_t tl_ifmap_fake = {0};
tl_ifmap_fake.start_address = tl_ifmap->start_address;
tl_ifmap_fake.fmt = tl_ifmap->fmt;
tl_ifmap_fake.shape = {scale_w, tl_ifmap->shape.c, tl_ifmap->shape.h, tl_ifmap->shape.w};
tl_ifmap_fake.stride = tl_ifmap_fake_stride;
tl_ifmap_fake.eu_align = tl_ifmap->eu_align;
cvk_tl_stride_t tl_working_fake_stride = {
tl_working->stride.w, tl_working->stride.c,
tl_working->stride.h, tl_working->stride.w * scale_w};
cvk_tl_t tl_working_fake = {0};
tl_working_fake.start_address = tl_working->start_address;
tl_working_fake.fmt = tl_working->fmt;
tl_working_fake.shape = {scale_w, tl_ifmap->shape.c, tl_ifmap->shape.h, tl_ifmap->shape.w};
tl_working_fake.stride = tl_working_fake_stride;
tl_working_fake.eu_align = tl_working->eu_align;
cvk_tiu_copy_param_t param = {0};
param.dst = &tl_working_fake;
param.src = &tl_ifmap_fake;
param.layer_id = layer_id;
ctx->ops->tiu_copy(ctx, &param);
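// Step 2: height expansion, writing into the output tile.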
cvk_tl_stride_t tl_working_fake2_stride = {0, tl_working->stride.c, tl_working->stride.h, tl_working->stride.w};
cvk_tl_t tl_working_fake2 = {0};
tl_working_fake2.start_address = tl_working->start_address;
tl_working_fake2.fmt = tl_working->fmt;
tl_working_fake2.shape = {scale_h, tl_ofmap->shape.c, tl_ifmap->shape.h, tl_ofmap->shape.w};
tl_working_fake2.stride = tl_working_fake2_stride;
tl_working_fake2.eu_align = tl_working->eu_align;
cvk_tl_stride_t tl_ofmap_fake_stride = {tl_ofmap->stride.h, tl_ofmap->stride.c, tl_ofmap->stride.h * scale_h, tl_ofmap->stride.w};
cvk_tl_t tl_ofmap_fake = {0};
tl_ofmap_fake.start_address = tl_ofmap->start_address;
tl_ofmap_fake.fmt = tl_ofmap->fmt;
tl_ofmap_fake.shape = {scale_h, tl_ofmap->shape.c, tl_ifmap->shape.h, tl_ofmap->shape.w};
tl_ofmap_fake.stride = tl_ofmap_fake_stride;
tl_ofmap_fake.eu_align = tl_ofmap->eu_align;
cvk_tiu_copy_param_t param2 = {0};
param2.dst = &tl_ofmap_fake;
param2.src = &tl_working_fake2;
param2.layer_id = layer_id;
ctx->ops->tiu_copy(ctx, &param2);
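// Step 3: keep only the argmax positions: ofmap *= mask (0/1).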
cvk_tiu_mul_param_t param3 = {0};
param3.res_high = nullptr;
param3.res_low = tl_ofmap;
param3.a = tl_ofmap;
param3.b_is_const = 0;
param3.b = tl_mask;
param3.layer_id = layer_id;
param3.rshift_bits = 0;
param3.relu_enable = 0;
ctx->ops->tiu_mul(ctx, &param3);
}
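// Stores one computed tile back to global memory, dropping the last crop_h
// rows and crop_w columns of the local tile; these are the surplus that
// appears when input_h * scale or input_w * scale exceeds unpool_h / unpool_w.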
void UnPoolingOp::tdma_store(cvk_context_t *ctx, cvk_tl_t *tlp,
uint64_t ga_dst, cvk_tg_stride_t stride,
uint32_t n_pos, uint32_t c_pos, uint32_t h_pos,
uint32_t crop_h, uint32_t crop_w) {
cvk_tl_t tl_src = {0};
tl_src.start_address = tlp->start_address;
tl_src.fmt = tlp->fmt;
tl_src.shape = {tlp->shape.n, tlp->shape.c, tlp->shape.h - crop_h, tlp->shape.w - crop_w};
tl_src.stride = tlp->stride;
cvk_tg_t tg_dst = {0};
tg_dst.base_reg_index = NEURON_MEMORY;
tg_dst.fmt = tlp->fmt;
tg_dst.start_address = ga_dst + stride.n * n_pos + stride.c * c_pos + stride.h * h_pos;
tg_dst.shape = {tlp->shape.n, tlp->shape.c, tlp->shape.h - crop_h, tlp->shape.w - crop_w};
tg_dst.stride = stride;
cvk_tdma_l2g_tensor_copy_param_t p1 = {0};
p1.src = &tl_src;
p1.dst = &tg_dst;
ctx->ops->tdma_l2g_tensor_copy(ctx, &p1);
}
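// Top-level generator: picks a (c_step, h_step) tiling that fits in local
// memory, then for every tile loads the data and mask, runs the
// upsample-and-mask compute, and stores the cropped result.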
void UnPoolingOp::unpooling_codegen(cvk_context_t *ctx, uint32_t layer_id,
uint64_t data_gaddr, uint64_t mask_gaddr, uint64_t output_gaddr,
int input_n, int input_c, int input_h, int input_w,
int scale, int unpool_h, int unpool_w) {
printf("unpooling_codegen:\n"
" layer_id %d\n"
" data_gddr: %lx, mask_gaddr: %lx, output_gaddr: %lx\n"
" input (%d, %d, %d, %d)\n"
" scale:%d, unpool_h:%d, unpool_w:%d\n",
layer_id, data_gaddr, mask_gaddr, output_gaddr, input_n, input_c, input_h,
input_w, scale, unpool_h, unpool_w);
// Split input based on local memory
uint32_t total_eu = NPU_NUM * EU_NUM;
uint32_t lane_size = LOCAL_MEM_SIZE;
uint32_t total_mem_size = NPU_NUM * LOCAL_MEM_SIZE;
uint32_t max_N = (1 << 12) - 1; // 1880v2: 12 bit
uint32_t max_W = (1 << 12) - 1; // 1880v2: 12 bit
uint32_t count = input_n * input_c * input_h * input_w;
uint32_t output_c = input_c;
uint32_t output_h = input_h * scale;
uint32_t output_w = input_w * scale;
uint32_t n_step = 1;
uint32_t c_step = 0;
uint32_t h_step = 0;
h_step = input_h;
uint32_t h_factor = scale;
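// Tiling search: shrink h_step from input_h downwards and, for each h_step,
// shrink c_step from input_c down to NPU_NUM until the ifmap, working, mask
// and ofmap tiles fit together in one lane's local memory (the ofmap size is
// counted twice to cover the identically shaped mask tile).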
for (; h_step > 0; --h_step) {
uint32_t total_size = UINT32_MAX; // stays over budget if the channel loop below never runs
for (c_step = input_c; c_step >= (uint32_t)NPU_NUM ; --c_step) {
cvk_tl_shape_t tiled_ifmap_shape = {1, c_step, h_step, input_w};
uint32_t tiled_ifmap_size =
ctx->ops->lmem_tensor_to_size(ctx, tiled_ifmap_shape, CVK_FMT_I8, 0);
cvk_tl_shape_t tiled_working_shape = {1, c_step, h_step, output_w};
uint32_t tiled_working_size =
ctx->ops->lmem_tensor_to_size(ctx, tiled_working_shape, CVK_FMT_I8, 0);
cvk_tl_shape_t tiled_ofmap_shape = {1, c_step, h_step * h_factor, output_w};
uint32_t tiled_ofmap_size =
ctx->ops->lmem_tensor_to_size(ctx, tiled_ofmap_shape, CVK_FMT_I8, 0);
total_size = tiled_ifmap_size + tiled_working_size + tiled_ofmap_size * 2;
if (total_size <= static_cast<uint32_t>(LOCAL_MEM_SIZE))
break;
}
if (total_size <= static_cast<uint32_t>(LOCAL_MEM_SIZE))
break;
}
printf("tiling: c_step %d, h_step %d\n", c_step, h_step);
assert(c_step && h_step && "Expect valid tiling");
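// Global-memory strides (element strides; one byte per element for int8): the
// input uses its own h/w, the mask uses the upsampled output_h x output_w
// layout, and the final result uses the requested unpool_h x unpool_w layout.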
cvk_tg_stride_t ifmap_stride = {
input_c * input_h * input_w,
input_h * input_w,
input_w};
cvk_tg_stride_t mask_stride = {
output_c * output_h * output_w,
output_h * output_w,
output_w};
cvk_tg_stride_t output_stride = {
output_c * unpool_h * unpool_w,
unpool_h * unpool_w,
unpool_w};
uint64_t output_offset = 0;
uint32_t crop_h = 0;
uint32_t crop_w = 0;
for (uint32_t n_pos = 0; n_pos < input_n; n_pos += n_step) {
for (uint32_t c_pos = 0; c_pos < input_c; c_pos += c_step) {
uint32_t tiling_c = std::min(input_c - c_pos, c_step);
for (uint32_t h_pos = 0; h_pos < input_h; h_pos += h_step) {
uint32_t tiling_h = std::min(input_h - h_pos, h_step);
cvk_tl_t tl_ifmap, tl_ofmap, tl_mask, tl_working;
alloc_lmem(ctx, tiling_c, tiling_h, input_c, input_h, input_w, output_c,
output_h, output_w, CVK_FMT_I8, 0, tl_ifmap, tl_working,
tl_mask, tl_ofmap);
tdma_load(ctx, &tl_ifmap, data_gaddr, ifmap_stride, n_pos, c_pos, h_pos);
tdma_load(ctx, &tl_mask, mask_gaddr, mask_stride, n_pos, c_pos, h_pos * scale);
unpooling_compute(ctx, layer_id, scale, scale, &tl_ifmap, &tl_working, &tl_mask, &tl_ofmap);
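// The upsampled tile can extend past the requested output size: always crop
// the width surplus, and crop rows only when this tile crosses unpool_h.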
uint32_t oh_pos = h_pos * scale;
crop_w = output_w - unpool_w;
if (oh_pos + tiling_h * scale > unpool_h) {
crop_h = oh_pos + tiling_h * scale - unpool_h;
} else {
crop_h = 0;
}
tdma_store(ctx, &tl_ofmap, output_gaddr, output_stride, n_pos, c_pos, h_pos * scale, crop_h, crop_w);
}
}
}
}
RegisterCustomOp(unpooling, UnPoolingOp);
} // namespace cvi