/*
 * Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
 */
#include "LeakyReluOp.h"
#include "QuantHelper.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <string>

#define NPU_SHIFT 5
#define EU_SHIFT 4
#define NPU_NUM (1 << NPU_SHIFT)
#define EU_NUM (1 << EU_SHIFT)
#define LOCAL_MEM_SIZE (1 << 15)
#define NEURON_MEMORY 0
#define WEIGHT_MEMORY 1

namespace cvi {

void LeakyReluOp::interpretFp32(
    std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
    std::vector<std::vector<int64_t>> &operand_shapes,
    std::shared_ptr<std::vector<float>> &result_tensor,
    std::vector<int64_t> &result_shape) {
  int n = operand_shapes[0][0];
  int c = operand_shapes[0][1];
  int h = operand_shapes[0][2];
  int w = operand_shapes[0][3];
  auto input = operand_tensors[0]->data();
  auto output = result_tensor->data();
  auto negative_slope = param.get<float>("negative_slope");

  // y = x for x >= 0, y = negative_slope * x for x < 0
  for (int i = 0; i < (int)operand_tensors[0]->size(); ++i) {
    if (input[i] >= 0) {
      output[i] = input[i];
    } else {
      output[i] = negative_slope * input[i];
    }
  }
}

void LeakyReluOp::interpretInt8(
    std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
    std::vector<std::vector<int64_t>> &operand_shapes,
    std::shared_ptr<std::vector<float>> &result_tensor,
    std::vector<int64_t> &result_shape) {
  int n = operand_shapes[0][0];
  int c = operand_shapes[0][1];
  int h = operand_shapes[0][2];
  int w = operand_shapes[0][3];
  auto input = operand_tensors[0]->data();
  auto quant_pos_rshift =
      param.has("rshift_pos") ? (float)param.get<int8_t>("rshift_pos") : 0.0f;
  auto quant_pos_multiplier =
      param.has("m_i8_pos") ? (float)param.get<int8_t>("m_i8_pos") : 0.0f;
  auto quant_neg_rshift = (float)param.get<int8_t>("rshift_neg");
  auto quant_neg_multiplier = (float)param.get<int8_t>("m_i8_neg");
  auto output = result_tensor->data();

  // rshift and saturate on output
  for (int i = 0; i < (int)operand_tensors[0]->size(); ++i) {
    if (input[i] > 0) {
      if (quant_pos_multiplier != 0.0f) {
        output[i] = (float)applyMultiplierAndRShiftAndSaturateInt8(
            input[i], (uint32_t)quant_pos_rshift, quant_pos_multiplier, false);
      } else {
        output[i] = input[i];
      }
    } else {
      output[i] = (float)applyMultiplierAndRShiftAndSaturateInt8(
          input[i], (uint32_t)quant_neg_rshift, quant_neg_multiplier, false);
    }
  }
}
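// Quantization scheme used below: each half of the leaky ReLU is approximated
// as y = saturate_i8((x * m) >> rshift), where the int8 multiplier m and the
// right shift satisfy m / 2^rshift ~= qscale. The positive half uses
// qscale_pos = threshold_x / threshold_y, the negative half uses
// qscale_neg = |qscale_pos * negative_slope|. For example, a qscale of 0.05
// could be decomposed as m = 102, rshift = 11 (102 / 2^11 ~= 0.0498).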
void LeakyReluOp::quantizeInt8() {
  // support per-tensor only for now
  setOpQuantPerchannel(false);
  // use rshift and INT8 multiplier
  setOpQuantParamType("RSHIFT_AND_M_I8");

  float negative_slope = param.get<float>("negative_slope");
  std::cout << "  negative_slope: " << std::to_string(negative_slope) << "\n";

  // rshift and multiplier for the positive and negative halves
  float rshift_pos = 0;
  float multiplier_pos = 0;
  float rshift_neg = 0;
  float multiplier_neg = 0;

  // quantization
  float threshold_x = getPrevOpThreshold();
  float threshold_y = getOpThreshold();
  std::cout << "threshold_y = " << std::to_string(threshold_y)
            << ", threshold_x = " << std::to_string(threshold_x) << "\n";

  // positive
  double qscale_pos = threshold_x / threshold_y;
  if (fabs(threshold_x - threshold_y) <
      1e-5 * std::min(threshold_x, threshold_y)) {
    // no positive scale
    rshift_pos = 0;
    multiplier_pos = 0;
    std::cout << "  Positive: no_scale\n";
  } else {
    uint32_t uint_multiplier_pos;
    rshift_pos = (float)findRShiftAndMultiplierFromQScale(
        qscale_pos, &uint_multiplier_pos, false);
    multiplier_pos = (float)uint_multiplier_pos;
    std::cout << "  Positive: ";
    std::cout << "  [multiplier : rshift] = ["
              << std::to_string(multiplier_pos) << " : "
              << std::to_string(rshift_pos) << "]\n";
  }

  // negative
  float qscale_neg = fabs(qscale_pos * negative_slope);
  uint32_t uint_multiplier_neg = 0;
  rshift_neg = (float)findRShiftAndMultiplierFromQScale(
      qscale_neg, &uint_multiplier_neg, false);
  multiplier_neg = (float)uint_multiplier_neg;
  std::cout << "  Negative: ";
  std::cout << "  [multiplier : rshift] = ["
            << std::to_string(multiplier_neg) << " : "
            << std::to_string(rshift_neg) << "]\n";

  // quant params are stored as int8 values (matching "RSHIFT_AND_M_I8")
  bool do_pos_scale = (multiplier_pos != 0.0) ? true : false;
  if (do_pos_scale) {
    param.put("rshift_pos", static_cast<int8_t>(rshift_pos));
    param.put("m_i8_pos", static_cast<int8_t>(multiplier_pos));
  }
  param.put("rshift_neg", static_cast<int8_t>(rshift_neg));
  param.put("m_i8_neg", static_cast<int8_t>(multiplier_neg));
}

void LeakyReluOp::codeGenInt8(void *ctx,
                              std::vector<std::vector<int64_t>> &operand_shapes,
                              std::vector<uint64_t> &operand_gaddrs,
                              std::vector<int64_t> &result_shape,
                              uint64_t result_gaddr, int layer_id) {
  auto pos_rshift =
      param.has("rshift_pos") ? param.get<int8_t>("rshift_pos") : 0;
  auto pos_m_i8 = param.has("m_i8_pos") ? param.get<int8_t>("m_i8_pos") : 0;
  auto neg_rshift =
      param.has("rshift_neg") ? param.get<int8_t>("rshift_neg") : 0;
  auto neg_m_i8 = param.has("m_i8_neg") ? param.get<int8_t>("m_i8_neg") : 0;
  assert(neg_m_i8);

  int n = operand_shapes[0][0];
  int c = operand_shapes[0][1];
  int h = operand_shapes[0][2];
  int w = operand_shapes[0][3];
  uint64_t operand_gaddr = operand_gaddrs[0];
  uint64_t ga_output = result_gaddr;

  leakyrelu_codegen((cvk_context_t *)ctx, // ctx
                    layer_id,             // layer_id
                    operand_gaddr,        // input_gaddr
                    result_gaddr,         // output_gaddr
                    n,                    // input_n
                    c,                    // input_c
                    h,                    // input_h
                    w,                    // input_w
                    pos_rshift,           // GT_right_shift_width
                    neg_rshift,           // LE_right_shift_width
                    pos_m_i8,             // GT_scale
                    neg_m_i8              // LE_scale
  );
}

void LeakyReluOp::tdma_load(cvk_context_t *ctx, cvk_tl_t *tlp,
                            uint64_t ga_src) {
  cvk_tg_t ts_data;
  ts_data.base_reg_index = NEURON_MEMORY;
  ts_data.fmt = tlp->fmt;
  ts_data.start_address = ga_src;
  ts_data.shape = {tlp->shape.n, tlp->shape.c, tlp->shape.h, tlp->shape.w};
  ts_data.stride = ctx->ops->tg_default_stride(ctx, ts_data.shape, ts_data.fmt);

  cvk_tdma_g2l_tensor_copy_param_t p1;
  p1.src = &ts_data;
  p1.dst = tlp;
  ctx->ops->tdma_g2l_tensor_copy(ctx, &p1);
}

void LeakyReluOp::tdma_store(cvk_context_t *ctx, cvk_tl_t *tlp,
                             uint64_t ga_dst) {
  cvk_tg_t ts_data;
  ts_data.base_reg_index = NEURON_MEMORY;
  ts_data.fmt = tlp->fmt;
  ts_data.start_address = ga_dst;
  ts_data.shape = {tlp->shape.n, tlp->shape.c, tlp->shape.h, tlp->shape.w};
  ts_data.stride = ctx->ops->tg_default_stride(ctx, ts_data.shape, ts_data.fmt);

  cvk_tdma_l2g_tensor_copy_param_t p1;
  p1.src = tlp;
  p1.dst = &ts_data;
  ctx->ops->tdma_l2g_tensor_copy(ctx, &p1);
}
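// Kernel strategy: the positive and negative halves are computed into separate
// local tensors and then merged:
//   relu = (max(bottom, 0) * GT_scale) >> GT_right_shift_width
//   neg  = (min(bottom, 0) * LE_scale) >> LE_right_shift_width
// At most one of relu/neg is non-zero for any element, so a bitwise OR of the
// two int8 tensors writes the merged result back into bottom. When GT_scale is
// 0 the positive half is an identity, and a single max (slope < 1) or min
// (slope >= 1) against the scaled copy of bottom is sufficient.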
void LeakyReluOp::leakyrelu_kernel(cvk_context_t *ctx, int layer_id,
                                   cvk_tl_t &bottom, cvk_tl_t &relu,
                                   cvk_tl_t &neg, int GT_right_shift_width,
                                   int LE_right_shift_width, int GT_scale,
                                   int LE_scale) {
  bool isIgnorePosPart = (GT_scale == 0);
  bool isSlopeSmallerThanOne = ((LE_scale >> LE_right_shift_width) == 0);

  if (isIgnorePosPart) {
    cvk_tiu_mul_param_t p4;
    p4.res_high = nullptr;
    p4.res_low = &relu;
    p4.a = &bottom;
    p4.b_const.val = LE_scale;
    p4.b_const.is_signed = true;
    p4.b_is_const = 1;
    p4.rshift_bits = LE_right_shift_width;
    p4.layer_id = layer_id;
    p4.relu_enable = 0;
    ctx->ops->tiu_mul(ctx, &p4);

    if (isSlopeSmallerThanOne) {
      cvk_tiu_max_param_t p1;
      p1.max = &bottom;
      p1.a = &bottom;
      p1.b = &relu;
      p1.b_is_const = 0;
      p1.layer_id = layer_id;
      ctx->ops->tiu_max(ctx, &p1);
    } else {
      cvk_tiu_min_param_t p1;
      p1.min = &bottom;
      p1.a = &bottom;
      p1.b = &relu;
      p1.b_is_const = 0;
      p1.layer_id = layer_id;
      ctx->ops->tiu_min(ctx, &p1);
    }
  } else {
    // 0. relu = relu(bottom)
    cvk_tiu_max_param_t p13;
    p13.max = &relu;
    p13.a = &bottom;
    p13.b_is_const = 1;
    p13.b_const.is_signed = 1;
    p13.b_const.val = 0;
    p13.layer_id = layer_id;
    ctx->ops->tiu_max(ctx, &p13);

    // 1. relu = (relu * GT_scale) >> GT_right_shift_width
    cvk_tiu_mul_param_t p;
    p.res_high = nullptr;
    p.res_low = &relu;
    p.a = &relu;
    p.b_const.val = GT_scale;
    p.b_const.is_signed = true;
    p.b_is_const = 1;
    p.rshift_bits = GT_right_shift_width;
    p.layer_id = layer_id;
    p.relu_enable = 0;
    ctx->ops->tiu_mul(ctx, &p);

    // 2. neg = min(0, bottom)
    cvk_tiu_min_param_t p7;
    p7.min = &neg;
    p7.a = &bottom;
    p7.b_is_const = 1;
    p7.b_const.val = 0;
    p7.b_const.is_signed = 1;
    p7.layer_id = layer_id;
    ctx->ops->tiu_min(ctx, &p7);

    // 3. neg(n,c,h,w) = (neg(n,c,h,w) * LE_scale) >> LE_right_shift_width
    cvk_tiu_mul_param_t p8;
    p8.res_high = nullptr;
    p8.res_low = &neg;
    p8.a = &neg;
    p8.b_const.val = LE_scale;
    p8.b_const.is_signed = true;
    p8.b_is_const = 1;
    p8.rshift_bits = LE_right_shift_width;
    p8.layer_id = layer_id;
    p8.relu_enable = 0;
    ctx->ops->tiu_mul(ctx, &p8);

    // 4. bottom = or(relu, neg)
    cvk_tiu_or_int8_param_t p9;
    p9.res = &bottom;
    p9.a = &relu;
    p9.b = &neg;
    p9.layer_id = layer_id;
    ctx->ops->tiu_or_int8(ctx, &p9);
  }
}
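// Tiling: the flat element count is processed in chunks of
// tiled_N * NPU_NUM * EU_NUM bytes, each laid out in local memory as a
// (tiled_N, NPU_NUM, 1, EU_NUM) int8 tensor. Three blobs (bottom, relu, neg)
// share one lane, so tiled_N is reduced until 3 * tiled_N * n_stride fits in
// LOCAL_MEM_SIZE. Any tail that does not fill a whole chunk is handled
// afterwards with a smaller tile.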
void LeakyReluOp::leakyrelu_codegen(cvk_context_t *ctx, uint32_t layer_id,
                                    uint64_t input_gaddr, uint64_t output_gaddr,
                                    int input_n, int input_c, int input_h,
                                    int input_w, int GT_right_shift_width,
                                    int LE_right_shift_width, int GT_scale,
                                    int LE_scale) {
  printf("leakyrelu_codegen:\n"
         "  layer_id %d\n"
         "  input_gaddr: %lx, output_gaddr: %lx\n"
         "  input (%d, %d, %d, %d)\n"
         "  GT_scale:%d, LE_scale:%d\n"
         "  GT_right_shift_width:%d, LE_right_shift_width:%d\n",
         layer_id, input_gaddr, output_gaddr, input_n, input_c, input_h,
         input_w, GT_scale, LE_scale, GT_right_shift_width,
         LE_right_shift_width);

  // Split input based on local memory
  uint32_t total_eu = NPU_NUM * EU_NUM;
  uint32_t lane_size = LOCAL_MEM_SIZE;
  uint32_t total_mem_size = NPU_NUM * LOCAL_MEM_SIZE;
  uint32_t max_N = (1 << 12) - 1; // 1880v2: 12 bit
  uint32_t max_W = (1 << 12) - 1; // 1880v2: 12 bit
  uint32_t count = input_n * input_c * input_h * input_w;
  uint32_t tiled_N = count / total_eu / 3; // 3 blobs
  tiled_N = (tiled_N > max_N) ? max_N : tiled_N;

  // local tensor shape (tiled_N, npu_num, 1, eu_num)
  cvk_tl_shape_t tl_shape = {tiled_N, static_cast<uint32_t>(NPU_NUM), 1,
                             static_cast<uint32_t>(EU_NUM)};
  cvk_tl_stride_t tl_stride =
      ctx->ops->tl_default_stride(ctx, tl_shape, CVK_FMT_I8, 1);

  // Find max tiled_N
  uint32_t required_size = 0;
  do {
    tl_shape.n = tiled_N;
    tl_stride = ctx->ops->tl_default_stride(ctx, tl_shape, CVK_FMT_I8, 1);
    required_size = 3 * tl_shape.n * tl_stride.n; // 3 blobs
    if (required_size <= lane_size) {
      break;
    }
  } while (--tiled_N);

  printf("  tiled_bottom shape (%d, %d, %d, %d), stride (%d, %d, %d, %d)\n"
         "  required_size %d kB/lane\n",
         tl_shape.n, tl_shape.c, tl_shape.h, tl_shape.w, tl_stride.n,
         tl_stride.c, tl_stride.h, tl_stride.w, required_size / 1024);

  assert(tiled_N);
  if (!tiled_N) {
    return;
  }

  // Tiled local memory layout:
  //   tiled bottom/result
  //   tiled relu
  //   tiled neg

  // Tiled bottom
  required_size /= 3; // for 3 blobs
  cvk_tl_t tl_tiled_bottom;
  tl_tiled_bottom.start_address = 0;
  tl_tiled_bottom.fmt = CVK_FMT_I8;
  tl_tiled_bottom.shape = tl_shape;
  tl_tiled_bottom.stride = tl_stride;

  // Tiled relu
  cvk_tl_t tl_tiled_relu = tl_tiled_bottom;
  tl_tiled_relu.start_address = tl_tiled_bottom.start_address + required_size;

  // Tiled neg
  cvk_tl_t tl_tiled_neg = tl_tiled_bottom;
  tl_tiled_neg.start_address = tl_tiled_relu.start_address + required_size;

  // In units of tiled_N * npu_num * eu_num
  uint32_t global_input_offset = 0;
  for (uint32_t i = 0; i < (count / total_eu / tiled_N); i++) {
    // Load as a chunk of contiguous memory in global memory; do not use global
    // shape/stride. Local memory uses the tensor shape to maximize eu
    // utilization.
    tdma_load(ctx, &tl_tiled_bottom, input_gaddr + global_input_offset);

    leakyrelu_kernel(ctx, layer_id, tl_tiled_bottom, tl_tiled_relu,
                     tl_tiled_neg, GT_right_shift_width, LE_right_shift_width,
                     GT_scale, LE_scale);

    // Store bottom as a chunk of contiguous memory; do not use global
    // shape/stride
    tdma_store(ctx, &tl_tiled_bottom, output_gaddr + global_input_offset);

    // Next input offset
    global_input_offset += tiled_N * total_eu;
  } // for (uint32_t i = 0; i < (count/total_eu/tiled_N); i++)
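  // The tail below does not fill a full (tiled_N, NPU_NUM, 1, EU_NUM) chunk,
  // so it is re-tiled: W starts at remaining / NPU_NUM and is halved (while N
  // doubles) until the three blobs fit in a lane and W stays within the 12-bit
  // limit. The remaining element count must be a multiple of NPU_NUM
  // (asserted below).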
  // Remaining count, in unit of npu_num * eu_num
  if (global_input_offset < count) {
    uint32_t tiled_W = (count - global_input_offset) / NPU_NUM;
    tiled_N = 1;
    do {
      tl_shape.n = tiled_N;
      tl_shape.w = tiled_W;
      tl_stride = ctx->ops->tl_default_stride(ctx, tl_shape, CVK_FMT_I8, 1);
      required_size = 3 * tl_shape.n * tl_stride.n; // 3 blobs
      if (required_size <= lane_size && (tiled_W <= max_W)) {
        break;
      } else {
        tiled_W /= 2;
        tiled_N *= 2;
      }
    } while (true);

    // Magic number for 2^12 - 1 - 32
    if ((count - global_input_offset) % NPU_NUM != 0) {
      std::cout << "Remaining size should align to npu_num, or die\n";
      assert(0);
    }

    // Update shape, stride
    tl_shape.n = tiled_N;
    tl_shape.w = tiled_W;
    tl_stride = ctx->ops->tl_default_stride(ctx, tl_shape, CVK_FMT_I8, 1);
    required_size = tl_shape.n * tl_stride.n;

    printf("  tiled_bottom shape (%d, %d, %d, %d), stride (%d, %d, %d, %d)\n"
           "  required_size %d kB/lane\n",
           tl_shape.n, tl_shape.c, tl_shape.h, tl_shape.w, tl_stride.n,
           tl_stride.c, tl_stride.h, tl_stride.w, required_size / 1024);

    // Tiled bottom
    tl_tiled_bottom.shape = tl_shape;
    tl_tiled_bottom.stride = tl_stride;

    // Tiled bottom with precise stride
    cvk_tl_t tl_tiled_bottom_precise_stride = tl_tiled_bottom;
    tl_tiled_bottom_precise_stride.stride = {
        static_cast<uint32_t>(tl_shape.h * tl_shape.w * sizeof(uint8_t)),
        static_cast<uint32_t>(tl_shape.h * tl_shape.w * sizeof(uint8_t)),
        static_cast<uint32_t>(tl_shape.w * sizeof(uint8_t)), sizeof(uint8_t)};

    printf("  tiled_bottom_precise shape (%d, %d, %d, %d), "
           "stride (%d, %d, %d, %d)\n"
           "  required_size %d kB/lane\n",
           tl_shape.n, tl_shape.c, tl_shape.h, tl_shape.w,
           tl_tiled_bottom_precise_stride.stride.n,
           tl_tiled_bottom_precise_stride.stride.c,
           tl_tiled_bottom_precise_stride.stride.h,
           tl_tiled_bottom_precise_stride.stride.w, required_size / 1024);

    // Tiled relu
    tl_tiled_relu = tl_tiled_bottom;
    tl_tiled_relu.start_address =
        tl_tiled_bottom.start_address + required_size;

    // Tiled neg
    tl_tiled_neg = tl_tiled_bottom;
    tl_tiled_neg.start_address = tl_tiled_relu.start_address + required_size;

    // Load as a chunk of contiguous memory in global memory; do not use global
    // shape/stride. Local memory uses the tensor shape to maximize eu
    // utilization.
    tdma_load(ctx, &tl_tiled_bottom, input_gaddr + global_input_offset);

    leakyrelu_kernel(ctx, layer_id, tl_tiled_bottom, tl_tiled_relu,
                     tl_tiled_neg, GT_right_shift_width, LE_right_shift_width,
                     GT_scale, LE_scale);

    // Store bottom as a chunk of contiguous memory; do not use global
    // shape/stride
    tdma_store(ctx, &tl_tiled_bottom, output_gaddr + global_input_offset);

    global_input_offset += tl_tiled_bottom_precise_stride.shape.n *
                           tl_tiled_bottom_precise_stride.stride.n * NPU_NUM;
  }

  // All elements must have been consumed at this point
  if (global_input_offset != count) {
    printf("global_input_offset != count (%d != %d)\n", global_input_offset,
           count);
    assert(0);
  }
}

RegisterCustomOp(leaky_relu, LeakyReluOp);

} // namespace cvi