From 88a2fed916dc12f2def1b368f88fb8b6e10b7e20 Mon Sep 17 00:00:00 2001 From: carbon Date: Fri, 31 May 2024 11:46:37 +0800 Subject: [PATCH] add cvikernel commit 9f1f57a19c3c281a931dfc71b318494487193d56 Author: sophgo-forum-service Date: Mon May 13 13:58:23 2024 +0800 [feat] cvikernel opensource for cv18xx soc. - 79b6a7, set lookup_interp_table layer_id. --- .version/2024-05-31.md | 1 + cvikernel/.gitignore | 1 + cvikernel/CMakeLists.txt | 128 + cvikernel/README.md | 56 + .../include/bmkernel/bm1822/1822_fp_convert.h | 334 +++ .../include/bmkernel/bm1822/bm1822_tdma_reg.h | 306 ++ .../include/bmkernel/bm1822/bm1822_tiu_reg.h | 599 ++++ .../include/bmkernel/bm1822/bm1822_tpu_cfg.h | 38 + .../include/bmkernel/bm1822/bm_vlc_compress.h | 703 +++++ .../include/bmkernel/bm1822/bmkernel_1822.h | 1176 ++++++++ .../include/bmkernel/bm1822/compression.h | 369 +++ .../bmkernel/bm1880v2/1880v2_fp_convert.h | 338 +++ .../bmkernel/bm1880v2/bm1880v2_tdma_reg.h | 301 ++ .../bmkernel/bm1880v2/bm1880v2_tiu_reg.h | 574 ++++ .../bmkernel/bm1880v2/bm1880v2_tpu_cfg.h | 37 + .../bmkernel/bm1880v2/bm_vlc_compress.h | 708 +++++ .../bmkernel/bm1880v2/bmkernel_1880v2.h | 1042 +++++++ .../include/bmkernel/bm1880v2/compression.h | 369 +++ .../include/bmkernel/bm1880v2/non_atomic.h | 300 ++ cvikernel/include/bmkernel/bm_kernel.h | 115 + cvikernel/include/bmkernel/bm_kernel_legacy.h | 223 ++ cvikernel/include/bmkernel/bm_regcpu.h | 72 + cvikernel/include/bmkernel/reg_bdcast.h | 37 + cvikernel/include/bmkernel/reg_tdma.h | 98 + cvikernel/include/bmkernel/reg_tiu.h | 20 + .../cvikernel/cv180x/cv180x_tdma_reg.h | 310 ++ .../include/cvikernel/cv180x/cv180x_tiu_reg.h | 622 ++++ .../include/cvikernel/cv180x/cv180x_tpu_cfg.h | 38 + .../cvikernel/cv181x/cv181x_tdma_reg.h | 310 ++ .../include/cvikernel/cv181x/cv181x_tiu_reg.h | 622 ++++ .../include/cvikernel/cv181x/cv181x_tpu_cfg.h | 38 + cvikernel/include/cvikernel/cvikernel.h | 1171 ++++++++ cvikernel/include/cvikernel/cvk_fp_convert.h | 333 +++ .../include/cvikernel/cvk_vlc_compress.h | 728 +++++ cvikernel/src/bm1822/bm_dmabuf.c | 423 +++ cvikernel/src/bm1822/bm_kernel.c | 586 ++++ cvikernel/src/bm1822/kernel_1822.h | 374 +++ cvikernel/src/bm1822/tdma.c | 1977 +++++++++++++ cvikernel/src/bm1822/tiu_average_pooling.c | 90 + cvikernel/src/bm1822/tiu_convolution.c | 176 ++ cvikernel/src/bm1822/tiu_convolution_qdm.c | 166 ++ .../src/bm1822/tiu_depthwise_convolution.c | 152 + .../bm1822/tiu_depthwise_convolution_qdm.c | 142 + cvikernel/src/bm1822/tiu_element_wise_add.c | 81 + cvikernel/src/bm1822/tiu_element_wise_and.c | 100 + cvikernel/src/bm1822/tiu_element_wise_copy.c | 42 + cvikernel/src/bm1822/tiu_element_wise_ge.c | 110 + cvikernel/src/bm1822/tiu_element_wise_mac.c | 68 + cvikernel/src/bm1822/tiu_element_wise_max.c | 56 + cvikernel/src/bm1822/tiu_element_wise_min.c | 58 + cvikernel/src/bm1822/tiu_element_wise_mul.c | 67 + .../src/bm1822/tiu_element_wise_mul_qdm.c | 67 + cvikernel/src/bm1822/tiu_element_wise_or.c | 100 + cvikernel/src/bm1822/tiu_element_wise_shift.c | 58 + cvikernel/src/bm1822/tiu_element_wise_sub.c | 68 + cvikernel/src/bm1822/tiu_element_wise_xor.c | 100 + cvikernel/src/bm1822/tiu_lookup_table.c | 112 + .../src/bm1822/tiu_matrix_multiplication.c | 151 + .../bm1822/tiu_matrix_multiplication_qdm.c | 151 + cvikernel/src/bm1822/tiu_max_pooling.c | 69 + cvikernel/src/bm1822/tiu_min_pooling.c | 120 + cvikernel/src/bm1880v2/bm_dmabuf.c | 410 +++ cvikernel/src/bm1880v2/bm_kernel.c | 594 ++++ cvikernel/src/bm1880v2/kernel_1880v2.h | 372 +++ 
cvikernel/src/bm1880v2/non_atomic/common.c | 1201 ++++++++ .../bm1880v2/non_atomic/fp32_bf16_kernel.c | 49 + cvikernel/src/bm1880v2/non_atomic/gen_lut.h | 182 ++ .../bm1880v2/non_atomic/hists_svm_kernel.c | 929 ++++++ .../src/bm1880v2/non_atomic/tiu_lut_atan.c | 1105 ++++++++ .../src/bm1880v2/non_atomic/tiu_lut_atan2.c | 1015 +++++++ .../src/bm1880v2/non_atomic/tiu_reciprocal.c | 167 ++ .../src/bm1880v2/non_atomic/tiu_reshape_c.c | 438 +++ .../src/bm1880v2/non_atomic/tiu_sigmoid.c | 277 ++ cvikernel/src/bm1880v2/non_atomic/tiu_sqrt.c | 138 + cvikernel/src/bm1880v2/tdma.c | 1960 +++++++++++++ cvikernel/src/bm1880v2/tiu_average_pooling.c | 79 + cvikernel/src/bm1880v2/tiu_convolution.c | 170 ++ cvikernel/src/bm1880v2/tiu_convolution_qdm.c | 160 ++ .../src/bm1880v2/tiu_depthwise_convolution.c | 148 + .../bm1880v2/tiu_depthwise_convolution_qdm.c | 122 + cvikernel/src/bm1880v2/tiu_element_wise_add.c | 79 + cvikernel/src/bm1880v2/tiu_element_wise_and.c | 100 + .../src/bm1880v2/tiu_element_wise_copy.c | 42 + cvikernel/src/bm1880v2/tiu_element_wise_mac.c | 67 + cvikernel/src/bm1880v2/tiu_element_wise_max.c | 56 + cvikernel/src/bm1880v2/tiu_element_wise_min.c | 58 + cvikernel/src/bm1880v2/tiu_element_wise_mul.c | 66 + .../src/bm1880v2/tiu_element_wise_mul_qdm.c | 66 + cvikernel/src/bm1880v2/tiu_element_wise_or.c | 100 + .../src/bm1880v2/tiu_element_wise_shift.c | 58 + cvikernel/src/bm1880v2/tiu_element_wise_sub.c | 68 + cvikernel/src/bm1880v2/tiu_element_wise_xor.c | 100 + cvikernel/src/bm1880v2/tiu_lookup_table.c | 113 + .../src/bm1880v2/tiu_matrix_multiplication.c | 149 + .../bm1880v2/tiu_matrix_multiplication_qdm.c | 150 + cvikernel/src/bm1880v2/tiu_max_pooling.c | 64 + cvikernel/src/bm1880v2/tiu_mdsum.c | 60 + cvikernel/src/bm_kernel.c | 67 + cvikernel/src/bmkernel_standard.h | 24 + cvikernel/src/cv180x/cvkcv180x.c | 885 ++++++ cvikernel/src/cv180x/cvkcv180x.h | 753 +++++ cvikernel/src/cv180x/tdma.c | 2267 +++++++++++++++ cvikernel/src/cv180x/tiu_add.c | 88 + cvikernel/src/cv180x/tiu_and.c | 111 + cvikernel/src/cv180x/tiu_average_pooling.c | 94 + cvikernel/src/cv180x/tiu_convolution.c | 175 ++ cvikernel/src/cv180x/tiu_copy.c | 47 + .../src/cv180x/tiu_depthwise_convolution.c | 147 + cvikernel/src/cv180x/tiu_ge.c | 123 + cvikernel/src/cv180x/tiu_lookup_table.c | 118 + cvikernel/src/cv180x/tiu_mac.c | 73 + .../src/cv180x/tiu_matrix_multiplication.c | 160 ++ .../src/cv180x/tiu_matrix_multiplication_qm.c | 153 + cvikernel/src/cv180x/tiu_max.c | 62 + cvikernel/src/cv180x/tiu_max_pooling.c | 74 + cvikernel/src/cv180x/tiu_min.c | 63 + cvikernel/src/cv180x/tiu_min_pooling.c | 140 + cvikernel/src/cv180x/tiu_mul.c | 72 + cvikernel/src/cv180x/tiu_mul_qm.c | 71 + cvikernel/src/cv180x/tiu_or.c | 112 + cvikernel/src/cv180x/tiu_pt_convolution.c | 183 ++ .../src/cv180x/tiu_pt_depthwise_convolution.c | 158 ++ cvikernel/src/cv180x/tiu_shift.c | 63 + cvikernel/src/cv180x/tiu_sub.c | 73 + cvikernel/src/cv180x/tiu_xor.c | 111 + cvikernel/src/cv181x/cvkcv181x.c | 885 ++++++ cvikernel/src/cv181x/cvkcv181x.h | 753 +++++ cvikernel/src/cv181x/tdma.c | 2267 +++++++++++++++ cvikernel/src/cv181x/tiu_add.c | 88 + cvikernel/src/cv181x/tiu_and.c | 111 + cvikernel/src/cv181x/tiu_average_pooling.c | 94 + cvikernel/src/cv181x/tiu_convolution.c | 175 ++ cvikernel/src/cv181x/tiu_copy.c | 47 + .../src/cv181x/tiu_depthwise_convolution.c | 147 + cvikernel/src/cv181x/tiu_ge.c | 123 + cvikernel/src/cv181x/tiu_lookup_table.c | 118 + cvikernel/src/cv181x/tiu_mac.c | 73 + .../src/cv181x/tiu_matrix_multiplication.c | 160 ++ 
.../src/cv181x/tiu_matrix_multiplication_qm.c | 153 + cvikernel/src/cv181x/tiu_max.c | 62 + cvikernel/src/cv181x/tiu_max_pooling.c | 74 + cvikernel/src/cv181x/tiu_min.c | 63 + cvikernel/src/cv181x/tiu_min_pooling.c | 140 + cvikernel/src/cv181x/tiu_mul.c | 72 + cvikernel/src/cv181x/tiu_mul_qm.c | 71 + cvikernel/src/cv181x/tiu_or.c | 112 + cvikernel/src/cv181x/tiu_pt_convolution.c | 183 ++ .../src/cv181x/tiu_pt_depthwise_convolution.c | 158 ++ cvikernel/src/cv181x/tiu_shift.c | 63 + cvikernel/src/cv181x/tiu_sub.c | 73 + cvikernel/src/cv181x/tiu_xor.c | 111 + cvikernel/src/cv1822/cvikernel_1822.c | 2507 +++++++++++++++++ cvikernel/src/cv1880v2/cvikernel_1880v2.c | 2410 ++++++++++++++++ cvikernel/src/cvikernel.c | 100 + cvikernel/src/engine_conductor.c | 255 ++ cvikernel/src/engine_conductor.h | 45 + cvikernel/src/engine_state.c | 33 + cvikernel/src/engine_state.h | 17 + cvikernel/src/kernel_internal.h | 94 + cvikernel/src/lmem.c | 115 + cvikernel/src/lmem.h | 21 + cvikernel/src/mode_manager.c | 142 + cvikernel/src/mode_manager.h | 94 + cvikernel/src/parallel_mode.c | 23 + cvikernel/src/serial_mode.c | 28 + cvikernel/src/stream_mode.c | 33 + cvikernel/tools/readcmdbuf.cpp | 361 +++ 167 files changed, 47814 insertions(+) create mode 100644 cvikernel/.gitignore create mode 100644 cvikernel/CMakeLists.txt create mode 100644 cvikernel/README.md create mode 100644 cvikernel/include/bmkernel/bm1822/1822_fp_convert.h create mode 100644 cvikernel/include/bmkernel/bm1822/bm1822_tdma_reg.h create mode 100644 cvikernel/include/bmkernel/bm1822/bm1822_tiu_reg.h create mode 100644 cvikernel/include/bmkernel/bm1822/bm1822_tpu_cfg.h create mode 100644 cvikernel/include/bmkernel/bm1822/bm_vlc_compress.h create mode 100644 cvikernel/include/bmkernel/bm1822/bmkernel_1822.h create mode 100644 cvikernel/include/bmkernel/bm1822/compression.h create mode 100755 cvikernel/include/bmkernel/bm1880v2/1880v2_fp_convert.h create mode 100644 cvikernel/include/bmkernel/bm1880v2/bm1880v2_tdma_reg.h create mode 100644 cvikernel/include/bmkernel/bm1880v2/bm1880v2_tiu_reg.h create mode 100644 cvikernel/include/bmkernel/bm1880v2/bm1880v2_tpu_cfg.h create mode 100644 cvikernel/include/bmkernel/bm1880v2/bm_vlc_compress.h create mode 100644 cvikernel/include/bmkernel/bm1880v2/bmkernel_1880v2.h create mode 100644 cvikernel/include/bmkernel/bm1880v2/compression.h create mode 100644 cvikernel/include/bmkernel/bm1880v2/non_atomic.h create mode 100644 cvikernel/include/bmkernel/bm_kernel.h create mode 100644 cvikernel/include/bmkernel/bm_kernel_legacy.h create mode 100644 cvikernel/include/bmkernel/bm_regcpu.h create mode 100644 cvikernel/include/bmkernel/reg_bdcast.h create mode 100644 cvikernel/include/bmkernel/reg_tdma.h create mode 100644 cvikernel/include/bmkernel/reg_tiu.h create mode 100644 cvikernel/include/cvikernel/cv180x/cv180x_tdma_reg.h create mode 100644 cvikernel/include/cvikernel/cv180x/cv180x_tiu_reg.h create mode 100644 cvikernel/include/cvikernel/cv180x/cv180x_tpu_cfg.h create mode 100644 cvikernel/include/cvikernel/cv181x/cv181x_tdma_reg.h create mode 100644 cvikernel/include/cvikernel/cv181x/cv181x_tiu_reg.h create mode 100644 cvikernel/include/cvikernel/cv181x/cv181x_tpu_cfg.h create mode 100644 cvikernel/include/cvikernel/cvikernel.h create mode 100644 cvikernel/include/cvikernel/cvk_fp_convert.h create mode 100644 cvikernel/include/cvikernel/cvk_vlc_compress.h create mode 100644 cvikernel/src/bm1822/bm_dmabuf.c create mode 100644 cvikernel/src/bm1822/bm_kernel.c create mode 100644 
cvikernel/src/bm1822/kernel_1822.h create mode 100644 cvikernel/src/bm1822/tdma.c create mode 100644 cvikernel/src/bm1822/tiu_average_pooling.c create mode 100644 cvikernel/src/bm1822/tiu_convolution.c create mode 100644 cvikernel/src/bm1822/tiu_convolution_qdm.c create mode 100644 cvikernel/src/bm1822/tiu_depthwise_convolution.c create mode 100644 cvikernel/src/bm1822/tiu_depthwise_convolution_qdm.c create mode 100644 cvikernel/src/bm1822/tiu_element_wise_add.c create mode 100644 cvikernel/src/bm1822/tiu_element_wise_and.c create mode 100644 cvikernel/src/bm1822/tiu_element_wise_copy.c create mode 100644 cvikernel/src/bm1822/tiu_element_wise_ge.c create mode 100644 cvikernel/src/bm1822/tiu_element_wise_mac.c create mode 100644 cvikernel/src/bm1822/tiu_element_wise_max.c create mode 100644 cvikernel/src/bm1822/tiu_element_wise_min.c create mode 100644 cvikernel/src/bm1822/tiu_element_wise_mul.c create mode 100644 cvikernel/src/bm1822/tiu_element_wise_mul_qdm.c create mode 100644 cvikernel/src/bm1822/tiu_element_wise_or.c create mode 100644 cvikernel/src/bm1822/tiu_element_wise_shift.c create mode 100644 cvikernel/src/bm1822/tiu_element_wise_sub.c create mode 100644 cvikernel/src/bm1822/tiu_element_wise_xor.c create mode 100644 cvikernel/src/bm1822/tiu_lookup_table.c create mode 100644 cvikernel/src/bm1822/tiu_matrix_multiplication.c create mode 100644 cvikernel/src/bm1822/tiu_matrix_multiplication_qdm.c create mode 100644 cvikernel/src/bm1822/tiu_max_pooling.c create mode 100644 cvikernel/src/bm1822/tiu_min_pooling.c create mode 100644 cvikernel/src/bm1880v2/bm_dmabuf.c create mode 100644 cvikernel/src/bm1880v2/bm_kernel.c create mode 100644 cvikernel/src/bm1880v2/kernel_1880v2.h create mode 100644 cvikernel/src/bm1880v2/non_atomic/common.c create mode 100644 cvikernel/src/bm1880v2/non_atomic/fp32_bf16_kernel.c create mode 100644 cvikernel/src/bm1880v2/non_atomic/gen_lut.h create mode 100644 cvikernel/src/bm1880v2/non_atomic/hists_svm_kernel.c create mode 100644 cvikernel/src/bm1880v2/non_atomic/tiu_lut_atan.c create mode 100644 cvikernel/src/bm1880v2/non_atomic/tiu_lut_atan2.c create mode 100644 cvikernel/src/bm1880v2/non_atomic/tiu_reciprocal.c create mode 100644 cvikernel/src/bm1880v2/non_atomic/tiu_reshape_c.c create mode 100644 cvikernel/src/bm1880v2/non_atomic/tiu_sigmoid.c create mode 100644 cvikernel/src/bm1880v2/non_atomic/tiu_sqrt.c create mode 100644 cvikernel/src/bm1880v2/tdma.c create mode 100644 cvikernel/src/bm1880v2/tiu_average_pooling.c create mode 100644 cvikernel/src/bm1880v2/tiu_convolution.c create mode 100644 cvikernel/src/bm1880v2/tiu_convolution_qdm.c create mode 100644 cvikernel/src/bm1880v2/tiu_depthwise_convolution.c create mode 100644 cvikernel/src/bm1880v2/tiu_depthwise_convolution_qdm.c create mode 100644 cvikernel/src/bm1880v2/tiu_element_wise_add.c create mode 100644 cvikernel/src/bm1880v2/tiu_element_wise_and.c create mode 100644 cvikernel/src/bm1880v2/tiu_element_wise_copy.c create mode 100644 cvikernel/src/bm1880v2/tiu_element_wise_mac.c create mode 100644 cvikernel/src/bm1880v2/tiu_element_wise_max.c create mode 100644 cvikernel/src/bm1880v2/tiu_element_wise_min.c create mode 100644 cvikernel/src/bm1880v2/tiu_element_wise_mul.c create mode 100644 cvikernel/src/bm1880v2/tiu_element_wise_mul_qdm.c create mode 100644 cvikernel/src/bm1880v2/tiu_element_wise_or.c create mode 100644 cvikernel/src/bm1880v2/tiu_element_wise_shift.c create mode 100644 cvikernel/src/bm1880v2/tiu_element_wise_sub.c create mode 100644 cvikernel/src/bm1880v2/tiu_element_wise_xor.c 
create mode 100644 cvikernel/src/bm1880v2/tiu_lookup_table.c create mode 100644 cvikernel/src/bm1880v2/tiu_matrix_multiplication.c create mode 100644 cvikernel/src/bm1880v2/tiu_matrix_multiplication_qdm.c create mode 100644 cvikernel/src/bm1880v2/tiu_max_pooling.c create mode 100644 cvikernel/src/bm1880v2/tiu_mdsum.c create mode 100644 cvikernel/src/bm_kernel.c create mode 100644 cvikernel/src/bmkernel_standard.h create mode 100644 cvikernel/src/cv180x/cvkcv180x.c create mode 100644 cvikernel/src/cv180x/cvkcv180x.h create mode 100644 cvikernel/src/cv180x/tdma.c create mode 100644 cvikernel/src/cv180x/tiu_add.c create mode 100644 cvikernel/src/cv180x/tiu_and.c create mode 100644 cvikernel/src/cv180x/tiu_average_pooling.c create mode 100644 cvikernel/src/cv180x/tiu_convolution.c create mode 100644 cvikernel/src/cv180x/tiu_copy.c create mode 100644 cvikernel/src/cv180x/tiu_depthwise_convolution.c create mode 100644 cvikernel/src/cv180x/tiu_ge.c create mode 100644 cvikernel/src/cv180x/tiu_lookup_table.c create mode 100644 cvikernel/src/cv180x/tiu_mac.c create mode 100644 cvikernel/src/cv180x/tiu_matrix_multiplication.c create mode 100644 cvikernel/src/cv180x/tiu_matrix_multiplication_qm.c create mode 100644 cvikernel/src/cv180x/tiu_max.c create mode 100644 cvikernel/src/cv180x/tiu_max_pooling.c create mode 100644 cvikernel/src/cv180x/tiu_min.c create mode 100644 cvikernel/src/cv180x/tiu_min_pooling.c create mode 100644 cvikernel/src/cv180x/tiu_mul.c create mode 100644 cvikernel/src/cv180x/tiu_mul_qm.c create mode 100644 cvikernel/src/cv180x/tiu_or.c create mode 100644 cvikernel/src/cv180x/tiu_pt_convolution.c create mode 100644 cvikernel/src/cv180x/tiu_pt_depthwise_convolution.c create mode 100644 cvikernel/src/cv180x/tiu_shift.c create mode 100644 cvikernel/src/cv180x/tiu_sub.c create mode 100644 cvikernel/src/cv180x/tiu_xor.c create mode 100644 cvikernel/src/cv181x/cvkcv181x.c create mode 100644 cvikernel/src/cv181x/cvkcv181x.h create mode 100644 cvikernel/src/cv181x/tdma.c create mode 100644 cvikernel/src/cv181x/tiu_add.c create mode 100644 cvikernel/src/cv181x/tiu_and.c create mode 100644 cvikernel/src/cv181x/tiu_average_pooling.c create mode 100644 cvikernel/src/cv181x/tiu_convolution.c create mode 100644 cvikernel/src/cv181x/tiu_copy.c create mode 100644 cvikernel/src/cv181x/tiu_depthwise_convolution.c create mode 100644 cvikernel/src/cv181x/tiu_ge.c create mode 100644 cvikernel/src/cv181x/tiu_lookup_table.c create mode 100644 cvikernel/src/cv181x/tiu_mac.c create mode 100644 cvikernel/src/cv181x/tiu_matrix_multiplication.c create mode 100644 cvikernel/src/cv181x/tiu_matrix_multiplication_qm.c create mode 100644 cvikernel/src/cv181x/tiu_max.c create mode 100644 cvikernel/src/cv181x/tiu_max_pooling.c create mode 100644 cvikernel/src/cv181x/tiu_min.c create mode 100644 cvikernel/src/cv181x/tiu_min_pooling.c create mode 100644 cvikernel/src/cv181x/tiu_mul.c create mode 100644 cvikernel/src/cv181x/tiu_mul_qm.c create mode 100644 cvikernel/src/cv181x/tiu_or.c create mode 100644 cvikernel/src/cv181x/tiu_pt_convolution.c create mode 100644 cvikernel/src/cv181x/tiu_pt_depthwise_convolution.c create mode 100644 cvikernel/src/cv181x/tiu_shift.c create mode 100644 cvikernel/src/cv181x/tiu_sub.c create mode 100644 cvikernel/src/cv181x/tiu_xor.c create mode 100644 cvikernel/src/cv1822/cvikernel_1822.c create mode 100755 cvikernel/src/cv1880v2/cvikernel_1880v2.c create mode 100644 cvikernel/src/cvikernel.c create mode 100644 cvikernel/src/engine_conductor.c create mode 100644 
cvikernel/src/engine_conductor.h create mode 100644 cvikernel/src/engine_state.c create mode 100644 cvikernel/src/engine_state.h create mode 100644 cvikernel/src/kernel_internal.h create mode 100644 cvikernel/src/lmem.c create mode 100644 cvikernel/src/lmem.h create mode 100644 cvikernel/src/mode_manager.c create mode 100644 cvikernel/src/mode_manager.h create mode 100644 cvikernel/src/parallel_mode.c create mode 100644 cvikernel/src/serial_mode.c create mode 100644 cvikernel/src/stream_mode.c create mode 100644 cvikernel/tools/readcmdbuf.cpp diff --git a/.version/2024-05-31.md b/.version/2024-05-31.md index cf463e248..e1c3913e4 100644 --- a/.version/2024-05-31.md +++ b/.version/2024-05-31.md @@ -17,3 +17,4 @@ | FreeRTOS-Kernel | freertos/Source | https://github.com/sophgo/FreeRTOS-Kernel.git | sg200x-dev | d52c1b6e6 | | Lab-Project-FreeRTOS-POSIX | freertos/Source/FreeRTOS-Plus-POSIX | https://github.com/sophgo/Lab-Project-FreeRTOS-POSIX.git | sg200x-dev | 5042bfd | | cvibuilder | cvibuilder | https://github.com/sophgo/cvibuilder.git | sg200x-dev | 4309f2a | +| cvikernel | cvikernel | https://github.com/sophgo/cvikernel.git | sg200x-dev | 9f1f57a | diff --git a/cvikernel/.gitignore b/cvikernel/.gitignore new file mode 100644 index 000000000..567609b12 --- /dev/null +++ b/cvikernel/.gitignore @@ -0,0 +1 @@ +build/ diff --git a/cvikernel/CMakeLists.txt b/cvikernel/CMakeLists.txt new file mode 100644 index 000000000..a2f58bef2 --- /dev/null +++ b/cvikernel/CMakeLists.txt @@ -0,0 +1,128 @@ +cmake_minimum_required(VERSION 3.1.0) + +project(cvikernel C CXX) + +set(CMAKE_C_STANDARD 99) +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS ON) + +set(SAFETY_FLAGS "-Werror -Wall -Wextra -fno-strict-aliasing") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAFETY_FLAGS}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAFETY_FLAGS}") +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +install(FILES include/bmkernel/bm_kernel.h + DESTINATION include/bmkernel) +install(FILES include/bmkernel/bm_kernel_legacy.h + DESTINATION include/bmkernel) +install(FILES include/bmkernel/bm_regcpu.h + DESTINATION include/bmkernel) +install(FILES include/bmkernel/reg_tiu.h + DESTINATION include/bmkernel) +install(FILES include/bmkernel/reg_tdma.h + DESTINATION include/bmkernel) +install(FILES include/bmkernel/reg_bdcast.h + DESTINATION include/bmkernel) +install(FILES include/bmkernel/bm1880v2/bmkernel_1880v2.h + DESTINATION include/bmkernel/bm1880v2) +install(FILES include/bmkernel/bm1880v2/non_atomic.h + DESTINATION include/bmkernel/bm1880v2) +install(FILES include/bmkernel/bm1880v2/1880v2_fp_convert.h + DESTINATION include/bmkernel/bm1880v2) +install(FILES include/bmkernel/bm1880v2/bm_vlc_compress.h + DESTINATION include/bmkernel/bm1880v2) +install(FILES include/bmkernel/bm1880v2/compression.h + DESTINATION include/bmkernel/bm1880v2) +install(FILES include/bmkernel/bm1880v2/bm1880v2_tiu_reg.h + DESTINATION include/bmkernel/bm1880v2) +install(FILES include/bmkernel/bm1880v2/bm1880v2_tdma_reg.h + DESTINATION include/bmkernel/bm1880v2) +install(FILES include/bmkernel/bm1880v2/bm1880v2_tpu_cfg.h + DESTINATION include/bmkernel/bm1880v2) +install(FILES include/bmkernel/bm1822/bmkernel_1822.h + DESTINATION include/bmkernel/bm1822) +install(FILES include/bmkernel/bm1822/1822_fp_convert.h + DESTINATION include/bmkernel/bm1822) +install(FILES include/bmkernel/bm1822/bm_vlc_compress.h + DESTINATION include/bmkernel/bm1822) +install(FILES include/bmkernel/bm1822/compression.h + DESTINATION 
include/bmkernel/bm1822) +install(FILES include/bmkernel/bm1822/bm1822_tiu_reg.h + DESTINATION include/bmkernel/bm1822) +install(FILES include/bmkernel/bm1822/bm1822_tdma_reg.h + DESTINATION include/bmkernel/bm1822) +install(FILES include/bmkernel/bm1822/bm1822_tpu_cfg.h + DESTINATION include/bmkernel/bm1822) +install(FILES include/cvikernel/cv181x/cv181x_tiu_reg.h + DESTINATION include/cvikernel/cv181x) +install(FILES include/cvikernel/cv181x/cv181x_tdma_reg.h + DESTINATION include/cvikernel/cv181x) +install(FILES include/cvikernel/cv181x/cv181x_tpu_cfg.h + DESTINATION include/cvikernel/cv181x) +install(FILES include/cvikernel/cv180x/cv180x_tiu_reg.h + DESTINATION include/cvikernel/cv180x) +install(FILES include/cvikernel/cv180x/cv180x_tdma_reg.h + DESTINATION include/cvikernel/cv180x) +install(FILES include/cvikernel/cv180x/cv180x_tpu_cfg.h + DESTINATION include/cvikernel/cv180x) +install(FILES include/cvikernel/cvikernel.h + DESTINATION include/cvikernel) +install(FILES include/cvikernel/cvk_fp_convert.h + DESTINATION include/cvikernel) +install(FILES include/cvikernel/cvk_vlc_compress.h + DESTINATION include/cvikernel) +enable_testing() + +include_directories(include) +include_directories(src) + +file(GLOB COMMON_SOURCES "src/*.c") +file(GLOB_RECURSE BM1822_SOURCES "src/bm1822/*.c") +file(GLOB_RECURSE BM1880v2_SOURCES "src/bm1880v2/*.c") +file(GLOB_RECURSE CV1822_SOURCES "src/cv1822/*.c") +file(GLOB_RECURSE CV1880v2_SOURCES "src/cv1880v2/*.c") +file(GLOB_RECURSE CV181X_SOURCES "src/cv181x/*.c") +file(GLOB_RECURSE CV180X_SOURCES "src/cv180x/*.c") + +SET(_SOURCES ${COMMON_SOURCES}) +SET(_SOURCES ${_SOURCES} ${BM1822_SOURCES}) +SET(_SOURCES ${_SOURCES} ${CV1822_SOURCES}) + +if (CHIP STREQUAL "cv181x") +SET(_SOURCES ${_SOURCES} ${CV181X_SOURCES}) +add_definitions(-DCHIPID=0x3) +elseif (CHIP STREQUAL "cv180x") +SET(_SOURCES ${_SOURCES} ${CV180X_SOURCES}) +add_definitions(-DCHIPID=0x4) +elseif(CHIP STREQUAL "cv183x") +add_definitions(-DCHIPID=0x1) +SET(_SOURCES ${_SOURCES} ${BM1880v2_SOURCES}) +SET(_SOURCES ${_SOURCES} ${CV1880v2_SOURCES}) +elseif(CHIP STREQUAL "cv182x") +add_definitions(-DCHIPID=0x2) +else() +# pc cmodel +add_definitions(-DCHIPID=0x0) +SET(_SOURCES ${_SOURCES} ${CV180X_SOURCES}) +SET(_SOURCES ${_SOURCES} ${CV181X_SOURCES}) +SET(_SOURCES ${_SOURCES} ${BM1880v2_SOURCES}) +SET(_SOURCES ${_SOURCES} ${CV1880v2_SOURCES}) +endif() + +# +# check for `enum-compare` +# for c compiler not treat enum-compare as [error](https://gcc.gnu.org/bugzilla/show_bug.cgi?id=30357), we could leverage c++ compiler check temporary +# you could refer [here](https://stackoverflow.com/questions/7690800/can-cmake-use-g-to-compile-c-files) for more details +# and [here](https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html) for compiler options c/c++ +# default check is off +#file(GLOB_RECURSE CHECK_SOURCE "src/bm1880v2/non_atomic/*.c") +#set_source_files_properties(${CHECK_SOURCE} PROPERTIES LANGUAGE CXX ) + +add_library(cvikernel SHARED ${_SOURCES}) +add_library(cvikernel-static STATIC ${_SOURCES}) +target_link_libraries(cvikernel m) # m for + +install(TARGETS cvikernel cvikernel-static DESTINATION lib) + +set(CVI_LIBS ${CVI_LIBS} cvikernel) diff --git a/cvikernel/README.md b/cvikernel/README.md new file mode 100644 index 000000000..3ad5ce1d6 --- /dev/null +++ b/cvikernel/README.md @@ -0,0 +1,56 @@ +# bmkernel + +## overview + +bmkernel is a lib for TPU instruction generation, serving as assembly. 
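
As a quick illustration of the register-level style used throughout this library, the generated descriptor headers added by this patch (for example `bm1822_tdma_reg.h`) expose a `reset_*` / `emit_*` / `parse_*` / `trace_*` quartet per engine. A minimal sketch of driving them is shown below; the 16-word buffer size follows from `emit_tdma_reg` writing `p[0]..p[15]`, while the standalone `main()` and the chosen field values are illustrative only, not part of the library:

```c
#include <stdint.h>
#include <stdio.h>
#include <bmkernel/bm1822/bm1822_tdma_reg.h>

int main(void)
{
    tdma_reg_t reg;
    uint32_t words[16] = {0};

    reset_tdma_reg(&reg);            /* start from the default field values      */
    reg.src_base_addr_low = 0x1000;  /* illustrative addresses/fields only       */
    reg.dst_base_addr_low = 0x2000;
    reg.layer_ID          = 42;

    emit_tdma_reg(&reg, words);      /* pack the struct into 16 descriptor words */

    tdma_reg_t readback;
    parse_tdma_reg(&readback, words);   /* unpack the words for inspection       */
    trace_tdma_reg(&readback, "tdma");  /* dump every field to stdout            */
    return 0;
}
```
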
+ +## dependency + +none + +## build + +assuming install to ../install_bmkernel + +``` +$ cd bmkernel +$ mkdir build +$ cd build +$ cmake -G Ninja -DCHIP=BM1880v2 -DCMAKE_INSTALL_PREFIX=../../install_bmkernel .. + +Build +$ cmake --build . +$ cmake --build . -- -v + +Install +$ cmake --build . --target install +$ cmake --build . --target install -- -v + +Test +$ cmake --build . --target test -- -v + +Uninstall +$ xargs rm < install_manifest.txt +``` + +## output + +``` +├── bin +│   └── readcmdbuf +├── include +│   └── bmkernel +│   ├── bm1880v2 +│   │   └── bmkernel_1880v2.h +│   ├── bm_kernel.h +│   └── bm_kernel_legacy.h +└── lib + ├── libbmkernel.so + └── libbmkernel-static.a +``` + +## TODO + +* add more testing +* mv assembly & disassembly here +* round trip testing, asm %s | disasm diff --git a/cvikernel/include/bmkernel/bm1822/1822_fp_convert.h b/cvikernel/include/bmkernel/bm1822/1822_fp_convert.h new file mode 100644 index 000000000..5e5b40ade --- /dev/null +++ b/cvikernel/include/bmkernel/bm1822/1822_fp_convert.h @@ -0,0 +1,334 @@ +#ifndef ATOMIC_FP_H_ +#define ATOMIC_FP_H_ + +#if __arm__ +#define __DISABLE_FENV__ +#endif + +#ifndef __DISABLE_FENV__ +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +static inline uint8_t convert_bf16_u8(uint16_t data); +static inline uint8_t _convert_bf16_u8(uint16_t data, int int8_rnd_md); +static inline int8_t _convert_bf16_s8(uint16_t data, int int8_rnd_md); +static inline int8_t convert_bf16_s8(uint16_t data); +static inline uint16_t convert_int8_bf16(uint8_t data, uint8_t sign); +static inline uint32_t convert_fp32_u32(float fp32); +static inline uint32_t convert_fp32_hex(float val); +static inline float convert_hex_fp32(uint32_t hval); + +static inline float convert_bf16_fp32(uint16_t bf16); +static inline uint16_t convert_fp32_bf16(float fp32); + +static inline void f32_integer(void *if32, void *o_integer, int integer_size, int accumulate, int int8_signed, int int8_rnd_md); +//static inline void f32_integer(void *if32, void *o_integer, + // 0 for 32 bit , 1 for 16 bit , 2 for 8 bit +// int integer_size, int accumulate = 0, int int8_signed = 1, int int8_rnd_md = 0); + +union convert_type_float { + float fval; + uint16_t bf16[2]; + uint32_t ival; +}; + +typedef union convert_type_float convert_int_float; +static const uint16_t NAN_VALUE = 0x7FC0; + +//static int round_mode = 0; +static uint8_t float_isnan(const float x) { + //return isnan(x); + return x != x; +} + +static inline int set_store_feround() +{ +#ifndef __DISABLE_FENV__ + int round_mode = fegetround(); + fesetround(FE_TOWARDZERO); + return round_mode; +#else + return 0; +#endif +} + +static inline void restore_feround(int round_mode) +{ +#ifndef __DISABLE_FENV__ + fesetround(round_mode); +#else + (void)round_mode; +#endif +} + +static inline uint8_t _convert_bf16_u8(uint16_t data, int int8_rnd_md) +{ + /* convert bf16 to float32*/ + float fp32; + convert_int_float convert_val; + fp32 = convert_bf16_fp32(data); + /* convert float32 to uint8_t*/ + f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 0, int8_rnd_md); + return (uint8_t) convert_val.ival; +} + +static inline uint8_t convert_bf16_u8(uint16_t data) +{ + return (uint8_t) _convert_bf16_u8(data, 0); +} + +static inline int8_t _convert_bf16_s8(uint16_t data, int int8_rnd_md) +{ + /* convert bf16 to float32*/ + float fp32; + convert_int_float convert_val; + fp32 = convert_bf16_fp32(data); + /* convert float32 to uint8_t*/ + f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 1, int8_rnd_md); + return (int8_t) 
convert_val.ival; +} + +static inline int8_t convert_bf16_s8(uint16_t data) +{ + return (int8_t) _convert_bf16_s8(data, 0); +} + +static inline uint16_t convert_int8_bf16(uint8_t data, uint8_t sign) +{ + int32_t val = sign ? (int8_t) data : (uint8_t) data; + /* need to round to bf16 mode */ + return convert_fp32_bf16((float) val); +} + +static inline uint16_t convert_fp32_bf16(float fp32) +{ + if (float_isnan(fp32)) + return NAN_VALUE; + convert_int_float convert_val; + convert_val.fval = fp32; + uint32_t input = convert_val.ival; + uint32_t lsb = (input >> 16) & 1; + uint32_t rounding_bias = 0x7fff + lsb; + input += rounding_bias; + convert_val.bf16[1] = (uint16_t) (input >> 16); + + /* HW behavior */ + if ((convert_val.bf16[1] & 0x7f80) == 0x7f80) { + convert_val.bf16[1] = 0x7f7f; + } + return convert_val.bf16[1]; +} + +static inline uint8_t convert_fp32_u8(float fp32) +{ + convert_int_float convert_val; + f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 0, 0); + return (uint8_t) convert_val.ival; +} + +static inline int8_t convert_fp32_s8(float fp32) +{ + convert_int_float convert_val; + f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 1, 0); + return (int8_t) convert_val.ival; +} + +static inline uint32_t convert_fp32_u32(float fp32) +{ + convert_int_float convert_val; + f32_integer((void*)&fp32, &convert_val.ival, 0, 0, 0, 0); + return (uint32_t) convert_val.ival; +} + +static inline int32_t convert_fp32_s32(float fp32) +{ + convert_int_float convert_val; + f32_integer((void*)&fp32, &convert_val.ival, 0, 0, 1, 0); + return (int32_t) convert_val.ival; +} + +/* convert hex to float directly */ +static inline float convert_hex_fp32(uint32_t hval) +{ + convert_int_float convert_val; + convert_val.ival = hval; + return convert_val.fval; +} +/* convert float to hex directly */ +static inline uint32_t convert_fp32_hex(float val) +{ + convert_int_float convert_val; + convert_val.fval = val; + return convert_val.ival; +} +static inline float convert_bf16_fp32(uint16_t bf16) +{ + convert_int_float convert_val; + convert_val.bf16[1] = bf16; + convert_val.bf16[0] = 0; + return convert_val.fval; +} + +static inline void flt2int_flt(float x, unsigned long long* integer_part, float * sub_part, uint8_t sign) +{ + convert_int_float work_x; + int level_code; + unsigned long tail_code; + work_x.fval = x; + level_code = ((work_x.ival >> 23) & 0xff) - 127; + + //if the level code is negaive, the integer part of the float is zero + if ( level_code < 0 ){ + *integer_part = 0; + *sub_part = x; + } + else { + tail_code = (work_x.ival) & 0x7fffff; + tail_code = tail_code | 0x800000; + + if (level_code < 23){ + tail_code >>= (23 - level_code); + *integer_part = tail_code; + work_x.ival &= 0xffffffff << (23 - level_code); + *sub_part = x - work_x.fval; + } + else { + tail_code <<= (level_code - 23); + *integer_part = tail_code; + if(level_code>30){ + *integer_part = 0x7fffffff; + if(sign)*integer_part = 0x800000000; + } + *sub_part = 0; + } + } +} + +inline static int flt2int(float ifval, int int8_rnd_md) +{ + union { + float floatNum; + unsigned long intNum; + } tempIfval; + tempIfval.floatNum = ifval; + uint8_t isPositive = ((tempIfval.intNum & 0x80000000UL) == 0x80000000UL) ? false : true ; + float abs_fval = (!isPositive) ? 
-ifval : ifval; + float sub_part; + unsigned long long integer_part; + uint8_t sign = !isPositive; + flt2int_flt(abs_fval, &integer_part, &sub_part, sign); + if (!isPositive) + { + unsigned long long result; + if(int8_rnd_md == 0) { // round to nearest even + if ( sub_part > 0.5f ) + { + result = integer_part + 1; + } + else if (sub_part == 0.5f) + { + if ( integer_part & 0x1 ) + { + result = integer_part + 1; + } + else + { + result = integer_part; + } + } + else + { + result = integer_part; + } + } else { //round to zero + result = integer_part; + } + if ( result > 0x80000000UL ) + { + result = 0x80000000UL; + } + return -result; + } + else + { + unsigned long long result; + if(int8_rnd_md == 0) { // round to nearest even + if ( sub_part > 0.5f ) + { + result = integer_part + 1; + } + else if ( sub_part == 0.5f ) + { + if ( integer_part & 0x1 ) + { + result = integer_part + 1; + } + else + { + result = integer_part; + } + } + else + { + result = integer_part; + } + } else { + result = integer_part; + } + if ( result > 0x7fffffff ) + { + result = 0x7fffffff; + } + return result; + } +} + +static inline void f32_integer(void *if32, void *o_integer, int integer_size, int accumulate, int int8_signed, int int8_rnd_md) +{ + int i_tmp; + float *f_tmp; + f_tmp = (float *)if32; + i_tmp = flt2int(*f_tmp, int8_rnd_md); + int *o32 = (int *)o_integer; + int dst_f32 = *o32; + short *o16 = (short *)o_integer; + short dst_o16 = *o32; + char *o8 = (char *)o_integer; + char dst_o8 = *o8; + + if (integer_size == 0) { + *o32 = i_tmp; + } else if (integer_size == 1) { + *o16 = i_tmp; + } else{ + *o8 = i_tmp; + int min = (int8_signed) ? -128 : 0; + int max = (int8_signed) ? 127 : 255; + if (i_tmp < min ){ + *o8 = min; + } + else if (i_tmp > max){ + *o8 = max; + } + //*o8 = i_tmp; + } + if (accumulate) { + if (integer_size == 0) { + *o32 += dst_f32; + } else if (integer_size == 1) { + *o16 += dst_o16; + } else + *o8 += dst_o8; + } +} + +#ifdef __cplusplus +} +#endif + +#endif /* ATOMIC_FP_H_ */ + diff --git a/cvikernel/include/bmkernel/bm1822/bm1822_tdma_reg.h b/cvikernel/include/bmkernel/bm1822/bm1822_tdma_reg.h new file mode 100644 index 000000000..fc4b43b90 --- /dev/null +++ b/cvikernel/include/bmkernel/bm1822/bm1822_tdma_reg.h @@ -0,0 +1,306 @@ +#ifndef BM1822_TDMA_REG_H +#define BM1822_TDMA_REG_H + +/* + * This file is generated by tools. Do not edit it manually. 
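+ *
+ * Layout note: the TDMA descriptor occupies 16 32-bit words; parse_tdma_reg()
+ * below unpacks a word array p[0..15] into the named bit-fields and
+ * emit_tdma_reg() performs the inverse packing.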
+ */ + +#include +#include + +typedef unsigned long long ullong; + +typedef struct { + uint32_t vld; + uint32_t compress_en; + uint32_t eod; + uint32_t intp_en; + uint32_t bar_en; + uint32_t check_bf16_value; + uint32_t trans_dir; + uint32_t rsv00; + uint32_t trans_fmt; + uint32_t transpose_md; + uint32_t rsv01; + uint32_t intra_cmd_paral; + uint32_t outstanding_en; + uint32_t cmd_id; + uint32_t spec_func; + uint32_t dst_fmt; + uint32_t src_fmt; + uint32_t cmprs_fmt; + uint32_t sys_dtype; + uint32_t rsv2_1; + uint32_t int8_sign; + uint32_t compress_zero_guard; + uint32_t int8_rnd_mode; + uint32_t wait_id_tpu; + uint32_t wait_id_other_tdma; + uint32_t wait_id_sdma; + uint32_t const_val; + uint32_t src_base_reg_sel; + uint32_t mv_lut_idx; + uint32_t dst_base_reg_sel; + uint32_t mv_lut_base; + uint32_t rsv4_5; + uint32_t dst_h_stride; + uint32_t dst_c_stride_low; + uint32_t dst_n_stride; + uint32_t src_h_stride; + uint32_t src_c_stride_low; + uint32_t src_n_stride; + uint32_t dst_c; + uint32_t src_c; + uint32_t dst_w; + uint32_t dst_h; + uint32_t src_w; + uint32_t src_h; + uint32_t dst_base_addr_low; + uint32_t src_base_addr_low; + uint32_t src_n; + uint32_t dst_base_addr_high; + uint32_t src_base_addr_high; + uint32_t src_c_stride_high; + uint32_t dst_c_stride_high; + uint32_t compress_bias0; + uint32_t compress_bias1; + uint32_t layer_ID; +} tdma_reg_t; + +static inline void parse_tdma_reg(tdma_reg_t *r, const uint32_t *p) +{ + r->vld = p[0] & 1; + r->compress_en = (p[0] >> 1) & 1; + r->eod = (p[0] >> 2) & 1; + r->intp_en = (p[0] >> 3) & 1; + r->bar_en = (p[0] >> 4) & 1; + r->check_bf16_value = (p[0] >> 5) & 1; + r->trans_dir = (p[0] >> 6) & ((1u << 2) - 1); + r->rsv00 = (p[0] >> 8) & ((1u << 2) - 1); + r->trans_fmt = (p[0] >> 10) & 1; + r->transpose_md = (p[0] >> 11) & ((1u << 2) - 1); + r->rsv01 = (p[0] >> 13) & 1; + r->intra_cmd_paral = (p[0] >> 14) & 1; + r->outstanding_en = (p[0] >> 15) & 1; + r->cmd_id = (p[0] >> 16) & ((1u << 16) - 1); + r->spec_func = p[1] & ((1u << 3) - 1); + r->dst_fmt = (p[1] >> 3) & ((1u << 2) - 1); + r->src_fmt = (p[1] >> 5) & ((1u << 2) - 1); + r->cmprs_fmt = (p[1] >> 7) & 1; + r->sys_dtype = (p[1] >> 8) & 1; + r->rsv2_1 = (p[1] >> 9) & ((1u << 4) - 1); + r->int8_sign = (p[1] >> 13) & 1; + r->compress_zero_guard = (p[1] >> 14) & 1; + r->int8_rnd_mode = (p[1] >> 15) & 1; + r->wait_id_tpu = (p[1] >> 16) & ((1u << 16) - 1); + r->wait_id_other_tdma = p[2] & ((1u << 16) - 1); + r->wait_id_sdma = (p[2] >> 16) & ((1u << 16) - 1); + r->const_val = p[3] & ((1u << 16) - 1); + r->src_base_reg_sel = (p[3] >> 16) & ((1u << 3) - 1); + r->mv_lut_idx = (p[3] >> 19) & 1; + r->dst_base_reg_sel = (p[3] >> 20) & ((1u << 3) - 1); + r->mv_lut_base = (p[3] >> 23) & 1; + r->rsv4_5 = (p[3] >> 24) & ((1u << 8) - 1); + r->dst_h_stride = p[4] & ((1u << 16) - 1); + r->dst_c_stride_low = (p[4] >> 16) & ((1u << 16) - 1); + r->dst_n_stride = p[5]; + r->src_h_stride = p[6] & ((1u << 16) - 1); + r->src_c_stride_low = (p[6] >> 16) & ((1u << 16) - 1); + r->src_n_stride = p[7]; + r->dst_c = p[8] & ((1u << 16) - 1); + r->src_c = (p[8] >> 16) & ((1u << 16) - 1); + r->dst_w = p[9] & ((1u << 16) - 1); + r->dst_h = (p[9] >> 16) & ((1u << 16) - 1); + r->src_w = p[10] & ((1u << 16) - 1); + r->src_h = (p[10] >> 16) & ((1u << 16) - 1); + r->dst_base_addr_low = p[11]; + r->src_base_addr_low = p[12]; + r->src_n = p[13] & ((1u << 16) - 1); + r->dst_base_addr_high = (p[13] >> 16) & ((1u << 8) - 1); + r->src_base_addr_high = (p[13] >> 24) & ((1u << 8) - 1); + r->src_c_stride_high = p[14] & ((1u << 16) - 1); + 
r->dst_c_stride_high = (p[14] >> 16) & ((1u << 16) - 1); + r->compress_bias0 = p[15] & ((1u << 8) - 1); + r->compress_bias1 = (p[15] >> 8) & ((1u << 8) - 1); + r->layer_ID = (p[15] >> 16) & ((1u << 16) - 1); +} + +static inline void emit_tdma_reg(const tdma_reg_t *r, uint32_t *_p) +{ + volatile uint32_t *p = (typeof(p))_p; + p[15] = (r->compress_bias0 & ((1u << 8) - 1)) | + ((r->compress_bias1 & ((1u << 8) - 1)) << 8) | + ((r->layer_ID & ((1u << 16) - 1)) << 16); + p[14] = (r->src_c_stride_high & ((1u << 16) - 1)) | + ((r->dst_c_stride_high & ((1u << 16) - 1)) << 16); + p[13] = (r->src_n & ((1u << 16) - 1)) | + ((r->dst_base_addr_high & ((1u << 8) - 1)) << 16) | + ((r->src_base_addr_high & ((1u << 8) - 1)) << 24); + p[12] = (r->src_base_addr_low & (((uint64_t)1 << 32) - 1)); + p[11] = (r->dst_base_addr_low & (((uint64_t)1 << 32) - 1)); + p[10] = (r->src_w & ((1u << 16) - 1)) | + ((r->src_h & ((1u << 16) - 1)) << 16); + p[9] = (r->dst_w & ((1u << 16) - 1)) | + ((r->dst_h & ((1u << 16) - 1)) << 16); + p[8] = (r->dst_c & ((1u << 16) - 1)) | + ((r->src_c & ((1u << 16) - 1)) << 16); + p[7] = (r->src_n_stride & (((uint64_t)1 << 32) - 1)); + p[6] = (r->src_h_stride & ((1u << 16) - 1)) | + ((r->src_c_stride_low & ((1u << 16) - 1)) << 16); + p[5] = (r->dst_n_stride & (((uint64_t)1 << 32) - 1)); + p[4] = (r->dst_h_stride & ((1u << 16) - 1)) | + ((r->dst_c_stride_low & ((1u << 16) - 1)) << 16); + p[3] = (r->const_val & ((1u << 16) - 1)) | + ((r->src_base_reg_sel & ((1u << 3) - 1)) << 16) | + ((r->mv_lut_idx & 1) << 19) | + ((r->dst_base_reg_sel & ((1u << 3) - 1)) << 20) | + ((r->mv_lut_base & 1) << 23) | + ((r->rsv4_5 & ((1u << 8) - 1)) << 24); + p[2] = (r->wait_id_other_tdma & ((1u << 16) - 1)) | + ((r->wait_id_sdma & ((1u << 16) - 1)) << 16); + p[1] = (r->spec_func & ((1u << 3) - 1)) | + ((r->dst_fmt & ((1u << 2) - 1)) << 3) | + ((r->src_fmt & ((1u << 2) - 1)) << 5) | + ((r->cmprs_fmt & 1) << 7) | + ((r->sys_dtype & 1) << 8) | + ((r->rsv2_1 & ((1u << 4) - 1)) << 9) | + ((r->int8_sign & 1) << 13) | + ((r->compress_zero_guard & 1) << 14) | + ((r->int8_rnd_mode & 1) << 15) | + ((r->wait_id_tpu & ((1u << 16) - 1)) << 16); + p[0] = (r->vld & 1) | + ((r->compress_en & 1) << 1) | + ((r->eod & 1) << 2) | + ((r->intp_en & 1) << 3) | + ((r->bar_en & 1) << 4) | + ((r->check_bf16_value & 1) << 5) | + ((r->trans_dir & ((1u << 2) - 1)) << 6) | + ((r->rsv00 & ((1u << 2) - 1)) << 8) | + ((r->trans_fmt & 1) << 10) | + ((r->transpose_md & ((1u << 2) - 1)) << 11) | + ((r->rsv01 & 1) << 13) | + ((r->intra_cmd_paral & 1) << 14) | + ((r->outstanding_en & 1) << 15) | + ((r->cmd_id & ((1u << 16) - 1)) << 16); +} + +static inline void reset_tdma_reg(tdma_reg_t *r) +{ + r->vld = 0x0; + r->compress_en = 0x0; + r->eod = 0x0; + r->intp_en = 0x0; + r->bar_en = 0x0; + r->check_bf16_value = 0x0; + r->trans_dir = 0x0; + r->rsv00 = 0x0; + r->trans_fmt = 0x0; + r->transpose_md = 0x0; + r->rsv01 = 0x0; + r->intra_cmd_paral = 0x0; + r->outstanding_en = 0x0; + r->cmd_id = 0x0; + r->spec_func = 0x0; + r->dst_fmt = 0x1; + r->src_fmt = 0x1; + r->cmprs_fmt = 0x0; + r->sys_dtype = 0x0; + r->rsv2_1 = 0x0; + r->int8_sign = 0x0; + r->compress_zero_guard = 0x0; + r->int8_rnd_mode = 0x0; + r->wait_id_tpu = 0x0; + r->wait_id_other_tdma = 0x0; + r->wait_id_sdma = 0x0; + r->const_val = 0x0; + r->src_base_reg_sel = 0x0; + r->mv_lut_idx = 0x0; + r->dst_base_reg_sel = 0x0; + r->mv_lut_base = 0x0; + r->rsv4_5 = 0x0; + r->dst_h_stride = 0x1; + r->dst_c_stride_low = 0x1; + r->dst_n_stride = 0x1; + r->src_h_stride = 0x1; + r->src_c_stride_low = 0x1; + 
r->src_n_stride = 0x1; + r->dst_c = 0x1; + r->src_c = 0x1; + r->dst_w = 0x1; + r->dst_h = 0x1; + r->src_w = 0x1; + r->src_h = 0x1; + r->dst_base_addr_low = 0x0; + r->src_base_addr_low = 0x0; + r->src_n = 0x1; + r->dst_base_addr_high = 0x0; + r->src_base_addr_high = 0x0; + r->src_c_stride_high = 0x0; + r->dst_c_stride_high = 0x0; + r->compress_bias0 = 0x0; + r->compress_bias1 = 0x0; + r->layer_ID = 0x0; +} + +static inline void trace_tdma_reg(tdma_reg_t *r, const char *tag) +{ +#define trace_one_reg(name) \ + printf(" %s: 0x%llx\n", #name, (ullong)r->name) + + printf("--- %s ---\n", tag); + trace_one_reg(vld); + trace_one_reg(compress_en); + trace_one_reg(eod); + trace_one_reg(intp_en); + trace_one_reg(bar_en); + trace_one_reg(check_bf16_value); + trace_one_reg(trans_dir); + trace_one_reg(rsv00); + trace_one_reg(trans_fmt); + trace_one_reg(transpose_md); + trace_one_reg(rsv01); + trace_one_reg(intra_cmd_paral); + trace_one_reg(outstanding_en); + trace_one_reg(cmd_id); + trace_one_reg(spec_func); + trace_one_reg(dst_fmt); + trace_one_reg(src_fmt); + trace_one_reg(cmprs_fmt); + trace_one_reg(sys_dtype); + trace_one_reg(rsv2_1); + trace_one_reg(int8_sign); + trace_one_reg(compress_zero_guard); + trace_one_reg(int8_rnd_mode); + trace_one_reg(wait_id_tpu); + trace_one_reg(wait_id_other_tdma); + trace_one_reg(wait_id_sdma); + trace_one_reg(const_val); + trace_one_reg(src_base_reg_sel); + trace_one_reg(mv_lut_idx); + trace_one_reg(dst_base_reg_sel); + trace_one_reg(mv_lut_base); + trace_one_reg(rsv4_5); + trace_one_reg(dst_h_stride); + trace_one_reg(dst_c_stride_low); + trace_one_reg(dst_n_stride); + trace_one_reg(src_h_stride); + trace_one_reg(src_c_stride_low); + trace_one_reg(src_n_stride); + trace_one_reg(dst_c); + trace_one_reg(src_c); + trace_one_reg(dst_w); + trace_one_reg(dst_h); + trace_one_reg(src_w); + trace_one_reg(src_h); + trace_one_reg(dst_base_addr_low); + trace_one_reg(src_base_addr_low); + trace_one_reg(src_n); + trace_one_reg(dst_base_addr_high); + trace_one_reg(src_base_addr_high); + trace_one_reg(src_c_stride_high); + trace_one_reg(dst_c_stride_high); + trace_one_reg(compress_bias0); + trace_one_reg(compress_bias1); + trace_one_reg(layer_ID); +} +#endif /* BM1822_TDMA_REG_H */ diff --git a/cvikernel/include/bmkernel/bm1822/bm1822_tiu_reg.h b/cvikernel/include/bmkernel/bm1822/bm1822_tiu_reg.h new file mode 100644 index 000000000..2d8816594 --- /dev/null +++ b/cvikernel/include/bmkernel/bm1822/bm1822_tiu_reg.h @@ -0,0 +1,599 @@ +#ifndef BM1822_TIU_REG_H +#define BM1822_TIU_REG_H + +/* + * This file is generated by tools. Do not edit it manually. 
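+ *
+ * Layout note: the TIU descriptor occupies 28 32-bit words (p[0..27]); a few
+ * fields (res0_h, opd0_addr, opd1_n, opd1_w) straddle a word boundary and are
+ * reassembled from two adjacent words by parse_tiu_reg() below.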
+ */ + +#include +#include + +typedef unsigned long long ullong; + +typedef struct { + uint32_t cmd_en; + uint32_t cmd_end; + uint32_t cmd_id_en; + uint32_t cmd_keep; + uint32_t cmd_intr_en; + uint32_t tsk_typ; + uint32_t tsk_eu_typ; + uint32_t tsk_opd_num; + uint32_t opt_res_shift; + uint32_t opt_left_shift; + uint32_t opt_shift_typ; + uint32_t opt_rshift_typ; + uint32_t dummy1; + uint32_t opd_typ; + uint32_t opt_chl_quan; + uint32_t cmd_id_tpu; + uint32_t cmd_id_gdma; + uint32_t quan_m; + uint32_t opt_res0_sign; + uint32_t opt_opd0_sign; + uint32_t opt_opd1_sign; + uint32_t opt_opd2_sign; + uint32_t opt_res0_seg; + uint32_t opt_opd0_seg; + uint32_t opt_opd1_seg; + uint32_t opt_opd2_seg; + uint32_t ps32_md; + uint32_t double_conv; + uint32_t opt_left_tran; + uint32_t fp_round_typ; + uint32_t opt_relu_typ; + uint32_t opt_relu_value; + uint32_t cmd_pre_exe_typ; + uint32_t opt_res_add; + uint32_t rsvd0; + uint32_t conv_opd0_x_ins0; + uint32_t conv_opd0_y_ins0; + uint32_t conv_opd0_x_ins0_last; + uint32_t conv_opd0_y_ins0_last; + uint32_t conv_opd1_x_ins0; + uint32_t conv_opd1_y_ins0; + uint32_t dummy0; + uint32_t opd0_ins_val; + uint32_t conv_opd0_up_pad; + uint32_t conv_opd0_dn_pad; + uint32_t conv_opd0_lf_pad; + uint32_t conv_opd0_rt_pad; + uint32_t res0_n; + uint32_t res0_c; + uint32_t res0_h; + uint32_t res0_w; + uint32_t conv_op_x_str; + uint32_t conv_op_y_str; + uint32_t cmd_pre_exe; + uint32_t rsvd1; + uint32_t res0_addr; + uint32_t opd0_addr; + uint32_t opd1_addr; + uint32_t opd2_addr; + uint32_t opt_opd0_const; + uint32_t opt_opd1_const; + uint32_t opt_opd2_const; + uint32_t short_nchwstr_same; + uint32_t short_res0_str; + uint32_t short_opd0_str; + uint32_t short_opd1_str; + uint32_t short_opd2_str; + uint32_t dummy2; + uint32_t opd0_n; + uint32_t opd0_c; + uint32_t dummy3; + uint32_t rsvd2; + uint32_t opd0_h; + uint32_t opd0_w; + uint32_t opd1_n; + uint32_t opd1_c; + uint32_t opd1_h; + uint32_t opd1_w; + uint32_t opd2_n; + uint32_t opd2_c; + uint32_t opd2_h; + uint32_t opd2_w; + uint32_t dummy4; + uint32_t rsvd3; + uint32_t layer_info; + uint32_t res0_n_str; + uint32_t res0_c_str; + uint32_t res0_h_str; + uint32_t res0_w_str; + uint32_t res0_b_str; + uint32_t opd0_n_str; + uint32_t dummy5; + uint32_t rsvd4; + uint32_t opd0_c_str; + uint32_t opd0_h_str; + uint32_t opd0_w_str; + uint32_t opd0_b_str; + uint32_t opd1_n_str; + uint32_t opd1_c_str; + uint32_t opd1_h_str; + uint32_t dummy6; + uint32_t rsvd5; + uint32_t opd1_w_str; + uint32_t opd1_b_str; + uint32_t opd2_n_str; + uint32_t opd2_c_str; + uint32_t opd2_h_str; + uint32_t opd2_w_str; + uint32_t opd2_b_str; + uint32_t dummy7; + uint32_t rsvd6; +} tiu_reg_t; + +static inline void parse_tiu_reg(tiu_reg_t *r, const uint32_t *p) +{ + r->cmd_en = p[0] & 1; + r->cmd_end = (p[0] >> 1) & 1; + r->cmd_id_en = (p[0] >> 2) & 1; + r->cmd_keep = (p[0] >> 3) & 1; + r->cmd_intr_en = (p[0] >> 4) & 1; + r->tsk_typ = (p[0] >> 5) & ((1u << 4) - 1); + r->tsk_eu_typ = (p[0] >> 9) & ((1u << 5) - 1); + r->tsk_opd_num = (p[0] >> 14) & ((1u << 2) - 1); + r->opt_res_shift = (p[0] >> 16) & ((1u << 6) - 1); + r->opt_left_shift = (p[0] >> 22) & ((1u << 5) - 1); + r->opt_shift_typ = (p[0] >> 27) & 1; + r->opt_rshift_typ = (p[0] >> 28) & 1; + r->dummy1 = (p[0] >> 29) & 1; + r->opd_typ = (p[0] >> 30) & 1; + r->opt_chl_quan = (p[0] >> 31) & 1; + r->cmd_id_tpu = p[1] & ((1u << 16) - 1); + r->cmd_id_gdma = (p[1] >> 16) & ((1u << 16) - 1); + r->quan_m = p[2]; + r->opt_res0_sign = p[3] & 1; + r->opt_opd0_sign = (p[3] >> 1) & 1; + r->opt_opd1_sign = (p[3] >> 2) & 1; 
+ r->opt_opd2_sign = (p[3] >> 3) & 1; + r->opt_res0_seg = (p[3] >> 4) & ((1u << 2) - 1); + r->opt_opd0_seg = (p[3] >> 6) & ((1u << 2) - 1); + r->opt_opd1_seg = (p[3] >> 8) & ((1u << 2) - 1); + r->opt_opd2_seg = (p[3] >> 10) & 1; + r->ps32_md = (p[3] >> 11) & ((1u << 2) - 1); + r->double_conv = (p[3] >> 13) & 1; + r->opt_left_tran = (p[3] >> 14) & 1; + r->fp_round_typ = (p[3] >> 15) & 1; + r->opt_relu_typ = (p[3] >> 16) & ((1u << 2) - 1); + r->opt_relu_value = (p[3] >> 18) & ((1u << 8) - 1); + r->cmd_pre_exe_typ = (p[3] >> 26) & 1; + r->opt_res_add = (p[3] >> 27) & 1; + r->rsvd0 = (p[3] >> 28) & ((1u << 4) - 1); + r->conv_opd0_x_ins0 = p[4] & ((1u << 4) - 1); + r->conv_opd0_y_ins0 = (p[4] >> 4) & ((1u << 4) - 1); + r->conv_opd0_x_ins0_last = (p[4] >> 8) & ((1u << 4) - 1); + r->conv_opd0_y_ins0_last = (p[4] >> 12) & ((1u << 4) - 1); + r->conv_opd1_x_ins0 = (p[4] >> 16) & ((1u << 4) - 1); + r->conv_opd1_y_ins0 = (p[4] >> 20) & ((1u << 4) - 1); + r->dummy0 = (p[4] >> 24) & ((1u << 8) - 1); + r->opd0_ins_val = p[5] & ((1u << 16) - 1); + r->conv_opd0_up_pad = (p[5] >> 16) & ((1u << 4) - 1); + r->conv_opd0_dn_pad = (p[5] >> 20) & ((1u << 4) - 1); + r->conv_opd0_lf_pad = (p[5] >> 24) & ((1u << 4) - 1); + r->conv_opd0_rt_pad = (p[5] >> 28) & ((1u << 4) - 1); + r->res0_n = p[6] & ((1u << 12) - 1); + r->res0_c = (p[6] >> 12) & ((1u << 12) - 1); + r->res0_h = (p[6] >> 24) & ((1u << 8) - 1); + r->res0_h |= (uint64_t)(p[7] & ((1u << 4) - 1)) << 8; + r->res0_w = (p[7] >> 4) & ((1u << 12) - 1); + r->conv_op_x_str = (p[7] >> 16) & ((1u << 5) - 1); + r->conv_op_y_str = (p[7] >> 21) & ((1u << 5) - 1); + r->cmd_pre_exe = (p[7] >> 26) & ((1u << 2) - 1); + r->rsvd1 = (p[7] >> 28) & ((1u << 4) - 1); + r->res0_addr = p[8] & ((1u << 24) - 1); + r->opd0_addr = (p[8] >> 24) & ((1u << 8) - 1); + r->opd0_addr |= (uint64_t)(p[9] & ((1u << 16) - 1)) << 8; + r->opd1_addr = (p[9] >> 16) & ((1u << 16) - 1); + r->opd2_addr = p[10] & ((1u << 16) - 1); + r->opt_opd0_const = (p[10] >> 16) & 1; + r->opt_opd1_const = (p[10] >> 17) & 1; + r->opt_opd2_const = (p[10] >> 18) & 1; + r->short_nchwstr_same = (p[10] >> 19) & 1; + r->short_res0_str = (p[10] >> 20) & ((1u << 2) - 1); + r->short_opd0_str = (p[10] >> 22) & ((1u << 2) - 1); + r->short_opd1_str = (p[10] >> 24) & ((1u << 2) - 1); + r->short_opd2_str = (p[10] >> 26) & ((1u << 2) - 1); + r->dummy2 = (p[10] >> 28) & ((1u << 4) - 1); + r->opd0_n = p[11] & ((1u << 12) - 1); + r->opd0_c = (p[11] >> 12) & ((1u << 12) - 1); + r->dummy3 = (p[11] >> 24) & ((1u << 4) - 1); + r->rsvd2 = (p[11] >> 28) & ((1u << 4) - 1); + r->opd0_h = p[12] & ((1u << 12) - 1); + r->opd0_w = (p[12] >> 12) & ((1u << 12) - 1); + r->opd1_n = (p[12] >> 24) & ((1u << 8) - 1); + r->opd1_n |= (uint64_t)(p[13] & ((1u << 4) - 1)) << 8; + r->opd1_c = (p[13] >> 4) & ((1u << 12) - 1); + r->opd1_h = (p[13] >> 16) & ((1u << 12) - 1); + r->opd1_w = (p[13] >> 28) & ((1u << 4) - 1); + r->opd1_w |= (uint64_t)(p[14] & ((1u << 8) - 1)) << 4; + r->opd2_n = (p[14] >> 8) & ((1u << 12) - 1); + r->opd2_c = (p[14] >> 20) & ((1u << 12) - 1); + r->opd2_h = p[15] & ((1u << 12) - 1); + r->opd2_w = (p[15] >> 12) & ((1u << 12) - 1); + r->dummy4 = (p[15] >> 24) & ((1u << 4) - 1); + r->rsvd3 = (p[15] >> 28) & ((1u << 4) - 1); + r->layer_info = p[16] & ((1u << 16) - 1); + r->res0_n_str = (p[16] >> 16) & ((1u << 16) - 1); + r->res0_c_str = p[17] & ((1u << 16) - 1); + r->res0_h_str = (p[17] >> 16) & ((1u << 16) - 1); + r->res0_w_str = p[18] & ((1u << 16) - 1); + r->res0_b_str = (p[18] >> 16) & ((1u << 16) - 1); + r->opd0_n_str = p[19] & ((1u << 
16) - 1); + r->dummy5 = (p[19] >> 16) & ((1u << 12) - 1); + r->rsvd4 = (p[19] >> 28) & ((1u << 4) - 1); + r->opd0_c_str = p[20] & ((1u << 16) - 1); + r->opd0_h_str = (p[20] >> 16) & ((1u << 16) - 1); + r->opd0_w_str = p[21] & ((1u << 16) - 1); + r->opd0_b_str = (p[21] >> 16) & ((1u << 16) - 1); + r->opd1_n_str = p[22] & ((1u << 16) - 1); + r->opd1_c_str = (p[22] >> 16) & ((1u << 16) - 1); + r->opd1_h_str = p[23] & ((1u << 16) - 1); + r->dummy6 = (p[23] >> 16) & ((1u << 12) - 1); + r->rsvd5 = (p[23] >> 28) & ((1u << 4) - 1); + r->opd1_w_str = p[24] & ((1u << 16) - 1); + r->opd1_b_str = (p[24] >> 16) & ((1u << 16) - 1); + r->opd2_n_str = p[25] & ((1u << 16) - 1); + r->opd2_c_str = (p[25] >> 16) & ((1u << 16) - 1); + r->opd2_h_str = p[26] & ((1u << 16) - 1); + r->opd2_w_str = (p[26] >> 16) & ((1u << 16) - 1); + r->opd2_b_str = p[27] & ((1u << 16) - 1); + r->dummy7 = (p[27] >> 16) & ((1u << 12) - 1); + r->rsvd6 = (p[27] >> 28) & ((1u << 4) - 1); +} + +static inline void emit_tiu_reg(const tiu_reg_t *r, uint32_t *_p) +{ + volatile uint32_t *p = (typeof(p))_p; + p[27] = (r->opd2_b_str & ((1u << 16) - 1)) | + ((r->dummy7 & ((1u << 12) - 1)) << 16) | + ((r->rsvd6 & ((1u << 4) - 1)) << 28); + p[26] = (r->opd2_h_str & ((1u << 16) - 1)) | + ((r->opd2_w_str & ((1u << 16) - 1)) << 16); + p[25] = (r->opd2_n_str & ((1u << 16) - 1)) | + ((r->opd2_c_str & ((1u << 16) - 1)) << 16); + p[24] = (r->opd1_w_str & ((1u << 16) - 1)) | + ((r->opd1_b_str & ((1u << 16) - 1)) << 16); + p[23] = (r->opd1_h_str & ((1u << 16) - 1)) | + ((r->dummy6 & ((1u << 12) - 1)) << 16) | + ((r->rsvd5 & ((1u << 4) - 1)) << 28); + p[22] = (r->opd1_n_str & ((1u << 16) - 1)) | + ((r->opd1_c_str & ((1u << 16) - 1)) << 16); + p[21] = (r->opd0_w_str & ((1u << 16) - 1)) | + ((r->opd0_b_str & ((1u << 16) - 1)) << 16); + p[20] = (r->opd0_c_str & ((1u << 16) - 1)) | + ((r->opd0_h_str & ((1u << 16) - 1)) << 16); + p[19] = (r->opd0_n_str & ((1u << 16) - 1)) | + ((r->dummy5 & ((1u << 12) - 1)) << 16) | + ((r->rsvd4 & ((1u << 4) - 1)) << 28); + p[18] = (r->res0_w_str & ((1u << 16) - 1)) | + ((r->res0_b_str & ((1u << 16) - 1)) << 16); + p[17] = (r->res0_c_str & ((1u << 16) - 1)) | + ((r->res0_h_str & ((1u << 16) - 1)) << 16); + p[16] = (r->layer_info & ((1u << 16) - 1)) | + ((r->res0_n_str & ((1u << 16) - 1)) << 16); + p[15] = (r->opd2_h & ((1u << 12) - 1)) | + ((r->opd2_w & ((1u << 12) - 1)) << 12) | + ((r->dummy4 & ((1u << 4) - 1)) << 24) | + ((r->rsvd3 & ((1u << 4) - 1)) << 28); + p[14] = ((r->opd1_w >> 4) & ((1u << 8) - 1)) | + ((r->opd2_n & ((1u << 12) - 1)) << 8) | + ((r->opd2_c & ((1u << 12) - 1)) << 20); + p[13] = ((r->opd1_n >> 8) & ((1u << 4) - 1)) | + ((r->opd1_c & ((1u << 12) - 1)) << 4) | + ((r->opd1_h & ((1u << 12) - 1)) << 16) | + ((r->opd1_w & ((1u << 4) - 1)) << 28); + p[12] = (r->opd0_h & ((1u << 12) - 1)) | + ((r->opd0_w & ((1u << 12) - 1)) << 12) | + ((r->opd1_n & ((1u << 8) - 1)) << 24); + p[11] = (r->opd0_n & ((1u << 12) - 1)) | + ((r->opd0_c & ((1u << 12) - 1)) << 12) | + ((r->dummy3 & ((1u << 4) - 1)) << 24) | + ((r->rsvd2 & ((1u << 4) - 1)) << 28); + p[10] = (r->opd2_addr & ((1u << 16) - 1)) | + ((r->opt_opd0_const & 1) << 16) | + ((r->opt_opd1_const & 1) << 17) | + ((r->opt_opd2_const & 1) << 18) | + ((r->short_nchwstr_same & 1) << 19) | + ((r->short_res0_str & ((1u << 2) - 1)) << 20) | + ((r->short_opd0_str & ((1u << 2) - 1)) << 22) | + ((r->short_opd1_str & ((1u << 2) - 1)) << 24) | + ((r->short_opd2_str & ((1u << 2) - 1)) << 26) | + ((r->dummy2 & ((1u << 4) - 1)) << 28); + p[9] = ((r->opd0_addr >> 8) & ((1u << 16) - 1)) 
| + ((r->opd1_addr & ((1u << 16) - 1)) << 16); + p[8] = (r->res0_addr & ((1u << 24) - 1)) | + ((r->opd0_addr & ((1u << 8) - 1)) << 24); + p[7] = ((r->res0_h >> 8) & ((1u << 4) - 1)) | + ((r->res0_w & ((1u << 12) - 1)) << 4) | + ((r->conv_op_x_str & ((1u << 5) - 1)) << 16) | + ((r->conv_op_y_str & ((1u << 5) - 1)) << 21) | + ((r->cmd_pre_exe & ((1u << 2) - 1)) << 26) | + ((r->rsvd1 & ((1u << 4) - 1)) << 28); + p[6] = (r->res0_n & ((1u << 12) - 1)) | + ((r->res0_c & ((1u << 12) - 1)) << 12) | + ((r->res0_h & ((1u << 8) - 1)) << 24); + p[5] = (r->opd0_ins_val & ((1u << 16) - 1)) | + ((r->conv_opd0_up_pad & ((1u << 4) - 1)) << 16) | + ((r->conv_opd0_dn_pad & ((1u << 4) - 1)) << 20) | + ((r->conv_opd0_lf_pad & ((1u << 4) - 1)) << 24) | + ((r->conv_opd0_rt_pad & ((1u << 4) - 1)) << 28); + p[4] = (r->conv_opd0_x_ins0 & ((1u << 4) - 1)) | + ((r->conv_opd0_y_ins0 & ((1u << 4) - 1)) << 4) | + ((r->conv_opd0_x_ins0_last & ((1u << 4) - 1)) << 8) | + ((r->conv_opd0_y_ins0_last & ((1u << 4) - 1)) << 12) | + ((r->conv_opd1_x_ins0 & ((1u << 4) - 1)) << 16) | + ((r->conv_opd1_y_ins0 & ((1u << 4) - 1)) << 20) | + ((r->dummy0 & ((1u << 8) - 1)) << 24); + p[3] = (r->opt_res0_sign & 1) | + ((r->opt_opd0_sign & 1) << 1) | + ((r->opt_opd1_sign & 1) << 2) | + ((r->opt_opd2_sign & 1) << 3) | + ((r->opt_res0_seg & ((1u << 2) - 1)) << 4) | + ((r->opt_opd0_seg & ((1u << 2) - 1)) << 6) | + ((r->opt_opd1_seg & ((1u << 2) - 1)) << 8) | + ((r->opt_opd2_seg & 1) << 10) | + ((r->ps32_md & ((1u << 2) - 1)) << 11) | + ((r->double_conv & 1) << 13) | + ((r->opt_left_tran & 1) << 14) | + ((r->fp_round_typ & 1) << 15) | + ((r->opt_relu_typ & ((1u << 2) - 1)) << 16) | + ((r->opt_relu_value & ((1u << 8) - 1)) << 18) | + ((r->cmd_pre_exe_typ & 1) << 26) | + ((r->opt_res_add & 1) << 27) | + ((r->rsvd0 & ((1u << 4) - 1)) << 28); + p[2] = (r->quan_m & (((uint64_t)1 << 32) - 1)); + p[1] = (r->cmd_id_tpu & ((1u << 16) - 1)) | + ((r->cmd_id_gdma & ((1u << 16) - 1)) << 16); + p[0] = (r->cmd_en & 1) | + ((r->cmd_end & 1) << 1) | + ((r->cmd_id_en & 1) << 2) | + ((r->cmd_keep & 1) << 3) | + ((r->cmd_intr_en & 1) << 4) | + ((r->tsk_typ & ((1u << 4) - 1)) << 5) | + ((r->tsk_eu_typ & ((1u << 5) - 1)) << 9) | + ((r->tsk_opd_num & ((1u << 2) - 1)) << 14) | + ((r->opt_res_shift & ((1u << 6) - 1)) << 16) | + ((r->opt_left_shift & ((1u << 5) - 1)) << 22) | + ((r->opt_shift_typ & 1) << 27) | + ((r->opt_rshift_typ & 1) << 28) | + ((r->dummy1 & 1) << 29) | + ((r->opd_typ & 1) << 30) | + ((r->opt_chl_quan & 1) << 31); +} + +static inline void reset_tiu_reg(tiu_reg_t *r) +{ + r->cmd_en = 0x0; + r->cmd_end = 0x0; + r->cmd_id_en = 0x0; + r->cmd_keep = 0x0; + r->cmd_intr_en = 0x0; + r->tsk_typ = 0x0; + r->tsk_eu_typ = 0x0; + r->tsk_opd_num = 0x3; + r->opt_res_shift = 0xa; + r->opt_left_shift = 0x2; + r->opt_shift_typ = 0x1; + r->opt_rshift_typ = 0x1; + r->dummy1 = 0x0; + r->opd_typ = 0x0; + r->opt_chl_quan = 0x0; + r->cmd_id_tpu = 0x0; + r->cmd_id_gdma = 0x0; + r->quan_m = 0x0; + r->opt_res0_sign = 0x0; + r->opt_opd0_sign = 0x0; + r->opt_opd1_sign = 0x1; + r->opt_opd2_sign = 0x1; + r->opt_res0_seg = 0x1; + r->opt_opd0_seg = 0x1; + r->opt_opd1_seg = 0x1; + r->opt_opd2_seg = 0x0; + r->ps32_md = 0x0; + r->double_conv = 0x0; + r->opt_left_tran = 0x0; + r->fp_round_typ = 0x0; + r->opt_relu_typ = 0x0; + r->opt_relu_value = 0x0; + r->cmd_pre_exe_typ = 0x0; + r->opt_res_add = 0x0; + r->rsvd0 = 0x0; + r->conv_opd0_x_ins0 = 0x0; + r->conv_opd0_y_ins0 = 0x0; + r->conv_opd0_x_ins0_last = 0x0; + r->conv_opd0_y_ins0_last = 0x0; + r->conv_opd1_x_ins0 = 0x0; + 
r->conv_opd1_y_ins0 = 0x0; + r->dummy0 = 0x0; + r->opd0_ins_val = 0x0; + r->conv_opd0_up_pad = 0x0; + r->conv_opd0_dn_pad = 0x0; + r->conv_opd0_lf_pad = 0x0; + r->conv_opd0_rt_pad = 0x0; + r->res0_n = 0x1; + r->res0_c = 0x1; + r->res0_h = 0x1; + r->res0_w = 0x10; + r->conv_op_x_str = 0x1; + r->conv_op_y_str = 0x1; + r->cmd_pre_exe = 0x0; + r->rsvd1 = 0x1; + r->res0_addr = 0x0; + r->opd0_addr = 0x0; + r->opd1_addr = 0x0; + r->opd2_addr = 0x0; + r->opt_opd0_const = 0x0; + r->opt_opd1_const = 0x0; + r->opt_opd2_const = 0x0; + r->short_nchwstr_same = 0x0; + r->short_res0_str = 0x0; + r->short_opd0_str = 0x0; + r->short_opd1_str = 0x0; + r->short_opd2_str = 0x0; + r->dummy2 = 0x0; + r->opd0_n = 0x1; + r->opd0_c = 0x1; + r->dummy3 = 0x0; + r->rsvd2 = 0x2; + r->opd0_h = 0x1; + r->opd0_w = 0x10; + r->opd1_n = 0x1; + r->opd1_c = 0x1; + r->opd1_h = 0x1; + r->opd1_w = 0x10; + r->opd2_n = 0x1; + r->opd2_c = 0x1; + r->opd2_h = 0x1; + r->opd2_w = 0x10; + r->dummy4 = 0x0; + r->rsvd3 = 0x3; + r->layer_info = 0x0; + r->res0_n_str = 0x10; + r->res0_c_str = 0x10; + r->res0_h_str = 0x0; + r->res0_w_str = 0x1; + r->res0_b_str = 0x10; + r->opd0_n_str = 0x10; + r->dummy5 = 0x0; + r->rsvd4 = 0x4; + r->opd0_c_str = 0x10; + r->opd0_h_str = 0x0; + r->opd0_w_str = 0x1; + r->opd0_b_str = 0x10; + r->opd1_n_str = 0x10; + r->opd1_c_str = 0x10; + r->opd1_h_str = 0x0; + r->dummy6 = 0x0; + r->rsvd5 = 0x5; + r->opd1_w_str = 0x1; + r->opd1_b_str = 0x10; + r->opd2_n_str = 0x10; + r->opd2_c_str = 0x10; + r->opd2_h_str = 0x0; + r->opd2_w_str = 0x1; + r->opd2_b_str = 0x10; + r->dummy7 = 0x0; + r->rsvd6 = 0x6; +} + +static inline void trace_tiu_reg(tiu_reg_t *r, const char *tag) +{ +#define trace_one_reg(name) \ + printf(" %s: 0x%llx\n", #name, (ullong)r->name) + + printf("--- %s ---\n", tag); + trace_one_reg(cmd_en); + trace_one_reg(cmd_end); + trace_one_reg(cmd_id_en); + trace_one_reg(cmd_keep); + trace_one_reg(cmd_intr_en); + trace_one_reg(tsk_typ); + trace_one_reg(tsk_eu_typ); + trace_one_reg(tsk_opd_num); + trace_one_reg(opt_res_shift); + trace_one_reg(opt_left_shift); + trace_one_reg(opt_shift_typ); + trace_one_reg(opt_rshift_typ); + trace_one_reg(dummy1); + trace_one_reg(opd_typ); + trace_one_reg(opt_chl_quan); + trace_one_reg(cmd_id_tpu); + trace_one_reg(cmd_id_gdma); + trace_one_reg(quan_m); + trace_one_reg(opt_res0_sign); + trace_one_reg(opt_opd0_sign); + trace_one_reg(opt_opd1_sign); + trace_one_reg(opt_opd2_sign); + trace_one_reg(opt_res0_seg); + trace_one_reg(opt_opd0_seg); + trace_one_reg(opt_opd1_seg); + trace_one_reg(opt_opd2_seg); + trace_one_reg(ps32_md); + trace_one_reg(double_conv); + trace_one_reg(opt_left_tran); + trace_one_reg(fp_round_typ); + trace_one_reg(opt_relu_typ); + trace_one_reg(opt_relu_value); + trace_one_reg(cmd_pre_exe_typ); + trace_one_reg(opt_res_add); + trace_one_reg(rsvd0); + trace_one_reg(conv_opd0_x_ins0); + trace_one_reg(conv_opd0_y_ins0); + trace_one_reg(conv_opd0_x_ins0_last); + trace_one_reg(conv_opd0_y_ins0_last); + trace_one_reg(conv_opd1_x_ins0); + trace_one_reg(conv_opd1_y_ins0); + trace_one_reg(dummy0); + trace_one_reg(opd0_ins_val); + trace_one_reg(conv_opd0_up_pad); + trace_one_reg(conv_opd0_dn_pad); + trace_one_reg(conv_opd0_lf_pad); + trace_one_reg(conv_opd0_rt_pad); + trace_one_reg(res0_n); + trace_one_reg(res0_c); + trace_one_reg(res0_h); + trace_one_reg(res0_w); + trace_one_reg(conv_op_x_str); + trace_one_reg(conv_op_y_str); + trace_one_reg(cmd_pre_exe); + trace_one_reg(rsvd1); + trace_one_reg(res0_addr); + trace_one_reg(opd0_addr); + trace_one_reg(opd1_addr); + 
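+  /* remaining fields: operand-2 address, const/stride-mode flags,
+   * operand shapes and per-operand strides */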
trace_one_reg(opd2_addr); + trace_one_reg(opt_opd0_const); + trace_one_reg(opt_opd1_const); + trace_one_reg(opt_opd2_const); + trace_one_reg(short_nchwstr_same); + trace_one_reg(short_res0_str); + trace_one_reg(short_opd0_str); + trace_one_reg(short_opd1_str); + trace_one_reg(short_opd2_str); + trace_one_reg(dummy2); + trace_one_reg(opd0_n); + trace_one_reg(opd0_c); + trace_one_reg(dummy3); + trace_one_reg(rsvd2); + trace_one_reg(opd0_h); + trace_one_reg(opd0_w); + trace_one_reg(opd1_n); + trace_one_reg(opd1_c); + trace_one_reg(opd1_h); + trace_one_reg(opd1_w); + trace_one_reg(opd2_n); + trace_one_reg(opd2_c); + trace_one_reg(opd2_h); + trace_one_reg(opd2_w); + trace_one_reg(dummy4); + trace_one_reg(rsvd3); + trace_one_reg(layer_info); + trace_one_reg(res0_n_str); + trace_one_reg(res0_c_str); + trace_one_reg(res0_h_str); + trace_one_reg(res0_w_str); + trace_one_reg(res0_b_str); + trace_one_reg(opd0_n_str); + trace_one_reg(dummy5); + trace_one_reg(rsvd4); + trace_one_reg(opd0_c_str); + trace_one_reg(opd0_h_str); + trace_one_reg(opd0_w_str); + trace_one_reg(opd0_b_str); + trace_one_reg(opd1_n_str); + trace_one_reg(opd1_c_str); + trace_one_reg(opd1_h_str); + trace_one_reg(dummy6); + trace_one_reg(rsvd5); + trace_one_reg(opd1_w_str); + trace_one_reg(opd1_b_str); + trace_one_reg(opd2_n_str); + trace_one_reg(opd2_c_str); + trace_one_reg(opd2_h_str); + trace_one_reg(opd2_w_str); + trace_one_reg(opd2_b_str); + trace_one_reg(dummy7); + trace_one_reg(rsvd6); +} +#endif /* BM1822_TIU_REG_H */ diff --git a/cvikernel/include/bmkernel/bm1822/bm1822_tpu_cfg.h b/cvikernel/include/bmkernel/bm1822/bm1822_tpu_cfg.h new file mode 100644 index 000000000..4e37cdeac --- /dev/null +++ b/cvikernel/include/bmkernel/bm1822/bm1822_tpu_cfg.h @@ -0,0 +1,38 @@ +#ifndef __BM1822_TPU_CFG__ +#define __BM1822_TPU_CFG__ + +#define BM1822_VER 1822 +#define BM1822_HW_NPU_SHIFT 3 +#define BM1822_HW_EU_SHIFT 4 +#define BM1822_HW_LMEM_SHIFT 15 +#define BM1822_HW_LMEM_BANKS 8 +#define BM1822_HW_LMEM_BANK_SIZE 0x1000 +#define BM1822_HW_NODE_CHIP_SHIFT 0 +#define BM1822_HW_NPU_NUM (1 << BM1822_HW_NPU_SHIFT) +#define BM1822_HW_EU_NUM (1 << BM1822_HW_EU_SHIFT) +#define BM1822_HW_LMEM_SIZE (1 << BM1822_HW_LMEM_SHIFT) +#define BM1822_HW_LMEM_START_ADDR 0x0C000000 +#define BM1822_HW_NODE_CHIP_NUM (1 << BM1822_HW_NODE_CHIP_SHIFT) + +#if (BM1822_HW_LMEM_SIZE != (BM1822_HW_LMEM_BANK_SIZE * BM1822_HW_LMEM_BANKS)) +#error "Set wrong TPU configuraiton." +#endif + +#define BM1822_GLOBAL_MEM_START_ADDR 0x0 +#define BM1822_GLOBAL_MEM_SIZE 0x100000000 + +#define BM1822_GLOBAL_TIU_CMDBUF_ADDR 0x00000000 +#define BM1822_GLOBAL_TDMA_CMDBUF_ADDR 0x10000000 +#define BM1822_GLOBAL_TIU_CMDBUF_RESERVED_SIZE 0x10000000 +#define BM1822_GLOBAL_TDMA_CMDBUF_RESERVED_SIZE 0x10000000 +#define BM1822_GLOBAL_POOL_RESERVED_SIZE (BM1822_GLOBAL_MEM_SIZE - BM1822_GLOBAL_TIU_CMDBUF_RESERVED_SIZE - BM1822_GLOBAL_TDMA_CMDBUF_RESERVED_SIZE) + +#define BM1822_UART_CTLR_BASE_ADDR 0x04140000 + +#define BM1822_TDMA_ENGINE_BASE_ADDR 0x0C100000 +#define BM1822_TDMA_ENGINE_END_ADDR (BM1822_TDMA_ENGINE_BASE_ADDR + 0x1000) + +#define BM1822_TIU_ENGINE_BASE_ADDR 0x0C101000 //"NPS Register" in memory map? 
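+// Both the TDMA and TIU register apertures are 0x1000 bytes; the TIU block
+// starts immediately after the TDMA block. A bare-metal driver could map it
+// with something like map_phys(BM1822_TIU_ENGINE_BASE_ADDR, 0x1000)
+// (map_phys is illustrative only, not part of this header).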
+#define BM1822_TIU_ENGINE_END_ADDR (BM1822_TIU_ENGINE_BASE_ADDR + 0x1000) + +#endif diff --git a/cvikernel/include/bmkernel/bm1822/bm_vlc_compress.h b/cvikernel/include/bmkernel/bm1822/bm_vlc_compress.h new file mode 100644 index 000000000..247c47a60 --- /dev/null +++ b/cvikernel/include/bmkernel/bm1822/bm_vlc_compress.h @@ -0,0 +1,703 @@ +#ifndef __BM_VLC_COMPRESS_H__ +#define __BM_VLC_COMPRESS_H__ +#include +#include +#ifdef __cplusplus +extern "C" +{ +#endif + +#define MAX_UNARY_FIELD_SIZE 47 +#define MAX_ORDER_K 5 + + /** + * \data_type 0 means 8bit, 1 means 16bit + */ + static inline size_t get_out_bs_buf_size(uint64_t in_size, uint8_t data_type) { + size_t blk_num = (data_type) ? ((in_size + 31) >> 5) : ((in_size + 15) >> 4); + size_t in_size_pad = blk_num << (4 + data_type); + size_t bs_buf_size = in_size_pad + (ceiling_func(blk_num, 16) << 4) + 16; + return bs_buf_size; + } + + typedef struct + { + uint8_t signedness; + uint8_t is_bfloat16; + uint8_t bias0; + uint8_t bias1; + uint8_t zero_guard_en; + } CommandInfo; + typedef struct + { + uint8_t *stream; // stream buffer pointer + int bit_pos; // current pointer (in bit) + int buf_size; // in byte + } StreamBuffer; + +static inline int8_t two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1); +static inline int8_t inv_two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1); +static inline uint8_t center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard); +static inline uint8_t inv_center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard); + +static inline void init_stream(StreamBuffer *bs, const uint8_t *buf, int buf_size, uint8_t read_only); + +static inline void bm_vlc_est_weight_bias(const uint8_t *ibuf, size_t isz, uint8_t signedness, uint8_t isBfloat16, CommandInfo *cmd_info); +static inline void bm_vlc_enc_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info); +static inline void bm_vlc_dec_int8_ext(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *bs_size); +static inline void bm_vlc_dec_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf); +static inline void bm_vlc_enc_bf16(const uint16_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info); +static inline void bm_vlc_dec_bf16_ext(const uint8_t *ibuf, size_t isz, uint16_t *obuf, size_t *bs_size); +static inline void bm_vlc_dec_bf16(const uint8_t *ibuf, size_t isz, uint16_t *obuf); + +static inline uint8_t get_bit_val(uint8_t *buf, int byte_idx, int bit_idx) + { + return (buf[byte_idx] >> bit_idx) & 0x1; + } + +static inline uint8_t sign_to_unsign(uint8_t val) + { + uint8_t sign_i = (val >> 7) & 0x1; + int abs_data_i = abs(((int8_t)val)); + return ((abs_data_i << 1) - sign_i); + } + +static inline int8_t unsign_to_sign(uint8_t val) + { + uint8_t sign_i = val & 0x1; + int abs_data_i = (((int)val) + 1) >> 1; + return (uint8_t)((sign_i == 1) ? 
(-abs_data_i) : abs_data_i); + } + +static inline void dispatch_bf16_data(const uint16_t *bf16_in, uint8_t *exp, uint8_t *frac, size_t isz) +{ + for (size_t i = 0; i < isz; i++) + { + exp[i] = (uint8_t)((bf16_in[i] >> 7) & 0xFF); + frac[i] = (uint8_t)(((bf16_in[i] >> 15) << 7) | (bf16_in[i] & 0x7F)); + } +} + +static inline void merge_bf16_data(const uint8_t *exp_in, const uint8_t *frac_in, uint16_t *bf16_out, size_t isz) +{ + memset(bf16_out, 0, sizeof(uint16_t)); + for (size_t i = 0; i < isz; i++) + { + bf16_out[i] = ((frac_in[i] >> 7) << 15) | (exp_in[i] << 7) | (frac_in[i] & 0x7F); + } +} + +// -- streaming operation handler -- +static inline void init_stream(StreamBuffer *bs, const uint8_t *buf, int buf_size, uint8_t read_only) +{ + bs->bit_pos = 0; + bs->stream = (uint8_t *)buf; + bs->buf_size = buf_size; + if (!read_only) + memset((uint8_t *)buf, 0, sizeof(uint8_t) * buf_size); +} + +static inline void write_stream(StreamBuffer *bs, uint8_t *src, int bit_len) +{ + for (int bit = 0; bit < bit_len; bit++) + { + int src_byte_i = bit / 8; + int src_bit_i = bit % 8; + int dest_byte_i = (bs->bit_pos + bit) / 8; + int dest_bit_i = (bs->bit_pos + bit) % 8; + bs->stream[dest_byte_i] |= (get_bit_val(src, src_byte_i, src_bit_i) << dest_bit_i); + } + bs->bit_pos += bit_len; +} + +static inline void move_stream_ptr(StreamBuffer *bs, int bit_len) +{ + bs->bit_pos += bit_len; +} + +static inline void parse_stream(StreamBuffer *bs, uint8_t *dest, int bit_len) +{ + memset(dest, 0, sizeof(uint8_t) * (bit_len + 7) >> 3); + for (int bit = 0; bit < bit_len; bit++) + { + int dest_byte_i = bit / 8; + int dest_bit_i = bit % 8; + int bs_byte_i = (bs->bit_pos + bit) / 8; + int bs_bit_i = (bs->bit_pos + bit) % 8; + dest[dest_byte_i] |= (get_bit_val(bs->stream, bs_byte_i, bs_bit_i) << dest_bit_i); + } + bs->bit_pos += bit_len; +} + +// -- header read/write operation handler -- +static inline void vlc_enc_header(StreamBuffer *bs_header, CommandInfo *cmd_info, size_t blk_bs_size) +{ + write_stream(bs_header, (uint8_t *)&blk_bs_size, 24); // bit[23:0] compressed block stream size + move_stream_ptr(bs_header, 4); // bit[27:24] reserved + write_stream(bs_header, (uint8_t *)&cmd_info->signedness, 1); // bit[28] signedness + write_stream(bs_header, (uint8_t *)&cmd_info->is_bfloat16, 1); // bit[29] data type + move_stream_ptr(bs_header, 2); // bit[31:30] bit depth + write_stream(bs_header, (uint8_t *)&cmd_info->bias0, 8); // bit[39:32] bias0 for symbol remapping + write_stream(bs_header, (uint8_t *)&cmd_info->bias1, 7); // bit[46:40] bias1 for symbol remapping + write_stream(bs_header, (uint8_t *)&cmd_info->zero_guard_en, 1); // bit[47] zero guard +} + +static inline void vlc_dec_header_ext(StreamBuffer *bs_header, CommandInfo *cmd_info, size_t *blk_bs_size) +{ + parse_stream(bs_header, (uint8_t *)blk_bs_size, 24); // bit[23:0] compressed block stream size + move_stream_ptr(bs_header, 4); // bit[27:24] reserved + parse_stream(bs_header, (uint8_t *)&cmd_info->signedness, 1); // bit[28] signedness + parse_stream(bs_header, (uint8_t *)&cmd_info->is_bfloat16, 1); // bit[29] data type + move_stream_ptr(bs_header, 2); + parse_stream(bs_header, (uint8_t *)&cmd_info->bias0, 8); // bit[39:32] bias0 for symbol remapping + parse_stream(bs_header, (uint8_t *)&cmd_info->bias1, 7); // bit[46:40] bias1 for symbol remapping + parse_stream(bs_header, (uint8_t *)&cmd_info->zero_guard_en, 1); // bit[47] zero guard +} + +static inline void vlc_dec_header(StreamBuffer *bs_header, CommandInfo *cmd_info) +{ + size_t blk_bs_size; + 
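+  /* Convenience wrapper: parse the header but discard the compressed
+   * block-stream size; callers that need it use vlc_dec_header_ext(). */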
vlc_dec_header_ext(bs_header, cmd_info, &blk_bs_size); +} + +// -- symbol remmaping handler -- +static inline uint8_t center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard) +{ + if (val == 0 && zero_guard) + return 0; + + int16_t shift_data_i = val - bias; + uint8_t range = (bias <= 128) ? bias : 255 - bias; + if (bias <= 128) + { + return (val >= (range << 1)) ? val : sign_to_unsign(shift_data_i) + zero_guard; + } + else + { + return (val < (bias - range)) ? (range + bias - val + zero_guard) : (sign_to_unsign(shift_data_i) + zero_guard); + } +} + +static inline uint8_t inv_center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard) +{ + if (val == 0 && zero_guard) + return 0; + + uint8_t unsign_data_i = val - zero_guard; + uint8_t range = (bias <= 128) ? bias : 255 - bias; + if (bias <= 128) + { + return (val >= (range << 1)) ? val : unsign_to_sign(unsign_data_i) + bias; + } + else + { + return (unsign_data_i > (range << 1)) ? (range + bias - val + zero_guard) : unsign_to_sign(unsign_data_i) + bias; + } +} + +static inline int8_t two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1) +{ + if (val == 0) + return 0; + + uint8_t sign = (val < 0) ? true : false; + int32_t abs_val = abs(val); + abs_val -= (sign) ? bias1 : bias0; + abs_val += (abs_val <= 0) ? (127 + sign) : 0; + return (sign) ? -abs_val : abs_val; +} + +static inline int8_t inv_two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1) +{ + if (val == 0) + return 0; + + uint8_t sign = (val < 0) ? true : false; + uint32_t abs_val = abs(val); + abs_val += (sign) ? bias1 : bias0; + int32_t abs_val_minus = abs_val - (127 + sign); + uint8_t abs_val_lsb = ((abs_val_minus <= 0) + ? abs_val + : abs_val_minus) & + 0xFF; + return (sign) ? -abs_val_lsb : abs_val_lsb; +} + +static inline void symbol_remapping(uint8_t *blk_in, uint8_t *blk_out, uint8_t bias0, uint8_t bias1, uint8_t signedness, uint8_t is_bf16_exp, uint8_t zero_guard) +{ + if (is_bf16_exp == false && signedness == false) + { + // remapping bypass + memcpy(blk_out, blk_in, sizeof(uint8_t) * 16); + return; + } + + if (is_bf16_exp == true) + { + // center circular shift + for (int i = 0; i < 16; i++) + { + blk_out[i] = center_shift(blk_in[i], bias0, zero_guard); + } + } + else + { + // two-side circular shift + for (int i = 0; i < 16; i++) + { + int8_t shift_data_i = two_side_circular_shift((int8_t)blk_in[i], bias0, bias1); + blk_out[i] = sign_to_unsign(shift_data_i); + } + } +} + +static inline void inv_symbol_remapping(uint8_t *blk_in, uint8_t *blk_out, uint8_t bias0, uint8_t bias1, uint8_t signedness, uint8_t is_bf16_exp, uint8_t zero_guard) +{ + if (is_bf16_exp == false && signedness == false) + { + // remapping bypass + memcpy(blk_out, blk_in, sizeof(uint8_t) * 16); + return; + } + + if (is_bf16_exp == true) + { + // center circular shift + for (int i = 0; i < 16; i++) + { + blk_out[i] = inv_center_shift(blk_in[i], bias0, zero_guard); + } + } + else + { + // two-side circular shift + for (int i = 0; i < 16; i++) + { + int8_t sign_data_i = unsign_to_sign(blk_in[i]); + blk_out[i] = (uint8_t)inv_two_side_circular_shift(sign_data_i, bias0, bias1); + } + } +} + +static inline int vlc_estimate_block_order(uint8_t *blk_in, uint8_t bf16_zvc_en) +{ + int best_k = 0; + int best_bs_size = 0x7FFFFFFF; + + for (int k = 0; k <= (int)MAX_ORDER_K; k++) + { + uint8_t remain_field_size = k << 4; + int unary_field_len = 0; + for (int i = 0; i < 16; i++) + { + uint8_t group_idx = blk_in[i] >> k; + unary_field_len += (group_idx + 1); + } + int znum_bit = (bf16_zvc_en && 
k > 0) ? 4 : 0; + int blk_size = (unary_field_len <= MAX_UNARY_FIELD_SIZE) + ? remain_field_size + unary_field_len + znum_bit + : 255; + if (blk_size < best_bs_size) + { + best_k = k; + best_bs_size = blk_size; + } + } + + best_k = (best_bs_size > 128) ? -1 : best_k; + return best_k; +} +// -- vlc block parrelel GR encode/decode -- +static inline uint8_t vlc_gr_enc_block_data(uint8_t *blk_in, StreamBuffer *bs, int order_k, uint8_t bf16_zvc_en) +{ + // uncompressed mode + if (order_k == -1) + { + write_stream(bs, blk_in, 128); + return 128; + } + + // remain field + uint8_t remain_field[16] = {0}; + uint8_t unary_field[8] = {0}; + uint8_t sym_end_pos[16] = {0}; + uint8_t unary_field_len = 0; + int sym_end_pos_accum = -1; + + // bit plane encode for remain field + for (int k = 0; k < order_k; k++) + { + uint8_t bit_plane0 = 0, bit_plane1 = 0; + for (int i = 0; i < 8; i++) + { + bit_plane0 |= (get_bit_val(blk_in, i, k) << i); + bit_plane1 |= (get_bit_val(blk_in, i + 8, k) << i); + } + remain_field[k << 1] = bit_plane0; + remain_field[(k << 1) + 1] = bit_plane1; + } + write_stream(bs, remain_field, order_k << 4); + + if (bf16_zvc_en && order_k > 0) + { + int zero_num = 0; + for (int i = 0; i < 16; i++) + { + if (blk_in[i] == 0) + zero_num++; + } + assert(zero_num < 16); + write_stream(bs, (uint8_t *)&zero_num, 4); + } + + // unary encode for unary field + for (int i = 0; i < 16; i++) + { + int group_idx = blk_in[i] >> order_k; + sym_end_pos_accum += (group_idx + 1); + sym_end_pos[i] = sym_end_pos_accum; + int byte_idx = sym_end_pos[i] / 8; + int bit_idx = sym_end_pos[i] % 8; + unary_field[byte_idx] |= (1 << (bit_idx)); + } + unary_field_len = sym_end_pos[15] + 1; + assert(unary_field_len <= MAX_UNARY_FIELD_SIZE); + uint8_t ulen = (unary_field_len - 16) & 0x1F; + write_stream(bs, unary_field, unary_field_len); + + return ulen; +} + +static inline void vlc_gr_dec_block_data(StreamBuffer *bs, uint8_t bs_size, uint8_t *rec, int order_k, uint8_t bf16_zvc_en) +{ + assert(bs_size <= 128); + // uncompressed mode + if (order_k == -1) + { + parse_stream(bs, rec, 128); + return; + } + + // remain field + uint8_t remain_data[16] = {0}; + uint8_t remain_bs[16] = {0}; + uint8_t unary_field[8] = {0}; + uint8_t sym_end_pos[16] = {0}; + uint8_t unary_sym[16] = {0}; + uint8_t remain_field_size = order_k << 4; + + parse_stream(bs, remain_bs, remain_field_size); + // bit plane encode for remain field + for (int k = 0; k < order_k; k++) + { + for (int i = 0; i < 8; i++) + { + remain_data[i] |= (get_bit_val(remain_bs, k << 1, i) << k); + remain_data[i + 8] |= (get_bit_val(remain_bs, (k << 1) + 1, i) << k); + } + } + + // zero number info + int znum_bit = (bf16_zvc_en && order_k > 0) ? 
4 : 0; + uint8_t znum = 0; + parse_stream(bs, &znum, znum_bit); + + // unary encode for unary field + uint8_t unary_field_len = bs_size - remain_field_size - znum_bit; + parse_stream(bs, unary_field, unary_field_len); + + int sym_cnt = 0; + for (uint8_t ubit_i = 0; ubit_i < unary_field_len; ubit_i++) + { + int byte_idx = ubit_i / 8; + int bit_idx = ubit_i % 8; + if (get_bit_val(unary_field, byte_idx, bit_idx) == 1) + { + sym_end_pos[sym_cnt] = ubit_i; + sym_cnt++; + } + } + unary_sym[0] = sym_end_pos[0]; + for (int i = 1; i < 16; i++) + { + unary_sym[i] = sym_end_pos[i] - sym_end_pos[i - 1] - 1; + } + for (int i = 0; i < 16; i++) + { + rec[i] = (unary_sym[i] << order_k) + remain_data[i]; + } +} + +// -- vlc encode int8 entry function -- +static inline void bm_vlc_enc_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info) +{ + StreamBuffer bs_header, bs_kmap, bs_data; + size_t blk_num = (isz + 15) >> 4; + size_t header_size = 16; + size_t kmap_size = ceiling_func(blk_num, 16) << 4; + size_t bs_buf_size = header_size + kmap_size + (blk_num << 4); + uint8_t *bsbuf = (uint8_t *)calloc(bs_buf_size, sizeof(uint8_t)); + + // block encode + init_stream(&bs_kmap, bsbuf + header_size, kmap_size, false); + init_stream(&bs_data, bsbuf + header_size + kmap_size, blk_num << 4, false); + + for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++) + { + uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0}; + size_t in_size = (blk_idx == (blk_num - 1)) ? isz - (blk_idx << 4) : 16; + memcpy(blk_data, &ibuf[blk_idx << 4], sizeof(uint8_t) * in_size); + + symbol_remapping(blk_data, blk_sr_data, cmd_info->bias0, cmd_info->bias1, cmd_info->signedness, false, false); + + int k = vlc_estimate_block_order(blk_sr_data, false); + uint8_t ulen = vlc_gr_enc_block_data(blk_sr_data, &bs_data, k, false); + uint8_t k_info = (k == -1) ? 0xE0 : (k << 5) + ulen; + write_stream(&bs_kmap, &k_info, 8); + } + + int blk_bs_size = ceiling_func(((bs_data.bit_pos + 7) >> 3), 16) << 4; // 16 byte align + *osz = header_size + kmap_size + blk_bs_size; + + // write header + init_stream(&bs_header, bsbuf, header_size, false); + vlc_enc_header(&bs_header, cmd_info, blk_bs_size); + + memcpy(obuf, bsbuf, (*osz) * sizeof(uint8_t)); + free(bsbuf); +} + +// -- vlc decode int8 entry function -- +static inline void bm_vlc_dec_int8_ext(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *bs_size) +{ + StreamBuffer bs_header, bs_kmap, bs_data; + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + + size_t blk_num = (isz + 15) >> 4; + int header_size = 16; + int kmap_size = ceiling_func(blk_num, 16) << 4; + + // parse header + init_stream(&bs_header, ibuf, header_size, true); + vlc_dec_header_ext(&bs_header, &cmd_info, bs_size); + + // Check whether valid header + size_t bs_buf_size = get_out_bs_buf_size(isz, 0); // int8 + ASSERT(*bs_size <= bs_buf_size); + ASSERT(cmd_info.is_bfloat16 == 0); + + // block decode + init_stream(&bs_kmap, ibuf + header_size, kmap_size, true); + init_stream(&bs_data, ibuf + header_size + kmap_size, blk_num << 4, true); + + for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++) + { + uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0}; + uint8_t k_info = 0; + parse_stream(&bs_kmap, &k_info, 8); + uint8_t ulen = k_info & 0x1F; + int k = (k_info >> 5 == 7) ? -1 : k_info >> 5; + int blk_bs_size = (k == -1) ? 
128 : (k << 4) + ulen + 16; + vlc_gr_dec_block_data(&bs_data, blk_bs_size, blk_data, k, false); + + inv_symbol_remapping(blk_data, blk_sr_data, cmd_info.bias0, cmd_info.bias1, cmd_info.signedness, false, false); + + int out_size = (blk_idx == (blk_num - 1)) ? isz - (blk_idx << 4) : 16; + memcpy(&obuf[blk_idx << 4], blk_sr_data, sizeof(uint8_t) * out_size); + } +} + +static inline void bm_vlc_dec_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf) +{ + size_t bs_size; + bm_vlc_dec_int8_ext(ibuf, isz, obuf, &bs_size); +} + +// -- vlc encode bfloat16 entry function -- +static inline void bm_vlc_enc_bf16(const uint16_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info) +{ + StreamBuffer bs_header, bs_kmap, bs_data; + size_t blk_num = (isz + 31) >> 5; // 32 bytes per blok + size_t header_size = 16; + size_t kmap_size = ceiling_func(blk_num, 16) << 4; + size_t bs_buf_size = header_size + kmap_size + (blk_num << 5); + uint8_t *bsbuf = (uint8_t *)calloc(bs_buf_size, sizeof(uint8_t)); + + // block encode + init_stream(&bs_kmap, bsbuf + header_size, kmap_size, false); + init_stream(&bs_data, bsbuf + header_size + kmap_size, blk_num << 5, false); + + for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++) + { + uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0}, blk_data_frac[16] = {0}; + size_t in_num = (blk_idx == (blk_num - 1)) ? ((isz >> 1) - (blk_idx << 4)) : 16; + dispatch_bf16_data(&ibuf[blk_idx << 4], blk_data, blk_data_frac, in_num); + + // exp: BGR encode + symbol_remapping(blk_data, blk_sr_data, cmd_info->bias0, cmd_info->bias1, false, true, cmd_info->zero_guard_en); + + int k = vlc_estimate_block_order(blk_sr_data, cmd_info->zero_guard_en); + uint8_t ulen = vlc_gr_enc_block_data(blk_sr_data, &bs_data, k, cmd_info->zero_guard_en); + uint8_t k_info = (k == -1) ? 0xE0 : (k << 5) + ulen; + write_stream(&bs_kmap, &k_info, 8); + + // frac: implicit zero compression + for (size_t i = 0; i < 16; i++) + { + if (!cmd_info->zero_guard_en || blk_data[i] != 0) + { + write_stream(&bs_data, &blk_data_frac[i], 8); + } + } + } + + int blk_bs_size = ceiling_func(((bs_data.bit_pos + 7) >> 3), 16) << 4; // 16 byte align + *osz = header_size + kmap_size + blk_bs_size; + + // write header + init_stream(&bs_header, bsbuf, header_size, false); + vlc_enc_header(&bs_header, cmd_info, blk_bs_size); + + memcpy(obuf, bsbuf, (*osz) * sizeof(uint8_t)); + free(bsbuf); +} + +// -- vlc decode bfloat16 entry function -- +static inline void bm_vlc_dec_bf16_ext(const uint8_t *ibuf, size_t isz, uint16_t *obuf, size_t *bs_size) +{ + StreamBuffer bs_header, bs_kmap, bs_data; + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + + size_t blk_num = (isz + 31) >> 5; // 32 bytes per blok + int header_size = 16; + int kmap_size = ceiling_func(blk_num, 16) << 4; + + // parse header + init_stream(&bs_header, ibuf, header_size, true); + vlc_dec_header_ext(&bs_header, &cmd_info, bs_size); + + // Check whether valid header + size_t bs_buf_size = get_out_bs_buf_size(isz, 1); // bf16 + ASSERT(*bs_size <= bs_buf_size); + ASSERT(cmd_info.is_bfloat16 == 1); + + // block decode + init_stream(&bs_kmap, ibuf + header_size, kmap_size, true); + init_stream(&bs_data, ibuf + header_size + kmap_size, blk_num << 5, true); + + for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++) + { + uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0}, blk_data_frac[16] = {0}; + uint8_t k_info = 0; + parse_stream(&bs_kmap, &k_info, 8); + uint8_t ulen = k_info & 0x1F; + int k = (k_info >> 5 == 7) ? 
-1 : k_info >> 5; + int znum_bit = (cmd_info.zero_guard_en && k > 0) ? 4 : 0; + uint8_t blk_bs_size = (k == -1) ? 128 : (k << 4) + ulen + 16 + znum_bit; + + // exp: BGR decode + vlc_gr_dec_block_data(&bs_data, blk_bs_size, blk_data, k, cmd_info.zero_guard_en); + + inv_symbol_remapping(blk_data, blk_sr_data, cmd_info.bias0, cmd_info.bias1, false, true, cmd_info.zero_guard_en); + + size_t out_num = (blk_idx == (blk_num - 1)) ? ((isz >> 1) - (blk_idx << 4)) : 16; + + // frac: implicit zero compression + for (size_t i = 0; i < out_num; i++) + { + if (!cmd_info.zero_guard_en || blk_sr_data[i] != 0) + { + parse_stream(&bs_data, &blk_data_frac[i], 8); + } + } + merge_bf16_data(blk_sr_data, blk_data_frac, &obuf[blk_idx << 4], out_num); + } +} + +static inline void bm_vlc_dec_bf16(const uint8_t *ibuf, size_t isz, uint16_t *obuf) +{ + size_t bs_size; + bm_vlc_dec_bf16_ext(ibuf, isz, obuf, &bs_size); +} + +// -- offline estimate model weight params -- +static inline void bm_vlc_est_weight_bias(const uint8_t *ibuf, size_t isz, uint8_t signedness, uint8_t isBfloat16, CommandInfo *cmd_info) +{ + assert(!(isBfloat16 && signedness)); // WARNING: signedness MUST be 0 as isBfloat16==True + + cmd_info->is_bfloat16 = isBfloat16; + if (isBfloat16 == false && signedness == true) + { + // two-side circular shift + int hist[256] = {0}; + for (size_t i = 0; i < isz; i++) + { + hist[ibuf[i]]++; + } + + int8_t pos_v = 1; + //while (pos_v < 128) + // comparison is always true due to limited range of data type [-Werror=type-limits] + while (true) + { + if (hist[((uint8_t)pos_v)] == 0) + { + pos_v++; + } + else + { + break; + } + } + //cmd_info->bias0 = (pos_v > 1 && pos_v < 128) ? (pos_v - 1) : 0; + // comparison is always true due to limited range of data type [-Werror=type-limits] + cmd_info->bias0 = (pos_v > 1) ? (pos_v - 1) : 0; + int8_t neg_v = -1; + //while (neg_v >= (-128)) // comparison is always true due to limited range of data type [-Werror=type-limits] + while (true) + { + if (hist[(uint8_t)neg_v] == 0) + { + neg_v--; + } + else + { + break; + } + } + //cmd_info->bias1 = (neg_v < -1 && neg_v >= -128) ? abs(neg_v + 1) : 0; + // comparison is always true due to limited range of data type [-Werror=type-limits] + cmd_info->bias1 = (neg_v < -1) ? abs(neg_v + 1) : 0; + cmd_info->signedness = true; + } + + if (isBfloat16 == true) + { + // center shift + int64_t exp_accum = 0; + uint16_t *bf16_in = (uint16_t *)ibuf; + size_t inum = (isz >> 1), cnt = 0; + for (size_t i = 0; i < inum; i++) + { + uint8_t exp = ((bf16_in[i] >> 7) & 0xFF); + if (exp != 0) + { + exp_accum += exp; + cnt++; + } + } + if (cnt > 0) + { + cmd_info->bias0 = (uint8_t)((exp_accum / (float)cnt) + 0.5); + } + cmd_info->zero_guard_en = (inum == cnt) ? 
false : true; + cmd_info->signedness = false; + } +} + #ifdef __cplusplus +} +#endif + +#endif /* __BM_VLC_COMPRESS_H__ */ diff --git a/cvikernel/include/bmkernel/bm1822/bmkernel_1822.h b/cvikernel/include/bmkernel/bm1822/bmkernel_1822.h new file mode 100644 index 000000000..8dc79b478 --- /dev/null +++ b/cvikernel/include/bmkernel/bm1822/bmkernel_1822.h @@ -0,0 +1,1176 @@ +#ifndef __BMKERNEL_1822_H__ +#define __BMKERNEL_1822_H__ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define BMK1822_TIU 0 // Tensor Instruction Unit +#define BMK1822_CPU 1 // CPU, Reserved for common cpu op +#define BMK1822_TDMA 2 // TPU DMA +#define BMK1822_ENGINE_NUM 3 // Number of Engines + +typedef struct bmk_context bmk1822_context_t; +typedef struct bmk_context bmk_context_t; +typedef cvk_chip_info_t bmk1822_chip_info_t; + +typedef struct ec_desc bmk1822_op_t; + +bmk1822_context_t * bmk1822_register(bmk_info_t *info); +void bmk1822_cleanup(bmk1822_context_t *ctx); +void bmk1822_reset(bmk1822_context_t *ctx); +uint8_t *bmk1822_acquire_cmdbuf(bmk1822_context_t *ctx, uint32_t *size); +void bmk1822_dmabuf_size(uint8_t *cmdbuf, uint32_t sz, uint32_t *psize, uint32_t *pmu_size); +void bmk1822_dmabuf_relocate( + uint8_t *dmabuf, uint64_t dmabuf_devaddr, + uint32_t original_size, uint32_t pmubuf_size); +void bmk1822_dmabuf_convert(uint8_t *cmdbuf, uint32_t sz, uint8_t *dmabuf); +void bmk1822_dmabuf_dump(uint8_t * dmabuf); +void bmk1822_arraybase_set( + uint8_t *dmabuf, uint32_t arraybase0L, uint32_t arraybase1L, + uint32_t arraybase0H, uint32_t arraybase1H); + + +void bmk1822_parallel_enable(bmk1822_context_t *ctx); +void bmk1822_set_op(bmk1822_context_t *ctx, void* op); +void* bmk1822_get_op(bmk1822_context_t *ctx); +void bmk1822_parallel_disable(bmk1822_context_t *ctx); +void bmk1822_set_layer_id(bmk1822_context_t *ctx, uint16_t layer_id); +uint16_t bmk1822_layer_id(bmk1822_context_t *ctx); + +void bmk1822_create_streams(bmk1822_context_t *ctx, int nr_streams); +void bmk1822_destroy_streams(bmk1822_context_t *ctx); +void bmk1822_set_stream(bmk1822_context_t *ctx, int i); + +void bmk1822_add_dependency( + bmk1822_context_t *ctx, + bmk1822_op_t *before, + bmk1822_op_t *after); + +void bmk1822_cpu_op( + bmk1822_context_t *ctx, + const char* op_name, char *params, int size); + +/* + * Fundamental structures for tensor and matrix + */ + +typedef struct { + uint32_t n, c, w, col; +} bmk1822_matrix_lmem_shape_t; + +typedef struct { + uint32_t row, col; +} bmk1822_matrix_tgmem_shape_t; + +typedef struct { + uint32_t n, c, h; +} bmk1822_matrix_lmem_stride_t; + +typedef struct { + uint32_t row; +} bmk1822_matrix_tgmem_stride_t; + +typedef struct { + uint32_t n, c, h, w; +} bmk1822_tensor_lmem_shape_t; + +typedef struct { + uint32_t n, c, h, w; +} bmk1822_tensor_tgmem_shape_t; + +typedef struct { + uint32_t n, c, h, w; +} bmk1822_tensor_lmem_stride_t; + +typedef struct { + uint32_t n, c, h; +} bmk1822_tensor_tgmem_stride_t; + +typedef struct { + uint32_t start_address; + fmt_t fmt; + fmt_t cmprs_fmt; + bmk1822_tensor_lmem_shape_t shape; + bmk1822_tensor_lmem_stride_t stride; + uint8_t int8_rnd_mode; // (1, oc, kh*kw, ic) + * TDMA load global (1, oc, kh*w, ic) -> local (1, oc, kh*kw, ic) + * TIU conv opd1 (ic, oc, kh, kw) + */ +typedef struct { + const bmk1822_tensor_lmem_t *ofmap; + const bmk1822_tensor_lmem_t *ifmap; + const bmk1822_tensor_lmem_t *weight; + const bmk1822_tensor_lmem_t *bias; + uint8_t ins_h, ins_last_h; + uint8_t ins_w, ins_last_w; + uint8_t pad_top, pad_bottom; + uint8_t pad_left, pad_right; + 
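+  // ins_* insert extra elements between ifmap samples (input dilation /
+  // up-sampling); the value used for inserted and padded elements comes
+  // from ins_val (int8) or ins_fp (bf16) below.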
uint8_t stride_h, stride_w; + uint8_t dilation_h, dilation_w; + int relu_enable; + uint8_t rshift_bits; + uint8_t ps32_mode; + uint8_t w_is_const; + uint16_t layer_id; + uint8_t fp_round_typ; + uint8_t cmd_pre_exe_typ; // tiu execute cmd when channel data is ready + // wait type: + // 0: activation + // 1: weight + uint8_t cmd_pre_exe; // tiu execute cmd when channel data is ready + // 0: disable + // 1: load pre exec + // 2: store pre exec + // 3: load and store pre exec + int8_t ins_val; // padding value for int8 + uint16_t ins_fp; // padding value for bf16 +} bmk1822_tiu_convolution_param_t; + +bmk1822_op_t * bmk1822_tiu_convolution( + bmk1822_context_t *ctx, + const bmk1822_tiu_convolution_param_t *p); + +typedef struct { + const bmk1822_tensor_lmem_t *ofmap; + const bmk1822_tensor_lmem_t *ifmap; + const bmk1822_tensor_lmem_t *weight; + const bmk1822_tensor_lmem_t *chl_quan_param; + uint8_t ins_h, ins_last_h; + uint8_t ins_w, ins_last_w; + uint8_t pad_top, pad_bottom; + uint8_t pad_left, pad_right; + uint8_t stride_h, stride_w; + uint8_t dilation_h, dilation_w; + uint8_t has_bias; + uint8_t relu_enable; + uint8_t ps32_mode; + uint8_t w_is_const; + uint16_t layer_id; + uint8_t cmd_pre_exe_typ; // tiu execute cmd when channel data is ready + // wait type: + // 0: activation + // 1: weight + uint8_t cmd_pre_exe; // tiu execute cmd when channel data is ready + // 0: disable + // 1: load pre exec + // 2: store pre exec + // 3: load and store pre exec + int8_t ins_val; // padding value for int8 + uint16_t ins_fp; // padding value for bf16 +} bmk1822_tiu_convolution_qdm_param_t; + +bmk1822_op_t * bmk1822_tiu_convolution_qdm( + bmk1822_context_t *ctx, + const bmk1822_tiu_convolution_qdm_param_t *p); + +typedef struct { + const bmk1822_tensor_lmem_t *ofmap; + const bmk1822_tensor_lmem_t *ifmap; + uint16_t kh, kw; + uint8_t pad_top, pad_bottom; + uint8_t pad_left, pad_right; + uint8_t stride_h, stride_w; + int8_t ins_val; // padding value for int8 + uint16_t ins_fp; // padding value for bf16 + uint16_t layer_id; +} bmk1822_tiu_max_pooling_param_t; + +bmk1822_op_t * bmk1822_tiu_max_pooling( + bmk1822_context_t *ctx, + const bmk1822_tiu_max_pooling_param_t *p); + +typedef struct { + const bmk1822_tensor_lmem_t *ofmap; + const bmk1822_tensor_lmem_t *ifmap; + uint16_t kh, kw; + uint8_t pad_top, pad_bottom; + uint8_t pad_left, pad_right; + uint8_t stride_h, stride_w; + uint16_t ins_fp; + uint16_t layer_id; +} bmk1822_tiu_min_pooling_param_t; + +bmk1822_op_t * bmk1822_tiu_min_pooling( + bmk1822_context_t *ctx, + const bmk1822_tiu_min_pooling_param_t *p); + +bmk1822_op_t * bmk1822_tiu_bf16_min_pooling( + bmk1822_context_t *ctx, + const bmk1822_tiu_min_pooling_param_t *p); + +typedef struct { + const bmk1822_tensor_lmem_t *ofmap; + const bmk1822_tensor_lmem_t *ifmap; + uint16_t kh, kw; + uint8_t ins_h, ins_last_h; + uint8_t ins_w, ins_last_w; + uint8_t pad_top, pad_bottom; + uint8_t pad_left, pad_right; + uint8_t stride_h, stride_w; + uint16_t avg_pooling_const; + uint8_t rshift_bits; + uint16_t layer_id; + int8_t ins_val; // padding value for int8 + uint16_t ins_fp; // padding value for bf16 +} bmk1822_tiu_average_pooling_param_t; + +bmk1822_op_t * bmk1822_tiu_average_pooling( + bmk1822_context_t *ctx, + const bmk1822_tiu_average_pooling_param_t *p); + +typedef struct { + const bmk1822_tensor_lmem_t *ofmap; + const bmk1822_tensor_lmem_t *ifmap; + const bmk1822_tensor_lmem_t *weight; + const bmk1822_tensor_lmem_t *bias; + int weight_is_const; + struct { + int16_t val; + int is_signed; + } weight_const; 
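+  // When weight_is_const is set, weight_const.val (with is_signed giving its
+  // signedness) is used as a scalar kernel value in place of *weight.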
+ uint8_t ins_h, ins_last_h; + uint8_t ins_w, ins_last_w; + uint8_t dilation_h, dilation_w; + uint8_t pad_top, pad_bottom; + uint8_t pad_left, pad_right; + uint8_t stride_h, stride_w; + uint8_t rshift_bits; + int relu_enable; + uint16_t layer_id; + uint8_t cmd_pre_exe_typ; // tiu execute cmd when channel data is ready + // wait type: + // 0: activation + // 1: weight + uint8_t cmd_pre_exe; // tiu execute cmd when channel data is ready + // 0: disable + // 1: load pre exec + // 2: store pre exec + // 3: load and store pre exec + int8_t ins_val; // padding value for int8 + uint16_t ins_fp; // padding value for bf16 +} bmk1822_tiu_depthwise_convolution_param_t; + +bmk1822_op_t * bmk1822_tiu_depthwise_convolution( + bmk1822_context_t *ctx, + const bmk1822_tiu_depthwise_convolution_param_t *p); + +typedef struct { + const bmk1822_tensor_lmem_t *ofmap; + const bmk1822_tensor_lmem_t *ifmap; + const bmk1822_tensor_lmem_t *weight; + const bmk1822_tensor_lmem_t *chl_quan_param; + int weight_is_const; + struct { + int16_t val; + int is_signed; + } weight_const; + uint8_t ins_h, ins_last_h; + uint8_t ins_w, ins_last_w; + uint8_t dilation_h, dilation_w; + uint8_t pad_top, pad_bottom; + uint8_t pad_left, pad_right; + uint8_t stride_h, stride_w; + uint8_t has_bias; + uint8_t relu_enable; + uint16_t layer_id; + uint8_t cmd_pre_exe_typ; // tiu execute cmd when channel data is ready + // wait type: + // 0: activation + // 1: weight + uint8_t cmd_pre_exe; // tiu execute cmd when channel data is ready + // 0: disable + // 1: load pre exec + // 2: store pre exec + // 3: load and store pre exec + int8_t ins_val; // padding value for int8 + uint16_t ins_fp; // padding value for bf16 +} bmk1822_tiu_depthwise_convolution_qdm_param_t; + +bmk1822_op_t * bmk1822_tiu_depthwise_convolution_qdm( + bmk1822_context_t *ctx, + const bmk1822_tiu_depthwise_convolution_qdm_param_t *p); + +typedef struct { + const bmk1822_matrix_lmem_t *res; + const bmk1822_matrix_lmem_t *left; + const bmk1822_matrix_lmem_t *right; + const bmk1822_matrix_lmem_t *bias; + uint8_t lshift_bits; + uint8_t rshift_bits; + int res_is_int8; + int relu_enable; + int add_result; + uint8_t ps32_mode; + uint16_t layer_id; +} bmk1822_tiu_matrix_multiplication_param_t; + +bmk1822_op_t * bmk1822_tiu_matrix_multiplication( + bmk1822_context_t *ctx, + const bmk1822_tiu_matrix_multiplication_param_t *p); + +typedef struct { + const bmk1822_matrix_lmem_t *res; + const bmk1822_matrix_lmem_t *left; + const bmk1822_matrix_lmem_t *right; + const bmk1822_matrix_lmem_t *bias; + uint8_t lshift_bits; + uint8_t rshift_bits; + int res_is_int8; + int relu_enable; + int add_result; + uint8_t ps32_mode; + int32_t quan_m; + uint16_t layer_id; +} bmk1822_tiu_matrix_multiplication_qdm_param_t; + +bmk1822_op_t * bmk1822_tiu_matrix_multiplication_qdm( + bmk1822_context_t *ctx, + const bmk1822_tiu_matrix_multiplication_qdm_param_t *p); + +/* + * Helpers + */ + +bmk1822_tensor_lmem_stride_t bmk1822_tensor_lmem_default_stride( + bmk1822_context_t *ctx, + bmk1822_tensor_lmem_shape_t s, + fmt_t fmt_type, + int eu_align); + +bmk1822_tensor_tgmem_stride_t bmk1822_tensor_tgmem_default_stride( + bmk1822_tensor_tgmem_shape_t s, + fmt_t fmt_type); + +bmk1822_matrix_lmem_shape_t bmk1822_matrix_lmem_default_shape( + bmk1822_context_t *ctx, + uint32_t row, + uint32_t col, + fmt_t fmt_type); + +bmk1822_matrix_lmem_shape_t bmk1822_matrix_lmem_shape_t1( + bmk1822_context_t *ctx, + uint32_t len, + fmt_t fmt_type); + +bmk1822_matrix_lmem_stride_t bmk1822_matrix_lmem_default_stride( + 
bmk1822_context_t *ctx, + bmk1822_matrix_lmem_shape_t s, + fmt_t fmt, + int eu_align); + +bmk1822_tensor_lmem_t * bmk1822_lmem_alloc_tensor( + bmk1822_context_t *ctx, + bmk1822_tensor_lmem_shape_t s, + fmt_t fmt, + int eu_align); + +void bmk1822_lmem_init_tensor( + bmk1822_context_t *ctx, + bmk1822_tensor_lmem_t *tl, + bmk1822_tensor_lmem_shape_t shape, + fmt_t fmt, + int eu_align); + +bmk1822_tensor_lmem_t * bmk1822_lmem_alloc_ps32_tensor( + bmk1822_context_t *ctx, + bmk1822_tensor_lmem_shape_t s, + fmt_t fmt, + int eu_align); + +void bmk1822_lmem_free_tensor( + bmk1822_context_t *ctx, + const bmk1822_tensor_lmem_t *t); + +bmk1822_matrix_lmem_t * bmk1822_lmem_alloc_matrix( + bmk1822_context_t *ctx, + bmk1822_matrix_lmem_shape_t s, + fmt_t fmt, + int eu_align); + +void bmk1822_lmem_init_matrix( + bmk1822_context_t *ctx, + bmk1822_matrix_lmem_t *ml, + bmk1822_matrix_lmem_shape_t shape, + fmt_t fmt, + int eu_align); + +bmk1822_matrix_lmem_t * bmk1822_lmem_alloc_ps32_matrix( + bmk1822_context_t *ctx, + bmk1822_matrix_lmem_shape_t s, + fmt_t fmt, + int eu_align); + +void bmk1822_lmem_free_matrix( + bmk1822_context_t *ctx, + const bmk1822_matrix_lmem_t *t); + +uint32_t bmk1822_lmem_tensor_to_size( + bmk1822_context_t *ctx, + bmk1822_tensor_lmem_shape_t s, + fmt_t fmt, + int eu_align); + +uint32_t bmk1822_lmem_matrix_to_size( + bmk1822_context_t *ctx, + bmk1822_matrix_lmem_shape_t s, + fmt_t fmt, + int eu_align); + +uint32_t bmk1822_lmem_ps32_matrix_to_size( + bmk1822_context_t *ctx, + bmk1822_matrix_lmem_shape_t s, + fmt_t fmt, + int eu_align); + +#ifdef __cplusplus +} +#endif + +#endif /* __BMKERNEL_1822_H__ */ diff --git a/cvikernel/include/bmkernel/bm1822/compression.h b/cvikernel/include/bmkernel/bm1822/compression.h new file mode 100644 index 000000000..905e62abe --- /dev/null +++ b/cvikernel/include/bmkernel/bm1822/compression.h @@ -0,0 +1,369 @@ +#ifndef COMPRESSION_H +#define COMPRESSION_H + +#include + +typedef struct { + uint32_t compress_md; + uint32_t bit_length; + int is_signed; + + uint64_t total_data_num; + uint32_t non_zero_data_num; + + uint64_t header_bytes; + uint64_t map_bytes; + uint64_t data_bytes; + uint64_t total_bytes; + + int compressed_min; + int compressed_max; +} compression_info_t; + +typedef struct { + uint64_t header_offset; + uint64_t header_size; + uint64_t map_offset; + uint64_t map_size; + uint64_t data_offset; + uint64_t data_size; + uint64_t total_size; +} compress_addr_info; + +static uint64_t compression_map_bytes(uint64_t total_data_num) +{ + uint64_t bit_alignment = 16 * 8; + uint64_t bits = total_data_num; + + return ceiling_func(bits, bit_alignment)*16; +} + +static uint64_t compression_map_clear_bytes(uint64_t total_data_num) +{ + uint64_t bit_alignment = 2 * 8; + uint64_t bits = total_data_num; + + return ceiling_func(bits, bit_alignment)*2; +} + + +static uint64_t compression_data_bytes(uint64_t non_zero_data_num, uint32_t bit_length) +{ + if (bit_length == 1) + return 0; + + uint64_t bit_alignment = 8; + uint64_t bits = non_zero_data_num * bit_length; + + return ceiling_func(bits, bit_alignment); +} + +static inline uint32_t compression_bit_length(uint32_t compress_md) +{ + switch (compress_md) { + case 0: + return 8; + case 1: + return 4; + case 2: + return 2; + case 3: + return 1; + default: + assert(0); + } +} + +static inline void compute_compressed_range( + uint32_t bit_length, int is_signed, int *min, int *max) +{ + if (is_signed) { + switch (bit_length) { + case 1: + *min = -1; + *max = 0; + return; + case 2: + *min = -2; + *max = 1; + 
return; + case 4: + *min = -8; + *max = 7; + return; + case 8: + *min = -128; + *max = 127; + return; + } + } else { + *min = 0; + switch (bit_length) { + case 1: + *max = 1; + return; + case 2: + *max = 3; + return; + case 4: + *max = 15; + return; + case 8: + *max = 255; + return; + } + } + assert(0); +} + +static inline int saturate(int val, int max, int min) +{ + if (val < min) + return min; + else if (val > max) + return max; + else + return val; +} + +static inline uint64_t count_non_zero_results( + uint8_t buf[], uint64_t size, int is_signed, int max, int min) +{ + uint64_t n = 0; + + for (uint64_t i = 0; i < size; i++) { + int val = is_signed? (int8_t)buf[i]: buf[i]; + int res = saturate(val, max, min); + if (res != 0) + n++; + } + + return n; +} + +static inline void set_map_bit(uint8_t map[], uint64_t i) +{ + uint64_t byte_i = i / 8; + uint64_t bit_i = i % 8; + + map[byte_i] |= (1 << bit_i); +} + +static inline uint8_t read_map_bit(uint8_t map[], uint64_t i) +{ + uint64_t byte_i = i / 8; + uint64_t bit_i = i % 8; + + return (map[byte_i] >> bit_i) & 1; +} + +static inline void parse_header( + uint32_t header, int *is_signed, uint32_t *compress_md, uint32_t *nz_num) +{ + *is_signed = (header >> 29) & 1; + *compress_md = (header >> 24) & 0b11; + *nz_num = header & 0xffffff; +} + +static inline void fill_header(uint32_t *hdr, compression_info_t *info) +{ + if(compression_bit_length(info->compress_md)!=1) + { + *hdr = (info->is_signed << 29) | (1 << 28) | + (info->compress_md << 24) | + info->non_zero_data_num; + }else + { + *hdr = (info->is_signed << 29) | (1 << 28) | + (info->compress_md << 24); + } +} + +static inline void fill_map(uint8_t map[], uint8_t buf[], compression_info_t *info) +{ + int min = info->compressed_min; + int max = info->compressed_max; + + uint64_t clear_map = compression_map_clear_bytes(info->total_data_num); + for (uint64_t i = 0; i < clear_map; i++) + map[i] = 0; + + for (uint64_t i = 0; i < info->total_data_num; i++) { + int val = info->is_signed? (int8_t)buf[i]: buf[i]; + int res = saturate(val, max, min); + if (res != 0) + set_map_bit(map, i); + } +} + +static inline void compress_one_data( + uint8_t data[], uint64_t i, uint8_t val, compression_info_t *info) +{ + uint32_t bit_len = info->bit_length; + uint32_t data_per_byte = 8 / bit_len; + + uint32_t byte_i = i / data_per_byte; + uint32_t bit_i = (i % data_per_byte) * bit_len; + uint8_t mask = (1 << bit_len) - 1; + + data[byte_i] |= (val & mask) << bit_i; +} + +static inline uint8_t sign_extend(uint8_t val, uint32_t bit_len) +{ + int shift = 8 - bit_len; + return (int8_t)(val << shift) >> shift; +} + +static inline uint8_t decompress_one_data( + uint8_t data[], uint64_t i, compression_info_t *info) +{ + uint32_t bit_len = info->bit_length; + uint32_t data_per_byte = 8 / bit_len; + + uint32_t byte_i = i / data_per_byte; + uint32_t bit_i = (i % data_per_byte) * bit_len; + uint8_t mask = (1 << bit_len) - 1; + + uint8_t val = (data[byte_i] >> bit_i) & mask; + if (info->is_signed) + val = sign_extend(val, bit_len); + + return val; +} + +static inline void fill_data(uint8_t data[], uint8_t buf[], compression_info_t *info) +{ + int min = info->compressed_min; + int max = info->compressed_max; + + for (uint64_t i = 0; i < info->data_bytes; i++) + data[i] = 0; + + uint64_t nz_i = 0; + for (uint64_t i = 0; i < info->total_data_num; i++) { + int val = info->is_signed? 
(int8_t)buf[i]: buf[i]; + int res = saturate(val, max, min); + if (res != 0) { + compress_one_data(data, nz_i, res, info); + nz_i++; + } + } +} + +static inline compression_info_t make_compression_info( + uint8_t buf[], uint64_t size, uint32_t compress_md, int is_signed) +{ + uint32_t bit_length = compression_bit_length(compress_md); + + int min, max; + compute_compressed_range(bit_length, is_signed, &min, &max); + + uint32_t nz_num = count_non_zero_results(buf, size, is_signed, max, min); + assert(nz_num <= 0xffffff); + + compression_info_t info; + info.compress_md = compress_md; + info.bit_length = bit_length; + info.is_signed = is_signed; + info.total_data_num = size; + info.non_zero_data_num = nz_num; + info.header_bytes = 16; + info.map_bytes = compression_map_bytes(size); + info.data_bytes = compression_data_bytes(nz_num, bit_length); + info.total_bytes = info.header_bytes + info.map_bytes + info.data_bytes; + info.compressed_min = min; + info.compressed_max = max; + return info; +} + +static inline compression_info_t parse_compression_info( + uint8_t compressed_buf[], uint64_t max_size, uint64_t total_data_num) +{ + uint64_t header_bytes = 16; + assert(header_bytes <= max_size); + + int is_signed; + uint32_t compress_md, nz_num; + parse_header(*(uint32_t *)compressed_buf, &is_signed, &compress_md, &nz_num); + + uint32_t bit_length = compression_bit_length(compress_md); + int min, max; + compute_compressed_range(bit_length, is_signed, &min, &max); + + compression_info_t info; + info.compress_md = compress_md; + info.bit_length = compression_bit_length(compress_md); + info.is_signed = is_signed; + info.total_data_num = total_data_num; + info.non_zero_data_num = nz_num; + info.header_bytes = header_bytes; + info.map_bytes = compression_map_bytes(total_data_num); + info.data_bytes = compression_data_bytes(nz_num, info.bit_length); + info.total_bytes = header_bytes + info.map_bytes + info.data_bytes; + info.compressed_min = min; + info.compressed_max = max; + + assert(info.total_bytes <= max_size); + + return info; +} + +static inline uint8_t * compress( + uint8_t buf[], uint64_t size, uint32_t compress_md, int is_signed, compress_addr_info *compressed_data) +{ + compression_info_t info = + make_compression_info(buf, size, compress_md, is_signed); + + assert(info.total_bytes < 0x100000); + static uint8_t *result = new uint8_t[0x100000]; + uint32_t *hdr = (uint32_t *)result; + uint8_t *map = &result[info.header_bytes]; + uint8_t *data = &map[info.map_bytes]; + + fill_header(hdr, &info); + fill_map(map, buf, &info); + if (info.bit_length != 1) + fill_data(data, buf, &info); + + compressed_data->header_offset = 0; + compressed_data->header_size = 4; + compressed_data->map_offset = info.header_bytes; + compressed_data->map_size = compression_map_clear_bytes(info.total_data_num); + compressed_data->data_offset = info.map_bytes + info.header_bytes; + compressed_data->data_size = info.data_bytes; + compressed_data->total_size = info.total_bytes; + + return result; +} + +static inline void decompress( + uint8_t buf[], uint64_t size, uint8_t compressed_buf[], uint64_t max_size) +{ + compression_info_t info = + parse_compression_info(compressed_buf, max_size, size); + assert(info.total_bytes <= max_size); + assert(info.total_data_num == size); + + uint8_t *map = &compressed_buf[info.header_bytes]; + if (info.bit_length == 1) { + for (uint64_t i = 0; i < size; i++) { + uint8_t val = read_map_bit(map, i); + buf[i] = info.is_signed? 
sign_extend(val, 1): val; + } + } else { + uint8_t *data = &map[info.map_bytes]; + uint64_t data_i = 0; + for (uint64_t i = 0; i < size; i++) { + uint8_t val = read_map_bit(map, i); + if (val == 0) { + buf[i] = 0; + } else { + buf[i] = decompress_one_data(data, data_i, &info); + data_i++; + } + } + } +} + +#endif /* COMPRESSION_H */ diff --git a/cvikernel/include/bmkernel/bm1880v2/1880v2_fp_convert.h b/cvikernel/include/bmkernel/bm1880v2/1880v2_fp_convert.h new file mode 100755 index 000000000..b8c59acc2 --- /dev/null +++ b/cvikernel/include/bmkernel/bm1880v2/1880v2_fp_convert.h @@ -0,0 +1,338 @@ +#ifndef ATOMIC_FP_H_ +#define ATOMIC_FP_H_ + +#if __arm__ +#define __DISABLE_FENV__ +#endif + +#ifndef __DISABLE_FENV__ +#include +#endif +#include + +#ifdef __cplusplus +extern "C" { +#endif + +static inline uint8_t convert_bf16_u8(uint16_t data); +static inline uint8_t _convert_bf16_u8(uint16_t data, int int8_rnd_md); +static inline int8_t _convert_bf16_s8(uint16_t data, int int8_rnd_md); +static inline int8_t convert_bf16_s8(uint16_t data); +static inline uint16_t convert_int8_bf16(uint8_t data, uint8_t sign); +static inline uint32_t convert_fp32_u32(float fp32); +static inline uint32_t convert_fp32_hex(float val); +static inline float convert_hex_fp32(uint32_t hval); + +static inline float convert_bf16_fp32(uint16_t bf16); +static inline uint16_t convert_fp32_bf16(float fp32); + +static inline void f32_integer(void *if32, void *o_integer, int integer_size, int accumulate, int int8_signed, int int8_rnd_md); +//static inline void f32_integer(void *if32, void *o_integer, + // 0 for 32 bit , 1 for 16 bit , 2 for 8 bit +// int integer_size, int accumulate = 0, int int8_signed = 1, int int8_rnd_md = 0); + +union convert_type_float { + float fval; + uint16_t bf16[2]; + uint32_t ival; +}; + +typedef union convert_type_float convert_int_float; +static const uint16_t NAN_VALUE = 0x7FC0; + +//static int round_mode = 0; +static uint8_t float_isnan(const float x) { + //return isnan(x); + return x != x; +} + +static inline int set_store_feround() +{ +#ifndef __DISABLE_FENV__ + int round_mode = fegetround(); + fesetround(FE_TOWARDZERO); + return round_mode; +#else + return 0; +#endif +} + +static inline void restore_feround(int round_mode) +{ +#ifndef __DISABLE_FENV__ + fesetround(round_mode); +#else + (void)round_mode; +#endif +} + +static inline uint8_t _convert_bf16_u8(uint16_t data, int int8_rnd_md) +{ + /* convert bf16 to float32*/ + float fp32; + convert_int_float convert_val; + fp32 = convert_bf16_fp32(data); + /* convert float32 to uint8_t*/ + f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 0, int8_rnd_md); + return (uint8_t) convert_val.ival; +} + +static inline uint8_t convert_bf16_u8(uint16_t data) +{ + return (uint8_t) _convert_bf16_u8(data, 0); +} + +static inline int8_t _convert_bf16_s8(uint16_t data, int int8_rnd_md) +{ + /* convert bf16 to float32*/ + float fp32; + convert_int_float convert_val; + fp32 = convert_bf16_fp32(data); + /* convert float32 to uint8_t*/ + f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 1, int8_rnd_md); + return (int8_t) convert_val.ival; +} + +static inline int8_t convert_bf16_s8(uint16_t data) +{ + return (int8_t) _convert_bf16_s8(data, 0); +} + +static inline uint16_t convert_int8_bf16(uint8_t data, uint8_t sign) +{ + int32_t val = sign ? 
(int8_t) data : (uint8_t) data; + /* need to round to bf16 mode */ + return convert_fp32_bf16((float) val); +} + +static inline uint16_t convert_fp32_bf16(float fp32) +{ + if (float_isnan(fp32)) + return NAN_VALUE; + convert_int_float convert_val; + convert_val.fval = fp32; + uint32_t input = convert_val.ival; + uint32_t lsb = (input >> 16) & 1; + uint32_t rounding_bias = 0x7fff + lsb; + input += rounding_bias; + convert_val.bf16[1] = (uint16_t) (input >> 16); + + /* HW behavior */ + if ((convert_val.bf16[1] & 0x7f80) == 0x7f80) { + convert_val.bf16[1] = 0x7f7f; + } + return convert_val.bf16[1]; +} + +static inline uint8_t convert_fp32_u8(float fp32) +{ + convert_int_float convert_val; + f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 0, 0); + return (uint8_t) convert_val.ival; +} + +static inline int8_t convert_fp32_s8(float fp32) +{ + convert_int_float convert_val; + f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 1, 0); + return (int8_t) convert_val.ival; +} + +static inline uint32_t convert_fp32_u32(float fp32) +{ + convert_int_float convert_val; + f32_integer((void*)&fp32, &convert_val.ival, 0, 0, 0, 0); + return (uint32_t) convert_val.ival; +} + +static inline int32_t convert_fp32_s32(float fp32) +{ + convert_int_float convert_val; + f32_integer((void*)&fp32, &convert_val.ival, 0, 0, 1, 0); + return (int32_t) convert_val.ival; +} + +/* convert hex to float directly */ +static inline float convert_hex_fp32(uint32_t hval) +{ + convert_int_float convert_val; + convert_val.ival = hval; + return convert_val.fval; +} +/* convert float to hex directly */ +static inline uint32_t convert_fp32_hex(float val) +{ + convert_int_float convert_val; + convert_val.fval = val; + return convert_val.ival; +} +static inline float convert_bf16_fp32(uint16_t bf16) +{ + convert_int_float convert_val; + convert_val.bf16[1] = bf16; + convert_val.bf16[0] = 0; + return convert_val.fval; +} + +static inline void flt2int_flt(float x, unsigned long long* integer_part, float * sub_part, uint8_t sign) +{ + convert_int_float work_x; + int level_code; + unsigned long tail_code; + work_x.fval = x; + level_code = ((work_x.ival >> 23) & 0xff) - 127; + + //if the level code is negaive, the integer part of the float is zero + if ( level_code < 0 ){ + *integer_part = 0; + *sub_part = x; + } + else { + tail_code = (work_x.ival) & 0x7fffff; + tail_code = tail_code | 0x800000; + + if (level_code < 23){ + tail_code >>= (23 - level_code); + *integer_part = tail_code; + work_x.ival &= 0xffffffff << (23 - level_code); + *sub_part = x - work_x.fval; + } + else { + tail_code <<= (level_code - 23); + *integer_part = tail_code; + if(level_code>30){ + *integer_part = 0x7fffffff; + if(sign)*integer_part = 0x800000000; + } + *sub_part = 0; + } + } +} + +inline static int flt2int(float ifval, int int8_rnd_md) +{ + union { + float floatNum; + unsigned long intNum; + } tempIfval; + tempIfval.floatNum = ifval; + uint8_t isPositive = ((tempIfval.intNum & 0x80000000UL) == 0x80000000UL) ? 0 : 1 ; + float abs_fval = (!isPositive) ? 
-ifval : ifval; + double sub_part; + double integer; + unsigned long long integer_part; + //uint8_t sign = !isPositive; + //flt2int_flt(abs_fval, &integer_part, &sub_part, sign); + sub_part = modf((double)abs_fval, &integer); + integer_part = (unsigned long long)integer; + if (!isPositive) + { + unsigned long long result; + if(int8_rnd_md == 0) { // round to nearest even + if ( sub_part > 0.5f ) + { + result = integer_part + 1; + } + else if (sub_part == 0.5f) + { + if ( integer_part & 0x1 ) + { + result = integer_part + 1; + } + else + { + result = integer_part; + } + } + else + { + result = integer_part; + } + } else { //round to zero + result = integer_part; + } + if ( result > 0x80000000UL ) + { + result = 0x80000000UL; + } + return -result; + } + else + { + unsigned long long result; + if(int8_rnd_md == 0) { // round to nearest even + if ( sub_part > 0.5f ) + { + result = integer_part + 1; + } + else if ( sub_part == 0.5f ) + { + if ( integer_part & 0x1 ) + { + result = integer_part + 1; + } + else + { + result = integer_part; + } + } + else + { + result = integer_part; + } + } else { + result = integer_part; + } + if ( result > 0x7fffffff ) + { + result = 0x7fffffff; + } + return result; + } +} + +static inline void f32_integer(void *if32, void *o_integer, int integer_size, int accumulate, int int8_signed, int int8_rnd_md) +{ + int i_tmp; + float *f_tmp; + f_tmp = (float *)if32; + i_tmp = flt2int(*f_tmp, int8_rnd_md); + int *o32 = (int *)o_integer; + int dst_f32 = *o32; + short *o16 = (short *)o_integer; + short dst_o16 = *o32; + char *o8 = (char *)o_integer; + char dst_o8 = *o8; + + if (integer_size == 0) { + *o32 = i_tmp; + } else if (integer_size == 1) { + *o16 = i_tmp; + } else{ + *o8 = i_tmp; + int min = (int8_signed) ? -128 : 0; + int max = (int8_signed) ? 127 : 255; + if (i_tmp < min ){ + *o8 = min; + } + else if (i_tmp > max){ + *o8 = max; + } + //*o8 = i_tmp; + } + if (accumulate) { + if (integer_size == 0) { + *o32 += dst_f32; + } else if (integer_size == 1) { + *o16 += dst_o16; + } else + *o8 += dst_o8; + } +} + +#ifdef __cplusplus +} +#endif + +#endif /* ATOMIC_FP_H_ */ + diff --git a/cvikernel/include/bmkernel/bm1880v2/bm1880v2_tdma_reg.h b/cvikernel/include/bmkernel/bm1880v2/bm1880v2_tdma_reg.h new file mode 100644 index 000000000..b3834cb3b --- /dev/null +++ b/cvikernel/include/bmkernel/bm1880v2/bm1880v2_tdma_reg.h @@ -0,0 +1,301 @@ +#ifndef BM1880v2_TDMA_REG_V1_32_H +#define BM1880v2_TDMA_REG_V1_32_H + +/* + * This file is generated by tools. Do not edit it manually. 
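+ * Each TDMA descriptor is 16 32-bit words; parse_tdma_reg()/emit_tdma_reg()
+ * below convert between the packed words and the tdma_reg_t fields, and
+ * reset_tdma_reg() restores the default field values.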
+ */ + +#include +#include + +typedef unsigned long long ullong; + +typedef struct { + uint32_t vld; + uint32_t compress_en; + uint32_t eod; + uint32_t intp_en; + uint32_t bar_en; + uint32_t check_bf16_value; + uint32_t trans_dir; + uint32_t rsv00; + uint32_t trans_fmt; + uint32_t transpose_md; + uint32_t rsv01; + uint32_t outstanding_en; + uint32_t cmd_id; + uint32_t spec_func; + uint32_t dst_fmt; + uint32_t src_fmt; + uint32_t cmprs_fmt; + uint32_t sys_dtype; + uint32_t rsv2_1; + uint32_t int8_sign; + uint32_t compress_zero_guard; + uint32_t int8_rnd_mode; + uint32_t wait_id_tpu; + uint32_t wait_id_other_tdma; + uint32_t wait_id_sdma; + uint32_t const_val; + uint32_t src_base_reg_sel; + uint32_t mv_lut_idx; + uint32_t dst_base_reg_sel; + uint32_t mv_lut_base; + uint32_t rsv4_5; + uint32_t dst_h_stride; + uint32_t dst_c_stride_low; + uint32_t dst_n_stride; + uint32_t src_h_stride; + uint32_t src_c_stride_low; + uint32_t src_n_stride; + uint32_t dst_c; + uint32_t src_c; + uint32_t dst_w; + uint32_t dst_h; + uint32_t src_w; + uint32_t src_h; + uint32_t dst_base_addr_low; + uint32_t src_base_addr_low; + uint32_t src_n; + uint32_t dst_base_addr_high; + uint32_t src_base_addr_high; + uint32_t src_c_stride_high; + uint32_t dst_c_stride_high; + uint32_t compress_bias0; + uint32_t compress_bias1; + uint32_t layer_ID; +} tdma_reg_t; + +static inline void parse_tdma_reg(tdma_reg_t *r, const uint32_t *p) +{ + r->vld = p[0] & 1; + r->compress_en = (p[0] >> 1) & 1; + r->eod = (p[0] >> 2) & 1; + r->intp_en = (p[0] >> 3) & 1; + r->bar_en = (p[0] >> 4) & 1; + r->check_bf16_value = (p[0] >> 5) & 1; + r->trans_dir = (p[0] >> 6) & ((1u << 2) - 1); + r->rsv00 = (p[0] >> 8) & ((1u << 2) - 1); + r->trans_fmt = (p[0] >> 10) & 1; + r->transpose_md = (p[0] >> 11) & ((1u << 2) - 1); + r->rsv01 = (p[0] >> 13) & ((1u << 2) - 1); + r->outstanding_en = (p[0] >> 15) & 1; + r->cmd_id = (p[0] >> 16) & ((1u << 16) - 1); + r->spec_func = p[1] & ((1u << 3) - 1); + r->dst_fmt = (p[1] >> 3) & ((1u << 2) - 1); + r->src_fmt = (p[1] >> 5) & ((1u << 2) - 1); + r->cmprs_fmt = (p[1] >> 7) & 1; + r->sys_dtype = (p[1] >> 8) & 1; + r->rsv2_1 = (p[1] >> 9) & ((1u << 4) - 1); + r->int8_sign = (p[1] >> 13) & 1; + r->compress_zero_guard = (p[1] >> 14) & 1; + r->int8_rnd_mode = (p[1] >> 15) & 1; + r->wait_id_tpu = (p[1] >> 16) & ((1u << 16) - 1); + r->wait_id_other_tdma = p[2] & ((1u << 16) - 1); + r->wait_id_sdma = (p[2] >> 16) & ((1u << 16) - 1); + r->const_val = p[3] & ((1u << 16) - 1); + r->src_base_reg_sel = (p[3] >> 16) & ((1u << 3) - 1); + r->mv_lut_idx = (p[3] >> 19) & 1; + r->dst_base_reg_sel = (p[3] >> 20) & ((1u << 3) - 1); + r->mv_lut_base = (p[3] >> 23) & 1; + r->rsv4_5 = (p[3] >> 24) & ((1u << 8) - 1); + r->dst_h_stride = p[4] & ((1u << 16) - 1); + r->dst_c_stride_low = (p[4] >> 16) & ((1u << 16) - 1); + r->dst_n_stride = p[5]; + r->src_h_stride = p[6] & ((1u << 16) - 1); + r->src_c_stride_low = (p[6] >> 16) & ((1u << 16) - 1); + r->src_n_stride = p[7]; + r->dst_c = p[8] & ((1u << 16) - 1); + r->src_c = (p[8] >> 16) & ((1u << 16) - 1); + r->dst_w = p[9] & ((1u << 16) - 1); + r->dst_h = (p[9] >> 16) & ((1u << 16) - 1); + r->src_w = p[10] & ((1u << 16) - 1); + r->src_h = (p[10] >> 16) & ((1u << 16) - 1); + r->dst_base_addr_low = p[11]; + r->src_base_addr_low = p[12]; + r->src_n = p[13] & ((1u << 16) - 1); + r->dst_base_addr_high = (p[13] >> 16) & ((1u << 8) - 1); + r->src_base_addr_high = (p[13] >> 24) & ((1u << 8) - 1); + r->src_c_stride_high = p[14] & ((1u << 16) - 1); + r->dst_c_stride_high = (p[14] >> 16) & ((1u << 16) - 
1); + r->compress_bias0 = p[15] & ((1u << 8) - 1); + r->compress_bias1 = (p[15] >> 8) & ((1u << 8) - 1); + r->layer_ID = (p[15] >> 16) & ((1u << 16) - 1); +} + +static inline void emit_tdma_reg(const tdma_reg_t *r, uint32_t *_p) +{ + volatile uint32_t *p = (typeof(p))_p; + p[15] = (r->compress_bias0 & ((1u << 8) - 1)) | + ((r->compress_bias1 & ((1u << 8) - 1)) << 8) | + ((r->layer_ID & ((1u << 16) - 1)) << 16); + p[14] = (r->src_c_stride_high & ((1u << 16) - 1)) | + ((r->dst_c_stride_high & ((1u << 16) - 1)) << 16); + p[13] = (r->src_n & ((1u << 16) - 1)) | + ((r->dst_base_addr_high & ((1u << 8) - 1)) << 16) | + ((r->src_base_addr_high & ((1u << 8) - 1)) << 24); + p[12] = (r->src_base_addr_low & (((uint64_t)1 << 32) - 1)); + p[11] = (r->dst_base_addr_low & (((uint64_t)1 << 32) - 1)); + p[10] = (r->src_w & ((1u << 16) - 1)) | + ((r->src_h & ((1u << 16) - 1)) << 16); + p[9] = (r->dst_w & ((1u << 16) - 1)) | + ((r->dst_h & ((1u << 16) - 1)) << 16); + p[8] = (r->dst_c & ((1u << 16) - 1)) | + ((r->src_c & ((1u << 16) - 1)) << 16); + p[7] = (r->src_n_stride & (((uint64_t)1 << 32) - 1)); + p[6] = (r->src_h_stride & ((1u << 16) - 1)) | + ((r->src_c_stride_low & ((1u << 16) - 1)) << 16); + p[5] = (r->dst_n_stride & (((uint64_t)1 << 32) - 1)); + p[4] = (r->dst_h_stride & ((1u << 16) - 1)) | + ((r->dst_c_stride_low & ((1u << 16) - 1)) << 16); + p[3] = (r->const_val & ((1u << 16) - 1)) | + ((r->src_base_reg_sel & ((1u << 3) - 1)) << 16) | + ((r->mv_lut_idx & 1) << 19) | + ((r->dst_base_reg_sel & ((1u << 3) - 1)) << 20) | + ((r->mv_lut_base & 1) << 23) | + ((r->rsv4_5 & ((1u << 8) - 1)) << 24); + p[2] = (r->wait_id_other_tdma & ((1u << 16) - 1)) | + ((r->wait_id_sdma & ((1u << 16) - 1)) << 16); + p[1] = (r->spec_func & ((1u << 3) - 1)) | + ((r->dst_fmt & ((1u << 2) - 1)) << 3) | + ((r->src_fmt & ((1u << 2) - 1)) << 5) | + ((r->cmprs_fmt & 1) << 7) | + ((r->sys_dtype & 1) << 8) | + ((r->rsv2_1 & ((1u << 4) - 1)) << 9) | + ((r->int8_sign & 1) << 13) | + ((r->compress_zero_guard & 1) << 14) | + ((r->int8_rnd_mode & 1) << 15) | + ((r->wait_id_tpu & ((1u << 16) - 1)) << 16); + p[0] = (r->vld & 1) | + ((r->compress_en & 1) << 1) | + ((r->eod & 1) << 2) | + ((r->intp_en & 1) << 3) | + ((r->bar_en & 1) << 4) | + ((r->check_bf16_value & 1) << 5) | + ((r->trans_dir & ((1u << 2) - 1)) << 6) | + ((r->rsv00 & ((1u << 2) - 1)) << 8) | + ((r->trans_fmt & 1) << 10) | + ((r->transpose_md & ((1u << 2) - 1)) << 11) | + ((r->rsv01 & ((1u << 2) - 1)) << 13) | + ((r->outstanding_en & 1) << 15) | + ((r->cmd_id & ((1u << 16) - 1)) << 16); +} + +static inline void reset_tdma_reg(tdma_reg_t *r) +{ + r->vld = 0x0; + r->compress_en = 0x0; + r->eod = 0x0; + r->intp_en = 0x0; + r->bar_en = 0x0; + r->check_bf16_value = 0x0; + r->trans_dir = 0x0; + r->rsv00 = 0x0; + r->trans_fmt = 0x0; + r->transpose_md = 0x0; + r->rsv01 = 0x0; + r->outstanding_en = 0x0; + r->cmd_id = 0x0; + r->spec_func = 0x0; + r->dst_fmt = 0x1; + r->src_fmt = 0x1; + r->cmprs_fmt = 0x0; + r->sys_dtype = 0x0; + r->rsv2_1 = 0x0; + r->int8_sign = 0x0; + r->compress_zero_guard = 0x0; + r->int8_rnd_mode = 0x0; + r->wait_id_tpu = 0x0; + r->wait_id_other_tdma = 0x0; + r->wait_id_sdma = 0x0; + r->const_val = 0x0; + r->src_base_reg_sel = 0x0; + r->mv_lut_idx = 0x0; + r->dst_base_reg_sel = 0x0; + r->mv_lut_base = 0x0; + r->rsv4_5 = 0x0; + r->dst_h_stride = 0x1; + r->dst_c_stride_low = 0x1; + r->dst_n_stride = 0x1; + r->src_h_stride = 0x1; + r->src_c_stride_low = 0x1; + r->src_n_stride = 0x1; + r->dst_c = 0x1; + r->src_c = 0x1; + r->dst_w = 0x1; + r->dst_h = 0x1; + r->src_w = 
0x1; + r->src_h = 0x1; + r->dst_base_addr_low = 0x0; + r->src_base_addr_low = 0x0; + r->src_n = 0x1; + r->dst_base_addr_high = 0x0; + r->src_base_addr_high = 0x0; + r->src_c_stride_high = 0x0; + r->dst_c_stride_high = 0x0; + r->compress_bias0 = 0x0; + r->compress_bias1 = 0x0; + r->layer_ID = 0x0; +} + +static inline void trace_tdma_reg(tdma_reg_t *r, const char *tag) +{ +#define trace_one_reg(name) \ + printf(" %s: 0x%llx\n", #name, (ullong)r->name) + + printf("--- %s ---\n", tag); + trace_one_reg(vld); + trace_one_reg(compress_en); + trace_one_reg(eod); + trace_one_reg(intp_en); + trace_one_reg(bar_en); + trace_one_reg(check_bf16_value); + trace_one_reg(trans_dir); + trace_one_reg(rsv00); + trace_one_reg(trans_fmt); + trace_one_reg(transpose_md); + trace_one_reg(rsv01); + trace_one_reg(outstanding_en); + trace_one_reg(cmd_id); + trace_one_reg(spec_func); + trace_one_reg(dst_fmt); + trace_one_reg(src_fmt); + trace_one_reg(cmprs_fmt); + trace_one_reg(sys_dtype); + trace_one_reg(rsv2_1); + trace_one_reg(int8_sign); + trace_one_reg(compress_zero_guard); + trace_one_reg(int8_rnd_mode); + trace_one_reg(wait_id_tpu); + trace_one_reg(wait_id_other_tdma); + trace_one_reg(wait_id_sdma); + trace_one_reg(const_val); + trace_one_reg(src_base_reg_sel); + trace_one_reg(mv_lut_idx); + trace_one_reg(dst_base_reg_sel); + trace_one_reg(mv_lut_base); + trace_one_reg(rsv4_5); + trace_one_reg(dst_h_stride); + trace_one_reg(dst_c_stride_low); + trace_one_reg(dst_n_stride); + trace_one_reg(src_h_stride); + trace_one_reg(src_c_stride_low); + trace_one_reg(src_n_stride); + trace_one_reg(dst_c); + trace_one_reg(src_c); + trace_one_reg(dst_w); + trace_one_reg(dst_h); + trace_one_reg(src_w); + trace_one_reg(src_h); + trace_one_reg(dst_base_addr_low); + trace_one_reg(src_base_addr_low); + trace_one_reg(src_n); + trace_one_reg(dst_base_addr_high); + trace_one_reg(src_base_addr_high); + trace_one_reg(src_c_stride_high); + trace_one_reg(dst_c_stride_high); + trace_one_reg(compress_bias0); + trace_one_reg(compress_bias1); + trace_one_reg(layer_ID); +} +#endif /* BM1880v2_TDMA_REG_V1_32_H */ diff --git a/cvikernel/include/bmkernel/bm1880v2/bm1880v2_tiu_reg.h b/cvikernel/include/bmkernel/bm1880v2/bm1880v2_tiu_reg.h new file mode 100644 index 000000000..39ea07af9 --- /dev/null +++ b/cvikernel/include/bmkernel/bm1880v2/bm1880v2_tiu_reg.h @@ -0,0 +1,574 @@ +#ifndef BM1880v2_TIU_REG_V2_11_H +#define BM1880v2_TIU_REG_V2_11_H + +/* + * This file is generated by tools. Do not edit it manually. 
+ */ + +#include +#include + +typedef uint8_t uint8_t; +typedef uint64_t uint64_t; +typedef unsigned long long ullong; + +typedef struct { + uint32_t cmd_en; + uint32_t cmd_end; + uint32_t cmd_id_en; + uint32_t cmd_id_tpu; + uint32_t cmd_id_gdma; + uint32_t cmd_keep; + uint32_t cmd_intr_en; + uint32_t tsk_typ; + uint32_t tsk_eu_typ; + uint32_t tsk_opd_num; + uint32_t opt_right_shift; + uint32_t opt_left_shift; + uint32_t opt_shift_typ; + uint32_t opt_rshift_typ; + uint32_t opt_res_add; + uint32_t opt_relu; + uint32_t opt_left_tran; + uint32_t opt_chl_quan; + uint32_t tens_mdsum; + uint32_t tens_lookup; + uint32_t opt_res0_sign; + uint32_t opt_opd0_sign; + uint32_t opt_opd1_sign; + uint32_t opt_opd2_sign; + uint32_t opt_res0_int8; + uint32_t opt_opd0_int8; + uint32_t opt_opd1_int8; + uint32_t opt_opd2_int8; + uint32_t opt_opd0_const; + uint32_t opt_opd1_const; + uint32_t opt_opd2_const; + uint32_t short_nchwstr_same; + uint32_t short_res0_str; + uint32_t short_opd0_str; + uint32_t short_opd1_str; + uint32_t short_opd2_str; + uint32_t conv_opd0_x_ins0; + uint32_t conv_opd0_y_ins0; + uint32_t conv_opd0_x_ins0_last; + uint32_t conv_opd0_y_ins0_last; + uint32_t conv_opd1_x_ins0; + uint32_t conv_opd1_y_ins0; + uint32_t opd0_ins_val; + uint32_t ps32_md; + uint32_t double_conv; + uint32_t rsvd0; + uint32_t res0_n; + uint32_t res0_c; + uint32_t res0_h; + uint32_t res0_w; + uint32_t res0_addr; + uint32_t opd0_addr; + uint32_t opd1_addr; + uint32_t rsvd1; + uint32_t opd2_addr; + uint32_t opd0_c; + uint32_t opd0_h; + uint32_t opd0_w; + uint32_t opd1_h; + uint32_t opd1_w; + uint32_t conv_opd0_up_pad; + uint32_t conv_opd0_dn_pad; + uint32_t conv_opd0_lf_pad; + uint32_t conv_opd0_rt_pad; + uint32_t conv_op_x_str; + uint32_t conv_op_y_str; + uint32_t opd0_ins_fp; + uint32_t rsvd2; + uint32_t opd0_n; + uint32_t opd1_n; + uint32_t opd1_c; + uint32_t opd2_n; + uint32_t opd2_c; + uint32_t opd2_h; + uint32_t opd2_w; + uint32_t quan_m; + uint32_t opd_typ; + uint32_t fp_round_typ; + uint32_t rsvd7; + uint32_t rsvd3; + uint32_t res0_n_str; + uint32_t res0_c_str; + uint32_t res0_h_str; + uint32_t res0_w_str; + uint32_t res0_b_str; + uint32_t opd0_n_str; + uint32_t opd0_c_str; + uint32_t rsvd4; + uint32_t opd0_h_str; + uint32_t opd0_w_str; + uint32_t opd0_b_str; + uint32_t opd1_n_str; + uint32_t opd1_c_str; + uint32_t opd1_h_str; + uint32_t opd1_w_str; + uint32_t rsvd5; + uint32_t opd1_b_str; + uint32_t opd2_n_str; + uint32_t opd2_c_str; + uint32_t opd2_h_str; + uint32_t opd2_w_str; + uint32_t opd2_b_str; + uint32_t layer_info; + uint32_t rsvd6; +} tiu_reg_t; + +static inline void parse_tiu_reg(tiu_reg_t *r, const uint32_t *p) +{ + r->cmd_en = p[0] & 1; + r->cmd_end = (p[0] >> 1) & 1; + r->cmd_id_en = (p[0] >> 2) & 1; + r->cmd_id_tpu = (p[0] >> 3) & ((1u << 16) - 1); + r->cmd_id_gdma = (p[0] >> 19) & ((1u << 13) - 1); + r->cmd_id_gdma |= (uint64_t)(p[1] & ((1u << 3) - 1)) << 13; + r->cmd_keep = (p[1] >> 3) & 1; + r->cmd_intr_en = (p[1] >> 4) & 1; + r->tsk_typ = (p[1] >> 5) & ((1u << 4) - 1); + r->tsk_eu_typ = (p[1] >> 9) & ((1u << 8) - 1); + r->tsk_opd_num = (p[1] >> 17) & ((1u << 2) - 1); + r->opt_right_shift = (p[1] >> 19) & ((1u << 5) - 1); + r->opt_left_shift = (p[1] >> 24) & ((1u << 5) - 1); + r->opt_shift_typ = (p[1] >> 29) & 1; + r->opt_rshift_typ = (p[1] >> 30) & 1; + r->opt_res_add = (p[1] >> 31) & 1; + r->opt_relu = p[2] & 1; + r->opt_left_tran = (p[2] >> 1) & 1; + r->opt_chl_quan = (p[2] >> 2) & 1; + r->tens_mdsum = (p[2] >> 3) & 1; + r->tens_lookup = (p[2] >> 4) & 1; + r->opt_res0_sign = (p[2] >> 5) & 
1; + r->opt_opd0_sign = (p[2] >> 6) & 1; + r->opt_opd1_sign = (p[2] >> 7) & 1; + r->opt_opd2_sign = (p[2] >> 8) & 1; + r->opt_res0_int8 = (p[2] >> 9) & 1; + r->opt_opd0_int8 = (p[2] >> 10) & 1; + r->opt_opd1_int8 = (p[2] >> 11) & 1; + r->opt_opd2_int8 = (p[2] >> 12) & 1; + r->opt_opd0_const = (p[2] >> 13) & 1; + r->opt_opd1_const = (p[2] >> 14) & 1; + r->opt_opd2_const = (p[2] >> 15) & 1; + r->short_nchwstr_same = (p[2] >> 16) & 1; + r->short_res0_str = (p[2] >> 17) & ((1u << 2) - 1); + r->short_opd0_str = (p[2] >> 19) & ((1u << 2) - 1); + r->short_opd1_str = (p[2] >> 21) & ((1u << 2) - 1); + r->short_opd2_str = (p[2] >> 23) & ((1u << 2) - 1); + r->conv_opd0_x_ins0 = (p[2] >> 25) & ((1u << 4) - 1); + r->conv_opd0_y_ins0 = (p[2] >> 29) & ((1u << 3) - 1); + r->conv_opd0_y_ins0 |= (uint64_t)(p[3] & 1) << 3; + r->conv_opd0_x_ins0_last = (p[3] >> 1) & ((1u << 4) - 1); + r->conv_opd0_y_ins0_last = (p[3] >> 5) & ((1u << 4) - 1); + r->conv_opd1_x_ins0 = (p[3] >> 9) & ((1u << 4) - 1); + r->conv_opd1_y_ins0 = (p[3] >> 13) & ((1u << 4) - 1); + r->opd0_ins_val = (p[3] >> 17) & ((1u << 8) - 1); + r->ps32_md = (p[3] >> 25) & ((1u << 2) - 1); + r->double_conv = (p[3] >> 27) & 1; + r->rsvd0 = (p[3] >> 28) & ((1u << 4) - 1); + r->res0_n = p[4] & ((1u << 12) - 1); + r->res0_c = (p[4] >> 12) & ((1u << 12) - 1); + r->res0_h = (p[4] >> 24) & ((1u << 8) - 1); + r->res0_h |= (uint64_t)(p[5] & ((1u << 4) - 1)) << 8; + r->res0_w = (p[5] >> 4) & ((1u << 12) - 1); + r->res0_addr = (p[5] >> 16) & ((1u << 16) - 1); + r->res0_addr |= (uint64_t)(p[6] & ((1u << 8) - 1)) << 16; + r->opd0_addr = (p[6] >> 8) & ((1u << 24) - 1); + r->opd1_addr = p[7] & ((1u << 16) - 1); + r->rsvd1 = (p[7] >> 16) & ((1u << 16) - 1); + r->opd2_addr = p[8] & ((1u << 16) - 1); + r->opd0_c = (p[8] >> 16) & ((1u << 12) - 1); + r->opd0_h = (p[8] >> 28) & ((1u << 4) - 1); + r->opd0_h |= (uint64_t)(p[9] & ((1u << 8) - 1)) << 4; + r->opd0_w = (p[9] >> 8) & ((1u << 12) - 1); + r->opd1_h = (p[9] >> 20) & ((1u << 12) - 1); + r->opd1_w = p[10] & ((1u << 12) - 1); + r->conv_opd0_up_pad = (p[10] >> 12) & ((1u << 4) - 1); + r->conv_opd0_dn_pad = (p[10] >> 16) & ((1u << 4) - 1); + r->conv_opd0_lf_pad = (p[10] >> 20) & ((1u << 4) - 1); + r->conv_opd0_rt_pad = (p[10] >> 24) & ((1u << 4) - 1); + r->conv_op_x_str = (p[10] >> 28) & ((1u << 4) - 1); + r->conv_op_y_str = p[11] & ((1u << 4) - 1); + r->opd0_ins_fp = (p[11] >> 4) & ((1u << 16) - 1); + r->rsvd2 = (p[11] >> 20) & ((1u << 12) - 1); + r->opd0_n = p[12] & ((1u << 12) - 1); + r->opd1_n = (p[12] >> 12) & ((1u << 12) - 1); + r->opd1_c = (p[12] >> 24) & ((1u << 8) - 1); + r->opd1_c |= (uint64_t)(p[13] & ((1u << 4) - 1)) << 8; + r->opd2_n = (p[13] >> 4) & ((1u << 12) - 1); + r->opd2_c = (p[13] >> 16) & ((1u << 12) - 1); + r->opd2_h = (p[13] >> 28) & ((1u << 4) - 1); + r->opd2_h |= (uint64_t)(p[14] & ((1u << 8) - 1)) << 4; + r->opd2_w = (p[14] >> 8) & ((1u << 12) - 1); + r->quan_m = (p[14] >> 20) & ((1u << 12) - 1); + r->quan_m |= (uint64_t)(p[15] & ((1u << 20) - 1)) << 12; + r->opd_typ = (p[15] >> 20) & 1; + r->fp_round_typ = (p[15] >> 21) & ((1u << 3) - 1); + r->rsvd7 = (p[15] >> 24) & ((1u << 4) - 1); + r->rsvd3 = (p[15] >> 28) & ((1u << 4) - 1); + r->res0_n_str = p[16] & ((1u << 16) - 1); + r->res0_c_str = (p[16] >> 16) & ((1u << 16) - 1); + r->res0_h_str = p[17] & ((1u << 16) - 1); + r->res0_w_str = (p[17] >> 16) & ((1u << 16) - 1); + r->res0_b_str = p[18] & ((1u << 16) - 1); + r->opd0_n_str = (p[18] >> 16) & ((1u << 16) - 1); + r->opd0_c_str = p[19] & ((1u << 16) - 1); + r->rsvd4 = (p[19] >> 16) & ((1u << 
16) - 1); + r->opd0_h_str = p[20] & ((1u << 16) - 1); + r->opd0_w_str = (p[20] >> 16) & ((1u << 16) - 1); + r->opd0_b_str = p[21] & ((1u << 16) - 1); + r->opd1_n_str = (p[21] >> 16) & ((1u << 16) - 1); + r->opd1_c_str = p[22] & ((1u << 16) - 1); + r->opd1_h_str = (p[22] >> 16) & ((1u << 16) - 1); + r->opd1_w_str = p[23] & ((1u << 16) - 1); + r->rsvd5 = (p[23] >> 16) & ((1u << 16) - 1); + r->opd1_b_str = p[24] & ((1u << 16) - 1); + r->opd2_n_str = (p[24] >> 16) & ((1u << 16) - 1); + r->opd2_c_str = p[25] & ((1u << 16) - 1); + r->opd2_h_str = (p[25] >> 16) & ((1u << 16) - 1); + r->opd2_w_str = p[26] & ((1u << 16) - 1); + r->opd2_b_str = (p[26] >> 16) & ((1u << 16) - 1); + r->layer_info = p[27] & ((1u << 28) - 1); + r->rsvd6 = (p[27] >> 28) & ((1u << 4) - 1); +} + +static inline void emit_tiu_reg(const tiu_reg_t *r, uint32_t *_p) +{ + volatile uint32_t *p = (typeof(p))_p; + p[27] = (r->layer_info & ((1u << 28) - 1)) | + ((r->rsvd6 & ((1u << 4) - 1)) << 28); + p[26] = (r->opd2_w_str & ((1u << 16) - 1)) | + ((r->opd2_b_str & ((1u << 16) - 1)) << 16); + p[25] = (r->opd2_c_str & ((1u << 16) - 1)) | + ((r->opd2_h_str & ((1u << 16) - 1)) << 16); + p[24] = (r->opd1_b_str & ((1u << 16) - 1)) | + ((r->opd2_n_str & ((1u << 16) - 1)) << 16); + p[23] = (r->opd1_w_str & ((1u << 16) - 1)) | + ((r->rsvd5 & ((1u << 16) - 1)) << 16); + p[22] = (r->opd1_c_str & ((1u << 16) - 1)) | + ((r->opd1_h_str & ((1u << 16) - 1)) << 16); + p[21] = (r->opd0_b_str & ((1u << 16) - 1)) | + ((r->opd1_n_str & ((1u << 16) - 1)) << 16); + p[20] = (r->opd0_h_str & ((1u << 16) - 1)) | + ((r->opd0_w_str & ((1u << 16) - 1)) << 16); + p[19] = (r->opd0_c_str & ((1u << 16) - 1)) | + ((r->rsvd4 & ((1u << 16) - 1)) << 16); + p[18] = (r->res0_b_str & ((1u << 16) - 1)) | + ((r->opd0_n_str & ((1u << 16) - 1)) << 16); + p[17] = (r->res0_h_str & ((1u << 16) - 1)) | + ((r->res0_w_str & ((1u << 16) - 1)) << 16); + p[16] = (r->res0_n_str & ((1u << 16) - 1)) | + ((r->res0_c_str & ((1u << 16) - 1)) << 16); + p[15] = ((r->quan_m >> 12) & ((1u << 20) - 1)) | + ((r->opd_typ & 1) << 20) | + ((r->fp_round_typ & ((1u << 3) - 1)) << 21) | + ((r->rsvd7 & ((1u << 4) - 1)) << 24) | + ((r->rsvd3 & ((1u << 4) - 1)) << 28); + p[14] = ((r->opd2_h >> 4) & ((1u << 8) - 1)) | + ((r->opd2_w & ((1u << 12) - 1)) << 8) | + ((r->quan_m & ((1u << 12) - 1)) << 20); + p[13] = ((r->opd1_c >> 8) & ((1u << 4) - 1)) | + ((r->opd2_n & ((1u << 12) - 1)) << 4) | + ((r->opd2_c & ((1u << 12) - 1)) << 16) | + ((r->opd2_h & ((1u << 4) - 1)) << 28); + p[12] = (r->opd0_n & ((1u << 12) - 1)) | + ((r->opd1_n & ((1u << 12) - 1)) << 12) | + ((r->opd1_c & ((1u << 8) - 1)) << 24); + p[11] = (r->conv_op_y_str & ((1u << 4) - 1)) | + ((r->opd0_ins_fp & ((1u << 16) - 1)) << 4) | + ((r->rsvd2 & ((1u << 12) - 1)) << 20); + p[10] = (r->opd1_w & ((1u << 12) - 1)) | + ((r->conv_opd0_up_pad & ((1u << 4) - 1)) << 12) | + ((r->conv_opd0_dn_pad & ((1u << 4) - 1)) << 16) | + ((r->conv_opd0_lf_pad & ((1u << 4) - 1)) << 20) | + ((r->conv_opd0_rt_pad & ((1u << 4) - 1)) << 24) | + ((r->conv_op_x_str & ((1u << 4) - 1)) << 28); + p[9] = ((r->opd0_h >> 4) & ((1u << 8) - 1)) | + ((r->opd0_w & ((1u << 12) - 1)) << 8) | + ((r->opd1_h & ((1u << 12) - 1)) << 20); + p[8] = (r->opd2_addr & ((1u << 16) - 1)) | + ((r->opd0_c & ((1u << 12) - 1)) << 16) | + ((r->opd0_h & ((1u << 4) - 1)) << 28); + p[7] = (r->opd1_addr & ((1u << 16) - 1)) | + ((r->rsvd1 & ((1u << 16) - 1)) << 16); + p[6] = ((r->res0_addr >> 16) & ((1u << 8) - 1)) | + ((r->opd0_addr & ((1u << 24) - 1)) << 8); + p[5] = ((r->res0_h >> 8) & ((1u << 4) - 1)) | + 
((r->res0_w & ((1u << 12) - 1)) << 4) | + ((r->res0_addr & ((1u << 16) - 1)) << 16); + p[4] = (r->res0_n & ((1u << 12) - 1)) | + ((r->res0_c & ((1u << 12) - 1)) << 12) | + ((r->res0_h & ((1u << 8) - 1)) << 24); + p[3] = ((r->conv_opd0_y_ins0 >> 3) & 1) | + ((r->conv_opd0_x_ins0_last & ((1u << 4) - 1)) << 1) | + ((r->conv_opd0_y_ins0_last & ((1u << 4) - 1)) << 5) | + ((r->conv_opd1_x_ins0 & ((1u << 4) - 1)) << 9) | + ((r->conv_opd1_y_ins0 & ((1u << 4) - 1)) << 13) | + ((r->opd0_ins_val & ((1u << 8) - 1)) << 17) | + ((r->ps32_md & ((1u << 2) - 1)) << 25) | + ((r->double_conv & 1) << 27) | + ((r->rsvd0 & ((1u << 4) - 1)) << 28); + p[2] = (r->opt_relu & 1) | + ((r->opt_left_tran & 1) << 1) | + ((r->opt_chl_quan & 1) << 2) | + ((r->tens_mdsum & 1) << 3) | + ((r->tens_lookup & 1) << 4) | + ((r->opt_res0_sign & 1) << 5) | + ((r->opt_opd0_sign & 1) << 6) | + ((r->opt_opd1_sign & 1) << 7) | + ((r->opt_opd2_sign & 1) << 8) | + ((r->opt_res0_int8 & 1) << 9) | + ((r->opt_opd0_int8 & 1) << 10) | + ((r->opt_opd1_int8 & 1) << 11) | + ((r->opt_opd2_int8 & 1) << 12) | + ((r->opt_opd0_const & 1) << 13) | + ((r->opt_opd1_const & 1) << 14) | + ((r->opt_opd2_const & 1) << 15) | + ((r->short_nchwstr_same & 1) << 16) | + ((r->short_res0_str & ((1u << 2) - 1)) << 17) | + ((r->short_opd0_str & ((1u << 2) - 1)) << 19) | + ((r->short_opd1_str & ((1u << 2) - 1)) << 21) | + ((r->short_opd2_str & ((1u << 2) - 1)) << 23) | + ((r->conv_opd0_x_ins0 & ((1u << 4) - 1)) << 25) | + ((r->conv_opd0_y_ins0 & ((1u << 3) - 1)) << 29); + p[1] = ((r->cmd_id_gdma >> 13) & ((1u << 3) - 1)) | + ((r->cmd_keep & 1) << 3) | + ((r->cmd_intr_en & 1) << 4) | + ((r->tsk_typ & ((1u << 4) - 1)) << 5) | + ((r->tsk_eu_typ & ((1u << 8) - 1)) << 9) | + ((r->tsk_opd_num & ((1u << 2) - 1)) << 17) | + ((r->opt_right_shift & ((1u << 5) - 1)) << 19) | + ((r->opt_left_shift & ((1u << 5) - 1)) << 24) | + ((r->opt_shift_typ & 1) << 29) | + ((r->opt_rshift_typ & 1) << 30) | + ((r->opt_res_add & 1) << 31); + p[0] = (r->cmd_en & 1) | + ((r->cmd_end & 1) << 1) | + ((r->cmd_id_en & 1) << 2) | + ((r->cmd_id_tpu & ((1u << 16) - 1)) << 3) | + ((r->cmd_id_gdma & ((1u << 13) - 1)) << 19); +} + +static inline void reset_tiu_reg(tiu_reg_t *r) +{ + r->cmd_en = 0b0; + r->cmd_end = 0b0; + r->cmd_id_en = 0b0; + r->cmd_id_tpu = 0; + r->cmd_id_gdma = 0; + r->cmd_keep = 0b0; + r->cmd_intr_en = 0b0; + r->tsk_typ = 0; + r->tsk_eu_typ = 0; + r->tsk_opd_num = 0b11; + r->opt_right_shift = 10; + r->opt_left_shift = 2; + r->opt_shift_typ = 0b1; + r->opt_rshift_typ = 0b1; + r->opt_res_add = 0b0; + r->opt_relu = 0b1; + r->opt_left_tran = 0b0; + r->opt_chl_quan = 0b0; + r->tens_mdsum = 0b0; + r->tens_lookup = 0b0; + r->opt_res0_sign = 0b0; + r->opt_opd0_sign = 0b0; + r->opt_opd1_sign = 0b1; + r->opt_opd2_sign = 0b1; + r->opt_res0_int8 = 0b1; + r->opt_opd0_int8 = 0b1; + r->opt_opd1_int8 = 0b1; + r->opt_opd2_int8 = 0b0; + r->opt_opd0_const = 0b0; + r->opt_opd1_const = 0b0; + r->opt_opd2_const = 0b0; + r->short_nchwstr_same = 0b0; + r->short_res0_str = 0b00; + r->short_opd0_str = 0b00; + r->short_opd1_str = 0b00; + r->short_opd2_str = 0b00; + r->conv_opd0_x_ins0 = 0; + r->conv_opd0_y_ins0 = 0; + r->conv_opd0_x_ins0_last = 0; + r->conv_opd0_y_ins0_last = 0; + r->conv_opd1_x_ins0 = 0; + r->conv_opd1_y_ins0 = 0; + r->opd0_ins_val = 0; + r->ps32_md = 0; + r->double_conv = 0; + r->rsvd0 = 0; + r->res0_n = 1; + r->res0_c = 1; + r->res0_h = 1; + r->res0_w = 16; + r->res0_addr = 0; + r->opd0_addr = 0; + r->opd1_addr = 0; + r->rsvd1 = 0; + r->opd2_addr = 0; + r->opd0_c = 1; + r->opd0_h = 1; + 
r->opd0_w = 16; + r->opd1_h = 1; + r->opd1_w = 16; + r->conv_opd0_up_pad = 0; + r->conv_opd0_dn_pad = 0; + r->conv_opd0_lf_pad = 0; + r->conv_opd0_rt_pad = 0; + r->conv_op_x_str = 1; + r->conv_op_y_str = 1; + r->opd0_ins_fp = 0; + r->rsvd2 = 0; + r->opd0_n = 1; + r->opd1_n = 1; + r->opd1_c = 1; + r->opd2_n = 1; + r->opd2_c = 1; + r->opd2_h = 1; + r->opd2_w = 16; + r->quan_m = 0; + r->opd_typ = 0; + r->fp_round_typ = 0; + r->rsvd7 = 0; + r->rsvd3 = 0; + r->res0_n_str = 16; + r->res0_c_str = 16; + r->res0_h_str = 0; + r->res0_w_str = 1; + r->res0_b_str = 16; + r->opd0_n_str = 16; + r->opd0_c_str = 16; + r->rsvd4 = 0; + r->opd0_h_str = 0; + r->opd0_w_str = 1; + r->opd0_b_str = 16; + r->opd1_n_str = 16; + r->opd1_c_str = 16; + r->opd1_h_str = 0; + r->opd1_w_str = 1; + r->rsvd5 = 0; + r->opd1_b_str = 16; + r->opd2_n_str = 16; + r->opd2_c_str = 16; + r->opd2_h_str = 0; + r->opd2_w_str = 1; + r->opd2_b_str = 16; + r->layer_info = 0; + r->rsvd6 = 0; +} + +static inline void trace_tiu_reg(tiu_reg_t *r, const char *tag) +{ +#define trace_one_reg(name) \ + printf(" %s: 0x%llx\n", #name, (ullong)r->name) + + printf("--- %s ---\n", tag); + trace_one_reg(cmd_en); + trace_one_reg(cmd_end); + trace_one_reg(cmd_id_en); + trace_one_reg(cmd_id_tpu); + trace_one_reg(cmd_id_gdma); + trace_one_reg(cmd_keep); + trace_one_reg(cmd_intr_en); + trace_one_reg(tsk_typ); + trace_one_reg(tsk_eu_typ); + trace_one_reg(tsk_opd_num); + trace_one_reg(opt_right_shift); + trace_one_reg(opt_left_shift); + trace_one_reg(opt_shift_typ); + trace_one_reg(opt_rshift_typ); + trace_one_reg(opt_res_add); + trace_one_reg(opt_relu); + trace_one_reg(opt_left_tran); + trace_one_reg(opt_chl_quan); + trace_one_reg(tens_mdsum); + trace_one_reg(tens_lookup); + trace_one_reg(opt_res0_sign); + trace_one_reg(opt_opd0_sign); + trace_one_reg(opt_opd1_sign); + trace_one_reg(opt_opd2_sign); + trace_one_reg(opt_res0_int8); + trace_one_reg(opt_opd0_int8); + trace_one_reg(opt_opd1_int8); + trace_one_reg(opt_opd2_int8); + trace_one_reg(opt_opd0_const); + trace_one_reg(opt_opd1_const); + trace_one_reg(opt_opd2_const); + trace_one_reg(short_nchwstr_same); + trace_one_reg(short_res0_str); + trace_one_reg(short_opd0_str); + trace_one_reg(short_opd1_str); + trace_one_reg(short_opd2_str); + trace_one_reg(conv_opd0_x_ins0); + trace_one_reg(conv_opd0_y_ins0); + trace_one_reg(conv_opd0_x_ins0_last); + trace_one_reg(conv_opd0_y_ins0_last); + trace_one_reg(conv_opd1_x_ins0); + trace_one_reg(conv_opd1_y_ins0); + trace_one_reg(opd0_ins_val); + trace_one_reg(ps32_md); + trace_one_reg(double_conv); + trace_one_reg(rsvd0); + trace_one_reg(res0_n); + trace_one_reg(res0_c); + trace_one_reg(res0_h); + trace_one_reg(res0_w); + trace_one_reg(res0_addr); + trace_one_reg(opd0_addr); + trace_one_reg(opd1_addr); + trace_one_reg(rsvd1); + trace_one_reg(opd2_addr); + trace_one_reg(opd0_c); + trace_one_reg(opd0_h); + trace_one_reg(opd0_w); + trace_one_reg(opd1_h); + trace_one_reg(opd1_w); + trace_one_reg(conv_opd0_up_pad); + trace_one_reg(conv_opd0_dn_pad); + trace_one_reg(conv_opd0_lf_pad); + trace_one_reg(conv_opd0_rt_pad); + trace_one_reg(conv_op_x_str); + trace_one_reg(conv_op_y_str); + trace_one_reg(opd0_ins_fp); + trace_one_reg(rsvd2); + trace_one_reg(opd0_n); + trace_one_reg(opd1_n); + trace_one_reg(opd1_c); + trace_one_reg(opd2_n); + trace_one_reg(opd2_c); + trace_one_reg(opd2_h); + trace_one_reg(opd2_w); + trace_one_reg(quan_m); + trace_one_reg(opd_typ); + trace_one_reg(fp_round_typ); + trace_one_reg(rsvd7); + trace_one_reg(rsvd3); + trace_one_reg(res0_n_str); + 
trace_one_reg(res0_c_str); + trace_one_reg(res0_h_str); + trace_one_reg(res0_w_str); + trace_one_reg(res0_b_str); + trace_one_reg(opd0_n_str); + trace_one_reg(opd0_c_str); + trace_one_reg(rsvd4); + trace_one_reg(opd0_h_str); + trace_one_reg(opd0_w_str); + trace_one_reg(opd0_b_str); + trace_one_reg(opd1_n_str); + trace_one_reg(opd1_c_str); + trace_one_reg(opd1_h_str); + trace_one_reg(opd1_w_str); + trace_one_reg(rsvd5); + trace_one_reg(opd1_b_str); + trace_one_reg(opd2_n_str); + trace_one_reg(opd2_c_str); + trace_one_reg(opd2_h_str); + trace_one_reg(opd2_w_str); + trace_one_reg(opd2_b_str); + trace_one_reg(layer_info); + trace_one_reg(rsvd6); +} +#endif /* BM1880v2_TIU_REG_V2_11_H */ diff --git a/cvikernel/include/bmkernel/bm1880v2/bm1880v2_tpu_cfg.h b/cvikernel/include/bmkernel/bm1880v2/bm1880v2_tpu_cfg.h new file mode 100644 index 000000000..7a06f9ff2 --- /dev/null +++ b/cvikernel/include/bmkernel/bm1880v2/bm1880v2_tpu_cfg.h @@ -0,0 +1,37 @@ +#ifndef __BM1880V2_TPU_CFG__ +#define __BM1880V2_TPU_CFG__ + +#define BM1880V2_VER 18802 +#define BM1880V2_HW_NPU_SHIFT 5 +#define BM1880V2_HW_EU_SHIFT 4 +#define BM1880V2_HW_LMEM_SHIFT 15 +#define BM1880V2_HW_LMEM_BANKS 8 +#define BM1880V2_HW_LMEM_BANK_SIZE 0x1000 +#define BM1880V2_HW_NODE_CHIP_SHIFT 0 +#define BM1880V2_HW_NPU_NUM (1 << BM1880V2_HW_NPU_SHIFT) +#define BM1880V2_HW_EU_NUM (1 << BM1880V2_HW_EU_SHIFT) +#define BM1880V2_HW_LMEM_SIZE (1 << BM1880V2_HW_LMEM_SHIFT) +#define BM1880V2_HW_NODE_CHIP_NUM (1 << BM1880V2_HW_NODE_CHIP_SHIFT) + +#if (BM1880V2_HW_LMEM_SIZE != (BM1880V2_HW_LMEM_BANK_SIZE * BM1880V2_HW_LMEM_BANKS)) +#error "Set wrong TPU configuration." +#endif +
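+/* For reference, the derived values above work out to BM1880V2_HW_NPU_NUM = 32, BM1880V2_HW_EU_NUM = 16 and BM1880V2_HW_LMEM_SIZE = 32 KiB (8 banks of 0x1000 bytes each); the #if check only guards that the bank layout matches the total local memory size. */ +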
+#define BM1880V2_GLOBAL_MEM_START_ADDR 0x100000000 +#define BM1880V2_GLOBAL_MEM_SIZE 0x100000000 + +#define BM1880V2_GLOBAL_TIU_CMDBUF_ADDR 0x00000000 +#define BM1880V2_GLOBAL_TDMA_CMDBUF_ADDR 0x01400000 +#define BM1880V2_GLOBAL_TIU_CMDBUF_RESERVED_SIZE 0x01400000 +#define BM1880V2_GLOBAL_TDMA_CMDBUF_RESERVED_SIZE 0x01400000 +#define BM1880V2_GLOBAL_POOL_RESERVED_SIZE (BM1880V2_GLOBAL_MEM_SIZE - BM1880V2_GLOBAL_TIU_CMDBUF_RESERVED_SIZE - BM1880V2_GLOBAL_TDMA_CMDBUF_RESERVED_SIZE) + +#define BM1880V2_UART_CTLR_BASE_ADDR 0x04140000 + +#define BM1880V2_TDMA_ENGINE_BASE_ADDR 0x0C100000 +#define BM1880V2_TDMA_ENGINE_END_ADDR (BM1880V2_TDMA_ENGINE_BASE_ADDR + 0x1000) + +#define BM1880V2_TIU_ENGINE_BASE_ADDR 0x0C101000 //"NPS Register" in memory map? +#define BM1880V2_TIU_ENGINE_END_ADDR (BM1880V2_TIU_ENGINE_BASE_ADDR + 0x1000) + +#endif diff --git a/cvikernel/include/bmkernel/bm1880v2/bm_vlc_compress.h b/cvikernel/include/bmkernel/bm1880v2/bm_vlc_compress.h new file mode 100644 index 000000000..30bab084a --- /dev/null +++ b/cvikernel/include/bmkernel/bm1880v2/bm_vlc_compress.h @@ -0,0 +1,708 @@ +#ifndef __BM_VLC_COMPRESS_H__ +#define __BM_VLC_COMPRESS_H__ +#include +#include +#ifdef __cplusplus +extern "C" +{ +#endif + +#define MAX_UNARY_FIELD_SIZE 47 +#define MAX_ORDER_K 5 + + /** + * \data_type 0 means 8bit, 1 means 16bit + */ + static inline size_t get_out_bs_buf_size(uint64_t in_size, uint8_t data_type) { + size_t blk_num = (data_type) ? ((in_size + 31) >> 5) : ((in_size + 15) >> 4); + size_t in_size_pad = blk_num << (4 + data_type); + size_t bs_buf_size = in_size_pad + (ceiling_func(blk_num, 16) << 4) + 16; + return bs_buf_size; + } + + typedef struct + { + uint8_t signedness; + uint8_t is_bfloat16; + uint8_t bias0; + uint8_t bias1; + uint8_t zero_guard_en; + } CommandInfo; + typedef struct + { + uint8_t *stream; // stream buffer pointer + int bit_pos; // current pointer (in bit) + int buf_size; // in byte + } StreamBuffer; + +static inline int8_t two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1); +static inline int8_t inv_two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1); +static inline uint8_t center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard); +static inline uint8_t inv_center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard); + +static inline void init_stream(StreamBuffer *bs, const uint8_t *buf, int buf_size, uint8_t read_only); + +static inline void bm_vlc_est_weight_bias(const uint8_t *ibuf, size_t isz, uint8_t signedness, uint8_t isBfloat16, CommandInfo *cmd_info); +static inline void bm_vlc_enc_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info); +static inline void bm_vlc_dec_int8_ext(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *bs_size); +static inline void bm_vlc_dec_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf); +static inline void bm_vlc_enc_bf16(const uint16_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info); +static inline void bm_vlc_dec_bf16_ext(const uint8_t *ibuf, size_t isz, uint16_t *obuf, size_t *bs_size); +static inline void bm_vlc_dec_bf16(const uint8_t *ibuf, size_t isz, uint16_t *obuf); + +static inline uint8_t get_bit_val(uint8_t *buf, int byte_idx, int bit_idx) + { + return (buf[byte_idx] >> bit_idx) & 0x1; + } + +static inline uint8_t sign_to_unsign(uint8_t val) + { + uint8_t sign_i = (val >> 7) & 0x1; + int abs_data_i = abs(((int8_t)val)); + return ((abs_data_i << 1) - sign_i); + } + +static inline int8_t unsign_to_sign(uint8_t val) + { + uint8_t sign_i = val & 0x1; + int abs_data_i = (((int)val) + 1) >> 1; + return (uint8_t)((sign_i == 1) ? 
(-abs_data_i) : abs_data_i); + } + +static inline void dispatch_bf16_data(const uint16_t *bf16_in, uint8_t *exp, uint8_t *frac, size_t isz) +{ + for (size_t i = 0; i < isz; i++) + { + exp[i] = (uint8_t)((bf16_in[i] >> 7) & 0xFF); + frac[i] = (uint8_t)(((bf16_in[i] >> 15) << 7) | (bf16_in[i] & 0x7F)); + } +} + +static inline void merge_bf16_data(const uint8_t *exp_in, const uint8_t *frac_in, uint16_t *bf16_out, size_t isz) +{ + memset(bf16_out, 0, sizeof(uint16_t)); + for (size_t i = 0; i < isz; i++) + { + bf16_out[i] = ((frac_in[i] >> 7) << 15) | (exp_in[i] << 7) | (frac_in[i] & 0x7F); + } +} + +// -- streaming operation handler -- +static inline void init_stream(StreamBuffer *bs, const uint8_t *buf, int buf_size, uint8_t read_only) +{ + bs->bit_pos = 0; + bs->stream = (uint8_t *)buf; + bs->buf_size = buf_size; + if (!read_only) + memset((uint8_t *)buf, 0, sizeof(uint8_t) * buf_size); +} + +static inline void write_stream(StreamBuffer *bs, uint8_t *src, int bit_len) +{ + for (int bit = 0; bit < bit_len; bit++) + { + int src_byte_i = bit / 8; + int src_bit_i = bit % 8; + int dest_byte_i = (bs->bit_pos + bit) / 8; + int dest_bit_i = (bs->bit_pos + bit) % 8; + bs->stream[dest_byte_i] |= (get_bit_val(src, src_byte_i, src_bit_i) << dest_bit_i); + } + bs->bit_pos += bit_len; +} + +static inline void move_stream_ptr(StreamBuffer *bs, int bit_len) +{ + bs->bit_pos += bit_len; +} + +static inline void parse_stream(StreamBuffer *bs, uint8_t *dest, int bit_len) +{ + memset(dest, 0, sizeof(uint8_t) * (bit_len + 7) >> 3); + for (int bit = 0; bit < bit_len; bit++) + { + int dest_byte_i = bit / 8; + int dest_bit_i = bit % 8; + int bs_byte_i = (bs->bit_pos + bit) / 8; + int bs_bit_i = (bs->bit_pos + bit) % 8; + dest[dest_byte_i] |= (get_bit_val(bs->stream, bs_byte_i, bs_bit_i) << dest_bit_i); + } + bs->bit_pos += bit_len; +} + +// -- header read/write operation handler -- +static inline void vlc_enc_header(StreamBuffer *bs_header, CommandInfo *cmd_info, size_t blk_bs_size) +{ + write_stream(bs_header, (uint8_t *)&blk_bs_size, 24); // bit[23:0] compressed block stream size + move_stream_ptr(bs_header, 4); // bit[27:24] reserved + write_stream(bs_header, (uint8_t *)&cmd_info->signedness, 1); // bit[28] signedness + write_stream(bs_header, (uint8_t *)&cmd_info->is_bfloat16, 1); // bit[29] data type + move_stream_ptr(bs_header, 2); // bit[31:30] bit depth + write_stream(bs_header, (uint8_t *)&cmd_info->bias0, 8); // bit[39:32] bias0 for symbol remapping + write_stream(bs_header, (uint8_t *)&cmd_info->bias1, 7); // bit[46:40] bias1 for symbol remapping + write_stream(bs_header, (uint8_t *)&cmd_info->zero_guard_en, 1); // bit[47] zero guard +} + +static inline void vlc_dec_header(StreamBuffer *bs_header, CommandInfo *cmd_info) +{ + move_stream_ptr(bs_header, 28); // bit[27:24] reserved + parse_stream(bs_header, (uint8_t *)&cmd_info->signedness, 1); // bit[28] signedness + parse_stream(bs_header, (uint8_t *)&cmd_info->is_bfloat16, 1); // bit[29] data type + move_stream_ptr(bs_header, 2); + parse_stream(bs_header, (uint8_t *)&cmd_info->bias0, 8); // bit[39:32] bias0 for symbol remapping + parse_stream(bs_header, (uint8_t *)&cmd_info->bias1, 7); // bit[46:40] bias1 for symbol remapping + parse_stream(bs_header, (uint8_t *)&cmd_info->zero_guard_en, 1); // bit[47] zero guard +} + +static inline void vlc_dec_header_ext(StreamBuffer *bs_header, CommandInfo *cmd_info, size_t *bs_size) +{ + parse_stream(bs_header, (uint8_t *)bs_size, 24); // bit[23:0] compressed block stream size + move_stream_ptr(bs_header, 4); // 
bit[27:24] reserved + parse_stream(bs_header, (uint8_t *)&cmd_info->signedness, 1); // bit[28] signedness + parse_stream(bs_header, (uint8_t *)&cmd_info->is_bfloat16, 1); // bit[29] data type + move_stream_ptr(bs_header, 2); + parse_stream(bs_header, (uint8_t *)&cmd_info->bias0, 8); // bit[39:32] bias0 for symbol remapping + parse_stream(bs_header, (uint8_t *)&cmd_info->bias1, 7); // bit[46:40] bias1 for symbol remapping + parse_stream(bs_header, (uint8_t *)&cmd_info->zero_guard_en, 1); // bit[47] zero guard +} + +// -- symbol remapping handler -- +static inline uint8_t center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard) +{ + if (val == 0 && zero_guard) + return 0; + + int16_t shift_data_i = val - bias; + uint8_t range = (bias <= 128) ? bias : 255 - bias; + if (bias <= 128) + { + return (val >= (range << 1)) ? val : sign_to_unsign(shift_data_i) + zero_guard; + } + else + { + return (val < (bias - range)) ? (range + bias - val + zero_guard) : (sign_to_unsign(shift_data_i) + zero_guard); + } +} + +static inline uint8_t inv_center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard) +{ + if (val == 0 && zero_guard) + return 0; + + uint8_t unsign_data_i = val - zero_guard; + uint8_t range = (bias <= 128) ? bias : 255 - bias; + if (bias <= 128) + { + return (val >= (range << 1)) ? val : unsign_to_sign(unsign_data_i) + bias; + } + else + { + return (unsign_data_i > (range << 1)) ? (range + bias - val + zero_guard) : unsign_to_sign(unsign_data_i) + bias; + } +} + +static inline int8_t two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1) +{ + if (val == 0) + return 0; + + uint8_t sign = (val < 0) ? true : false; + int32_t abs_val = abs(val); + abs_val -= (sign) ? bias1 : bias0; + abs_val += (abs_val <= 0) ? (127 + sign) : 0; + return (sign) ? -abs_val : abs_val; +} + +static inline int8_t inv_two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1) +{ + if (val == 0) + return 0; + + uint8_t sign = (val < 0) ? true : false; + uint32_t abs_val = abs(val); + abs_val += (sign) ? bias1 : bias0; + int32_t abs_val_minus = abs_val - (127 + sign); + uint8_t abs_val_lsb = ((abs_val_minus <= 0) + ? abs_val + : abs_val_minus) & + 0xFF; + return (sign) ? 
-abs_val_lsb : abs_val_lsb; +} + +static inline void symbol_remapping(uint8_t *blk_in, uint8_t *blk_out, uint8_t bias0, uint8_t bias1, uint8_t signedness, uint8_t is_bf16_exp, uint8_t zero_guard) +{ + if (is_bf16_exp == false && signedness == false) + { + // remapping bypass + memcpy(blk_out, blk_in, sizeof(uint8_t) * 16); + return; + } + + if (is_bf16_exp == true) + { + // center circular shift + for (int i = 0; i < 16; i++) + { + blk_out[i] = center_shift(blk_in[i], bias0, zero_guard); + } + } + else + { + // two-side circular shift + for (int i = 0; i < 16; i++) + { + int8_t shift_data_i = two_side_circular_shift((int8_t)blk_in[i], bias0, bias1); + blk_out[i] = sign_to_unsign(shift_data_i); + } + } +} + +static inline void inv_symbol_remapping(uint8_t *blk_in, uint8_t *blk_out, uint8_t bias0, uint8_t bias1, uint8_t signedness, uint8_t is_bf16_exp, uint8_t zero_guard) +{ + if (is_bf16_exp == false && signedness == false) + { + // remapping bypass + memcpy(blk_out, blk_in, sizeof(uint8_t) * 16); + return; + } + + if (is_bf16_exp == true) + { + // center circular shift + for (int i = 0; i < 16; i++) + { + blk_out[i] = inv_center_shift(blk_in[i], bias0, zero_guard); + } + } + else + { + // two-side circular shift + for (int i = 0; i < 16; i++) + { + int8_t sign_data_i = unsign_to_sign(blk_in[i]); + blk_out[i] = (uint8_t)inv_two_side_circular_shift(sign_data_i, bias0, bias1); + } + } +} + +static inline int vlc_estimate_block_order(uint8_t *blk_in, uint8_t bf16_zvc_en) +{ + int best_k = 0; + int best_bs_size = 0x7FFFFFFF; + + for (int k = 0; k <= (int)MAX_ORDER_K; k++) + { + uint8_t remain_field_size = k << 4; + int unary_field_len = 0; + for (int i = 0; i < 16; i++) + { + uint8_t group_idx = blk_in[i] >> k; + unary_field_len += (group_idx + 1); + } + int znum_bit = (bf16_zvc_en && k > 0) ? 4 : 0; + int blk_size = (unary_field_len <= MAX_UNARY_FIELD_SIZE) + ? remain_field_size + unary_field_len + znum_bit + : 255; + if (blk_size < best_bs_size) + { + best_k = k; + best_bs_size = blk_size; + } + } + + best_k = (best_bs_size > 128) ? 
-1 : best_k; + return best_k; +} +// -- vlc block parallel GR encode/decode -- +static inline uint8_t vlc_gr_enc_block_data(uint8_t *blk_in, StreamBuffer *bs, int order_k, uint8_t bf16_zvc_en) +{ + // uncompressed mode + if (order_k == -1) + { + write_stream(bs, blk_in, 128); + return 128; + } + + // remain field + uint8_t remain_field[16] = {0}; + uint8_t unary_field[8] = {0}; + uint8_t sym_end_pos[16] = {0}; + uint8_t unary_field_len = 0; + int sym_end_pos_accum = -1; + + // bit plane encode for remain field + for (int k = 0; k < order_k; k++) + { + uint8_t bit_plane0 = 0, bit_plane1 = 0; + for (int i = 0; i < 8; i++) + { + bit_plane0 |= (get_bit_val(blk_in, i, k) << i); + bit_plane1 |= (get_bit_val(blk_in, i + 8, k) << i); + } + remain_field[k << 1] = bit_plane0; + remain_field[(k << 1) + 1] = bit_plane1; + } + write_stream(bs, remain_field, order_k << 4); + + if (bf16_zvc_en && order_k > 0) + { + int zero_num = 0; + for (int i = 0; i < 16; i++) + { + if (blk_in[i] == 0) + zero_num++; + } + assert(zero_num < 16); + write_stream(bs, (uint8_t *)&zero_num, 4); + } + + // unary encode for unary field + for (int i = 0; i < 16; i++) + { + int group_idx = blk_in[i] >> order_k; + sym_end_pos_accum += (group_idx + 1); + sym_end_pos[i] = sym_end_pos_accum; + int byte_idx = sym_end_pos[i] / 8; + int bit_idx = sym_end_pos[i] % 8; + unary_field[byte_idx] |= (1 << (bit_idx)); + } + unary_field_len = sym_end_pos[15] + 1; + assert(unary_field_len <= MAX_UNARY_FIELD_SIZE); + uint8_t ulen = (unary_field_len - 16) & 0x1F; + write_stream(bs, unary_field, unary_field_len); + + return ulen; +} +
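+/* Note on the per-block kmap byte packed by the entry functions below: bits[7:5] hold the GR order k (value 7, i.e. 0xE0, marks an uncompressed 128-bit block) and bits[4:0] hold ulen = unary_field_len - 16, from which the decoder rebuilds the block bitstream size as 16*k + ulen + 16 bits (plus a 4-bit zero count when zero guard is enabled and k > 0). */ +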
+static inline void vlc_gr_dec_block_data(StreamBuffer *bs, uint8_t bs_size, uint8_t *rec, int order_k, uint8_t bf16_zvc_en) +{ + assert(bs_size <= 128); + // uncompressed mode + if (order_k == -1) + { + parse_stream(bs, rec, 128); + return; + } + + // remain field + uint8_t remain_data[16] = {0}; + uint8_t remain_bs[16] = {0}; + uint8_t unary_field[8] = {0}; + uint8_t sym_end_pos[16] = {0}; + uint8_t unary_sym[16] = {0}; + uint8_t remain_field_size = order_k << 4; + + parse_stream(bs, remain_bs, remain_field_size); + // bit plane decode for remain field + for (int k = 0; k < order_k; k++) + { + for (int i = 0; i < 8; i++) + { + remain_data[i] |= (get_bit_val(remain_bs, k << 1, i) << k); + remain_data[i + 8] |= (get_bit_val(remain_bs, (k << 1) + 1, i) << k); + } + } + + // zero number info + int znum_bit = (bf16_zvc_en && order_k > 0) ? 4 : 0; + uint8_t znum = 0; + parse_stream(bs, &znum, znum_bit); + + // unary decode for unary field + uint8_t unary_field_len = bs_size - remain_field_size - znum_bit; + parse_stream(bs, unary_field, unary_field_len); + + int sym_cnt = 0; + for (uint8_t ubit_i = 0; ubit_i < unary_field_len; ubit_i++) + { + int byte_idx = ubit_i / 8; + int bit_idx = ubit_i % 8; + if (get_bit_val(unary_field, byte_idx, bit_idx) == 1) + { + sym_end_pos[sym_cnt] = ubit_i; + sym_cnt++; + } + } + unary_sym[0] = sym_end_pos[0]; + for (int i = 1; i < 16; i++) + { + unary_sym[i] = sym_end_pos[i] - sym_end_pos[i - 1] - 1; + } + for (int i = 0; i < 16; i++) + { + rec[i] = (unary_sym[i] << order_k) + remain_data[i]; + } +} + +// -- vlc encode int8 entry function -- +static inline void bm_vlc_enc_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info) +{ + StreamBuffer bs_header, bs_kmap, bs_data; + size_t blk_num = (isz + 15) >> 4; + size_t header_size = 16; + size_t kmap_size = ceiling_func(blk_num, 16) << 4; + size_t bs_buf_size = header_size + kmap_size + (blk_num << 4); + uint8_t *bsbuf = (uint8_t *)calloc(bs_buf_size, sizeof(uint8_t)); + + // block encode + init_stream(&bs_kmap, bsbuf + header_size, kmap_size, false); + init_stream(&bs_data, bsbuf + header_size + kmap_size, blk_num << 4, false); + + for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++) + { + uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0}; + size_t in_size = (blk_idx == (blk_num - 1)) ? isz - (blk_idx << 4) : 16; + memcpy(blk_data, &ibuf[blk_idx << 4], sizeof(uint8_t) * in_size); + + symbol_remapping(blk_data, blk_sr_data, cmd_info->bias0, cmd_info->bias1, cmd_info->signedness, false, false); + + int k = vlc_estimate_block_order(blk_sr_data, false); + uint8_t ulen = vlc_gr_enc_block_data(blk_sr_data, &bs_data, k, false); + uint8_t k_info = (k == -1) ? 0xE0 : (k << 5) + ulen; + write_stream(&bs_kmap, &k_info, 8); + } + + int blk_bs_size = ceiling_func(((bs_data.bit_pos + 7) >> 3), 16) << 4; // 16 byte align + *osz = header_size + kmap_size + blk_bs_size; + + // write header + init_stream(&bs_header, bsbuf, header_size, false); + vlc_enc_header(&bs_header, cmd_info, blk_bs_size); + + memcpy(obuf, bsbuf, (*osz) * sizeof(uint8_t)); + free(bsbuf); +} +
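+/* A minimal usage sketch of the int8 round trip (illustration only; src, n and dst stand for caller-managed buffers and the uncompressed size): + * + *   CommandInfo cmd_info = {0}; + *   bm_vlc_est_weight_bias(src, n, 1, 0, &cmd_info);        // signed int8 data + *   uint8_t *bs = (uint8_t *)malloc(get_out_bs_buf_size(n, 0)); + *   size_t bs_sz = 0; + *   bm_vlc_enc_int8(src, n, bs, &bs_sz, &cmd_info);          // bs_sz = compressed size + *   bm_vlc_dec_int8(bs, n, dst);                             // dst receives the original n bytes + *   free(bs); + */ +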
+// -- vlc decode int8 entry function -- +static inline void bm_vlc_dec_int8_ext(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *bs_size) +{ + StreamBuffer bs_header, bs_kmap, bs_data; + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + + size_t blk_num = (isz + 15) >> 4; + int header_size = 16; + int kmap_size = ceiling_func(blk_num, 16) << 4; + + // parse header + init_stream(&bs_header, ibuf, header_size, true); + vlc_dec_header_ext(&bs_header, &cmd_info, bs_size); + + // Check whether valid header + size_t bs_buf_size = get_out_bs_buf_size(isz, 0); // int8 + ASSERT(*bs_size <= bs_buf_size); + ASSERT(cmd_info.is_bfloat16 == 0); + + // block decode + init_stream(&bs_kmap, ibuf + header_size, kmap_size, true); + init_stream(&bs_data, ibuf + header_size + kmap_size, blk_num << 4, true); + + for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++) + { + uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0}; + uint8_t k_info = 0; + parse_stream(&bs_kmap, &k_info, 8); + uint8_t ulen = k_info & 0x1F; + int k = (k_info >> 5 == 7) ? -1 : k_info >> 5; + int blk_bs_size = (k == -1) ? 128 : (k << 4) + ulen + 16; + vlc_gr_dec_block_data(&bs_data, blk_bs_size, blk_data, k, false); + + inv_symbol_remapping(blk_data, blk_sr_data, cmd_info.bias0, cmd_info.bias1, cmd_info.signedness, false, false); + + int out_size = (blk_idx == (blk_num - 1)) ? isz - (blk_idx << 4) : 16; + memcpy(&obuf[blk_idx << 4], blk_sr_data, sizeof(uint8_t) * out_size); + } +} + +static inline void bm_vlc_dec_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf) +{ + size_t bs_size; + bm_vlc_dec_int8_ext(ibuf, isz, obuf, &bs_size); +} + +// -- vlc encode bfloat16 entry function -- +static inline void bm_vlc_enc_bf16(const uint16_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info) +{ + StreamBuffer bs_header, bs_kmap, bs_data; + size_t blk_num = (isz + 31) >> 5; // 32 bytes per block + size_t header_size = 16; + size_t kmap_size = ceiling_func(blk_num, 16) << 4; + size_t bs_buf_size = header_size + kmap_size + (blk_num << 5); + uint8_t *bsbuf = (uint8_t *)calloc(bs_buf_size, sizeof(uint8_t)); + + // block encode + init_stream(&bs_kmap, bsbuf + header_size, kmap_size, false); + init_stream(&bs_data, bsbuf + header_size + kmap_size, blk_num << 5, false); + + for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++) + { + uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0}, blk_data_frac[16] = {0}; + size_t in_num = (blk_idx == (blk_num - 1)) ? ((isz >> 1) - (blk_idx << 4)) : 16; + dispatch_bf16_data(&ibuf[blk_idx << 4], blk_data, blk_data_frac, in_num); + + // exp: BGR encode + symbol_remapping(blk_data, blk_sr_data, cmd_info->bias0, cmd_info->bias1, false, true, cmd_info->zero_guard_en); + + int k = vlc_estimate_block_order(blk_sr_data, cmd_info->zero_guard_en); + uint8_t ulen = vlc_gr_enc_block_data(blk_sr_data, &bs_data, k, cmd_info->zero_guard_en); + uint8_t k_info = (k == -1) ? 0xE0 : (k << 5) + ulen; + write_stream(&bs_kmap, &k_info, 8); + + // frac: implicit zero compression + for (size_t i = 0; i < 16; i++) + { + if (!cmd_info->zero_guard_en || blk_data[i] != 0) + { + write_stream(&bs_data, &blk_data_frac[i], 8); + } + } + } + + int blk_bs_size = ceiling_func(((bs_data.bit_pos + 7) >> 3), 16) << 4; // 16 byte align + *osz = header_size + kmap_size + blk_bs_size; + + // write header + init_stream(&bs_header, bsbuf, header_size, false); + vlc_enc_header(&bs_header, cmd_info, blk_bs_size); + + memcpy(obuf, bsbuf, (*osz) * sizeof(uint8_t)); + free(bsbuf); +} + +// -- vlc decode bfloat16 entry function -- +static inline void bm_vlc_dec_bf16_ext(const uint8_t *ibuf, size_t isz, uint16_t *obuf, size_t *bs_size) +{ + StreamBuffer bs_header, bs_kmap, bs_data; + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + + size_t blk_num = (isz + 31) >> 5; // 32 bytes per block + int header_size = 16; + int kmap_size = ceiling_func(blk_num, 16) << 4; + + // parse header + init_stream(&bs_header, ibuf, header_size, true); + vlc_dec_header_ext(&bs_header, &cmd_info, bs_size); + + // Check whether valid header + size_t bs_buf_size = get_out_bs_buf_size(isz, 1); // bf16 + ASSERT(*bs_size <= bs_buf_size); + ASSERT(cmd_info.is_bfloat16 == 1); + + // block decode + init_stream(&bs_kmap, ibuf + header_size, kmap_size, true); + init_stream(&bs_data, ibuf + header_size + kmap_size, blk_num << 5, true); + + for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++) + { + uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0}, blk_data_frac[16] = {0}; + uint8_t k_info = 0; + parse_stream(&bs_kmap, &k_info, 8); + uint8_t ulen = k_info & 0x1F; + int k = (k_info >> 5 == 7) ? 
-1 : k_info >> 5; + int znum_bit = (cmd_info.zero_guard_en && k > 0) ? 4 : 0; + uint8_t blk_bs_size = (k == -1) ? 128 : (k << 4) + ulen + 16 + znum_bit; + + // exp: BGR decode + vlc_gr_dec_block_data(&bs_data, blk_bs_size, blk_data, k, cmd_info.zero_guard_en); + + inv_symbol_remapping(blk_data, blk_sr_data, cmd_info.bias0, cmd_info.bias1, false, true, cmd_info.zero_guard_en); + + size_t out_num = (blk_idx == (blk_num - 1)) ? ((isz >> 1) - (blk_idx << 4)) : 16; + + // frac: implicit zero compression + for (size_t i = 0; i < out_num; i++) + { + if (!cmd_info.zero_guard_en || blk_sr_data[i] != 0) + { + parse_stream(&bs_data, &blk_data_frac[i], 8); + } + } + merge_bf16_data(blk_sr_data, blk_data_frac, &obuf[blk_idx << 4], out_num); + } +} + +static inline void bm_vlc_dec_bf16(const uint8_t *ibuf, size_t isz, uint16_t *obuf) +{ + size_t bs_size; + bm_vlc_dec_bf16_ext(ibuf, isz, obuf, &bs_size); +} + +// -- offline estimate model weight params -- +static inline void bm_vlc_est_weight_bias(const uint8_t *ibuf, size_t isz, uint8_t signedness, uint8_t isBfloat16, CommandInfo *cmd_info) +{ + assert(!(isBfloat16 && signedness)); // WARNING: signedness MUST be 0 as isBfloat16==True + + cmd_info->is_bfloat16 = isBfloat16; + if (isBfloat16 == false && signedness == true) + { + // two-side circular shift + int hist[256] = {0}; + for (size_t i = 0; i < isz; i++) + { + hist[ibuf[i]]++; + } + + int8_t pos_v = 1; + //while (pos_v < 128) + // comparison is always true due to limited range of data type [-Werror=type-limits] + while (true) + { + if (hist[((uint8_t)pos_v)] == 0) + { + pos_v++; + } + else + { + break; + } + } + //cmd_info->bias0 = (pos_v > 1 && pos_v < 128) ? (pos_v - 1) : 0; + // comparison is always true due to limited range of data type [-Werror=type-limits] + cmd_info->bias0 = (pos_v > 1) ? (pos_v - 1) : 0; + int8_t neg_v = -1; + //while (neg_v >= (-128)) // comparison is always true due to limited range of data type [-Werror=type-limits] + while (true) + { + if (hist[(uint8_t)neg_v] == 0) + { + neg_v--; + } + else + { + break; + } + } + //cmd_info->bias1 = (neg_v < -1 && neg_v >= -128) ? abs(neg_v + 1) : 0; + // comparison is always true due to limited range of data type [-Werror=type-limits] + cmd_info->bias1 = (neg_v < -1) ? abs(neg_v + 1) : 0; + cmd_info->signedness = true; + } + + if (isBfloat16 == true) + { + // center shift + int64_t exp_accum = 0; + uint16_t *bf16_in = (uint16_t *)ibuf; + size_t inum = (isz >> 1), cnt = 0; + for (size_t i = 0; i < inum; i++) + { + uint8_t exp = ((bf16_in[i] >> 7) & 0xFF); + if (exp != 0) + { + exp_accum += exp; + cnt++; + } + } + if (cnt > 0) + { + cmd_info->bias0 = (uint8_t)((exp_accum / (float)cnt) + 0.5); + } + cmd_info->zero_guard_en = (inum == cnt) ? 
false : true; + cmd_info->signedness = false; + } +} + #ifdef __cplusplus +} +#endif + +#endif /* __BM_VLC_COMPRESS_H__ */ diff --git a/cvikernel/include/bmkernel/bm1880v2/bmkernel_1880v2.h b/cvikernel/include/bmkernel/bm1880v2/bmkernel_1880v2.h new file mode 100644 index 000000000..494e817f3 --- /dev/null +++ b/cvikernel/include/bmkernel/bm1880v2/bmkernel_1880v2.h @@ -0,0 +1,1042 @@ +#ifndef __BMKERNEL_1880v2_H__ +#define __BMKERNEL_1880v2_H__ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define BMK1880v2_TIU 0 // Tensor Instruction Unit +#define BMK1880v2_CPU 1 // CPU, Reserved for common cpu op +#define BMK1880v2_TDMA 2 // TPU DMA +#define BMK1880v2_ENGINE_NUM 3 // Number of Engines + +typedef struct bmk_context bmk1880v2_context_t; +typedef struct bmk_context bmk_context_t; +typedef cvk_chip_info_t bmk1880v2_chip_info_t; + +typedef struct ec_desc bmk1880v2_op_t; + +bmk1880v2_context_t * bmk1880v2_register(bmk_info_t *info); +void bmk1880v2_cleanup(bmk1880v2_context_t *ctx); +void bmk1880v2_reset(bmk1880v2_context_t *ctx); +uint8_t *bmk1880v2_acquire_cmdbuf(bmk1880v2_context_t *ctx, uint32_t *size); +void bmk1880v2_dmabuf_size(uint8_t *cmdbuf, uint32_t sz, uint32_t *psize, uint32_t *pmu_size); +void bmk1880v2_dmabuf_relocate( + uint8_t *dmabuf, uint64_t dmabuf_devaddr, + uint32_t original_size, uint32_t pmubuf_size); +void bmk1880v2_dmabuf_convert(uint8_t *cmdbuf, uint32_t sz, uint8_t *dmabuf); +void bmk1880v2_dmabuf_dump(uint8_t * dmabuf); +void bmk1880v2_arraybase_set( + uint8_t *dmabuf, uint32_t arraybase0L, uint32_t arraybase1L, + uint32_t arraybase0H, uint32_t arraybase1H); + +void bmk1880v2_parallel_enable(bmk1880v2_context_t *ctx); +void bmk1880v2_set_op(bmk1880v2_context_t *ctx, void* op); +void* bmk1880v2_get_op(bmk1880v2_context_t *ctx); +void bmk1880v2_parallel_disable(bmk1880v2_context_t *ctx); +void bmk1880v2_set_layer_id(bmk1880v2_context_t *ctx, uint16_t layer_id); +uint16_t bmk1880v2_layer_id(bmk1880v2_context_t *ctx); + +void bmk1880v2_create_streams(bmk1880v2_context_t *ctx, int nr_streams); +void bmk1880v2_destroy_streams(bmk1880v2_context_t *ctx); +void bmk1880v2_set_stream(bmk1880v2_context_t *ctx, int i); + +void bmk1880v2_add_dependency( + bmk1880v2_context_t *ctx, + bmk1880v2_op_t *before, + bmk1880v2_op_t *after); + +void bmk1880v2_cpu_op( + bmk1880v2_context_t *ctx, + const char* op_name, char *params, int size); + +/* + * Fundamental structures for tensor and matrix + */ + +typedef struct { + uint32_t n, c, w, col; +} bmk1880v2_matrix_lmem_shape_t; + +typedef struct { + uint32_t row, col; +} bmk1880v2_matrix_tgmem_shape_t; + +typedef struct { + uint32_t n, c, h; +} bmk1880v2_matrix_lmem_stride_t; + +typedef struct { + uint32_t row; +} bmk1880v2_matrix_tgmem_stride_t; + +typedef struct { + uint32_t n, c, h, w; +} bmk1880v2_tensor_lmem_shape_t; + +typedef struct { + uint32_t n, c, h, w; +} bmk1880v2_tensor_tgmem_shape_t; + +typedef struct { + uint32_t n, c, h, w; +} bmk1880v2_tensor_lmem_stride_t; + +typedef struct { + uint32_t n, c, h; +} bmk1880v2_tensor_tgmem_stride_t; + +typedef struct { + uint32_t start_address; + fmt_t fmt; + fmt_t cmprs_fmt; + bmk1880v2_tensor_lmem_shape_t shape; + bmk1880v2_tensor_lmem_stride_t stride; + uint8_t int8_rnd_mode; // (1, oc, kh*kw, ic) + * TDMA load global (1, oc, kh*w, ic) -> local (1, oc, kh*kw, ic) + * TIU conv opd1 (ic, oc, kh, kw) + */ +typedef struct { + const bmk1880v2_tensor_lmem_t *ofmap; + const bmk1880v2_tensor_lmem_t *ifmap; + const bmk1880v2_tensor_lmem_t *weight; + const bmk1880v2_tensor_lmem_t 
*bias; + uint8_t ins_h, ins_last_h; + uint8_t ins_w, ins_last_w; + uint8_t pad_top, pad_bottom; + uint8_t pad_left, pad_right; + uint8_t stride_h, stride_w; + uint8_t dilation_h, dilation_w; + int relu_enable; + uint8_t rshift_bits; + uint8_t ps32_mode; + uint8_t w_is_const; + uint16_t layer_id; + uint8_t fp_round_typ; + int8_t ins_val; // padding value for int8 + uint16_t ins_fp; // padding value for bf16 +} bmk1880v2_tiu_convolution_param_t; + +bmk1880v2_op_t * bmk1880v2_tiu_convolution( + bmk1880v2_context_t *ctx, + const bmk1880v2_tiu_convolution_param_t *p); + +typedef struct { + const bmk1880v2_tensor_lmem_t *ofmap; + const bmk1880v2_tensor_lmem_t *ifmap; + const bmk1880v2_tensor_lmem_t *weight; + const bmk1880v2_tensor_lmem_t *chl_quan_param; + uint8_t ins_h, ins_last_h; + uint8_t ins_w, ins_last_w; + uint8_t pad_top, pad_bottom; + uint8_t pad_left, pad_right; + uint8_t stride_h, stride_w; + uint8_t dilation_h, dilation_w; + uint8_t has_bias; + uint8_t relu_enable; + uint8_t ps32_mode; + uint8_t w_is_const; + uint16_t layer_id; + int8_t ins_val; // padding value for int8 + uint16_t ins_fp; // padding value for bf16 +} bmk1880v2_tiu_convolution_qdm_param_t; + +bmk1880v2_op_t * bmk1880v2_tiu_convolution_qdm( + bmk1880v2_context_t *ctx, + const bmk1880v2_tiu_convolution_qdm_param_t *p); + +typedef struct { + const bmk1880v2_tensor_lmem_t *ofmap; + const bmk1880v2_tensor_lmem_t *ifmap; + uint16_t kh, kw; + uint8_t pad_top, pad_bottom; + uint8_t pad_left, pad_right; + uint8_t stride_h, stride_w; + int8_t ins_val; // padding value for int8 + uint16_t ins_fp; // padding value for bf16 + uint16_t layer_id; +} bmk1880v2_tiu_max_pooling_param_t; + +bmk1880v2_op_t * bmk1880v2_tiu_max_pooling( + bmk1880v2_context_t *ctx, + const bmk1880v2_tiu_max_pooling_param_t *p); + +typedef struct { + const bmk1880v2_tensor_lmem_t *ofmap; + const bmk1880v2_tensor_lmem_t *ifmap; + uint16_t kh, kw; + uint8_t ins_h, ins_last_h; + uint8_t ins_w, ins_last_w; + uint8_t pad_top, pad_bottom; + uint8_t pad_left, pad_right; + uint8_t stride_h, stride_w; + uint16_t avg_pooling_const; + uint8_t rshift_bits; + uint16_t layer_id; + int8_t ins_val; // padding value for int8 + uint16_t ins_fp; // padding value for bf16 +} bmk1880v2_tiu_average_pooling_param_t; + +bmk1880v2_op_t * bmk1880v2_tiu_average_pooling( + bmk1880v2_context_t *ctx, + const bmk1880v2_tiu_average_pooling_param_t *p); + +typedef struct { + const bmk1880v2_tensor_lmem_t *ofmap; + const bmk1880v2_tensor_lmem_t *ifmap; + const bmk1880v2_tensor_lmem_t *weight; + const bmk1880v2_tensor_lmem_t *bias; + int weight_is_const; + struct { + int16_t val; + int is_signed; + } weight_const; + uint8_t ins_h, ins_last_h; + uint8_t ins_w, ins_last_w; + uint8_t dilation_h, dilation_w; + uint8_t pad_top, pad_bottom; + uint8_t pad_left, pad_right; + uint8_t stride_h, stride_w; + uint8_t rshift_bits; + int relu_enable; + uint16_t layer_id; + uint8_t ps32_mode; //output fp32 result if ps32_mode==2 + int8_t ins_val; // padding value for int8 + uint16_t ins_fp; // padding value for bf16 +} bmk1880v2_tiu_depthwise_convolution_param_t; + +bmk1880v2_op_t * bmk1880v2_tiu_depthwise_convolution( + bmk1880v2_context_t *ctx, + const bmk1880v2_tiu_depthwise_convolution_param_t *p); + +typedef struct { + const bmk1880v2_tensor_lmem_t *ofmap; + const bmk1880v2_tensor_lmem_t *ifmap; + const bmk1880v2_tensor_lmem_t *weight; + const bmk1880v2_tensor_lmem_t *chl_quan_param; + int weight_is_const; + struct { + int16_t val; + int is_signed; + } weight_const; + uint8_t ins_h, ins_last_h; + 
uint8_t ins_w, ins_last_w; + uint8_t dilation_h, dilation_w; + uint8_t pad_top, pad_bottom; + uint8_t pad_left, pad_right; + uint8_t stride_h, stride_w; + uint8_t has_bias; + uint8_t relu_enable; + uint16_t layer_id; + int8_t ins_val; // padding value for int8 + uint16_t ins_fp; // padding value for bf16 +} bmk1880v2_tiu_depthwise_convolution_qdm_param_t; + +bmk1880v2_op_t * bmk1880v2_tiu_depthwise_convolution_qdm( + bmk1880v2_context_t *ctx, + const bmk1880v2_tiu_depthwise_convolution_qdm_param_t *p); + +typedef struct { + const bmk1880v2_matrix_lmem_t *res; + const bmk1880v2_matrix_lmem_t *left; + const bmk1880v2_matrix_lmem_t *right; + const bmk1880v2_matrix_lmem_t *bias; + uint8_t lshift_bits; + uint8_t rshift_bits; + int res_is_int8; + int relu_enable; + int add_result; + uint8_t ps32_mode; + uint16_t layer_id; +} bmk1880v2_tiu_matrix_multiplication_param_t; + +bmk1880v2_op_t * bmk1880v2_tiu_matrix_multiplication( + bmk1880v2_context_t *ctx, + const bmk1880v2_tiu_matrix_multiplication_param_t *p); + +typedef struct { + const bmk1880v2_matrix_lmem_t *res; + const bmk1880v2_matrix_lmem_t *left; + const bmk1880v2_matrix_lmem_t *right; + const bmk1880v2_matrix_lmem_t *bias; + uint8_t lshift_bits; + uint8_t rshift_bits; + int res_is_int8; + int relu_enable; + int add_result; + uint8_t ps32_mode; + int32_t quan_m; + uint16_t layer_id; +} bmk1880v2_tiu_matrix_multiplication_qdm_param_t; + +bmk1880v2_op_t * bmk1880v2_tiu_matrix_multiplication_qdm( + bmk1880v2_context_t *ctx, + const bmk1880v2_tiu_matrix_multiplication_qdm_param_t *p); + +/* + * Helpers + */ + +bmk1880v2_tensor_lmem_stride_t bmk1880v2_tensor_lmem_default_stride( + bmk1880v2_context_t *ctx, + bmk1880v2_tensor_lmem_shape_t s, + fmt_t fmt_type, + int eu_align); + +bmk1880v2_tensor_tgmem_stride_t bmk1880v2_tensor_tgmem_default_stride( + bmk1880v2_tensor_tgmem_shape_t s, + fmt_t fmt_type); + +bmk1880v2_matrix_lmem_shape_t bmk1880v2_matrix_lmem_default_shape( + bmk1880v2_context_t *ctx, + uint32_t row, + uint32_t col, + fmt_t fmt_type); + +bmk1880v2_matrix_lmem_shape_t bmk1880v2_matrix_lmem_shape_t1( + bmk1880v2_context_t *ctx, + uint32_t len, + fmt_t fmt_type); + +bmk1880v2_matrix_lmem_stride_t bmk1880v2_matrix_lmem_default_stride( + bmk1880v2_context_t *ctx, + bmk1880v2_matrix_lmem_shape_t s, + fmt_t fmt, + int eu_align); + +bmk1880v2_tensor_lmem_t * bmk1880v2_lmem_alloc_tensor( + bmk1880v2_context_t *ctx, + bmk1880v2_tensor_lmem_shape_t s, + fmt_t fmt, + int eu_align); + +void bmk1880v2_lmem_init_tensor( + bmk1880v2_context_t *ctx, + bmk1880v2_tensor_lmem_t *tl, + bmk1880v2_tensor_lmem_shape_t shape, + fmt_t fmt, + int eu_align); + +bmk1880v2_tensor_lmem_t * bmk1880v2_lmem_alloc_ps32_tensor( + bmk1880v2_context_t *ctx, + bmk1880v2_tensor_lmem_shape_t s, + fmt_t fmt, + int eu_align); + +void bmk1880v2_lmem_free_tensor( + bmk1880v2_context_t *ctx, + const bmk1880v2_tensor_lmem_t *t); + +bmk1880v2_matrix_lmem_t * bmk1880v2_lmem_alloc_matrix( + bmk1880v2_context_t *ctx, + bmk1880v2_matrix_lmem_shape_t s, + fmt_t fmt, + int eu_align); + +void bmk1880v2_lmem_init_matrix( + bmk1880v2_context_t *ctx, + bmk1880v2_matrix_lmem_t *ml, + bmk1880v2_matrix_lmem_shape_t shape, + fmt_t fmt, + int eu_align); + +bmk1880v2_matrix_lmem_t * bmk1880v2_lmem_alloc_ps32_matrix( + bmk1880v2_context_t *ctx, + bmk1880v2_matrix_lmem_shape_t s, + fmt_t fmt, + int eu_align); + +void bmk1880v2_lmem_free_matrix( + bmk1880v2_context_t *ctx, + const bmk1880v2_matrix_lmem_t *t); + +uint32_t bmk1880v2_lmem_tensor_to_size( + bmk1880v2_context_t *ctx, + 
bmk1880v2_tensor_lmem_shape_t s, + fmt_t fmt, + int eu_align); + +uint32_t bmk1880v2_lmem_matrix_to_size( + bmk1880v2_context_t *ctx, + bmk1880v2_matrix_lmem_shape_t s, + fmt_t fmt, + int eu_align); + +uint32_t bmk1880v2_lmem_ps32_matrix_to_size( + bmk1880v2_context_t *ctx, + bmk1880v2_matrix_lmem_shape_t s, + fmt_t fmt, + int eu_align); + +#include "non_atomic.h" + +#ifdef __cplusplus +} +#endif + +#endif /* __BMKERNEL_1880V2_H__ */ diff --git a/cvikernel/include/bmkernel/bm1880v2/compression.h b/cvikernel/include/bmkernel/bm1880v2/compression.h new file mode 100644 index 000000000..905e62abe --- /dev/null +++ b/cvikernel/include/bmkernel/bm1880v2/compression.h @@ -0,0 +1,369 @@ +#ifndef COMPRESSION_H +#define COMPRESSION_H + +#include + +typedef struct { + uint32_t compress_md; + uint32_t bit_length; + int is_signed; + + uint64_t total_data_num; + uint32_t non_zero_data_num; + + uint64_t header_bytes; + uint64_t map_bytes; + uint64_t data_bytes; + uint64_t total_bytes; + + int compressed_min; + int compressed_max; +} compression_info_t; + +typedef struct { + uint64_t header_offset; + uint64_t header_size; + uint64_t map_offset; + uint64_t map_size; + uint64_t data_offset; + uint64_t data_size; + uint64_t total_size; +} compress_addr_info; + +static uint64_t compression_map_bytes(uint64_t total_data_num) +{ + uint64_t bit_alignment = 16 * 8; + uint64_t bits = total_data_num; + + return ceiling_func(bits, bit_alignment)*16; +} + +static uint64_t compression_map_clear_bytes(uint64_t total_data_num) +{ + uint64_t bit_alignment = 2 * 8; + uint64_t bits = total_data_num; + + return ceiling_func(bits, bit_alignment)*2; +} + + +static uint64_t compression_data_bytes(uint64_t non_zero_data_num, uint32_t bit_length) +{ + if (bit_length == 1) + return 0; + + uint64_t bit_alignment = 8; + uint64_t bits = non_zero_data_num * bit_length; + + return ceiling_func(bits, bit_alignment); +} + +static inline uint32_t compression_bit_length(uint32_t compress_md) +{ + switch (compress_md) { + case 0: + return 8; + case 1: + return 4; + case 2: + return 2; + case 3: + return 1; + default: + assert(0); + } +} + +static inline void compute_compressed_range( + uint32_t bit_length, int is_signed, int *min, int *max) +{ + if (is_signed) { + switch (bit_length) { + case 1: + *min = -1; + *max = 0; + return; + case 2: + *min = -2; + *max = 1; + return; + case 4: + *min = -8; + *max = 7; + return; + case 8: + *min = -128; + *max = 127; + return; + } + } else { + *min = 0; + switch (bit_length) { + case 1: + *max = 1; + return; + case 2: + *max = 3; + return; + case 4: + *max = 15; + return; + case 8: + *max = 255; + return; + } + } + assert(0); +} + +static inline int saturate(int val, int max, int min) +{ + if (val < min) + return min; + else if (val > max) + return max; + else + return val; +} + +static inline uint64_t count_non_zero_results( + uint8_t buf[], uint64_t size, int is_signed, int max, int min) +{ + uint64_t n = 0; + + for (uint64_t i = 0; i < size; i++) { + int val = is_signed? 
(int8_t)buf[i]: buf[i]; + int res = saturate(val, max, min); + if (res != 0) + n++; + } + + return n; +} + +static inline void set_map_bit(uint8_t map[], uint64_t i) +{ + uint64_t byte_i = i / 8; + uint64_t bit_i = i % 8; + + map[byte_i] |= (1 << bit_i); +} + +static inline uint8_t read_map_bit(uint8_t map[], uint64_t i) +{ + uint64_t byte_i = i / 8; + uint64_t bit_i = i % 8; + + return (map[byte_i] >> bit_i) & 1; +} + +static inline void parse_header( + uint32_t header, int *is_signed, uint32_t *compress_md, uint32_t *nz_num) +{ + *is_signed = (header >> 29) & 1; + *compress_md = (header >> 24) & 0b11; + *nz_num = header & 0xffffff; +} + +static inline void fill_header(uint32_t *hdr, compression_info_t *info) +{ + if(compression_bit_length(info->compress_md)!=1) + { + *hdr = (info->is_signed << 29) | (1 << 28) | + (info->compress_md << 24) | + info->non_zero_data_num; + }else + { + *hdr = (info->is_signed << 29) | (1 << 28) | + (info->compress_md << 24); + } +} + +static inline void fill_map(uint8_t map[], uint8_t buf[], compression_info_t *info) +{ + int min = info->compressed_min; + int max = info->compressed_max; + + uint64_t clear_map = compression_map_clear_bytes(info->total_data_num); + for (uint64_t i = 0; i < clear_map; i++) + map[i] = 0; + + for (uint64_t i = 0; i < info->total_data_num; i++) { + int val = info->is_signed? (int8_t)buf[i]: buf[i]; + int res = saturate(val, max, min); + if (res != 0) + set_map_bit(map, i); + } +} + +static inline void compress_one_data( + uint8_t data[], uint64_t i, uint8_t val, compression_info_t *info) +{ + uint32_t bit_len = info->bit_length; + uint32_t data_per_byte = 8 / bit_len; + + uint32_t byte_i = i / data_per_byte; + uint32_t bit_i = (i % data_per_byte) * bit_len; + uint8_t mask = (1 << bit_len) - 1; + + data[byte_i] |= (val & mask) << bit_i; +} + +static inline uint8_t sign_extend(uint8_t val, uint32_t bit_len) +{ + int shift = 8 - bit_len; + return (int8_t)(val << shift) >> shift; +} + +static inline uint8_t decompress_one_data( + uint8_t data[], uint64_t i, compression_info_t *info) +{ + uint32_t bit_len = info->bit_length; + uint32_t data_per_byte = 8 / bit_len; + + uint32_t byte_i = i / data_per_byte; + uint32_t bit_i = (i % data_per_byte) * bit_len; + uint8_t mask = (1 << bit_len) - 1; + + uint8_t val = (data[byte_i] >> bit_i) & mask; + if (info->is_signed) + val = sign_extend(val, bit_len); + + return val; +} + +static inline void fill_data(uint8_t data[], uint8_t buf[], compression_info_t *info) +{ + int min = info->compressed_min; + int max = info->compressed_max; + + for (uint64_t i = 0; i < info->data_bytes; i++) + data[i] = 0; + + uint64_t nz_i = 0; + for (uint64_t i = 0; i < info->total_data_num; i++) { + int val = info->is_signed? 
(int8_t)buf[i]: buf[i]; + int res = saturate(val, max, min); + if (res != 0) { + compress_one_data(data, nz_i, res, info); + nz_i++; + } + } +} + +static inline compression_info_t make_compression_info( + uint8_t buf[], uint64_t size, uint32_t compress_md, int is_signed) +{ + uint32_t bit_length = compression_bit_length(compress_md); + + int min, max; + compute_compressed_range(bit_length, is_signed, &min, &max); + + uint32_t nz_num = count_non_zero_results(buf, size, is_signed, max, min); + assert(nz_num <= 0xffffff); + + compression_info_t info; + info.compress_md = compress_md; + info.bit_length = bit_length; + info.is_signed = is_signed; + info.total_data_num = size; + info.non_zero_data_num = nz_num; + info.header_bytes = 16; + info.map_bytes = compression_map_bytes(size); + info.data_bytes = compression_data_bytes(nz_num, bit_length); + info.total_bytes = info.header_bytes + info.map_bytes + info.data_bytes; + info.compressed_min = min; + info.compressed_max = max; + return info; +} + +static inline compression_info_t parse_compression_info( + uint8_t compressed_buf[], uint64_t max_size, uint64_t total_data_num) +{ + uint64_t header_bytes = 16; + assert(header_bytes <= max_size); + + int is_signed; + uint32_t compress_md, nz_num; + parse_header(*(uint32_t *)compressed_buf, &is_signed, &compress_md, &nz_num); + + uint32_t bit_length = compression_bit_length(compress_md); + int min, max; + compute_compressed_range(bit_length, is_signed, &min, &max); + + compression_info_t info; + info.compress_md = compress_md; + info.bit_length = compression_bit_length(compress_md); + info.is_signed = is_signed; + info.total_data_num = total_data_num; + info.non_zero_data_num = nz_num; + info.header_bytes = header_bytes; + info.map_bytes = compression_map_bytes(total_data_num); + info.data_bytes = compression_data_bytes(nz_num, info.bit_length); + info.total_bytes = header_bytes + info.map_bytes + info.data_bytes; + info.compressed_min = min; + info.compressed_max = max; + + assert(info.total_bytes <= max_size); + + return info; +} + +static inline uint8_t * compress( + uint8_t buf[], uint64_t size, uint32_t compress_md, int is_signed, compress_addr_info *compressed_data) +{ + compression_info_t info = + make_compression_info(buf, size, compress_md, is_signed); + + assert(info.total_bytes < 0x100000); + static uint8_t *result = new uint8_t[0x100000]; + uint32_t *hdr = (uint32_t *)result; + uint8_t *map = &result[info.header_bytes]; + uint8_t *data = &map[info.map_bytes]; + + fill_header(hdr, &info); + fill_map(map, buf, &info); + if (info.bit_length != 1) + fill_data(data, buf, &info); + + compressed_data->header_offset = 0; + compressed_data->header_size = 4; + compressed_data->map_offset = info.header_bytes; + compressed_data->map_size = compression_map_clear_bytes(info.total_data_num); + compressed_data->data_offset = info.map_bytes + info.header_bytes; + compressed_data->data_size = info.data_bytes; + compressed_data->total_size = info.total_bytes; + + return result; +} + +static inline void decompress( + uint8_t buf[], uint64_t size, uint8_t compressed_buf[], uint64_t max_size) +{ + compression_info_t info = + parse_compression_info(compressed_buf, max_size, size); + assert(info.total_bytes <= max_size); + assert(info.total_data_num == size); + + uint8_t *map = &compressed_buf[info.header_bytes]; + if (info.bit_length == 1) { + for (uint64_t i = 0; i < size; i++) { + uint8_t val = read_map_bit(map, i); + buf[i] = info.is_signed? 
sign_extend(val, 1): val; + } + } else { + uint8_t *data = &map[info.map_bytes]; + uint64_t data_i = 0; + for (uint64_t i = 0; i < size; i++) { + uint8_t val = read_map_bit(map, i); + if (val == 0) { + buf[i] = 0; + } else { + buf[i] = decompress_one_data(data, data_i, &info); + data_i++; + } + } + } +} + +#endif /* COMPRESSION_H */ diff --git a/cvikernel/include/bmkernel/bm1880v2/non_atomic.h b/cvikernel/include/bmkernel/bm1880v2/non_atomic.h new file mode 100644 index 000000000..77f4738f7 --- /dev/null +++ b/cvikernel/include/bmkernel/bm1880v2/non_atomic.h @@ -0,0 +1,300 @@ +#ifndef __BMKERNEL_1880v2_NON_ATOMIC_H__ +#define __BMKERNEL_1880v2_NON_ATOMIC_H__ + +#include "bmkernel_1880v2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// non atomic +void bf16_table_shape(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_shape_t *s); + +int bf16_emit_sqrt(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tbl_answer, + bmk1880v2_tensor_lmem_t *tbl_answer_mantissa, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16); +void bf16_gen_sqrt(uint16_t *table_data, bmk1880v2_tensor_lmem_shape_t *table_shape); +void bf16_gen_sqrt_mantissa(uint16_t *table_mantissa, bmk1880v2_tensor_lmem_shape_t *table_shape); +void bf16_sqrt_tbl(uint16_t *sqrt_table_data, uint16_t *sqrt_table_data_mantissa, + bmk1880v2_tensor_lmem_shape_t *table_shape); + +int bf16_emit_reciprocal(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tbl_answer, + bmk1880v2_tensor_lmem_t *tbl_answer_mantissa, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16); +void bf16_gen_reciprocal(uint16_t *table_data, bmk1880v2_tensor_lmem_shape_t *table_shape); +void bf16_gen_reciprocal_mantissa(uint16_t *table_mantissa, bmk1880v2_tensor_lmem_shape_t *table_shape); +void bf16_reciprocal_tbl(uint16_t *table_data, uint16_t *table_mantissa, + bmk1880v2_tensor_lmem_shape_t *table_shape); + +void bf16_atan_y0(uint16_t *table_data_y0, bmk1880v2_tensor_lmem_shape_t *table_shape); +void bf16_atan_fast_degree_y0(uint16_t *table_data_y0, bmk1880v2_tensor_lmem_shape_t *table_shape); +void bf16_atan_slope(uint16_t *table_slope, bmk1880v2_tensor_lmem_shape_t *table_shape); +void bf16_atan_s_01(uint16_t *table_invert, bmk1880v2_tensor_lmem_shape_t *table_shape); +void bf16_atan_pos_neg(uint16_t *table_pos_neg, bmk1880v2_tensor_lmem_shape_t *table_shape); +int bf16_atan_slope_multipilier(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_buf, + bmk1880v2_tensor_lmem_t *tl_buf2, bmk1880v2_tensor_lmem_t *tl_buf3, + bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt); + +int bf16_atan_emit(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tl_buf2, + bmk1880v2_tensor_lmem_t *tl_buf3, bmk1880v2_tensor_lmem_t *tl_y0_buf, + bmk1880v2_tensor_lmem_t *tl_slope_buf, bmk1880v2_tensor_lmem_t *tl_invert_buf, + bmk1880v2_tensor_lmem_t *tl_pos_neg_buf, + bmk1880v2_tensor_lmem_t *tl_table_answer, + bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt); + +int bf16_atan_fast_emit(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tl_buf2, + bmk1880v2_tensor_lmem_t *tl_y0_buf, + bmk1880v2_tensor_lmem_t *tl_invert_buf, + bmk1880v2_tensor_lmem_t *tl_pos_neg_buf, + bmk1880v2_tensor_lmem_t *tl_table_answer, + bmk1880v2_tensor_lmem_t 
*tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt, + uint8_t is_dirty_ifmap); + +void bf16_atan2_fast_emit(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *y, + bmk1880v2_tensor_lmem_t *x, bmk1880v2_tensor_lmem_t *tl_buf, + bmk1880v2_tensor_lmem_t *tl_buf2, bmk1880v2_tensor_lmem_t *tl_buf3, + bmk1880v2_tensor_lmem_t *tl_buf4, bmk1880v2_tensor_lmem_t *tl_y0_buf, + bmk1880v2_tensor_lmem_t *tl_slope_buf, bmk1880v2_tensor_lmem_t *tl_invert_buf, + bmk1880v2_tensor_lmem_t *tl_pos_neg_buf, + bmk1880v2_tensor_lmem_t *tl_table_answer, + bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t *tl_0_idx_table, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt); + +void bf16_atan2_fast_degree_emit(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *y, + bmk1880v2_tensor_lmem_t *x, bmk1880v2_tensor_lmem_t *tl_buf, + bmk1880v2_tensor_lmem_t *tl_buf2, bmk1880v2_tensor_lmem_t *tl_buf3, + bmk1880v2_tensor_lmem_t *tl_y0_buf, + bmk1880v2_tensor_lmem_t *tl_invert_buf, + bmk1880v2_tensor_lmem_t *tl_pos_neg_buf, + bmk1880v2_tensor_lmem_t *tl_table_answer, + bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt); + +void bf16_atan2_merge_emit(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *y, + bmk1880v2_tensor_lmem_t *x, bmk1880v2_tensor_lmem_t *tl_buf, + bmk1880v2_tensor_lmem_t *tl_buf2, bmk1880v2_tensor_lmem_t *tl_buf3, + bmk1880v2_tensor_lmem_t *tl_y0_buf, + bmk1880v2_tensor_lmem_t *tl_invert_buf, + bmk1880v2_tensor_lmem_t *tl_pos_neg_buf, + bmk1880v2_tensor_lmem_t *tl_table_answer, + bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt); + + +uint64_t bf16_lut_tbl_bytesize(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_shape_t *table_shape, + fmt_t fmt); + +void bf16_atan_tbl(uint16_t *table_data_atan_y0, uint16_t *table_data_atan_slope, uint16_t *table_data_atan_invert, + uint16_t *table_data_atan_pos_neg, bmk1880v2_tensor_lmem_shape_t *table_shape); + +void bf16_atan_fast_degree_tbl(uint16_t *table_data_atan_y0, uint16_t *table_data_atan_invert, + uint16_t *table_data_atan_pos_neg, bmk1880v2_tensor_lmem_shape_t *table_shape); + +void bf16_gen_0_tbl(uint16_t *table_0, bmk1880v2_tensor_lmem_shape_t *table_shape); + +int bf16_emit_0_idx(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tbl_answer, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt); + +int bf16_emit_neg_idx(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tl_pos_neg_buf, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt); + +int bf16_emit_pos_idx(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tl_pos_neg_buf, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt); + +int bf16_emit_0_1_revert_input(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt); + +void bf16_atan2_emit(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *y, + bmk1880v2_tensor_lmem_t *x, bmk1880v2_tensor_lmem_t *tl_buf, + bmk1880v2_tensor_lmem_t *tl_buf2, bmk1880v2_tensor_lmem_t *tl_buf3, + bmk1880v2_tensor_lmem_t *tl_buf4, bmk1880v2_tensor_lmem_t *tl_buf5, + bmk1880v2_tensor_lmem_t *tl_buf6, bmk1880v2_tensor_lmem_t *tl_y0_buf, + bmk1880v2_tensor_lmem_t *tl_slope_buf, bmk1880v2_tensor_lmem_t *tl_invert_buf, + 
bmk1880v2_tensor_lmem_t *tl_pos_neg_buf, + bmk1880v2_tensor_lmem_t *tl_table_answer, + bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t *tl_sqrt_table_answer, + bmk1880v2_tensor_lmem_t *tl_sqrt_table_answer_mantissa, + bmk1880v2_tensor_lmem_t *tl_0_idx_table, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt); + +// nn function +int bf16_emit_pythagoras(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *y, + bmk1880v2_tensor_lmem_t *x, bmk1880v2_tensor_lmem_t *tl_buf, + bmk1880v2_tensor_lmem_t *tl_buf2, + bmk1880v2_tensor_lmem_t *tl_sqrt_table_answer, + bmk1880v2_tensor_lmem_t *tl_sqrt_table_answer_mantissa, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt); + +int bf16_emit_max_const(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt, float b); + +int bf16_emit_min_const(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt, float b); + +int bf16_emit_0_1_revert(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tbl_answer, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt); + +int bf16_emit_mul(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_ifmap2, bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, + fmt_t fmt); + +int bf16_emit_add(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_ifmap2, bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, + fmt_t fmt); + +int bf16_emit_add_const(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt, float b); + +int bf16_emit_mul_const(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt, float b); + +// mask please refer \BF16_MASK_TYPE for supported case +int bf16_emit_mask_gt0(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tl_buf2, + bmk1880v2_tensor_lmem_t *tl_buf3, bmk1880v2_tensor_lmem_t *tl_pos_neg_buf, + bmk1880v2_tensor_lmem_t *tl_0_idx_buf, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt); + +int bf16_emit_mask_ge0(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tl_pos_neg_table, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt); + +int bf16_emit_mask_le0(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tl_pos_neg_table, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt); + +int bf16_emit_mask_eq0(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tl_0_idx_table, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt); +enum BF16_MASK_TYPE { + BF16_MASK_TYPE_GT_0 = 0, // remain > 0 + BF16_MASK_TYPE_GE_0, // remain >= 0 + BF16_MASK_TYPE_EQ_0, // remain = 0 + BF16_MASK_TYPE_LT_0, // remain < 0 + BF16_MASK_TYPE_LE_0, // remain <= 0 + BF16_MASK_MAX +}; + +int bf16_emit_mask(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tl_buf2, + bmk1880v2_tensor_lmem_t *tl_buf3, bmk1880v2_tensor_lmem_t *tl_pos_neg_table, + bmk1880v2_tensor_lmem_t *tl_0_idx_table, bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, + fmt_t fmt, enum BF16_MASK_TYPE mask); + +int bf16_emit_mask_lt0(bmk1880v2_context_t *ctx, 
bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tl_pos_neg_table, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt); + +int _bf16_atan_emit(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, bmk1880v2_tensor_lmem_t *tl_buf2, + bmk1880v2_tensor_lmem_t *tl_buf3, bmk1880v2_tensor_lmem_t *tl_y0_buf, + bmk1880v2_tensor_lmem_t *tl_slope_buf, bmk1880v2_tensor_lmem_t *tl_invert_buf, + bmk1880v2_tensor_lmem_t *tl_pos_neg_buf, + bmk1880v2_tensor_lmem_t *tl_table_answer, + bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16, fmt_t fmt, float b); + +uint32_t *bm1880v2_reshape_channel_bias(uint8_t *bias, int ni, int ci, int hi, int wi, int old_bias_c, + fmt_t fmt); + +int bm1880v2_reshape_channel_same(bmk1880v2_context_t *bk_ctx, int ic, int ih, int iw, int kh, + int kw, int pad_right, int pad_left, int stride_h, int stride_w, + bmk1880v2_tensor_lmem_shape_t *tl_load_shape, + bmk1880v2_tensor_lmem_stride_t *new_tl_ifmap_stride, + bmk1880v2_tensor_tgmem_shape_t *new_tg_ifmap_shape, + bmk1880v2_tensor_tgmem_stride_t *new_tg_ifmap_stride, + bmk1880v2_tensor_lmem_shape_t *new_tl_weight_shape, + bmk1880v2_tensor_lmem_shape_t *new_tl_bias_shape, + bmk1880v2_tensor_lmem_shape_t *new_tl_ofmap_shape, fmt_t fmt, + int eu_align); + +uint8_t *bm1880v2_reshape_channel_weight(uint8_t *weight, int ni, int ci, int hi, int wi, int old_weight_c, + fmt_t fmt); + + +int bm1880v2_reshape_channel_same_pad( + bmk1880v2_context_t *bk_ctx, + int ic, int ih, int iw, int kh, int kw, + int pad_right, int pad_left, int stride_h, int stride_w, + bmk1880v2_tensor_lmem_shape_t* tl_load_shape, + bmk1880v2_tensor_lmem_stride_t* new_tl_ifmap_stride, + bmk1880v2_tensor_tgmem_shape_t* new_tg_ifmap_shape, + bmk1880v2_tensor_tgmem_stride_t* new_tg_ifmap_stride, + bmk1880v2_tensor_lmem_shape_t* new_tl_weight_shape, + bmk1880v2_tensor_lmem_shape_t* new_tl_bias_shape, + bmk1880v2_tensor_lmem_shape_t* new_tl_ofmap_shape, + fmt_t fmt, int eu_align); + +int bf16_emit_sigmoid(bmk1880v2_context_t *ctx, + bmk1880v2_tensor_lmem_t* tl_ifmap, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t *tl_table_answer, + bmk1880v2_tensor_lmem_t *tl_table_answer_slope, + bmk1880v2_tensor_lmem_t* tl_ofmap_bf16, + float scale); + +void bf16_sigmoid_tbl(uint16_t *sigmoid_table_data, uint16_t* sigmoid_table_data_slope, + bmk1880v2_tensor_lmem_shape_t* table_shape, + int range_start, int range_end); + +float bf16_sigmoid_scale(int range_start, int range_end); + +void bf16_emit_mask_ge0_lt0( + bmk1880v2_context_t *ctx, + bmk1880v2_tensor_lmem_t* y, + bmk1880v2_tensor_lmem_t* index_i8, + bmk1880v2_tensor_lmem_t* tl_buf3, + fmt_t fmt + ); + +void bf16_emit_mask_eq_0( + bmk1880v2_context_t *ctx, + bmk1880v2_tensor_lmem_t* y, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t* index_i8, + bmk1880v2_tensor_lmem_t* tl_buf3, + fmt_t fmt + ); + +int bf16_lut_exp_mantissa(bmk1880v2_context_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, + bmk1880v2_tensor_lmem_t *tbl_answer, + bmk1880v2_tensor_lmem_t *tbl_answer_mantissa, + bmk1880v2_tensor_lmem_t *tl_ofmap_bf16); + +int bf16_s2s_fp32_bf16(bmk1880v2_context_t *ctx, uint64_t gaddr_fp32, + bmk1880v2_tensor_tgmem_shape_t fp32_shape, uint64_t gaddr_bf16, + bmk1880v2_tensor_tgmem_shape_t bf16_shape, fmt_t fmt); + +/** + * \gaddr_nc_image for temp gaddr, it could be the same as \gaddr_image + * \re_order_gaddr_svm means we re-ordered weight by 
\unit_size and an oc/ic transpose + * \svm_shape is an alias for the conv weight shape, recording the actual shape, e.g. (oc, ic, kh, kw), + * the possible shape is + * \unit_size is the vector size; it should be 36 for HOG + */ +int bf16_hists_svm(bmk1880v2_context_t *ctx, uint64_t gaddr_image, uint64_t gaddr_nc_image, + bmk1880v2_tensor_tgmem_shape_t image_shape, uint64_t re_order_gaddr_svm, + bmk1880v2_tensor_tgmem_shape_t svm_shape, // (oc, ic, kh, kw) + uint64_t gaddr_output, int unit_size, fmt_t fmt); + +#ifdef __cplusplus +} +#endif + +#endif /* __BMKERNEL_1880v2_NON_ATOMIC_H__ */ + diff --git a/cvikernel/include/bmkernel/bm_kernel.h b/cvikernel/include/bmkernel/bm_kernel.h new file mode 100644 index 000000000..93ffe4de6 --- /dev/null +++ b/cvikernel/include/bmkernel/bm_kernel.h @@ -0,0 +1,115 @@ +#ifndef __BM_KERNEL_H__ +#define __BM_KERNEL_H__ + +#include +#include +#include +#include +#include +#include + +#include + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; + +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; + +#ifdef __cplusplus +extern "C" { +#endif + +typedef int bmerr_t; +#define BM_SUCCESS 0 // The operation was successful +#define BM_ERR_AGAIN 1 // Not ready yet +#define BM_ERR_FAILURE 2 // General failure +#define BM_ERR_TIMEOUT 3 // Timeout +#define BM_ERR_UNINITIALIZED 4 // Uninitialized +#define BM_ERR_INVALID_ARGUMENT 5 // Invalid argument +#define BM_ERR_NOMEM 6 // Not enough memory +#define BM_ERR_DATA 7 // Data error +#define BM_ERR_BUSY 8 // Busy +#define BM_ERR_NOT_SUPPORTED 9 // Not supported yet + +#define CVI_TPU_TIU 0 // Tensor Instruction Unit +#define CVI_TPU_CPU 1 // CPU, Reserved for common cpu op +#define CVI_TPU_TDMA 2 // TPU DMA +#define CVI_TPU_ENGINE_NUM 3 // Number of Engines + +typedef cvk_fmt_t fmt_t; +#define FMT_F32 CVK_FMT_F32 +#define FMT_F16 CVK_FMT_F16 +#define FMT_I32 CVK_FMT_I32 +#define FMT_I16 CVK_FMT_I16 +#define FMT_I8 CVK_FMT_I8 +#define FMT_I4 CVK_FMT_I4 +#define FMT_I2 CVK_FMT_I2 +#define FMT_I1 CVK_FMT_I1 +#define FMT_U32 CVK_FMT_U32 +#define FMT_U16 CVK_FMT_U16 +#define FMT_U8 CVK_FMT_U8 +#define FMT_BF16 CVK_FMT_BF16 +#define FMT_INVALID CVK_FMT_INVALID + +typedef enum _Cmdbuf_Head_Magic { + CMDBUF_HDR_MAGIC_1880v2 = 0xA5, + CMDBUF_HDR_MAGIC_1822 = 0xA6, + CMDBUF_HDR_MAGIC_181X = 0xA7, + CMDBUF_HDR_MAGIC_180X = 0xA8, +} Cmdbuf_Head_Magic; + +#define BM_CMB_HDR_FLAG_NEURON (0x1) +#define BM_CMB_HDR_FLAG_WEIGHT (0x2) + +typedef struct __cmd_hdr_s { + uint8_t magic; // 0xA5 + uint8_t len; // length in bytes + uint8_t engine_id: 4; // TPU, GDMA, CDMA + uint8_t __deprecated: 4; + uint8_t flags; // CMD_ID, sync flags, etc.
TBD + uint32_t mask; // bit mask for which register need to write + uint8_t cmd[0]; +} __attribute__((packed)) cmd_hdr_t; + +typedef struct { + uint32_t chip_version; + uint32_t cmdbuf_size; + uint8_t *cmdbuf; +} bmk_info_t; + +cvk_chip_info_t bmk1880v2_chip_info(void); +cvk_chip_info_t bmk1822_chip_info(void); + +static inline int ceiling_func(int numerator, int denominator) +{ + return (numerator + denominator - 1) / denominator; +} + +static inline int ceiling_func_shift(int numerator, int shift) +{ + return (numerator + (1 << shift) - 1) >> shift; +} + +static inline uint64_t align_up(uint64_t x, uint64_t n) +{ + return (x + n - 1) / n * n; +} + +// len max number is 255, sometimes cmd larger than 255 +static inline uint32_t cmd_hdr_len(cmd_hdr_t * hdr) { + if (hdr->len == 0) { + return hdr->mask; + } + return hdr->len; +} + +#ifdef __cplusplus +} +#endif + +#endif /* __BM_KERNEL_H__ */ diff --git a/cvikernel/include/bmkernel/bm_kernel_legacy.h b/cvikernel/include/bmkernel/bm_kernel_legacy.h new file mode 100644 index 000000000..9c2cb339e --- /dev/null +++ b/cvikernel/include/bmkernel/bm_kernel_legacy.h @@ -0,0 +1,223 @@ +#ifndef __BM_KERNEL_LEGACY_H__ +#define __BM_KERNEL_LEGACY_H__ + +#include + +typedef uint32_t laddr_t; +typedef uint64_t gaddr_t; + +#ifdef __cplusplus +extern "C" { +#endif + +#define LADDR_INVALID (0xFFFFFFFF) +#define GADDR_INVALID (0x000000FFFFFFFFFFULL) + +#define FMT_U8_to_F32 0xFF + +#define ENGINE_BD 0 // Broadcast Engine +#define ENGINE_CPU 1 // CPU, Reserved +#define ENGINE_GDMA 2 // GDMA Engine +#define ENGINE_CDMA 3 // CDMA Engine +#define ENGINE_END 4 // Invalid + +typedef struct __dma_hdr_t { + uint16_t dmabuf_magic_m; + uint16_t dmabuf_magic_s; + uint32_t dmabuf_size; + uint32_t cpu_desc_count; + uint32_t bd_desc_count; //16bytes + uint32_t tdma_desc_count; + uint32_t tpu_clk_rate; + uint32_t pmubuf_size; + uint32_t pmubuf_offset; //32bytes + uint32_t arraybase_0_L; + uint32_t arraybase_0_H; + uint32_t arraybase_1_L; + uint32_t arraybase_1_H; //48bytes + uint32_t arraybase_2_L; + uint32_t arraybase_2_H; + uint32_t arraybase_3_L; + uint32_t arraybase_3_H; //64bytes + + uint32_t arraybase_4_L; + uint32_t arraybase_4_H; + uint32_t arraybase_5_L; + uint32_t arraybase_5_H; + uint32_t arraybase_6_L; + uint32_t arraybase_6_H; + uint32_t arraybase_7_L; + uint32_t arraybase_7_H; + uint32_t reserve[8]; //128bytes, 128bytes align +} dma_hdr_t; + +typedef struct { + uint32_t version; + uint32_t npu_num; + uint32_t eu_num; + uint32_t lmem_size; + uint32_t lmem_banks; + uint32_t lmem_bank_size; +} bmk_chip_info_t; + +#define FLOAT_SIZE 4 +#define INT8_SIZE 1 +#define BF16_SIZE 2 + +#define UNUSED(x) (void)(x) + +#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask)) +#define ALIGN(x,a) __ALIGN_MASK(x,(__typeof__(x))(a)-1) +#define ALIGN_DOWN(x, a) ((x) / (a) * (a)) + +#define math_min(x, y) ((x) < (y) ? (x) : (y)) +#define math_max(x, y) ((x) > (y) ? 
(x) : (y)) + +static inline int get_num_shift(uint64_t num) +{ + int n = 0; + while (!(num & 1)) { + n++; + num >>= 1; + } + return n; +} + +typedef struct { + uint32_t dim; + uint32_t n; + uint32_t c; + union { + uint32_t h; + uint32_t row; + }; + union { + uint32_t w; + uint32_t col; + }; +} shape_t; + +shape_t shape_t4(int n, int c, int h, int w); +shape_t shape_t3(int d3, int d2, int d1); +shape_t shape_t2(int row, int col); +shape_t shape_t1(int len); + +uint8_t shape_equal(shape_t s1, shape_t s2); + +typedef struct { + uint32_t n; + uint32_t c; + union { + uint32_t h; + uint32_t row; + }; + union { + uint32_t w; + uint32_t col; + }; +} stride_t; + +static inline stride_t stride_st4(int n, int c, int h, int w) +{ + stride_t st; + st.n = n; + st.c = c; + st.h = h; + st.w = w; + return st; +} + +typedef uint32_t ctrl_t; +#define CTRL_NULL 0 +#define CTRL_AL (1 << 0) // alloc aligned with EU_NUM +#define CTRL_RA (1 << 2) // result add +#define CTRL_BN (1 << 3) // B_N_is_1 broadcast to A +#define CTRL_TP (1 << 5) // transpose +#define CTRL_ADDR_ALIGN (1 << 7) +#define CTRL_RELU (1 << 8) +#define CTRL_KFLIP (1 << 9) // kernel flip +#define CTRL_WEIGHT (1 << 10) // mark weight address in GDMA +#define CTRL_NEURON (1 << 11) // mark neuron address in GDMA +#define CTRL_WINOGRAD (1 << 12) // GDMA reshap winograd kernel +#define CTRL_WINOGRAD_SCALE_FACTOR (1 << 28) // GDMA reshap winograd kernel + +typedef uint32_t tl_type; +#define TL_TYPE_TENSOR 0 +#define TL_TYPE_TENSOR_PREALLOC 1 +#define TL_TYPE_CONSTANT 2 +#define TL_TYPE_SLICE 3 +#define TL_TYPE_CLONE 4 + +typedef union { + uint32_t reg_val; + float fp32_val; +} const_fp32_t; + +typedef union { + laddr_t laddr; + const_fp32_t const_fp32; +} opd_t; + +typedef struct { + tl_type type; + opd_t operand; + shape_t shape; + stride_t *stride; + uint8_t aligned; + fmt_t fmt; + uint32_t bank_id; + int reserved_size; +} tensor_lmem; + +static inline laddr_t tl_address(tensor_lmem *tlp) +{ + if (tlp->type == TL_TYPE_CONSTANT) { + return LADDR_INVALID; + } + return tlp->operand.laddr; +} + +void tl_reshape(tensor_lmem * tlp, shape_t shape); + +typedef struct { + uint64_t addr; + shape_t shape; + stride_t stride; +} tensor_gmem; + +static inline int tg_is_matrix(tensor_gmem *t) +{ + return t->shape.dim == 2; +} + +static inline int tg_matrix_row(tensor_gmem *t) +{ + return t->shape.row; +} + +static inline int tg_matrix_col(tensor_gmem *t) +{ + return t->shape.col; +} + +static inline int tl_is_const(tensor_lmem *tlp) +{ + return tlp->type == TL_TYPE_CONSTANT; +} + +static inline int tl_is_prealloc(tensor_lmem *tlp) +{ + return tlp->type == TL_TYPE_TENSOR_PREALLOC; +} + +static inline int tl_is_matrix(tensor_lmem *tlp) +{ + int dim = tlp->shape.dim; + return dim == 1 || dim == 2; +} + +#ifdef __cplusplus +} +#endif + +#endif /* __BM_KERNEL_LEGACY_H__ */ diff --git a/cvikernel/include/bmkernel/bm_regcpu.h b/cvikernel/include/bmkernel/bm_regcpu.h new file mode 100644 index 000000000..f6293139b --- /dev/null +++ b/cvikernel/include/bmkernel/bm_regcpu.h @@ -0,0 +1,72 @@ +/* + * Copyright Bitmain Technologies Inc. 
+ * + * Written by: + * Wanwei CAI + * Created Time: 2017-06-29 15:33 + */ + +#ifndef _BM_REG_CPU_H +#define _BM_REG_CPU_H + +#include + +#define CPU_ENGINE_DESCRIPTOR_NUM 56 +#define CPU_ENGINE_DESCRIPTOR_DMA_NUM CPU_ENGINE_DESCRIPTOR_NUM +#define CPU_ENGINE_BYTES (CPU_ENGINE_DESCRIPTOR_NUM*sizeof(uint32_t)) +#define CPU_ENGINE_STR_LIMIT_BYTE (CPU_ENGINE_DESCRIPTOR_NUM - 7) * sizeof(uint32_t) + +#define CPU_CMD_ACCPI0 0 +#define CPU_CMD_ACCPI1 1 +#define CPU_CMD_ACCPI2 2 +#define CPU_CMD_ACCPI3 3 +#define CPU_CMD_ACCPI4 4 +/* CPU_CMD_ACCPI5 ~ CPU_CMD_ACCPI63 +defined here if needed */ + +#define CPU_ACCPI0_OP_BIT 0 +#define CPU_ACCPI1_BD_CMDID_BIT 0 +#define CPU_ACCPI1_CPU_CMDID_BIT 16 +#define CPU_ACCPI2_GDMA_CMDID_BIT 0 +#define CPU_ACCPI2_CDMA_CMDID_BIT 16 +#define CPU_ACCPI3_NEXT_BD_ADDR_BIT 0 +#define CPU_ACCPI4_NEXT_GDMA_ADDR_BIT 0 +#define CPU_ACCPI5_NEXT_CDMA_ADDR_BIT 0 + +typedef enum { + CPU_OP_SYNC = 2, + CPU_OP_INST = 3, + CPU_OP_END +} CPU_OP; + +// CPU common structure +typedef struct { + uint32_t regs[CPU_ENGINE_DESCRIPTOR_NUM]; +} bmk_cpu_desc_t; + +// CPU_OP_SYNC structure +typedef struct { + uint32_t op_type; // CPU_CMD_ACCPI0 + uint32_t num_bd; // CPU_CMD_ACCPI1 + uint32_t num_gdma; // CPU_CMD_ACCPI2 + uint32_t offset_bd; // CPU_CMD_ACCPI3 + uint32_t offset_gdma; // CPU_CMD_ACCPI4 + uint32_t reserved[2]; // CPU_CMD_ACCPI5-CPU_CMD_ACCPI6 + char str[CPU_ENGINE_STR_LIMIT_BYTE]; +} __attribute__((packed)) bmk_cpu_sync_desc_t; + +// CPU_OP_INST structure +#define CPU_INST_HEADER_COUNT 12 +typedef struct { + uint32_t op_type; // CPU_CMD_ACCPI0 + uint32_t num_bd; // CPU_CMD_ACCPI1 + uint32_t num_gdma; // CPU_CMD_ACCPI2 + uint32_t offset_bd; // CPU_CMD_ACCPI3 + uint32_t offset_gdma; // CPU_CMD_ACCPI4 + uint32_t reserved[2]; // CPU_CMD_ACCPI5-CPU_CMD_ACCPI6 + char lib_name[4*sizeof(uint32_t)]; // CPU_CMD_ACCPI7~CPU_CMD_ACCPI10 + uint32_t param_size; //CPU_CMD_ACCPI11 + uint8_t param[0]; +} __attribute__((packed)) bmk_cpu_inst_desc_t; + +#endif diff --git a/cvikernel/include/bmkernel/reg_bdcast.h b/cvikernel/include/bmkernel/reg_bdcast.h new file mode 100644 index 000000000..0b9ed8c59 --- /dev/null +++ b/cvikernel/include/bmkernel/reg_bdcast.h @@ -0,0 +1,37 @@ +#ifndef REG_BDCAST_H +#define REG_BDCAST_H + +#define BD_ENGINE_DESCRIPTOR_NUM 28 +#define BD_REG_BYTES (BD_ENGINE_DESCRIPTOR_NUM * 4) +#define BDC_ENGINE_CMD_ALIGNED_BIT 8 + +#define BD_CMD_BASE_ADDR (TIU_ENGINE_BASE_ADDR + 0) +#define BD_CTRL_BASE_ADDR (TIU_ENGINE_BASE_ADDR + 0x100) +#define BD_ENGINE_MAIN_CTRL (TIU_ENGINE_BASE_ADDR + 0) +#define BD_ENGINE_DESC_ADDR (TIU_ENGINE_BASE_ADDR + 0x4) + +// +// BD operations for BIRD +// +#define DCR_TYPE_CONV_FIX8B 0 +#define DCR_TYPE_DEPTHWISE_POOL_FIX8B 1 +#define DCR_TYPE_FC_FIX8B 2 +#define DCR_TYPE_TENSOR_ARITH_FIX8B 3 +#define DCR_TYPE_FC_TYPE_2_FIX8B 4 + +// BD control bits base on BD_CTRL_BASE_ADDR +#define BD_TPU_EN 0 // TPU Enable bit +#define BD_LANE_NUM 22 // Lane number bit[29:22] +#define BD_DES_ADDR_VLD 30 // enable descriptor mode +#define BD_INTR_ENABLE 31 // TIU interrupt global enable + +typedef enum _TIU_LANNUM { + TIU_LANNUM_2 = 0x1, + TIU_LANNUM_4 = 0x2, + TIU_LANNUM_8 = 0x3, + TIU_LANNUM_16 = 0x4, + TIU_LANNUM_32 = 0x5, + TIU_LANNUM_64 = 0x6, +} TIU_LANNUM; + +#endif /* REG_BDCAST_H */ diff --git a/cvikernel/include/bmkernel/reg_tdma.h b/cvikernel/include/bmkernel/reg_tdma.h new file mode 100644 index 000000000..7a4020546 --- /dev/null +++ b/cvikernel/include/bmkernel/reg_tdma.h @@ -0,0 +1,98 @@ +#ifndef REG_GDMA_H +#define REG_GDMA_H + +#define 
TDMA_DESC_REG_BYTES (0x40) +#define TDMA_ENGINE_DESCRIPTOR_NUM (TDMA_DESC_REG_BYTES >> 2) +#define TDMA_NUM_BASE_REGS (0x8) + +//backward compatible? +#define GDMA_TYPE_f32 0 +#define GDMA_TYPE_f16 1 +#define GDMA_TYPE_i32 2 +#define GDMA_TYPE_i16 3 +#define GDMA_TYPE_i8 4 +#define GDMA_TYPE_i4 5 +#define GDMA_TYPE_i2 6 +#define GDMA_TYPE_i1 7 +#define LAST_GDMA_TYPE_i1 8 + + +//tdma descriptor define +#define TDMA_DESCRIPTOR_ALIGNED_BIT 6 + +#define TDMA_CMD_ACCP0 0 +#define TDMA_CMD_ACCP1 4 +#define TDMA_CMD_ACCP2 8 +#define TDMA_CMD_ACCP3 12 +#define TDMA_CMD_ACCP4 16 +#define TDMA_CMD_ACCP5 20 +#define TDMA_CMD_ACCP6 24 +#define TDMA_CMD_ACCP7 28 +#define TDMA_CMD_ACCP8 32 +#define TDMA_CMD_ACCP9 36 +#define TDMA_CMD_ACCP10 40 +#define TDMA_CMD_ACCP11 44 +#define TDMA_CMD_ACCP12 48 +#define TDMA_CMD_ACCP13 52 +#define TDMA_CMD_ACCP14 56 + +#define TDMA_ACCPI0_CMD_VALID_BIT 0 +#define TDMA_ACCPI0_EOD_BIT 2 +#define TDMA_ACCPI0_INTERRUPT_BIT 3 +#define TDMA_ACCPI0_BARRIER_ENABLE_BIT 4 + + +//tdma control define +#define TDMA_CTRL (TDMA_ENGINE_BASE_ADDR + 0x0) +#define TDMA_DES_BASE (TDMA_ENGINE_BASE_ADDR + 0x4) +#define TDMA_INT_MASK (TDMA_ENGINE_BASE_ADDR + 0x8) +#define TDMA_SYNC_STATUS (TDMA_ENGINE_BASE_ADDR + 0xC) +#define TDMA_ARRAYBASE0_L (TDMA_ENGINE_BASE_ADDR + 0x70) +#define TDMA_ARRAYBASE1_L (TDMA_ENGINE_BASE_ADDR + 0x74) +#define TDMA_ARRAYBASE2_L (TDMA_ENGINE_BASE_ADDR + 0x78) +#define TDMA_ARRAYBASE3_L (TDMA_ENGINE_BASE_ADDR + 0x7C) +#define TDMA_ARRAYBASE4_L (TDMA_ENGINE_BASE_ADDR + 0x80) +#define TDMA_ARRAYBASE5_L (TDMA_ENGINE_BASE_ADDR + 0x84) +#define TDMA_ARRAYBASE6_L (TDMA_ENGINE_BASE_ADDR + 0x88) +#define TDMA_ARRAYBASE7_L (TDMA_ENGINE_BASE_ADDR + 0x8C) +#define TDMA_ARRAYBASE0_H (TDMA_ENGINE_BASE_ADDR + 0x90) +#define TDMA_ARRAYBASE1_H (TDMA_ENGINE_BASE_ADDR + 0x94) +#define TDMA_DEBUG_MODE (TDMA_ENGINE_BASE_ADDR + 0xA0) + + + +#define TDMA_CTRL_ENABLE_BIT 0 +#define TDMA_CTRL_MODESEL_BIT 1 +#define TDMA_CTRL_RESET_SYNCID_BIT 2 +#define TDMA_CTRL_FORCE_1ARRAY 5 +#define TDMA_CTRL_FORCE_2ARRAY 6 +#define TDMA_CTRL_BURSTLEN_BIT 8 +#define TDMA_CTRL_64BYTE_ALIGN_EN 10 +#define TDMA_CTRL_DESNUM_BIT 16 + + + + +//This function only supports the following conditions: +//localmem2tensor or tensor2localmem +//The source and dst share the same format +//Data is 32 bit +//no stride +//We use it in forward_cpu/backward_cpu +static inline int get_index_data_format(int size) +{ + if (size == 1) { + return GDMA_TYPE_i1; + } else if (size <= 16) { + return GDMA_TYPE_i4; + } else if (size <= 256) { + return GDMA_TYPE_i8; + } else { + return GDMA_TYPE_i16; + } +} +#define LRN_LEFT_SHIFT 0 +#define LRN_RIGHT_SHIFT 1 + +#endif /* REG_GDMA_H */ + diff --git a/cvikernel/include/bmkernel/reg_tiu.h b/cvikernel/include/bmkernel/reg_tiu.h new file mode 100644 index 000000000..edef598bc --- /dev/null +++ b/cvikernel/include/bmkernel/reg_tiu.h @@ -0,0 +1,20 @@ +#ifndef REG_TIU_H +#define REG_TIU_H + +#define TIU_DESC_REG_BYTES (0x70) +#define TIU_ENGINE_DESCRIPTOR_NUM (TIU_DESC_REG_BYTES >> 2) + +// TIU operation data type +#define DCR_TYPE_CONV_FIX8B 0 +#define DCR_TYPE_DEPTHWISE_POOL_FIX8B 1 +#define DCR_TYPE_FC_FIX8B 2 +#define DCR_TYPE_TENSOR_ARITH_FIX8B 3 +#define NR_DCR_TYPES 4 + +// BD control bits based on BD_CTRL_BASE_ADDR +#define BD_TPU_EN 0 // TPU Enable bit +#define BD_LANE_NUM 22 // Lane number bit[29:22] +#define BD_DES_ADDR_VLD 30 // enable descriptor mode +#define BD_INTR_ENABLE 31 // TIU interrupt global enable + +#endif /* REG_TIU_H */ diff --git
a/cvikernel/include/cvikernel/cv180x/cv180x_tdma_reg.h b/cvikernel/include/cvikernel/cv180x/cv180x_tdma_reg.h new file mode 100644 index 000000000..4c3f31653 --- /dev/null +++ b/cvikernel/include/cvikernel/cv180x/cv180x_tdma_reg.h @@ -0,0 +1,310 @@ +#ifndef CV180X_TDMA_REG_H +#define CV180X_TDMA_REG_H + +/* + * This file is generated by tools. Do not edit it manually. + */ + +#include +#include + +#define TDMA_DESC_REG_BYTES (0x40) +#define TDMA_ENGINE_DESCRIPTOR_NUM (TDMA_DESC_REG_BYTES >> 2) +#define TDMA_NUM_BASE_REGS (0x8) + +typedef unsigned long long ullong; + +typedef struct { + uint32_t vld; + uint32_t compress_en; + uint32_t eod; + uint32_t intp_en; + uint32_t bar_en; + uint32_t check_bf16_value; + uint32_t trans_dir; + uint32_t rsv00; + uint32_t trans_fmt; + uint32_t transpose_md; + uint32_t rsv01; + uint32_t intra_cmd_paral; + uint32_t outstanding_en; + uint32_t cmd_id; + uint32_t spec_func; + uint32_t dst_fmt; + uint32_t src_fmt; + uint32_t cmprs_fmt; + uint32_t sys_dtype; + uint32_t rsv2_1; + uint32_t int8_sign; + uint32_t compress_zero_guard; + uint32_t int8_rnd_mode; + uint32_t wait_id_tpu; + uint32_t wait_id_other_tdma; + uint32_t wait_id_sdma; + uint32_t const_val; + uint32_t src_base_reg_sel; + uint32_t mv_lut_idx; + uint32_t dst_base_reg_sel; + uint32_t mv_lut_base; + uint32_t rsv4_5; + uint32_t dst_h_stride; + uint32_t dst_c_stride_low; + uint32_t dst_n_stride; + uint32_t src_h_stride; + uint32_t src_c_stride_low; + uint32_t src_n_stride; + uint32_t dst_c; + uint32_t src_c; + uint32_t dst_w; + uint32_t dst_h; + uint32_t src_w; + uint32_t src_h; + uint32_t dst_base_addr_low; + uint32_t src_base_addr_low; + uint32_t src_n; + uint32_t dst_base_addr_high; + uint32_t src_base_addr_high; + uint32_t src_c_stride_high; + uint32_t dst_c_stride_high; + uint32_t compress_bias0; + uint32_t compress_bias1; + uint32_t layer_ID; +} tdma_reg_t; + +static inline void parse_tdma_reg(tdma_reg_t *r, const uint32_t *p) +{ + r->vld = p[0] & 1; + r->compress_en = (p[0] >> 1) & 1; + r->eod = (p[0] >> 2) & 1; + r->intp_en = (p[0] >> 3) & 1; + r->bar_en = (p[0] >> 4) & 1; + r->check_bf16_value = (p[0] >> 5) & 1; + r->trans_dir = (p[0] >> 6) & ((1u << 2) - 1); + r->rsv00 = (p[0] >> 8) & ((1u << 2) - 1); + r->trans_fmt = (p[0] >> 10) & 1; + r->transpose_md = (p[0] >> 11) & ((1u << 2) - 1); + r->rsv01 = (p[0] >> 13) & 1; + r->intra_cmd_paral = (p[0] >> 14) & 1; + r->outstanding_en = (p[0] >> 15) & 1; + r->cmd_id = (p[0] >> 16) & ((1u << 16) - 1); + r->spec_func = p[1] & ((1u << 3) - 1); + r->dst_fmt = (p[1] >> 3) & ((1u << 2) - 1); + r->src_fmt = (p[1] >> 5) & ((1u << 2) - 1); + r->cmprs_fmt = (p[1] >> 7) & 1; + r->sys_dtype = (p[1] >> 8) & 1; + r->rsv2_1 = (p[1] >> 9) & ((1u << 4) - 1); + r->int8_sign = (p[1] >> 13) & 1; + r->compress_zero_guard = (p[1] >> 14) & 1; + r->int8_rnd_mode = (p[1] >> 15) & 1; + r->wait_id_tpu = (p[1] >> 16) & ((1u << 16) - 1); + r->wait_id_other_tdma = p[2] & ((1u << 16) - 1); + r->wait_id_sdma = (p[2] >> 16) & ((1u << 16) - 1); + r->const_val = p[3] & ((1u << 16) - 1); + r->src_base_reg_sel = (p[3] >> 16) & ((1u << 3) - 1); + r->mv_lut_idx = (p[3] >> 19) & 1; + r->dst_base_reg_sel = (p[3] >> 20) & ((1u << 3) - 1); + r->mv_lut_base = (p[3] >> 23) & 1; + r->rsv4_5 = (p[3] >> 24) & ((1u << 8) - 1); + r->dst_h_stride = p[4] & ((1u << 16) - 1); + r->dst_c_stride_low = (p[4] >> 16) & ((1u << 16) - 1); + r->dst_n_stride = p[5]; + r->src_h_stride = p[6] & ((1u << 16) - 1); + r->src_c_stride_low = (p[6] >> 16) & ((1u << 16) - 1); + r->src_n_stride = p[7]; + r->dst_c = p[8] & 
((1u << 16) - 1); + r->src_c = (p[8] >> 16) & ((1u << 16) - 1); + r->dst_w = p[9] & ((1u << 16) - 1); + r->dst_h = (p[9] >> 16) & ((1u << 16) - 1); + r->src_w = p[10] & ((1u << 16) - 1); + r->src_h = (p[10] >> 16) & ((1u << 16) - 1); + r->dst_base_addr_low = p[11]; + r->src_base_addr_low = p[12]; + r->src_n = p[13] & ((1u << 16) - 1); + r->dst_base_addr_high = (p[13] >> 16) & ((1u << 8) - 1); + r->src_base_addr_high = (p[13] >> 24) & ((1u << 8) - 1); + r->src_c_stride_high = p[14] & ((1u << 16) - 1); + r->dst_c_stride_high = (p[14] >> 16) & ((1u << 16) - 1); + r->compress_bias0 = p[15] & ((1u << 8) - 1); + r->compress_bias1 = (p[15] >> 8) & ((1u << 8) - 1); + r->layer_ID = (p[15] >> 16) & ((1u << 16) - 1); +} + +static inline void emit_tdma_reg(const tdma_reg_t *r, uint32_t *_p) +{ + volatile uint32_t *p = (typeof(p))_p; + p[15] = (r->compress_bias0 & ((1u << 8) - 1)) | + ((r->compress_bias1 & ((1u << 8) - 1)) << 8) | + ((r->layer_ID & ((1u << 16) - 1)) << 16); + p[14] = (r->src_c_stride_high & ((1u << 16) - 1)) | + ((r->dst_c_stride_high & ((1u << 16) - 1)) << 16); + p[13] = (r->src_n & ((1u << 16) - 1)) | + ((r->dst_base_addr_high & ((1u << 8) - 1)) << 16) | + ((r->src_base_addr_high & ((1u << 8) - 1)) << 24); + p[12] = (r->src_base_addr_low & (((uint64_t)1 << 32) - 1)); + p[11] = (r->dst_base_addr_low & (((uint64_t)1 << 32) - 1)); + p[10] = (r->src_w & ((1u << 16) - 1)) | + ((r->src_h & ((1u << 16) - 1)) << 16); + p[9] = (r->dst_w & ((1u << 16) - 1)) | + ((r->dst_h & ((1u << 16) - 1)) << 16); + p[8] = (r->dst_c & ((1u << 16) - 1)) | + ((r->src_c & ((1u << 16) - 1)) << 16); + p[7] = (r->src_n_stride & (((uint64_t)1 << 32) - 1)); + p[6] = (r->src_h_stride & ((1u << 16) - 1)) | + ((r->src_c_stride_low & ((1u << 16) - 1)) << 16); + p[5] = (r->dst_n_stride & (((uint64_t)1 << 32) - 1)); + p[4] = (r->dst_h_stride & ((1u << 16) - 1)) | + ((r->dst_c_stride_low & ((1u << 16) - 1)) << 16); + p[3] = (r->const_val & ((1u << 16) - 1)) | + ((r->src_base_reg_sel & ((1u << 3) - 1)) << 16) | + ((r->mv_lut_idx & 1) << 19) | + ((r->dst_base_reg_sel & ((1u << 3) - 1)) << 20) | + ((r->mv_lut_base & 1) << 23) | + ((r->rsv4_5 & ((1u << 8) - 1)) << 24); + p[2] = (r->wait_id_other_tdma & ((1u << 16) - 1)) | + ((r->wait_id_sdma & ((1u << 16) - 1)) << 16); + p[1] = (r->spec_func & ((1u << 3) - 1)) | + ((r->dst_fmt & ((1u << 2) - 1)) << 3) | + ((r->src_fmt & ((1u << 2) - 1)) << 5) | + ((r->cmprs_fmt & 1) << 7) | + ((r->sys_dtype & 1) << 8) | + ((r->rsv2_1 & ((1u << 4) - 1)) << 9) | + ((r->int8_sign & 1) << 13) | + ((r->compress_zero_guard & 1) << 14) | + ((r->int8_rnd_mode & 1) << 15) | + ((r->wait_id_tpu & ((1u << 16) - 1)) << 16); + p[0] = (r->vld & 1) | + ((r->compress_en & 1) << 1) | + ((r->eod & 1) << 2) | + ((r->intp_en & 1) << 3) | + ((r->bar_en & 1) << 4) | + ((r->check_bf16_value & 1) << 5) | + ((r->trans_dir & ((1u << 2) - 1)) << 6) | + ((r->rsv00 & ((1u << 2) - 1)) << 8) | + ((r->trans_fmt & 1) << 10) | + ((r->transpose_md & ((1u << 2) - 1)) << 11) | + ((r->rsv01 & 1) << 13) | + ((r->intra_cmd_paral & 1) << 14) | + ((r->outstanding_en & 1) << 15) | + ((r->cmd_id & ((1u << 16) - 1)) << 16); +} + +static inline void reset_tdma_reg(tdma_reg_t *r) +{ + r->vld = 0x0; + r->compress_en = 0x0; + r->eod = 0x0; + r->intp_en = 0x0; + r->bar_en = 0x0; + r->check_bf16_value = 0x0; + r->trans_dir = 0x0; + r->rsv00 = 0x0; + r->trans_fmt = 0x0; + r->transpose_md = 0x0; + r->rsv01 = 0x0; + r->intra_cmd_paral = 0x0; + r->outstanding_en = 0x0; + r->cmd_id = 0x0; + r->spec_func = 0x0; + r->dst_fmt = 0x1; + r->src_fmt = 0x1; 
+ r->cmprs_fmt = 0x0; + r->sys_dtype = 0x0; + r->rsv2_1 = 0x0; + r->int8_sign = 0x0; + r->compress_zero_guard = 0x0; + r->int8_rnd_mode = 0x0; + r->wait_id_tpu = 0x0; + r->wait_id_other_tdma = 0x0; + r->wait_id_sdma = 0x0; + r->const_val = 0x0; + r->src_base_reg_sel = 0x0; + r->mv_lut_idx = 0x0; + r->dst_base_reg_sel = 0x0; + r->mv_lut_base = 0x0; + r->rsv4_5 = 0x0; + r->dst_h_stride = 0x1; + r->dst_c_stride_low = 0x1; + r->dst_n_stride = 0x1; + r->src_h_stride = 0x1; + r->src_c_stride_low = 0x1; + r->src_n_stride = 0x1; + r->dst_c = 0x1; + r->src_c = 0x1; + r->dst_w = 0x1; + r->dst_h = 0x1; + r->src_w = 0x1; + r->src_h = 0x1; + r->dst_base_addr_low = 0x0; + r->src_base_addr_low = 0x0; + r->src_n = 0x1; + r->dst_base_addr_high = 0x0; + r->src_base_addr_high = 0x0; + r->src_c_stride_high = 0x0; + r->dst_c_stride_high = 0x0; + r->compress_bias0 = 0x0; + r->compress_bias1 = 0x0; + r->layer_ID = 0x0; +} + +static inline void trace_tdma_reg(tdma_reg_t *r, const char *tag) +{ +#define trace_one_reg(name) \ + printf(" %s: 0x%llx\n", #name, (ullong)r->name) + + printf("--- %s ---\n", tag); + trace_one_reg(vld); + trace_one_reg(compress_en); + trace_one_reg(eod); + trace_one_reg(intp_en); + trace_one_reg(bar_en); + trace_one_reg(check_bf16_value); + trace_one_reg(trans_dir); + trace_one_reg(rsv00); + trace_one_reg(trans_fmt); + trace_one_reg(transpose_md); + trace_one_reg(rsv01); + trace_one_reg(intra_cmd_paral); + trace_one_reg(outstanding_en); + trace_one_reg(cmd_id); + trace_one_reg(spec_func); + trace_one_reg(dst_fmt); + trace_one_reg(src_fmt); + trace_one_reg(cmprs_fmt); + trace_one_reg(sys_dtype); + trace_one_reg(rsv2_1); + trace_one_reg(int8_sign); + trace_one_reg(compress_zero_guard); + trace_one_reg(int8_rnd_mode); + trace_one_reg(wait_id_tpu); + trace_one_reg(wait_id_other_tdma); + trace_one_reg(wait_id_sdma); + trace_one_reg(const_val); + trace_one_reg(src_base_reg_sel); + trace_one_reg(mv_lut_idx); + trace_one_reg(dst_base_reg_sel); + trace_one_reg(mv_lut_base); + trace_one_reg(rsv4_5); + trace_one_reg(dst_h_stride); + trace_one_reg(dst_c_stride_low); + trace_one_reg(dst_n_stride); + trace_one_reg(src_h_stride); + trace_one_reg(src_c_stride_low); + trace_one_reg(src_n_stride); + trace_one_reg(dst_c); + trace_one_reg(src_c); + trace_one_reg(dst_w); + trace_one_reg(dst_h); + trace_one_reg(src_w); + trace_one_reg(src_h); + trace_one_reg(dst_base_addr_low); + trace_one_reg(src_base_addr_low); + trace_one_reg(src_n); + trace_one_reg(dst_base_addr_high); + trace_one_reg(src_base_addr_high); + trace_one_reg(src_c_stride_high); + trace_one_reg(dst_c_stride_high); + trace_one_reg(compress_bias0); + trace_one_reg(compress_bias1); + trace_one_reg(layer_ID); +} +#endif /* CV180X_TDMA_REG_H */ diff --git a/cvikernel/include/cvikernel/cv180x/cv180x_tiu_reg.h b/cvikernel/include/cvikernel/cv180x/cv180x_tiu_reg.h new file mode 100644 index 000000000..7b615f23b --- /dev/null +++ b/cvikernel/include/cvikernel/cv180x/cv180x_tiu_reg.h @@ -0,0 +1,622 @@ +#ifndef CV180X_TIU_REG_H +#define CV180X_TIU_REG_H + +/* + * This file is generated by tools. Do not edit it manually. 
+ */ + +#include +#include + +#define TIU_DESC_REG_BYTES (0x70) +#define TIU_ENGINE_DESCRIPTOR_NUM (TIU_DESC_REG_BYTES >> 2) + +// TIU operation data type +#define DCR_TYPE_CONV_FIX8B 0 +#define DCR_TYPE_DEPTHWISE_POOL_FIX8B 1 +#define DCR_TYPE_FC_FIX8B 2 +#define DCR_TYPE_TENSOR_ARITH_FIX8B 3 +#define NR_DCR_TYPES 4 + +#define TENSOR_MUL_FIX8B 0 +#define TENSOR_MAC_FIX8B 1 +#define TENSOR_ADD_FIX8B 2 +#define TENSOR_SUB_FIX8B 3 +#define TENSOR_MAX_FIX8B 4 +#define TENSOR_MIN_FIX8B 5 +#define TENSOR_SHIFT_FIX8B 6 +#define TENSOR_AND_FIX8B 7 +#define TENSOR_OR_FIX8B 8 +#define TENSOR_XOR_FIX8B 9 +#define TENSOR_COPY_FIX8B 10 +#define TENSOR_GE_FIX8B 11 + +typedef unsigned long long ullong; + +typedef struct { + uint32_t cmd_en; + uint32_t cmd_end; + uint32_t cmd_id_en; + uint32_t cmd_keep; + uint32_t cmd_intr_en; + uint32_t tsk_typ; + uint32_t tsk_eu_typ; + uint32_t tsk_opd_num; + uint32_t opt_res_shift; + uint32_t opt_left_shift; + uint32_t opt_shift_typ; + uint32_t opt_rshift_typ; + uint32_t dummy1; + uint32_t opd_typ; + uint32_t opt_chl_quan; + uint32_t cmd_id_tpu; + uint32_t cmd_id_gdma; + uint32_t quan_m; + uint32_t opt_res0_sign; + uint32_t opt_opd0_sign; + uint32_t opt_opd1_sign; + uint32_t opt_opd2_sign; + uint32_t opt_res0_seg; + uint32_t opt_opd0_seg; + uint32_t opt_opd1_seg; + uint32_t opt_opd2_seg; + uint32_t ps32_md; + uint32_t double_conv; + uint32_t opt_left_tran; + uint32_t fp_round_typ; + uint32_t opt_relu_typ; + uint32_t opt_relu_value; + uint32_t cmd_pre_exe_typ; + uint32_t opt_res_add; + uint32_t rsvd0; + uint32_t conv_opd0_x_ins0; + uint32_t conv_opd0_y_ins0; + uint32_t conv_opd0_x_ins0_last; + uint32_t conv_opd0_y_ins0_last; + uint32_t conv_opd1_x_ins0; + uint32_t conv_opd1_y_ins0; + uint32_t dummy0; + uint32_t opd0_ins_val; + uint32_t conv_opd0_up_pad; + uint32_t conv_opd0_dn_pad; + uint32_t conv_opd0_lf_pad; + uint32_t conv_opd0_rt_pad; + uint32_t res0_n; + uint32_t res0_c; + uint32_t res0_h; + uint32_t res0_w; + uint32_t conv_op_x_str; + uint32_t conv_op_y_str; + uint32_t cmd_pre_exe; + uint32_t rsvd1; + uint32_t res0_addr; + uint32_t opd0_addr; + uint32_t opd1_addr; + uint32_t opd2_addr; + uint32_t opt_opd0_const; + uint32_t opt_opd1_const; + uint32_t opt_opd2_const; + uint32_t short_nchwstr_same; + uint32_t short_res0_str; + uint32_t short_opd0_str; + uint32_t short_opd1_str; + uint32_t short_opd2_str; + uint32_t dummy2; + uint32_t opd0_n; + uint32_t opd0_c; + uint32_t dummy3; + uint32_t rsvd2; + uint32_t opd0_h; + uint32_t opd0_w; + uint32_t opd1_n; + uint32_t opd1_c; + uint32_t opd1_h; + uint32_t opd1_w; + uint32_t opd2_n; + uint32_t opd2_c; + uint32_t opd2_h; + uint32_t opd2_w; + uint32_t dummy4; + uint32_t rsvd3; + uint32_t layer_info; + uint32_t res0_n_str; + uint32_t res0_c_str; + uint32_t res0_h_str; + uint32_t res0_w_str; + uint32_t res0_b_str; + uint32_t opd0_n_str; + uint32_t dummy5; + uint32_t rsvd4; + uint32_t opd0_c_str; + uint32_t opd0_h_str; + uint32_t opd0_w_str; + uint32_t opd0_b_str; + uint32_t opd1_n_str; + uint32_t opd1_c_str; + uint32_t opd1_h_str; + uint32_t dummy6; + uint32_t rsvd5; + uint32_t opd1_w_str; + uint32_t opd1_b_str; + uint32_t opd2_n_str; + uint32_t opd2_c_str; + uint32_t opd2_h_str; + uint32_t opd2_w_str; + uint32_t opd2_b_str; + uint32_t dummy7; + uint32_t rsvd6; +} tiu_reg_t; + +static inline void parse_tiu_reg(tiu_reg_t *r, const uint32_t *p) +{ + r->cmd_en = p[0] & 1; + r->cmd_end = (p[0] >> 1) & 1; + r->cmd_id_en = (p[0] >> 2) & 1; + r->cmd_keep = (p[0] >> 3) & 1; + r->cmd_intr_en = (p[0] >> 4) & 1; + r->tsk_typ = (p[0] >> 
5) & ((1u << 4) - 1); + r->tsk_eu_typ = (p[0] >> 9) & ((1u << 5) - 1); + r->tsk_opd_num = (p[0] >> 14) & ((1u << 2) - 1); + r->opt_res_shift = (p[0] >> 16) & ((1u << 6) - 1); + r->opt_left_shift = (p[0] >> 22) & ((1u << 5) - 1); + r->opt_shift_typ = (p[0] >> 27) & 1; + r->opt_rshift_typ = (p[0] >> 28) & 1; + r->dummy1 = (p[0] >> 29) & 1; + r->opd_typ = (p[0] >> 30) & 1; + r->opt_chl_quan = (p[0] >> 31) & 1; + r->cmd_id_tpu = p[1] & ((1u << 16) - 1); + r->cmd_id_gdma = (p[1] >> 16) & ((1u << 16) - 1); + r->quan_m = p[2]; + r->opt_res0_sign = p[3] & 1; + r->opt_opd0_sign = (p[3] >> 1) & 1; + r->opt_opd1_sign = (p[3] >> 2) & 1; + r->opt_opd2_sign = (p[3] >> 3) & 1; + r->opt_res0_seg = (p[3] >> 4) & ((1u << 2) - 1); + r->opt_opd0_seg = (p[3] >> 6) & ((1u << 2) - 1); + r->opt_opd1_seg = (p[3] >> 8) & ((1u << 2) - 1); + r->opt_opd2_seg = (p[3] >> 10) & 1; + r->ps32_md = (p[3] >> 11) & ((1u << 2) - 1); + r->double_conv = (p[3] >> 13) & 1; + r->opt_left_tran = (p[3] >> 14) & 1; + r->fp_round_typ = (p[3] >> 15) & 1; + r->opt_relu_typ = (p[3] >> 16) & ((1u << 2) - 1); + r->opt_relu_value = (p[3] >> 18) & ((1u << 8) - 1); + r->cmd_pre_exe_typ = (p[3] >> 26) & 1; + r->opt_res_add = (p[3] >> 27) & 1; + r->rsvd0 = (p[3] >> 28) & ((1u << 4) - 1); + r->conv_opd0_x_ins0 = p[4] & ((1u << 4) - 1); + r->conv_opd0_y_ins0 = (p[4] >> 4) & ((1u << 4) - 1); + r->conv_opd0_x_ins0_last = (p[4] >> 8) & ((1u << 4) - 1); + r->conv_opd0_y_ins0_last = (p[4] >> 12) & ((1u << 4) - 1); + r->conv_opd1_x_ins0 = (p[4] >> 16) & ((1u << 4) - 1); + r->conv_opd1_y_ins0 = (p[4] >> 20) & ((1u << 4) - 1); + r->dummy0 = (p[4] >> 24) & ((1u << 8) - 1); + r->opd0_ins_val = p[5] & ((1u << 16) - 1); + r->conv_opd0_up_pad = (p[5] >> 16) & ((1u << 4) - 1); + r->conv_opd0_dn_pad = (p[5] >> 20) & ((1u << 4) - 1); + r->conv_opd0_lf_pad = (p[5] >> 24) & ((1u << 4) - 1); + r->conv_opd0_rt_pad = (p[5] >> 28) & ((1u << 4) - 1); + r->res0_n = p[6] & ((1u << 12) - 1); + r->res0_c = (p[6] >> 12) & ((1u << 12) - 1); + r->res0_h = (p[6] >> 24) & ((1u << 8) - 1); + r->res0_h |= (uint64_t)(p[7] & ((1u << 4) - 1)) << 8; + r->res0_w = (p[7] >> 4) & ((1u << 12) - 1); + r->conv_op_x_str = (p[7] >> 16) & ((1u << 5) - 1); + r->conv_op_y_str = (p[7] >> 21) & ((1u << 5) - 1); + r->cmd_pre_exe = (p[7] >> 26) & ((1u << 2) - 1); + r->rsvd1 = (p[7] >> 28) & ((1u << 4) - 1); + r->res0_addr = p[8] & ((1u << 24) - 1); + r->opd0_addr = (p[8] >> 24) & ((1u << 8) - 1); + r->opd0_addr |= (uint64_t)(p[9] & ((1u << 16) - 1)) << 8; + r->opd1_addr = (p[9] >> 16) & ((1u << 16) - 1); + r->opd2_addr = p[10] & ((1u << 16) - 1); + r->opt_opd0_const = (p[10] >> 16) & 1; + r->opt_opd1_const = (p[10] >> 17) & 1; + r->opt_opd2_const = (p[10] >> 18) & 1; + r->short_nchwstr_same = (p[10] >> 19) & 1; + r->short_res0_str = (p[10] >> 20) & ((1u << 2) - 1); + r->short_opd0_str = (p[10] >> 22) & ((1u << 2) - 1); + r->short_opd1_str = (p[10] >> 24) & ((1u << 2) - 1); + r->short_opd2_str = (p[10] >> 26) & ((1u << 2) - 1); + r->dummy2 = (p[10] >> 28) & ((1u << 4) - 1); + r->opd0_n = p[11] & ((1u << 12) - 1); + r->opd0_c = (p[11] >> 12) & ((1u << 12) - 1); + r->dummy3 = (p[11] >> 24) & ((1u << 4) - 1); + r->rsvd2 = (p[11] >> 28) & ((1u << 4) - 1); + r->opd0_h = p[12] & ((1u << 12) - 1); + r->opd0_w = (p[12] >> 12) & ((1u << 12) - 1); + r->opd1_n = (p[12] >> 24) & ((1u << 8) - 1); + r->opd1_n |= (uint64_t)(p[13] & ((1u << 4) - 1)) << 8; + r->opd1_c = (p[13] >> 4) & ((1u << 12) - 1); + r->opd1_h = (p[13] >> 16) & ((1u << 12) - 1); + r->opd1_w = (p[13] >> 28) & ((1u << 4) - 1); + r->opd1_w |= 
(uint64_t)(p[14] & ((1u << 8) - 1)) << 4; + r->opd2_n = (p[14] >> 8) & ((1u << 12) - 1); + r->opd2_c = (p[14] >> 20) & ((1u << 12) - 1); + r->opd2_h = p[15] & ((1u << 12) - 1); + r->opd2_w = (p[15] >> 12) & ((1u << 12) - 1); + r->dummy4 = (p[15] >> 24) & ((1u << 4) - 1); + r->rsvd3 = (p[15] >> 28) & ((1u << 4) - 1); + r->layer_info = p[16] & ((1u << 16) - 1); + r->res0_n_str = (p[16] >> 16) & ((1u << 16) - 1); + r->res0_c_str = p[17] & ((1u << 16) - 1); + r->res0_h_str = (p[17] >> 16) & ((1u << 16) - 1); + r->res0_w_str = p[18] & ((1u << 16) - 1); + r->res0_b_str = (p[18] >> 16) & ((1u << 16) - 1); + r->opd0_n_str = p[19] & ((1u << 16) - 1); + r->dummy5 = (p[19] >> 16) & ((1u << 12) - 1); + r->rsvd4 = (p[19] >> 28) & ((1u << 4) - 1); + r->opd0_c_str = p[20] & ((1u << 16) - 1); + r->opd0_h_str = (p[20] >> 16) & ((1u << 16) - 1); + r->opd0_w_str = p[21] & ((1u << 16) - 1); + r->opd0_b_str = (p[21] >> 16) & ((1u << 16) - 1); + r->opd1_n_str = p[22] & ((1u << 16) - 1); + r->opd1_c_str = (p[22] >> 16) & ((1u << 16) - 1); + r->opd1_h_str = p[23] & ((1u << 16) - 1); + r->dummy6 = (p[23] >> 16) & ((1u << 12) - 1); + r->rsvd5 = (p[23] >> 28) & ((1u << 4) - 1); + r->opd1_w_str = p[24] & ((1u << 16) - 1); + r->opd1_b_str = (p[24] >> 16) & ((1u << 16) - 1); + r->opd2_n_str = p[25] & ((1u << 16) - 1); + r->opd2_c_str = (p[25] >> 16) & ((1u << 16) - 1); + r->opd2_h_str = p[26] & ((1u << 16) - 1); + r->opd2_w_str = (p[26] >> 16) & ((1u << 16) - 1); + r->opd2_b_str = p[27] & ((1u << 16) - 1); + r->dummy7 = (p[27] >> 16) & ((1u << 12) - 1); + r->rsvd6 = (p[27] >> 28) & ((1u << 4) - 1); +} + +static inline void emit_tiu_reg(const tiu_reg_t *r, uint32_t *_p) +{ + volatile uint32_t *p = (typeof(p))_p; + p[27] = (r->opd2_b_str & ((1u << 16) - 1)) | + ((r->dummy7 & ((1u << 12) - 1)) << 16) | + ((r->rsvd6 & ((1u << 4) - 1)) << 28); + p[26] = (r->opd2_h_str & ((1u << 16) - 1)) | + ((r->opd2_w_str & ((1u << 16) - 1)) << 16); + p[25] = (r->opd2_n_str & ((1u << 16) - 1)) | + ((r->opd2_c_str & ((1u << 16) - 1)) << 16); + p[24] = (r->opd1_w_str & ((1u << 16) - 1)) | + ((r->opd1_b_str & ((1u << 16) - 1)) << 16); + p[23] = (r->opd1_h_str & ((1u << 16) - 1)) | + ((r->dummy6 & ((1u << 12) - 1)) << 16) | + ((r->rsvd5 & ((1u << 4) - 1)) << 28); + p[22] = (r->opd1_n_str & ((1u << 16) - 1)) | + ((r->opd1_c_str & ((1u << 16) - 1)) << 16); + p[21] = (r->opd0_w_str & ((1u << 16) - 1)) | + ((r->opd0_b_str & ((1u << 16) - 1)) << 16); + p[20] = (r->opd0_c_str & ((1u << 16) - 1)) | + ((r->opd0_h_str & ((1u << 16) - 1)) << 16); + p[19] = (r->opd0_n_str & ((1u << 16) - 1)) | + ((r->dummy5 & ((1u << 12) - 1)) << 16) | + ((r->rsvd4 & ((1u << 4) - 1)) << 28); + p[18] = (r->res0_w_str & ((1u << 16) - 1)) | + ((r->res0_b_str & ((1u << 16) - 1)) << 16); + p[17] = (r->res0_c_str & ((1u << 16) - 1)) | + ((r->res0_h_str & ((1u << 16) - 1)) << 16); + p[16] = (r->layer_info & ((1u << 16) - 1)) | + ((r->res0_n_str & ((1u << 16) - 1)) << 16); + p[15] = (r->opd2_h & ((1u << 12) - 1)) | + ((r->opd2_w & ((1u << 12) - 1)) << 12) | + ((r->dummy4 & ((1u << 4) - 1)) << 24) | + ((r->rsvd3 & ((1u << 4) - 1)) << 28); + p[14] = ((r->opd1_w >> 4) & ((1u << 8) - 1)) | + ((r->opd2_n & ((1u << 12) - 1)) << 8) | + ((r->opd2_c & ((1u << 12) - 1)) << 20); + p[13] = ((r->opd1_n >> 8) & ((1u << 4) - 1)) | + ((r->opd1_c & ((1u << 12) - 1)) << 4) | + ((r->opd1_h & ((1u << 12) - 1)) << 16) | + ((r->opd1_w & ((1u << 4) - 1)) << 28); + p[12] = (r->opd0_h & ((1u << 12) - 1)) | + ((r->opd0_w & ((1u << 12) - 1)) << 12) | + ((r->opd1_n & ((1u << 8) - 1)) << 24); + p[11] = 
(r->opd0_n & ((1u << 12) - 1)) | + ((r->opd0_c & ((1u << 12) - 1)) << 12) | + ((r->dummy3 & ((1u << 4) - 1)) << 24) | + ((r->rsvd2 & ((1u << 4) - 1)) << 28); + p[10] = (r->opd2_addr & ((1u << 16) - 1)) | + ((r->opt_opd0_const & 1) << 16) | + ((r->opt_opd1_const & 1) << 17) | + ((r->opt_opd2_const & 1) << 18) | + ((r->short_nchwstr_same & 1) << 19) | + ((r->short_res0_str & ((1u << 2) - 1)) << 20) | + ((r->short_opd0_str & ((1u << 2) - 1)) << 22) | + ((r->short_opd1_str & ((1u << 2) - 1)) << 24) | + ((r->short_opd2_str & ((1u << 2) - 1)) << 26) | + ((r->dummy2 & ((1u << 4) - 1)) << 28); + p[9] = ((r->opd0_addr >> 8) & ((1u << 16) - 1)) | + ((r->opd1_addr & ((1u << 16) - 1)) << 16); + p[8] = (r->res0_addr & ((1u << 24) - 1)) | + ((r->opd0_addr & ((1u << 8) - 1)) << 24); + p[7] = ((r->res0_h >> 8) & ((1u << 4) - 1)) | + ((r->res0_w & ((1u << 12) - 1)) << 4) | + ((r->conv_op_x_str & ((1u << 5) - 1)) << 16) | + ((r->conv_op_y_str & ((1u << 5) - 1)) << 21) | + ((r->cmd_pre_exe & ((1u << 2) - 1)) << 26) | + ((r->rsvd1 & ((1u << 4) - 1)) << 28); + p[6] = (r->res0_n & ((1u << 12) - 1)) | + ((r->res0_c & ((1u << 12) - 1)) << 12) | + ((r->res0_h & ((1u << 8) - 1)) << 24); + p[5] = (r->opd0_ins_val & ((1u << 16) - 1)) | + ((r->conv_opd0_up_pad & ((1u << 4) - 1)) << 16) | + ((r->conv_opd0_dn_pad & ((1u << 4) - 1)) << 20) | + ((r->conv_opd0_lf_pad & ((1u << 4) - 1)) << 24) | + ((r->conv_opd0_rt_pad & ((1u << 4) - 1)) << 28); + p[4] = (r->conv_opd0_x_ins0 & ((1u << 4) - 1)) | + ((r->conv_opd0_y_ins0 & ((1u << 4) - 1)) << 4) | + ((r->conv_opd0_x_ins0_last & ((1u << 4) - 1)) << 8) | + ((r->conv_opd0_y_ins0_last & ((1u << 4) - 1)) << 12) | + ((r->conv_opd1_x_ins0 & ((1u << 4) - 1)) << 16) | + ((r->conv_opd1_y_ins0 & ((1u << 4) - 1)) << 20) | + ((r->dummy0 & ((1u << 8) - 1)) << 24); + p[3] = (r->opt_res0_sign & 1) | + ((r->opt_opd0_sign & 1) << 1) | + ((r->opt_opd1_sign & 1) << 2) | + ((r->opt_opd2_sign & 1) << 3) | + ((r->opt_res0_seg & ((1u << 2) - 1)) << 4) | + ((r->opt_opd0_seg & ((1u << 2) - 1)) << 6) | + ((r->opt_opd1_seg & ((1u << 2) - 1)) << 8) | + ((r->opt_opd2_seg & 1) << 10) | + ((r->ps32_md & ((1u << 2) - 1)) << 11) | + ((r->double_conv & 1) << 13) | + ((r->opt_left_tran & 1) << 14) | + ((r->fp_round_typ & 1) << 15) | + ((r->opt_relu_typ & ((1u << 2) - 1)) << 16) | + ((r->opt_relu_value & ((1u << 8) - 1)) << 18) | + ((r->cmd_pre_exe_typ & 1) << 26) | + ((r->opt_res_add & 1) << 27) | + ((r->rsvd0 & ((1u << 4) - 1)) << 28); + p[2] = (r->quan_m & (((uint64_t)1 << 32) - 1)); + p[1] = (r->cmd_id_tpu & ((1u << 16) - 1)) | + ((r->cmd_id_gdma & ((1u << 16) - 1)) << 16); + p[0] = (r->cmd_en & 1) | + ((r->cmd_end & 1) << 1) | + ((r->cmd_id_en & 1) << 2) | + ((r->cmd_keep & 1) << 3) | + ((r->cmd_intr_en & 1) << 4) | + ((r->tsk_typ & ((1u << 4) - 1)) << 5) | + ((r->tsk_eu_typ & ((1u << 5) - 1)) << 9) | + ((r->tsk_opd_num & ((1u << 2) - 1)) << 14) | + ((r->opt_res_shift & ((1u << 6) - 1)) << 16) | + ((r->opt_left_shift & ((1u << 5) - 1)) << 22) | + ((r->opt_shift_typ & 1) << 27) | + ((r->opt_rshift_typ & 1) << 28) | + ((r->dummy1 & 1) << 29) | + ((r->opd_typ & 1) << 30) | + ((r->opt_chl_quan & 1) << 31); +} + +static inline void reset_tiu_reg(tiu_reg_t *r) +{ + r->cmd_en = 0x0; + r->cmd_end = 0x0; + r->cmd_id_en = 0x0; + r->cmd_keep = 0x0; + r->cmd_intr_en = 0x0; + r->tsk_typ = 0x0; + r->tsk_eu_typ = 0x0; + r->tsk_opd_num = 0x3; + r->opt_res_shift = 0xa; + r->opt_left_shift = 0x2; + r->opt_shift_typ = 0x1; + r->opt_rshift_typ = 0x1; + r->dummy1 = 0x0; + r->opd_typ = 0x0; + r->opt_chl_quan = 0x0; + 
r->cmd_id_tpu = 0x0; + r->cmd_id_gdma = 0x0; + r->quan_m = 0x0; + r->opt_res0_sign = 0x0; + r->opt_opd0_sign = 0x0; + r->opt_opd1_sign = 0x1; + r->opt_opd2_sign = 0x1; + r->opt_res0_seg = 0x1; + r->opt_opd0_seg = 0x1; + r->opt_opd1_seg = 0x1; + r->opt_opd2_seg = 0x0; + r->ps32_md = 0x0; + r->double_conv = 0x0; + r->opt_left_tran = 0x0; + r->fp_round_typ = 0x0; + r->opt_relu_typ = 0x0; + r->opt_relu_value = 0x0; + r->cmd_pre_exe_typ = 0x0; + r->opt_res_add = 0x0; + r->rsvd0 = 0x0; + r->conv_opd0_x_ins0 = 0x0; + r->conv_opd0_y_ins0 = 0x0; + r->conv_opd0_x_ins0_last = 0x0; + r->conv_opd0_y_ins0_last = 0x0; + r->conv_opd1_x_ins0 = 0x0; + r->conv_opd1_y_ins0 = 0x0; + r->dummy0 = 0x0; + r->opd0_ins_val = 0x0; + r->conv_opd0_up_pad = 0x0; + r->conv_opd0_dn_pad = 0x0; + r->conv_opd0_lf_pad = 0x0; + r->conv_opd0_rt_pad = 0x0; + r->res0_n = 0x1; + r->res0_c = 0x1; + r->res0_h = 0x1; + r->res0_w = 0x10; + r->conv_op_x_str = 0x1; + r->conv_op_y_str = 0x1; + r->cmd_pre_exe = 0x0; + r->rsvd1 = 0x1; + r->res0_addr = 0x0; + r->opd0_addr = 0x0; + r->opd1_addr = 0x0; + r->opd2_addr = 0x0; + r->opt_opd0_const = 0x0; + r->opt_opd1_const = 0x0; + r->opt_opd2_const = 0x0; + r->short_nchwstr_same = 0x0; + r->short_res0_str = 0x0; + r->short_opd0_str = 0x0; + r->short_opd1_str = 0x0; + r->short_opd2_str = 0x0; + r->dummy2 = 0x0; + r->opd0_n = 0x1; + r->opd0_c = 0x1; + r->dummy3 = 0x0; + r->rsvd2 = 0x2; + r->opd0_h = 0x1; + r->opd0_w = 0x10; + r->opd1_n = 0x1; + r->opd1_c = 0x1; + r->opd1_h = 0x1; + r->opd1_w = 0x10; + r->opd2_n = 0x1; + r->opd2_c = 0x1; + r->opd2_h = 0x1; + r->opd2_w = 0x10; + r->dummy4 = 0x0; + r->rsvd3 = 0x3; + r->layer_info = 0x0; + r->res0_n_str = 0x10; + r->res0_c_str = 0x10; + r->res0_h_str = 0x0; + r->res0_w_str = 0x1; + r->res0_b_str = 0x10; + r->opd0_n_str = 0x10; + r->dummy5 = 0x0; + r->rsvd4 = 0x4; + r->opd0_c_str = 0x10; + r->opd0_h_str = 0x0; + r->opd0_w_str = 0x1; + r->opd0_b_str = 0x10; + r->opd1_n_str = 0x10; + r->opd1_c_str = 0x10; + r->opd1_h_str = 0x0; + r->dummy6 = 0x0; + r->rsvd5 = 0x5; + r->opd1_w_str = 0x1; + r->opd1_b_str = 0x10; + r->opd2_n_str = 0x10; + r->opd2_c_str = 0x10; + r->opd2_h_str = 0x0; + r->opd2_w_str = 0x1; + r->opd2_b_str = 0x10; + r->dummy7 = 0x0; + r->rsvd6 = 0x6; +} + +static inline void trace_tiu_reg(tiu_reg_t *r, const char *tag) +{ +#define trace_one_reg(name) \ + printf(" %s: 0x%llx\n", #name, (ullong)r->name) + + printf("--- %s ---\n", tag); + trace_one_reg(cmd_en); + trace_one_reg(cmd_end); + trace_one_reg(cmd_id_en); + trace_one_reg(cmd_keep); + trace_one_reg(cmd_intr_en); + trace_one_reg(tsk_typ); + trace_one_reg(tsk_eu_typ); + trace_one_reg(tsk_opd_num); + trace_one_reg(opt_res_shift); + trace_one_reg(opt_left_shift); + trace_one_reg(opt_shift_typ); + trace_one_reg(opt_rshift_typ); + trace_one_reg(dummy1); + trace_one_reg(opd_typ); + trace_one_reg(opt_chl_quan); + trace_one_reg(cmd_id_tpu); + trace_one_reg(cmd_id_gdma); + trace_one_reg(quan_m); + trace_one_reg(opt_res0_sign); + trace_one_reg(opt_opd0_sign); + trace_one_reg(opt_opd1_sign); + trace_one_reg(opt_opd2_sign); + trace_one_reg(opt_res0_seg); + trace_one_reg(opt_opd0_seg); + trace_one_reg(opt_opd1_seg); + trace_one_reg(opt_opd2_seg); + trace_one_reg(ps32_md); + trace_one_reg(double_conv); + trace_one_reg(opt_left_tran); + trace_one_reg(fp_round_typ); + trace_one_reg(opt_relu_typ); + trace_one_reg(opt_relu_value); + trace_one_reg(cmd_pre_exe_typ); + trace_one_reg(opt_res_add); + trace_one_reg(rsvd0); + trace_one_reg(conv_opd0_x_ins0); + trace_one_reg(conv_opd0_y_ins0); + 
trace_one_reg(conv_opd0_x_ins0_last); + trace_one_reg(conv_opd0_y_ins0_last); + trace_one_reg(conv_opd1_x_ins0); + trace_one_reg(conv_opd1_y_ins0); + trace_one_reg(dummy0); + trace_one_reg(opd0_ins_val); + trace_one_reg(conv_opd0_up_pad); + trace_one_reg(conv_opd0_dn_pad); + trace_one_reg(conv_opd0_lf_pad); + trace_one_reg(conv_opd0_rt_pad); + trace_one_reg(res0_n); + trace_one_reg(res0_c); + trace_one_reg(res0_h); + trace_one_reg(res0_w); + trace_one_reg(conv_op_x_str); + trace_one_reg(conv_op_y_str); + trace_one_reg(cmd_pre_exe); + trace_one_reg(rsvd1); + trace_one_reg(res0_addr); + trace_one_reg(opd0_addr); + trace_one_reg(opd1_addr); + trace_one_reg(opd2_addr); + trace_one_reg(opt_opd0_const); + trace_one_reg(opt_opd1_const); + trace_one_reg(opt_opd2_const); + trace_one_reg(short_nchwstr_same); + trace_one_reg(short_res0_str); + trace_one_reg(short_opd0_str); + trace_one_reg(short_opd1_str); + trace_one_reg(short_opd2_str); + trace_one_reg(dummy2); + trace_one_reg(opd0_n); + trace_one_reg(opd0_c); + trace_one_reg(dummy3); + trace_one_reg(rsvd2); + trace_one_reg(opd0_h); + trace_one_reg(opd0_w); + trace_one_reg(opd1_n); + trace_one_reg(opd1_c); + trace_one_reg(opd1_h); + trace_one_reg(opd1_w); + trace_one_reg(opd2_n); + trace_one_reg(opd2_c); + trace_one_reg(opd2_h); + trace_one_reg(opd2_w); + trace_one_reg(dummy4); + trace_one_reg(rsvd3); + trace_one_reg(layer_info); + trace_one_reg(res0_n_str); + trace_one_reg(res0_c_str); + trace_one_reg(res0_h_str); + trace_one_reg(res0_w_str); + trace_one_reg(res0_b_str); + trace_one_reg(opd0_n_str); + trace_one_reg(dummy5); + trace_one_reg(rsvd4); + trace_one_reg(opd0_c_str); + trace_one_reg(opd0_h_str); + trace_one_reg(opd0_w_str); + trace_one_reg(opd0_b_str); + trace_one_reg(opd1_n_str); + trace_one_reg(opd1_c_str); + trace_one_reg(opd1_h_str); + trace_one_reg(dummy6); + trace_one_reg(rsvd5); + trace_one_reg(opd1_w_str); + trace_one_reg(opd1_b_str); + trace_one_reg(opd2_n_str); + trace_one_reg(opd2_c_str); + trace_one_reg(opd2_h_str); + trace_one_reg(opd2_w_str); + trace_one_reg(opd2_b_str); + trace_one_reg(dummy7); + trace_one_reg(rsvd6); +} +#endif /* CV180X_TIU_REG_H */ diff --git a/cvikernel/include/cvikernel/cv180x/cv180x_tpu_cfg.h b/cvikernel/include/cvikernel/cv180x/cv180x_tpu_cfg.h new file mode 100644 index 000000000..323d2761d --- /dev/null +++ b/cvikernel/include/cvikernel/cv180x/cv180x_tpu_cfg.h @@ -0,0 +1,38 @@ +#ifndef __CV180X_TPU_CFG__ +#define __CV180X_TPU_CFG__ + +#define CV180X_VER 182203 +#define CV180X_HW_NPU_SHIFT 1 +#define CV180X_HW_EU_SHIFT 4 +#define CV180X_HW_LMEM_SHIFT 15 +#define CV180X_HW_LMEM_BANKS 8 +#define CV180X_HW_LMEM_BANK_SIZE 0x1000 +#define CV180X_HW_NODE_CHIP_SHIFT 0 +#define CV180X_HW_NPU_NUM (1 << CV180X_HW_NPU_SHIFT) +#define CV180X_HW_EU_NUM (1 << CV180X_HW_EU_SHIFT) +#define CV180X_HW_LMEM_SIZE (1 << CV180X_HW_LMEM_SHIFT) +#define CV180X_HW_LMEM_START_ADDR 0x0C000000 +#define CV180X_HW_NODE_CHIP_NUM (1 << CV180X_HW_NODE_CHIP_SHIFT) + +#if (CV180X_HW_LMEM_SIZE != (CV180X_HW_LMEM_BANK_SIZE * CV180X_HW_LMEM_BANKS)) +#error "Set wrong TPU configuration." 
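+/*
+ * Note (derived from the defines above, added for clarity): the defaults
+ * satisfy this check, since CV180X_HW_LMEM_BANKS * CV180X_HW_LMEM_BANK_SIZE
+ * = 8 * 0x1000 = 0x8000 = 1 << CV180X_HW_LMEM_SHIFT, i.e. 32 KiB of local
+ * memory split into eight 4 KiB banks.
+ */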
+#endif + +#define CV180X_GLOBAL_MEM_START_ADDR 0x0 +#define CV180X_GLOBAL_MEM_SIZE 0x100000000 // + +#define CV180X_GLOBAL_TIU_CMDBUF_ADDR 0x00000000 +#define CV180X_GLOBAL_TDMA_CMDBUF_ADDR 0x00800000 +#define CV180X_GLOBAL_TIU_CMDBUF_RESERVED_SIZE 0x00800000 // 8MB +#define CV180X_GLOBAL_TDMA_CMDBUF_RESERVED_SIZE 0x00800000 // 8MB +#define CV180X_GLOBAL_POOL_RESERVED_SIZE (CV180X_GLOBAL_MEM_SIZE - CV180X_GLOBAL_TIU_CMDBUF_RESERVED_SIZE - CV180X_GLOBAL_TDMA_CMDBUF_RESERVED_SIZE) + +#define CV180X_UART_CTLR_BASE_ADDR 0x04140000 + +#define CV180X_TDMA_ENGINE_BASE_ADDR 0x0C100000 +#define CV180X_TDMA_ENGINE_END_ADDR (CV180X_TDMA_ENGINE_BASE_ADDR + 0x1000) + +#define CV180X_TIU_ENGINE_BASE_ADDR 0x0C101000 //"NPS Register" in memory map? +#define CV180X_TIU_ENGINE_END_ADDR (CV180X_TIU_ENGINE_BASE_ADDR + 0x1000) + +#endif diff --git a/cvikernel/include/cvikernel/cv181x/cv181x_tdma_reg.h b/cvikernel/include/cvikernel/cv181x/cv181x_tdma_reg.h new file mode 100644 index 000000000..6c8f06ef6 --- /dev/null +++ b/cvikernel/include/cvikernel/cv181x/cv181x_tdma_reg.h @@ -0,0 +1,310 @@ +#ifndef CV181X_TDMA_REG_H +#define CV181X_TDMA_REG_H + +/* + * This file is generated by tools. Do not edit it manually. + */ + +#include +#include + +#define TDMA_DESC_REG_BYTES (0x40) +#define TDMA_ENGINE_DESCRIPTOR_NUM (TDMA_DESC_REG_BYTES >> 2) +#define TDMA_NUM_BASE_REGS (0x8) + +typedef unsigned long long ullong; + +typedef struct { + uint32_t vld; + uint32_t compress_en; + uint32_t eod; + uint32_t intp_en; + uint32_t bar_en; + uint32_t check_bf16_value; + uint32_t trans_dir; + uint32_t rsv00; + uint32_t trans_fmt; + uint32_t transpose_md; + uint32_t rsv01; + uint32_t intra_cmd_paral; + uint32_t outstanding_en; + uint32_t cmd_id; + uint32_t spec_func; + uint32_t dst_fmt; + uint32_t src_fmt; + uint32_t cmprs_fmt; + uint32_t sys_dtype; + uint32_t rsv2_1; + uint32_t int8_sign; + uint32_t compress_zero_guard; + uint32_t int8_rnd_mode; + uint32_t wait_id_tpu; + uint32_t wait_id_other_tdma; + uint32_t wait_id_sdma; + uint32_t const_val; + uint32_t src_base_reg_sel; + uint32_t mv_lut_idx; + uint32_t dst_base_reg_sel; + uint32_t mv_lut_base; + uint32_t rsv4_5; + uint32_t dst_h_stride; + uint32_t dst_c_stride_low; + uint32_t dst_n_stride; + uint32_t src_h_stride; + uint32_t src_c_stride_low; + uint32_t src_n_stride; + uint32_t dst_c; + uint32_t src_c; + uint32_t dst_w; + uint32_t dst_h; + uint32_t src_w; + uint32_t src_h; + uint32_t dst_base_addr_low; + uint32_t src_base_addr_low; + uint32_t src_n; + uint32_t dst_base_addr_high; + uint32_t src_base_addr_high; + uint32_t src_c_stride_high; + uint32_t dst_c_stride_high; + uint32_t compress_bias0; + uint32_t compress_bias1; + uint32_t layer_ID; +} tdma_reg_t; + +static inline void parse_tdma_reg(tdma_reg_t *r, const uint32_t *p) +{ + r->vld = p[0] & 1; + r->compress_en = (p[0] >> 1) & 1; + r->eod = (p[0] >> 2) & 1; + r->intp_en = (p[0] >> 3) & 1; + r->bar_en = (p[0] >> 4) & 1; + r->check_bf16_value = (p[0] >> 5) & 1; + r->trans_dir = (p[0] >> 6) & ((1u << 2) - 1); + r->rsv00 = (p[0] >> 8) & ((1u << 2) - 1); + r->trans_fmt = (p[0] >> 10) & 1; + r->transpose_md = (p[0] >> 11) & ((1u << 2) - 1); + r->rsv01 = (p[0] >> 13) & 1; + r->intra_cmd_paral = (p[0] >> 14) & 1; + r->outstanding_en = (p[0] >> 15) & 1; + r->cmd_id = (p[0] >> 16) & ((1u << 16) - 1); + r->spec_func = p[1] & ((1u << 3) - 1); + r->dst_fmt = (p[1] >> 3) & ((1u << 2) - 1); + r->src_fmt = (p[1] >> 5) & ((1u << 2) - 1); + r->cmprs_fmt = (p[1] >> 7) & 1; + r->sys_dtype = (p[1] >> 8) & 1; + r->rsv2_1 = (p[1] >> 9) & 
((1u << 4) - 1); + r->int8_sign = (p[1] >> 13) & 1; + r->compress_zero_guard = (p[1] >> 14) & 1; + r->int8_rnd_mode = (p[1] >> 15) & 1; + r->wait_id_tpu = (p[1] >> 16) & ((1u << 16) - 1); + r->wait_id_other_tdma = p[2] & ((1u << 16) - 1); + r->wait_id_sdma = (p[2] >> 16) & ((1u << 16) - 1); + r->const_val = p[3] & ((1u << 16) - 1); + r->src_base_reg_sel = (p[3] >> 16) & ((1u << 3) - 1); + r->mv_lut_idx = (p[3] >> 19) & 1; + r->dst_base_reg_sel = (p[3] >> 20) & ((1u << 3) - 1); + r->mv_lut_base = (p[3] >> 23) & 1; + r->rsv4_5 = (p[3] >> 24) & ((1u << 8) - 1); + r->dst_h_stride = p[4] & ((1u << 16) - 1); + r->dst_c_stride_low = (p[4] >> 16) & ((1u << 16) - 1); + r->dst_n_stride = p[5]; + r->src_h_stride = p[6] & ((1u << 16) - 1); + r->src_c_stride_low = (p[6] >> 16) & ((1u << 16) - 1); + r->src_n_stride = p[7]; + r->dst_c = p[8] & ((1u << 16) - 1); + r->src_c = (p[8] >> 16) & ((1u << 16) - 1); + r->dst_w = p[9] & ((1u << 16) - 1); + r->dst_h = (p[9] >> 16) & ((1u << 16) - 1); + r->src_w = p[10] & ((1u << 16) - 1); + r->src_h = (p[10] >> 16) & ((1u << 16) - 1); + r->dst_base_addr_low = p[11]; + r->src_base_addr_low = p[12]; + r->src_n = p[13] & ((1u << 16) - 1); + r->dst_base_addr_high = (p[13] >> 16) & ((1u << 8) - 1); + r->src_base_addr_high = (p[13] >> 24) & ((1u << 8) - 1); + r->src_c_stride_high = p[14] & ((1u << 16) - 1); + r->dst_c_stride_high = (p[14] >> 16) & ((1u << 16) - 1); + r->compress_bias0 = p[15] & ((1u << 8) - 1); + r->compress_bias1 = (p[15] >> 8) & ((1u << 8) - 1); + r->layer_ID = (p[15] >> 16) & ((1u << 16) - 1); +} + +static inline void emit_tdma_reg(const tdma_reg_t *r, uint32_t *_p) +{ + volatile uint32_t *p = (typeof(p))_p; + p[15] = (r->compress_bias0 & ((1u << 8) - 1)) | + ((r->compress_bias1 & ((1u << 8) - 1)) << 8) | + ((r->layer_ID & ((1u << 16) - 1)) << 16); + p[14] = (r->src_c_stride_high & ((1u << 16) - 1)) | + ((r->dst_c_stride_high & ((1u << 16) - 1)) << 16); + p[13] = (r->src_n & ((1u << 16) - 1)) | + ((r->dst_base_addr_high & ((1u << 8) - 1)) << 16) | + ((r->src_base_addr_high & ((1u << 8) - 1)) << 24); + p[12] = (r->src_base_addr_low & (((uint64_t)1 << 32) - 1)); + p[11] = (r->dst_base_addr_low & (((uint64_t)1 << 32) - 1)); + p[10] = (r->src_w & ((1u << 16) - 1)) | + ((r->src_h & ((1u << 16) - 1)) << 16); + p[9] = (r->dst_w & ((1u << 16) - 1)) | + ((r->dst_h & ((1u << 16) - 1)) << 16); + p[8] = (r->dst_c & ((1u << 16) - 1)) | + ((r->src_c & ((1u << 16) - 1)) << 16); + p[7] = (r->src_n_stride & (((uint64_t)1 << 32) - 1)); + p[6] = (r->src_h_stride & ((1u << 16) - 1)) | + ((r->src_c_stride_low & ((1u << 16) - 1)) << 16); + p[5] = (r->dst_n_stride & (((uint64_t)1 << 32) - 1)); + p[4] = (r->dst_h_stride & ((1u << 16) - 1)) | + ((r->dst_c_stride_low & ((1u << 16) - 1)) << 16); + p[3] = (r->const_val & ((1u << 16) - 1)) | + ((r->src_base_reg_sel & ((1u << 3) - 1)) << 16) | + ((r->mv_lut_idx & 1) << 19) | + ((r->dst_base_reg_sel & ((1u << 3) - 1)) << 20) | + ((r->mv_lut_base & 1) << 23) | + ((r->rsv4_5 & ((1u << 8) - 1)) << 24); + p[2] = (r->wait_id_other_tdma & ((1u << 16) - 1)) | + ((r->wait_id_sdma & ((1u << 16) - 1)) << 16); + p[1] = (r->spec_func & ((1u << 3) - 1)) | + ((r->dst_fmt & ((1u << 2) - 1)) << 3) | + ((r->src_fmt & ((1u << 2) - 1)) << 5) | + ((r->cmprs_fmt & 1) << 7) | + ((r->sys_dtype & 1) << 8) | + ((r->rsv2_1 & ((1u << 4) - 1)) << 9) | + ((r->int8_sign & 1) << 13) | + ((r->compress_zero_guard & 1) << 14) | + ((r->int8_rnd_mode & 1) << 15) | + ((r->wait_id_tpu & ((1u << 16) - 1)) << 16); + p[0] = (r->vld & 1) | + ((r->compress_en & 1) << 1) | + 
((r->eod & 1) << 2) | + ((r->intp_en & 1) << 3) | + ((r->bar_en & 1) << 4) | + ((r->check_bf16_value & 1) << 5) | + ((r->trans_dir & ((1u << 2) - 1)) << 6) | + ((r->rsv00 & ((1u << 2) - 1)) << 8) | + ((r->trans_fmt & 1) << 10) | + ((r->transpose_md & ((1u << 2) - 1)) << 11) | + ((r->rsv01 & 1) << 13) | + ((r->intra_cmd_paral & 1) << 14) | + ((r->outstanding_en & 1) << 15) | + ((r->cmd_id & ((1u << 16) - 1)) << 16); +} + +static inline void reset_tdma_reg(tdma_reg_t *r) +{ + r->vld = 0x0; + r->compress_en = 0x0; + r->eod = 0x0; + r->intp_en = 0x0; + r->bar_en = 0x0; + r->check_bf16_value = 0x0; + r->trans_dir = 0x0; + r->rsv00 = 0x0; + r->trans_fmt = 0x0; + r->transpose_md = 0x0; + r->rsv01 = 0x0; + r->intra_cmd_paral = 0x0; + r->outstanding_en = 0x0; + r->cmd_id = 0x0; + r->spec_func = 0x0; + r->dst_fmt = 0x1; + r->src_fmt = 0x1; + r->cmprs_fmt = 0x0; + r->sys_dtype = 0x0; + r->rsv2_1 = 0x0; + r->int8_sign = 0x0; + r->compress_zero_guard = 0x0; + r->int8_rnd_mode = 0x0; + r->wait_id_tpu = 0x0; + r->wait_id_other_tdma = 0x0; + r->wait_id_sdma = 0x0; + r->const_val = 0x0; + r->src_base_reg_sel = 0x0; + r->mv_lut_idx = 0x0; + r->dst_base_reg_sel = 0x0; + r->mv_lut_base = 0x0; + r->rsv4_5 = 0x0; + r->dst_h_stride = 0x1; + r->dst_c_stride_low = 0x1; + r->dst_n_stride = 0x1; + r->src_h_stride = 0x1; + r->src_c_stride_low = 0x1; + r->src_n_stride = 0x1; + r->dst_c = 0x1; + r->src_c = 0x1; + r->dst_w = 0x1; + r->dst_h = 0x1; + r->src_w = 0x1; + r->src_h = 0x1; + r->dst_base_addr_low = 0x0; + r->src_base_addr_low = 0x0; + r->src_n = 0x1; + r->dst_base_addr_high = 0x0; + r->src_base_addr_high = 0x0; + r->src_c_stride_high = 0x0; + r->dst_c_stride_high = 0x0; + r->compress_bias0 = 0x0; + r->compress_bias1 = 0x0; + r->layer_ID = 0x0; +} + +static inline void trace_tdma_reg(tdma_reg_t *r, const char *tag) +{ +#define trace_one_reg(name) \ + printf(" %s: 0x%llx\n", #name, (ullong)r->name) + + printf("--- %s ---\n", tag); + trace_one_reg(vld); + trace_one_reg(compress_en); + trace_one_reg(eod); + trace_one_reg(intp_en); + trace_one_reg(bar_en); + trace_one_reg(check_bf16_value); + trace_one_reg(trans_dir); + trace_one_reg(rsv00); + trace_one_reg(trans_fmt); + trace_one_reg(transpose_md); + trace_one_reg(rsv01); + trace_one_reg(intra_cmd_paral); + trace_one_reg(outstanding_en); + trace_one_reg(cmd_id); + trace_one_reg(spec_func); + trace_one_reg(dst_fmt); + trace_one_reg(src_fmt); + trace_one_reg(cmprs_fmt); + trace_one_reg(sys_dtype); + trace_one_reg(rsv2_1); + trace_one_reg(int8_sign); + trace_one_reg(compress_zero_guard); + trace_one_reg(int8_rnd_mode); + trace_one_reg(wait_id_tpu); + trace_one_reg(wait_id_other_tdma); + trace_one_reg(wait_id_sdma); + trace_one_reg(const_val); + trace_one_reg(src_base_reg_sel); + trace_one_reg(mv_lut_idx); + trace_one_reg(dst_base_reg_sel); + trace_one_reg(mv_lut_base); + trace_one_reg(rsv4_5); + trace_one_reg(dst_h_stride); + trace_one_reg(dst_c_stride_low); + trace_one_reg(dst_n_stride); + trace_one_reg(src_h_stride); + trace_one_reg(src_c_stride_low); + trace_one_reg(src_n_stride); + trace_one_reg(dst_c); + trace_one_reg(src_c); + trace_one_reg(dst_w); + trace_one_reg(dst_h); + trace_one_reg(src_w); + trace_one_reg(src_h); + trace_one_reg(dst_base_addr_low); + trace_one_reg(src_base_addr_low); + trace_one_reg(src_n); + trace_one_reg(dst_base_addr_high); + trace_one_reg(src_base_addr_high); + trace_one_reg(src_c_stride_high); + trace_one_reg(dst_c_stride_high); + trace_one_reg(compress_bias0); + trace_one_reg(compress_bias1); + trace_one_reg(layer_ID); +} +#endif /* 
CV181X_TDMA_REG_H */ diff --git a/cvikernel/include/cvikernel/cv181x/cv181x_tiu_reg.h b/cvikernel/include/cvikernel/cv181x/cv181x_tiu_reg.h new file mode 100644 index 000000000..70d8a2b03 --- /dev/null +++ b/cvikernel/include/cvikernel/cv181x/cv181x_tiu_reg.h @@ -0,0 +1,622 @@ +#ifndef CV181X_TIU_REG_H +#define CV181X_TIU_REG_H + +/* + * This file is generated by tools. Do not edit it manually. + */ + +#include +#include + +#define TIU_DESC_REG_BYTES (0x70) +#define TIU_ENGINE_DESCRIPTOR_NUM (TIU_DESC_REG_BYTES >> 2) + +// TIU operation data type +#define DCR_TYPE_CONV_FIX8B 0 +#define DCR_TYPE_DEPTHWISE_POOL_FIX8B 1 +#define DCR_TYPE_FC_FIX8B 2 +#define DCR_TYPE_TENSOR_ARITH_FIX8B 3 +#define NR_DCR_TYPES 4 + +#define TENSOR_MUL_FIX8B 0 +#define TENSOR_MAC_FIX8B 1 +#define TENSOR_ADD_FIX8B 2 +#define TENSOR_SUB_FIX8B 3 +#define TENSOR_MAX_FIX8B 4 +#define TENSOR_MIN_FIX8B 5 +#define TENSOR_SHIFT_FIX8B 6 +#define TENSOR_AND_FIX8B 7 +#define TENSOR_OR_FIX8B 8 +#define TENSOR_XOR_FIX8B 9 +#define TENSOR_COPY_FIX8B 10 +#define TENSOR_GE_FIX8B 11 + +typedef unsigned long long ullong; + +typedef struct { + uint32_t cmd_en; + uint32_t cmd_end; + uint32_t cmd_id_en; + uint32_t cmd_keep; + uint32_t cmd_intr_en; + uint32_t tsk_typ; + uint32_t tsk_eu_typ; + uint32_t tsk_opd_num; + uint32_t opt_res_shift; + uint32_t opt_left_shift; + uint32_t opt_shift_typ; + uint32_t opt_rshift_typ; + uint32_t dummy1; + uint32_t opd_typ; + uint32_t opt_chl_quan; + uint32_t cmd_id_tpu; + uint32_t cmd_id_gdma; + uint32_t quan_m; + uint32_t opt_res0_sign; + uint32_t opt_opd0_sign; + uint32_t opt_opd1_sign; + uint32_t opt_opd2_sign; + uint32_t opt_res0_seg; + uint32_t opt_opd0_seg; + uint32_t opt_opd1_seg; + uint32_t opt_opd2_seg; + uint32_t ps32_md; + uint32_t double_conv; + uint32_t opt_left_tran; + uint32_t fp_round_typ; + uint32_t opt_relu_typ; + uint32_t opt_relu_value; + uint32_t cmd_pre_exe_typ; + uint32_t opt_res_add; + uint32_t rsvd0; + uint32_t conv_opd0_x_ins0; + uint32_t conv_opd0_y_ins0; + uint32_t conv_opd0_x_ins0_last; + uint32_t conv_opd0_y_ins0_last; + uint32_t conv_opd1_x_ins0; + uint32_t conv_opd1_y_ins0; + uint32_t dummy0; + uint32_t opd0_ins_val; + uint32_t conv_opd0_up_pad; + uint32_t conv_opd0_dn_pad; + uint32_t conv_opd0_lf_pad; + uint32_t conv_opd0_rt_pad; + uint32_t res0_n; + uint32_t res0_c; + uint32_t res0_h; + uint32_t res0_w; + uint32_t conv_op_x_str; + uint32_t conv_op_y_str; + uint32_t cmd_pre_exe; + uint32_t rsvd1; + uint32_t res0_addr; + uint32_t opd0_addr; + uint32_t opd1_addr; + uint32_t opd2_addr; + uint32_t opt_opd0_const; + uint32_t opt_opd1_const; + uint32_t opt_opd2_const; + uint32_t short_nchwstr_same; + uint32_t short_res0_str; + uint32_t short_opd0_str; + uint32_t short_opd1_str; + uint32_t short_opd2_str; + uint32_t dummy2; + uint32_t opd0_n; + uint32_t opd0_c; + uint32_t dummy3; + uint32_t rsvd2; + uint32_t opd0_h; + uint32_t opd0_w; + uint32_t opd1_n; + uint32_t opd1_c; + uint32_t opd1_h; + uint32_t opd1_w; + uint32_t opd2_n; + uint32_t opd2_c; + uint32_t opd2_h; + uint32_t opd2_w; + uint32_t dummy4; + uint32_t rsvd3; + uint32_t layer_info; + uint32_t res0_n_str; + uint32_t res0_c_str; + uint32_t res0_h_str; + uint32_t res0_w_str; + uint32_t res0_b_str; + uint32_t opd0_n_str; + uint32_t dummy5; + uint32_t rsvd4; + uint32_t opd0_c_str; + uint32_t opd0_h_str; + uint32_t opd0_w_str; + uint32_t opd0_b_str; + uint32_t opd1_n_str; + uint32_t opd1_c_str; + uint32_t opd1_h_str; + uint32_t dummy6; + uint32_t rsvd5; + uint32_t opd1_w_str; + uint32_t opd1_b_str; + uint32_t opd2_n_str; 
+ uint32_t opd2_c_str; + uint32_t opd2_h_str; + uint32_t opd2_w_str; + uint32_t opd2_b_str; + uint32_t dummy7; + uint32_t rsvd6; +} tiu_reg_t; + +static inline void parse_tiu_reg(tiu_reg_t *r, const uint32_t *p) +{ + r->cmd_en = p[0] & 1; + r->cmd_end = (p[0] >> 1) & 1; + r->cmd_id_en = (p[0] >> 2) & 1; + r->cmd_keep = (p[0] >> 3) & 1; + r->cmd_intr_en = (p[0] >> 4) & 1; + r->tsk_typ = (p[0] >> 5) & ((1u << 4) - 1); + r->tsk_eu_typ = (p[0] >> 9) & ((1u << 5) - 1); + r->tsk_opd_num = (p[0] >> 14) & ((1u << 2) - 1); + r->opt_res_shift = (p[0] >> 16) & ((1u << 6) - 1); + r->opt_left_shift = (p[0] >> 22) & ((1u << 5) - 1); + r->opt_shift_typ = (p[0] >> 27) & 1; + r->opt_rshift_typ = (p[0] >> 28) & 1; + r->dummy1 = (p[0] >> 29) & 1; + r->opd_typ = (p[0] >> 30) & 1; + r->opt_chl_quan = (p[0] >> 31) & 1; + r->cmd_id_tpu = p[1] & ((1u << 16) - 1); + r->cmd_id_gdma = (p[1] >> 16) & ((1u << 16) - 1); + r->quan_m = p[2]; + r->opt_res0_sign = p[3] & 1; + r->opt_opd0_sign = (p[3] >> 1) & 1; + r->opt_opd1_sign = (p[3] >> 2) & 1; + r->opt_opd2_sign = (p[3] >> 3) & 1; + r->opt_res0_seg = (p[3] >> 4) & ((1u << 2) - 1); + r->opt_opd0_seg = (p[3] >> 6) & ((1u << 2) - 1); + r->opt_opd1_seg = (p[3] >> 8) & ((1u << 2) - 1); + r->opt_opd2_seg = (p[3] >> 10) & 1; + r->ps32_md = (p[3] >> 11) & ((1u << 2) - 1); + r->double_conv = (p[3] >> 13) & 1; + r->opt_left_tran = (p[3] >> 14) & 1; + r->fp_round_typ = (p[3] >> 15) & 1; + r->opt_relu_typ = (p[3] >> 16) & ((1u << 2) - 1); + r->opt_relu_value = (p[3] >> 18) & ((1u << 8) - 1); + r->cmd_pre_exe_typ = (p[3] >> 26) & 1; + r->opt_res_add = (p[3] >> 27) & 1; + r->rsvd0 = (p[3] >> 28) & ((1u << 4) - 1); + r->conv_opd0_x_ins0 = p[4] & ((1u << 4) - 1); + r->conv_opd0_y_ins0 = (p[4] >> 4) & ((1u << 4) - 1); + r->conv_opd0_x_ins0_last = (p[4] >> 8) & ((1u << 4) - 1); + r->conv_opd0_y_ins0_last = (p[4] >> 12) & ((1u << 4) - 1); + r->conv_opd1_x_ins0 = (p[4] >> 16) & ((1u << 4) - 1); + r->conv_opd1_y_ins0 = (p[4] >> 20) & ((1u << 4) - 1); + r->dummy0 = (p[4] >> 24) & ((1u << 8) - 1); + r->opd0_ins_val = p[5] & ((1u << 16) - 1); + r->conv_opd0_up_pad = (p[5] >> 16) & ((1u << 4) - 1); + r->conv_opd0_dn_pad = (p[5] >> 20) & ((1u << 4) - 1); + r->conv_opd0_lf_pad = (p[5] >> 24) & ((1u << 4) - 1); + r->conv_opd0_rt_pad = (p[5] >> 28) & ((1u << 4) - 1); + r->res0_n = p[6] & ((1u << 12) - 1); + r->res0_c = (p[6] >> 12) & ((1u << 12) - 1); + r->res0_h = (p[6] >> 24) & ((1u << 8) - 1); + r->res0_h |= (uint64_t)(p[7] & ((1u << 4) - 1)) << 8; + r->res0_w = (p[7] >> 4) & ((1u << 12) - 1); + r->conv_op_x_str = (p[7] >> 16) & ((1u << 5) - 1); + r->conv_op_y_str = (p[7] >> 21) & ((1u << 5) - 1); + r->cmd_pre_exe = (p[7] >> 26) & ((1u << 2) - 1); + r->rsvd1 = (p[7] >> 28) & ((1u << 4) - 1); + r->res0_addr = p[8] & ((1u << 24) - 1); + r->opd0_addr = (p[8] >> 24) & ((1u << 8) - 1); + r->opd0_addr |= (uint64_t)(p[9] & ((1u << 16) - 1)) << 8; + r->opd1_addr = (p[9] >> 16) & ((1u << 16) - 1); + r->opd2_addr = p[10] & ((1u << 16) - 1); + r->opt_opd0_const = (p[10] >> 16) & 1; + r->opt_opd1_const = (p[10] >> 17) & 1; + r->opt_opd2_const = (p[10] >> 18) & 1; + r->short_nchwstr_same = (p[10] >> 19) & 1; + r->short_res0_str = (p[10] >> 20) & ((1u << 2) - 1); + r->short_opd0_str = (p[10] >> 22) & ((1u << 2) - 1); + r->short_opd1_str = (p[10] >> 24) & ((1u << 2) - 1); + r->short_opd2_str = (p[10] >> 26) & ((1u << 2) - 1); + r->dummy2 = (p[10] >> 28) & ((1u << 4) - 1); + r->opd0_n = p[11] & ((1u << 12) - 1); + r->opd0_c = (p[11] >> 12) & ((1u << 12) - 1); + r->dummy3 = (p[11] >> 24) & ((1u << 4) - 1); + 
r->rsvd2 = (p[11] >> 28) & ((1u << 4) - 1); + r->opd0_h = p[12] & ((1u << 12) - 1); + r->opd0_w = (p[12] >> 12) & ((1u << 12) - 1); + r->opd1_n = (p[12] >> 24) & ((1u << 8) - 1); + r->opd1_n |= (uint64_t)(p[13] & ((1u << 4) - 1)) << 8; + r->opd1_c = (p[13] >> 4) & ((1u << 12) - 1); + r->opd1_h = (p[13] >> 16) & ((1u << 12) - 1); + r->opd1_w = (p[13] >> 28) & ((1u << 4) - 1); + r->opd1_w |= (uint64_t)(p[14] & ((1u << 8) - 1)) << 4; + r->opd2_n = (p[14] >> 8) & ((1u << 12) - 1); + r->opd2_c = (p[14] >> 20) & ((1u << 12) - 1); + r->opd2_h = p[15] & ((1u << 12) - 1); + r->opd2_w = (p[15] >> 12) & ((1u << 12) - 1); + r->dummy4 = (p[15] >> 24) & ((1u << 4) - 1); + r->rsvd3 = (p[15] >> 28) & ((1u << 4) - 1); + r->layer_info = p[16] & ((1u << 16) - 1); + r->res0_n_str = (p[16] >> 16) & ((1u << 16) - 1); + r->res0_c_str = p[17] & ((1u << 16) - 1); + r->res0_h_str = (p[17] >> 16) & ((1u << 16) - 1); + r->res0_w_str = p[18] & ((1u << 16) - 1); + r->res0_b_str = (p[18] >> 16) & ((1u << 16) - 1); + r->opd0_n_str = p[19] & ((1u << 16) - 1); + r->dummy5 = (p[19] >> 16) & ((1u << 12) - 1); + r->rsvd4 = (p[19] >> 28) & ((1u << 4) - 1); + r->opd0_c_str = p[20] & ((1u << 16) - 1); + r->opd0_h_str = (p[20] >> 16) & ((1u << 16) - 1); + r->opd0_w_str = p[21] & ((1u << 16) - 1); + r->opd0_b_str = (p[21] >> 16) & ((1u << 16) - 1); + r->opd1_n_str = p[22] & ((1u << 16) - 1); + r->opd1_c_str = (p[22] >> 16) & ((1u << 16) - 1); + r->opd1_h_str = p[23] & ((1u << 16) - 1); + r->dummy6 = (p[23] >> 16) & ((1u << 12) - 1); + r->rsvd5 = (p[23] >> 28) & ((1u << 4) - 1); + r->opd1_w_str = p[24] & ((1u << 16) - 1); + r->opd1_b_str = (p[24] >> 16) & ((1u << 16) - 1); + r->opd2_n_str = p[25] & ((1u << 16) - 1); + r->opd2_c_str = (p[25] >> 16) & ((1u << 16) - 1); + r->opd2_h_str = p[26] & ((1u << 16) - 1); + r->opd2_w_str = (p[26] >> 16) & ((1u << 16) - 1); + r->opd2_b_str = p[27] & ((1u << 16) - 1); + r->dummy7 = (p[27] >> 16) & ((1u << 12) - 1); + r->rsvd6 = (p[27] >> 28) & ((1u << 4) - 1); +} + +static inline void emit_tiu_reg(const tiu_reg_t *r, uint32_t *_p) +{ + volatile uint32_t *p = (typeof(p))_p; + p[27] = (r->opd2_b_str & ((1u << 16) - 1)) | + ((r->dummy7 & ((1u << 12) - 1)) << 16) | + ((r->rsvd6 & ((1u << 4) - 1)) << 28); + p[26] = (r->opd2_h_str & ((1u << 16) - 1)) | + ((r->opd2_w_str & ((1u << 16) - 1)) << 16); + p[25] = (r->opd2_n_str & ((1u << 16) - 1)) | + ((r->opd2_c_str & ((1u << 16) - 1)) << 16); + p[24] = (r->opd1_w_str & ((1u << 16) - 1)) | + ((r->opd1_b_str & ((1u << 16) - 1)) << 16); + p[23] = (r->opd1_h_str & ((1u << 16) - 1)) | + ((r->dummy6 & ((1u << 12) - 1)) << 16) | + ((r->rsvd5 & ((1u << 4) - 1)) << 28); + p[22] = (r->opd1_n_str & ((1u << 16) - 1)) | + ((r->opd1_c_str & ((1u << 16) - 1)) << 16); + p[21] = (r->opd0_w_str & ((1u << 16) - 1)) | + ((r->opd0_b_str & ((1u << 16) - 1)) << 16); + p[20] = (r->opd0_c_str & ((1u << 16) - 1)) | + ((r->opd0_h_str & ((1u << 16) - 1)) << 16); + p[19] = (r->opd0_n_str & ((1u << 16) - 1)) | + ((r->dummy5 & ((1u << 12) - 1)) << 16) | + ((r->rsvd4 & ((1u << 4) - 1)) << 28); + p[18] = (r->res0_w_str & ((1u << 16) - 1)) | + ((r->res0_b_str & ((1u << 16) - 1)) << 16); + p[17] = (r->res0_c_str & ((1u << 16) - 1)) | + ((r->res0_h_str & ((1u << 16) - 1)) << 16); + p[16] = (r->layer_info & ((1u << 16) - 1)) | + ((r->res0_n_str & ((1u << 16) - 1)) << 16); + p[15] = (r->opd2_h & ((1u << 12) - 1)) | + ((r->opd2_w & ((1u << 12) - 1)) << 12) | + ((r->dummy4 & ((1u << 4) - 1)) << 24) | + ((r->rsvd3 & ((1u << 4) - 1)) << 28); + p[14] = ((r->opd1_w >> 4) & ((1u << 8) - 1)) | + 
((r->opd2_n & ((1u << 12) - 1)) << 8) | + ((r->opd2_c & ((1u << 12) - 1)) << 20); + p[13] = ((r->opd1_n >> 8) & ((1u << 4) - 1)) | + ((r->opd1_c & ((1u << 12) - 1)) << 4) | + ((r->opd1_h & ((1u << 12) - 1)) << 16) | + ((r->opd1_w & ((1u << 4) - 1)) << 28); + p[12] = (r->opd0_h & ((1u << 12) - 1)) | + ((r->opd0_w & ((1u << 12) - 1)) << 12) | + ((r->opd1_n & ((1u << 8) - 1)) << 24); + p[11] = (r->opd0_n & ((1u << 12) - 1)) | + ((r->opd0_c & ((1u << 12) - 1)) << 12) | + ((r->dummy3 & ((1u << 4) - 1)) << 24) | + ((r->rsvd2 & ((1u << 4) - 1)) << 28); + p[10] = (r->opd2_addr & ((1u << 16) - 1)) | + ((r->opt_opd0_const & 1) << 16) | + ((r->opt_opd1_const & 1) << 17) | + ((r->opt_opd2_const & 1) << 18) | + ((r->short_nchwstr_same & 1) << 19) | + ((r->short_res0_str & ((1u << 2) - 1)) << 20) | + ((r->short_opd0_str & ((1u << 2) - 1)) << 22) | + ((r->short_opd1_str & ((1u << 2) - 1)) << 24) | + ((r->short_opd2_str & ((1u << 2) - 1)) << 26) | + ((r->dummy2 & ((1u << 4) - 1)) << 28); + p[9] = ((r->opd0_addr >> 8) & ((1u << 16) - 1)) | + ((r->opd1_addr & ((1u << 16) - 1)) << 16); + p[8] = (r->res0_addr & ((1u << 24) - 1)) | + ((r->opd0_addr & ((1u << 8) - 1)) << 24); + p[7] = ((r->res0_h >> 8) & ((1u << 4) - 1)) | + ((r->res0_w & ((1u << 12) - 1)) << 4) | + ((r->conv_op_x_str & ((1u << 5) - 1)) << 16) | + ((r->conv_op_y_str & ((1u << 5) - 1)) << 21) | + ((r->cmd_pre_exe & ((1u << 2) - 1)) << 26) | + ((r->rsvd1 & ((1u << 4) - 1)) << 28); + p[6] = (r->res0_n & ((1u << 12) - 1)) | + ((r->res0_c & ((1u << 12) - 1)) << 12) | + ((r->res0_h & ((1u << 8) - 1)) << 24); + p[5] = (r->opd0_ins_val & ((1u << 16) - 1)) | + ((r->conv_opd0_up_pad & ((1u << 4) - 1)) << 16) | + ((r->conv_opd0_dn_pad & ((1u << 4) - 1)) << 20) | + ((r->conv_opd0_lf_pad & ((1u << 4) - 1)) << 24) | + ((r->conv_opd0_rt_pad & ((1u << 4) - 1)) << 28); + p[4] = (r->conv_opd0_x_ins0 & ((1u << 4) - 1)) | + ((r->conv_opd0_y_ins0 & ((1u << 4) - 1)) << 4) | + ((r->conv_opd0_x_ins0_last & ((1u << 4) - 1)) << 8) | + ((r->conv_opd0_y_ins0_last & ((1u << 4) - 1)) << 12) | + ((r->conv_opd1_x_ins0 & ((1u << 4) - 1)) << 16) | + ((r->conv_opd1_y_ins0 & ((1u << 4) - 1)) << 20) | + ((r->dummy0 & ((1u << 8) - 1)) << 24); + p[3] = (r->opt_res0_sign & 1) | + ((r->opt_opd0_sign & 1) << 1) | + ((r->opt_opd1_sign & 1) << 2) | + ((r->opt_opd2_sign & 1) << 3) | + ((r->opt_res0_seg & ((1u << 2) - 1)) << 4) | + ((r->opt_opd0_seg & ((1u << 2) - 1)) << 6) | + ((r->opt_opd1_seg & ((1u << 2) - 1)) << 8) | + ((r->opt_opd2_seg & 1) << 10) | + ((r->ps32_md & ((1u << 2) - 1)) << 11) | + ((r->double_conv & 1) << 13) | + ((r->opt_left_tran & 1) << 14) | + ((r->fp_round_typ & 1) << 15) | + ((r->opt_relu_typ & ((1u << 2) - 1)) << 16) | + ((r->opt_relu_value & ((1u << 8) - 1)) << 18) | + ((r->cmd_pre_exe_typ & 1) << 26) | + ((r->opt_res_add & 1) << 27) | + ((r->rsvd0 & ((1u << 4) - 1)) << 28); + p[2] = (r->quan_m & (((uint64_t)1 << 32) - 1)); + p[1] = (r->cmd_id_tpu & ((1u << 16) - 1)) | + ((r->cmd_id_gdma & ((1u << 16) - 1)) << 16); + p[0] = (r->cmd_en & 1) | + ((r->cmd_end & 1) << 1) | + ((r->cmd_id_en & 1) << 2) | + ((r->cmd_keep & 1) << 3) | + ((r->cmd_intr_en & 1) << 4) | + ((r->tsk_typ & ((1u << 4) - 1)) << 5) | + ((r->tsk_eu_typ & ((1u << 5) - 1)) << 9) | + ((r->tsk_opd_num & ((1u << 2) - 1)) << 14) | + ((r->opt_res_shift & ((1u << 6) - 1)) << 16) | + ((r->opt_left_shift & ((1u << 5) - 1)) << 22) | + ((r->opt_shift_typ & 1) << 27) | + ((r->opt_rshift_typ & 1) << 28) | + ((r->dummy1 & 1) << 29) | + ((r->opd_typ & 1) << 30) | + ((r->opt_chl_quan & 1) << 31); +} + +static 
inline void reset_tiu_reg(tiu_reg_t *r) +{ + r->cmd_en = 0x0; + r->cmd_end = 0x0; + r->cmd_id_en = 0x0; + r->cmd_keep = 0x0; + r->cmd_intr_en = 0x0; + r->tsk_typ = 0x0; + r->tsk_eu_typ = 0x0; + r->tsk_opd_num = 0x3; + r->opt_res_shift = 0xa; + r->opt_left_shift = 0x2; + r->opt_shift_typ = 0x1; + r->opt_rshift_typ = 0x1; + r->dummy1 = 0x0; + r->opd_typ = 0x0; + r->opt_chl_quan = 0x0; + r->cmd_id_tpu = 0x0; + r->cmd_id_gdma = 0x0; + r->quan_m = 0x0; + r->opt_res0_sign = 0x0; + r->opt_opd0_sign = 0x0; + r->opt_opd1_sign = 0x1; + r->opt_opd2_sign = 0x1; + r->opt_res0_seg = 0x1; + r->opt_opd0_seg = 0x1; + r->opt_opd1_seg = 0x1; + r->opt_opd2_seg = 0x0; + r->ps32_md = 0x0; + r->double_conv = 0x0; + r->opt_left_tran = 0x0; + r->fp_round_typ = 0x0; + r->opt_relu_typ = 0x0; + r->opt_relu_value = 0x0; + r->cmd_pre_exe_typ = 0x0; + r->opt_res_add = 0x0; + r->rsvd0 = 0x0; + r->conv_opd0_x_ins0 = 0x0; + r->conv_opd0_y_ins0 = 0x0; + r->conv_opd0_x_ins0_last = 0x0; + r->conv_opd0_y_ins0_last = 0x0; + r->conv_opd1_x_ins0 = 0x0; + r->conv_opd1_y_ins0 = 0x0; + r->dummy0 = 0x0; + r->opd0_ins_val = 0x0; + r->conv_opd0_up_pad = 0x0; + r->conv_opd0_dn_pad = 0x0; + r->conv_opd0_lf_pad = 0x0; + r->conv_opd0_rt_pad = 0x0; + r->res0_n = 0x1; + r->res0_c = 0x1; + r->res0_h = 0x1; + r->res0_w = 0x10; + r->conv_op_x_str = 0x1; + r->conv_op_y_str = 0x1; + r->cmd_pre_exe = 0x0; + r->rsvd1 = 0x1; + r->res0_addr = 0x0; + r->opd0_addr = 0x0; + r->opd1_addr = 0x0; + r->opd2_addr = 0x0; + r->opt_opd0_const = 0x0; + r->opt_opd1_const = 0x0; + r->opt_opd2_const = 0x0; + r->short_nchwstr_same = 0x0; + r->short_res0_str = 0x0; + r->short_opd0_str = 0x0; + r->short_opd1_str = 0x0; + r->short_opd2_str = 0x0; + r->dummy2 = 0x0; + r->opd0_n = 0x1; + r->opd0_c = 0x1; + r->dummy3 = 0x0; + r->rsvd2 = 0x2; + r->opd0_h = 0x1; + r->opd0_w = 0x10; + r->opd1_n = 0x1; + r->opd1_c = 0x1; + r->opd1_h = 0x1; + r->opd1_w = 0x10; + r->opd2_n = 0x1; + r->opd2_c = 0x1; + r->opd2_h = 0x1; + r->opd2_w = 0x10; + r->dummy4 = 0x0; + r->rsvd3 = 0x3; + r->layer_info = 0x0; + r->res0_n_str = 0x10; + r->res0_c_str = 0x10; + r->res0_h_str = 0x0; + r->res0_w_str = 0x1; + r->res0_b_str = 0x10; + r->opd0_n_str = 0x10; + r->dummy5 = 0x0; + r->rsvd4 = 0x4; + r->opd0_c_str = 0x10; + r->opd0_h_str = 0x0; + r->opd0_w_str = 0x1; + r->opd0_b_str = 0x10; + r->opd1_n_str = 0x10; + r->opd1_c_str = 0x10; + r->opd1_h_str = 0x0; + r->dummy6 = 0x0; + r->rsvd5 = 0x5; + r->opd1_w_str = 0x1; + r->opd1_b_str = 0x10; + r->opd2_n_str = 0x10; + r->opd2_c_str = 0x10; + r->opd2_h_str = 0x0; + r->opd2_w_str = 0x1; + r->opd2_b_str = 0x10; + r->dummy7 = 0x0; + r->rsvd6 = 0x6; +} + +static inline void trace_tiu_reg(tiu_reg_t *r, const char *tag) +{ +#define trace_one_reg(name) \ + printf(" %s: 0x%llx\n", #name, (ullong)r->name) + + printf("--- %s ---\n", tag); + trace_one_reg(cmd_en); + trace_one_reg(cmd_end); + trace_one_reg(cmd_id_en); + trace_one_reg(cmd_keep); + trace_one_reg(cmd_intr_en); + trace_one_reg(tsk_typ); + trace_one_reg(tsk_eu_typ); + trace_one_reg(tsk_opd_num); + trace_one_reg(opt_res_shift); + trace_one_reg(opt_left_shift); + trace_one_reg(opt_shift_typ); + trace_one_reg(opt_rshift_typ); + trace_one_reg(dummy1); + trace_one_reg(opd_typ); + trace_one_reg(opt_chl_quan); + trace_one_reg(cmd_id_tpu); + trace_one_reg(cmd_id_gdma); + trace_one_reg(quan_m); + trace_one_reg(opt_res0_sign); + trace_one_reg(opt_opd0_sign); + trace_one_reg(opt_opd1_sign); + trace_one_reg(opt_opd2_sign); + trace_one_reg(opt_res0_seg); + trace_one_reg(opt_opd0_seg); + trace_one_reg(opt_opd1_seg); + 
trace_one_reg(opt_opd2_seg); + trace_one_reg(ps32_md); + trace_one_reg(double_conv); + trace_one_reg(opt_left_tran); + trace_one_reg(fp_round_typ); + trace_one_reg(opt_relu_typ); + trace_one_reg(opt_relu_value); + trace_one_reg(cmd_pre_exe_typ); + trace_one_reg(opt_res_add); + trace_one_reg(rsvd0); + trace_one_reg(conv_opd0_x_ins0); + trace_one_reg(conv_opd0_y_ins0); + trace_one_reg(conv_opd0_x_ins0_last); + trace_one_reg(conv_opd0_y_ins0_last); + trace_one_reg(conv_opd1_x_ins0); + trace_one_reg(conv_opd1_y_ins0); + trace_one_reg(dummy0); + trace_one_reg(opd0_ins_val); + trace_one_reg(conv_opd0_up_pad); + trace_one_reg(conv_opd0_dn_pad); + trace_one_reg(conv_opd0_lf_pad); + trace_one_reg(conv_opd0_rt_pad); + trace_one_reg(res0_n); + trace_one_reg(res0_c); + trace_one_reg(res0_h); + trace_one_reg(res0_w); + trace_one_reg(conv_op_x_str); + trace_one_reg(conv_op_y_str); + trace_one_reg(cmd_pre_exe); + trace_one_reg(rsvd1); + trace_one_reg(res0_addr); + trace_one_reg(opd0_addr); + trace_one_reg(opd1_addr); + trace_one_reg(opd2_addr); + trace_one_reg(opt_opd0_const); + trace_one_reg(opt_opd1_const); + trace_one_reg(opt_opd2_const); + trace_one_reg(short_nchwstr_same); + trace_one_reg(short_res0_str); + trace_one_reg(short_opd0_str); + trace_one_reg(short_opd1_str); + trace_one_reg(short_opd2_str); + trace_one_reg(dummy2); + trace_one_reg(opd0_n); + trace_one_reg(opd0_c); + trace_one_reg(dummy3); + trace_one_reg(rsvd2); + trace_one_reg(opd0_h); + trace_one_reg(opd0_w); + trace_one_reg(opd1_n); + trace_one_reg(opd1_c); + trace_one_reg(opd1_h); + trace_one_reg(opd1_w); + trace_one_reg(opd2_n); + trace_one_reg(opd2_c); + trace_one_reg(opd2_h); + trace_one_reg(opd2_w); + trace_one_reg(dummy4); + trace_one_reg(rsvd3); + trace_one_reg(layer_info); + trace_one_reg(res0_n_str); + trace_one_reg(res0_c_str); + trace_one_reg(res0_h_str); + trace_one_reg(res0_w_str); + trace_one_reg(res0_b_str); + trace_one_reg(opd0_n_str); + trace_one_reg(dummy5); + trace_one_reg(rsvd4); + trace_one_reg(opd0_c_str); + trace_one_reg(opd0_h_str); + trace_one_reg(opd0_w_str); + trace_one_reg(opd0_b_str); + trace_one_reg(opd1_n_str); + trace_one_reg(opd1_c_str); + trace_one_reg(opd1_h_str); + trace_one_reg(dummy6); + trace_one_reg(rsvd5); + trace_one_reg(opd1_w_str); + trace_one_reg(opd1_b_str); + trace_one_reg(opd2_n_str); + trace_one_reg(opd2_c_str); + trace_one_reg(opd2_h_str); + trace_one_reg(opd2_w_str); + trace_one_reg(opd2_b_str); + trace_one_reg(dummy7); + trace_one_reg(rsvd6); +} +#endif /* CV181X_TIU_REG_H */ diff --git a/cvikernel/include/cvikernel/cv181x/cv181x_tpu_cfg.h b/cvikernel/include/cvikernel/cv181x/cv181x_tpu_cfg.h new file mode 100644 index 000000000..df348335b --- /dev/null +++ b/cvikernel/include/cvikernel/cv181x/cv181x_tpu_cfg.h @@ -0,0 +1,38 @@ +#ifndef __CV181X_TPU_CFG__ +#define __CV181X_TPU_CFG__ + +#define CV181X_VER 182202 +#define CV181X_HW_NPU_SHIFT 3 +#define CV181X_HW_EU_SHIFT 4 +#define CV181X_HW_LMEM_SHIFT 15 +#define CV181X_HW_LMEM_BANKS 8 +#define CV181X_HW_LMEM_BANK_SIZE 0x1000 +#define CV181X_HW_NODE_CHIP_SHIFT 0 +#define CV181X_HW_NPU_NUM (1 << CV181X_HW_NPU_SHIFT) +#define CV181X_HW_EU_NUM (1 << CV181X_HW_EU_SHIFT) +#define CV181X_HW_LMEM_SIZE (1 << CV181X_HW_LMEM_SHIFT) +#define CV181X_HW_LMEM_START_ADDR 0x0C000000 +#define CV181X_HW_NODE_CHIP_NUM (1 << CV181X_HW_NODE_CHIP_SHIFT) + +#if (CV181X_HW_LMEM_SIZE != (CV181X_HW_LMEM_BANK_SIZE * CV181X_HW_LMEM_BANKS)) +#error "Set wrong TPU configuration." 
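+/*
+ * Note (derived from the defines above, added for clarity): the defaults
+ * pass this check (8 * 0x1000 = 0x8000 = 1 << 15) and describe 8 NPUs
+ * (1 << CV181X_HW_NPU_SHIFT), 16 EUs (1 << CV181X_HW_EU_SHIFT), and a
+ * 32 KiB local memory arranged as eight 4 KiB banks.
+ */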
+#endif
+
+#define CV181X_GLOBAL_MEM_START_ADDR 0x0
+#define CV181X_GLOBAL_MEM_SIZE 0x100000000 //
+
+#define CV181X_GLOBAL_TIU_CMDBUF_ADDR 0x00000000
+#define CV181X_GLOBAL_TDMA_CMDBUF_ADDR 0x00800000
+#define CV181X_GLOBAL_TIU_CMDBUF_RESERVED_SIZE 0x00800000 // 8MB
+#define CV181X_GLOBAL_TDMA_CMDBUF_RESERVED_SIZE 0x00800000 // 8MB
+#define CV181X_GLOBAL_POOL_RESERVED_SIZE (CV181X_GLOBAL_MEM_SIZE - CV181X_GLOBAL_TIU_CMDBUF_RESERVED_SIZE - CV181X_GLOBAL_TDMA_CMDBUF_RESERVED_SIZE)
+
+#define CV181X_UART_CTLR_BASE_ADDR 0x04140000
+
+#define CV181X_TDMA_ENGINE_BASE_ADDR 0x0C100000
+#define CV181X_TDMA_ENGINE_END_ADDR (CV181X_TDMA_ENGINE_BASE_ADDR + 0x1000)
+
+#define CV181X_TIU_ENGINE_BASE_ADDR 0x0C101000 //"NPS Register" in memory map?
+#define CV181X_TIU_ENGINE_END_ADDR (CV181X_TIU_ENGINE_BASE_ADDR + 0x1000)
+
+#endif
diff --git a/cvikernel/include/cvikernel/cvikernel.h b/cvikernel/include/cvikernel/cvikernel.h
new file mode 100644
index 000000000..7bdb950cb
--- /dev/null
+++ b/cvikernel/include/cvikernel/cvikernel.h
@@ -0,0 +1,1171 @@
+#ifndef CVIKERNEL_H
+#define CVIKERNEL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+/*
+ * Type Definition
+ */
+typedef enum CVIKERNEL_FMT_E {
+  CVK_FMT_F32 = 0,
+  CVK_FMT_F16,
+  CVK_FMT_I32,
+  CVK_FMT_I16,
+  CVK_FMT_I8,
+  CVK_FMT_I4,
+  CVK_FMT_I2,
+  CVK_FMT_I1,
+  CVK_FMT_U32,
+  CVK_FMT_U16,
+  CVK_FMT_U8,
+  CVK_FMT_BF16,
+  CVK_FMT_INVALID
+} cvk_fmt_t;
+
+
+/*
+ * CVI TPU Chip Name Definition
+ */
+#define CVI_TPU_VERSION_183X "cv183x"
+#define CVI_TPU_VERSION_182X "cv182x"
+#define CVI_TPU_VERSION_181X "cv181x"
+#define CVI_TPU_VERSION_180X "cv180x"
+
+/*
+ * System information
+ */
+typedef enum CVIKERNEL_HW_FEATURE_E {
+  CVK_HWF_NONE = 0,
+  CVK_HWF_FC_OP1_CONST = 1, // FC op1 const
+  CVK_HWF_8B_ADD_SUB = 1 << 1, // 8b add/sub
+  CVK_HWF_MIN_POOL = 1 << 2, // Min pooling
+  CVK_HWF_M_BRADCAST = 1 << 3, // Multi broadcast
+  CVK_HWF_QM_LSHIFT = 1 << 4, // Left shift of quan_m op
+  CVK_HWF_GE = 1 << 5, // Greater than or equal to
+  CVK_HWF_CMD_PRE_EXE = 1 << 6 // Command pre-execute
+} cvk_hw_feature_t;
+
+
+typedef struct cvikernel_chip_info {
+  uint32_t version;
+  uint32_t node_num;
+  uint32_t node_shift;
+  uint32_t npu_num;
+  uint32_t npu_shift;
+  uint32_t eu_num;
+  uint32_t eu_shift;
+  uint32_t lmem_size;
+  uint32_t lmem_shift;
+  uint32_t lmem_banks;
+  uint32_t lmem_bank_size;
+  uint64_t lmem_start;
+  uint64_t gmem_start;
+  uint64_t gmem_size;
+  uint64_t features;
+} cvk_chip_info_t;
+
+/*
+ * Fundamental structures for tensor and matrix
+ */
+typedef struct cvikernel_matrix_lmem_shape {
+  uint32_t n, c, w, col;
+} cvk_ml_shape_t;
+
+typedef struct cvikernel_matrix_gmem_shape {
+  uint32_t row, col;
+} cvk_mg_shape_t;
+
+typedef struct cvikernel_matrix_lmem_stride {
+  uint32_t n, c, h;
+} cvk_ml_stride_t;
+
+typedef struct cvikernel_matrix_tgmem_stride {
+  uint32_t row;
+} cvk_mg_stride_t;
+
+typedef struct cvikernel_tensor_lmem_shape {
+  uint32_t n, c, h, w;
+} cvk_tl_shape_t;
+
+typedef struct cvikernel_tensor_tgmem_shape {
+  uint32_t n, c, h, w;
+} cvk_tg_shape_t;
+
+typedef struct cvikernel_tensor_lmem_stride {
+  uint32_t n, c, h, w;
+} cvk_tl_stride_t;
+
+// Even though the width stride is not in the TDMA configuration,
+// the strides of all dimensions are enough to calculate the correct position in
+// global memory, especially for bf16.
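+// Illustrative sketch (assumption, not part of the original header): for a
+// row-major (n, c, h, w) tensor these strides are commonly expressed in bytes,
+// so the byte offset of element (ni, ci, hi, wi) in global memory would be
+//   offset = ni * stride.n + ci * stride.c + hi * stride.h + wi * stride.w
+// where, for bf16 data, stride.w is typically the element size (2 bytes).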
+typedef struct cvikernel_tensor_tgmem_stride { + uint32_t n, c, h, w; +} cvk_tg_stride_t; + +typedef struct cvikernel_tensor_lmem { + uint32_t start_address; + cvk_fmt_t fmt; + cvk_fmt_t cmprs_fmt; + cvk_tl_shape_t shape; + cvk_tl_stride_t stride; + uint8_t int8_rnd_mode; // 0 is round to nearset even, 1 is toward zero, currently used by lut + uint8_t eu_align; +} cvk_tl_t; + +typedef struct cvikernel_matrix_lmem { + uint32_t start_address; + cvk_fmt_t fmt; + cvk_ml_shape_t shape; + cvk_ml_stride_t stride; + uint8_t int8_rnd_mode; // 0 is round to nearset even, 1 is toward zero + uint8_t eu_align; +} cvk_ml_t; + +typedef struct cvikernel_tensor_gmem { + uint8_t base_reg_index; + uint64_t start_address; + cvk_fmt_t fmt; + cvk_tg_shape_t shape; + cvk_tg_stride_t stride; + uint8_t int8_rnd_mode; // 0 is round to nearset even, 1 is toward zero +} cvk_tg_t; + +typedef struct cvikernel_compressed_tensor_gmem { + cvk_tg_t t; + uint64_t reserved_size; + uint8_t bit_length; // deprecated for zero compress + uint8_t bias0; + uint8_t bias1; + int zero_guard_en; +} cvk_cmpr_tg_t; + +typedef struct cvikernel_matrix_gmem { + uint8_t base_reg_index; + uint64_t start_address; + cvk_fmt_t fmt; + cvk_mg_shape_t shape; + cvk_mg_stride_t stride; + uint8_t int8_rnd_mode; // 0 is round to nearset even, 1 is toward zero +} cvk_mg_t; + +typedef struct cvikernel_compressed_matrix_gmem { + cvk_mg_t m; + uint8_t bias0; + uint8_t bias1; + int zero_guard_en; +} cvk_cmpr_mg_t; + +/* + * TDMA Engine APIs: LMEM to LMEM (L2L) + */ +typedef struct { + uint8_t mv_lut_idx; + uint8_t mv_lut_base; + const cvk_tl_t *src; + const cvk_tl_t *dst; + uint8_t outstanding; // Concurrent TDMA LD/ST and TDM L2L + uint16_t layer_id; +} cvk_tdma_l2l_tensor_copy_param_t; + +typedef struct { + const cvk_tl_t *src; + const cvk_tl_t *dst; + int right_shift; + uint32_t lrn_step; + uint16_t layer_id; +} cvk_tdma_l2l_tensor_lrn_shift_param_t; + +/* + * TDMA Engine APIs: LMEM to GMEM (L2G) + */ +typedef struct { + const cvk_tl_t *src; + const cvk_tg_t *dst; + uint16_t layer_id; + uint32_t intra_cmd_paral; // [0]: disable + // [1]: enable TDMA/TIU intra-command parallelism +} cvk_tdma_l2g_tensor_copy_param_t; + +typedef struct { + const cvk_tl_t *src; + const cvk_tg_t *dst; + uint16_t layer_id; +} cvk_tdma_l2g_tensor_copy_nc_transposed_param_t; + +typedef struct { + const cvk_tl_t *src; + const cvk_tg_t *dst; + uint16_t layer_id; +} cvk_tdma_l2g_tensor_copy_cw_transposed_param_t; + +typedef struct { + const cvk_tl_t *src; + const cvk_cmpr_tg_t *dst; + uint16_t layer_id; + uint32_t intra_cmd_paral; // [0]: disable + // [1]: enable TDMA/TIU intra-command parallelism +} cvk_tdma_l2g_tensor_copy_compressed_param_t; + +typedef struct { + uint16_t constant; + const cvk_tg_t *dst; + uint16_t layer_id; +} cvk_tdma_l2g_tensor_fill_constant_param_t; + +typedef struct { + const cvk_ml_t *src; + const cvk_mg_t *dst; + uint16_t layer_id; +} cvk_tdma_l2g_matrix_copy_param_t; + +typedef struct { + uint32_t src_address; + uint8_t dst_base_reg_index; + uint64_t dst_address; + uint32_t bytes; + uint16_t layer_id; +} cvk_tdma_l2g_general_copy_param_t; + +typedef struct { + uint32_t src_address; + uint8_t dst_base_reg_index; + uint64_t dst_address; + uint32_t src_bytes; + cvk_fmt_t src_fmt; + cvk_fmt_t dst_fmt; + uint16_t layer_id; +} cvk_tdma_l2g_bf16_general_copy_param_t; + +/* + * TDMA Engine APIs: GMEM to LMEM (G2L) + */ +typedef struct { + const cvk_tg_t *src; + const cvk_tl_t *dst; + uint16_t layer_id; + uint32_t intra_cmd_paral; // [0]: disable + // [1]: 
enable TDMA/TIU intra-command parallelism +} cvk_tdma_g2l_tensor_copy_param_t; + +typedef struct { + const cvk_tg_t *src; + const cvk_tl_t *dst; + uint16_t layer_id; +} cvk_tdma_g2l_tensor_copy_nc_transposed_param_t; + +typedef struct { + const cvk_tg_t *src; + const cvk_tl_t *dst; + uint16_t layer_id; +} cvk_tdma_g2l_tensor_copy_chw_rotated_param_t; + +typedef struct { + const cvk_cmpr_tg_t *src; + const cvk_tl_t *dst; + uint16_t layer_id; + uint32_t intra_cmd_paral; // [0]: disable + // [1]: enable TDMA/TIU intra-command parallelism +} cvk_tdma_g2l_tensor_copy_decompressed_param_t; + +typedef struct { + uint16_t constant; + const cvk_tl_t *dst; + uint16_t layer_id; +} cvk_tdma_g2l_tensor_fill_constant_param_t; + +typedef struct { + const cvk_cmpr_mg_t *src; + const cvk_ml_t *dst; + uint16_t layer_id; +} cvk_tdma_g2l_matrix_copy_decompressed_param_t; + +typedef struct { + const cvk_ml_t *src; + const cvk_cmpr_mg_t *dst; + uint16_t layer_id; +} cvk_tdma_l2g_matrix_copy_compressed_param_t; + +typedef struct { + const cvk_mg_t *src; + const cvk_ml_t *dst; + uint16_t layer_id; +} cvk_tdma_g2l_matrix_copy_param_t; + +typedef struct { + const cvk_mg_t *src; + const cvk_ml_t *dst; + uint16_t layer_id; +} cvk_tdma_g2l_matrix_copy_row_col_transposed_param_t; + +typedef struct { + uint8_t src_base_reg_index; + uint64_t src_address; + uint32_t dst_address; + uint32_t bytes; + uint16_t layer_id; +} cvk_tdma_g2l_general_copy_param_t; + +typedef struct { + uint8_t src_base_reg_index; + uint64_t src_address; + uint32_t dst_address; + uint32_t src_bytes; + cvk_fmt_t src_fmt; + cvk_fmt_t dst_fmt; + uint16_t layer_id; +} cvk_tdma_g2l_bf16_general_copy_param_t; + +/* + * TDMA Engine APIs: GEM to GEM (G2G) + */ +typedef struct { + const cvk_tg_t *src; + const cvk_tg_t *dst; + uint16_t layer_id; +} cvk_tdma_g2g_tensor_copy_param_t; + +/* + * TIU Engine APIs + * + * General rules for tensor arithmetic APIs: + * + * 1, All tensors can be either signed or unsigned + * if not mentioned otherwise. + * 2, A tensor @x with both @x_high and @x_low as + * parameters can optionally be 8-bit (when @x_high + * is NULL) or 16-bit (otherwise). + */ +typedef struct { + const cvk_tl_t *res_high; + const cvk_tl_t *res_low; + const cvk_tl_t *a; + int b_is_const; + union { + const cvk_tl_t *b; + struct { + int16_t val; + int is_signed; + } b_const; + }; + uint8_t rshift_bits; + int relu_enable; + uint16_t layer_id; +} cvk_tiu_mul_param_t; + +// Multiplier in quantization down +typedef struct { + const cvk_tl_t *res_high; + const cvk_tl_t *res_low; + const cvk_tl_t *a; + int b_is_const; + union { + const cvk_tl_t *b; + struct { + int8_t val; + int is_signed; + } b_const; + }; + uint8_t rshift_bits; + int relu_enable; + uint32_t multiplier; + uint16_t layer_id; +} cvk_tiu_mul_qm_param_t; + +/* + * @res = @a * @b + @res + * + * 1, @res_high must not be NULL since input @res must be 16-bit. + * 2, If output @res is 8-bit (@res_is_int8 == 1), only @res_low + * is used as output tensor. + */ +typedef struct { + const cvk_tl_t *res_high; + const cvk_tl_t *res_low; + const cvk_tl_t *a; + int b_is_const; + union { + const cvk_tl_t *b; + struct { + int16_t val; + int is_signed; + } b_const; + }; + int res_is_int8; + int relu_enable; + uint8_t lshift_bits; + uint8_t rshift_bits; + uint16_t layer_id; +} cvk_tiu_mac_param_t; + +/* + * @a and @b must all be 16-bit. 
+ */ +typedef struct { + const cvk_tl_t *res_high; + const cvk_tl_t *res_low; + const cvk_tl_t *a_high; + const cvk_tl_t *a_low; + int b_is_const; + union { + struct { + const cvk_tl_t *high; + const cvk_tl_t *low; + } b; + struct { + int16_t val; + int is_signed; + } b_const; + }; + uint8_t rshift_bits; + int relu_enable; + uint16_t layer_id; +} cvk_tiu_add_param_t; + +/* + * 1, @a and @b must all be 16-bit. + * 2, @res must be signed. + */ +typedef struct { + const cvk_tl_t *res_high; + const cvk_tl_t *res_low; + const cvk_tl_t *a_high; + const cvk_tl_t *a_low; + const cvk_tl_t *b_high; + const cvk_tl_t *b_low; + uint8_t rshift_bits; + uint16_t layer_id; +} cvk_tiu_sub_param_t; + +/* + * @a and @b must both be signed or unsigned. + */ +typedef struct { + const cvk_tl_t *max; + const cvk_tl_t *a; + int b_is_const; + union { + const cvk_tl_t *b; + struct { + int16_t val; + int is_signed; + } b_const; + }; + uint16_t layer_id; +} cvk_tiu_max_param_t; + +/* + * @a and @b must both be signed or unsigned. + */ +typedef struct { + const cvk_tl_t *min; + const cvk_tl_t *a; + int b_is_const; + union { + const cvk_tl_t *b; + struct { + int16_t val; + int is_signed; + } b_const; + }; + uint16_t layer_id; +} cvk_tiu_min_param_t; + +/* + * @a and @b must both be signed or unsigned. + */ +typedef struct { + const cvk_tl_t *ge; + const cvk_tl_t *a; + int b_is_const; + union { + const cvk_tl_t *b; + struct { + int16_t val; + int is_signed; + } b_const; + }; + uint16_t layer_id; +} cvk_tiu_ge_param_t; + +/* + * 1, @a must be 16-bit and signed. + * 2, @res must be 16-bit. + * 3, @bits must be signed and must range in [-16, 16]. + */ +typedef struct { + const cvk_tl_t *res_high; + const cvk_tl_t *res_low; + const cvk_tl_t *a_high; + const cvk_tl_t *a_low; + const cvk_tl_t *bits; + uint16_t layer_id; +} cvk_tiu_arith_shift_param_t; + +typedef struct { + const cvk_tl_t *res; + const cvk_tl_t *a; + const cvk_tl_t *b; + uint16_t layer_id; +} cvk_tiu_and_int8_param_t; + +/* + * All parameters must be 16-bit. + */ +typedef struct { + const cvk_tl_t *res_high; + const cvk_tl_t *res_low; + const cvk_tl_t *a_high; + const cvk_tl_t *a_low; + const cvk_tl_t *b_high; + const cvk_tl_t *b_low; +} cvk_tiu_and_int16_param_t; + +typedef struct { + const cvk_tl_t *res; + const cvk_tl_t *a; + const cvk_tl_t *b; + uint16_t layer_id; +} cvk_tiu_or_int8_param_t; + +/* + * All parameters must be 16-bit. + */ +typedef struct { + const cvk_tl_t *res_high; + const cvk_tl_t *res_low; + const cvk_tl_t *a_high; + const cvk_tl_t *a_low; + const cvk_tl_t *b_high; + const cvk_tl_t *b_low; +} cvk_tiu_or_int16_param_t; + +typedef struct { + const cvk_tl_t *res; + const cvk_tl_t *a; + const cvk_tl_t *b; + uint16_t layer_id; +} cvk_tiu_xor_int8_param_t; + +/* + * All parameters must be 16-bit. + */ +typedef struct { + const cvk_tl_t *res_high; + const cvk_tl_t *res_low; + const cvk_tl_t *a_high; + const cvk_tl_t *a_low; + const cvk_tl_t *b_high; + const cvk_tl_t *b_low; +} cvk_tiu_xor_int16_param_t; + +typedef struct { + const cvk_tl_t *src; + const cvk_tl_t *dst; + uint16_t layer_id; +} cvk_tiu_copy_param_t; + +/* + * NOTE: + * @table is treated logically as a linear list of + * length @table_n, where @table_n is a multiple of + * 16 and is smaller than or equal to 256. + * When stored in local memory, @table is a tensor + * of shape (1, npu_num, 1, @table_n), that is, the + * data of the linear list should be copied across + * each NPU's local memory by user. The behavior when + * these copies differ is undefined. 
+ */ +typedef struct { + const cvk_tl_t *ofmap; + const cvk_tl_t *ifmap; + const cvk_tl_t *table; + uint16_t layer_id; +} cvk_tiu_lookup_table_param_t; + +typedef struct { + const cvk_tl_t *ifmap; + const cvk_tl_t *buf; + const cvk_tl_t *tbl_answer; + const cvk_tl_t *tbl_answer_mantissa; + const cvk_tl_t *ofmap; + uint16_t layer_id; + /* + * \brief + * we support 2 method of lut depends on \is_scientific: + * scientific: \tbl_answer_mantissa as mantissa part + * interpolation: \tbl_answer_mantissa as slope part + * e.g: + * interpolation we use activation function to achieve high accuracy + * scientific uses to calucate reciprocal or sqrt + * \is_scientific 1 means set scientific, otherwise is interpolation + */ + uint8_t is_scientific; + uint8_t eu_align; + /* + * for achieving high accuracy, we quant activation function + * with is constrained by a pair ofhorizontal asymptotes that x->infinity + * from [-infinity, infinity] to [\min, \max] + */ + float min; + float max; +} cvk_tiu_bf16_lookup_interp_table_param_t; + +/* + * Convolution weight shape: + * Calibration output (oc, ic, kh, kw) + * bm_build transforms (oc, ic, kh, kw) -> (1, oc, kh*kw, ic) + * TDMA load global (1, oc, kh*w, ic) -> local (1, oc, kh*kw, ic) + * TIU conv opd1 (ic, oc, kh, kw) + * + * Bias (2, oc, 1, 1) + * int8: int16, n=0 [7:0], n=1 [15:8] + * bf16: fp32, n=0 [31:16], n=1 [15:0] + */ +typedef struct { + const cvk_tl_t *ofmap; + const cvk_tl_t *ifmap; + const cvk_tl_t *weight; + const cvk_tl_t *bias; + uint8_t ins_h, ins_last_h; + uint8_t ins_w, ins_last_w; + uint8_t pad_top, pad_bottom; + uint8_t pad_left, pad_right; + uint8_t stride_h, stride_w; + uint8_t dilation_h, dilation_w; + int relu_enable; + uint8_t rshift_bits; + uint8_t ps32_mode; + uint8_t w_is_const; + uint16_t layer_id; + uint8_t fp_round_typ; + uint8_t cmd_pre_exe_typ; // tiu execute cmd when channel data is ready + // wait type: + // 0: activation + // 1: weight + uint8_t cmd_pre_exe; // tiu execute cmd when channel data is ready + // 0: disable + // 1: load pre exec + // 2: store pre exec + // 3: load and store pre exec + int8_t ins_val; // padding value for int8 + uint16_t ins_fp; // padding value for bf16 +} cvk_tiu_pt_convolution_param_t; + +typedef struct { + const cvk_tl_t *ofmap; + const cvk_tl_t *ifmap; + const cvk_tl_t *weight; + const cvk_tl_t *chl_quan_param; + uint8_t ins_h, ins_last_h; + uint8_t ins_w, ins_last_w; + uint8_t pad_top, pad_bottom; + uint8_t pad_left, pad_right; + uint8_t stride_h, stride_w; + uint8_t dilation_h, dilation_w; + uint8_t has_bias; + uint8_t relu_enable; + uint8_t ps32_mode; + uint8_t w_is_const; + uint16_t layer_id; + uint8_t cmd_pre_exe_typ; // tiu execute cmd when channel data is ready + // wait type: + // 0: activation + // 1: weight + uint8_t cmd_pre_exe; // tiu execute cmd when channel data is ready + // 0: disable + // 1: load pre exec + // 2: store pre exec + // 3: load and store pre exec + int8_t ins_val; // padding value for int8 + uint16_t ins_fp; // padding value for bf16 +} cvk_tiu_convolution_param_t; + +typedef struct { + const cvk_tl_t *ofmap; + const cvk_tl_t *ifmap; + uint16_t kh, kw; + uint8_t pad_top, pad_bottom; + uint8_t pad_left, pad_right; + uint8_t stride_h, stride_w; + int8_t ins_val; // padding value for int8 + uint16_t ins_fp; // padding value for bf16 + uint16_t layer_id; +} cvk_tiu_max_pooling_param_t; + +typedef struct { + const cvk_tl_t *ofmap; + const cvk_tl_t *ifmap; + uint16_t kh, kw; + uint8_t pad_top, pad_bottom; + uint8_t pad_left, pad_right; + uint8_t stride_h, 
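+/*
+ * Illustrative sketch only (ctx, ifmap, ofmap, weight and bias are assumed
+ * to be caller-prepared local tensors; rshift_bits and dilation values are
+ * arbitrary): a plain 3x3, stride-1, pad-1 int8 convolution described with
+ * the cvk_tiu_pt_convolution_param_t layout above.
+ *
+ *   cvk_tiu_pt_convolution_param_t p = {0};
+ *   p.ofmap = ofmap;  p.ifmap = ifmap;
+ *   p.weight = weight;  p.bias = bias;
+ *   p.pad_top = p.pad_bottom = p.pad_left = p.pad_right = 1;
+ *   p.stride_h = p.stride_w = 1;
+ *   p.dilation_h = p.dilation_w = 1;
+ *   p.relu_enable = 1;  p.rshift_bits = 7;
+ *   ctx->ops->tiu_pt_convolution(ctx, &p);
+ */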
stride_w; + uint16_t ins_fp; + uint16_t layer_id; +} cvk_tiu_min_pooling_param_t; + +typedef struct { + const cvk_tl_t *ofmap; + const cvk_tl_t *ifmap; + uint16_t kh, kw; + uint8_t ins_h, ins_last_h; + uint8_t ins_w, ins_last_w; + uint8_t pad_top, pad_bottom; + uint8_t pad_left, pad_right; + uint8_t stride_h, stride_w; + uint16_t avg_pooling_const; + uint8_t rshift_bits; + uint16_t layer_id; + int8_t ins_val; // padding value for int8 + uint16_t ins_fp; // padding value for bf16 +} cvk_tiu_average_pooling_param_t; + +typedef struct { + const cvk_tl_t *ofmap; + const cvk_tl_t *ifmap; + const cvk_tl_t *weight; + const cvk_tl_t *bias; + int weight_is_const; + struct { + int16_t val; + int is_signed; + } weight_const; + uint8_t ins_h, ins_last_h; + uint8_t ins_w, ins_last_w; + uint8_t dilation_h, dilation_w; + uint8_t pad_top, pad_bottom; + uint8_t pad_left, pad_right; + uint8_t stride_h, stride_w; + uint8_t rshift_bits; + int relu_enable; + uint16_t layer_id; + uint8_t cmd_pre_exe_typ; // tiu execute cmd when channel data is ready + // wait type: + // 0: activation + // 1: weight + uint8_t cmd_pre_exe; // tiu execute cmd when channel data is ready + // 0: disable + // 1: load pre exec + // 2: store pre exec + // 3: load and store pre exec + uint8_t ps32_mode; // output fp32 result if ps32_mode == 2 + int8_t ins_val; // padding value for int8 + uint16_t ins_fp; // padding value for bf16 +} cvk_tiu_depthwise_pt_convolution_param_t; + +typedef struct { + const cvk_tl_t *ofmap; + const cvk_tl_t *ifmap; + const cvk_tl_t *weight; + const cvk_tl_t *chl_quan_param; + int weight_is_const; + struct { + int16_t val; + int is_signed; + } weight_const; + uint8_t ins_h, ins_last_h; + uint8_t ins_w, ins_last_w; + uint8_t dilation_h, dilation_w; + uint8_t pad_top, pad_bottom; + uint8_t pad_left, pad_right; + uint8_t stride_h, stride_w; + uint8_t has_bias; + uint8_t relu_enable; + uint16_t layer_id; + uint8_t cmd_pre_exe_typ; // tiu execute cmd when channel data is ready + // wait type: + // 0: activation + // 1: weight + uint8_t cmd_pre_exe; // tiu execute cmd when channel data is ready + // 0: disable + // 1: load pre exec + // 2: store pre exec + // 3: load and store pre exec + int8_t ins_val; // padding value for int8 + uint16_t ins_fp; // padding value for bf16 +} cvk_tiu_depthwise_convolution_param_t; + +typedef struct { + const cvk_ml_t *res; + const cvk_ml_t *left; + const cvk_ml_t *right; + const cvk_ml_t *bias; + uint8_t lshift_bits; + uint8_t rshift_bits; + int res_is_int8; + int relu_enable; + int add_result; + uint8_t ps32_mode; + uint16_t layer_id; +} cvk_tiu_matrix_multiplication_param_t; + +typedef struct { + const cvk_ml_t *res; + const cvk_ml_t *left; + const cvk_ml_t *right; + const cvk_ml_t *bias; + uint8_t lshift_bits; + uint8_t rshift_bits; + int res_is_int8; + int relu_enable; + int add_result; + uint8_t ps32_mode; + int32_t quan_m; + uint16_t layer_id; +} cvk_tiu_matrix_multiplication_qm_param_t; + +/* + * Kernel operations + */ +struct cvikernel_context; + +typedef struct cvikernel_operations { + void (*cleanup)(struct cvikernel_context *ctx); + void (*reset)(struct cvikernel_context *ctx); + uint8_t *(*acquire_cmdbuf)(struct cvikernel_context *ctx, uint32_t *size); + void (*dmabuf_size)(uint8_t *cmdbuf, uint32_t sz, uint32_t *psize, uint32_t *pmu_size); + void (*dmabuf_convert)(uint8_t *cmdbuf, uint32_t sz, uint8_t *dmabuf); + + // Concurrent TDMA and TIU command execution: + // TDMA command runs without waiting previous TIU command: + // 1. parallel_disable + // 2. 
parallel_enable + // 3. tiu command + // 4. tdma command (not wait TIU command) + // 5. tdma command (not wait TIU command) + void (*parallel_enable)(struct cvikernel_context *ctx); + void (*parallel_disable)(struct cvikernel_context *ctx); + + void (*set_layer_id)( + struct cvikernel_context *ctx, + uint16_t layer_id); + + cvk_tl_t *(*lmem_alloc_tensor)( + struct cvikernel_context *ctx, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align); + + cvk_ml_t *(*lmem_alloc_matrix)( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align); + + cvk_ml_t *(*lmem_alloc_ps32_matrix)( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align); + + void (*lmem_free_tensor)( + struct cvikernel_context *ctx, + const cvk_tl_t *tl); + + void (*lmem_free_matrix)( + struct cvikernel_context *ctx, + const cvk_ml_t *ml); + + void (*lmem_init_tensor)( + struct cvikernel_context *ctx, + cvk_tl_t *tl, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align); + + void (*lmem_init_matrix)( + struct cvikernel_context *ctx, + cvk_ml_t *ml, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align); + + cvk_tl_stride_t (*tl_default_stride)( + struct cvikernel_context *ctx, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align); + + cvk_tg_stride_t (*tg_default_stride)( + struct cvikernel_context *ctx, + cvk_tg_shape_t shape, + cvk_fmt_t fmt); + + cvk_ml_shape_t (*ml_default_shape)( + struct cvikernel_context *ctx, + uint32_t row, + uint32_t col, + cvk_fmt_t fmt); + + cvk_ml_stride_t (*ml_default_stride)( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align); + + cvk_ml_shape_t (*ml_shape_t1)( + struct cvikernel_context *ctx, + uint32_t len, + cvk_fmt_t fmt); + + uint32_t (*lmem_tensor_to_size)( + struct cvikernel_context *ctx, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align); + + uint32_t (*lmem_matrix_to_size)( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align); + + uint32_t (*lmem_ps32_matrix_to_size)( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align); + + void (*gmem_init_tensor)( + struct cvikernel_context *ctx, + cvk_tg_t *tg, + cvk_tg_shape_t shape, + cvk_fmt_t fmt); + + /* Local to Local DMA API */ + void (*tdma_l2l_tensor_copy)( + struct cvikernel_context *ctx, + const cvk_tdma_l2l_tensor_copy_param_t *param); + void (*tdma_l2l_bf16_tensor_copy)( + struct cvikernel_context *ctx, + const cvk_tdma_l2l_tensor_copy_param_t *param); + void (*tdma_l2l_tensor_lrn_shift)( + struct cvikernel_context *ctx, + const cvk_tdma_l2l_tensor_lrn_shift_param_t *param); + + /* Local to Global DMA API */ + void (*tdma_l2g_tensor_copy)( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_copy_param_t *param); + void (*tdma_l2g_bf16_tensor_copy)( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_copy_param_t *param); + void (*tdma_l2g_tensor_copy_nc_transposed)( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_copy_nc_transposed_param_t *param); + void (*tdma_l2g_bf16_tensor_copy_nc_transposed)( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_copy_nc_transposed_param_t *param); + void (*tdma_l2g_tensor_copy_compressed)( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_copy_compressed_param_t *param); + void (*tdma_l2g_tensor_fill_constant)( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_fill_constant_param_t *param); + void (*tdma_l2g_tensor_copy_cw_transposed)( + struct 
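+/*
+ * Illustrative sketch only (shape values arbitrary; CVK_FMT_I8 is assumed
+ * to be the int8 member of cvk_fmt_t and the cvk_tl_shape_t field order is
+ * assumed to be n/c/h/w): allocate a local tensor through the ops table and
+ * fill it from a caller-prepared global tensor tg with the G2L copy entry
+ * in the Global to Local section below.
+ *
+ *   cvk_tl_shape_t s = {1, 32, 28, 28};
+ *   cvk_tl_t *tl = ctx->ops->lmem_alloc_tensor(ctx, s, CVK_FMT_I8, 1);
+ *   cvk_tdma_g2l_tensor_copy_param_t p = {0};
+ *   p.src = tg;  p.dst = tl;
+ *   ctx->ops->tdma_g2l_tensor_copy(ctx, &p);
+ *   ctx->ops->lmem_free_tensor(ctx, tl);
+ */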
cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_copy_cw_transposed_param_t *param); + void (*tdma_l2g_bf16_tensor_copy_cw_transposed)( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_copy_cw_transposed_param_t *param); + void (*tdma_l2g_matrix_copy)( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_matrix_copy_param_t *param); + void (*tdma_l2g_bf16_matrix_copy)( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_matrix_copy_param_t *param); + void (*tdma_l2g_general_copy)( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_general_copy_param_t *param); + void (*tdma_l2g_bf16_general_copy)( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_bf16_general_copy_param_t *param); + + /* Global to Local DMA API */ + void (*tdma_g2l_tensor_copy)( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_copy_param_t *param); + void (*tdma_g2l_bf16_tensor_copy)( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_copy_param_t *param); + void (*tdma_g2l_tensor_copy_nc_transposed)( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_copy_nc_transposed_param_t *param); + void (*tdma_g2l_bf16_tensor_copy_nc_transposed)( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_copy_nc_transposed_param_t *param); + void (*tdma_g2l_tensor_copy_chw_rotated)( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_copy_chw_rotated_param_t *param); + void (*tdma_g2l_tensor_copy_decompressed)( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_copy_decompressed_param_t *param); + void (*tdma_g2l_tensor_fill_constant)( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_fill_constant_param_t *param); + void (*tdma_g2l_bf16_tensor_fill_constant)( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_fill_constant_param_t *param); + void (*tdma_g2l_matrix_copy_decompressed)( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_matrix_copy_decompressed_param_t *param); + void (*tdma_l2g_matrix_copy_compressed)( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_matrix_copy_compressed_param_t *param); + void (*tdma_g2l_matrix_copy)( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_matrix_copy_param_t *param); + void (*tdma_g2l_bf16_matrix_copy)( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_matrix_copy_param_t *param); + void (*tdma_g2l_matrix_copy_row_col_transposed)( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_matrix_copy_row_col_transposed_param_t *param); + void (*tdma_g2l_general_copy)( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_general_copy_param_t *param); + void (*tdma_g2l_bf16_general_copy)( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_bf16_general_copy_param_t *param); + + /* Global to Global DMA API */ + void (*tdma_g2g_tensor_copy)( + struct cvikernel_context *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *param); + void (*tdma_g2g_general_copy)( + struct cvikernel_context *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *param); + void (*tdma_g2g_bf16_general_copy)( + struct cvikernel_context *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *param); + void (*tdma_g2g_bf16_tensor_copy)( + struct cvikernel_context *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *param); + + /* TIU API */ + void (*tiu_mul)( + struct cvikernel_context *ctx, + const cvk_tiu_mul_param_t *param); + void (*tiu_mul_qm)( + struct cvikernel_context *ctx, + const cvk_tiu_mul_qm_param_t *param); + void (*tiu_mac)( + struct cvikernel_context *ctx, + const cvk_tiu_mac_param_t *param); + void (*tiu_add)( + 
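+/*
+ * Illustrative sketch only (res, a and b are assumed to be caller-prepared
+ * 8-bit local tensors): an element-wise multiply issued through the tiu_mul
+ * entry above; res_high == NULL selects an 8-bit result, following the
+ * general rule stated at the top of the TIU section.
+ *
+ *   cvk_tiu_mul_param_t p = {0};
+ *   p.res_high = NULL;  p.res_low = res;
+ *   p.a = a;
+ *   p.b_is_const = 0;  p.b = b;
+ *   p.rshift_bits = 0;  p.relu_enable = 0;
+ *   ctx->ops->tiu_mul(ctx, &p);
+ */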
struct cvikernel_context *ctx, + const cvk_tiu_add_param_t *param); + void (*tiu_sub)( + struct cvikernel_context *ctx, + const cvk_tiu_sub_param_t *param); + void (*tiu_max)( + struct cvikernel_context *ctx, + const cvk_tiu_max_param_t *param); + void (*tiu_min)( + struct cvikernel_context *ctx, + const cvk_tiu_min_param_t *param); + void (*tiu_and_int8)( + struct cvikernel_context *ctx, + const cvk_tiu_and_int8_param_t *param); + void (*tiu_arith_shift)( + struct cvikernel_context *ctx, + const cvk_tiu_arith_shift_param_t *param); + void (*tiu_and_int16)( + struct cvikernel_context *ctx, + const cvk_tiu_and_int16_param_t *param); + void (*tiu_or_int8)( + struct cvikernel_context *ctx, + const cvk_tiu_or_int8_param_t *param); + void (*tiu_or_int16)( + struct cvikernel_context *ctx, + const cvk_tiu_or_int16_param_t *param); + void (*tiu_xor_int8)( + struct cvikernel_context *ctx, + const cvk_tiu_xor_int8_param_t *param); + void (*tiu_xor_int16)( + struct cvikernel_context *ctx, + const cvk_tiu_xor_int16_param_t *param); + void (*tiu_copy)( + struct cvikernel_context *ctx, + const cvk_tiu_copy_param_t *param); + void (*tiu_lookup_table)( + struct cvikernel_context *ctx, + const cvk_tiu_lookup_table_param_t *param); + void (*tiu_bf16_lookup_interp_table)( + struct cvikernel_context *ctx, + const cvk_tiu_bf16_lookup_interp_table_param_t *param); + void (*tiu_pt_convolution)( + struct cvikernel_context *ctx, + const cvk_tiu_pt_convolution_param_t *param); + void (*tiu_convolution)( + struct cvikernel_context *ctx, + const cvk_tiu_convolution_param_t *param); + void (*tiu_max_pooling)( + struct cvikernel_context *ctx, + const cvk_tiu_max_pooling_param_t *param); + void (*tiu_average_pooling)( + struct cvikernel_context *ctx, + const cvk_tiu_average_pooling_param_t *param); + void (*tiu_pt_depthwise_convolution)( + struct cvikernel_context *ctx, + const cvk_tiu_depthwise_pt_convolution_param_t *param); + void (*tiu_depthwise_convolution)( + struct cvikernel_context *ctx, + const cvk_tiu_depthwise_convolution_param_t *param); + void (*tiu_matrix_multiplication)( + struct cvikernel_context *ctx, + const cvk_tiu_matrix_multiplication_param_t *param); + void (*tiu_matrix_multiplication_qm)( + struct cvikernel_context *ctx, + const cvk_tiu_matrix_multiplication_qm_param_t *param); + void (*tiu_ge)( + struct cvikernel_context *ctx, + const cvk_tiu_ge_param_t *param); + void (*tiu_min_pooling)( + struct cvikernel_context *ctx, + const cvk_tiu_min_pooling_param_t *param); +} cvk_operations_t; + +/* + * Miscellaneous helper function + * Not directly related to tiu/tdma operation + * or not ready to move into official kernel operation yet. 
+ */ +typedef struct { + uint16_t (*float_to_bfloat16) ( + struct cvikernel_context *ctx, + float data); + void (*bf16_table_shape)( + struct cvikernel_context *ctx, + cvk_tl_shape_t *shape); +} cvk_misc_operations_t; + +/* + * Kernel Context + */ +typedef struct cvikernel_context { + cvk_chip_info_t info; + cvk_operations_t *ops; + cvk_misc_operations_t *misc_ops; + void *priv_data; +} cvk_context_t; + +/* + * Register information + */ +typedef struct cvikernel_register_info { + char chip_ver_str[16]; + uint32_t cmdbuf_size; + uint8_t *cmdbuf; +} cvk_reg_info_t; + +cvk_context_t *cvikernel_register(cvk_reg_info_t *req_info); + +#ifdef __cplusplus +} +#endif + +#endif /* CVIKERNEL_H */ diff --git a/cvikernel/include/cvikernel/cvk_fp_convert.h b/cvikernel/include/cvikernel/cvk_fp_convert.h new file mode 100644 index 000000000..cb4524a40 --- /dev/null +++ b/cvikernel/include/cvikernel/cvk_fp_convert.h @@ -0,0 +1,333 @@ +#ifndef CVK_FP_CONVERT_H +#define CVK_FP_CONVERT_H + +#if __arm__ +#define __DISABLE_FENV__ +#endif + +#ifndef __DISABLE_FENV__ +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +static inline uint8_t cvk_convert_bf16_u8(uint16_t data); +static inline uint8_t cvk_convert_bf16_u8_rnd(uint16_t data, int int8_rnd_md); +static inline int8_t cvk_convert_bf16_s8_rnd(uint16_t data, int int8_rnd_md); +static inline int8_t cvk_convert_bf16_s8(uint16_t data); +static inline uint16_t cvk_convert_int8_bf16(uint8_t data, uint8_t sign); +static inline uint32_t cvk_convert_fp32_u32(float fp32); +static inline uint32_t cvk_convert_fp32_hex(float val); +static inline float cvk_convert_hex_fp32(uint32_t hval); + +static inline float cvk_convert_bf16_fp32(uint16_t bf16); +static inline uint16_t cvk_convert_fp32_bf16(float fp32); + +static inline void cvk_f32_integer(void *if32, void *o_integer, int integer_size, int accumulate, int int8_signed, int int8_rnd_md); +//static inline void f32_integer(void *if32, void *o_integer, + // 0 for 32 bit , 1 for 16 bit , 2 for 8 bit +// int integer_size, int accumulate = 0, int int8_signed = 1, int int8_rnd_md = 0); + +union convert_type_float { + float fval; + uint16_t bf16[2]; + uint32_t ival; +}; + +typedef union convert_type_float convert_int_float; +static const uint16_t NAN_VALUE = 0x7FC0; + +//static int round_mode = 0; +static uint8_t cvk_float_isnan(const float x) { + //return isnan(x); + return x != x; +} + +static inline int cvk_set_store_feround() +{ +#ifndef __DISABLE_FENV__ + int round_mode = fegetround(); + fesetround(FE_TOWARDZERO); + return round_mode; +#else + return 0; +#endif +} + +static inline void cvk_restore_feround(int round_mode) +{ +#ifndef __DISABLE_FENV__ + fesetround(round_mode); +#else + (void)round_mode; +#endif +} + +static inline uint8_t cvk_convert_bf16_u8_rnd(uint16_t data, int int8_rnd_md) +{ + /* convert bf16 to float32*/ + float fp32; + convert_int_float convert_val; + fp32 = cvk_convert_bf16_fp32(data); + /* convert float32 to uint8_t*/ + cvk_f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 0, int8_rnd_md); + return (uint8_t) convert_val.ival; +} + +static inline uint8_t cvk_convert_bf16_u8(uint16_t data) +{ + return (uint8_t) cvk_convert_bf16_u8_rnd(data, 0); +} + +static inline int8_t cvk_convert_bf16_s8_rnd(uint16_t data, int int8_rnd_md) +{ + /* convert bf16 to float32 */ + float fp32; + convert_int_float convert_val; + fp32 = cvk_convert_bf16_fp32(data); + /* convert float32 to uint8_t*/ + cvk_f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 1, int8_rnd_md); + return (int8_t) convert_val.ival; 
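+/*
+ * Illustrative values that follow from the conversion routines in this header:
+ *   cvk_convert_fp32_bf16(1.0f)   -> 0x3f80 (upper 16 bits of 0x3f800000)
+ *   cvk_convert_bf16_fp32(0x3f80) -> 1.0f
+ *   NaN inputs map to the fixed pattern 0x7fc0 (NAN_VALUE).
+ */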
+} + +static inline int8_t cvk_convert_bf16_s8(uint16_t data) +{ + return (int8_t) cvk_convert_bf16_s8_rnd(data, 0); +} + +static inline uint16_t cvk_convert_int8_bf16(uint8_t data, uint8_t sign) +{ + int32_t val = sign ? (int8_t) data : (uint8_t) data; + /* need to round to bf16 mode */ + return cvk_convert_fp32_bf16((float) val); +} + +static inline uint16_t cvk_convert_fp32_bf16(float fp32) +{ + if (cvk_float_isnan(fp32)) + return NAN_VALUE; + convert_int_float convert_val; + convert_val.fval = fp32; + uint32_t input = convert_val.ival; + uint32_t lsb = (input >> 16) & 1; + uint32_t rounding_bias = 0x7fff + lsb; + input += rounding_bias; + convert_val.bf16[1] = (uint16_t) (input >> 16); + + /* HW behavior */ + if ((convert_val.bf16[1] & 0x7f80) == 0x7f80) { + convert_val.bf16[1] = 0x7f7f; + } + return convert_val.bf16[1]; +} + +static inline uint8_t cvk_convert_fp32_u8(float fp32) +{ + convert_int_float convert_val; + cvk_f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 0, 0); + return (uint8_t) convert_val.ival; +} + +static inline int8_t cvk_convert_fp32_s8(float fp32) +{ + convert_int_float convert_val; + cvk_f32_integer((void*)&fp32, &convert_val.ival, 2, 0, 1, 0); + return (int8_t) convert_val.ival; +} + +static inline uint32_t cvk_convert_fp32_u32(float fp32) +{ + convert_int_float convert_val; + cvk_f32_integer((void*)&fp32, &convert_val.ival, 0, 0, 0, 0); + return (uint32_t) convert_val.ival; +} + +static inline int32_t cvk_convert_fp32_s32(float fp32) +{ + convert_int_float convert_val; + cvk_f32_integer((void*)&fp32, &convert_val.ival, 0, 0, 1, 0); + return (int32_t) convert_val.ival; +} + +/* convert hex to float directly */ +static inline float cvk_convert_hex_fp32(uint32_t hval) +{ + convert_int_float convert_val; + convert_val.ival = hval; + return convert_val.fval; +} +/* convert float to hex directly */ +static inline uint32_t cvk_convert_fp32_hex(float val) +{ + convert_int_float convert_val; + convert_val.fval = val; + return convert_val.ival; +} +static inline float cvk_convert_bf16_fp32(uint16_t bf16) +{ + convert_int_float convert_val; + convert_val.bf16[1] = bf16; + convert_val.bf16[0] = 0; + return convert_val.fval; +} + +static inline void cvk_flt2int_flt(float x, unsigned long long* integer_part, float * sub_part, uint8_t sign) +{ + convert_int_float work_x; + int level_code; + unsigned long tail_code; + work_x.fval = x; + level_code = ((work_x.ival >> 23) & 0xff) - 127; + + //if the level code is negaive, the integer part of the float is zero + if ( level_code < 0 ){ + *integer_part = 0; + *sub_part = x; + } + else { + tail_code = (work_x.ival) & 0x7fffff; + tail_code = tail_code | 0x800000; + + if (level_code < 23){ + tail_code >>= (23 - level_code); + *integer_part = tail_code; + work_x.ival &= 0xffffffff << (23 - level_code); + *sub_part = x - work_x.fval; + } + else { + tail_code <<= (level_code - 23); + *integer_part = tail_code; + if(level_code>30){ + *integer_part = 0x7fffffff; + if(sign)*integer_part = 0x800000000; + } + *sub_part = 0; + } + } +} + +inline static int cvk_flt2int(float ifval, int int8_rnd_md) +{ + union { + float floatNum; + unsigned long intNum; + } tempIfval; + tempIfval.floatNum = ifval; + uint8_t isPositive = ((tempIfval.intNum & 0x80000000UL) == 0x80000000UL) ? 0 : 1; + float abs_fval = (!isPositive) ? 
-ifval : ifval; + float sub_part; + unsigned long long integer_part; + uint8_t sign = !isPositive; + cvk_flt2int_flt(abs_fval, &integer_part, &sub_part, sign); + if (!isPositive) + { + unsigned long long result; + if(int8_rnd_md == 0) { // round to nearest even + if ( sub_part > 0.5f ) + { + result = integer_part + 1; + } + else if (sub_part == 0.5f) + { + if ( integer_part & 0x1 ) + { + result = integer_part + 1; + } + else + { + result = integer_part; + } + } + else + { + result = integer_part; + } + } else { //round to zero + result = integer_part; + } + if ( result > 0x80000000UL ) + { + result = 0x80000000UL; + } + return -result; + } + else + { + unsigned long long result; + if(int8_rnd_md == 0) { // round to nearest even + if ( sub_part > 0.5f ) + { + result = integer_part + 1; + } + else if ( sub_part == 0.5f ) + { + if ( integer_part & 0x1 ) + { + result = integer_part + 1; + } + else + { + result = integer_part; + } + } + else + { + result = integer_part; + } + } else { + result = integer_part; + } + if ( result > 0x7fffffff ) + { + result = 0x7fffffff; + } + return result; + } +} + +static inline void cvk_f32_integer(void *if32, void *o_integer, int integer_size, int accumulate, int int8_signed, int int8_rnd_md) +{ + int i_tmp; + float *f_tmp; + f_tmp = (float *)if32; + i_tmp = cvk_flt2int(*f_tmp, int8_rnd_md); + int *o32 = (int *)o_integer; + int dst_f32 = *o32; + short *o16 = (short *)o_integer; + short dst_o16 = *o32; + char *o8 = (char *)o_integer; + char dst_o8 = *o8; + + if (integer_size == 0) { + *o32 = i_tmp; + } else if (integer_size == 1) { + *o16 = i_tmp; + } else{ + *o8 = i_tmp; + int min = (int8_signed) ? -128 : 0; + int max = (int8_signed) ? 127 : 255; + if (i_tmp < min ){ + *o8 = min; + } + else if (i_tmp > max){ + *o8 = max; + } + //*o8 = i_tmp; + } + if (accumulate) { + if (integer_size == 0) { + *o32 += dst_f32; + } else if (integer_size == 1) { + *o16 += dst_o16; + } else + *o8 += dst_o8; + } +} + +#ifdef __cplusplus +} +#endif + +#endif /* CVK_FP_CONVERT_H */ diff --git a/cvikernel/include/cvikernel/cvk_vlc_compress.h b/cvikernel/include/cvikernel/cvk_vlc_compress.h new file mode 100644 index 000000000..8e29145b2 --- /dev/null +++ b/cvikernel/include/cvikernel/cvk_vlc_compress.h @@ -0,0 +1,728 @@ +#ifndef __CVK_VLC_COMPRESS_H__ +#define __CVK_VLC_COMPRESS_H__ + +#include +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +#define MAX_UNARY_FIELD_SIZE 47 +#define MAX_ORDER_K 5 + +static inline int divide_ceil(int numerator, int denominator) +{ + return (numerator + denominator - 1) / denominator; +} + + /** + * \data_type 0 means 8bit, 1 means 16bit + */ + static inline size_t get_out_bs_buf_size(uint64_t in_size, uint8_t data_type) { + size_t blk_num = (data_type) ? 
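+/*
+ * Worked example (int8, data_type == 0): for in_size = 100 bytes,
+ * blk_num = (100 + 15) >> 4 = 7, in_size_pad = 7 << 4 = 112, and the
+ * returned size is 112 + 16 (k-map) + 16 (header) = 144 bytes.
+ */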
((in_size + 31) >> 5) : ((in_size + 15) >> 4); + size_t in_size_pad = blk_num << (4 + data_type); + size_t bs_buf_size = in_size_pad + (divide_ceil(blk_num, 16) << 4) + 16; + return bs_buf_size; + } + + typedef struct + { + uint8_t signedness; + uint8_t is_bfloat16; + uint8_t bias0; + uint8_t bias1; + uint8_t zero_guard_en; + } CommandInfo; + typedef struct + { + uint8_t *stream; // stream buffer pointer + int bit_pos; // current pointer (in bit) + int buf_size; // in byte + } StreamBuffer; + +static inline int8_t two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1); +static inline int8_t inv_two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1); +static inline uint8_t center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard); +static inline uint8_t inv_center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard); + +static inline void init_stream(StreamBuffer *bs, const uint8_t *buf, int buf_size, uint8_t read_only); + +static inline void cvk_vlc_est_weight_bias(const uint8_t *ibuf, size_t isz, uint8_t signedness, uint8_t isBfloat16, CommandInfo *cmd_info); +static inline void cvk_vlc_enc_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info); +static inline void cvk_vlc_dec_int8_ext(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *bs_size); +static inline void cvk_vlc_dec_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf); +static inline void cvk_vlc_enc_bf16(const uint16_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info); +static inline void cvk_vlc_dec_bf16_ext(const uint8_t *ibuf, size_t isz, uint16_t *obuf, size_t *bs_size); +static inline void cvk_vlc_dec_bf16(const uint8_t *ibuf, size_t isz, uint16_t *obuf); + +static inline uint8_t get_bit_val(uint8_t *buf, int byte_idx, int bit_idx) + { + return (buf[byte_idx] >> bit_idx) & 0x1; + } + +static inline uint8_t sign_to_unsign(uint8_t val) + { + uint8_t sign_i = (val >> 7) & 0x1; + int abs_data_i = abs(((int8_t)val)); + return ((abs_data_i << 1) - sign_i); + } + +static inline int8_t unsign_to_sign(uint8_t val) + { + uint8_t sign_i = val & 0x1; + int abs_data_i = (((int)val) + 1) >> 1; + return (uint8_t)((sign_i == 1) ? 
(-abs_data_i) : abs_data_i); + } + +static inline void dispatch_bf16_data(const uint16_t *bf16_in, uint8_t *exp, uint8_t *frac, size_t isz) +{ + for (size_t i = 0; i < isz; i++) + { + exp[i] = (uint8_t)((bf16_in[i] >> 7) & 0xFF); + frac[i] = (uint8_t)(((bf16_in[i] >> 15) << 7) | (bf16_in[i] & 0x7F)); + } +} + +static inline void merge_bf16_data(const uint8_t *exp_in, const uint8_t *frac_in, uint16_t *bf16_out, size_t isz) +{ + memset(bf16_out, 0, sizeof(uint16_t)); + for (size_t i = 0; i < isz; i++) + { + bf16_out[i] = ((frac_in[i] >> 7) << 15) | (exp_in[i] << 7) | (frac_in[i] & 0x7F); + } +} + +// -- streaming operation handler -- +static inline void init_stream(StreamBuffer *bs, const uint8_t *buf, int buf_size, uint8_t read_only) +{ + bs->bit_pos = 0; + bs->stream = (uint8_t *)buf; + bs->buf_size = buf_size; + if (!read_only) + memset((uint8_t *)buf, 0, sizeof(uint8_t) * buf_size); +} + +static inline void write_stream(StreamBuffer *bs, uint8_t *src, int bit_len) +{ + for (int bit = 0; bit < bit_len; bit++) + { + int src_byte_i = bit / 8; + int src_bit_i = bit % 8; + int dest_byte_i = (bs->bit_pos + bit) / 8; + int dest_bit_i = (bs->bit_pos + bit) % 8; + bs->stream[dest_byte_i] |= (get_bit_val(src, src_byte_i, src_bit_i) << dest_bit_i); + } + bs->bit_pos += bit_len; +} + +static inline void move_stream_ptr(StreamBuffer *bs, int bit_len) +{ + bs->bit_pos += bit_len; +} + +static inline void parse_stream(StreamBuffer *bs, uint8_t *dest, int bit_len) +{ + memset(dest, 0, sizeof(uint8_t) * (bit_len + 7) >> 3); + for (int bit = 0; bit < bit_len; bit++) + { + int dest_byte_i = bit / 8; + int dest_bit_i = bit % 8; + int bs_byte_i = (bs->bit_pos + bit) / 8; + int bs_bit_i = (bs->bit_pos + bit) % 8; + dest[dest_byte_i] |= (get_bit_val(bs->stream, bs_byte_i, bs_bit_i) << dest_bit_i); + } + bs->bit_pos += bit_len; +} + +// -- header read/write operation handler -- +static inline void vlc_enc_header(StreamBuffer *bs_header, CommandInfo *cmd_info, size_t blk_bs_size) +{ + write_stream(bs_header, (uint8_t *)&blk_bs_size, 24); // bit[23:0] compressed block stream size + move_stream_ptr(bs_header, 4); // bit[27:24] reserved + write_stream(bs_header, (uint8_t *)&cmd_info->signedness, 1); // bit[28] signedness + write_stream(bs_header, (uint8_t *)&cmd_info->is_bfloat16, 1); // bit[29] data type + move_stream_ptr(bs_header, 2); // bit[31:30] bit depth + write_stream(bs_header, (uint8_t *)&cmd_info->bias0, 8); // bit[39:32] bias0 for symbol remapping + write_stream(bs_header, (uint8_t *)&cmd_info->bias1, 7); // bit[46:40] bias1 for symbol remapping + write_stream(bs_header, (uint8_t *)&cmd_info->zero_guard_en, 1); // bit[47] zero guard +} + +static inline void vlc_dec_header_ext(StreamBuffer *bs_header, CommandInfo *cmd_info, size_t *blk_bs_size) +{ + parse_stream(bs_header, (uint8_t *)blk_bs_size, 24); // bit[23:0] compressed block stream size + move_stream_ptr(bs_header, 4); // bit[27:24] reserved + parse_stream(bs_header, (uint8_t *)&cmd_info->signedness, 1); // bit[28] signedness + parse_stream(bs_header, (uint8_t *)&cmd_info->is_bfloat16, 1); // bit[29] data type + move_stream_ptr(bs_header, 2); + parse_stream(bs_header, (uint8_t *)&cmd_info->bias0, 8); // bit[39:32] bias0 for symbol remapping + parse_stream(bs_header, (uint8_t *)&cmd_info->bias1, 7); // bit[46:40] bias1 for symbol remapping + parse_stream(bs_header, (uint8_t *)&cmd_info->zero_guard_en, 1); // bit[47] zero guard +} + +static inline void vlc_dec_header(StreamBuffer *bs_header, CommandInfo *cmd_info) +{ + size_t blk_bs_size; + 
vlc_dec_header_ext(bs_header, cmd_info, &blk_bs_size); +} + +// -- symbol remmaping handler -- +static inline uint8_t center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard) +{ + if (val == 0 && zero_guard) + return 0; + + int16_t shift_data_i = val - bias; + uint8_t range = (bias <= 128) ? bias : 255 - bias; + if (bias <= 128) + { + return (val >= (range << 1)) ? val : sign_to_unsign(shift_data_i) + zero_guard; + } + else + { + return (val < (bias - range)) ? (range + bias - val + zero_guard) : (sign_to_unsign(shift_data_i) + zero_guard); + } +} + +static inline uint8_t inv_center_shift(uint8_t val, uint8_t bias, uint8_t zero_guard) +{ + if (val == 0 && zero_guard) + return 0; + + uint8_t unsign_data_i = val - zero_guard; + uint8_t range = (bias <= 128) ? bias : 255 - bias; + if (bias <= 128) + { + return (val >= (range << 1)) ? val : unsign_to_sign(unsign_data_i) + bias; + } + else + { + return (unsign_data_i > (range << 1)) ? (range + bias - val + zero_guard) : unsign_to_sign(unsign_data_i) + bias; + } +} + +static inline int8_t two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1) +{ + if (val == 0) + return 0; + + uint8_t sign = (val < 0) ? true : false; + int32_t abs_val = abs(val); + abs_val -= (sign) ? bias1 : bias0; + abs_val += (abs_val <= 0) ? (127 + sign) : 0; + return (sign) ? -abs_val : abs_val; +} + +static inline int8_t inv_two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1) +{ + if (val == 0) + return 0; + + uint8_t sign = (val < 0) ? true : false; + uint32_t abs_val = (uint32_t)abs(val); + abs_val += (sign) ? bias1 : bias0; + int32_t abs_val_minus = abs_val - (127 + sign); + uint8_t abs_val_lsb = ((abs_val_minus <= 0) + ? (uint8_t)abs_val + : (uint8_t)abs_val_minus) & + 0xFF; + return (sign) ? -abs_val_lsb : abs_val_lsb; +} + +static inline void symbol_remapping(uint8_t *blk_in, uint8_t *blk_out, uint8_t bias0, uint8_t bias1, uint8_t signedness, uint8_t is_bf16_exp, uint8_t zero_guard) +{ + if (is_bf16_exp == false && signedness == false) + { + // remapping bypass + memcpy(blk_out, blk_in, sizeof(uint8_t) * 16); + return; + } + + if (is_bf16_exp == true) + { + // center circular shift + for (int i = 0; i < 16; i++) + { + blk_out[i] = center_shift(blk_in[i], bias0, zero_guard); + } + } + else + { + // two-side circular shift + for (int i = 0; i < 16; i++) + { + int8_t shift_data_i = two_side_circular_shift((int8_t)blk_in[i], bias0, bias1); + blk_out[i] = sign_to_unsign(shift_data_i); + } + } +} + +static inline void inv_symbol_remapping(uint8_t *blk_in, uint8_t *blk_out, uint8_t bias0, uint8_t bias1, uint8_t signedness, uint8_t is_bf16_exp, uint8_t zero_guard) +{ + if (is_bf16_exp == false && signedness == false) + { + // remapping bypass + memcpy(blk_out, blk_in, sizeof(uint8_t) * 16); + return; + } + + if (is_bf16_exp == true) + { + // center circular shift + for (int i = 0; i < 16; i++) + { + blk_out[i] = inv_center_shift(blk_in[i], bias0, zero_guard); + } + } + else + { + // two-side circular shift + for (int i = 0; i < 16; i++) + { + int8_t sign_data_i = unsign_to_sign(blk_in[i]); + blk_out[i] = (uint8_t)inv_two_side_circular_shift(sign_data_i, bias0, bias1); + } + } +} + +static inline int vlc_estimate_block_order(uint8_t *blk_in, uint8_t bf16_zvc_en) +{ + int best_k = 0; + int best_bs_size = 0x7FFFFFFF; + + for (int k = 0; k <= (int)MAX_ORDER_K; k++) + { + uint8_t remain_field_size = k << 4; + int unary_field_len = 0; + for (int i = 0; i < 16; i++) + { + uint8_t group_idx = blk_in[i] >> k; + unary_field_len += (group_idx + 1); + } + int 
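+/*
+ * Worked example for the remapping above: with bias0 = 2, a signed input of
+ * 3 maps to 1 and 4 maps to 2, while 1 (below the bias) wraps to 126;
+ * sign_to_unsign() then interleaves positive and negative codes so small
+ * magnitudes end up as small symbols for the Golomb-Rice stage.
+ */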
znum_bit = (bf16_zvc_en && k > 0) ? 4 : 0; + int blk_size = (unary_field_len <= MAX_UNARY_FIELD_SIZE) + ? remain_field_size + unary_field_len + znum_bit + : 255; + if (blk_size < best_bs_size) + { + best_k = k; + best_bs_size = blk_size; + } + } + + best_k = (best_bs_size > 128) ? -1 : best_k; + return best_k; +} +// -- vlc block parrelel GR encode/decode -- +static inline uint8_t vlc_gr_enc_block_data(uint8_t *blk_in, StreamBuffer *bs, int order_k, uint8_t bf16_zvc_en) +{ + // uncompressed mode + if (order_k == -1) + { + write_stream(bs, blk_in, 128); + return 128; + } + + // remain field + uint8_t remain_field[16] = {0}; + uint8_t unary_field[8] = {0}; + uint8_t sym_end_pos[16] = {0}; + uint8_t unary_field_len = 0; + int sym_end_pos_accum = -1; + + // bit plane encode for remain field + for (int k = 0; k < order_k; k++) + { + uint8_t bit_plane0 = 0, bit_plane1 = 0; + for (int i = 0; i < 8; i++) + { + bit_plane0 |= (get_bit_val(blk_in, i, k) << i); + bit_plane1 |= (get_bit_val(blk_in, i + 8, k) << i); + } + remain_field[k << 1] = bit_plane0; + remain_field[(k << 1) + 1] = bit_plane1; + } + write_stream(bs, remain_field, order_k << 4); + + if (bf16_zvc_en && order_k > 0) + { + int zero_num = 0; + for (int i = 0; i < 16; i++) + { + if (blk_in[i] == 0) + zero_num++; + } + // assert(zero_num < 16); + if (zero_num >= 16) + return 0; + + write_stream(bs, (uint8_t *)&zero_num, 4); + } + + // unary encode for unary field + for (int i = 0; i < 16; i++) + { + int group_idx = blk_in[i] >> order_k; + sym_end_pos_accum += (group_idx + 1); + sym_end_pos[i] = sym_end_pos_accum; + int byte_idx = sym_end_pos[i] / 8; + int bit_idx = sym_end_pos[i] % 8; + unary_field[byte_idx] |= (1 << (bit_idx)); + } + unary_field_len = sym_end_pos[15] + 1; + + //assert(unary_field_len <= MAX_UNARY_FIELD_SIZE); + if (unary_field_len > MAX_UNARY_FIELD_SIZE) + return 0; + + uint8_t ulen = (unary_field_len - 16) & 0x1F; + write_stream(bs, unary_field, unary_field_len); + + return ulen; +} + +static inline void vlc_gr_dec_block_data(StreamBuffer *bs, uint8_t bs_size, uint8_t *rec, int order_k, uint8_t bf16_zvc_en) +{ + // assert(bs_size <= 128); + if (bs_size > 128) + return; + + // uncompressed mode + if (order_k == -1) + { + parse_stream(bs, rec, 128); + return; + } + + // remain field + uint8_t remain_data[16] = {0}; + uint8_t remain_bs[16] = {0}; + uint8_t unary_field[8] = {0}; + uint8_t sym_end_pos[16] = {0}; + uint8_t unary_sym[16] = {0}; + uint8_t remain_field_size = order_k << 4; + + parse_stream(bs, remain_bs, remain_field_size); + // bit plane encode for remain field + for (int k = 0; k < order_k; k++) + { + for (int i = 0; i < 8; i++) + { + remain_data[i] |= (get_bit_val(remain_bs, k << 1, i) << k); + remain_data[i + 8] |= (get_bit_val(remain_bs, (k << 1) + 1, i) << k); + } + } + + // zero number info + int znum_bit = (bf16_zvc_en && order_k > 0) ? 
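+/*
+ * Worked example for the encoder above: with order k = 2 and all 16 symbols
+ * equal to 5, each group index is 5 >> 2 = 1, so the unary field takes
+ * 16 * 2 = 32 bits and the remain field k * 16 = 32 bits; the block costs
+ * 64 bits (plus an optional 4-bit zero count for bf16) versus 128 bits
+ * uncompressed.
+ */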
4 : 0; + uint8_t znum = 0; + parse_stream(bs, &znum, znum_bit); + + // unary encode for unary field + uint8_t unary_field_len = bs_size - remain_field_size - znum_bit; + parse_stream(bs, unary_field, unary_field_len); + + int sym_cnt = 0; + for (uint8_t ubit_i = 0; ubit_i < unary_field_len; ubit_i++) + { + int byte_idx = ubit_i / 8; + int bit_idx = ubit_i % 8; + if (get_bit_val(unary_field, byte_idx, bit_idx) == 1) + { + sym_end_pos[sym_cnt] = ubit_i; + sym_cnt++; + } + } + unary_sym[0] = sym_end_pos[0]; + for (int i = 1; i < 16; i++) + { + unary_sym[i] = sym_end_pos[i] - sym_end_pos[i - 1] - 1; + } + for (int i = 0; i < 16; i++) + { + rec[i] = (unary_sym[i] << order_k) + remain_data[i]; + } +} + +// -- vlc encode int8 entry function -- +static inline void cvk_vlc_enc_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info) +{ + StreamBuffer bs_header, bs_kmap, bs_data; + size_t blk_num = (isz + 15) >> 4; + size_t header_size = 16; + size_t kmap_size = divide_ceil(blk_num, 16) << 4; + size_t bs_buf_size = header_size + kmap_size + (blk_num << 4); + uint8_t *bsbuf = (uint8_t *)calloc(bs_buf_size, sizeof(uint8_t)); + + // block encode + init_stream(&bs_kmap, bsbuf + header_size, kmap_size, false); + init_stream(&bs_data, bsbuf + header_size + kmap_size, blk_num << 4, false); + + for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++) + { + uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0}; + size_t in_size = (blk_idx == (blk_num - 1)) ? isz - (blk_idx << 4) : 16; + memcpy(blk_data, &ibuf[blk_idx << 4], sizeof(uint8_t) * in_size); + + symbol_remapping(blk_data, blk_sr_data, cmd_info->bias0, cmd_info->bias1, cmd_info->signedness, false, false); + + int k = vlc_estimate_block_order(blk_sr_data, false); + uint8_t ulen = vlc_gr_enc_block_data(blk_sr_data, &bs_data, k, false); + uint8_t k_info = (k == -1) ? 0xE0 : (k << 5) + ulen; + write_stream(&bs_kmap, &k_info, 8); + } + + int blk_bs_size = divide_ceil(((bs_data.bit_pos + 7) >> 3), 16) << 4; // 16 byte align + *osz = header_size + kmap_size + blk_bs_size; + + // write header + init_stream(&bs_header, bsbuf, header_size, false); + vlc_enc_header(&bs_header, cmd_info, blk_bs_size); + + memcpy(obuf, bsbuf, (*osz) * sizeof(uint8_t)); + free(bsbuf); +} + +// -- vlc decode int8 entry function -- +static inline void cvk_vlc_dec_int8_ext(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *bs_size) +{ + StreamBuffer bs_header, bs_kmap, bs_data; + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + + size_t blk_num = (isz + 15) >> 4; + int header_size = 16; + int kmap_size = divide_ceil(blk_num, 16) << 4; + *bs_size = 0; + + // parse header + init_stream(&bs_header, ibuf, header_size, true); + vlc_dec_header_ext(&bs_header, &cmd_info, bs_size); + + // Check whether valid header + size_t bs_buf_size = get_out_bs_buf_size(isz, 0); // int8 + + //ASSERT(*bs_size <= bs_buf_size); + //ASSERT(cmd_info.is_bfloat16 == 0); + if (*bs_size > bs_buf_size || cmd_info.is_bfloat16) + return; + + // block decode + init_stream(&bs_kmap, ibuf + header_size, kmap_size, true); + init_stream(&bs_data, ibuf + header_size + kmap_size, blk_num << 4, true); + + for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++) + { + uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0}; + uint8_t k_info = 0; + parse_stream(&bs_kmap, &k_info, 8); + uint8_t ulen = k_info & 0x1F; + int k = (k_info >> 5 == 7) ? -1 : k_info >> 5; + int blk_bs_size = (k == -1) ? 
128 : (k << 4) + ulen + 16; + vlc_gr_dec_block_data(&bs_data, blk_bs_size, blk_data, k, false); + + inv_symbol_remapping(blk_data, blk_sr_data, cmd_info.bias0, cmd_info.bias1, cmd_info.signedness, false, false); + + int out_size = (blk_idx == (blk_num - 1)) ? isz - (blk_idx << 4) : 16; + memcpy(&obuf[blk_idx << 4], blk_sr_data, sizeof(uint8_t) * out_size); + } +} + +static inline void cvk_vlc_dec_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf) +{ + size_t bs_size; + cvk_vlc_dec_int8_ext(ibuf, isz, obuf, &bs_size); +} + +// -- vlc encode bfloat16 entry function -- +static inline void cvk_vlc_enc_bf16(const uint16_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info) +{ + StreamBuffer bs_header, bs_kmap, bs_data; + size_t blk_num = (isz + 31) >> 5; // 32 bytes per blok + size_t header_size = 16; + size_t kmap_size = divide_ceil(blk_num, 16) << 4; + size_t bs_buf_size = header_size + kmap_size + (blk_num << 5); + uint8_t *bsbuf = (uint8_t *)calloc(bs_buf_size, sizeof(uint8_t)); + + // block encode + init_stream(&bs_kmap, bsbuf + header_size, kmap_size, false); + init_stream(&bs_data, bsbuf + header_size + kmap_size, blk_num << 5, false); + + for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++) + { + uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0}, blk_data_frac[16] = {0}; + size_t in_num = (blk_idx == (blk_num - 1)) ? ((isz >> 1) - (blk_idx << 4)) : 16; + dispatch_bf16_data(&ibuf[blk_idx << 4], blk_data, blk_data_frac, in_num); + + // exp: BGR encode + symbol_remapping(blk_data, blk_sr_data, cmd_info->bias0, cmd_info->bias1, false, true, cmd_info->zero_guard_en); + + int k = vlc_estimate_block_order(blk_sr_data, cmd_info->zero_guard_en); + uint8_t ulen = vlc_gr_enc_block_data(blk_sr_data, &bs_data, k, cmd_info->zero_guard_en); + uint8_t k_info = (k == -1) ? 
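+/*
+ * Illustrative int8 round trip with the entry points above (src, dst and n
+ * are caller-provided; the output buffer is sized with get_out_bs_buf_size):
+ *
+ *   CommandInfo ci = {0};
+ *   cvk_vlc_est_weight_bias(src, n, 1, 0, &ci);         // signed int8 data
+ *   uint8_t *cmpr = malloc(get_out_bs_buf_size(n, 0));   // data_type 0: 8-bit
+ *   size_t cmpr_sz = 0;
+ *   cvk_vlc_enc_int8(src, n, cmpr, &cmpr_sz, &ci);
+ *   cvk_vlc_dec_int8(cmpr, n, dst);                      // recovers n bytes
+ *   free(cmpr);
+ */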
0xE0 : (k << 5) + ulen; + write_stream(&bs_kmap, &k_info, 8); + + // frac: implicit zero compression + for (size_t i = 0; i < 16; i++) + { + if (!cmd_info->zero_guard_en || blk_data[i] != 0) + { + write_stream(&bs_data, &blk_data_frac[i], 8); + } + } + } + + int blk_bs_size = divide_ceil(((bs_data.bit_pos + 7) >> 3), 16) << 4; // 16 byte align + *osz = header_size + kmap_size + blk_bs_size; + + // write header + init_stream(&bs_header, bsbuf, header_size, false); + vlc_enc_header(&bs_header, cmd_info, blk_bs_size); + + memcpy(obuf, bsbuf, (*osz) * sizeof(uint8_t)); + free(bsbuf); +} + +// -- vlc decode bfloat16 entry function -- +static inline void cvk_vlc_dec_bf16_ext(const uint8_t *ibuf, size_t isz, uint16_t *obuf, size_t *bs_size) +{ + StreamBuffer bs_header, bs_kmap, bs_data; + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + + size_t blk_num = (isz + 31) >> 5; // 32 bytes per blok + int header_size = 16; + int kmap_size = divide_ceil(blk_num, 16) << 4; + *bs_size = 0; + + // parse header + init_stream(&bs_header, ibuf, header_size, true); + vlc_dec_header_ext(&bs_header, &cmd_info, bs_size); + + // Check whether valid header + size_t bs_buf_size = get_out_bs_buf_size(isz, 1); // bf16 + + //ASSERT(*bs_size <= bs_buf_size); + //ASSERT(cmd_info.is_bfloat16 == 1); + if (*bs_size > bs_buf_size || cmd_info.is_bfloat16 != 1) + return; + + // block decode + init_stream(&bs_kmap, ibuf + header_size, kmap_size, true); + init_stream(&bs_data, ibuf + header_size + kmap_size, blk_num << 5, true); + + for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++) + { + uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0}, blk_data_frac[16] = {0}; + uint8_t k_info = 0; + parse_stream(&bs_kmap, &k_info, 8); + uint8_t ulen = k_info & 0x1F; + int k = (k_info >> 5 == 7) ? -1 : k_info >> 5; + int znum_bit = (cmd_info.zero_guard_en && k > 0) ? 4 : 0; + uint8_t blk_bs_size = (k == -1) ? 128 : (k << 4) + ulen + 16 + znum_bit; + + // exp: BGR decode + vlc_gr_dec_block_data(&bs_data, blk_bs_size, blk_data, k, cmd_info.zero_guard_en); + + inv_symbol_remapping(blk_data, blk_sr_data, cmd_info.bias0, cmd_info.bias1, false, true, cmd_info.zero_guard_en); + + size_t out_num = (blk_idx == (blk_num - 1)) ? ((isz >> 1) - (blk_idx << 4)) : 16; + + // frac: implicit zero compression + for (size_t i = 0; i < out_num; i++) + { + if (!cmd_info.zero_guard_en || blk_sr_data[i] != 0) + { + parse_stream(&bs_data, &blk_data_frac[i], 8); + } + } + merge_bf16_data(blk_sr_data, blk_data_frac, &obuf[blk_idx << 4], out_num); + } +} + +static inline void cvk_vlc_dec_bf16(const uint8_t *ibuf, size_t isz, uint16_t *obuf) +{ + size_t bs_size; + cvk_vlc_dec_bf16_ext(ibuf, isz, obuf, &bs_size); +} + +// -- offline estimate model weight params -- +static inline void cvk_vlc_est_weight_bias(const uint8_t *ibuf, size_t isz, uint8_t signedness, uint8_t isBfloat16, CommandInfo *cmd_info) +{ + //assert(!(isBfloat16 && signedness)); // WARNING: signedness MUST be 0 as isBfloat16==True + + cmd_info->is_bfloat16 = isBfloat16; + if (isBfloat16 == false && signedness == true) + { + // two-side circular shift + int hist[256] = {0}; + for (size_t i = 0; i < isz; i++) + { + hist[ibuf[i]]++; + } + + int8_t pos_v = 1; + //while (pos_v < 128) + // comparison is always true due to limited range of data type [-Werror=type-limits] + while (true) + { + if (hist[((uint8_t)pos_v)] == 0) + { + pos_v++; + } + else + { + break; + } + } + //cmd_info->bias0 = (pos_v > 1 && pos_v < 128) ? 
(pos_v - 1) : 0; + // comparison is always true due to limited range of data type [-Werror=type-limits] + cmd_info->bias0 = (pos_v > 1) ? (pos_v - 1) : 0; + int8_t neg_v = -1; + //while (neg_v >= (-128)) // comparison is always true due to limited range of data type [-Werror=type-limits] + while (true) + { + if (hist[(uint8_t)neg_v] == 0) + { + neg_v--; + } + else + { + break; + } + } + //cmd_info->bias1 = (neg_v < -1 && neg_v >= -128) ? abs(neg_v + 1) : 0; + // comparison is always true due to limited range of data type [-Werror=type-limits] + cmd_info->bias1 = (neg_v < -1) ? abs(neg_v + 1) : 0; + cmd_info->signedness = true; + } + + if (isBfloat16 == true) + { + // center shift + int64_t exp_accum = 0; + uint16_t *bf16_in = (uint16_t *)ibuf; + size_t inum = (isz >> 1), cnt = 0; + for (size_t i = 0; i < inum; i++) + { + uint8_t exp = ((bf16_in[i] >> 7) & 0xFF); + if (exp != 0) + { + exp_accum += exp; + cnt++; + } + } + if (cnt > 0) + { + cmd_info->bias0 = (uint8_t)((exp_accum / (float)cnt) + 0.5); + } + cmd_info->zero_guard_en = (inum == cnt) ? false : true; + cmd_info->signedness = false; + } +} + #ifdef __cplusplus +} +#endif + +#endif /* __CVK_VLC_COMPRESS_H__ */ diff --git a/cvikernel/src/bm1822/bm_dmabuf.c b/cvikernel/src/bm1822/bm_dmabuf.c new file mode 100644 index 000000000..e718d76ba --- /dev/null +++ b/cvikernel/src/bm1822/bm_dmabuf.c @@ -0,0 +1,423 @@ +#include +#include +#include +#include + +#include "kernel_1822.h" +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask)) +#define ALIGN(x,a) __ALIGN_MASK(x,(__typeof__(x))(a)-1) + +#define BD_DESC_ALIGN_SIZE (1 << BDC_ENGINE_CMD_ALIGNED_BIT) +#define GDMA_DESC_ALIGN_SIZE (1 << TDMA_DESCRIPTOR_ALIGNED_BIT) +#define BD_EOD_PADDING_BYTES (128) +#define TPU_DMABUF_HEADER_M 0xB5B5 + +typedef struct { + cmd_hdr_t hdr; + uint32_t body[0]; +} DESC; + +// CPU_OP_SYNC structure +typedef struct { + uint32_t op_type; + uint32_t num_tiu; + uint32_t num_tdma; + uint32_t offset_tiu; + uint32_t offset_tdma; + uint32_t offset_tiu_ori_bk; + uint32_t offset_tdma_ori_bk; + char str[CPU_ENGINE_STR_LIMIT_BYTE]; +} __attribute__((packed)) cvi_cpu_desc_t; + +static DESC *traverse_start(uint8_t *cmdbuf) +{ + ASSERT(cmdbuf); + DESC *desc = (DESC *)cmdbuf; + ASSERT(desc->hdr.magic == CMDBUF_HDR_MAGIC_1822); + return desc; +} + +static DESC *traverse_next(DESC *desc, uint8_t *cmdbuf, uint32_t size) +{ + DESC *next_desc = (DESC *)((uint8_t *)desc + cmd_hdr_len(&desc->hdr) + sizeof(cmd_hdr_t)); + if ((uint8_t *)next_desc >= cmdbuf + size) + return NULL; + ASSERT(next_desc->hdr.magic == CMDBUF_HDR_MAGIC_1822); + return next_desc; +} + +static bool is_last_desc(DESC *desc, uint8_t *cmdbuf, uint32_t size) +{ + DESC *next_desc = traverse_next(desc, cmdbuf, size); + return next_desc ? 
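+/*
+ * Illustrative traversal of a command buffer with the helpers above
+ * (cmdbuf and size are caller-provided):
+ *
+ *   for (DESC *d = traverse_start(cmdbuf); d != NULL;
+ *        d = traverse_next(d, cmdbuf, size)) {
+ *     // d->hdr.engine_id is BMK1822_TIU, BMK1822_TDMA or BMK1822_CPU
+ *   }
+ */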
false : true; +} + +static void reorder_bd_cmdbuf_reg(uint8_t *cmdbuf) +{ + int total_bits = BD_REG_BYTES * 8; + + for (int i = 0; i < total_bits; i += 128) + cmdbuf[(i + 128 - 8) / 8] |= (i / 128) << 4; + + uint8_t tmp[128 / 8]; + uint8_t *last = &cmdbuf[(total_bits - 128) / 8]; + memcpy(tmp, last, sizeof(tmp)); + memcpy(last, cmdbuf, sizeof(tmp)); + memcpy(cmdbuf, tmp, sizeof(tmp)); +} + +static void adjust_desc_tdma(uint32_t *body, bool eod) +{ + if (eod) { + body[0] |= (1 << TDMA_ACCPI0_EOD_BIT); + body[0] |= (1 << TDMA_ACCPI0_INTERRUPT_BIT); // interrupt + } + body[0] |= (1 << TDMA_ACCPI0_BARRIER_ENABLE_BIT); +} + +static void adjust_desc_bd(uint32_t *body, bool eod) +{ + if (eod) { + tiu_reg_t reg; + parse_tiu_reg(®, body); + reg.cmd_end = 1; + reg.cmd_intr_en = 1; + emit_tiu_reg(®, body); + } + reorder_bd_cmdbuf_reg((uint8_t *)body); +} + +void bmk1822_dmabuf_relocate(uint8_t *dmabuf, uint64_t dmabuf_devaddr, uint32_t original_size, uint32_t pmubuf_size) +{ + dma_hdr_t *header = (dma_hdr_t *)dmabuf; + uint64_t tmpAddress = 0; + + ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M); + cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + + for (uint32_t i = 0; i < header->cpu_desc_count; i++, desc++) { + uint32_t tiu_num = desc->num_tiu & 0xFFFF; + uint32_t tdma_num = desc->num_tdma & 0xFFFF; + + if (tiu_num) { + tmpAddress = dmabuf_devaddr + desc->offset_tiu; + //printf("bd tmpAddress = 0x%lu\n", tmpAddress); + desc->offset_tiu_ori_bk = desc->offset_tiu; + desc->offset_tiu = tmpAddress >> BDC_ENGINE_CMD_ALIGNED_BIT; + } + + if (tdma_num) { + tmpAddress = dmabuf_devaddr + desc->offset_tdma; + //printf("tdma tmpAddress = 0x%lu\n", tmpAddress); + desc->offset_tdma_ori_bk = desc->offset_tdma; + desc->offset_tdma = tmpAddress >> TDMA_DESCRIPTOR_ALIGNED_BIT; + } + + //set pmubuf_addr_p to enable pmu kick + header->pmubuf_size = pmubuf_size; + header->pmubuf_offset = original_size; + } +} + +static uint32_t desc_sync_id(DESC *desc) +{ + switch (desc->hdr.engine_id) { + case BMK1822_TIU: { + tiu_reg_t reg; + parse_tiu_reg(®, desc->body); + return reg.cmd_id_tpu; + } + case BMK1822_TDMA: { + tdma_reg_t reg; + parse_tdma_reg(®, desc->body); + return reg.cmd_id; + } + default: + ASSERT(0); + return 1; + } +} + +static void fill_header_and_arm(uint8_t *cmdbuf, uint32_t sz, uint8_t *dmabuf, uint64_t *tiu_offset, uint64_t *tdma_offset) +{ + dma_hdr_t header = {0}; + header.dmabuf_magic_m = TPU_DMABUF_HEADER_M; + header.dmabuf_magic_s = 0x1822; + + cvi_cpu_desc_t *segments = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + DESC *desc = NULL; + uint32_t desc_nums[BMK1822_ENGINE_NUM] = {0}; + uint32_t counters[BMK1822_ENGINE_NUM] = {0}; + uint32_t desc_size[BMK1822_ENGINE_NUM] = {0}; + + ASSERT(segments); + // fill arm descs + desc = traverse_start(cmdbuf); + + while (desc != NULL) { + uint32_t engine_id = (uint32_t)desc->hdr.engine_id; + counters[engine_id]++; + desc_nums[engine_id]++; + if (engine_id != BMK1822_CPU) { + // a new arm desc inserted to do sync operation + if (desc_sync_id(desc) == 0xFFFF || is_last_desc(desc, cmdbuf, sz)) { + desc_nums[BMK1822_CPU]++; + cvi_cpu_desc_t *arm = segments + desc_nums[BMK1822_CPU] - 1; + memset(arm, 0, sizeof(cvi_cpu_desc_t)); + arm->op_type = CPU_OP_SYNC; + arm->num_tiu = counters[BMK1822_TIU]; + arm->num_tdma = counters[BMK1822_TDMA]; + strncpy(arm->str, "layer_end", sizeof(arm->str) - 1); + if (counters[BMK1822_TIU] != 0) { + desc_size[BMK1822_TIU] = + ALIGN(desc_size[BMK1822_TIU] + counters[BMK1822_TIU] * BD_REG_BYTES + 
BD_EOD_PADDING_BYTES, + BD_DESC_ALIGN_SIZE); + } + counters[BMK1822_TIU] = 0; + counters[BMK1822_TDMA] = 0; + } + } else { + cvi_cpu_desc_t *arm = segments + desc_nums[BMK1822_CPU] - 1; + memcpy(arm, &(desc->body), sizeof(cvi_cpu_desc_t)); + arm->num_tiu = counters[BMK1822_TIU]; + arm->num_tdma = counters[BMK1822_TDMA]; + if (counters[BMK1822_TIU] != 0) { + desc_size[BMK1822_TIU] = + ALIGN(desc_size[BMK1822_TIU] + counters[BMK1822_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES, + BD_DESC_ALIGN_SIZE); + } + counters[BMK1822_TIU] = 0; + counters[BMK1822_TDMA] = 0; + } + desc = traverse_next(desc, cmdbuf, sz); + } + desc_size[BMK1822_CPU] = desc_nums[BMK1822_CPU] * CPU_ENGINE_BYTES; + desc_size[BMK1822_TDMA] = desc_nums[BMK1822_TDMA] * GDMA_DESC_ALIGN_SIZE; + + (*tiu_offset) = ALIGN(sizeof(header) + desc_size[BMK1822_CPU], BD_DESC_ALIGN_SIZE); + (*tdma_offset) = ALIGN((*tiu_offset) + desc_size[BMK1822_TIU], GDMA_DESC_ALIGN_SIZE); + + // dma hdr + arm descs + bd descs + tdma descs + header.dmabuf_size = (*tdma_offset) + desc_size[BMK1822_TDMA]; + header.cpu_desc_count = desc_nums[BMK1822_CPU]; + header.bd_desc_count = desc_nums[BMK1822_TIU]; + header.tdma_desc_count = desc_nums[BMK1822_TDMA]; + + //printf("header.dmabuf_size = %d\n", header.dmabuf_size); + printf("header.cpu_desc_count = %d\n", header.cpu_desc_count); + printf("header.bd_desc_count = %d\n", header.bd_desc_count); + printf("header.tdma_desc_count = %d\n", header.tdma_desc_count); + + memcpy(dmabuf, &header, sizeof(header)); +} + +static void fill_bd_and_tdma(uint8_t *cmdbuf, uint32_t sz, uint8_t *dmabuf, uint64_t tiu_offset, uint64_t tdma_offset) +{ + dma_hdr_t *p_header = (dma_hdr_t *)dmabuf; + cvi_cpu_desc_t *segments = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + DESC *desc = traverse_start(cmdbuf); + //uint64_t address_max = 0x0; + + for (uint32_t i = 0; i < p_header->cpu_desc_count; i++) { + + cvi_cpu_desc_t *arm = segments + i; + + uint32_t tiu_num = arm->num_tiu & 0xFFFF; + uint32_t tdma_num = arm->num_tdma & 0xFFFF; + + if (tiu_num) { + tiu_offset = ALIGN(tiu_offset, 1 << BDC_ENGINE_CMD_ALIGNED_BIT); + arm->offset_tiu = tiu_offset; + //printf("arm->offset_tiu = 0x%x \n", arm->offset_tiu); + } + + if (tdma_num) { + tdma_offset = ALIGN(tdma_offset, 1 << TDMA_DESCRIPTOR_ALIGNED_BIT); + arm->offset_tdma = tdma_offset; + //printf("arm->offset_tdma = 0x%x \n", arm->offset_tdma); + } + + while (tiu_num || tdma_num) { + uint32_t engine_id = (uint32_t)desc->hdr.engine_id; + void *p_body = NULL; + + switch (engine_id) { + case BMK1822_TIU: + tiu_num--; + p_body = (void *)(dmabuf + tiu_offset); + tiu_offset += BD_REG_BYTES; + memcpy(p_body, desc->body, desc->hdr.len); + adjust_desc_bd((uint32_t *)p_body, tiu_num == 0); + break; + case BMK1822_TDMA: + tdma_num--; + tdma_offset = ALIGN(tdma_offset, GDMA_DESC_ALIGN_SIZE); + p_body = (void *)(dmabuf + tdma_offset); + tdma_offset += GDMA_DESC_ALIGN_SIZE; + memcpy(p_body, desc->body, desc->hdr.len); + +#if 0 //debug feature, for checking if neuron overshoot +{ + tdma_reg_t reg_tdma = {0}; + uint64_t tdma_address = 0, tdma_address2 = 0; + + parse_tdma_reg(®_tdma, p_body); + + if (reg_tdma.src_base_reg_sel == 0) { + // reg.trans_dir = 2; // 0:tg2l, 1:l2tg, 2:g2g, 3:l2l + if (reg_tdma.trans_dir == 0) { + printf ("src_base_addr_high=%x, src_base_addr_low=%x\n", reg_tdma.src_base_addr_high, reg_tdma.src_base_addr_low); + tdma_address = ((uint64_t)reg_tdma.src_base_addr_high) << 32 | (uint64_t)reg_tdma.src_base_addr_low; + } else if (reg_tdma.trans_dir == 1) { + printf 
("dst_base_addr_high=%x, dst_base_addr_low=%x\n", reg_tdma.dst_base_addr_high, reg_tdma.dst_base_addr_low); + tdma_address = ((uint64_t)reg_tdma.dst_base_addr_high) << 32 | (uint64_t)reg_tdma.dst_base_addr_low; + } else if (reg_tdma.trans_dir == 2) { + printf ("dst_base_addr_high=%x, dst_base_addr_low=%x\n", reg_tdma.dst_base_addr_high, reg_tdma.dst_base_addr_low); + tdma_address = ((uint64_t)reg_tdma.src_base_addr_high) << 32 | (uint64_t)reg_tdma.src_base_addr_low; + tdma_address2 = ((uint64_t)reg_tdma.dst_base_addr_high) << 32 | (uint64_t)reg_tdma.dst_base_addr_low; + + if (tdma_address2 > tdma_address) { + tdma_address = tdma_address2; + } + } + + if (tdma_address > address_max) { + address_max = tdma_address; + printf("address_max=%llx\n", address_max); + } + } +} +#endif + adjust_desc_tdma((uint32_t *)p_body, tdma_num == 0); + break; + default: + break; + } + desc = traverse_next(desc, cmdbuf, sz); + } + + // padding zero after eod to workaroud hardware bug + if (arm->num_tiu & 0xFFFF) { + void *buf = (void *)(dmabuf + tiu_offset); + memset(buf, 0, BD_EOD_PADDING_BYTES); + tiu_offset += BD_EOD_PADDING_BYTES; + } + } + +} + +void bmk1822_dmabuf_convert(uint8_t *cmdbuf, uint32_t sz, uint8_t *dmabuf) +{ + uint64_t tiu_offset = 0; + uint64_t tdma_offset = 0; + fill_header_and_arm(cmdbuf, sz, dmabuf, &tiu_offset, &tdma_offset); + fill_bd_and_tdma(cmdbuf, sz, dmabuf, tiu_offset, tdma_offset); + return; +} + +#define PER_DES_SIZE 16 +#define PADDING_SIZE (1024 * 1024) +void bmk1822_dmabuf_size(uint8_t *cmdbuf, uint32_t sz, uint32_t *psize, uint32_t *pmu_size) +{ + uint32_t tdma_desc_num = {0}; + uint32_t counters[BMK1822_ENGINE_NUM] = {0}; + uint32_t bd_size = 0; + uint32_t dmabuf_size = 0; + + uint32_t tiu_cnt = 0; + uint32_t tdma_cnt = 0; + + // calculate desc numbers + DESC *desc = traverse_start(cmdbuf); + + while (desc != NULL) { + uint32_t engine_id = (uint32_t)desc->hdr.engine_id; + counters[engine_id]++; + if (engine_id != BMK1822_CPU) { + // a new arm desc inserted to do sync operation + if (desc_sync_id(desc) == 0xFFFF || is_last_desc(desc, cmdbuf, sz)) { + counters[BMK1822_CPU]++; + tdma_desc_num += counters[BMK1822_TDMA]; + if (counters[BMK1822_TIU] != 0) { + bd_size = ALIGN(bd_size + counters[BMK1822_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES, + BD_DESC_ALIGN_SIZE); + } + tiu_cnt += counters[BMK1822_TIU] & 0xFFFF; + tdma_cnt += counters[BMK1822_TDMA] & 0xFFFF; + counters[BMK1822_TIU] = 0; + counters[BMK1822_TDMA] = 0; + } + } else { + tdma_desc_num += counters[BMK1822_TDMA]; + if (counters[BMK1822_TIU] != 0) { + bd_size = ALIGN(bd_size + counters[BMK1822_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES, + BD_DESC_ALIGN_SIZE); + } + tiu_cnt += counters[BMK1822_TIU] & 0xFFFF; + tdma_cnt += counters[BMK1822_TDMA] & 0xFFFF; + counters[BMK1822_TIU] = 0; + counters[BMK1822_TDMA] = 0; + } + desc = traverse_next(desc, cmdbuf, sz); + } + // dma hdr + arm descs + bd descs + tdma descs + dmabuf_size = sizeof(dma_hdr_t) + counters[BMK1822_CPU] * CPU_ENGINE_BYTES; + dmabuf_size = ALIGN(dmabuf_size, BD_DESC_ALIGN_SIZE) + bd_size; + dmabuf_size = ALIGN(dmabuf_size, GDMA_DESC_ALIGN_SIZE) + tdma_desc_num * GDMA_DESC_ALIGN_SIZE; + + *psize = dmabuf_size; + + *pmu_size = ALIGN((tiu_cnt + tdma_cnt) * PER_DES_SIZE + PADDING_SIZE, 0x1000); +} + +void bmk1822_arraybase_set(uint8_t *dmabuf, uint32_t arraybase0L, uint32_t arraybase1L, uint32_t arraybase0H, uint32_t arraybase1H) +{ + ASSERT(dmabuf); + dma_hdr_t *header = (dma_hdr_t *)dmabuf; + + ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M); + 
header->arraybase_0_L = arraybase0L; + header->arraybase_1_L = arraybase1L; + header->arraybase_0_H = arraybase0H; + header->arraybase_1_H = arraybase1H; + return; +} + +void bmk1822_dmabuf_dump(uint8_t *dmabuf) +{ + ASSERT(dmabuf); + dma_hdr_t *header = (dma_hdr_t *)dmabuf; + //printf("bmk1822_dmabuf_dump header->arraybase_0_L = 0x%x\n", header->arraybase_0_L); + //printf("bmk1822_dmabuf_dump header->arraybase_1_L = 0x%x\n", header->arraybase_1_L); + //printf("bmk1822_dmabuf_dump header->arraybase_0_H = 0x%x\n", header->arraybase_0_H); + //printf("bmk1822_dmabuf_dump header->arraybase_1_H = 0x%x\n", header->arraybase_1_H); + //printf("bmk1822_dmabuf_dump header->pmubuf_offset = 0x%x\n", header->pmubuf_offset); + + ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M); + cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + + for (u32 i = 0; i < header->cpu_desc_count; i++, desc++) { + int bd_num = desc->num_tiu & 0xFFFF; + int tdma_num = desc->num_tdma & 0xFFFF; + u32 bd_offset = desc->offset_tiu; + u32 tdma_offset = desc->offset_tdma; + printf("bmk1822_dmabuf_dump num, offset<0x%08x, 0x%08x>\n", bd_num, tdma_num, bd_offset, tdma_offset); + } +} + +#ifdef __cplusplus +} +#endif + diff --git a/cvikernel/src/bm1822/bm_kernel.c b/cvikernel/src/bm1822/bm_kernel.c new file mode 100644 index 000000000..b2017cfe5 --- /dev/null +++ b/cvikernel/src/bm1822/bm_kernel.c @@ -0,0 +1,586 @@ +#include +#include "kernel_1822.h" +#include + +static void replace_cmd_id(uint32_t *desc, uint32_t eng_id, uint16_t ids[]) +{ + if (eng_id == BMK1822_TIU) { + tiu_reg_t reg; + parse_tiu_reg(®, desc); + reg.cmd_id_en = 1; + reg.cmd_id_tpu = ids[eng_id]; + reg.cmd_id_gdma = ids[BMK1822_TDMA]; + emit_tiu_reg(®, desc); + } else if (eng_id == BMK1822_TDMA) { + tdma_reg_t tdma_reg; + parse_tdma_reg(&tdma_reg, desc); + tdma_reg.cmd_id = ids[eng_id]; + tdma_reg.wait_id_tpu = ids[BMK1822_TIU]; + tdma_reg.bar_en = 1; + emit_tdma_reg(&tdma_reg, desc); + } +} + +static int bm1822_get_engine_desc_length(uint32_t engine_id) +{ + switch (engine_id) { + case BMK1822_TIU: + return TIU_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t); + case BMK1822_TDMA: + return TDMA_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t); + case BMK1822_CPU: + return CPU_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t); + default: + ASSERT(0); + } +} + +// Estimate the number of command descriptor based on buffer size provided +// by the user. +uint32_t bmk1822_estimate_nr_desc(ctx_t *k) +{ + uint32_t tiu_desc_len = bm1822_get_engine_desc_length(BMK1822_TIU); + uint32_t tdma_desc_len = bm1822_get_engine_desc_length(BMK1822_TDMA); + uint32_t hdr_len = sizeof(cmd_hdr_t); + + uint32_t desc_len = + (tiu_desc_len > tdma_desc_len) ? 
tiu_desc_len : tdma_desc_len; + + return k->info.cmdbuf_size / (desc_len + hdr_len); +} + +static void kernel_init(ctx_t *k, bmk_info_t *info) +{ + k->info = *info; + ASSERT(info->chip_version == BM1822_VER); + k->chip_info = bmk1822_chip_info(); + + uint32_t max_nr_desc = bmk1822_estimate_nr_desc(k); + ec_init(&k->ec, BMK1822_ENGINE_NUM, max_nr_desc); + mode_manager_init(&k->mode_manager, &k->ec, BMK1822_ENGINE_NUM); + + k->cmdbuf_ptr = 0; + k->max_nr_desc = max_nr_desc; + k->cur_nr_desc = 0; + k->desc_pairs = xmalloc(max_nr_desc * sizeof(k->desc_pairs[0])); + + k->lmem_ptr = 0; +} + +static void kernel_destroy(ctx_t *k) +{ + free(k->desc_pairs); + ec_destroy(&k->ec); + mode_manager_destroy(&k->mode_manager); +} + +static void kernel_reset(ctx_t *k) +{ + k->cur_nr_desc = 0; + k->cmdbuf_ptr = 0; + + ec_reset(&k->ec); + mode_manager_reset(&k->mode_manager); +} + +static cmd_hdr_t * kernel_alloc_cmd_hdr( + ctx_t *k, uint8_t eng_id, uint32_t desc_len) +{ + uint32_t free_len = k->info.cmdbuf_size - k->cmdbuf_ptr; + uint32_t hdr_len = sizeof(cmd_hdr_t); + uint32_t total_len = hdr_len + desc_len; + ASSERT(total_len <= free_len); + + cmd_hdr_t *hdr = (cmd_hdr_t *)&k->info.cmdbuf[k->cmdbuf_ptr]; + hdr->magic = CMDBUF_HDR_MAGIC_1822; + hdr->len = desc_len; + hdr->engine_id = eng_id; + hdr->__deprecated = 0; // for valgrind + hdr->flags = 0; + hdr->mask = 0; + + k->cmdbuf_ptr += total_len; + return hdr; +} + +static desc_pair_t * kernel_alloc_desc_pair(ctx_t *k, uint8_t eng_id) +{ + ASSERT(eng_id < BMK1822_ENGINE_NUM); + ASSERT(k->cur_nr_desc < k->max_nr_desc); + + uint32_t desc_len = bm1822_get_engine_desc_length(eng_id); + desc_pair_t *dp = &k->desc_pairs[k->cur_nr_desc++]; + dp->cmd_hdr = kernel_alloc_cmd_hdr(k, eng_id, desc_len); + dp->ec_desc = ec_alloc_desc(&k->ec, eng_id); + + mode_manager_record_ec_desc(&k->mode_manager, dp->ec_desc); + return dp; +} + +static void kernel_update_sync_id(ctx_t *k) +{ + ec_compute_sync_ids(&k->ec); + + for (uint32_t di = 0; di < k->cur_nr_desc; di++) { + desc_pair_t *dp = &k->desc_pairs[di]; + uint8_t eng_id = dp->ec_desc->engine_id; + uint32_t *desc = (uint32_t *)dp->cmd_hdr->cmd; + replace_cmd_id(desc, eng_id, dp->ec_desc->sync_ids); + } +} + +void bmk1822_add_dependency( + ctx_t *ctx, + bmk1822_op_t *before, + bmk1822_op_t *after) +{ + ec_add_dependency(&ctx->ec, before, after); +} + +desc_pair_t * bm1822_get_desc_pair(ctx_t *k, uint8_t eng_id) +{ + if (eng_id == BMK1822_CPU) { + kernel_update_sync_id(k); + k->cur_nr_desc = 0; + + ec_reset(&k->ec); + mode_manager_restart_sync_id(&k->mode_manager); + } + + return kernel_alloc_desc_pair(k, eng_id); +} + +ctx_t * bmk1822_register(bmk_info_t *info) +{ + ASSERT(info); + ASSERT(info->cmdbuf); + ASSERT(info->cmdbuf_size > 0); + ctx_t *k = xmalloc(sizeof(*k)); + kernel_init(k, info); + return k; +} + +void bmk1822_cleanup(ctx_t *ctx) +{ + ASSERT(ctx); + + ctx_t *k = (typeof(k))ctx; + + kernel_destroy(k); + free(k); +} + +void bmk1822_reset(ctx_t *ctx) +{ + ctx_t *k = (typeof(k))ctx; + kernel_reset(k); +} + +uint8_t *bmk1822_acquire_cmdbuf(ctx_t *ctx, uint32_t *size) +{ + ctx_t *k = (typeof(k))ctx; + + *size = k->cmdbuf_ptr; + kernel_update_sync_id(k); + return k->info.cmdbuf; +} + +void bmk1822_parallel_enable(ctx_t *ctx) +{ + ctx_t *k = (typeof(k))ctx; + mode_manager_enable_parallel(&k->mode_manager); +} + +void bmk1822_set_op(ctx_t *ctx, void* op) +{ + ctx_t *k = (typeof(k))ctx; + k->op = op; +} + +void* bmk1822_get_op(ctx_t *ctx) +{ + ctx_t *k = (typeof(k))ctx; + return k->op; +} + +void 
bmk1822_parallel_disable(ctx_t *ctx) +{ + ctx_t *k = (typeof(k))ctx; + mode_manager_disable_parallel(&k->mode_manager); +} + +void bmk1822_create_streams(ctx_t *ctx, int nr_streams) +{ + ctx_t *k = (typeof(k))ctx; + mode_manager_create_streams(&k->mode_manager, nr_streams); +} + +void bmk1822_set_layer_id(ctx_t *ctx, uint16_t layer_id) +{ + ctx_t *k = (typeof(k))ctx; + k->layer_id = layer_id; +} + +uint16_t bmk1822_layer_id(ctx_t *ctx) +{ + ctx_t *k = (typeof(k))ctx; + return k->layer_id; +} + +void bmk1822_destroy_streams(ctx_t *ctx) +{ + ctx_t *k = (typeof(k))ctx; + mode_manager_destroy_streams(&k->mode_manager); +} + +void bmk1822_set_stream(ctx_t *ctx, int i) +{ + ctx_t *k = (typeof(k))ctx; + mode_manager_set_stream(&k->mode_manager, i); +} + +static bmk1822_chip_info_t bm1822_chip_info = { + .version = BM1822_VER, + .npu_num = BM1822_HW_NPU_NUM, + .eu_num = BM1822_HW_EU_NUM, + .lmem_size = BM1822_HW_LMEM_SIZE, + .lmem_banks = BM1822_HW_LMEM_BANKS, + .lmem_bank_size = BM1822_HW_LMEM_BANK_SIZE, + .lmem_start = BM1822_HW_LMEM_START_ADDR, + .gmem_start = BM1822_GLOBAL_MEM_START_ADDR, + .gmem_size = BM1822_GLOBAL_MEM_SIZE, +}; + +bmk1822_chip_info_t bmk1822_chip_info(void) +{ + return bm1822_chip_info; +} + +bmk1822_tensor_lmem_t * bmk1822_lmem_alloc_tensor( + ctx_t *ctx, + bmk1822_tensor_lmem_shape_t s, + fmt_t fmt, + int eu_align) +{ + ctx_t *k = (typeof(k))ctx; + uint32_t lmem_size = k->chip_info.lmem_size; + uint32_t eu_num = k->chip_info.eu_num; + + bmk1822_tensor_lmem_t *t = xmalloc(sizeof(*t)); + memset(t, 0, sizeof(*t)); + t->start_address = k->lmem_ptr; + t->fmt = fmt; + t->cmprs_fmt = fmt; + t->shape = s; + t->eu_align = eu_align; + t->stride = bmk1822_tensor_lmem_default_stride(ctx, s, fmt, eu_align); + + uint32_t needed = align_up(t->shape.n * t->stride.n, eu_num); + if ((lmem_size - k->lmem_ptr < needed) || !needed) { + free(t); + return NULL; + } + + k->lmem_ptr += needed; + return t; +} + +void bmk1822_lmem_init_tensor( + ctx_t *ctx, + bmk1822_tensor_lmem_t *tl, + bmk1822_tensor_lmem_shape_t shape, + fmt_t fmt, + int eu_align) +{ + memset(tl, 0, sizeof(*tl)); + tl->fmt = fmt; + tl->shape = shape; + tl->eu_align = eu_align; + tl->stride = bmk1822_tensor_lmem_default_stride(ctx, shape, fmt, eu_align); +} + +// Provide the unified api for tensor size calculation. +// Must have the same logic as bmk1822_lmem_bf16_alloc_tensor. +// The backed does not need to duplicate the related code. +uint32_t bmk1822_lmem_tensor_to_size( + ctx_t *ctx, + bmk1822_tensor_lmem_shape_t s, + fmt_t fmt, int eu_align) +{ + ctx_t *k = (typeof(k))ctx; + uint32_t eu_num = k->chip_info.eu_num; + + bmk1822_tensor_lmem_stride_t stride; + stride = bmk1822_tensor_lmem_default_stride(ctx, s, fmt, eu_align); + + uint32_t needed = align_up(s.n * stride.n, eu_num); + + return needed; +} + +bmk1822_tensor_lmem_t * bmk1822_lmem_alloc_ps32_tensor( + bmk1822_context_t *ctx, + bmk1822_tensor_lmem_shape_t s, + fmt_t fmt, + int eu_align) +{ + /* Partial sum is located in lmem in 32-bit format, so we times n to 2 to + * spare a sapce for it. 
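+ * (n is scaled by bitsize_of_fmt(FMT_I32) / bitsize_of_fmt(fmt): x4 for 8-bit data, x2 for bf16.)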
+ */ + + uint32_t prev_n; + + prev_n = s.n; + s.n = s.n * (bitsize_of_fmt(FMT_I32) / bitsize_of_fmt(fmt)); + bmk1822_tensor_lmem_t *res = bmk1822_lmem_alloc_tensor(ctx, s, fmt, eu_align); + if(res == NULL) + ASSERT(0); + res->shape.n = prev_n; + return res; +} + +void bmk1822_lmem_free_tensor( + ctx_t *ctx, const bmk1822_tensor_lmem_t *t) +{ + ASSERT(t->start_address < ctx->lmem_ptr); + ctx->lmem_ptr = t->start_address; + + free((void *)t); +} + +bmk1822_matrix_lmem_t * bmk1822_lmem_alloc_matrix( + ctx_t *ctx, + bmk1822_matrix_lmem_shape_t s, + fmt_t fmt, + int eu_align) +{ + uint32_t lmem_size = ctx->chip_info.lmem_size; + uint32_t npu_num = ctx->chip_info.npu_num; + uint32_t eu_num = ctx->chip_info.eu_num; + uint32_t val = (fmt == FMT_BF16) ? 2 : 1; + + bmk1822_matrix_lmem_t *t = xmalloc(sizeof(*t)); + memset(t, 0, sizeof(*t)); + t->start_address = ctx->lmem_ptr; + t->fmt = fmt; + t->shape = s; + t->stride.h = s.w * val; + if (eu_align) + t->stride.c = align_up(s.w * val, eu_num); + else + t->stride.c = s.w * val; + t->stride.n = t->stride.c * ceiling_func(s.c, npu_num); + t->eu_align = eu_align; + + uint32_t needed = align_up(t->shape.n * t->stride.n, eu_num); + if (lmem_size - ctx->lmem_ptr < needed) { + free(t); + return NULL; + } + ctx->lmem_ptr += needed; + return t; +} + +void bmk1822_lmem_init_matrix( + ctx_t *ctx, + bmk1822_matrix_lmem_t *ml, + bmk1822_matrix_lmem_shape_t shape, + fmt_t fmt, + int eu_align) +{ + memset(ml, 0, sizeof(*ml)); + ml->fmt = fmt; + ml->shape = shape; + ml->stride = bmk1822_matrix_lmem_default_stride(ctx, shape, fmt, eu_align); + ml->eu_align = eu_align; +} + +// Provide the unified api for matrix size calculation. +// Must have the same logic as bmk1822_lmem_alloc_matrix. +// The backed does not need to duplicate the related code. +uint32_t bmk1822_lmem_matrix_to_size( + ctx_t *ctx, + bmk1822_matrix_lmem_shape_t s, + fmt_t fmt, + int eu_align) { + uint32_t npu_num = ctx->chip_info.npu_num; + uint32_t eu_num = ctx->chip_info.eu_num; + uint32_t val = (fmt == FMT_BF16) ? 2 : 1; + + bmk1822_matrix_lmem_t t; + t.fmt = fmt; + t.shape = s; + t.stride.h = s.w * val; + if (eu_align) + t.stride.c = align_up(s.w * val, eu_num); + else + t.stride.c = s.w * val; + t.stride.n = t.stride.c * ceiling_func(s.c, npu_num); + + uint32_t needed = align_up(t.shape.n * t.stride.n, eu_num); + + return needed; +} + +bmk1822_matrix_lmem_t * bmk1822_lmem_alloc_ps32_matrix( + bmk1822_context_t *ctx, + bmk1822_matrix_lmem_shape_t s, + fmt_t fmt, + int eu_align) +{ + /* Partial sum is located in lmem in 32-bit format, so we times n to 4 to + * spare a sapce for it. + */ + + uint32_t prev_n; + + prev_n = s.n; + s.n = s.n * (bitsize_of_fmt(FMT_I32) / bitsize_of_fmt(fmt)); + bmk1822_matrix_lmem_t *res = bmk1822_lmem_alloc_matrix(ctx, s, fmt, eu_align); + if(res == NULL) + ASSERT(0); + res->shape.n = prev_n; + return res; +} + +// Provide the unified api for matrix size calculation. +// Must have the same logic as bmk1822_lmem_alloc_ps32_matrix. +// The backed does not need to duplicate the related code. +uint32_t bmk1822_lmem_ps32_matrix_to_size( + bmk1822_context_t *ctx, + bmk1822_matrix_lmem_shape_t s, + fmt_t fmt, + int eu_align) +{ + /* Partial sum is located in lmem in 32-bit format, so we times n to 4 to + * spare a sapce for it. 
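+ * (same scaling as the ps32 tensor case: x4 for 8-bit data, x2 for bf16.)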
+ */ + + s.n = s.n * (bitsize_of_fmt(FMT_I32) / bitsize_of_fmt(fmt)); + + return bmk1822_lmem_matrix_to_size(ctx, s, fmt, eu_align); +} + +void bmk1822_lmem_free_matrix( + ctx_t *ctx, const bmk1822_matrix_lmem_t *t) +{ + ASSERT(t->start_address < ctx->lmem_ptr); + ctx->lmem_ptr = t->start_address; + free((void *)t); +} + +bmk1822_tensor_lmem_stride_t bmk1822_tensor_lmem_default_stride( + ctx_t *ctx, + bmk1822_tensor_lmem_shape_t s, + fmt_t fmt_type, + int eu_align) +{ + bmk1822_tensor_lmem_stride_t stride; + uint32_t eu_num = ctx->chip_info.eu_num; + uint32_t npu_num = ctx->chip_info.npu_num; + uint32_t fmt = (fmt_type == FMT_BF16) ? 2 : 1; + stride.w = fmt; + stride.h = s.w * fmt; + if (eu_align) + stride.c = align_up(s.h * s.w * fmt, eu_num); + else + stride.c = s.h * s.w * fmt; + + stride.n = stride.c * ceiling_func(s.c, npu_num); +// printf("bmk1822_tensor_lmem_default_stride stride n=%x c=%x h=%x w=%x\n", stride.n , stride.c , stride.h, stride.w); + return stride; +} + +bmk1822_tensor_tgmem_stride_t bmk1822_tensor_tgmem_default_stride( + bmk1822_tensor_tgmem_shape_t s, fmt_t fmt_type) +{ + uint32_t data_type_size = (fmt_type == FMT_BF16) ? 2 : 1; + bmk1822_tensor_tgmem_stride_t stride; + stride.h = s.w * data_type_size; + stride.c = s.h * stride.h; + stride.n = s.c * stride.c; + return stride; +} + +static void try_optimize_matrix_shape(ctx_t *ctx, + bmk1822_matrix_lmem_shape_t *s, + fmt_t fmt_type) { + uint32_t eu_num = ctx->chip_info.eu_num; + uint32_t npu_num = ctx->chip_info.npu_num; + uint32_t col = s->col; + bool isBf16 = (fmt_type == FMT_BF16); + uint32_t workingNumber = isBf16 ? eu_num / 2 : eu_num; + + if (col >= workingNumber) { + int num_eu = ceiling_func(col, workingNumber * npu_num); + s->w = workingNumber * num_eu; + s->c = ceiling_func(col, s->w); + } else { + // col < EU_NUM + // Only transfer needed data + // We still change tensor shape in TIU mac op + s->w = col; + s->c = 1; + } +} + +bmk1822_matrix_lmem_shape_t bmk1822_matrix_lmem_default_shape( + ctx_t *ctx, + uint32_t row, + uint32_t col, + fmt_t fmt_type) +{ + bmk1822_matrix_lmem_shape_t s = {0}; + s.n = row; + s.col = col; + + try_optimize_matrix_shape(ctx, &s, fmt_type); + + return s; +} + +bmk1822_matrix_lmem_shape_t bmk1822_matrix_lmem_shape_t1( + ctx_t *ctx, + uint32_t len, + fmt_t fmt_type) +{ + uint32_t lmem_size = ctx->chip_info.lmem_size; + bmk1822_matrix_lmem_shape_t s = {0}; + + uint32_t row = 1; + uint32_t col = len; + + while (col >= lmem_size) { + ASSERT(col % 2 == 0); + col /= 2; + row *= 2; + } + + s.n = row; + s.col = col; + + try_optimize_matrix_shape(ctx, &s, fmt_type); + return s; +} + +// This should be inside bmk1822_lmem_alloc_matrix +bmk1822_matrix_lmem_stride_t bmk1822_matrix_lmem_default_stride( + ctx_t *ctx, + bmk1822_matrix_lmem_shape_t s, + fmt_t fmt, + int eu_align) +{ + uint32_t npu_num = ctx->chip_info.npu_num; + uint32_t eu_num = ctx->chip_info.eu_num; + uint32_t val = (fmt == FMT_BF16) ? 
2 : 1; + + bmk1822_matrix_lmem_stride_t stride; + stride.h = s.w * val; + if (eu_align) + stride.c = align_up(s.w * val, eu_num); + else + stride.c = s.w * val; + stride.n = stride.c * ceiling_func(s.c, npu_num); + + return stride; +} diff --git a/cvikernel/src/bm1822/kernel_1822.h b/cvikernel/src/bm1822/kernel_1822.h new file mode 100644 index 000000000..5228906f5 --- /dev/null +++ b/cvikernel/src/bm1822/kernel_1822.h @@ -0,0 +1,374 @@ +#ifndef KERNEL_1822_H +#define KERNEL_1822_H + +#include "kernel_internal.h" + +#include +#include +#include +#include +#include +#include +#include +#include "bmkernel_standard.h" + +#include + +#define TENSOR_MUL_FIX8B 0 +#define TENSOR_MAC_FIX8B 1 +#define TENSOR_ADD_FIX8B 2 +#define TENSOR_SUB_FIX8B 3 +#define TENSOR_MAX_FIX8B 4 +#define TENSOR_MIN_FIX8B 5 +#define TENSOR_SHIFT_FIX8B 6 +#define TENSOR_AND_FIX8B 7 +#define TENSOR_OR_FIX8B 8 +#define TENSOR_XOR_FIX8B 9 +#define TENSOR_COPY_FIX8B 10 +#define TENSOR_GE_FIX8B 11 + +typedef bmk1822_tensor_lmem_shape_t tl_shape_t; +typedef bmk1822_matrix_lmem_shape_t ml_shape_t; +typedef bmk1822_tensor_tgmem_shape_t tg_shape_t; +typedef bmk1822_matrix_tgmem_shape_t mg_shape_t; + +typedef bmk1822_tensor_lmem_stride_t tl_stride_t; + +typedef bmk1822_tensor_lmem_t tl_t; +typedef bmk1822_matrix_lmem_t ml_t; +typedef bmk1822_tensor_tgmem_t tg_t; +typedef bmk1822_matrix_tgmem_t mg_t; +typedef bmk1822_compressed_tensor_tgmem_t compressed_tg_t; +typedef bmk1822_compressed_matrix_tgmem_t compressed_mg_t; + +desc_pair_t * bm1822_get_desc_pair(ctx_t *k, uint8_t eng_id); + +static inline void assert_same_stride(const tl_t *a, const tl_t *b) +{ + ASSERT(a->stride.n == b->stride.n); + ASSERT(a->stride.c == b->stride.c); + ASSERT(a->stride.h == b->stride.h); + ASSERT(a->stride.w == b->stride.w); +} + +static inline void assert_same_shape(const tl_t *a, const tl_t *b) +{ + ASSERT(a->shape.n == b->shape.n); + ASSERT(a->shape.c == b->shape.c); + ASSERT(a->shape.h == b->shape.h); + ASSERT(a->shape.w == b->shape.w); +} + +static inline void assert_same_shape_3( + const tl_t *a, + const tl_t *b, + const tl_t *c) +{ + assert_same_shape(a, b); + assert_same_shape(a, c); +} + +static inline void assert_same_shape_4( + const tl_t *a, + const tl_t *b, + const tl_t *c, + const tl_t *d) +{ + assert_same_shape_3(a, b, c); + assert_same_shape(a, d); +} + +static inline void assert_same_shape_5( + const tl_t *t0, + const tl_t *t1, + const tl_t *t2, + const tl_t *t3, + const tl_t *t4) +{ + assert_same_shape_3(t0, t1, t2); + assert_same_shape_3(t0, t3, t4); +} + +static inline void assert_same_shape_6( + const tl_t *t0, + const tl_t *t1, + const tl_t *t2, + const tl_t *t3, + const tl_t *t4, + const tl_t *t5) +{ + assert_same_shape_5(t0, t1, t2, t3, t4); + assert_same_shape(t0, t5); +} + + +static inline void assert_tiu_tensor_shape(const tl_t *t) +{ + ASSERT(t->shape.n > 0); + ASSERT(t->shape.c > 0); + ASSERT(t->shape.h > 0); + ASSERT(t->shape.w > 0); + + ASSERT(t->shape.n < 0x1000); + ASSERT(t->shape.c < 0x1000); + ASSERT(t->shape.h <= (4095-32)); // 12bit, max 4095-32(lanes) + ASSERT(t->shape.w <= (4095-32)); // 12bit, max 4095-32(lanes) +} + +static inline void check_tiu_tensor(const tl_t *t) +{ + ASSERT(t); + assert_tiu_tensor_shape(t); + ASSERT(t->fmt == FMT_I8 || t->fmt == FMT_U8 || t->fmt == FMT_BF16); +} + +static inline void check_tiu_tensor_2( + const tl_t *t0, + const tl_t *t1) +{ + check_tiu_tensor(t0); + check_tiu_tensor(t1); +} + +static inline void check_tiu_tensor_3( + const tl_t *t0, + const tl_t *t1, + const tl_t *t2) +{ + 
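+ /* Check shape ranges and supported formats (i8/u8/bf16) for all three tensors. */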
check_tiu_tensor(t0); + check_tiu_tensor_2(t1, t2); +} + +static inline void check_tiu_tensor_4( + const tl_t *t0, + const tl_t *t1, + const tl_t *t2, + const tl_t *t3) +{ + check_tiu_tensor_3(t0, t1, t2); + check_tiu_tensor(t3); +} + +static inline void check_tiu_tensor_5( + const tl_t *t0, + const tl_t *t1, + const tl_t *t2, + const tl_t *t3, + const tl_t *t4) +{ + check_tiu_tensor_3(t0, t1, t2); + check_tiu_tensor_2(t3, t4); +} + +static inline void check_tiu_tensor_6( + const tl_t *t0, + const tl_t *t1, + const tl_t *t2, + const tl_t *t3, + const tl_t *t4, + const tl_t *t5) +{ + check_tiu_tensor_3(t0, t1, t2); + check_tiu_tensor_3(t3, t4, t5); +} + +static inline void check_16bit_tiu_tensor(const tl_t *low, const tl_t *high) +{ + check_tiu_tensor_2(low, high); + assert_same_shape(low, high); + assert_same_stride(low, high); + ASSERT(low->fmt == high->fmt); + ASSERT(low->start_address < high->start_address); +} + +static inline void assert_stride_type_0(ctx_t *ctx, const tl_t *t) +{ + uint32_t eu_num = ctx->chip_info.eu_num; + uint32_t fmt = (t->fmt == FMT_BF16) ? 2 : 1; + + uint32_t h = t->shape.h; + uint32_t w = t->shape.w * fmt; + uint32_t c_stride = align_up(h * w, eu_num); + + ASSERT(t->stride.c == c_stride); + ASSERT(t->stride.h == w); + ASSERT(t->stride.w == fmt); +} + +static inline void assert_bf16_stride_type_0(ctx_t *ctx, const tl_t *t) +{ + uint32_t eu_num = ctx->chip_info.eu_num; + uint32_t fmt = (t->fmt == FMT_BF16) ? 2 : 1; + + ASSERT(t->stride.c % eu_num == 0); + ASSERT(t->stride.w == fmt); +} + + +static inline void assert_stride_type_2(ctx_t *ctx, const tl_t *t) +{ + ASSERT(t->shape.h == 1); + ASSERT(t->shape.w == 1); + + uint32_t fmt = (t->fmt == FMT_BF16) ? 2 : 1; + uint32_t c = t->shape.c; + uint32_t npu_num = ctx->chip_info.npu_num; + + ASSERT(t->stride.n == fmt * align_up(c, npu_num) / npu_num); + ASSERT(t->stride.c == 1 * fmt); + ASSERT(t->stride.h == 1 * fmt); + ASSERT(t->stride.w == 1 * fmt); +} + +static inline void assert_bf16_stride_type_2(ctx_t *ctx, const tl_t *t) +{ + ASSERT(t->shape.h == 1); + ASSERT(t->shape.w == 1); + + uint32_t fmt = (t->fmt == FMT_BF16) ? 
2 : 1; + uint32_t c = t->shape.c; + uint32_t npu_num = ctx->chip_info.npu_num; + + ASSERT(t->stride.n == fmt * align_up(c, npu_num) / npu_num); + ASSERT(t->stride.c == 1 * fmt); + ASSERT(t->stride.h == 1 * fmt); + ASSERT(t->stride.w == 1 * fmt); +} + +static inline int tensor_is_signed(const tl_t *t) +{ + switch (t->fmt) { + case FMT_I8: + return 1; + case FMT_U8: + case FMT_BF16: //does not matter, so set to default 0 + return 0; + default: + ASSERT(0); + } +} + +static inline int matrix_is_signed(const ml_t *t) +{ + switch (t->fmt) { + case FMT_I8: + return 1; + case FMT_U8: + case FMT_BF16: //does not matter, so set to default 0 + return 0; + default: + ASSERT(0); + } +} + +static inline void fill_same_tensor_shape(tiu_reg_t *r, tl_shape_t s) +{ + uint32_t n = s.n; + uint32_t c = s.c; + uint32_t h = s.h; + uint32_t w = s.w; + + r->opd0_n = n; + r->opd0_c = c; + r->opd0_h = h; + r->opd0_w = w; + + r->opd1_n = n; + r->opd1_c = c; + r->opd1_h = h; + r->opd1_w = w; + + r->opd2_n = n; + r->opd2_c = c; + r->opd2_h = h; + r->opd2_w = w; + + r->res0_n = n; + r->res0_c = c; + r->res0_h = h; + r->res0_w = w; +} + +static inline void assert_stride_range(tl_stride_t s) +{ + ASSERT(s.n < 0x10000); + ASSERT(s.c < 0x10000); + ASSERT(s.h < 0x10000); +} + +static inline void fill_same_tensor_stride(tiu_reg_t *r, tl_stride_t s) +{ + uint32_t n = s.n; + uint32_t c = s.c; + uint32_t h = s.h; + uint32_t w = 1; + + r->opd0_n_str = n; + r->opd0_c_str = c; + r->opd0_h_str = h; + r->opd0_w_str = w; + + r->opd1_n_str = n; + r->opd1_c_str = c; + r->opd1_h_str = h; + r->opd1_w_str = w; + + r->opd2_n_str = n; + r->opd2_c_str = c; + r->opd2_h_str = h; + r->opd2_w_str = w; + + r->res0_n_str = n; + r->res0_c_str = c; + r->res0_h_str = h; + r->res0_w_str = w; +} + +#define fill_stride_code(r, op, str) \ + do { \ + r->op##_n_str = str->n; \ + r->op##_c_str = str->c; \ + r->op##_h_str = str->h; \ + r->op##_w_str = str->w; \ + } while (0) + +static inline void fill_opd0_stride(tiu_reg_t *r, const tl_stride_t *str) +{ + fill_stride_code(r, opd0, str); +} + +static inline void fill_opd1_stride(tiu_reg_t *r, const tl_stride_t *str) +{ + fill_stride_code(r, opd1, str); +} + +static inline void fill_opd2_stride(tiu_reg_t *r, const tl_stride_t *str) +{ + fill_stride_code(r, opd2, str); +} + +static inline void fill_res0_stride(tiu_reg_t *r, const tl_stride_t *str) +{ + fill_stride_code(r, res0, str); +} + +static inline void fill_same_tensor_stride_type(tiu_reg_t *r, int type) +{ + r->short_opd0_str = type & 0b11; + r->short_opd1_str = type & 0b11; + r->short_opd2_str = type & 0b11; + r->short_res0_str = type & 0b11; +} + +static inline ec_desc_t * emit_tiu_cmdbuf(ctx_t *k, tiu_reg_t *r) +{ + int engine_id = BMK1822_TIU; + + desc_pair_t *dp = bm1822_get_desc_pair(k, engine_id); + uint32_t *cmdbuf = (uint32_t *)dp->cmd_hdr->cmd; + emit_tiu_reg(r, cmdbuf); + + return dp->ec_desc; +} + +#endif /* KERNEL_1822_H */ diff --git a/cvikernel/src/bm1822/tdma.c b/cvikernel/src/bm1822/tdma.c new file mode 100644 index 000000000..cf4e7c9de --- /dev/null +++ b/cvikernel/src/bm1822/tdma.c @@ -0,0 +1,1977 @@ +#include "kernel_1822.h" +#include "bmkernel/bm1822/1822_fp_convert.h" + +//n < 0x10000); + ASSERT(s->c < 0x10000); + ASSERT(s->h < 0x10000); + ASSERT(s->w < 0x10000); + + ASSERT(s->n > 0x0); + ASSERT(s->c > 0x0); + ASSERT(s->h > 0x0); + ASSERT(s->w > 0x0); +} + +static void check_tdma_tl_bf16_shape(const tl_shape_t *s, fmt_t fmt) +{ + uint8_t fmt_type = (fmt == FMT_BF16 ? 
2 : 1); + ASSERT(s->n < 0x10000); + ASSERT(s->c < 0x10000); + ASSERT(s->h < 0x10000); + ASSERT(s->w < 0x10000 / fmt_type); + + ASSERT(s->n > 0x0); + ASSERT(s->c > 0x0); + ASSERT(s->h > 0x0); + ASSERT(s->w > 0x0); +} + +static void check_tdma_tg_shape(const tg_shape_t *s) +{ + ASSERT(s->n < 0x10000); + ASSERT(s->c < 0x10000); + ASSERT(s->h < 0x10000); + ASSERT(s->w < 0x10000); + + ASSERT(s->n > 0x0); + ASSERT(s->c > 0x0); + ASSERT(s->h > 0x0); + ASSERT(s->w > 0x0); +} + +static void check_tdma_tg_bf16_shape(const tg_shape_t *s, fmt_t fmt) +{ + uint8_t fmt_type = (fmt == FMT_BF16 ? 2 : 1); + ASSERT(s->n < 0x10000); + ASSERT(s->c < 0x10000); + ASSERT(s->h < 0x10000); + ASSERT(s->w < 0x10000 / fmt_type); + + ASSERT(s->n > 0x0); + ASSERT(s->c > 0x0); + ASSERT(s->h > 0x0); + ASSERT(s->w > 0x0); +} + + +static void check_tdma_ml_shape(const ml_shape_t *s) +{ + ASSERT(s->n < 0x10000); + ASSERT(s->c < 0x10000); + ASSERT(s->w < 0x10000); + ASSERT(s->col < 0x10000); + + ASSERT(s->n > 0); + ASSERT(s->c > 0); + ASSERT(s->w > 0); + ASSERT(s->col > 0); +} + +static void check_tdma_ml_bf16_shape(const ml_shape_t *s, fmt_t fmt) +{ + uint8_t fmt_type = (fmt == FMT_BF16 ? 2 : 1); + ASSERT(s->n < 0x10000); + ASSERT(s->c < 0x10000); + ASSERT(s->w < 0x10000 / fmt_type); + ASSERT(s->col < 0x10000); + + ASSERT(s->n > 0); + ASSERT(s->c > 0); + ASSERT(s->w > 0); + ASSERT(s->col > 0); +} + +static void check_tdma_mg_shape(const mg_shape_t *s) +{ + ASSERT(s->row < 0x10000); + ASSERT(s->col < 0x10000); + + ASSERT(s->row > 0x0); + ASSERT(s->col > 0x0); +} + +static void check_tdma_mg_bf16_shape(const mg_shape_t *s, fmt_t fmt) +{ + uint8_t fmt_type = (fmt == FMT_BF16 ? 2 : 1); + ASSERT(s->row < 0x10000); + ASSERT(s->col < 0x10000 / fmt_type); + + ASSERT(s->row > 0x0); + ASSERT(s->col > 0x0); +} + +static void check_tdma_tl(const tl_t *t) +{ + ASSERT(t); + ASSERT(t->fmt == FMT_I8 || t->fmt == FMT_U8 || t->fmt == FMT_BF16); + check_tdma_tl_shape(&t->shape); +} + +static void check_tdma_tl_bf16(const tl_t *t) +{ + ASSERT(t); + ASSERT(t->fmt == FMT_I8 || t->fmt == FMT_U8 || t->fmt == FMT_BF16); + check_tdma_tl_bf16_shape(&t->shape, t->fmt); +} + +static void check_tdma_tg(const tg_t *t) +{ + ASSERT(t); + ASSERT(t->base_reg_index < TDMA_NUM_BASE_REGS); + ASSERT(t->fmt == FMT_I8 || t->fmt == FMT_U8 || t->fmt == FMT_BF16); + check_tdma_tg_shape(&t->shape); +} + +static void check_tdma_tg_bf16(const tg_t *t) +{ + ASSERT(t); + ASSERT(t->base_reg_index < TDMA_NUM_BASE_REGS); + ASSERT(t->fmt == FMT_I8 || t->fmt == FMT_U8 || t->fmt == FMT_BF16); + check_tdma_tg_bf16_shape(&t->shape, t->fmt); +} + +static void check_tdma_compressed_tg(const compressed_tg_t *t) +{ + uint32_t stride_w = t->t.fmt == FMT_BF16 ? 
2 : 1; + + ASSERT(t); + ASSERT(t->t.base_reg_index < TDMA_NUM_BASE_REGS); + check_tdma_tg_shape(&t->t.shape); + ASSERT(!(t->t.start_address%0x10)); + + // Enable after backend fix + //ASSERT(t->t.stride.n == + // (t->t.shape.w * t->t.shape.h * t->t.shape.c * stride_w)); + + ASSERT(t->t.stride.c == (t->t.shape.w * t->t.shape.h * stride_w)); + ASSERT(t->t.stride.h == (t->t.shape.w * stride_w)); + // m.base_reg_index < TDMA_NUM_BASE_REGS); + ASSERT(!(t->m.start_address%0x10)); + + // the data should be continuous + if (t->m.fmt == FMT_BF16) { + ASSERT(t->m.stride.row == t->m.shape.col * 2); + } + else if (t->m.fmt == FMT_I8 || t->m.fmt == FMT_U8) { + ASSERT(t->m.stride.row == t->m.shape.col); + } + else { + ASSERT(0); //fmt == FMT_I8 || m->fmt == FMT_U8 || m->fmt == FMT_BF16); + check_tdma_ml_shape(&m->shape); +} + +static void check_tdma_ml_bf16(const ml_t *m) +{ + ASSERT(m); + ASSERT(m->fmt == FMT_I8 || m->fmt == FMT_U8 || m->fmt == FMT_BF16); + check_tdma_ml_bf16_shape(&m->shape, m->fmt); +} + +static void check_tdma_mg(const mg_t *m) +{ + ASSERT(m); + ASSERT(m->base_reg_index < TDMA_NUM_BASE_REGS); + check_tdma_mg_shape(&m->shape); +} + +static void check_tdma_mg_bf16(const mg_t *m) +{ + ASSERT(m); + ASSERT(m->base_reg_index < TDMA_NUM_BASE_REGS); + check_tdma_mg_bf16_shape(&m->shape, m->fmt); +} + +static void check_tdma_compress_mg(const compressed_mg_t *m) +{ + ASSERT(m); + ASSERT(m->m.base_reg_index < TDMA_NUM_BASE_REGS); + check_tdma_mg_shape(&m->m.shape); +} + +static void assert_tl_same_size(const tl_t *a, const tl_t *b) +{ + uint32_t a_size = a->shape.n * a->shape.c * a->shape.h * a->shape.w; + uint32_t b_size = b->shape.n * b->shape.c * b->shape.h * b->shape.w; + + ASSERT(a_size == b_size); +} + +static void assert_tl_tg_same_size(const tl_t *tl, const tg_t *tg) +{ + uint32_t tl_size = tl->shape.n * tl->shape.c * tl->shape.h * tl->shape.w; + uint32_t tg_size = tg->shape.n * tg->shape.c * tg->shape.h * tg->shape.w; + ASSERT(tl_size == tg_size); +} + +static void assert_ml_mg_same_size(const ml_t *ml, const mg_t *mg) +{ + uint32_t ml_size = ml->shape.n * ml->shape.col; + uint32_t mg_size = mg->shape.row * mg->shape.col; + + ASSERT(ml_size == mg_size); +} + +#if 0 +static uint64_t absolute_gmem_addr(uint64_t addr) +{ + return (addr & 0x0FFFFFFFFFF) + BM1822_GLOBAL_MEM_START_ADDR; +} +#else +//global memory start = 0x0 from 1822 kernel view, we can use it directlly +//cmdbuf descriptor content dram address does not need offset either +#define absolute_gmem_addr(addr) (addr & 0x0FFFFFFFFFF) +#endif + +static ec_desc_t * emit_tdma_cmdbuf(ctx_t *ctx, tdma_reg_t *reg) +{ + desc_pair_t *dp = bm1822_get_desc_pair(ctx, BMK1822_TDMA); + + reg->layer_ID = ctx->layer_id; + //ASSERT(reg->rsv5 != 0x0);// "this is debug use, it's fine for skip"; + + uint32_t *cmdbuf = (uint32_t *)dp->cmd_hdr->cmd; + emit_tdma_reg(reg, cmdbuf); + + return dp->ec_desc; +} + +static void fill_l2tg_fmt(tdma_reg_t *reg, fmt_t src_fmt, fmt_t dst_fmt) +{ + reg->dst_fmt = (dst_fmt == FMT_BF16) ? 2 : 1; + reg->src_fmt = (src_fmt == FMT_BF16) ? 2 : 1; + // check and decide bf16->int8 or bf16->uint8_t + reg->int8_sign = (dst_fmt == FMT_I8 ? 1 : 0);// | (dst_fmt == FMT_U8 ? 1 : 0); +} + +static void fill_tg2l_fmt(tdma_reg_t *reg, fmt_t src_fmt, fmt_t dst_fmt) +{ + reg->dst_fmt = (dst_fmt == FMT_BF16) ? 2 : 1; + reg->src_fmt = (src_fmt == FMT_BF16) ? 2 : 1; + // check and decide int8->bf16 or uint8_t->bf16 + reg->int8_sign = (src_fmt == FMT_I8 ? 1 : 0) ;//| (src_fmt == FMT_U8 ? 
1 : 0); +} + +static void fill_l2l_fmt(tdma_reg_t *reg, fmt_t src_fmt, fmt_t dst_fmt) +{ + reg->dst_fmt = (dst_fmt == FMT_BF16) ? 2 : 1; + reg->src_fmt = (src_fmt == FMT_BF16) ? 2 : 1; + // check and decide bf16->int8 or bf16->uint8_t or int8->bf16 or uint8_t->bf16 + reg->int8_sign = (dst_fmt == FMT_I8 ? 1 : 0) | (src_fmt == FMT_I8 ? 1 : 0); +} + +static void fill_src_addr(tdma_reg_t *r, uint64_t addr) +{ + r->src_base_addr_low = (uint32_t)addr; + r->src_base_addr_high = (addr >> 32); +} + +static void fill_dst_addr(tdma_reg_t *r, uint64_t addr) +{ + r->dst_base_addr_low = (uint32_t)addr; + r->dst_base_addr_high = (addr >> 32); +} + +static void fill_src_c_stride(tdma_reg_t *r, uint32_t str) +{ + r->src_c_stride_low = (uint16_t)str; + r->src_c_stride_high = (str >> 16); +} + +static void fill_dst_c_stride(tdma_reg_t *r, uint32_t str) +{ + r->dst_c_stride_low = (uint16_t)str; + r->dst_c_stride_high = (str >> 16); +} + +static void set_int8_rnd_mode(tdma_reg_t *r, uint32_t int8_rnd_mode) +{ + if (int8_rnd_mode == 1) { + // int8 + if (r->src_fmt == FMT_BF16_TYP && r->dst_fmt == FMT_FIX8B_TYP) { + r->int8_rnd_mode = int8_rnd_mode; + } + } +} + + +/* + * Direction: L2L + */ + +bmk1822_op_t * bmk1822_tdma_l2l_tensor_copy( + ctx_t *ctx, + const bmk1822_tdma_l2l_tensor_copy_param_t *p) +{ + check_tdma_tl(p->src); + check_tdma_tl(p->dst); + ASSERT(p->src->shape.n == p->dst->shape.n); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 3; + reg.spec_func = 0; + reg.sys_dtype = 0; + + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + reg.outstanding_en = p->outstanding; + + return emit_tdma_cmdbuf(ctx, ®); +} + +bmk1822_op_t * bmk1822_tdma_l2l_bf16_tensor_copy( + ctx_t *ctx, + const bmk1822_tdma_l2l_tensor_copy_param_t *p) +{ + check_tdma_tl_bf16(p->src); + check_tdma_tl_bf16(p->dst); + ASSERT(p->src->shape.n == p->dst->shape.n); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 3; + reg.spec_func = 0; + reg.sys_dtype = 0; + fill_l2l_fmt(®, p->src->fmt, p->dst->fmt); + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + // does not allow open `mv_lut_idx and `mv_lut_basemv_lut_base at same time + if (p->mv_lut_idx == 1) { + reg.mv_lut_idx = p->mv_lut_idx; + } + + if (p->mv_lut_base == 1) { + reg.mv_lut_base = p->mv_lut_base; + } + + if (reg.mv_lut_idx == 1 && reg.mv_lut_base == 1) { + ASSERT(0); + } + + set_int8_rnd_mode(®, p->dst->int8_rnd_mode); + + reg.outstanding_en = p->outstanding; + + //trace_tdma_reg(®, __func__); + + return emit_tdma_cmdbuf(ctx, ®); +} + +static 
uint32_t addr_after_right_shift( + ctx_t *ctx, int addr, uint32_t step, int c_str) +{ + uint32_t npu_num = ctx->chip_info.npu_num; + uint32_t lmem_size = ctx->chip_info.lmem_size;; + + uint32_t lmem_i = (addr / lmem_size + step) % npu_num; + uint32_t offset = addr % lmem_size + (addr / lmem_size + step) / npu_num * c_str; + return lmem_i * lmem_size + offset; +} + +bmk1822_op_t * bmk1822_tdma_l2l_tensor_lrn_shift( + ctx_t *ctx, + const bmk1822_tdma_l2l_tensor_lrn_shift_param_t *p) +{ + check_tdma_tl(p->src); + check_tdma_tl(p->dst); + assert_tl_same_size(p->src, p->dst); + ASSERT(p->src->shape.n == p->dst->shape.n); + ASSERT(p->src->shape.c == p->dst->shape.c); + ASSERT(p->src->shape.c > p->lrn_step); + ASSERT(p->src->shape.h * p->src->shape.w == + p->dst->shape.h * p->dst->shape.w); + ASSERT(p->lrn_step < 16); + + ASSERT(p->src->fmt == p->dst->fmt); + + int is_bf16 = (p->src->fmt == FMT_BF16) ? 1 : 0; + if (is_bf16) { + check_tdma_tl_bf16(p->src); + check_tdma_tl_bf16(p->dst); + } + + /* L2L lrn copy */ + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 3; + reg.spec_func = 0; + reg.sys_dtype = 0; + fill_l2l_fmt(®, p->src->fmt, p->dst->fmt); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c - p->lrn_step; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c - p->lrn_step; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (p->right_shift) { + uint32_t dst_addr = addr_after_right_shift( + ctx, p->dst->start_address, p->lrn_step, p->dst->stride.c); + + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + } else { + uint32_t src_addr = addr_after_right_shift( + ctx, p->src->start_address, p->lrn_step, p->src->stride.c); + + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + } + +if (is_bf16) + set_int8_rnd_mode(®, p->dst->int8_rnd_mode); + + emit_tdma_cmdbuf(ctx, ®); + + /* Constant fill with zero */ + reg.trans_dir = 0; + reg.spec_func = 4; + reg.const_val = is_bf16 ? 
convert_fp32_bf16(0.0): 0; + + reg.dst_c = p->lrn_step; + if (p->right_shift) { + uint32_t dst_addr = addr_after_right_shift( + ctx, p->dst->start_address, p->lrn_step, p->dst->stride.c); + + uint32_t lmem_size = ctx->chip_info.lmem_size;; + uint32_t npu_num = ctx->chip_info.npu_num; + uint32_t sht_num = p->lrn_step; + + uint32_t lmem_i = (dst_addr / lmem_size - sht_num) % npu_num; + uint32_t offset = (lmem_i + sht_num) / npu_num * p->dst->stride.c; + uint32_t zero_addr = lmem_i * lmem_size + dst_addr % lmem_size - offset; + + // printf(" lmem_i 0x%x, offset 0x%x, zero_addr 0x%x\n", + // lmem_i, offset, zero_addr); + + fill_dst_addr(®, zero_addr); + + } else { + uint32_t start_mem = p->dst->start_address / ctx->chip_info.lmem_size; + uint32_t cur_mem = (start_mem + (p->dst->shape.c - p->lrn_step)) % ctx->chip_info.npu_num; + uint32_t offset = + (p->dst->start_address % ctx->chip_info.lmem_size) + + ((start_mem + (p->dst->shape.c - p->lrn_step)) / ctx->chip_info.npu_num) * p->dst->stride.c; + uint32_t zero_addr = cur_mem * ctx->chip_info.lmem_size + offset; + + // printf(" start_mem 0x%x, cur_mem 0x%x, offset 0x%x, zero_addr 0x%x\n", + // start_mem, cur_mem, offset, zero_addr); + + fill_dst_addr(®, zero_addr); + } + + return emit_tdma_cmdbuf(ctx, ®); +} +/* + * Direction: L2TG + */ + +static bmk1822_op_t * tdma_l2tg_tensor_copy( + ctx_t *ctx, + const bmk1822_tdma_l2tg_tensor_copy_param_t *p, + uint64_t dst_addr) +{ + check_tdma_tl(p->src); + check_tdma_tg(p->dst); + assert_tl_tg_same_size(p->src, p->dst); + ASSERT(p->src->shape.n == p->dst->shape.n); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + reg.intra_cmd_paral = p->intra_cmd_paral; + + return emit_tdma_cmdbuf(ctx, ®); +} + + +static bmk1822_op_t * tdma_l2tg_bf16_tensor_copy( + ctx_t *ctx, + const bmk1822_tdma_l2tg_tensor_copy_param_t *p, + uint64_t dst_addr) +{ + check_tdma_tl_bf16(p->src); + check_tdma_tg_bf16(p->dst); + assert_tl_tg_same_size(p->src, p->dst); + ASSERT(p->src->shape.n == p->dst->shape.n); + ASSERT(!(p->src->fmt == FMT_I8 && p->dst->fmt == FMT_BF16)); // not support tl(int8)->tg(bf16) + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + + fill_l2tg_fmt(®, p->src->fmt, p->dst->fmt); + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + reg.intra_cmd_paral = p->intra_cmd_paral; 
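+ /* The int8 rounding mode takes effect only for a bf16 source with a fix8b destination (see set_int8_rnd_mode). */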
+ + set_int8_rnd_mode(®, p->dst->int8_rnd_mode); + //trace_tdma_reg(®, __func__); + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_l2tg_tensor_copy_nc_transposed( + ctx_t *ctx, + const bmk1822_tdma_l2tg_tensor_copy_nc_transposed_param_t *p, + uint64_t dst_addr) +{ + check_tdma_tl(p->src); + check_tdma_tg(p->dst); + ASSERT(p->dst->shape.n == p->src->shape.c); + ASSERT(p->dst->shape.c == p->src->shape.n); + ASSERT(p->dst->shape.h * p->dst->shape.w == + p->src->shape.h * p->src->shape.w); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 0; + reg.spec_func = 1; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_l2tg_bf16_tensor_copy_nc_transposed( + ctx_t *ctx, + const bmk1822_tdma_l2tg_tensor_copy_nc_transposed_param_t *p, + uint64_t dst_addr) +{ + check_tdma_tl_bf16(p->src); + check_tdma_tg_bf16(p->dst); + ASSERT(p->dst->shape.n == p->src->shape.c); + ASSERT(p->dst->shape.c == p->src->shape.n); + ASSERT(p->dst->shape.h * p->dst->shape.w == + p->src->shape.h * p->src->shape.w); + ASSERT(!(p->src->fmt == FMT_I8 && p->dst->fmt == FMT_BF16)); // not support tl(int8)->tg(bf16) + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 0; + reg.spec_func = 1; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + fill_l2tg_fmt(®, p->src->fmt, p->dst->fmt); + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + set_int8_rnd_mode(®, p->dst->int8_rnd_mode); + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_l2tg_tensor_copy_cw_transposed( + ctx_t *ctx, + const bmk1822_tdma_l2tg_tensor_copy_cw_transposed_param_t *p, + uint64_t dst_addr) +{ + check_tdma_tl(p->src); + check_tdma_tg(p->dst); + ASSERT(p->src->shape.n == p->dst->shape.n); + ASSERT(p->src->shape.c == p->dst->shape.w); + ASSERT(p->src->shape.h == p->dst->shape.h); + ASSERT(p->src->shape.w == p->dst->shape.c); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.spec_func = 1; + reg.transpose_md = 3; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + 
reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_l2tg_bf16_tensor_copy_cw_transposed( + ctx_t *ctx, + const bmk1822_tdma_l2tg_tensor_copy_cw_transposed_param_t *p, + uint64_t dst_addr) +{ + check_tdma_tl_bf16(p->src); + check_tdma_tg_bf16(p->dst); + ASSERT(p->src->shape.n == p->dst->shape.n); + ASSERT(p->src->shape.c == p->dst->shape.w); + ASSERT(p->src->shape.h == p->dst->shape.h); + ASSERT(p->src->shape.w == p->dst->shape.c); + + /*not support bf16 mode*/ + ASSERT(!(p->src->fmt == FMT_BF16 || p->dst->fmt == FMT_BF16)); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.spec_func = 1; + reg.transpose_md = 3; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + fill_l2tg_fmt(®, p->src->fmt, p->dst->fmt); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_l2tg_tensor_copy_compressed( + ctx_t *ctx, + const bmk1822_tdma_l2tg_tensor_copy_compressed_param_t *p, + uint64_t dst_addr) +{ + check_tdma_tl(p->src); + check_tdma_compressed_tg(p->dst); + assert_tl_tg_same_size(p->src, &p->dst->t); + + tdma_reg_t reg; + reset_tdma_reg(®); + + //src->fmt == FMT_BF16 || p->src->fmt == FMT_I8 || p->src->fmt == FMT_U8); + + ASSERT(p->dst->bias1 == 0); + if (p->src->fmt == FMT_BF16) { + ASSERT(p->dst->bias0 == 127); + } + else { + //p->src->fmt == FMT_I8 || p->src->fmt == FMT_U8); + ASSERT(p->dst->bias0 == 0); + ASSERT(p->dst->zero_guard_en == 0); + } + + reg.src_fmt = (p->src->fmt == FMT_BF16) ? FMT_BF16_TYP : FMT_FIX8B_TYP; + reg.dst_fmt = reg.src_fmt; + + reg.vld = 1; + reg.trans_dir = 1; + reg.compress_en = 1; + + // VLC constraint under hw compress + //1. in int8/uint8, bias0/bias should be 0/0 + //2. in bf16, signed should be 0 and bias0 set to 127, bias1 set to 0 + reg.cmprs_fmt = (p->src->fmt == FMT_I8); + + // NOTICE: it recommand set to 1 once data contain '0' under bf16 + reg.compress_zero_guard = p->dst->zero_guard_en ? 
1 : 0; + reg.compress_bias0 = p->dst->bias0; + reg.compress_bias1 = p->dst->bias1; + + reg.dst_base_reg_sel = p->dst->t.base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->t.shape.c; + reg.dst_h = p->dst->t.shape.h; + reg.dst_w = p->dst->t.shape.w; + reg.dst_n_stride = p->dst->t.stride.n; + fill_dst_c_stride(®, p->dst->t.stride.c); + reg.dst_h_stride = p->dst->t.stride.h; + + reg.intra_cmd_paral = p->intra_cmd_paral; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_l2tg_tensor_fill_constant( + ctx_t *ctx, + const bmk1822_tdma_l2tg_tensor_fill_constant_param_t *p, + uint64_t dst_addr) +{ + check_tdma_tg_bf16(p->dst); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.spec_func = 4; + reg.const_val = p->constant; + + // only support tl(bf16)->tg(bf16) or tl(fix8b)->tg(fix8b) + fill_l2tg_fmt(®, p->dst->fmt, p->dst->fmt); + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_dst_addr(®, dst_addr); + + reg.src_n = p->dst->shape.n; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_l2tg_matrix_copy( + ctx_t *ctx, + const bmk1822_tdma_l2tg_matrix_copy_param_t *p, + uint64_t dst_addr) +{ + check_tdma_ml(p->src); + check_tdma_mg(p->dst); + assert_ml_mg_same_size(p->src, p->dst); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 1; + reg.spec_func = 0; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = 1; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.row; + reg.dst_w = p->dst->shape.col; + fill_dst_c_stride(®, p->dst->stride.row); + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_l2tg_matrix_copy_compressed( + ctx_t *ctx, + const bmk1822_tdma_l2tg_matrix_copy_compressed_param_t *p, + uint64_t dst_addr) +{ + check_tdma_ml(p->src); + check_tdma_compress_mg(p->dst); + check_tdma_vlc_matrix_compressed_mg(p->dst); + assert_ml_mg_same_size(p->src, &p->dst->m); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.compress_en = 1; + reg.sys_dtype = 1; + reg.spec_func = 0; + + // vlc setting + reg.cmprs_fmt = (p->src->fmt == FMT_I8); + + ASSERT(p->dst->bias1 == 0); + if (p->src->fmt == FMT_BF16) { + ASSERT(p->dst->bias0 == 127); + } + else { + //p->src->fmt == FMT_I8 || p->src->fmt == FMT_U8); + ASSERT(p->dst->bias0 == 0); + ASSERT(p->dst->zero_guard_en == 0); + } + + // NOTICE: it should be 1 once data contain '0' under bf16 + reg.compress_zero_guard = p->dst->zero_guard_en ? 
1 : 0; + reg.compress_bias0 = p->dst->bias0; + reg.compress_bias1 = p->dst->bias1; + + reg.dst_base_reg_sel = p->dst->m.base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + fill_l2tg_fmt(®, p->src->fmt, p->dst->m.fmt); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = 1; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->m.shape.row; + reg.dst_w = p->dst->m.shape.col; + fill_dst_c_stride(®, p->dst->m.stride.row); + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_l2tg_bf16_matrix_copy( + ctx_t *ctx, + const bmk1822_tdma_l2tg_matrix_copy_param_t *p, + uint64_t dst_addr) +{ + check_tdma_ml_bf16(p->src); + check_tdma_mg_bf16(p->dst); + assert_ml_mg_same_size(p->src, p->dst); + ASSERT(!((p->src->fmt == FMT_I8 || p->src->fmt == FMT_U8) && p->dst->fmt == FMT_BF16)); // not support tl(i8/uint8_t)->tg(bf16) + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 1; + reg.spec_func = 0; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + fill_l2tg_fmt(®, p->src->fmt, p->dst->fmt); + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = 1; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.row; + reg.dst_w = p->dst->shape.col; + fill_dst_c_stride(®, p->dst->stride.row); + set_int8_rnd_mode(®, p->dst->int8_rnd_mode); + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_l2tg_general_copy( + ctx_t *ctx, + const bmk1822_tdma_l2tg_general_copy_param_t *p, + uint64_t dst_addr) +{ + ASSERT(p->dst_base_reg_index < TDMA_NUM_BASE_REGS); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.trans_fmt = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + + reg.dst_base_reg_sel = p->dst_base_reg_index; + fill_src_addr(®, p->src_address); + fill_dst_addr(®, dst_addr); + reg.src_n_stride = p->bytes; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_l2tg_bf16_general_copy( + ctx_t *ctx, + const bmk1822_tdma_l2tg_bf16_general_copy_param_t *p, + uint64_t dst_addr) +{ + ASSERT(p->dst_base_reg_index < TDMA_NUM_BASE_REGS); + + tdma_reg_t reg; + reset_tdma_reg(®); + // only support fix8b->fix8b or bf16->bf16 + ASSERT(p->src_fmt == p->dst_fmt); + + reg.vld = 1; + reg.trans_dir = 1; + reg.trans_fmt = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + fill_l2tg_fmt(®, p->src_fmt, p->dst_fmt); + + reg.dst_base_reg_sel = p->dst_base_reg_index; + fill_src_addr(®, p->src_address); + fill_dst_addr(®, dst_addr); + reg.src_n_stride = p->src_bytes; + return emit_tdma_cmdbuf(ctx, ®); +} + +bmk1822_op_t * bmk1822_tdma_l2g_tensor_copy( + ctx_t *ctx, + const bmk1822_tdma_l2tg_tensor_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + return tdma_l2tg_tensor_copy(ctx, p, dst_addr); +} + +bmk1822_op_t * bmk1822_tdma_l2g_bf16_tensor_copy( + ctx_t *ctx, + const bmk1822_tdma_l2tg_tensor_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + return tdma_l2tg_bf16_tensor_copy(ctx, p, dst_addr); +} +bmk1822_op_t * bmk1822_tdma_l2g_tensor_copy_nc_transposed( + ctx_t *ctx, + const bmk1822_tdma_l2tg_tensor_copy_nc_transposed_param_t *p) +{ + uint64_t 
dst_addr = absolute_gmem_addr(p->dst->start_address); + return tdma_l2tg_tensor_copy_nc_transposed(ctx, p, dst_addr); +} + +bmk1822_op_t * bmk1822_tdma_l2g_bf16_tensor_copy_nc_transposed( + ctx_t *ctx, + const bmk1822_tdma_l2tg_tensor_copy_nc_transposed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + return tdma_l2tg_bf16_tensor_copy_nc_transposed(ctx, p, dst_addr); +} + +bmk1822_op_t * bmk1822_tdma_l2g_tensor_copy_cw_transposed( + ctx_t *ctx, + const bmk1822_tdma_l2tg_tensor_copy_cw_transposed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + return tdma_l2tg_tensor_copy_cw_transposed(ctx, p, dst_addr); +} + +bmk1822_op_t * bmk1822_tdma_l2g_bf16_tensor_copy_cw_transposed( + ctx_t *ctx, + const bmk1822_tdma_l2tg_tensor_copy_cw_transposed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + return tdma_l2tg_bf16_tensor_copy_cw_transposed(ctx, p, dst_addr); +} + +bmk1822_op_t * bmk1822_tdma_l2g_tensor_copy_compressed( + ctx_t *ctx, + const bmk1822_tdma_l2tg_tensor_copy_compressed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->t.start_address); + return tdma_l2tg_tensor_copy_compressed(ctx, p, dst_addr); +} + +bmk1822_op_t * bmk1822_tdma_l2g_tensor_fill_constant( + ctx_t *ctx, + const bmk1822_tdma_l2tg_tensor_fill_constant_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + return tdma_l2tg_tensor_fill_constant(ctx, p, dst_addr); +} + +bmk1822_op_t * bmk1822_tdma_l2g_matrix_copy( + ctx_t *ctx, + const bmk1822_tdma_l2tg_matrix_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + return tdma_l2tg_matrix_copy(ctx, p, dst_addr); +} + +bmk1822_op_t * bmk1822_tdma_l2g_bf16_matrix_copy( + ctx_t *ctx, + const bmk1822_tdma_l2tg_matrix_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + return tdma_l2tg_bf16_matrix_copy(ctx, p, dst_addr); +} + +bmk1822_op_t * bmk1822_tdma_l2g_matrix_copy_compressed( + ctx_t *ctx, + const bmk1822_tdma_l2tg_matrix_copy_compressed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->m.start_address); + return tdma_l2tg_matrix_copy_compressed(ctx, p, dst_addr); +} + +bmk1822_op_t * bmk1822_tdma_l2g_general_copy( + ctx_t *ctx, + const bmk1822_tdma_l2tg_general_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst_address); + return tdma_l2tg_general_copy(ctx, p, dst_addr); +} + +bmk1822_op_t * bmk1822_tdma_l2g_bf16_general_copy( + ctx_t *ctx, + const bmk1822_tdma_l2tg_bf16_general_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst_address); + return tdma_l2tg_bf16_general_copy(ctx, p, dst_addr); +} +/* + * Direction: TG2L + */ + +static bmk1822_op_t * tdma_tg2l_tensor_copy( + ctx_t *ctx, + const bmk1822_tdma_tg2l_tensor_copy_param_t *p, + uint64_t src_addr) +{ + check_tdma_tg(p->src); + check_tdma_tl(p->dst); + ASSERT(p->src->shape.n == p->dst->shape.n); + assert_tl_tg_same_size(p->dst, p->src); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 0; + reg.spec_func = 0; + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = 
p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + reg.intra_cmd_paral = p->intra_cmd_paral; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_tg2l_bf16_tensor_copy( + ctx_t *ctx, + const bmk1822_tdma_tg2l_tensor_copy_param_t *p, + uint64_t src_addr) +{ + check_tdma_tg_bf16(p->src); + check_tdma_tl_bf16(p->dst); + ASSERT(p->src->shape.n == p->dst->shape.n); + ASSERT(!(p->src->fmt == FMT_BF16 && p->dst->fmt == FMT_I8)); // not support tg(bf16)->tl(int8) + assert_tl_tg_same_size(p->dst, p->src); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 0; + reg.spec_func = 0; + + reg.src_base_reg_sel = p->src->base_reg_index; + + fill_tg2l_fmt(®, p->src->fmt, p->dst->fmt); + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + reg.intra_cmd_paral = p->intra_cmd_paral; + + //trace_tdma_reg(®, __func__); + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_tg2l_tensor_copy_nc_transposed( + ctx_t *ctx, + const bmk1822_tdma_tg2l_tensor_copy_nc_transposed_param_t *p, + uint64_t src_addr) +{ + check_tdma_tg(p->src); + check_tdma_tl(p->dst); + ASSERT(p->dst->shape.n == p->src->shape.c); + ASSERT(p->dst->shape.c == p->src->shape.n); + ASSERT(p->dst->shape.h * p->dst->shape.w == + p->src->shape.h * p->src->shape.w); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 0; + reg.spec_func = 1; + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_tg2l_bf16_tensor_copy_nc_transposed( + ctx_t *ctx, + const bmk1822_tdma_tg2l_tensor_copy_nc_transposed_param_t *p, + uint64_t src_addr) +{ + check_tdma_tg_bf16(p->src); + check_tdma_tl_bf16(p->dst); + ASSERT(p->dst->shape.n == p->src->shape.c); + ASSERT(p->dst->shape.c == p->src->shape.n); + ASSERT(p->dst->shape.h * p->dst->shape.w == + p->src->shape.h * p->src->shape.w); + + ASSERT(!(p->src->fmt == FMT_BF16 && p->dst->fmt == FMT_I8)); // not support tg(bf16)->tl(int8) + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 0; + reg.spec_func = 1; + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + fill_tg2l_fmt(®, p->src->fmt, p->dst->fmt); + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = 
p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_tg2l_tensor_copy_chw_rotated( + ctx_t *ctx, + const bmk1822_tdma_tg2l_tensor_copy_chw_rotated_param_t *p, + uint64_t src_addr) +{ + check_tdma_tg(p->src); + check_tdma_tl(p->dst); + + ASSERT(p->src->shape.c == 3 || p->src->shape.c == 4); + ASSERT(p->src->shape.n == p->dst->shape.n); + ASSERT(p->src->shape.c == p->dst->shape.c); + ASSERT(p->src->shape.h == p->dst->shape.h); + ASSERT(p->src->shape.w == p->dst->shape.w); + + ASSERT(p->dst->start_address % ctx->chip_info.eu_num == 0); + ASSERT(p->dst->stride.n % ctx->chip_info.eu_num == 0); + ASSERT(p->dst->stride.c % ctx->chip_info.eu_num == 0); + ASSERT(p->dst->stride.h == p->dst->shape.w); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 0; + reg.spec_func = 1; + + if (p->dst->shape.c == 3) + reg.transpose_md = 1; + else if(p->dst->shape.c == 4) + reg.transpose_md = 2; + else + ASSERT(0); + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, 1); + reg.src_h_stride = p->src->shape.c * p->src->shape.w; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_tg2l_tensor_copy_decompressed( + ctx_t *ctx, + const bmk1822_tdma_tg2l_tensor_copy_decompressed_param_t *p, + uint64_t src_addr) +{ + check_tdma_compressed_tg(p->src); + check_tdma_tl(p->dst); + assert_tl_tg_same_size(p->dst, &p->src->t); + + tdma_reg_t reg; + reset_tdma_reg(®); + + //dst->fmt == FMT_BF16 || p->dst->fmt == FMT_I8 || p->dst->fmt == FMT_U8); + fill_tg2l_fmt(®, p->src->t.fmt, p->dst->fmt); + + reg.vld = 1; + reg.trans_dir = 0; + reg.compress_en = 1; + reg.cmprs_fmt = (p->src->t.fmt == FMT_I8); + + reg.src_base_reg_sel = p->src->t.base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->t.shape.n; + reg.src_c = p->src->t.shape.c; + reg.src_h = p->src->t.shape.h; + reg.src_w = p->src->t.shape.w; + reg.src_n_stride = p->src->t.stride.n; + fill_src_c_stride(®, p->src->t.stride.c); + reg.src_h_stride = p->src->t.stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + reg.intra_cmd_paral = p->intra_cmd_paral; + + // trace_tdma_reg(®, __FUNCTION__); + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_tg2l_tensor_fill_constant( + ctx_t *ctx, + const bmk1822_tdma_tg2l_tensor_fill_constant_param_t *p) +{ + check_tdma_tl(p->dst); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.spec_func = 4; + reg.const_val = p->constant; + + reg.dst_fmt = (p->dst->fmt == FMT_BF16) ? 
2 : 1; + + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->dst->shape.n; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_tg2l_bf16_tensor_fill_constant( + ctx_t *ctx, + const bmk1822_tdma_tg2l_tensor_fill_constant_param_t *p) +{ + check_tdma_tl_bf16(p->dst); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.spec_func = 4; + reg.const_val = p->constant; + + /*only suppoert fix8b->fix8b or bf16->bf16*/ + fill_tg2l_fmt(®, p->dst->fmt, p->dst->fmt); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->dst->shape.n; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_tg2l_matrix_copy( + ctx_t *ctx, + const bmk1822_tdma_tg2l_matrix_copy_param_t *p, + uint64_t src_addr) +{ + check_tdma_mg(p->src); + check_tdma_ml(p->dst); + ASSERT(p->dst->shape.n == p->src->shape.row); + assert_ml_mg_same_size(p->dst, p->src); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 1; + reg.spec_func = 0; + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.row; + reg.src_c = p->src->shape.row; + reg.src_w = p->src->shape.col; + fill_src_c_stride(®, p->src->stride.row); + + reg.dst_c = p->dst->shape.c; + reg.dst_h = 1; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_tg2l_matrix_copy_decompressed( + ctx_t *ctx, + const bmk1822_tdma_tg2l_matrix_copy_decompressed_param_t *p, + uint64_t src_addr) +{ + check_tdma_vlc_matrix_compressed_mg(p->src); + check_tdma_mg(&p->src->m); + check_tdma_ml(p->dst); + ASSERT(p->dst->shape.n == p->src->m.shape.row); + assert_ml_mg_same_size(p->dst, &p->src->m); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 1; + reg.spec_func = 0; + reg.compress_en = 1; + reg.cmprs_fmt = (p->src->m.fmt == FMT_I8); + + fill_tg2l_fmt(®, p->src->m.fmt, p->dst->fmt); + reg.src_base_reg_sel = p->src->m.base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->m.shape.row; + reg.src_c = p->src->m.shape.row; + reg.src_w = p->src->m.shape.col; + fill_src_c_stride(®, p->src->m.stride.row); + + reg.dst_c = p->dst->shape.c; + reg.dst_h = 1; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_tg2l_bf16_matrix_copy( + ctx_t *ctx, + const bmk1822_tdma_tg2l_matrix_copy_param_t *p, + uint64_t src_addr) +{ + check_tdma_mg_bf16(p->src); + check_tdma_ml_bf16(p->dst); + ASSERT(p->dst->shape.n == p->src->shape.row); + assert_ml_mg_same_size(p->dst, p->src); + ASSERT(!(p->src->fmt == FMT_BF16 && p->dst->fmt == FMT_I8)); // not support tg(bf16)->tl(int8) + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + 
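// trans_dir = 0 selects the TG2L (global memory to local memory) direction; the encoding is 0:tg2l, 1:l2tg, 2:g2g, 3:l2l, as noted in the g2g copy path below. +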
reg.sys_dtype = 1; + reg.spec_func = 0; + + fill_tg2l_fmt(®, p->src->fmt, p->dst->fmt); + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.row; + reg.src_c = p->src->shape.row; + reg.src_w = p->src->shape.col; + fill_src_c_stride(®, p->src->stride.row); + + reg.dst_c = p->dst->shape.c; + reg.dst_h = 1; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_tg2l_matrix_copy_row_col_transposed( + ctx_t *ctx, + const bmk1822_tdma_tg2l_matrix_copy_row_col_transposed_param_t *p, + uint64_t src_addr) +{ + check_tdma_mg(p->src); + check_tdma_ml(p->dst); + ASSERT(p->dst->shape.n == p->src->shape.col); + ASSERT(p->dst->shape.col == p->src->shape.row); + assert_ml_mg_same_size(p->dst, p->src); + + ASSERT(p->src->shape.row >= p->dst->shape.w); + ASSERT(p->dst->shape.c == + (uint32_t) ceiling_func(p->src->shape.row, p->dst->shape.w)); + + ASSERT(p->dst->start_address % ctx->chip_info.eu_num == 0); + ASSERT(p->dst->stride.n % ctx->chip_info.eu_num == 0); + ASSERT(p->dst->stride.c % ctx->chip_info.eu_num == 0); + ASSERT(p->dst->stride.h == p->dst->shape.w); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 1; + reg.spec_func = 1; + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.row; + reg.src_c = p->src->shape.row; + reg.src_w = p->src->shape.col; + fill_src_c_stride(®, p->src->stride.row); + + reg.dst_c = p->dst->shape.c; + reg.dst_h = 1; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_tg2l_general_copy( + ctx_t *ctx, + const bmk1822_tdma_tg2l_general_copy_param_t *p, + uint64_t src_addr) +{ + ASSERT(p->src_base_reg_index < TDMA_NUM_BASE_REGS); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.trans_fmt = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + + reg.src_base_reg_sel = p->src_base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst_address); + reg.src_n_stride = p->bytes; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1822_op_t * tdma_tg2l_bf16_general_copy( + ctx_t *ctx, + const bmk1822_tdma_tg2l_bf16_general_copy_param_t *p, + uint64_t src_addr) +{ + ASSERT(p->src_base_reg_index < TDMA_NUM_BASE_REGS); + // only support fix8b->fix8b or bf16->bf16 + ASSERT(p->dst_fmt == p->src_fmt); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.trans_fmt = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + + fill_tg2l_fmt(®, p->src_fmt, p->dst_fmt); + + reg.src_base_reg_sel = p->src_base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst_address); + reg.src_n_stride = p->src_bytes; + + return emit_tdma_cmdbuf(ctx, ®); +} + +bmk1822_op_t * bmk1822_tdma_g2l_tensor_copy( + ctx_t *ctx, + const bmk1822_tdma_tg2l_tensor_copy_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + return tdma_tg2l_tensor_copy(ctx, p, src_addr); +} + +bmk1822_op_t * bmk1822_tdma_g2l_bf16_tensor_copy( + ctx_t *ctx, + const bmk1822_tdma_tg2l_tensor_copy_param_t *p) +{ + uint64_t src_addr = 
absolute_gmem_addr(p->src->start_address); + return tdma_tg2l_bf16_tensor_copy(ctx, p, src_addr); +} + +bmk1822_op_t * bmk1822_tdma_g2l_tensor_copy_nc_transposed( + ctx_t *ctx, + const bmk1822_tdma_tg2l_tensor_copy_nc_transposed_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + return tdma_tg2l_tensor_copy_nc_transposed(ctx, p, src_addr); +} + +bmk1822_op_t * bmk1822_tdma_g2l_bf16_tensor_copy_nc_transposed( + ctx_t *ctx, + const bmk1822_tdma_tg2l_tensor_copy_nc_transposed_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + return tdma_tg2l_bf16_tensor_copy_nc_transposed(ctx, p, src_addr); +} + +bmk1822_op_t * bmk1822_tdma_g2l_tensor_copy_chw_rotated( + ctx_t *ctx, + const bmk1822_tdma_tg2l_tensor_copy_chw_rotated_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + return tdma_tg2l_tensor_copy_chw_rotated(ctx, p, src_addr); +} + +bmk1822_op_t * bmk1822_tdma_g2l_tensor_copy_decompressed( + ctx_t *ctx, + const bmk1822_tdma_tg2l_tensor_copy_decompressed_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->t.start_address); + return tdma_tg2l_tensor_copy_decompressed(ctx, p, src_addr); +} + +bmk1822_op_t * bmk1822_tdma_tg2l_tensor_fill_constant( + ctx_t *ctx, + const bmk1822_tdma_tg2l_tensor_fill_constant_param_t *p) +{ + return tdma_tg2l_tensor_fill_constant(ctx, p); +} + +bmk1822_op_t * bmk1822_tdma_tg2l_bf16_tensor_fill_constant( + ctx_t *ctx, + const bmk1822_tdma_tg2l_tensor_fill_constant_param_t *p) +{ + return tdma_tg2l_bf16_tensor_fill_constant(ctx, p); +} + +bmk1822_op_t * bmk1822_tdma_g2l_matrix_copy( + ctx_t *ctx, + const bmk1822_tdma_tg2l_matrix_copy_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + return tdma_tg2l_matrix_copy(ctx, p, src_addr); +} + +bmk1822_op_t * bmk1822_tdma_g2l_matrix_copy_decompressed( + ctx_t *ctx, + const bmk1822_tdma_tg2l_matrix_copy_decompressed_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->m.start_address); + return tdma_tg2l_matrix_copy_decompressed(ctx, p, src_addr); +} + +bmk1822_op_t * bmk1822_tdma_g2l_bf16_matrix_copy( + ctx_t *ctx, + const bmk1822_tdma_tg2l_matrix_copy_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + return tdma_tg2l_bf16_matrix_copy(ctx, p, src_addr); +} + +bmk1822_op_t * bmk1822_tdma_g2l_matrix_copy_row_col_transposed( + ctx_t *ctx, + const bmk1822_tdma_tg2l_matrix_copy_row_col_transposed_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + return tdma_tg2l_matrix_copy_row_col_transposed(ctx, p, src_addr); +} + +bmk1822_op_t * bmk1822_tdma_g2l_general_copy( + ctx_t *ctx, + const bmk1822_tdma_tg2l_general_copy_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src_address); + return tdma_tg2l_general_copy(ctx, p, src_addr); +} + +bmk1822_op_t * bmk1822_tdma_g2l_bf16_general_copy( + ctx_t *ctx, + const bmk1822_tdma_tg2l_bf16_general_copy_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src_address); + return tdma_tg2l_bf16_general_copy(ctx, p, src_addr); +} +/* + * Direction: TG2TG + */ +static bmk1822_op_t * bmk1822_gdma_copy_gmem( + bmk1822_context_t *ctx, + const bmk1822_tdma_tg2tg_tensor_copy_param_t *p, + uint8_t u8_trans_fmt) +{ + tdma_reg_t reg; + + reset_tdma_reg(®); + + uint64_t u64_src_addr; + uint64_t u64_dst_addr; + + reg.vld = 1; + reg.trans_dir = 2; // 0:tg2l, 1:l2tg, 2:g2g, 3:l2l + reg.trans_fmt = u8_trans_fmt; // 1:general copy, 2:tensor copy + reg.sys_dtype = 0; // + reg.spec_func = 0; 
// + + u64_src_addr = absolute_gmem_addr(p->src->start_address); + u64_dst_addr = absolute_gmem_addr(p->dst->start_address); + fill_src_addr(®, u64_src_addr); + fill_dst_addr(®, u64_dst_addr); + + reg.src_base_reg_sel = p->src->base_reg_index; + reg.dst_base_reg_sel = p->dst->base_reg_index; + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p-> dst->stride.h; + + return emit_tdma_cmdbuf( ctx, ®); +} + +static bmk1822_op_t * bmk1822_gdma_bf16_copy_gmem( + bmk1822_context_t *ctx, + const bmk1822_tdma_tg2tg_tensor_copy_param_t *p, + uint8_t u8_trans_fmt) +{ + tdma_reg_t reg; + + reset_tdma_reg(®); + + uint64_t u64_src_addr; + uint64_t u64_dst_addr; + + reg.vld = 1; + reg.trans_dir = 2; // 0:tg2l, 1:l2tg, 2:g2g, 3:l2l + reg.trans_fmt = u8_trans_fmt; // 1:general copy, 2:tensor copy + reg.sys_dtype = 0; // + reg.spec_func = 0; // + ASSERT(p->src->fmt == p->dst->fmt); + + reg.dst_fmt = (p->dst->fmt == FMT_BF16) ? 2 : 1; + reg.src_fmt = (p->src->fmt == FMT_BF16) ? 2 : 1; + + u64_src_addr = absolute_gmem_addr(p->src->start_address); + u64_dst_addr = absolute_gmem_addr(p->dst->start_address); + fill_src_addr(®, u64_src_addr); + fill_dst_addr(®, u64_dst_addr); + + reg.src_base_reg_sel = p->src->base_reg_index; + reg.dst_base_reg_sel = p->dst->base_reg_index; + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p-> dst->stride.h; + + + return emit_tdma_cmdbuf( ctx, ®); +} + +/* + * Direction: G2G + */ +bmk1822_op_t * bmk1822_tdma_tg2tg_tensor_copy( + bmk1822_context_t *ctx, + const bmk1822_tdma_tg2tg_tensor_copy_param_t *p) +{ + bmk1822_gdma_copy_gmem(ctx, p, 2); + return NULL; +} + +bmk1822_op_t * bmk1822_tdma_tg2tg_bf16_tensor_copy( + bmk1822_context_t *ctx, + const bmk1822_tdma_tg2tg_tensor_copy_param_t *p) +{ + bmk1822_gdma_bf16_copy_gmem(ctx, p, 2); + return NULL; +} + +bmk1822_op_t * bmk1822_tdma_tg2tg_general_copy( + bmk1822_context_t *ctx, + const bmk1822_tdma_tg2tg_tensor_copy_param_t *p) +{ + bmk1822_gdma_copy_gmem(ctx, p, 1); + return NULL; +} + +bmk1822_op_t * bmk1822_tdma_tg2tg_bf16_general_copy( + bmk1822_context_t *ctx, + const bmk1822_tdma_tg2tg_tensor_copy_param_t *p) +{ + bmk1822_gdma_bf16_copy_gmem(ctx, p, 1); + return NULL; +} diff --git a/cvikernel/src/bm1822/tiu_average_pooling.c b/cvikernel/src/bm1822/tiu_average_pooling.c new file mode 100644 index 000000000..0e700f878 --- /dev/null +++ b/cvikernel/src/bm1822/tiu_average_pooling.c @@ -0,0 +1,90 @@ +#include "kernel_1822.h" +#include + +bmk1822_op_t * bmk1822_tiu_average_pooling( + ctx_t *ctx, + const bmk1822_tiu_average_pooling_param_t *p) +{ + int bf16_enable = (p->ifmap->fmt == FMT_BF16) ? 
1 : 0; + + ASSERT(p->ifmap->shape.n == p->ofmap->shape.n); + ASSERT(p->ifmap->shape.c == p->ofmap->shape.c); + ASSERT(p->stride_h < 32 && p->stride_h > 0); + ASSERT(p->stride_w < 32 && p->stride_w > 0); + ASSERT(p->pad_top < 16); + ASSERT(p->pad_bottom < 16); + ASSERT(p->pad_left < 16); + ASSERT(p->pad_right < 16); + ASSERT(p->ins_h < 15); + ASSERT(p->ins_last_h < 15); + ASSERT(p->ins_w < 15); + ASSERT(p->ins_last_w < 15); + + check_tiu_tensor_2(p->ifmap, p->ofmap); + if (bf16_enable) { + assert_bf16_stride_type_0(ctx, p->ifmap); + assert_bf16_stride_type_0(ctx, p->ofmap); + } else { + assert_stride_type_0(ctx, p->ifmap); + assert_stride_type_0(ctx, p->ofmap); + } + + int opd0_sign = tensor_is_signed(p->ifmap); + + tiu_reg_t reg; + reset_tiu_reg(&reg); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B; + reg.tsk_eu_typ = 1; + reg.opt_shift_typ = opd0_sign; + reg.opt_res_shift = p->rshift_bits; + reg.opt_relu_typ = 0; /* hardware relu function not verified. */ + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 1: 0; + + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = opd0_sign; + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + reg.conv_opd0_x_ins0 = p->ins_w; + reg.conv_opd0_y_ins0 = p->ins_h; + reg.conv_opd0_x_ins0_last = p->ins_last_w; + reg.conv_opd0_y_ins0_last = p->ins_last_h; + + reg.opt_opd1_const = 1; + /* HW does not have a divide, so we need to calculate the averaged value here */ + if (bf16_enable) + reg.opd1_addr = + convert_fp32_bf16( + (float)(convert_bf16_fp32(p->avg_pooling_const) / (p->kh * p->kw))); + else + reg.opd1_addr = p->avg_pooling_const; + + reg.opd1_h = p->kh; + reg.opd1_w = p->kw; + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 1; + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + reg.opd0_ins_val = bf16_enable ? + (uint32_t)p->ins_fp : (uint32_t)p->ins_val; + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + return emit_tiu_cmdbuf(ctx, &reg); +} diff --git a/cvikernel/src/bm1822/tiu_convolution.c b/cvikernel/src/bm1822/tiu_convolution.c new file mode 100644 index 000000000..320e19617 --- /dev/null +++ b/cvikernel/src/bm1822/tiu_convolution.c @@ -0,0 +1,176 @@ +#include "kernel_1822.h" + +typedef bmk1822_tiu_convolution_param_t param_t; + +static int can_do_double_conv(ctx_t *ctx, const param_t *p) +{ + uint8_t bf16_enable = (p->ifmap->fmt == FMT_BF16) ? 1 : 0; + + if (((p->ifmap->start_address % ctx->chip_info.lmem_size) % 2 == 0 && + p->ifmap->shape.c % 2 == 0 && + p->ifmap->shape.c >= 4 && + p->weight->start_address % 2 == 0) && !bf16_enable) + return 1; + + return 0; +} + +static void check_conv_param(ctx_t *ctx, const param_t *p) +{ + uint32_t eu_num = ctx->chip_info.eu_num; + uint8_t bf16_enable = (p->ifmap->fmt == FMT_BF16) ?
1 : 0; + + check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight); + if (bf16_enable) { + assert_bf16_stride_type_0(ctx, p->ifmap); + } else { + assert_stride_type_0(ctx, p->ifmap); + } + //assert_stride_type_1(ctx, p->weight); + if (p->bias) { + check_tiu_tensor(p->bias); + if (bf16_enable) + assert_bf16_stride_type_2(ctx, p->bias); + else + assert_stride_type_2(ctx, p->bias); + } + + // n stride must align 16B + ASSERT((p->ofmap->stride.n % 16) == 0); + + ASSERT(p->ifmap->start_address % eu_num == 0); + ASSERT(p->ofmap->start_address % eu_num == 0); + ASSERT(p->ifmap->shape.n == p->ofmap->shape.n); + ASSERT(!(p->ifmap->shape.h == 1 && p->ins_h > 0)); + ASSERT(p->weight->shape.n == p->ifmap->shape.c); + ASSERT(p->weight->shape.c == p->ofmap->shape.c); + if (can_do_double_conv(ctx, p)) { + uint32_t lmem_i = p->ifmap->start_address % ctx->chip_info.lmem_size; + ASSERT(lmem_i % 2 == 0); + ASSERT(p->ifmap->shape.c % 2 == 0); + ASSERT(p->ifmap->shape.c >= 4); /* Otherwise performance will suffer */ + ASSERT(p->weight->start_address % 2 == 0); + } + if(p->ps32_mode & 0x2) + { + ASSERT(!p->relu_enable); + ASSERT(!p->bias); + ASSERT(!p->rshift_bits); + + ASSERT(p->cmd_pre_exe <= 1); + } + ASSERT(p->stride_h < 32 && p->stride_h > 0); + ASSERT(p->stride_w < 32 && p->stride_w > 0); + ASSERT(p->pad_top < 16); + ASSERT(p->pad_bottom < 16); + ASSERT(p->pad_left < 16); + ASSERT(p->pad_right < 16); + ASSERT(p->ins_h < 15); + ASSERT(p->ins_last_h < 15); + ASSERT(p->ins_w < 15); + ASSERT(p->ins_last_w < 15); + ASSERT(p->dilation_h >= 1); + ASSERT(p->dilation_w >= 1); + ASSERT(p->relu_enable == 0 || p->relu_enable == 1); +} + +bmk1822_op_t * bmk1822_tiu_convolution(ctx_t *ctx, const param_t *p) +{ + check_conv_param(ctx, p); + + uint32_t npu_num = ctx->chip_info.npu_num; + int opd0_sign = tensor_is_signed(p->ifmap); + int opd1_sign = tensor_is_signed(p->weight); + int opd2_sign = p->bias? tensor_is_signed(p->bias): 1; + int arith_shift = opd0_sign || opd1_sign || opd2_sign; + int bf16_enable = (p->ifmap->fmt == FMT_BF16) ? 
1 : 0; + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_CONV_FIX8B; + reg.opt_shift_typ = arith_shift; + reg.opt_res_shift = p->rshift_bits; + reg.opt_relu_typ = !!(p->relu_enable); + reg.tsk_opd_num = 2; + + reg.opd_typ = bf16_enable; + + /*always automatically enabel double conv at those situations*/ + if (can_do_double_conv(ctx, p)) + reg.double_conv = 1; + + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = tensor_is_signed(p->ofmap); + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + reg.res0_n_str = p->ofmap->stride.n; + reg.res0_c_str = p->ofmap->stride.c; + reg.res0_h_str = p->ofmap->stride.h; + reg.res0_w_str = p->ofmap->stride.w; + reg.short_res0_str = 3; // Manual instead of h/w + reg.ps32_md = p->ps32_mode; + if (p->ps32_mode > 0) + reg.res0_b_str = p->ofmap->shape.n * p->ofmap->stride.n; + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.short_opd0_str = 0; + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + reg.conv_opd0_x_ins0 = p->ins_w; + reg.conv_opd0_y_ins0 = p->ins_h; + reg.conv_opd0_x_ins0_last = p->ins_last_w; + reg.conv_opd0_y_ins0_last = p->ins_last_h; + + reg.opd1_addr = p->weight->start_address; + reg.opt_opd1_sign = opd1_sign; + reg.opt_opd1_seg = 1; + reg.opt_opd1_const = p->w_is_const; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + reg.short_opd1_str = 1; + reg.conv_opd1_x_ins0 = p->dilation_w - 1; + reg.conv_opd1_y_ins0 = p->dilation_h - 1; + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + reg.opd0_ins_val = bf16_enable ? + (uint32_t)p->ins_fp : (uint32_t)p->ins_val; + + if (p->bias) { + ASSERT(p->bias->shape.n == 2); + ASSERT(p->bias->shape.c == p->ofmap->shape.c); + ASSERT(p->bias->shape.h == 1); + ASSERT(p->bias->shape.w == 1); + + reg.tsk_opd_num = 3; + reg.opt_opd2_sign = opd2_sign; + reg.opt_opd2_seg = 0; + reg.opd2_addr = p->bias->start_address; + reg.opd2_n = 1; + reg.opd2_c = p->bias->shape.c; + reg.opd2_h = 1; + reg.opd2_w = 1; + reg.short_opd2_str = 2; + reg.opd2_b_str = ceiling_func(p->bias->shape.c, npu_num) * (bf16_enable ? 
2 : 1); + } + + reg.layer_info = p->layer_id; + + reg.cmd_pre_exe_typ = p->cmd_pre_exe_typ; + reg.cmd_pre_exe = p->cmd_pre_exe; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1822/tiu_convolution_qdm.c b/cvikernel/src/bm1822/tiu_convolution_qdm.c new file mode 100644 index 000000000..c37c3b5e1 --- /dev/null +++ b/cvikernel/src/bm1822/tiu_convolution_qdm.c @@ -0,0 +1,166 @@ +#include "kernel_1822.h" + +typedef bmk1822_tiu_convolution_qdm_param_t param_t; + +static int can_do_double_conv(ctx_t *ctx, const param_t *p) +{ + if ((p->ifmap->start_address % ctx->chip_info.lmem_size) % 2 == 0 && + p->ifmap->shape.c % 2 == 0 && + p->ifmap->shape.c >= 4 && + p->weight->start_address % 2 == 0) + return 1; + + return 0; +} + +static void check_conv_param(ctx_t *ctx, const param_t *p) +{ + uint32_t eu_num = ctx->chip_info.eu_num; + + check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight); + assert_stride_type_0(ctx, p->ifmap); + + ASSERT((p->ofmap->stride.n % eu_num) == 0); + ASSERT(p->ifmap->start_address % eu_num == 0); + ASSERT(p->ofmap->start_address % eu_num == 0); + ASSERT(p->ifmap->shape.n == p->ofmap->shape.n); + ASSERT(!(p->ifmap->shape.h == 1 && p->ins_h > 0)); + ASSERT(p->weight->shape.n == p->ifmap->shape.c); + ASSERT(p->weight->shape.c == p->ofmap->shape.c); + + if (p->chl_quan_param) { + check_tiu_tensor(p->chl_quan_param); + assert_stride_type_2(ctx, p->chl_quan_param); + ASSERT(p->chl_quan_param->start_address % eu_num == 0); + } + if (can_do_double_conv(ctx, p)) { + uint32_t lmem_i = p->ifmap->start_address % ctx->chip_info.lmem_size; + ASSERT(lmem_i % 2 == 0); + ASSERT(p->ifmap->shape.c % 2 == 0); + ASSERT(p->ifmap->shape.c >= 4); /* Otherwise performance will suffer */ + ASSERT(p->weight->start_address % 2 == 0); + } + if(p->ps32_mode & 0x2) + { + ASSERT(!p->relu_enable); + ASSERT(!p->has_bias); + + ASSERT(p->cmd_pre_exe <= 1); + } + ASSERT(p->stride_h < 32 && p->stride_h > 0); + ASSERT(p->stride_w < 32 && p->stride_w > 0); + ASSERT(p->pad_top < 16); + ASSERT(p->pad_bottom < 16); + ASSERT(p->pad_left < 16); + ASSERT(p->pad_right < 16); + ASSERT(p->ins_h < 15); + ASSERT(p->ins_last_h < 15); + ASSERT(p->ins_w < 15); + ASSERT(p->ins_last_w < 15); + ASSERT(p->dilation_h >= 1); + ASSERT(p->dilation_w >= 1); + ASSERT(p->relu_enable == 0 || p->relu_enable == 1); +} + +bmk1822_op_t * bmk1822_tiu_convolution_qdm(ctx_t *ctx, const param_t *p) +{ + check_conv_param(ctx, p); + + int opd0_sign = tensor_is_signed(p->ifmap); + int opd1_sign = tensor_is_signed(p->weight); + int arith_shift = opd0_sign || opd1_sign; + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_CONV_FIX8B; + reg.opt_shift_typ = arith_shift; + reg.opt_relu_typ = !!(p->relu_enable); + reg.tsk_opd_num = 2; + + /*always automatically enabel double conv at those situations*/ + if (can_do_double_conv(ctx, p)) + reg.double_conv = 1; + + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = tensor_is_signed(p->ofmap); + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + reg.res0_n_str = p->ofmap->stride.n; + reg.res0_c_str = p->ofmap->stride.c; + reg.res0_h_str = p->ofmap->stride.h; + reg.res0_w_str = p->ofmap->stride.w; + reg.short_res0_str = 3; // Manual instead of h/w + reg.ps32_md = p->ps32_mode; + if (p->ps32_mode > 0) { + reg.res0_b_str = p->ofmap->shape.n * p->ofmap->stride.n; + + // Per-channel parameter does not has right shift (default is 10). + // Set zero. 
+ reg.opt_res_shift = 0; + } + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.opd0_ins_val = (uint32_t)p->ins_val; + reg.short_opd0_str = 0; + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + reg.conv_opd0_x_ins0 = p->ins_w; + reg.conv_opd0_y_ins0 = p->ins_h; + reg.conv_opd0_x_ins0_last = p->ins_last_w; + reg.conv_opd0_y_ins0_last = p->ins_last_h; + + reg.opd1_addr = p->weight->start_address; + reg.opt_opd1_sign = opd1_sign; + reg.opt_opd1_seg = 1; + reg.opt_opd1_const = p->w_is_const; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + reg.short_opd1_str = 1; + reg.conv_opd1_x_ins0 = p->dilation_w - 1; + reg.conv_opd1_y_ins0 = p->dilation_h - 1; + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + + if (p->chl_quan_param) { + ASSERT(p->chl_quan_param->shape.n == 1); + ASSERT(p->chl_quan_param->shape.c == p->ofmap->shape.c); + ASSERT(p->chl_quan_param->shape.h == 1); + ASSERT(p->chl_quan_param->shape.w == 1); + reg.opt_chl_quan = 1; + reg.opt_res_shift = 0; // useless + reg.opd2_addr = p->chl_quan_param->start_address; + reg.opd2_n = p->chl_quan_param->shape.n; + reg.opd2_c = p->chl_quan_param->shape.c; + reg.opd2_h = p->chl_quan_param->shape.h; + reg.opd2_w = p->chl_quan_param->shape.w; + } + reg.opt_opd2_seg = 1; // useless, force to 1 to skip b_stride check + reg.short_opd2_str = 2; // useless + reg.opd2_b_str = 0; // useless + + if (p->has_bias) { + reg.tsk_opd_num = 3; + reg.opt_opd2_sign = 1; + } + + reg.layer_info = p->layer_id; + + reg.cmd_pre_exe_typ = p->cmd_pre_exe_typ; + reg.cmd_pre_exe = p->cmd_pre_exe; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1822/tiu_depthwise_convolution.c b/cvikernel/src/bm1822/tiu_depthwise_convolution.c new file mode 100644 index 000000000..30d731153 --- /dev/null +++ b/cvikernel/src/bm1822/tiu_depthwise_convolution.c @@ -0,0 +1,152 @@ +#include "kernel_1822.h" + +bmk1822_op_t * bmk1822_tiu_depthwise_convolution( + ctx_t *ctx, + const bmk1822_tiu_depthwise_convolution_param_t *p) +{ + int bf16_enable = (p->ifmap->fmt == FMT_BF16) ? 1 : 0; + + bool isMulConst = (p->weight_is_const == 1) ? 
1 : 0; + + if(isMulConst) { + check_tiu_tensor_2(p->ifmap, p->ofmap); + } else { + check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight); + } + if (bf16_enable) { + assert_bf16_stride_type_0(ctx, p->ifmap); + if(!isMulConst) + assert_bf16_stride_type_0(ctx, p->weight); + if (p->bias) { + check_tiu_tensor(p->bias); + assert_bf16_stride_type_2(ctx, p->bias); + } + } else { + assert_stride_type_0(ctx, p->ifmap); + if(!isMulConst) + assert_stride_type_0(ctx, p->weight); + if (p->bias) { + check_tiu_tensor(p->bias); + assert_stride_type_2(ctx, p->bias); + } + } + + // n stride must align 16B + ASSERT((p->ofmap->stride.n % 16) == 0); + + ASSERT(p->ifmap->shape.n == p->ofmap->shape.n); + ASSERT(p->ifmap->shape.c == p->ofmap->shape.c); + if(!isMulConst){ + ASSERT(p->ifmap->shape.c == p->weight->shape.c); + ASSERT(p->weight->shape.n == 1); + } + ASSERT(p->relu_enable == 0 || p->relu_enable == 1); + ASSERT(p->stride_h < 32 && p->stride_h > 0); + ASSERT(p->stride_w < 32 && p->stride_w > 0); + ASSERT(p->pad_top < 16); + ASSERT(p->pad_bottom < 16); + ASSERT(p->pad_left < 16); + ASSERT(p->pad_right < 16); + ASSERT(p->ins_h < 15); + ASSERT(p->ins_last_h < 15); + ASSERT(p->ins_w < 15); + ASSERT(p->ins_last_w < 15); + ASSERT(p->dilation_h >= 1); + ASSERT(p->dilation_w >= 1); + + int opd0_sign = tensor_is_signed(p->ifmap); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B; + reg.tsk_eu_typ = 2; + reg.opt_relu_typ = p->relu_enable; + reg.opt_shift_typ = 1; + reg.opt_res_shift = p->rshift_bits; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 1: 0; + + int res0_sign = tensor_is_signed(p->ofmap); + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = res0_sign; + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + reg.res0_n_str = p->ofmap->stride.n; + reg.res0_c_str = p->ofmap->stride.c; + reg.res0_h_str = p->ofmap->stride.h; + reg.res0_w_str = p->ofmap->stride.w; + reg.short_res0_str = 3; // Manual instead of h/w + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.opd0_n_str = p->ifmap->stride.n; + reg.opd0_c_str = p->ifmap->stride.c; + reg.opd0_h_str = p->ifmap->stride.h; + reg.opd0_w_str = p->ifmap->stride.w; + reg.short_opd0_str = 3; // Manual instead of h/w + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + reg.conv_opd0_x_ins0 = p->ins_w; + reg.conv_opd0_y_ins0 = p->ins_h; + reg.conv_opd0_x_ins0_last = p->ins_last_w; + reg.conv_opd0_y_ins0_last = p->ins_last_h; + + reg.opt_opd1_sign = 1; + reg.opt_opd1_seg = 1; + reg.conv_opd1_x_ins0 = p->dilation_w - 1; + reg.conv_opd1_y_ins0 = p->dilation_h - 1; + if (isMulConst) { + reg.opt_opd1_const = 1; + reg.opt_opd1_sign = p->weight_const.is_signed; + reg.opd1_addr = p->weight_const.val; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + } else { + reg.opd1_addr = p->weight->start_address; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + } + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + 
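// opd0_ins_val: use p->ins_fp when the input is BF16, otherwise p->ins_val (presumably the fill value for elements inserted via ins_h/ins_w) +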
reg.opd0_ins_val = bf16_enable ? + (uint32_t)p->ins_fp : (uint32_t)p->ins_val; + + if (p->bias) { + ASSERT(p->bias->shape.n == 2); + ASSERT(p->bias->shape.c == p->ofmap->shape.c); + ASSERT(p->bias->shape.h == 1); + ASSERT(p->bias->shape.w == 1); + + reg.tsk_opd_num = 3; + reg.opd2_addr = p->bias->start_address; + reg.opt_opd2_seg = 0; + reg.opd2_n = 1; + reg.opd2_c = p->bias->shape.c; + reg.opd2_h = 1; + reg.opd2_w = 1; + reg.short_opd2_str = 2; + reg.opd2_b_str = p->bias->stride.n; + } + + reg.layer_info = p->layer_id; + + reg.cmd_pre_exe_typ = p->cmd_pre_exe_typ; + reg.cmd_pre_exe = p->cmd_pre_exe; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1822/tiu_depthwise_convolution_qdm.c b/cvikernel/src/bm1822/tiu_depthwise_convolution_qdm.c new file mode 100644 index 000000000..568fbfaf5 --- /dev/null +++ b/cvikernel/src/bm1822/tiu_depthwise_convolution_qdm.c @@ -0,0 +1,142 @@ +#include "kernel_1822.h" + +bmk1822_op_t * bmk1822_tiu_depthwise_convolution_qdm( + ctx_t *ctx, + const bmk1822_tiu_depthwise_convolution_qdm_param_t *p) +{ + uint32_t eu_num = ctx->chip_info.eu_num; + + bool isMulConst = (p->weight_is_const == 1) ? 1 : 0; + + if(isMulConst) { + check_tiu_tensor_2(p->ifmap, p->ofmap); + } else { + check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight); + } + assert_stride_type_0(ctx, p->ifmap); + if(!isMulConst){ + assert_stride_type_0(ctx, p->weight); + } + check_tiu_tensor(p->chl_quan_param); + assert_stride_type_2(ctx, p->chl_quan_param); + + ASSERT((p->ofmap->stride.n % eu_num) == 0); + ASSERT(p->chl_quan_param->start_address %eu_num == 0); + ASSERT(p->ifmap->shape.n == p->ofmap->shape.n); + ASSERT(p->ifmap->shape.c == p->ofmap->shape.c); + if (!isMulConst) { + ASSERT(p->ifmap->shape.c == p->weight->shape.c); + ASSERT(p->weight->shape.n == 1); + } + ASSERT(p->relu_enable == 0 || p->relu_enable == 1); + ASSERT(p->stride_h < 32 && p->stride_h > 0); + ASSERT(p->stride_w < 32 && p->stride_w > 0); + ASSERT(p->pad_top < 16); + ASSERT(p->pad_bottom < 16); + ASSERT(p->pad_left < 16); + ASSERT(p->pad_right < 16); + ASSERT(p->ins_h < 15); + ASSERT(p->ins_last_h < 15); + ASSERT(p->ins_w < 15); + ASSERT(p->ins_last_w < 15); + ASSERT(p->dilation_h >= 1); + ASSERT(p->dilation_w >= 1); + + int opd0_sign = tensor_is_signed(p->ifmap); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B; + reg.tsk_eu_typ = 2; + reg.opt_relu_typ = p->relu_enable; + reg.opt_shift_typ = 1; + reg.tsk_opd_num = 2; + + int res0_sign = tensor_is_signed(p->ofmap); + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = res0_sign; + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + reg.res0_n_str = p->ofmap->stride.n; + reg.res0_c_str = p->ofmap->stride.c; + reg.res0_h_str = p->ofmap->stride.h; + reg.res0_w_str = p->ofmap->stride.w; + reg.short_res0_str = 3; // Manual instead of h/w + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.opd0_n_str = p->ifmap->stride.n; + reg.opd0_c_str = p->ifmap->stride.c; + reg.opd0_h_str = p->ifmap->stride.h; + reg.opd0_w_str = p->ifmap->stride.w; + reg.opd0_ins_val = (uint32_t)p->ins_val; + reg.short_opd0_str = 3; // Manual instead of h/w + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + 
reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + reg.conv_opd0_x_ins0 = p->ins_w; + reg.conv_opd0_y_ins0 = p->ins_h; + reg.conv_opd0_x_ins0_last = p->ins_last_w; + reg.conv_opd0_y_ins0_last = p->ins_last_h; + + reg.opt_opd1_sign = 1; + reg.opt_opd1_seg = 1; + reg.conv_opd1_x_ins0 = p->dilation_w - 1; + reg.conv_opd1_y_ins0 = p->dilation_h - 1; + if (isMulConst) { + reg.opt_opd1_const = 1; + reg.opt_opd1_sign = p->weight_const.is_signed; + reg.opd1_addr = p->weight_const.val; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + } else { + reg.opd1_addr = p->weight->start_address; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + } + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + reg.cmd_pre_exe_typ = p->cmd_pre_exe_typ; + reg.cmd_pre_exe = p->cmd_pre_exe; + + ASSERT(p->chl_quan_param->shape.n == 1); + ASSERT(p->chl_quan_param->shape.c == p->ofmap->shape.c); + ASSERT(p->chl_quan_param->shape.h == 1); + ASSERT(p->chl_quan_param->shape.w == 1); + reg.opt_chl_quan = 1; + reg.opt_res_shift = 0; // useless + reg.opd2_addr = p->chl_quan_param->start_address; + reg.opd2_n = p->chl_quan_param->shape.n; + reg.opd2_c = p->chl_quan_param->shape.c; + reg.opd2_h = p->chl_quan_param->shape.h; + reg.opd2_w = p->chl_quan_param->shape.w; + reg.opt_opd2_seg = 1; // useless, force to 1 to skip b_stride check + reg.short_opd2_str = 2; // useless + reg.opd2_b_str = 0; // useless + + if (p->has_bias) { + reg.tsk_opd_num = 3; + reg.opt_opd2_sign = 1; + } + + reg.layer_info = p->layer_id; + + reg.cmd_pre_exe_typ = p->cmd_pre_exe_typ; + reg.cmd_pre_exe = p->cmd_pre_exe; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1822/tiu_element_wise_add.c b/cvikernel/src/bm1822/tiu_element_wise_add.c new file mode 100644 index 000000000..ff90912b2 --- /dev/null +++ b/cvikernel/src/bm1822/tiu_element_wise_add.c @@ -0,0 +1,81 @@ +#include "kernel_1822.h" + +bmk1822_op_t * bmk1822_tiu_element_wise_add( + ctx_t *k, + const bmk1822_tiu_element_wise_add_param_t *p) +{ + int bf16_enable = (p->a_low->fmt == FMT_BF16) ? 1 : 0; + + if (bf16_enable) { + /*bf16 only support 16 bit*/ + ASSERT(!p->a_high); + ASSERT(!(p->b_high && !p->b_is_const)); + ASSERT(!p->res_high); + check_tiu_tensor(p->a_low); + check_tiu_tensor(p->res_low); + assert_same_shape(p->res_low, p->a_low); + if (!p->b_is_const) { + check_tiu_tensor(p->b_low); + assert_same_shape(p->res_low, p->b_low); + } + } else { + check_16bit_tiu_tensor(p->a_low, p->a_high); + check_tiu_tensor(p->res_low); + assert_same_shape(p->res_low, p->a_low); + if (!p->b_is_const) { + check_16bit_tiu_tensor(p->b_low, p->b_high); + assert_same_shape(p->res_low, p->b_low); + } + } + if (p->res_high) + check_16bit_tiu_tensor(p->res_low, p->res_high); + ASSERT(p->relu_enable == 0 || p->relu_enable == 1); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_ADD_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 
1: 0; + reg.opt_res_shift = 0; + reg.opt_relu_typ = p->relu_enable; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + int arith_shift = tensor_is_signed(p->res_low); + reg.opt_shift_typ = arith_shift; + reg.opt_res_shift = p->rshift_bits; + + reg.opd0_addr = p->a_low->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a_low); + reg.opt_opd0_seg = (p->a_high == NULL); + reg.opd0_b_str = bf16_enable ? 0 : (p->a_high->start_address - p->a_low->start_address); + fill_opd0_stride(®, &p->a_low->stride); + + reg.opt_opd1_seg = bf16_enable ? 1 : 0; //(p->b_high == NULL); b_high is the same as b_val + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opt_opd1_sign = !!p->b_const.is_signed; + reg.opd1_addr = p->b_const.val; + } else { + reg.opt_opd1_const = 0; + reg.opt_opd1_sign = tensor_is_signed(p->b_low); + reg.opd1_addr = p->b_low->start_address; + reg.opd1_b_str = bf16_enable ? 0 : (p->b_high->start_address - p->b_low->start_address); + fill_opd1_stride(®, &p->b_low->stride); + } + + reg.res0_addr = p->res_low->start_address; + reg.opt_res0_sign = tensor_is_signed(p->res_low); + reg.opt_res0_seg = (p->res_high == NULL); + fill_res0_stride(®, &p->res_low->stride); + if (p->res_high) + reg.res0_b_str = p->res_high->start_address - p->res_low->start_address; + if (p->relu_enable) + ASSERT(reg.opt_res0_seg); + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + return emit_tiu_cmdbuf(k, ®); +} diff --git a/cvikernel/src/bm1822/tiu_element_wise_and.c b/cvikernel/src/bm1822/tiu_element_wise_and.c new file mode 100644 index 000000000..a2ab385c2 --- /dev/null +++ b/cvikernel/src/bm1822/tiu_element_wise_and.c @@ -0,0 +1,100 @@ +#include "kernel_1822.h" + +bmk1822_op_t * bmk1822_tiu_element_wise_and_int8( + ctx_t *ctx, + const bmk1822_tiu_element_wise_and_int8_param_t *p) +{ + check_tiu_tensor_3(p->res, p->a, p->b); + assert_same_shape_3(p->res, p->a, p->b); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_AND_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 1; + fill_opd0_stride(®, &p->a->stride); + + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 1; + fill_opd1_stride(®, &p->b->stride); + + reg.res0_addr = p->res->start_address; + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 1; + fill_res0_stride(®, &p->res->stride); + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} + +bmk1822_op_t * bmk1822_tiu_element_wise_and_int16( + ctx_t *ctx, + const bmk1822_tiu_element_wise_and_int16_param_t *p) +{ + check_16bit_tiu_tensor(p->a_low, p->a_high); + check_16bit_tiu_tensor(p->b_low, p->b_high); + check_16bit_tiu_tensor(p->res_low, p->res_high); + assert_same_shape_3(p->res_low, p->a_low, p->b_low); + + int res_high_addr = p->res_high->start_address; + int res_low_addr = p->res_low->start_address; + ASSERT(res_high_addr > res_low_addr); + int res_b_stride = res_high_addr - res_low_addr; + + int a_high_addr = p->a_high->start_address; + int a_low_addr = p->a_low->start_address; + ASSERT(a_high_addr > a_low_addr); + int a_b_stride = a_high_addr - a_low_addr; + + int b_high_addr = p->b_high->start_address; + int b_low_addr = p->b_low->start_address; + ASSERT(b_high_addr > b_low_addr); 
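+ // the 16-bit operands are stored as separate low/high 8-bit tensors; the b-stride below is the address gap between the two halves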
+ int b_b_stride = b_high_addr - b_low_addr; + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_AND_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = a_low_addr; + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 0; + reg.opd0_b_str = a_b_stride; + fill_opd0_stride(®, &p->a_low->stride); + + reg.opd1_addr = b_low_addr; + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 0; + reg.opd1_b_str = b_b_stride; + fill_opd1_stride(®, &p->b_low->stride); + + reg.res0_addr = res_low_addr; + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 0; + reg.res0_b_str = res_b_stride; + fill_res0_stride(®, &p->res_low->stride); + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1822/tiu_element_wise_copy.c b/cvikernel/src/bm1822/tiu_element_wise_copy.c new file mode 100644 index 000000000..8ba186e7b --- /dev/null +++ b/cvikernel/src/bm1822/tiu_element_wise_copy.c @@ -0,0 +1,42 @@ +#include "kernel_1822.h" + +bmk1822_op_t * bmk1822_tiu_element_wise_copy( + ctx_t *ctx, + const bmk1822_tiu_element_wise_copy_param_t *p) +{ + int bf16_enable = (p->src->fmt == FMT_BF16) ? 1 : 0; + + check_tiu_tensor_2(p->dst, p->src); + assert_same_shape(p->dst, p->src); + assert_stride_range(p->dst->stride); + assert_stride_range(p->src->stride); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_COPY_FIX8B; + reg.tsk_opd_num = 1; + reg.opd_typ = bf16_enable ? 1: 0; + reg.opt_res_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->dst->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->src->start_address; + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 1; + fill_opd0_stride(®, &p->src->stride); + + reg.res0_addr = p->dst->start_address; + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 1; + fill_res0_stride(®, &p->dst->stride); + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1822/tiu_element_wise_ge.c b/cvikernel/src/bm1822/tiu_element_wise_ge.c new file mode 100644 index 000000000..97dfa2943 --- /dev/null +++ b/cvikernel/src/bm1822/tiu_element_wise_ge.c @@ -0,0 +1,110 @@ +#include "kernel_1822.h" + +bmk1822_op_t * bmk1822_tiu_element_wise_ge( + ctx_t *ctx, + const bmk1822_tiu_element_wise_ge_param_t *p) +{ + check_tiu_tensor_2(p->ge, p->a); + assert_same_shape(p->ge, p->a); + if (p->b_is_const) { + if (tensor_is_signed(p->a)) + ASSERT(p->b_const.is_signed); + else + ASSERT(!p->b_const.is_signed); + } else { + check_tiu_tensor(p->b); + assert_same_shape(p->ge, p->b); + ASSERT(p->a->fmt == p->b->fmt); + } + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_GE_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = p->b_const.val; + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, 
&p->b->stride); + } + + reg.res0_addr = p->ge->start_address; + reg.opt_res0_sign = tensor_is_signed(p->ge); + fill_res0_stride(®, &p->ge->stride); + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} + +bmk1822_op_t * bmk1822_tiu_bf16_element_wise_ge( + ctx_t *ctx, + const bmk1822_tiu_element_wise_ge_param_t *p) +{ + int bf16_enable = (p->a->fmt == FMT_BF16) ? 1 : 0; + + check_tiu_tensor_2(p->ge, p->a); + assert_same_shape(p->ge, p->a); + + if (p->b_is_const && !bf16_enable) { + if (tensor_is_signed(p->a)) + ASSERT(p->b_const.is_signed); + else + ASSERT(!p->b_const.is_signed); + } else if (!p->b_is_const) { + check_tiu_tensor(p->b); + assert_same_shape(p->ge, p->b); + ASSERT(p->a->fmt == p->b->fmt); + } + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_GE_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 1: 0; + reg.opt_res_shift = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = bf16_enable ? p->b_const.val : (p->b_const.val & 0xFF); + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->ge->start_address; + reg.opt_res0_sign = tensor_is_signed(p->ge); + fill_res0_stride(®, &p->ge->stride); + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1822/tiu_element_wise_mac.c b/cvikernel/src/bm1822/tiu_element_wise_mac.c new file mode 100644 index 000000000..5c06975ec --- /dev/null +++ b/cvikernel/src/bm1822/tiu_element_wise_mac.c @@ -0,0 +1,68 @@ +#include "kernel_1822.h" + +bmk1822_op_t * bmk1822_tiu_element_wise_mac( + ctx_t *ctx, + const bmk1822_tiu_element_wise_mac_param_t *p) +{ + int bf16_enable = (p->a->fmt == FMT_BF16) ? 1 : 0; + + check_tiu_tensor(p->a); + assert_same_shape(p->res_low, p->a); + if(!bf16_enable) { + check_16bit_tiu_tensor(p->res_low, p->res_high); + ASSERT(p->lshift_bits < 32); + ASSERT(p->rshift_bits < 16); + } + if (!p->b_is_const) { + check_tiu_tensor(p->b); + assert_same_shape(p->res_low, p->b); + } + ASSERT(p->relu_enable == 0 || p->relu_enable == 1); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_MAC_FIX8B; + reg.opt_res_add = 1; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 1: 0; + reg.opt_left_shift = p->lshift_bits; + reg.opt_relu_typ = p->relu_enable; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + int arith_shift = tensor_is_signed(p->res_low); + reg.opt_shift_typ = arith_shift; + reg.opt_res_shift = p->rshift_bits; + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = bf16_enable ? 
p->b_const.val : (p->b_const.val & 0xFF); + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->res_low->start_address; + reg.opt_res0_sign = tensor_is_signed(p->res_low); + reg.opt_res0_seg = bf16_enable ? 1 : !!p->res_is_int8; + fill_res0_stride(®, &p->res_low->stride); + reg.res0_b_str = bf16_enable ? 0 : (p->res_high->start_address - p->res_low->start_address); + + if (p->relu_enable) + ASSERT(reg.opt_res0_seg); + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1822/tiu_element_wise_max.c b/cvikernel/src/bm1822/tiu_element_wise_max.c new file mode 100644 index 000000000..7c910a541 --- /dev/null +++ b/cvikernel/src/bm1822/tiu_element_wise_max.c @@ -0,0 +1,56 @@ +#include "kernel_1822.h" + +bmk1822_op_t * bmk1822_tiu_element_wise_max( + ctx_t *ctx, + const bmk1822_tiu_element_wise_max_param_t *p) +{ + int bf16_enable = (p->a->fmt == FMT_BF16) ? 1 : 0; + + check_tiu_tensor_2(p->max, p->a); + assert_same_shape(p->max, p->a); + + if (p->b_is_const && !bf16_enable) { + if (tensor_is_signed(p->a)) + ASSERT(p->b_const.is_signed); + else + ASSERT(!p->b_const.is_signed); + } else if (!p->b_is_const) { + check_tiu_tensor(p->b); + assert_same_shape(p->max, p->b); + ASSERT(p->a->fmt == p->b->fmt); + } + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_MAX_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 1: 0; + reg.opt_res_shift = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = bf16_enable ? p->b_const.val : (p->b_const.val & 0xFF); + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->max->start_address; + reg.opt_res0_sign = tensor_is_signed(p->max); + fill_res0_stride(®, &p->max->stride); + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1822/tiu_element_wise_min.c b/cvikernel/src/bm1822/tiu_element_wise_min.c new file mode 100644 index 000000000..b558dcfb8 --- /dev/null +++ b/cvikernel/src/bm1822/tiu_element_wise_min.c @@ -0,0 +1,58 @@ +#include "kernel_1822.h" + +bmk1822_op_t * bmk1822_tiu_element_wise_min( + ctx_t *ctx, + const bmk1822_tiu_element_wise_min_param_t *p) +{ + int bf16_enable = (p->a->fmt == FMT_BF16) ? 1 : 0; + + check_tiu_tensor_2(p->min, p->a); + assert_same_shape(p->min, p->a); + if (p->b_is_const && !bf16_enable) { + if (tensor_is_signed(p->a)) + ASSERT(p->b_const.is_signed); + else + ASSERT(!p->b_const.is_signed); + } else if (!p->b_is_const) { + check_tiu_tensor(p->b); + assert_same_shape(p->min, p->b); + ASSERT(p->a->fmt == p->b->fmt); + } + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_MIN_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 
1: 0; + reg.opt_res_shift = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = bf16_enable ? p->b_const.val : (p->b_const.val & 0xFF); + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->min->start_address; + reg.opt_res0_sign = tensor_is_signed(p->min); + fill_res0_stride(®, &p->min->stride); + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1822/tiu_element_wise_mul.c b/cvikernel/src/bm1822/tiu_element_wise_mul.c new file mode 100644 index 000000000..55fb7f363 --- /dev/null +++ b/cvikernel/src/bm1822/tiu_element_wise_mul.c @@ -0,0 +1,67 @@ +#include "kernel_1822.h" + +bmk1822_op_t * bmk1822_tiu_element_wise_mul( + ctx_t *ctx, + const bmk1822_tiu_element_wise_mul_param_t *p) +{ + int bf16_enable = (p->res_low->fmt == FMT_BF16) ? 1 : 0; + + check_tiu_tensor_2(p->res_low, p->a); + assert_same_shape(p->res_low, p->a); + if (!p->b_is_const) { + check_tiu_tensor(p->b); + assert_same_shape(p->res_low, p->b); + } + if (p->res_high) + check_16bit_tiu_tensor(p->res_low, p->res_high); + ASSERT(p->relu_enable == 0 || p->relu_enable == 1); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_MUL_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 1: 0; + int arith_shift = tensor_is_signed(p->res_low); + reg.opt_shift_typ = arith_shift; + reg.opt_res_shift = p->rshift_bits; + reg.opt_relu_typ = p->relu_enable; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = bf16_enable ? 
p->b_const.val : (p->b_const.val & 0xFF); + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->res_low->start_address; + reg.opt_res0_sign = tensor_is_signed(p->res_low); + reg.opt_res0_seg = (p->res_high == NULL); + fill_res0_stride(®, &p->res_low->stride); + if (p->res_high) + reg.res0_b_str = (p->res_high->start_address - p->res_low->start_address); + if (p->relu_enable) + ASSERT(reg.opt_res0_seg); + + ASSERT(( + p->b_is_const || (!reg.opt_opd1_sign && !reg.opt_opd0_sign && !reg.opt_shift_typ) || + ((reg.opt_opd1_sign || reg.opt_opd0_sign) && reg.opt_shift_typ) + )); + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1822/tiu_element_wise_mul_qdm.c b/cvikernel/src/bm1822/tiu_element_wise_mul_qdm.c new file mode 100644 index 000000000..68da1b72f --- /dev/null +++ b/cvikernel/src/bm1822/tiu_element_wise_mul_qdm.c @@ -0,0 +1,67 @@ +#include "kernel_1822.h" + +bmk1822_op_t * bmk1822_tiu_element_wise_mul_qdm( + ctx_t *ctx, + const bmk1822_tiu_element_wise_mul_qdm_param_t *p) +{ + check_tiu_tensor_2(p->res_low, p->a); + assert_same_shape(p->res_low, p->a); + if (!p->b_is_const) { + check_tiu_tensor(p->b); + assert_same_shape(p->res_low, p->b); + } + if (p->res_high) + check_16bit_tiu_tensor(p->res_low, p->res_high); + ASSERT(p->relu_enable == 0 || p->relu_enable == 1); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_MUL_FIX8B; + reg.tsk_opd_num = 2; + int arith_shift = tensor_is_signed(p->res_low); + reg.opt_shift_typ = arith_shift; + reg.opt_res_shift = p->rshift_bits; + reg.opt_relu_typ = p->relu_enable; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = p->b_const.val; + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->res_low->start_address; + reg.opt_res0_sign = tensor_is_signed(p->res_low); + reg.opt_res0_seg = (p->res_high == NULL); + fill_res0_stride(®, &p->res_low->stride); + if (p->res_high) + reg.res0_b_str = p->res_high->start_address - p->res_low->start_address; + if (p->relu_enable) + ASSERT(reg.opt_res0_seg); + + ASSERT(( + (!reg.opt_opd1_sign && !reg.opt_opd0_sign && !reg.opt_shift_typ) || + ((reg.opt_opd1_sign || reg.opt_opd0_sign) && reg.opt_shift_typ) + )); + + reg.opt_chl_quan = 1; + reg.quan_m = p->multiplier; + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1822/tiu_element_wise_or.c b/cvikernel/src/bm1822/tiu_element_wise_or.c new file mode 100644 index 000000000..e7abed3f9 --- /dev/null +++ b/cvikernel/src/bm1822/tiu_element_wise_or.c @@ -0,0 +1,100 @@ +#include "kernel_1822.h" + +bmk1822_op_t * bmk1822_tiu_element_wise_or_int8( + ctx_t *ctx, + const bmk1822_tiu_element_wise_or_int8_param_t *p) +{ + check_tiu_tensor_3(p->res, p->a, p->b); + assert_same_shape_3(p->res, p->a, p->b); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = 
DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_OR_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 1; + fill_opd0_stride(®, &p->a->stride); + + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 1; + fill_opd1_stride(®, &p->b->stride); + + reg.res0_addr = p->res->start_address; + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 1; + fill_res0_stride(®, &p->res->stride); + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} + +bmk1822_op_t * bmk1822_tiu_element_wise_or_int16( + ctx_t *ctx, + const bmk1822_tiu_element_wise_or_int16_param_t *p) +{ + check_16bit_tiu_tensor(p->a_low, p->a_high); + check_16bit_tiu_tensor(p->b_low, p->b_high); + check_16bit_tiu_tensor(p->res_low, p->res_high); + assert_same_shape_3(p->res_low, p->a_low, p->b_low); + + int res_high_addr = p->res_high->start_address; + int res_low_addr = p->res_low->start_address; + ASSERT(res_high_addr > res_low_addr); + int res_b_stride = res_high_addr - res_low_addr; + + int a_high_addr = p->a_high->start_address; + int a_low_addr = p->a_low->start_address; + ASSERT(a_high_addr > a_low_addr); + int a_b_stride = a_high_addr - a_low_addr; + + int b_high_addr = p->b_high->start_address; + int b_low_addr = p->b_low->start_address; + ASSERT(b_high_addr > b_low_addr); + int b_b_stride = b_high_addr - b_low_addr; + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_OR_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = a_low_addr; + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 0; + reg.opd0_b_str = a_b_stride; + fill_opd0_stride(®, &p->a_low->stride); + + reg.opd1_addr = b_low_addr; + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 0; + reg.opd1_b_str = b_b_stride; + fill_opd1_stride(®, &p->b_low->stride); + + reg.res0_addr = res_low_addr; + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 0; + reg.res0_b_str = res_b_stride; + fill_res0_stride(®, &p->res_low->stride); + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1822/tiu_element_wise_shift.c b/cvikernel/src/bm1822/tiu_element_wise_shift.c new file mode 100644 index 000000000..0affb10fe --- /dev/null +++ b/cvikernel/src/bm1822/tiu_element_wise_shift.c @@ -0,0 +1,58 @@ +#include "kernel_1822.h" + +bmk1822_op_t * bmk1822_tiu_element_wise_arith_shift( + ctx_t *ctx, + const bmk1822_tiu_element_wise_arith_shift_param_t *p) +{ + check_16bit_tiu_tensor(p->a_low, p->a_high); + check_16bit_tiu_tensor(p->res_low, p->res_high); + check_tiu_tensor(p->bits); + assert_same_shape_3(p->res_low, p->a_low, p->bits); + ASSERT(tensor_is_signed(p->a_low)); + ASSERT(tensor_is_signed(p->bits)); + + int res_high_addr = p->res_high->start_address; + int res_low_addr = p->res_low->start_address; + ASSERT(res_high_addr > res_low_addr); + int res_b_stride = res_high_addr - res_low_addr; + + int a_high_addr = p->a_high->start_address; + int a_low_addr = p->a_low->start_address; + ASSERT(a_high_addr > a_low_addr); + int a_b_stride = a_high_addr - a_low_addr; + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + 
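/* The a operand and the result are 16-bit low/high pairs (seg = 0, with the pair distance carried in the *_b_str fields); p->bits supplies a signed per-element shift amount via opd1. */ +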
reg.tsk_eu_typ = TENSOR_SHIFT_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_rshift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = a_low_addr; + reg.opt_opd0_sign = 1; + reg.opt_opd0_seg = 0; + reg.opd0_b_str = a_b_stride; + fill_opd0_stride(®, &p->a_low->stride); + + reg.opd1_addr = p->bits->start_address; + reg.opt_opd1_sign = 1; + reg.opt_opd1_seg = 1; + fill_opd1_stride(®, &p->bits->stride); + + reg.res0_addr = res_low_addr; + reg.opt_res0_sign = 1; + reg.opt_res0_seg = 0; + reg.res0_b_str = res_b_stride; + fill_res0_stride(®, &p->res_low->stride); + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1822/tiu_element_wise_sub.c b/cvikernel/src/bm1822/tiu_element_wise_sub.c new file mode 100644 index 000000000..227333844 --- /dev/null +++ b/cvikernel/src/bm1822/tiu_element_wise_sub.c @@ -0,0 +1,68 @@ +#include "kernel_1822.h" + +bmk1822_op_t * bmk1822_tiu_element_wise_sub( + ctx_t *ctx, + const bmk1822_tiu_element_wise_sub_param_t *p) +{ + int bf16_enable = (p->a_low->fmt == FMT_BF16) ? 1 : 0; + + if (bf16_enable) { + /*bf16 only support 16 bit*/ + ASSERT(!p->a_high); + ASSERT(!p->b_high); + ASSERT(!p->res_high); + check_tiu_tensor(p->a_low); + check_tiu_tensor(p->b_low); + check_tiu_tensor(p->res_low); + assert_same_shape_3(p->res_low, p->a_low, p->b_low); + } else { + check_16bit_tiu_tensor(p->a_low, p->a_high); + check_16bit_tiu_tensor(p->b_low, p->b_high); + check_tiu_tensor(p->res_low); + assert_same_shape_3(p->res_low, p->a_low, p->b_low); + ASSERT(tensor_is_signed(p->res_low)); + } + if (p->res_high) + check_16bit_tiu_tensor(p->res_low, p->res_high); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_SUB_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 1: 0; + reg.opt_res_shift = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + int arith_shift = tensor_is_signed(p->res_low); + reg.opt_shift_typ = arith_shift; + reg.opt_res_shift = p->rshift_bits; + + reg.opd0_addr = p->a_low->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a_low); + reg.opt_opd0_seg = (p->a_high == NULL); + reg.opd0_b_str = bf16_enable ? 0 : (p->a_high->start_address - p->a_low->start_address); + fill_opd0_stride(®, &p->a_low->stride); + + reg.opd1_addr = p->b_low->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b_low);; + reg.opt_opd1_seg = (p->b_high == NULL); + reg.opd1_b_str = bf16_enable ? 0 : (p->b_high->start_address - p->b_low->start_address); + fill_opd1_stride(®, &p->b_low->stride); + + reg.res0_addr = p->res_low->start_address; + reg.opt_res0_sign = 1; + reg.opt_res0_seg = (p->res_high == NULL); + fill_res0_stride(®, &p->res_low->stride); + if (p->res_high) + reg.res0_b_str = bf16_enable ? 
0 : (p->res_high->start_address - p->res_low->start_address); + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1822/tiu_element_wise_xor.c b/cvikernel/src/bm1822/tiu_element_wise_xor.c new file mode 100644 index 000000000..ca9246775 --- /dev/null +++ b/cvikernel/src/bm1822/tiu_element_wise_xor.c @@ -0,0 +1,100 @@ +#include "kernel_1822.h" + +bmk1822_op_t * bmk1822_tiu_element_wise_xor_int8( + ctx_t *ctx, + const bmk1822_tiu_element_wise_xor_int8_param_t *p) +{ + check_tiu_tensor_3(p->res, p->a, p->b); + assert_same_shape_3(p->res, p->a, p->b); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_XOR_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 1; + fill_opd0_stride(®, &p->a->stride); + + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 1; + fill_opd1_stride(®, &p->b->stride); + + reg.res0_addr = p->res->start_address; + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 1; + fill_res0_stride(®, &p->res->stride); + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} + +bmk1822_op_t * bmk1822_tiu_element_wise_xor_int16( + ctx_t *ctx, + const bmk1822_tiu_element_wise_xor_int16_param_t *p) +{ + check_16bit_tiu_tensor(p->a_low, p->a_high); + check_16bit_tiu_tensor(p->b_low, p->b_high); + check_16bit_tiu_tensor(p->res_low, p->res_high); + assert_same_shape_3(p->res_low, p->a_low, p->b_low); + + int res_high_addr = p->res_high->start_address; + int res_low_addr = p->res_low->start_address; + ASSERT(res_high_addr > res_low_addr); + int res_b_stride = res_high_addr - res_low_addr; + + int a_high_addr = p->a_high->start_address; + int a_low_addr = p->a_low->start_address; + ASSERT(a_high_addr > a_low_addr); + int a_b_stride = a_high_addr - a_low_addr; + + int b_high_addr = p->b_high->start_address; + int b_low_addr = p->b_low->start_address; + ASSERT(b_high_addr > b_low_addr); + int b_b_stride = b_high_addr - b_low_addr; + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_XOR_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = a_low_addr; + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 0; + reg.opd0_b_str = a_b_stride; + fill_opd0_stride(®, &p->a_low->stride); + + reg.opd1_addr = b_low_addr; + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 0; + reg.opd1_b_str = b_b_stride; + fill_opd1_stride(®, &p->b_low->stride); + + reg.res0_addr = res_low_addr; + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 0; + reg.res0_b_str = res_b_stride; + fill_res0_stride(®, &p->res_low->stride); + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1822/tiu_lookup_table.c b/cvikernel/src/bm1822/tiu_lookup_table.c new file mode 100644 index 000000000..13fbc7cb2 --- /dev/null +++ b/cvikernel/src/bm1822/tiu_lookup_table.c @@ -0,0 +1,112 @@ +#include "kernel_1822.h" + +bmk1822_op_t * bmk1822_tiu_lookup_table( + ctx_t *ctx, + const bmk1822_tiu_lookup_table_param_t *p) +{ + uint32_t eu_num = ctx->chip_info.eu_num; + uint32_t npu_num = 
ctx->chip_info.npu_num; + + check_tiu_tensor_3(p->ofmap, p->ifmap, p->table); + assert_stride_type_0(ctx, p->ofmap); + assert_stride_type_0(ctx, p->ifmap); + assert_stride_type_0(ctx, p->table); + + uint8_t is_bf16 = (p->ofmap->fmt == FMT_BF16 && p->ifmap->fmt == FMT_BF16); + + ASSERT(p->table->shape.n == 1); + ASSERT(p->table->shape.c == npu_num); + + if (is_bf16) { + ASSERT(p->table->shape.h == 32); + ASSERT(p->table->shape.w == 8); + } + else { + ASSERT(p->table->shape.h == 16); + ASSERT(p->table->shape.w == 16); + } + + ASSERT(p->ifmap->start_address % eu_num == 0); + ASSERT(p->ofmap->start_address % eu_num == 0); + ASSERT(p->table->start_address % eu_num == 0); + + // fmt MUST be same under bf16 + if (p->ofmap->fmt == FMT_BF16) { + ASSERT(p->ifmap->fmt == FMT_BF16); + } + ASSERT(p->ofmap->fmt == FMT_I8 || p->ofmap->fmt == FMT_U8 || p->ofmap->fmt == FMT_BF16); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + //reg.tens_lookup = 1; + reg.tsk_opd_num = 2; + reg.opt_shift_typ = 0; + reg.opt_res_shift = 0; + reg.opt_relu_typ = 0; + reg.opd_typ = is_bf16; + + reg.res0_addr = p->ofmap->start_address; + if (is_bf16) { + reg.opt_res0_sign = 1; + reg.opt_res0_seg = 1; + } + else { + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 1; + } + + // ifmap->shape.n == p->ofmap->shape.n); + ASSERT(p->ifmap->shape.c == p->ofmap->shape.c); + ASSERT(p->ifmap->shape.h == p->ofmap->shape.h); + ASSERT(p->ifmap->shape.w == p->ofmap->shape.w); + + reg.res0_n = p->ifmap->shape.n; + reg.res0_c = p->ifmap->shape.c; + reg.res0_h = p->ifmap->shape.h; + reg.res0_w = p->ifmap->shape.w; + reg.short_res0_str = 0; + + reg.opd0_addr = p->ifmap->start_address; + if (is_bf16) { + reg.opt_opd0_sign = 1; + reg.opt_opd0_seg = 1; + } + else { + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 1; + } + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.short_opd0_str = 0; + + reg.opd1_addr = p->table->start_address; + if (is_bf16) { + reg.opt_opd1_sign = 1; + reg.opt_opd1_seg = 1; + } + else { + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 1; + } + reg.opd1_n = p->table->shape.n; + reg.opd1_c = p->table->shape.c; + reg.opd1_h = p->table->shape.h; + reg.opd1_w = p->table->shape.w; + reg.short_opd1_str = 0; + reg.tsk_eu_typ = 12; // 12 means lut + if (is_bf16) { + reg.opt_opd2_seg = 1; // hw check + // dont care once short_xxx_str set to 0 + } + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + //trace_tiu_reg(®, __FUNCTION__); + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1822/tiu_matrix_multiplication.c b/cvikernel/src/bm1822/tiu_matrix_multiplication.c new file mode 100644 index 000000000..6ce5dc0e4 --- /dev/null +++ b/cvikernel/src/bm1822/tiu_matrix_multiplication.c @@ -0,0 +1,151 @@ +#include "kernel_1822.h" + +typedef bmk1822_tiu_matrix_multiplication_param_t param_t; + +static void check_matrix(ctx_t *ctx, const ml_t *m) +{ + bmk1822_tensor_lmem_t t; + t.start_address = m->start_address; + t.fmt = m->fmt; + t.shape.n = m->shape.n; + t.shape.c = m->shape.c; + t.shape.h = 1; + t.shape.w = m->shape.w; + t.stride.n = m->stride.n; + t.stride.c = m->stride.c; + t.stride.h = m->stride.h; + t.stride.w = 1 * (m->fmt == FMT_BF16 ? 
2 : 1); + + check_tiu_tensor(&t); + assert_stride_type_0(ctx, &t); + + uint32_t eu_num = ctx->chip_info.eu_num; + ASSERT(m->start_address % eu_num == 0); +} + +static int is_arith_shift(const param_t *p) +{ + if (p->left->fmt == FMT_I8) + return 1; + if (p->right->fmt == FMT_I8) + return 1; + if (p->bias && p->bias->fmt == FMT_I8) + return 1; + + return 0; +} + +bmk1822_op_t * bmk1822_tiu_matrix_multiplication(ctx_t *ctx, const param_t *p) +{ + const bmk1822_matrix_lmem_t *res = p->res; + const bmk1822_matrix_lmem_t *left = p->left; + const bmk1822_matrix_lmem_t *right = p->right; + const bmk1822_matrix_lmem_t *bias = p->bias; + int bf16_enable = (res->fmt == FMT_BF16) ? 1 : 0; + + check_matrix(ctx, res); + check_matrix(ctx, left); + check_matrix(ctx, right); + if (bias) + check_matrix(ctx, bias); + + ASSERT(p->lshift_bits < 32); + if (bf16_enable) /* bf16 does not support add_result*/ + ASSERT(!p->add_result); + else + ASSERT(!(p->relu_enable && p->add_result)); + + if(p->ps32_mode & 0x2) + { + ASSERT(!p->relu_enable); + ASSERT(!p->bias); + ASSERT(!p->rshift_bits); + } + ASSERT(p->relu_enable == 0 || p->relu_enable == 1); + + uint32_t left_row = left->shape.n; + uint32_t left_col = left->shape.col; + uint32_t right_row = right->shape.n; + uint32_t right_col = right->shape.col; + uint32_t res_row = res->shape.n; + uint32_t res_col = res->shape.col; + ASSERT(left_col == right_row); + ASSERT(res_col == right_col); + + if(p->ps32_mode) + { + ASSERT(!p->add_result); + } else if ((p->add_result || !p->res_is_int8) && !bf16_enable) { + ASSERT(res_row == left_row * 2); + res_row = left_row; + } else { + ASSERT(res_row == left_row); + } + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_FC_FIX8B; + reg.tsk_opd_num = bias? 3: 2; + reg.opd_typ = bf16_enable ? 1 : 0; + reg.opt_shift_typ = is_arith_shift(p); + reg.opt_res_shift = p->rshift_bits; + reg.opt_left_shift = p->lshift_bits; + reg.opt_relu_typ = p->relu_enable; + reg.opt_res_add = p->add_result; + + reg.res0_addr = res->start_address; + reg.opt_res0_seg = (bf16_enable ? 
1 : p->res_is_int8); + + reg.opt_res0_sign = matrix_is_signed(res); + reg.res0_n = res_row; + reg.res0_c = res->shape.c; + reg.res0_h = 1; + reg.res0_w = res->shape.w; + reg.short_res0_str = 0; // stride, b_stride calculated by H/W + + reg.opd0_addr = left->start_address; + reg.opt_opd0_seg = 1; + reg.opt_opd0_sign = (left->fmt == FMT_I8); + reg.opd0_n = left_row; + reg.opd0_c = left->shape.c; + reg.opd0_h = 1; + reg.opd0_w = left->shape.w; + reg.short_opd0_str = 0; + + reg.opd1_addr = right->start_address; + reg.opt_opd1_seg = 1; + reg.opt_opd1_sign = (right->fmt == FMT_I8); + reg.opd1_n = right_row; + reg.opd1_c = right->shape.c; + reg.opd1_h = 1; + reg.opd1_w = left_col - left->shape.w * (left->shape.c - 1); + reg.short_opd1_str = 0; + + reg.ps32_md = p->ps32_mode; + if (p->ps32_mode > 0) + reg.res0_b_str = p->res->shape.n * p->res->stride.n; + if(reg.opd0_c == 1) + ASSERT(reg.opd0_w == reg.opd1_w); + + if (bias) { + ASSERT(bias->shape.n == 2); + ASSERT(bias->shape.c == right->shape.c); + ASSERT(bias->shape.w == right->shape.w); + ASSERT(bias->shape.col == right->shape.col); + + reg.opd2_addr = bias->start_address; + reg.opt_opd2_seg = 0; + reg.opt_opd2_sign = (bias->fmt == FMT_I8); + reg.opd2_n = 1; + reg.opd2_c = bias->shape.c; + reg.opd2_h = 1; + reg.opd2_w = bias->shape.w; + reg.short_opd2_str = 0; + } + + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1822/tiu_matrix_multiplication_qdm.c b/cvikernel/src/bm1822/tiu_matrix_multiplication_qdm.c new file mode 100644 index 000000000..713f95582 --- /dev/null +++ b/cvikernel/src/bm1822/tiu_matrix_multiplication_qdm.c @@ -0,0 +1,151 @@ +#include "kernel_1822.h" + +typedef bmk1822_tiu_matrix_multiplication_qdm_param_t param_t; + +static void check_matrix(ctx_t *ctx, const ml_t *m) +{ + bmk1822_tensor_lmem_t t; + t.start_address = m->start_address; + t.fmt = m->fmt; + t.shape.n = m->shape.n; + t.shape.c = m->shape.c; + t.shape.h = 1; + t.shape.w = m->shape.w; + t.stride.n = m->stride.n; + t.stride.c = m->stride.c; + t.stride.h = m->stride.h; + t.stride.w = 1; + + check_tiu_tensor(&t); + assert_stride_type_0(ctx, &t); + + uint32_t eu_num = ctx->chip_info.eu_num; + ASSERT(m->start_address % eu_num == 0); +} + +static int is_arith_shift(const param_t *p) +{ + if (p->left->fmt == FMT_I8) + return 1; + if (p->right->fmt == FMT_I8) + return 1; + if (p->bias && p->bias->fmt == FMT_I8) + return 1; + + return 0; +} + +bmk1822_op_t * bmk1822_tiu_matrix_multiplication_qdm(ctx_t *ctx, const param_t *p) +{ + const bmk1822_matrix_lmem_t *res = p->res; + const bmk1822_matrix_lmem_t *left = p->left; + const bmk1822_matrix_lmem_t *right = p->right; + const bmk1822_matrix_lmem_t *bias = p->bias; + + check_matrix(ctx, res); + check_matrix(ctx, left); + check_matrix(ctx, right); + if (bias) + check_matrix(ctx, bias); + + ASSERT(p->lshift_bits < 32); + ASSERT(!(p->relu_enable && p->add_result)); + if(p->ps32_mode & 0x2) + { + ASSERT(!p->relu_enable); + ASSERT(!p->bias); + ASSERT(!p->rshift_bits); + } + ASSERT(p->relu_enable == 0 || p->relu_enable == 1); + + uint32_t left_row = left->shape.n; + uint32_t left_col = left->shape.col; + uint32_t right_row = right->shape.n; + uint32_t right_col = right->shape.col; + uint32_t res_row = res->shape.n; + uint32_t res_col = res->shape.col; + ASSERT(left_col == right_row); + ASSERT(res_col == right_col); + ASSERT(p->res_is_int8 == 1); + + if(p->ps32_mode) + { + ASSERT(!p->add_result); + } + else if (p->add_result) { + ASSERT(res_row == left_row * 2); + res_row = left_row; + 
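/* With add_result the accumulator is 16-bit and occupies two 8-bit rows per logical row in local memory (hence the left_row * 2 check); the descriptor is programmed with the logical row count. */ +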
} else { + ASSERT(res_row == left_row); + } + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_FC_FIX8B; + reg.tsk_opd_num = bias? 3: 2; + reg.opt_shift_typ = is_arith_shift(p); + reg.opt_res_shift = p->rshift_bits; + reg.opt_left_shift = p->lshift_bits; + reg.opt_relu_typ = p->relu_enable; + reg.opt_res_add = p->add_result; + + reg.res0_addr = res->start_address; + reg.opt_res0_seg = 1; + reg.opt_res0_sign = matrix_is_signed(res); + reg.res0_n = res_row; + reg.res0_c = res->shape.c; + reg.res0_h = 1; + reg.res0_w = res->shape.w; + reg.short_res0_str = 0; // stride, b_stride calculated by H/W + + reg.opd0_addr = left->start_address; + reg.opt_opd0_seg = 1; + reg.opt_opd0_sign = (left->fmt == FMT_I8); + reg.opd0_n = left_row; + reg.opd0_c = left->shape.c; + reg.opd0_h = 1; + reg.opd0_w = left->shape.w; + reg.short_opd0_str = 0; + + reg.opd1_addr = right->start_address; + reg.opt_opd1_seg = 1; + reg.opt_opd1_sign = (right->fmt == FMT_I8); + reg.opd1_n = right_row; + reg.opd1_c = right->shape.c; + reg.opd1_h = 1; + reg.opd1_w = left_col - left->shape.w * (left->shape.c - 1); + reg.short_opd1_str = 0; + + reg.ps32_md = p->ps32_mode; + if (p->ps32_mode > 0) + reg.res0_b_str = p->res->shape.n * p->res->stride.n; + if(reg.opd0_c == 1) + ASSERT(reg.opd0_w == reg.opd1_w); + + // Only enable 32-bit multipler at the final post processing stage + reg.opt_chl_quan = ((p->ps32_mode == 0) || (p->ps32_mode == 1)) ? 1 : 0; + reg.quan_m = p->quan_m; + + // 32b bias, determined by b_stride + if (bias) { + ASSERT(bias->shape.n == 4); + ASSERT(bias->shape.c == right->shape.c); + ASSERT(bias->shape.w == right->shape.w); + ASSERT(bias->shape.col == right->shape.col); + + reg.opd2_addr = bias->start_address; + reg.opt_opd2_seg = 0; + reg.opt_opd2_sign = (bias->fmt == FMT_I8); + reg.opd2_n = 1; + reg.opd2_c = bias->shape.c; + reg.opd2_h = 1; + reg.opd2_w = bias->shape.w; + reg.short_opd2_str = 0; + } + + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1822/tiu_max_pooling.c b/cvikernel/src/bm1822/tiu_max_pooling.c new file mode 100644 index 000000000..31f996a25 --- /dev/null +++ b/cvikernel/src/bm1822/tiu_max_pooling.c @@ -0,0 +1,69 @@ +#include "kernel_1822.h" + +bmk1822_op_t * bmk1822_tiu_max_pooling( + ctx_t *ctx, + const bmk1822_tiu_max_pooling_param_t *p) +{ + int bf16_enable = (p->ifmap->fmt == FMT_BF16); + + check_tiu_tensor_2(p->ifmap, p->ofmap); + ASSERT(p->kh * p->kw >= 1); + ASSERT(p->ifmap->shape.n == p->ofmap->shape.n); + ASSERT(p->ifmap->shape.c == p->ofmap->shape.c); + ASSERT(p->stride_h < 32 && p->stride_h > 0 && "stride_h should be in [1, 31] range"); + ASSERT(p->stride_w < 32 && p->stride_w > 0 && "stride_w should be in [1, 31] range"); + if (bf16_enable) { + assert_bf16_stride_type_0(ctx, p->ifmap); + assert_bf16_stride_type_0(ctx, p->ofmap); + } else { + assert_stride_type_0(ctx, p->ifmap); + assert_stride_type_0(ctx, p->ofmap); + } + int opd0_sign = tensor_is_signed(p->ifmap); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B; + reg.tsk_eu_typ = 0; + reg.opt_relu_typ = 0; /* Hardware relu function not validated. */ + reg.opt_res_shift = 0; + reg.opt_shift_typ = opd0_sign; + reg.tsk_opd_num = 1; + reg.opd_typ = bf16_enable ? 
1: 0; + + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = opd0_sign; + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + if (bf16_enable) { + reg.opd0_ins_val = p->ins_fp; + } else { + //reg.opd0_ins_val = bf16_enable ? 0 : (uint32_t)p->ins_val; + reg.opd0_ins_val = (!p->ins_val && opd0_sign) ? -128 : p->ins_val; // backend not set yet + } + + reg.opt_opd1_seg = 1; + reg.opd1_h = p->kh; + reg.opd1_w = p->kw; + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1822/tiu_min_pooling.c b/cvikernel/src/bm1822/tiu_min_pooling.c new file mode 100644 index 000000000..7d54a9fce --- /dev/null +++ b/cvikernel/src/bm1822/tiu_min_pooling.c @@ -0,0 +1,120 @@ +#include "kernel_1822.h" + +bmk1822_op_t * bmk1822_tiu_min_pooling( + ctx_t *ctx, + const bmk1822_tiu_min_pooling_param_t *p) +{ + check_tiu_tensor_2(p->ifmap, p->ofmap); + ASSERT(p->kh * p->kw > 1); + ASSERT(p->ifmap->shape.n == p->ofmap->shape.n); + ASSERT(p->ifmap->shape.c == p->ofmap->shape.c); + assert_stride_type_0(ctx, p->ifmap); + assert_stride_type_0(ctx, p->ofmap); + + int opd0_sign = tensor_is_signed(p->ifmap); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B; + reg.tsk_eu_typ = 3; + reg.opt_relu_typ = 0; /* Hardware relu function not validated. */ + reg.opt_res_shift = 0; + reg.opt_shift_typ = opd0_sign; + reg.tsk_opd_num = 1; + + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = opd0_sign; + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + if (opd0_sign) + reg.opd0_ins_val = (uint16_t)127; + else + reg.opd0_ins_val = (uint16_t)255; + reg.opt_opd1_seg = 1; + reg.opd1_h = p->kh; + reg.opd1_w = p->kw; + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} + +bmk1822_op_t * bmk1822_tiu_bf16_min_pooling( + ctx_t *ctx, + const bmk1822_tiu_min_pooling_param_t *p) +{ + int bf16_enable = (p->ifmap->fmt == FMT_BF16) ? 
1 : 0; + + check_tiu_tensor_2(p->ifmap, p->ofmap); + ASSERT(p->kh * p->kw > 1); + ASSERT(p->ifmap->shape.n == p->ofmap->shape.n); + ASSERT(p->ifmap->shape.c == p->ofmap->shape.c); + if (bf16_enable) { + assert_bf16_stride_type_0(ctx, p->ifmap); + assert_bf16_stride_type_0(ctx, p->ofmap); + } else { + assert_stride_type_0(ctx, p->ifmap); + assert_stride_type_0(ctx, p->ofmap); + } + int opd0_sign = tensor_is_signed(p->ifmap); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B; + reg.tsk_eu_typ = 3; + reg.opt_relu_typ = 0; /* Hardware relu function not validated. */ + reg.opt_res_shift = 0; + reg.opt_shift_typ = opd0_sign; + reg.tsk_opd_num = 1; + reg.opd_typ = bf16_enable ? 1: 0; + + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = opd0_sign; + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + reg.opd0_ins_val = p->ins_fp; + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + + reg.opt_opd1_seg = 1; + reg.opd1_h = p->kh; + reg.opd1_w = p->kw; + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1880v2/bm_dmabuf.c b/cvikernel/src/bm1880v2/bm_dmabuf.c new file mode 100644 index 000000000..6cf7b003c --- /dev/null +++ b/cvikernel/src/bm1880v2/bm_dmabuf.c @@ -0,0 +1,410 @@ +#include +#include +#include +#include + +#include "kernel_1880v2.h" +#include +#include +#include +#include +#include +#include +#include + +#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask)) +#define ALIGN(x,a) __ALIGN_MASK(x,(__typeof__(x))(a)-1) + +#define BD_DESC_ALIGN_SIZE (1 << BDC_ENGINE_CMD_ALIGNED_BIT) +#define GDMA_DESC_ALIGN_SIZE (1 << TDMA_DESCRIPTOR_ALIGNED_BIT) +#define BD_EOD_PADDING_BYTES (128) +#define TPU_DMABUF_HEADER_M 0xB5B5 + +typedef struct { + cmd_hdr_t hdr; + uint32_t body[0]; +} DESC; + +// CPU_OP_SYNC structure +typedef struct { + uint32_t op_type; + uint32_t num_tiu; + uint32_t num_tdma; + uint32_t offset_tiu; + uint32_t offset_tdma; + uint32_t offset_tiu_ori_bk; + uint32_t offset_tdma_ori_bk; + char str[CPU_ENGINE_STR_LIMIT_BYTE]; +} __attribute__((packed)) cvi_cpu_desc_t; + +static DESC *traverse_start(uint8_t *cmdbuf) +{ + ASSERT(cmdbuf); + DESC *desc = (DESC *)cmdbuf; + ASSERT(desc->hdr.magic == CMDBUF_HDR_MAGIC_1880v2); + return desc; +} + +static DESC *traverse_next(DESC *desc, uint8_t *cmdbuf, uint32_t size) +{ + DESC *next_desc = (DESC *)((uint8_t *)desc + cmd_hdr_len(&desc->hdr) + sizeof(cmd_hdr_t)); + if ((uint8_t *)next_desc >= cmdbuf + size) + return NULL; + ASSERT(next_desc->hdr.magic == CMDBUF_HDR_MAGIC_1880v2); + return next_desc; +} + +static bool is_last_desc(DESC *desc, uint8_t *cmdbuf, uint32_t size) +{ + DESC *next_desc = traverse_next(desc, cmdbuf, size); + return next_desc ? 
false : true; +} + +static void reorder_bd_cmdbuf_reg(uint8_t *cmdbuf) +{ + int total_bits = BD_REG_BYTES * 8; + + for (int i = 0; i < total_bits; i += 128) + cmdbuf[(i + 128 - 8) / 8] |= (i / 128) << 4; + + uint8_t tmp[128 / 8]; + uint8_t *last = &cmdbuf[(total_bits - 128) / 8]; + memcpy(tmp, last, sizeof(tmp)); + memcpy(last, cmdbuf, sizeof(tmp)); + memcpy(cmdbuf, tmp, sizeof(tmp)); +} + +static void adjust_desc_tdma(uint32_t *body, bool eod) +{ + if (eod) { + body[0] |= (1 << TDMA_ACCPI0_EOD_BIT); + body[0] |= (1 << TDMA_ACCPI0_INTERRUPT_BIT); // interrupt + } + body[0] |= (1 << TDMA_ACCPI0_BARRIER_ENABLE_BIT); +} + +static void adjust_desc_bd(uint32_t *body, bool eod) +{ + if (eod) { + tiu_reg_t reg; + parse_tiu_reg(®, body); + reg.cmd_end = 1; + reg.cmd_intr_en = 1; + emit_tiu_reg(®, body); + } + reorder_bd_cmdbuf_reg((uint8_t *)body); +} + + +static uint32_t desc_sync_id(DESC *desc) +{ + switch (desc->hdr.engine_id) { + case BMK1880v2_TIU: { + tiu_reg_t reg; + parse_tiu_reg(®, desc->body); + return reg.cmd_id_tpu; + } + case BMK1880v2_TDMA: { + tdma_reg_t reg; + parse_tdma_reg(®, desc->body); + return reg.cmd_id; + } + default: + ASSERT(0); + return 1; + } +} + +static void fill_header_and_arm(uint8_t *cmdbuf, uint32_t sz, uint8_t *dmabuf, uint64_t *tiu_offset, uint64_t *tdma_offset) +{ + dma_hdr_t header = {0}; + header.dmabuf_magic_m = TPU_DMABUF_HEADER_M; + header.dmabuf_magic_s = 0x1835; + + cvi_cpu_desc_t *segments = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + DESC *desc = NULL; + uint32_t desc_nums[BMK1880v2_ENGINE_NUM] = {0}; + uint32_t counters[BMK1880v2_ENGINE_NUM] = {0}; + uint32_t desc_size[BMK1880v2_ENGINE_NUM] = {0}; + + ASSERT(segments); + // fill arm descs + desc = traverse_start(cmdbuf); + + while (desc != NULL) { + uint32_t engine_id = (uint32_t)desc->hdr.engine_id; + counters[engine_id]++; + desc_nums[engine_id]++; + if (engine_id != BMK1880v2_CPU) { + // a new arm desc inserted to do sync operation + if (desc_sync_id(desc) == 0xFFFF || is_last_desc(desc, cmdbuf, sz)) { + desc_nums[BMK1880v2_CPU]++; + cvi_cpu_desc_t *arm = segments + desc_nums[BMK1880v2_CPU] - 1; + memset(arm, 0, sizeof(cvi_cpu_desc_t)); + arm->op_type = CPU_OP_SYNC; + arm->num_tiu = counters[BMK1880v2_TIU]; + arm->num_tdma = counters[BMK1880v2_TDMA]; + strncpy(arm->str, "layer_end", sizeof(arm->str) - 1); + if (counters[BMK1880v2_TIU] != 0) { + desc_size[BMK1880v2_TIU] = + ALIGN(desc_size[BMK1880v2_TIU] + counters[BMK1880v2_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES, + BD_DESC_ALIGN_SIZE); + } + counters[BMK1880v2_TIU] = 0; + counters[BMK1880v2_TDMA] = 0; + } + } else { + cvi_cpu_desc_t *arm = segments + desc_nums[BMK1880v2_CPU] - 1; + memcpy(arm, &(desc->body), sizeof(cvi_cpu_desc_t)); + arm->num_tiu = counters[BMK1880v2_TIU]; + arm->num_tdma = counters[BMK1880v2_TDMA]; + if (counters[BMK1880v2_TIU] != 0) { + desc_size[BMK1880v2_TIU] = + ALIGN(desc_size[BMK1880v2_TIU] + counters[BMK1880v2_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES, + BD_DESC_ALIGN_SIZE); + } + counters[BMK1880v2_TIU] = 0; + counters[BMK1880v2_TDMA] = 0; + } + desc = traverse_next(desc, cmdbuf, sz); + } + desc_size[BMK1880v2_CPU] = desc_nums[BMK1880v2_CPU] * CPU_ENGINE_BYTES; + desc_size[BMK1880v2_TDMA] = desc_nums[BMK1880v2_TDMA] * GDMA_DESC_ALIGN_SIZE; + + (*tiu_offset) = ALIGN(sizeof(header) + desc_size[BMK1880v2_CPU], BD_DESC_ALIGN_SIZE); + (*tdma_offset) = ALIGN((*tiu_offset) + desc_size[BMK1880v2_TIU], GDMA_DESC_ALIGN_SIZE); + + // dma hdr + arm descs + bd descs + tdma descs + header.dmabuf_size = (*tdma_offset) + 
desc_size[BMK1880v2_TDMA]; + header.cpu_desc_count = desc_nums[BMK1880v2_CPU]; + header.bd_desc_count = desc_nums[BMK1880v2_TIU]; + header.tdma_desc_count = desc_nums[BMK1880v2_TDMA]; + + //printf("header.dmabuf_size = %d\n", header.dmabuf_size); + printf("header.cpu_desc_count = %d\n", header.cpu_desc_count); + printf("header.bd_desc_count = %d\n", header.bd_desc_count); + printf("header.tdma_desc_count = %d\n", header.tdma_desc_count); + memcpy(dmabuf, &header, sizeof(header)); +} + +static void fill_bd_and_tdma(uint8_t *cmdbuf, uint32_t sz, uint8_t *dmabuf, uint64_t tiu_offset, uint64_t tdma_offset) +{ + dma_hdr_t *p_header = (dma_hdr_t *)dmabuf; + cvi_cpu_desc_t *segments = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + DESC *desc = traverse_start(cmdbuf); + //uint64_t address_max = 0x100000000; + + for (uint32_t i = 0; i < p_header->cpu_desc_count; i++) { + + cvi_cpu_desc_t *arm = segments + i; + + uint32_t tiu_num = arm->num_tiu & 0xFFFF; + uint32_t tdma_num = arm->num_tdma & 0xFFFF; + + if (tiu_num) { + tiu_offset = ALIGN(tiu_offset, 1 << BDC_ENGINE_CMD_ALIGNED_BIT); + arm->offset_tiu = tiu_offset; + //printf("arm->offset_tiu = 0x%x \n", arm->offset_tiu); + } + + if (tdma_num) { + tdma_offset = ALIGN(tdma_offset, 1 << TDMA_DESCRIPTOR_ALIGNED_BIT); + arm->offset_tdma = tdma_offset; + //printf("arm->offset_tdma = 0x%x \n", arm->offset_tdma); + } + + while (tiu_num || tdma_num) { + uint32_t engine_id = (uint32_t)desc->hdr.engine_id; + void *p_body = NULL; + + switch (engine_id) { + case BMK1880v2_TIU: + tiu_num--; + p_body = (void *)(dmabuf + tiu_offset); + tiu_offset += BD_REG_BYTES; + memcpy(p_body, desc->body, desc->hdr.len); + adjust_desc_bd((uint32_t *)p_body, tiu_num == 0); + break; + case BMK1880v2_TDMA: + tdma_num--; + tdma_offset = ALIGN(tdma_offset, GDMA_DESC_ALIGN_SIZE); + p_body = (void *)(dmabuf + tdma_offset); + tdma_offset += GDMA_DESC_ALIGN_SIZE; + memcpy(p_body, desc->body, desc->hdr.len); + +#if 0 //debug feature, for checking if neuron overshoot +{ + tdma_reg_t reg_tdma = {0}; + uint64_t tdma_address = 0, tdma_address2 = 0; + + parse_tdma_reg(®_tdma, p_body); + + if (reg_tdma.src_base_reg_sel == 0) { + // reg.trans_dir = 2; // 0:tg2l, 1:l2tg, 2:g2g, 3:l2l + if (reg_tdma.trans_dir == 0) { + printf ("src_base_addr_high=%x, src_base_addr_low=%x\n", reg_tdma.src_base_addr_high, reg_tdma.src_base_addr_low); + tdma_address = ((uint64_t)reg_tdma.src_base_addr_high) << 32 | (uint64_t)reg_tdma.src_base_addr_low; + } else if (reg_tdma.trans_dir == 1) { + printf ("dst_base_addr_high=%x, dst_base_addr_low=%x\n", reg_tdma.dst_base_addr_high, reg_tdma.dst_base_addr_low); + tdma_address = ((uint64_t)reg_tdma.dst_base_addr_high) << 32 | (uint64_t)reg_tdma.dst_base_addr_low; + } else if (reg_tdma.trans_dir == 2) { + printf ("dst_base_addr_high=%x, dst_base_addr_low=%x\n", reg_tdma.dst_base_addr_high, reg_tdma.dst_base_addr_low); + tdma_address = ((uint64_t)reg_tdma.src_base_addr_high) << 32 | (uint64_t)reg_tdma.src_base_addr_low; + tdma_address2 = ((uint64_t)reg_tdma.dst_base_addr_high) << 32 | (uint64_t)reg_tdma.dst_base_addr_low; + + if (tdma_address2 > tdma_address) { + tdma_address = tdma_address2; + } + } + + if (tdma_address > address_max) { + address_max = tdma_address; + printf("address_max=%llx\n", address_max); + } + } +} +#endif + adjust_desc_tdma((uint32_t *)p_body, tdma_num == 0); + break; + default: + break; + } + desc = traverse_next(desc, cmdbuf, sz); + } + + // padding zero after eod to workaroud hardware bug + if (arm->num_tiu & 0xFFFF) { + void *buf = (void 
*)(dmabuf + tiu_offset); + memset(buf, 0, BD_EOD_PADDING_BYTES); + tiu_offset += BD_EOD_PADDING_BYTES; + } + } +} + +void bmk1880v2_dmabuf_convert(uint8_t *cmdbuf, uint32_t sz, uint8_t *dmabuf) +{ + uint64_t tiu_offset = 0; + uint64_t tdma_offset = 0; + fill_header_and_arm(cmdbuf, sz, dmabuf, &tiu_offset, &tdma_offset); + fill_bd_and_tdma(cmdbuf, sz, dmabuf, tiu_offset, tdma_offset); +} + +#define PER_DES_SIZE 16 +#define PADDING_SIZE (1024 * 1024) +void bmk1880v2_dmabuf_size(uint8_t *cmdbuf, uint32_t sz, uint32_t *psize, uint32_t *pmu_size) +{ + uint32_t tdma_desc_num = {0}; + uint32_t counters[BMK1880v2_ENGINE_NUM] = {0}; + uint32_t bd_size = 0; + uint32_t dmabuf_size = 0; + + uint32_t tiu_cnt = 0; + uint32_t tdma_cnt = 0; + + // calculate desc numbers + DESC *desc = traverse_start(cmdbuf); + + while (desc != NULL) { + uint32_t engine_id = (uint32_t)desc->hdr.engine_id; + counters[engine_id]++; + if (engine_id != BMK1880v2_CPU) { + // a new arm desc inserted to do sync operation + if (desc_sync_id(desc) == 0xFFFF || is_last_desc(desc, cmdbuf, sz)) { + counters[BMK1880v2_CPU]++; + tdma_desc_num += counters[BMK1880v2_TDMA]; + if (counters[BMK1880v2_TIU] != 0) { + bd_size = ALIGN(bd_size + counters[BMK1880v2_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES, + BD_DESC_ALIGN_SIZE); + } + tiu_cnt += counters[BMK1880v2_TIU] & 0xFFFF; + tdma_cnt += counters[BMK1880v2_TDMA] & 0xFFFF; + counters[BMK1880v2_TIU] = 0; + counters[BMK1880v2_TDMA] = 0; + } + } else { + tdma_desc_num += counters[BMK1880v2_TDMA]; + if (counters[BMK1880v2_TIU] != 0) { + bd_size = ALIGN(bd_size + counters[BMK1880v2_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES, + BD_DESC_ALIGN_SIZE); + } + tiu_cnt += counters[BMK1880v2_TIU] & 0xFFFF; + tdma_cnt += counters[BMK1880v2_TDMA] & 0xFFFF; + counters[BMK1880v2_TIU] = 0; + counters[BMK1880v2_TDMA] = 0; + } + desc = traverse_next(desc, cmdbuf, sz); + } + // dma hdr + arm descs + bd descs + tdma descs + dmabuf_size = sizeof(dma_hdr_t) + counters[BMK1880v2_CPU] * CPU_ENGINE_BYTES; + dmabuf_size = ALIGN(dmabuf_size, BD_DESC_ALIGN_SIZE) + bd_size; + dmabuf_size = ALIGN(dmabuf_size, GDMA_DESC_ALIGN_SIZE) + tdma_desc_num * GDMA_DESC_ALIGN_SIZE; + + *pmu_size = ALIGN((tiu_cnt + tdma_cnt) * PER_DES_SIZE + PADDING_SIZE, 0x1000); + *psize = dmabuf_size; +} + +void bmk1880v2_arraybase_set(uint8_t *dmabuf, uint32_t arraybase0L, uint32_t arraybase1L, uint32_t arraybase0H, uint32_t arraybase1H) +{ + ASSERT(dmabuf); + dma_hdr_t *header = (dma_hdr_t *)dmabuf; + + ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M); + header->arraybase_0_L = arraybase0L; + header->arraybase_1_L = arraybase1L; + header->arraybase_0_H = arraybase0H; + header->arraybase_1_H = arraybase1H; +} + +void bmk1880v2_dmabuf_relocate(uint8_t *dmabuf, uint64_t dmabuf_devaddr, uint32_t original_size, uint32_t pmubuf_size) +{ + dma_hdr_t *header = (dma_hdr_t *)dmabuf; + uint64_t tmpAddress = 0; + + ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M); + cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + + for (uint32_t i = 0; i < header->cpu_desc_count; i++, desc++) { + uint32_t tiu_num = desc->num_tiu & 0xFFFF; + uint32_t tdma_num = desc->num_tdma & 0xFFFF; + + if (tiu_num) { + tmpAddress = dmabuf_devaddr + desc->offset_tiu; + //printf("bd tmpAddress = 0x%lu\n", tmpAddress); + desc->offset_tiu_ori_bk = desc->offset_tiu; + desc->offset_tiu = tmpAddress >> BDC_ENGINE_CMD_ALIGNED_BIT; + } + + if (tdma_num) { + tmpAddress = dmabuf_devaddr + desc->offset_tdma; + //printf("tdma tmpAddress = 0x%lu\n", tmpAddress); + 
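/* Keep the buffer-relative offset in offset_tdma_ori_bk, then store the absolute device address right-shifted by TDMA_DESCRIPTOR_ALIGNED_BIT, mirroring the TIU relocation above. */ +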
desc->offset_tdma_ori_bk = desc->offset_tdma; + desc->offset_tdma = tmpAddress >> TDMA_DESCRIPTOR_ALIGNED_BIT; + } + + //set pmubuf_addr_p to enable pmu kick + header->pmubuf_size = pmubuf_size; + header->pmubuf_offset = original_size; + } +} + +void bmk1880v2_dmabuf_dump(uint8_t *dmabuf) +{ + ASSERT(dmabuf); + dma_hdr_t *header = (dma_hdr_t *)dmabuf; + // printf("bmk1880v2_dmabuf_dump header->arraybase_0_L = 0x%x\n", header->arraybase_0_L); + // printf("bmk1880v2_dmabuf_dump header->arraybase_1_L = 0x%x\n", header->arraybase_1_L); + // printf("bmk1880v2_dmabuf_dump header->arraybase_0_H = 0x%x\n", header->arraybase_0_H); + // printf("bmk1880v2_dmabuf_dump header->arraybase_1_H = 0x%x\n", header->arraybase_1_H); + // printf("bmk1880v2_dmabuf_dump header->pmubuf_offset = 0x%x\n", header->pmubuf_offset); + + ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M); + cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + + for (uint32_t i = 0; i < header->cpu_desc_count; i++, desc++) { + int tiu_num = desc->num_tiu & 0xFFFF; + int tdma_num = desc->num_tdma & 0xFFFF; + uint32_t tiu_offset = desc->offset_tiu; + uint32_t tdma_offset = desc->offset_tdma; + printf("bmk1880v2_dmabuf_dump num, offset<0x%08x, 0x%08x>\n", tiu_num, tdma_num, tiu_offset, tdma_offset); + } +} diff --git a/cvikernel/src/bm1880v2/bm_kernel.c b/cvikernel/src/bm1880v2/bm_kernel.c new file mode 100644 index 000000000..0e5f03f13 --- /dev/null +++ b/cvikernel/src/bm1880v2/bm_kernel.c @@ -0,0 +1,594 @@ +#include "kernel_1880v2.h" +#include + +static void replace_cmd_id(uint32_t *desc, uint32_t eng_id, uint16_t ids[]) +{ + if (eng_id == BMK1880v2_TIU) { + tiu_reg_t reg; + parse_tiu_reg(®, desc); + reg.cmd_id_en = 1; + reg.cmd_id_tpu = ids[eng_id]; + reg.cmd_id_gdma = ids[BMK1880v2_TDMA]; + emit_tiu_reg(®, desc); + + // printf(" %s: TIU eng_id %d, [wait_tdma_id=%d|tiu_id=%d] dst shape(%d, %d, %d, %d)\n", + // __FUNCTION__, eng_id, reg.cmd_id_gdma, reg.cmd_id_tpu, + // reg.res0_n, reg.res0_c, reg.res0_h, reg.res0_w); + + } else if (eng_id == BMK1880v2_TDMA) { + tdma_reg_t tdma_reg; + parse_tdma_reg(&tdma_reg, desc); + tdma_reg.cmd_id = ids[eng_id]; + tdma_reg.wait_id_tpu = ids[BMK1880v2_TIU]; + tdma_reg.bar_en = 1; + + // printf(" %s: TDMA eng_id %d, [tdma_id=%d|wait_tiu_id=%d], dst shape(%d, %d, %d, %d)\n", + // __FUNCTION__, eng_id, tdma_reg.cmd_id, tdma_reg.wait_id_tpu, + // tdma_reg.src_n, tdma_reg.dst_c, tdma_reg.dst_h, tdma_reg.dst_w); + + emit_tdma_reg(&tdma_reg, desc); + } +} + +static int bm1880v2_get_engine_desc_length(uint32_t engine_id) +{ + switch (engine_id) { + case BMK1880v2_TIU: + return TIU_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t); + case BMK1880v2_TDMA: + return TDMA_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t); + case BMK1880v2_CPU: + return CPU_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t); + default: + ASSERT(0); + } +} + +// Estimate the number of command descriptor based on buffer size provided +// by the user. +uint32_t bmk1880v2_estimate_nr_desc(ctx_t *k) +{ + uint32_t tiu_desc_len = bm1880v2_get_engine_desc_length(BMK1880v2_TIU); + uint32_t tdma_desc_len = bm1880v2_get_engine_desc_length(BMK1880v2_TDMA); + uint32_t hdr_len = sizeof(cmd_hdr_t); + + uint32_t desc_len = + (tiu_desc_len > tdma_desc_len) ? 
tiu_desc_len : tdma_desc_len; + + return k->info.cmdbuf_size / (desc_len + hdr_len); +} + +static void kernel_init(ctx_t *k, bmk_info_t *info) +{ + k->info = *info; + //1880v2->18802 + ASSERT(info->chip_version == BM1880V2_VER); + k->chip_info = bmk1880v2_chip_info(); + + uint32_t max_nr_desc = bmk1880v2_estimate_nr_desc(k); + ec_init(&k->ec, BMK1880v2_ENGINE_NUM, max_nr_desc); + mode_manager_init(&k->mode_manager, &k->ec, BMK1880v2_ENGINE_NUM); + + k->cmdbuf_ptr = 0; + k->max_nr_desc = max_nr_desc; + k->cur_nr_desc = 0; + k->desc_pairs = xmalloc(max_nr_desc * sizeof(k->desc_pairs[0])); + + k->lmem_ptr = 0; +} + +static void kernel_destroy(ctx_t *k) +{ + free(k->desc_pairs); + ec_destroy(&k->ec); + mode_manager_destroy(&k->mode_manager); +} + +static void kernel_reset(ctx_t *k) +{ + k->cur_nr_desc = 0; + k->cmdbuf_ptr = 0; + + ec_reset(&k->ec); + mode_manager_reset(&k->mode_manager); +} + +static cmd_hdr_t * kernel_alloc_cmd_hdr( + ctx_t *k, uint8_t eng_id, uint32_t desc_len) +{ + uint32_t free_len = k->info.cmdbuf_size - k->cmdbuf_ptr; + uint32_t hdr_len = sizeof(cmd_hdr_t); + uint32_t total_len = hdr_len + desc_len; + ASSERT(total_len <= free_len); + + cmd_hdr_t *hdr = (cmd_hdr_t *)&k->info.cmdbuf[k->cmdbuf_ptr]; + hdr->magic = CMDBUF_HDR_MAGIC_1880v2; + hdr->len = desc_len; + hdr->engine_id = eng_id; + hdr->__deprecated = 0; // for valgrind + hdr->flags = 0; + hdr->mask = 0; + + k->cmdbuf_ptr += total_len; + return hdr; +} + +static desc_pair_t * kernel_alloc_desc_pair(ctx_t *k, uint8_t eng_id) +{ + ASSERT(eng_id < BMK1880v2_ENGINE_NUM); + ASSERT(k->cur_nr_desc < k->max_nr_desc); + + uint32_t desc_len = bm1880v2_get_engine_desc_length(eng_id); + desc_pair_t *dp = &k->desc_pairs[k->cur_nr_desc++]; + dp->cmd_hdr = kernel_alloc_cmd_hdr(k, eng_id, desc_len); + dp->ec_desc = ec_alloc_desc(&k->ec, eng_id); + + mode_manager_record_ec_desc(&k->mode_manager, dp->ec_desc); + return dp; +} + +static void kernel_update_sync_id(ctx_t *k) +{ + ec_compute_sync_ids(&k->ec); + + for (uint32_t di = 0; di < k->cur_nr_desc; di++) { + desc_pair_t *dp = &k->desc_pairs[di]; + uint8_t eng_id = dp->ec_desc->engine_id; + uint32_t *desc = (uint32_t *)dp->cmd_hdr->cmd; + replace_cmd_id(desc, eng_id, dp->ec_desc->sync_ids); + } +} + +void bmk1880v2_add_dependency( + ctx_t *ctx, + bmk1880v2_op_t *before, + bmk1880v2_op_t *after) +{ + ec_add_dependency(&ctx->ec, before, after); +} + +desc_pair_t * bm1880v2_get_desc_pair(ctx_t *k, uint8_t eng_id) +{ + if (eng_id == BMK1880v2_CPU) { + kernel_update_sync_id(k); + k->cur_nr_desc = 0; + + ec_reset(&k->ec); + mode_manager_restart_sync_id(&k->mode_manager); + } + + return kernel_alloc_desc_pair(k, eng_id); +} + +ctx_t * bmk1880v2_register(bmk_info_t *info) +{ + ASSERT(info); + ASSERT(info->cmdbuf); + ASSERT(info->cmdbuf_size > 0); + ctx_t *k = xmalloc(sizeof(*k)); + kernel_init(k, info); + return k; +} + +void bmk1880v2_cleanup(ctx_t *ctx) +{ + ASSERT(ctx); + + ctx_t *k = (typeof(k))ctx; + + kernel_destroy(k); + free(k); +} + +void bmk1880v2_reset(ctx_t *ctx) +{ + ctx_t *k = (typeof(k))ctx; + kernel_reset(k); +} + +uint8_t *bmk1880v2_acquire_cmdbuf(ctx_t *ctx, uint32_t *size) +{ + ctx_t *k = (typeof(k))ctx; + + *size = k->cmdbuf_ptr; + kernel_update_sync_id(k); + return k->info.cmdbuf; +} + +void bmk1880v2_parallel_enable(ctx_t *ctx) +{ + ctx_t *k = (typeof(k))ctx; + mode_manager_enable_parallel(&k->mode_manager); +} + +void bmk1880v2_set_op(ctx_t *ctx, void* op) +{ + ctx_t *k = (typeof(k))ctx; + k->op = op; +} + +void* bmk1880v2_get_op(ctx_t *ctx) +{ + ctx_t *k = 
(typeof(k))ctx; + return k->op; +} + +void bmk1880v2_parallel_disable(ctx_t *ctx) +{ + ctx_t *k = (typeof(k))ctx; + mode_manager_disable_parallel(&k->mode_manager); +} + +void bmk1880v2_create_streams(ctx_t *ctx, int nr_streams) +{ + ctx_t *k = (typeof(k))ctx; + mode_manager_create_streams(&k->mode_manager, nr_streams); +} + +void bmk1880v2_set_layer_id(ctx_t *ctx, uint16_t layer_id) +{ + ctx_t *k = (typeof(k))ctx; + k->layer_id = layer_id; +} + +uint16_t bmk1880v2_layer_id(ctx_t *ctx) +{ + ctx_t *k = (typeof(k))ctx; + return k->layer_id; +} + +void bmk1880v2_destroy_streams(ctx_t *ctx) +{ + ctx_t *k = (typeof(k))ctx; + mode_manager_destroy_streams(&k->mode_manager); +} + +void bmk1880v2_set_stream(ctx_t *ctx, int i) +{ + ctx_t *k = (typeof(k))ctx; + mode_manager_set_stream(&k->mode_manager, i); +} + +static bmk1880v2_chip_info_t bm1880v2_chip_info = { + .version = BM1880V2_VER, + .npu_num = BM1880V2_HW_NPU_NUM, + .eu_num = BM1880V2_HW_EU_NUM, + .lmem_size = BM1880V2_HW_LMEM_SIZE, + .lmem_banks = BM1880V2_HW_LMEM_BANKS, + .lmem_bank_size = BM1880V2_HW_LMEM_BANK_SIZE, + .gmem_start = BM1880V2_GLOBAL_MEM_START_ADDR, + .gmem_size = BM1880V2_GLOBAL_MEM_SIZE, +}; + +bmk1880v2_chip_info_t bmk1880v2_chip_info(void) +{ + return bm1880v2_chip_info; +} + +bmk1880v2_tensor_lmem_t * bmk1880v2_lmem_alloc_tensor( + ctx_t *ctx, + bmk1880v2_tensor_lmem_shape_t s, + fmt_t fmt, int eu_align) +{ + ctx_t *k = (typeof(k))ctx; + uint32_t lmem_size = k->chip_info.lmem_size; + uint32_t eu_num = k->chip_info.eu_num; + + bmk1880v2_tensor_lmem_t *t = xmalloc(sizeof(*t)); + memset(t, 0, sizeof(*t)); + t->start_address = k->lmem_ptr; + t->fmt = fmt; + t->cmprs_fmt = fmt; + t->shape = s; + t->eu_align = eu_align; + t->stride = bmk1880v2_tensor_lmem_default_stride(ctx, s, fmt, eu_align); + + uint32_t needed = align_up(t->shape.n * t->stride.n, eu_num); + if ((lmem_size - k->lmem_ptr < needed) || !needed) { + free(t); + return NULL; + } + + k->lmem_ptr += needed; + return t; +} + +void bmk1880v2_lmem_init_tensor( + ctx_t *ctx, + bmk1880v2_tensor_lmem_t *tl, + bmk1880v2_tensor_lmem_shape_t shape, + fmt_t fmt, + int eu_align) +{ + memset(tl, 0, sizeof(*tl)); + tl->fmt = fmt; + tl->shape = shape; + tl->stride = bmk1880v2_tensor_lmem_default_stride(ctx, shape, fmt, eu_align); + tl->eu_align = eu_align; +} + +// Provide the unified api for tensor size calculation. +// Must have the same logic as bmk1880v2_lmem_bf16_alloc_tensor. +// The backed does not need to duplicate the related code. +uint32_t bmk1880v2_lmem_tensor_to_size( + ctx_t *ctx, + bmk1880v2_tensor_lmem_shape_t s, + fmt_t fmt, int eu_align) +{ + ctx_t *k = (typeof(k))ctx; + uint32_t eu_num = k->chip_info.eu_num; + + bmk1880v2_tensor_lmem_stride_t stride; + stride = bmk1880v2_tensor_lmem_default_stride(ctx, s, fmt, eu_align); + + uint32_t needed = align_up(s.n * stride.n, eu_num); + + return needed; +} + +bmk1880v2_tensor_lmem_t * bmk1880v2_lmem_alloc_ps32_tensor( + bmk1880v2_context_t *ctx, + bmk1880v2_tensor_lmem_shape_t s, + fmt_t fmt, + int eu_align) +{ + /* Partial sum is located in lmem in 32-bit format, so we times n to 2 to + * spare a sapce for it. 
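+ * (In the code below, n is scaled by bitsize_of_fmt(FMT_I32) / bitsize_of_fmt(fmt), i.e. 4x for 8-bit data and 2x for bf16, so the 32-bit partial sums fit.)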
+ */ + + uint32_t prev_n; + + prev_n = s.n; + s.n = s.n * (bitsize_of_fmt(FMT_I32) / bitsize_of_fmt(fmt)); + bmk1880v2_tensor_lmem_t *res = bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, eu_align); + if(res == NULL) + ASSERT(0); + res->shape.n = prev_n; + return res; +} + +void bmk1880v2_lmem_free_tensor( + ctx_t *ctx, const bmk1880v2_tensor_lmem_t *t) +{ + ASSERT(t->start_address < ctx->lmem_ptr); + ctx->lmem_ptr = t->start_address; + + free((void *)t); +} + +bmk1880v2_matrix_lmem_t * bmk1880v2_lmem_alloc_matrix( + ctx_t *ctx, + bmk1880v2_matrix_lmem_shape_t s, + fmt_t fmt, + int eu_align) +{ + uint32_t lmem_size = ctx->chip_info.lmem_size; + uint32_t npu_num = ctx->chip_info.npu_num; + uint32_t eu_num = ctx->chip_info.eu_num; + uint32_t val = (fmt == FMT_BF16) ? 2 : 1; + + bmk1880v2_matrix_lmem_t *t = xmalloc(sizeof(*t)); + memset(t, 0, sizeof(*t)); + t->start_address = ctx->lmem_ptr; + t->fmt = fmt; + t->shape = s; + t->stride.h = s.w * val; + if (eu_align) + t->stride.c = align_up(s.w * val, eu_num); + else + t->stride.c = s.w * val; + t->stride.n = t->stride.c * ceiling_func(s.c, npu_num); + t->eu_align = eu_align; + + uint32_t needed = align_up(t->shape.n * t->stride.n, eu_num); + if (lmem_size - ctx->lmem_ptr < needed) { + free(t); + return NULL; + } + ctx->lmem_ptr += needed; + return t; +} + +void bmk1880v2_lmem_init_matrix( + ctx_t *ctx, + bmk1880v2_matrix_lmem_t *ml, + bmk1880v2_matrix_lmem_shape_t shape, + fmt_t fmt, + int eu_align) +{ + memset(ml, 0, sizeof(*ml)); + ml->fmt = fmt; + ml->shape = shape; + ml->stride = bmk1880v2_matrix_lmem_default_stride(ctx, shape, fmt, eu_align); + ml->eu_align = eu_align; +} + +// Provide the unified api for matrix size calculation. +// Must have the same logic as bmk1880v2_lmem_alloc_matrix. +// The backed does not need to duplicate the related code. +uint32_t bmk1880v2_lmem_matrix_to_size( + ctx_t *ctx, + bmk1880v2_matrix_lmem_shape_t s, + fmt_t fmt, + int eu_align) { + uint32_t npu_num = ctx->chip_info.npu_num; + uint32_t eu_num = ctx->chip_info.eu_num; + uint32_t val = (fmt == FMT_BF16) ? 2 : 1; + + bmk1880v2_matrix_lmem_t t; + t.fmt = fmt; + t.shape = s; + t.stride.h = s.w * val; + if (eu_align) + t.stride.c = align_up(s.w * val, eu_num); + else + t.stride.c = s.w * val; + t.stride.n = t.stride.c * ceiling_func(s.c, npu_num); + + uint32_t needed = align_up(t.shape.n * t.stride.n, eu_num); + + return needed; +} + +bmk1880v2_matrix_lmem_t * bmk1880v2_lmem_alloc_ps32_matrix( + bmk1880v2_context_t *ctx, + bmk1880v2_matrix_lmem_shape_t s, + fmt_t fmt, + int eu_align) +{ + /* Partial sum is located in lmem in 32-bit format, so we times n to 4 to + * spare a sapce for it. + */ + + uint32_t prev_n; + + prev_n = s.n; + s.n = s.n * (bitsize_of_fmt(FMT_I32) / bitsize_of_fmt(fmt)); + bmk1880v2_matrix_lmem_t *res = bmk1880v2_lmem_alloc_matrix(ctx, s, fmt, eu_align); + if(res == NULL) + ASSERT(0); + res->shape.n = prev_n; + return res; +} + +// Provide the unified api for matrix size calculation. +// Must have the same logic as bmk1880v2_lmem_alloc_ps32_bf16_matrix. +// The backed does not need to duplicate the related code. +uint32_t bmk1880v2_lmem_ps32_matrix_to_size( + bmk1880v2_context_t *ctx, + bmk1880v2_matrix_lmem_shape_t s, + fmt_t fmt, + int eu_align) +{ + /* Partial sum is located in lmem in 32-bit format, so we times n to 4 to + * spare a sapce for it. 
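+ * (Same scaling as the ps32 allocators above: n is multiplied by bitsize_of_fmt(FMT_I32) / bitsize_of_fmt(fmt) before the plain size calculation.)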
+ */ + + s.n = s.n * (bitsize_of_fmt(FMT_I32) / bitsize_of_fmt(fmt)); + + return bmk1880v2_lmem_matrix_to_size(ctx, s, fmt, eu_align); +} + +void bmk1880v2_lmem_free_matrix( + ctx_t *ctx, const bmk1880v2_matrix_lmem_t *t) +{ + ASSERT(t->start_address < ctx->lmem_ptr); + ctx->lmem_ptr = t->start_address; + free((void *)t); +} + +bmk1880v2_tensor_lmem_stride_t bmk1880v2_tensor_lmem_default_stride( + ctx_t *ctx, + bmk1880v2_tensor_lmem_shape_t s, + fmt_t fmt_type, + int eu_align) +{ + bmk1880v2_tensor_lmem_stride_t stride; + uint32_t eu_num = ctx->chip_info.eu_num; + uint32_t npu_num = ctx->chip_info.npu_num; + uint32_t fmt = (fmt_type == FMT_BF16) ? 2 : 1; + stride.w = fmt; + stride.h = s.w * fmt; + if (eu_align) + stride.c = align_up(s.h * s.w * fmt, eu_num); + else + stride.c = s.h * s.w * fmt; + + stride.n = stride.c * ceiling_func(s.c, npu_num); +// printf("bmk1880v2_tensor_lmem_default_stride stride n=%x c=%x h=%x w=%x\n", stride.n , stride.c , stride.h, stride.w); + return stride; +} + +bmk1880v2_tensor_tgmem_stride_t bmk1880v2_tensor_tgmem_default_stride( + bmk1880v2_tensor_tgmem_shape_t s, fmt_t fmt) +{ + uint32_t data_type_size = (fmt == FMT_BF16) ? 2 : 1; + bmk1880v2_tensor_tgmem_stride_t stride; + stride.h = s.w * data_type_size; + stride.c = s.h * stride.h; + stride.n = s.c * stride.c; + return stride; +} + +static void try_optimize_matrix_shape(ctx_t *ctx, + bmk1880v2_matrix_lmem_shape_t *s, + fmt_t fmt_type) { + uint32_t eu_num = ctx->chip_info.eu_num; + uint32_t npu_num = ctx->chip_info.npu_num; + uint32_t col = s->col; + bool isBf16 = (fmt_type == FMT_BF16); + uint32_t workingNumber = isBf16 ? eu_num / 2 : eu_num; + + if (col >= workingNumber) { + int num_eu = ceiling_func(col, workingNumber * npu_num); + s->w = workingNumber * num_eu; + s->c = ceiling_func(col, s->w); + } else { + // col < EU_NUM + // Only transfer needed data + // We still change tensor shape in TIU mac op + s->w = col; + s->c = 1; + } +} + +bmk1880v2_matrix_lmem_shape_t bmk1880v2_matrix_lmem_default_shape( + ctx_t *ctx, + uint32_t row, + uint32_t col, + fmt_t fmt_type) +{ + bmk1880v2_matrix_lmem_shape_t s = {0}; + s.n = row; + s.col = col; + + try_optimize_matrix_shape(ctx, &s, fmt_type); + + return s; +} + +bmk1880v2_matrix_lmem_shape_t bmk1880v2_matrix_lmem_shape_t1( + ctx_t *ctx, + uint32_t len, + fmt_t fmt_type) +{ + uint32_t lmem_size = ctx->chip_info.lmem_size; + bmk1880v2_matrix_lmem_shape_t s = {0}; + + uint32_t row = 1; + uint32_t col = len; + + while (col >= lmem_size) { + ASSERT(col % 2 == 0); + col /= 2; + row *= 2; + } + + s.n = row; + s.col = col; + + try_optimize_matrix_shape(ctx, &s, fmt_type); + return s; +} + +// This should be inside bmk1880v2_lmem_alloc_matrix +bmk1880v2_matrix_lmem_stride_t bmk1880v2_matrix_lmem_default_stride( + ctx_t *ctx, + bmk1880v2_matrix_lmem_shape_t s, + fmt_t fmt, + int eu_align) +{ + uint32_t npu_num = ctx->chip_info.npu_num; + uint32_t eu_num = ctx->chip_info.eu_num; + uint32_t val = (fmt == FMT_BF16) ? 
2 : 1; + + bmk1880v2_matrix_lmem_stride_t stride; + stride.h = s.w * val; + if (eu_align) + stride.c = align_up(s.w * val, eu_num); + else + stride.c = s.w * val; + stride.n = stride.c * ceiling_func(s.c, npu_num); + + return stride; +} diff --git a/cvikernel/src/bm1880v2/kernel_1880v2.h b/cvikernel/src/bm1880v2/kernel_1880v2.h new file mode 100644 index 000000000..1f3c1ba1b --- /dev/null +++ b/cvikernel/src/bm1880v2/kernel_1880v2.h @@ -0,0 +1,372 @@ +#ifndef KERNEL_1880v2_H +#define KERNEL_1880v2_H + +#include "kernel_internal.h" + +#include +#include +#include +#include +#include +#include +#include +#include "bmkernel_standard.h" + +#include + +#define TENSOR_MUL_FIX8B 0 +#define TENSOR_MAC_FIX8B 1 +#define TENSOR_ADD_FIX8B 2 +#define TENSOR_SUB_FIX8B 3 +#define TENSOR_MAX_FIX8B 4 +#define TENSOR_MIN_FIX8B 5 +#define TENSOR_SHIFT_FIX8B 6 +#define TENSOR_AND_FIX8B 7 +#define TENSOR_OR_FIX8B 8 +#define TENSOR_XOR_FIX8B 9 +#define TENSOR_COPY_FIX8B 10 + +typedef bmk1880v2_tensor_lmem_shape_t tl_shape_t; +typedef bmk1880v2_matrix_lmem_shape_t ml_shape_t; +typedef bmk1880v2_tensor_tgmem_shape_t tg_shape_t; +typedef bmk1880v2_matrix_tgmem_shape_t mg_shape_t; + +typedef bmk1880v2_tensor_lmem_stride_t tl_stride_t; + +typedef bmk1880v2_tensor_lmem_t tl_t; +typedef bmk1880v2_matrix_lmem_t ml_t; +typedef bmk1880v2_tensor_tgmem_t tg_t; +typedef bmk1880v2_matrix_tgmem_t mg_t; +typedef bmk1880v2_compressed_tensor_tgmem_t compressed_tg_t; +typedef bmk1880v2_compressed_matrix_tgmem_t compressed_mg_t; + +desc_pair_t * bm1880v2_get_desc_pair(ctx_t *k, uint8_t eng_id); + +static inline void assert_same_stride(const tl_t *a, const tl_t *b) +{ + ASSERT(a->stride.n == b->stride.n); + ASSERT(a->stride.c == b->stride.c); + ASSERT(a->stride.h == b->stride.h); + ASSERT(a->stride.w == b->stride.w); +} + +static inline void assert_same_shape(const tl_t *a, const tl_t *b) +{ + ASSERT(a->shape.n == b->shape.n); + ASSERT(a->shape.c == b->shape.c); + ASSERT(a->shape.h == b->shape.h); + ASSERT(a->shape.w == b->shape.w); +} + +static inline void assert_same_shape_3( + const tl_t *a, + const tl_t *b, + const tl_t *c) +{ + assert_same_shape(a, b); + assert_same_shape(a, c); +} + +static inline void assert_same_shape_4( + const tl_t *a, + const tl_t *b, + const tl_t *c, + const tl_t *d) +{ + assert_same_shape_3(a, b, c); + assert_same_shape(a, d); +} + +static inline void assert_same_shape_5( + const tl_t *t0, + const tl_t *t1, + const tl_t *t2, + const tl_t *t3, + const tl_t *t4) +{ + assert_same_shape_3(t0, t1, t2); + assert_same_shape_3(t0, t3, t4); +} + +static inline void assert_same_shape_6( + const tl_t *t0, + const tl_t *t1, + const tl_t *t2, + const tl_t *t3, + const tl_t *t4, + const tl_t *t5) +{ + assert_same_shape_5(t0, t1, t2, t3, t4); + assert_same_shape(t0, t5); +} + +static inline void assert_tiu_tensor_shape(const tl_t *t) +{ + ASSERT(t->shape.n > 0); + ASSERT(t->shape.c > 0); + ASSERT(t->shape.h > 0); + ASSERT(t->shape.w > 0); + + ASSERT(t->shape.n < 0x1000); + ASSERT(t->shape.c < 0x1000); + ASSERT(t->shape.h <= (4095-32)); // 12bit, max 4095-32(lanes) + ASSERT(t->shape.w <= (4095-32)); // 12bit, max 4095-32(lanes) +} + +static inline void check_tiu_tensor(const tl_t *t) +{ + ASSERT(t); + assert_tiu_tensor_shape(t); + ASSERT(t->fmt == FMT_I8 || t->fmt == FMT_U8 || t->fmt == FMT_BF16); +} + +static inline void check_tiu_tensor_2( + const tl_t *t0, + const tl_t *t1) +{ + check_tiu_tensor(t0); + check_tiu_tensor(t1); +} + +static inline void check_tiu_tensor_3( + const tl_t *t0, + const tl_t *t1, + const 
tl_t *t2) +{ + check_tiu_tensor(t0); + check_tiu_tensor_2(t1, t2); +} + +static inline void check_tiu_tensor_4( + const tl_t *t0, + const tl_t *t1, + const tl_t *t2, + const tl_t *t3) +{ + check_tiu_tensor_3(t0, t1, t2); + check_tiu_tensor(t3); +} + +static inline void check_tiu_tensor_5( + const tl_t *t0, + const tl_t *t1, + const tl_t *t2, + const tl_t *t3, + const tl_t *t4) +{ + check_tiu_tensor_3(t0, t1, t2); + check_tiu_tensor_2(t3, t4); +} + +static inline void check_tiu_tensor_6( + const tl_t *t0, + const tl_t *t1, + const tl_t *t2, + const tl_t *t3, + const tl_t *t4, + const tl_t *t5) +{ + check_tiu_tensor_3(t0, t1, t2); + check_tiu_tensor_3(t3, t4, t5); +} + +static inline void check_16bit_tiu_tensor(const tl_t *low, const tl_t *high) +{ + check_tiu_tensor_2(low, high); + assert_same_shape(low, high); + assert_same_stride(low, high); + ASSERT(low->fmt == high->fmt); + ASSERT(low->start_address < high->start_address); +} + +static inline void assert_stride_type_0(ctx_t *ctx, const tl_t *t) +{ + uint32_t eu_num = ctx->chip_info.eu_num; + uint32_t fmt = (t->fmt == FMT_BF16) ? 2 : 1; + + uint32_t h = t->shape.h; + uint32_t w = t->shape.w * fmt; + uint32_t c_stride = align_up(h * w, eu_num); + + ASSERT(t->stride.c == c_stride); + ASSERT(t->stride.h == w); + ASSERT(t->stride.w == fmt); +} + +static inline void assert_bf16_stride_type_0(ctx_t *ctx, const tl_t *t) +{ + uint32_t eu_num = ctx->chip_info.eu_num; + uint32_t fmt = (t->fmt == FMT_BF16) ? 2 : 1; + + ASSERT(t->stride.c % eu_num == 0); + ASSERT(t->stride.w == fmt); +} + + +static inline void assert_stride_type_2(ctx_t *ctx, const tl_t *t) +{ + ASSERT(t->shape.h == 1); + ASSERT(t->shape.w == 1); + + uint32_t fmt = (t->fmt == FMT_BF16) ? 2 : 1; + uint32_t c = t->shape.c; + uint32_t npu_num = ctx->chip_info.npu_num; + + ASSERT(t->stride.n == fmt * align_up(c, npu_num) / npu_num); + ASSERT(t->stride.c == 1 * fmt); + ASSERT(t->stride.h == 1 * fmt); + ASSERT(t->stride.w == 1 * fmt); +} + +static inline void assert_bf16_stride_type_2(ctx_t *ctx, const tl_t *t) +{ + ASSERT(t->shape.h == 1); + ASSERT(t->shape.w == 1); + + uint32_t fmt = (t->fmt == FMT_BF16) ? 
2 : 1; + uint32_t c = t->shape.c; + uint32_t npu_num = ctx->chip_info.npu_num; + + ASSERT(t->stride.n == fmt * align_up(c, npu_num) / npu_num); + ASSERT(t->stride.c == 1 * fmt); + ASSERT(t->stride.h == 1 * fmt); + ASSERT(t->stride.w == 1 * fmt); +} + +static inline int tensor_is_signed(const tl_t *t) +{ + switch (t->fmt) { + case FMT_I8: + return 1; + case FMT_U8: + case FMT_BF16: //does not matter, so set to default 0 + return 0; + default: + ASSERT(0); + } +} + +static inline int matrix_is_signed(const ml_t *t) +{ + switch (t->fmt) { + case FMT_I8: + return 1; + case FMT_U8: + case FMT_BF16: //does not matter, so set to default 0 + return 0; + default: + ASSERT(0); + } +} + +static inline void fill_same_tensor_shape(tiu_reg_t *r, tl_shape_t s) +{ + uint32_t n = s.n; + uint32_t c = s.c; + uint32_t h = s.h; + uint32_t w = s.w; + + r->opd0_n = n; + r->opd0_c = c; + r->opd0_h = h; + r->opd0_w = w; + + r->opd1_n = n; + r->opd1_c = c; + r->opd1_h = h; + r->opd1_w = w; + + r->opd2_n = n; + r->opd2_c = c; + r->opd2_h = h; + r->opd2_w = w; + + r->res0_n = n; + r->res0_c = c; + r->res0_h = h; + r->res0_w = w; +} + +static inline void assert_stride_range(tl_stride_t s) +{ + ASSERT(s.n < 0x10000); + ASSERT(s.c < 0x10000); + ASSERT(s.h < 0x10000); +} + +static inline void fill_same_tensor_stride(tiu_reg_t *r, tl_stride_t s) +{ + uint32_t n = s.n; + uint32_t c = s.c; + uint32_t h = s.h; + uint32_t w = 1; + + r->opd0_n_str = n; + r->opd0_c_str = c; + r->opd0_h_str = h; + r->opd0_w_str = w; + + r->opd1_n_str = n; + r->opd1_c_str = c; + r->opd1_h_str = h; + r->opd1_w_str = w; + + r->opd2_n_str = n; + r->opd2_c_str = c; + r->opd2_h_str = h; + r->opd2_w_str = w; + + r->res0_n_str = n; + r->res0_c_str = c; + r->res0_h_str = h; + r->res0_w_str = w; +} + +#define fill_stride_code(r, op, str) \ + do { \ + r->op##_n_str = str->n; \ + r->op##_c_str = str->c; \ + r->op##_h_str = str->h; \ + r->op##_w_str = str->w; \ + } while (0) + +static inline void fill_opd0_stride(tiu_reg_t *r, const tl_stride_t *str) +{ + fill_stride_code(r, opd0, str); +} + +static inline void fill_opd1_stride(tiu_reg_t *r, const tl_stride_t *str) +{ + fill_stride_code(r, opd1, str); +} + +static inline void fill_opd2_stride(tiu_reg_t *r, const tl_stride_t *str) +{ + fill_stride_code(r, opd2, str); +} + +static inline void fill_res0_stride(tiu_reg_t *r, const tl_stride_t *str) +{ + fill_stride_code(r, res0, str); +} + +static inline void fill_same_tensor_stride_type(tiu_reg_t *r, int type) +{ + r->short_opd0_str = type & 0b11; + r->short_opd1_str = type & 0b11; + r->short_opd2_str = type & 0b11; + r->short_res0_str = type & 0b11; +} + +static inline ec_desc_t * emit_tiu_cmdbuf(ctx_t *k, tiu_reg_t *r) +{ + int engine_id = BMK1880v2_TIU; + + desc_pair_t *dp = bm1880v2_get_desc_pair(k, engine_id); + uint32_t *cmdbuf = (uint32_t *)dp->cmd_hdr->cmd; + emit_tiu_reg(r, cmdbuf); + + return dp->ec_desc; +} + +#endif /* KERNEL_1880v2_H */ diff --git a/cvikernel/src/bm1880v2/non_atomic/common.c b/cvikernel/src/bm1880v2/non_atomic/common.c new file mode 100644 index 000000000..5ad8a8287 --- /dev/null +++ b/cvikernel/src/bm1880v2/non_atomic/common.c @@ -0,0 +1,1201 @@ +/** + * \breif common wrap function for lut + */ +#include "gen_lut.h" +#include + +void bf16_table_shape(ctx_t *ctx, bmk1880v2_tensor_lmem_shape_t *s) { + // MUST valid + assert(s); + + uint32_t npu_num = ctx->chip_info.npu_num; + s->n = 1; + s->c = npu_num; + s->h = bf16_table_h(); + s->w = bf16_table_w(); // hard code for hw, hw:32x8 +} + +void bf16_table_check(bmk1880v2_tensor_lmem_t* 
IN tl_ifmap, + bmk1880v2_tensor_lmem_t *tbl_answer, + bmk1880v2_tensor_lmem_t *tbl_answer_mantissa, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16) { + + // MUST valid + assert(tl_ofmap_bf16); + assert(tl_ifmap); + assert(tbl_answer); + assert(tbl_answer_mantissa); + + // shape should be same + assert_same_shape(tl_ifmap, tl_ofmap_bf16); + + // TODO table channel should be great equal input + + // currently ONLY support bf16 + assert(tl_ifmap->fmt == FMT_BF16); + assert(tbl_answer->fmt == FMT_BF16); + assert(tbl_answer_mantissa->fmt == FMT_BF16); + assert(tl_ofmap_bf16->fmt == FMT_BF16); + + // table shape should fix + assert(is_1880v2_tbl_shape(&tbl_answer->shape)); + assert(is_1880v2_tbl_shape(&tbl_answer_mantissa->shape)); +} + +static void _bf16_table_check(bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t* IN tl_buf, + bmk1880v2_tensor_lmem_t *tbl_answer, + bmk1880v2_tensor_lmem_t *tbl_answer_mantissa, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16) { + + // check table / input / output + bf16_table_check(tl_ifmap, tbl_answer, tbl_answer_mantissa, tl_ofmap_bf16); + + assert_same_shape_3(tl_ifmap, tl_buf, tl_ofmap_bf16); + + // check buf + assert(tl_buf); + assert(tl_buf->fmt == FMT_BF16); + + // TODO: remove assert for -O2 + +} + +int _bf16_lut_exp_mantissa(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t* IN tl_buf, + bmk1880v2_tensor_lmem_t *tbl_answer, + bmk1880v2_tensor_lmem_t *tbl_answer_mantissa, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + uint8_t is_dirty_ifmap +) { + bmk1880v2_tensor_lmem_t* tmp = tl_buf; + if (is_dirty_ifmap) { + tmp = tl_ifmap; + } + + // check table / input / output + _bf16_table_check(tl_ifmap, tl_ifmap, tbl_answer, tbl_answer_mantissa, tl_ofmap_bf16); + + // issue lut cmd + bmk1880v2_tdma_l2l_tensor_copy_param_t p10; + // remove low 8 bits by int8 copy with stride + // layer_id; + bmk1880v2_tiu_lookup_table(ctx, &p12); + + // layer_id; + bmk1880v2_tiu_element_wise_mul(ctx, &p1); + + return 0; +} + +int bf16_lut_exp_mantissa(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t* IN tl_buf, + bmk1880v2_tensor_lmem_t *tbl_answer, + bmk1880v2_tensor_lmem_t *tbl_answer_mantissa, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16 +) { + return _bf16_lut_exp_mantissa(ctx, + tl_ifmap, + tl_buf, + tbl_answer, + tbl_answer_mantissa, + tl_ofmap_bf16, + false); +} + +// \int8_rnd_mode 1 is rounding to 0, e.g: 1.3->1, -1.3->-1, -1.5->-2 +// 0 is rounding to nearset even, e.g: 1.3->1, -1.3->-1, -1.7->-2 +// \return convert bf16 as int8 and locate to lower part +// e.g.: 24 = 0x18 = 1.5* 2^4 = 0x41C0 +// bf16_get_tbl_idx(0x41C0,FMT_U8) = 0x0018 +void _bf16_get_tbl_idx(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* tl_ifmap, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t src_fmt, int int8_rnd_mode) { + + assert((int8_rnd_mode == 0 || int8_rnd_mode == 1) && "only support 2 mode"); + + assert_same_shape(tl_ifmap, tl_ofmap_bf16); + assert(tl_ifmap->fmt == FMT_BF16); + assert(tl_ofmap_bf16->fmt == FMT_BF16); + + // get index + tl_shape_t tl_ofmap_A_idx_int8_shape = {tl_ofmap_bf16->shape.n, tl_ofmap_bf16->shape.c, + tl_ofmap_bf16->shape.h * tl_ofmap_bf16->shape.w, 1}; + + bmk1880v2_tensor_lmem_t dst; + bmk1880v2_tensor_lmem_s_copy(&dst, tl_ofmap_bf16); + dst.start_address = tl_ofmap_bf16->start_address; + dst.fmt = src_fmt; + dst.shape = tl_ofmap_A_idx_int8_shape; + dst.stride = bmk1880v2_tensor_lmem_default_stride(ctx, dst.shape, dst.fmt, CTRL_NULL); + dst.stride.h = dst.stride.h * 2; + dst.int8_rnd_mode = 
int8_rnd_mode; + + bmk1880v2_tdma_l2l_tensor_copy_param_t p10; + memset(&p10, 0, sizeof(p10)); + p10.dst = &dst; + p10.src = tl_ifmap; + p10.mv_lut_base = false; // MUST init by ifself in soc + p10.mv_lut_idx = false; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p10); +} + +void bf16_get_u8_tbl_idx(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* tl_ifmap, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16 + ) { + + _bf16_get_tbl_idx(ctx, tl_ifmap, tl_ofmap_bf16, FMT_U8, 0); +} + +/* + * \brief get bf16 decimal part, bf16_get_dec(12.3) = 12.0 + * it leverages bf16->int8 get integer and move to bf16 + * \tl_ifmap should be FMT_BF16 format / size + */ +void bf16_get_dec(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* tl_ifmap, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16 + ) { + + assert_same_shape_3(tl_ifmap, tl_buf, tl_ofmap_bf16); + assert(tl_ifmap->fmt == FMT_BF16); + assert(tl_ofmap_bf16->fmt == FMT_BF16); + + bmk1880v2_tdma_l2l_tensor_copy_param_t p10; + memset(&p10, 0, sizeof(p10)); + bmk1880v2_tensor_lmem_t dst, src; + bmk1880v2_tensor_lmem_s_copy(&src, tl_ifmap); + bmk1880v2_tensor_lmem_s_copy(&dst, tl_buf); + + dst.fmt = FMT_I8; + dst.stride = bmk1880v2_tensor_lmem_default_stride(ctx, dst.shape, + dst.fmt, CTRL_AL); + + // bf16 -> int8 + p10.dst = &dst; + p10.src = &src; + p10.mv_lut_base = false; // MUST init by ifself in soc + p10.mv_lut_idx = false; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p10); + + // int8 -> bf16 + //src.fmt = FMT_I8; + //tl_shape_t tl_ofmap_A_idx_int8_shape = {tl_ofmap_bf16->shape.n, tl_ofmap_bf16->shape.c, + // tl_ofmap_bf16->shape.h * tl_ofmap_bf16->shape.w, 1}; + //src.shape = tl_ofmap_A_idx_int8_shape; + //src.stride = bmk1880v2_tensor_lmem_default_stride(ctx, src.shape, /*eu_align*/ 1, src.fmt); + //src.stride.w = 2; + + //tl_shape_t tl_dst_reshape = {tl_ofmap_bf16->shape.n, tl_ofmap_bf16->shape.c, + // 1, tl_ofmap_bf16->shape.h * tl_ofmap_bf16->shape.w}; + + p10.dst = tl_ofmap_bf16; + p10.src = &dst; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p10); +} + +// \return decimal fractions / mantissa_as_idx, +// e.g: bf16_get_dec_fractions(12.341) = 0.341 +// NOTICE: we use bf16->i8, the decimal part should be -127 ~ +127 +void bf16_get_dec_fractions(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* tl_ifmap, + bmk1880v2_tensor_lmem_t* OUT buf, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16 + ) { + + assert_same_shape_3(tl_ifmap, buf, tl_ofmap_bf16); + assert(tl_ifmap->fmt == FMT_BF16); + assert(tl_ofmap_bf16->fmt == FMT_BF16); + + // idx(i8) to bf16 format to sub it + bf16_get_dec(ctx, tl_ifmap, tl_ofmap_bf16, buf); + + // mantissa part -> sub to get mantissa + bmk1880v2_tiu_element_wise_sub_param_t p5; + memset(&p5, 0, sizeof(p5)); + p5.res_high = 0; + p5.res_low = tl_ofmap_bf16; + p5.a_high = 0; + p5.a_low = tl_ifmap; + p5.b_high = 0; + p5.b_low = buf; + p5.rshift_bits = 0; + bmk1880v2_tiu_element_wise_sub(ctx, &p5); +} + +/** + * \table_shape return table shape under 1880v2 BF16 + * \return table byte size under BF16 + */ +uint64_t bf16_lut_tbl_bytesize(ctx_t *ctx, + bmk1880v2_tensor_lmem_shape_t *table_shape, fmt_t fmt) { + + assert(table_shape); + + int data_type_size = bytesize_of_fmt(fmt); + bf16_table_shape(ctx, table_shape); + uint64_t table_size = tl_shape_size(table_shape); + + return table_size * data_type_size; +} + +/** + * \brief f(x) = x*x + */ +int bf16_emit_square(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* tl_ifmap, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt) { + + (void)fmt; + assert_same_shape(tl_ifmap, tl_ofmap_bf16); + 
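+ // squaring is expressed as a self-multiply: operand b points at the same tensor as a, so res = a * a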
assert(tl_ifmap->fmt == FMT_BF16); + assert(tl_ofmap_bf16->fmt == FMT_BF16); + + bmk1880v2_tiu_element_wise_mul_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_ifmap; + p1.b_is_const = 0; + p1.b = tl_ifmap; + p1.rshift_bits = 0; + p1.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(ctx, &p1); + + return 0; +} + +/** + * \brief f(x) = |x| + * TODO: check tl_ifmap->start_addr != tl_ofmap_bf16->start_addr + */ +int bf16_emit_abs(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* tl_ifmap, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt) { + + (void)fmt; + assert_same_shape(tl_ifmap, tl_ofmap_bf16); + assert(tl_ifmap->fmt == FMT_BF16); + assert(tl_ofmap_bf16->fmt == FMT_BF16); + + // abs it, multiply -1 + bmk1880v2_tiu_element_wise_mul_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_ifmap; + p1.b_is_const = 1; + p1.b_const.val = convert_fp32_bf16(-1.0); + p1.rshift_bits = 0; + p1.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(ctx, &p1); + + // abs it, get max + bmk1880v2_tiu_element_wise_max_param_t p; + memset(&p, 0, sizeof(p)); + p.max = tl_ofmap_bf16; + p.a = tl_ofmap_bf16; + p.b_is_const = 0; + p.b = tl_ifmap; + bmk1880v2_tiu_element_wise_max(ctx, &p); + + return 0; +} + +/** + * \brief pythagoras p(x, y) = pow(x*x + y*y, 0.5) + * plz refer [here](http://www.themathpage.com/Alg/pythagorean-distance.htm) + */ +int bf16_emit_pythagoras(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* y, + bmk1880v2_tensor_lmem_t* x, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t* tl_buf2, + bmk1880v2_tensor_lmem_t* tl_sqrt_table_answer, + bmk1880v2_tensor_lmem_t* tl_sqrt_table_answer_mantissa, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt) { + + // y0 = x * x + bf16_emit_square(ctx, x, tl_buf, fmt); + +#if 1 + // y0 = y0 + y * y + bmk1880v2_tiu_element_wise_mac_param_t p2; + memset(&p2, 0, sizeof(p2)); + p2.res_high = 0; + p2.res_low = tl_buf; + p2.res_is_int8 = 0; + p2.a = y; + p2.b_is_const = 0; + p2.b = y; + p2.lshift_bits = 0;//lshift_bits; + p2.rshift_bits = 0;//rshift_bits; + p2.relu_enable = 0; + bmk1880v2_tiu_element_wise_mac(ctx, &p2); +#else + // y * y + bf16_emit_square(ctx, y, tl_buf2, fmt); + // y = x + y + { + bmk1880v2_tiu_element_wise_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_buf; + p4.a_high = 0; + p4.a_low = tl_buf2; + p4.b_is_const = 0; + p4.b_high = 0; + p4.b_low = tl_buf; + p4.rshift_bits = 0; + p4.relu_enable = 0; + bmk1880v2_tiu_element_wise_add(ctx, &p4); + } +#endif + + // y0 = sqrt(y0) + bf16_emit_sqrt(ctx, + tl_buf, + tl_buf2, + tl_sqrt_table_answer, + tl_sqrt_table_answer_mantissa, + tl_ofmap_bf16 + ); + return 0; +} + +void bf16_gen_0_tbl(uint16_t* OUT table_0, + bmk1880v2_tensor_lmem_shape_t* table_shape) { + + assert(is_1880v2_tbl_shape(table_shape)); + + uint32_t half = half_h_table(); + int table_hw = bf16_table_hw(); + + table_0[0] = convert_fp32_bf16(1.0); + + for (uint32_t i = 1; i < half * 2; i++) { + table_0[i] = convert_fp32_bf16(0.0); + } + +#ifdef DBG + for (uint32_t i = 0; i < 2 * half; i++) { + printf("lut [%u] is %lf, 0x%x\n", i, convert_bf16_fp32(table_0[i]), + table_0[i]); + } +#endif /* ifdef DBG */ + + // duplicate channel #1 to #31 + //TODO: tensor copy + for (uint64_t i = 1; i < table_shape->c; i++) { + memcpy(&table_0[table_hw * i], &table_0[0], sizeof(uint16_t) * table_hw); + } +} + + +/** + * \brief check which element is 0, return 1 others return 0 + * e.g: input = [0, 1, -1, 2] output [1, 0, 0, 0] + */ +int 
bf16_emit_0_idx(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t* tbl_answer, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt + ) { + + // check table / input / output + _bf16_table_check(tl_ifmap, tl_buf, tbl_answer, tbl_answer, tl_ofmap_bf16); + + assert(fmt); + + // TODO: add fmt parameter? + // abs for \bf16_get_u8_tbl_idx we use bf16->uint8_t + //bf16_emit_abs(ctx, tl_ifmap, tl_ofmap_bf16, FMT_BF16); + // TODO check if address == of address + //bf16_get_u8_tbl_idx(ctx, tl_ofmap_bf16, tl_buf); + // re-scale 0.xx to x. + //bf16_emit_mul_const(ctx, tl_ifmap, tl_buf, fmt, 1000); + + // we directly use mantissa as index, try to add mantissa and mul to filter 2's power + //bf16_emit_abs(ctx, tl_ifmap, tl_buf, fmt); + //bf16_emit_add_const(ctx, tl_buf, tl_buf, fmt, convert_bf16_fp32(0x7f) + 1); + //bf16_emit_mul(ctx, tl_ifmap, tl_buf, tl_buf, fmt); + + + bmk1880v2_tiu_lookup_table_param_t p12; + memset(&p12, 0, sizeof(p12)); +#if 1 + // NOTICE: we use int8 mul to enlarge 2^n + bmk1880v2_tensor_lmem_t src, dst; + bmk1880v2_tensor_lmem_s_copy(&src, tl_ifmap); + bmk1880v2_tensor_lmem_s_copy(&dst, tl_buf); + + src.fmt = FMT_U8; + src.shape.w = src.shape.w * 2; // real size + src.stride = bmk1880v2_tensor_lmem_default_stride(ctx, src.shape, FMT_I8, CTRL_NULL); + dst.shape = src.shape; + dst.fmt = src.fmt; + dst.stride = src.stride; + + bmk1880v2_tiu_element_wise_mul_param_t p; + memset(&p, 0, sizeof(p)); + p.res_high = NULL; + p.res_low = &dst; + p.a = &src; + p.b_is_const = 1; + p.b_const.val = 255; // saturate + p.b_const.is_signed = 0; + p.rshift_bits = 2; // avoid unnormal + p.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(ctx, &p); + + + // get 2^x and 0 + p12.ofmap = tl_buf; + p12.ifmap = tl_buf; + p12.table = tbl_answer; + bmk1880v2_tiu_lookup_table(ctx, &p12); + + //bf16_get_u8_tbl_idx(ctx, tl_buf, tl_ofmap_bf16); + _bf16_get_tbl_idx(ctx, tl_ifmap, tl_ofmap_bf16, FMT_I8, 0); + + // get 0=0 and < 0 + // p = table_pos_neg(x0) (>= 0 return 1, < 0 return -1) + bmk1880v2_tdma_l2l_tensor_copy_param_t p10; + memset(&p10, 0, sizeof(p10)); + p10.dst = tl_buf; + p10.src = tl_ifmap; + p10.mv_lut_base = false; // MUST init by ifself in soc + p10.mv_lut_idx = true; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; + + bmk1880v2_tiu_lookup_table_param_t p12; + memset(&p12, 0, sizeof(p12)); + p12.ofmap = tl_buf; + p12.ifmap = tl_buf; + p12.table = tl_pos_neg_table; + bmk1880v2_tiu_lookup_table(ctx, &p12); + + return 0; +} + +/** + * \brief check elements are < 0 + * \tl_pos_neg_table plz refer \bf16_atan_pos_neg + * e.g: input = [0, 10, 6, -1, 0] output [0, 0, 0, 1, 0] + */ +int bf16_emit_neg_idx(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, + bmk1880v2_tensor_lmem_t* tl_pos_neg_table, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt + ) { + + _bf16_emit_pre_pos_neg(ctx, tl_ifmap, tl_buf, tl_pos_neg_table, tl_ofmap_bf16); + + // sub 1, [1 -1] -> [0 -2] + bf16_emit_add_const(ctx, tl_buf, tl_buf, fmt, -1.0); + + // abs, [0 -2] -> [0 2] + bf16_emit_abs(ctx, tl_buf, tl_ofmap_bf16, fmt); + + // mul 1/2 [0 2] -> [0 1] + bf16_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 0.5); + + return 0; +} + +/** + * \brief check elements are >= 0 + * \tl_pos_neg_table plz refer \bf16_atan_pos_neg + * e.g: input = [0, 10, 6, -1, 0] output [0, 1, 1, 0, 0] + */ +int bf16_emit_pos_idx(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t 
*tl_buf, + bmk1880v2_tensor_lmem_t* tl_pos_neg_table, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt + ) { + + _bf16_emit_pre_pos_neg(ctx, tl_ifmap, tl_buf, tl_pos_neg_table, tl_ofmap_bf16); + + // add 1, [1 -1] -> [2 0] + bf16_emit_add_const(ctx, tl_buf, tl_buf, fmt, 1.0); + + // mul 1/2 [2 0] -> [1 0] + bf16_emit_mul_const(ctx, tl_buf, tl_ofmap_bf16, fmt, 0.5); + + return 0; +} + +/** + * \brief invert 0/1 input + * e.g: input = [0, 1, 1, 1, 0] output [1, 0, 0, 0, 1] + */ +int _bf16_emit_0_1_revert_input(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt, uint8_t is_dirty_ifmap + ) { + // [-1, -1, 0, -1, 0] = sub([0 0 1 0 1], 1) + // [1, 1, 0, 1, 0] = abs([-1, -1, 0, -1, 0]) + bmk1880v2_tensor_lmem_t *_tl_buf = tl_buf; + + // check buf + if (is_dirty_ifmap) { + _tl_buf = tl_ifmap; + } + else { + assert(tl_buf); + assert(tl_buf->fmt == FMT_BF16); + assert_same_shape(tl_buf, tl_ofmap_bf16); + } + + // sub 1, = add -1 + bf16_emit_add_const(ctx, tl_ifmap, _tl_buf, fmt, -1.0); + + // abs + bf16_emit_abs(ctx, _tl_buf, tl_ofmap_bf16, fmt); + + return 0; +} + +int bf16_emit_0_1_revert_input(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt + ) { + return _bf16_emit_0_1_revert_input(ctx, + tl_ifmap, tl_buf, tl_ofmap_bf16, fmt, false); +} +/** + * \brief invert 0/1 value + * e.g: input = [0, 10, 6, -1, 0] output [1, 0, 0, 0, 1] + * the step is [0, 10, 6, -1, 0] -> [0, 1, 1, 1, 0] -> [1, 0, 0, 0, 1] + */ +int bf16_emit_0_1_revert(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t* tbl_answer, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt + ) { + // [-1, -1, 0, -1, 0] = sub([0 0 1 0 1], 1) + // [1, 1, 0, 1, 0] = abs([-1, -1, 0, -1, 0]) + + // check table / input / output + _bf16_table_check(tl_ifmap, tl_buf, tbl_answer, tbl_answer, tl_ofmap_bf16); + + // check which element is 0, return 1 others return 0 + bf16_emit_0_idx(ctx, tl_ifmap, tl_buf, tbl_answer, tl_ofmap_bf16, fmt); + + bf16_emit_0_1_revert_input(ctx, tl_ofmap_bf16, tl_buf, tl_ofmap_bf16, fmt); + + return 0; +} + +// \brief a(tensor) * b(tensor) +int bf16_emit_mul(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t* IN tl_ifmap2, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt + ) { + + (void)fmt; + + bmk1880v2_tiu_element_wise_mul_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_ifmap; + p1.b_is_const = 0; + p1.b = tl_ifmap2; + p1.rshift_bits = 0; + p1.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(ctx, &p1); + + return 0; +} + +// \brief a(tensor) * b(tensor) +int bf16_emit_add(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t* IN tl_ifmap2, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt) { + + (void)fmt; + + bmk1880v2_tiu_element_wise_add_param_t p4; + memset(&p4, 0, sizeof(p4)); + p4.res_high = 0; + p4.res_low = tl_ofmap_bf16; + p4.a_high = 0; + p4.a_low = tl_ifmap; + p4.b_is_const = 0; + p4.b_high = 0; + p4.b_low = tl_ifmap2; + p4.rshift_bits = 0; + p4.relu_enable = 0; + bmk1880v2_tiu_element_wise_add(ctx, &p4); + + return 0; +} + +int bf16_emit_add_const(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt, float b) { + + (void)fmt; + + bmk1880v2_tiu_element_wise_add_param_t 
p4; + memset(&p4, 0, sizeof(p4)); + p4.res_high = 0; + p4.res_low = tl_ofmap_bf16; + p4.a_high = 0; + p4.a_low = tl_ifmap; + p4.b_is_const = 1; + p4.b_high = 0; + p4.b_const.val = convert_fp32_bf16(b); + p4.rshift_bits = 0; + p4.relu_enable = 0; + bmk1880v2_tiu_element_wise_add(ctx, &p4); + + return 0; +} + +// \brief a(tensor) * b(const) +int bf16_emit_mul_const(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt, float b + ) { + + (void)fmt; + + bmk1880v2_tiu_element_wise_mul_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_ifmap; + p1.b_is_const = 1; + p1.b_const.val = convert_fp32_bf16(b); + p1.rshift_bits = 0; + p1.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(ctx, &p1); + + return 0; +} + +// \brief a(tensor) / b(const) +// NOTICE: it could dirty \y if \is_dirty_ifmap set true +int bf16_emit_x_over_y(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN x, + bmk1880v2_tensor_lmem_t* IN y, + bmk1880v2_tensor_lmem_t* IN tl_buf, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + bmk1880v2_tensor_lmem_t *tl_table_answer, + bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa, + fmt_t fmt, uint8_t is_dirty_ifmap + ) { + + bmk1880v2_tensor_lmem_t* tmp = tl_buf; + if (is_dirty_ifmap) { + tmp = NULL; + } + + // y = reciprocal(y) + _bf16_lut_exp_mantissa(ctx, + y, + tmp, + tl_table_answer, + tl_table_answer_mantissa, + tl_ofmap_bf16, + is_dirty_ifmap + ); + + // x / y = x * (1/y) + bf16_emit_mul(ctx, x, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + return 0; +} + +int _bf16_emit_mask(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, + bmk1880v2_tensor_lmem_t *tl_buf2, + bmk1880v2_tensor_lmem_t *tl_buf3, + bmk1880v2_tensor_lmem_t* tl_pos_neg_table, + bmk1880v2_tensor_lmem_t* tl_0_idx_table, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt, enum BF16_MASK_TYPE mask, + uint8_t is_dirty_ifmap) { + + _bf16_table_check(tl_ifmap, tl_buf, tl_pos_neg_table, tl_0_idx_table, tl_ofmap_bf16); + if (is_dirty_ifmap) { + assert_same_shape(tl_buf, tl_buf2); + } + else { + assert_same_shape_3(tl_buf, tl_buf2, tl_buf3); + } + + + switch (mask) { + case BF16_MASK_TYPE_GT_0: + // x > 0 + { + // x >= 0 + bf16_emit_pos_idx(ctx, tl_ifmap, tl_buf, tl_pos_neg_table, tl_buf2, fmt); + + bmk1880v2_tensor_lmem_t *out = tl_ofmap_bf16; + bmk1880v2_tensor_lmem_t *in = tl_ofmap_bf16; + if (is_dirty_ifmap) { + // x = 0 + bf16_emit_0_idx(ctx, tl_ifmap, tl_buf, tl_0_idx_table, tl_ofmap_bf16, fmt); // 0.003 could consider 1 + // !(x = 0) + _bf16_emit_0_1_revert_input(ctx, tl_ofmap_bf16, NULL, tl_buf, fmt, true); + in = tl_buf; + out = tl_ofmap_bf16; + } + else { + // x = 0 + bf16_emit_0_idx(ctx, tl_ifmap, tl_buf, tl_0_idx_table, tl_buf3, fmt); // 0.003 could consider 1 + // !(x = 0) + bf16_emit_0_1_revert_input(ctx, tl_buf3, tl_buf, tl_ofmap_bf16, fmt); + } + + // x > 0 = (x >= 0 && !(x = 0)) + bf16_emit_mul(ctx, in, tl_buf2, out, fmt); + } + break; + case BF16_MASK_TYPE_GE_0: + // y >= 0 + + bf16_emit_pos_idx(ctx, tl_ifmap, tl_buf, tl_pos_neg_table, tl_ofmap_bf16, fmt); + break; + case BF16_MASK_TYPE_EQ_0: + bf16_emit_0_idx(ctx, tl_ifmap, tl_buf, tl_0_idx_table, tl_ofmap_bf16, fmt); // 0.003 could consider 1 + break; + case BF16_MASK_TYPE_LT_0: + // x < 0 + + // x < 0 + bf16_emit_neg_idx(ctx, tl_ifmap, tl_buf, tl_pos_neg_table, tl_ofmap_bf16, fmt); + + break; + case BF16_MASK_TYPE_LE_0: + // x < 0 + bf16_emit_neg_idx(ctx, tl_ifmap, tl_buf, tl_pos_neg_table, tl_ofmap_bf16, fmt); + + // x = 0 
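+ // (the (x < 0) and (x == 0) masks are disjoint 0/1 maps, so the element-wise add below acts as a logical OR)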
+ bf16_emit_0_idx(ctx, tl_ifmap, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1 + + // x <= 0 = (x < 0 || (x = 0)) + bf16_emit_add(ctx, tl_ofmap_bf16, tl_buf2, tl_ofmap_bf16, fmt); + break; + default: + assert(0 && "not support yet"); + } + return 0; +} + +/** + * \brief return > 0 mask + * e.g.: [1 2 -1 0 -3 -4 ] -> [1 1 0 0 0 0] + */ +int bf16_emit_mask_gt0(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, + bmk1880v2_tensor_lmem_t *tl_buf2, + bmk1880v2_tensor_lmem_t *tl_buf3, + bmk1880v2_tensor_lmem_t* tl_pos_neg_table, + bmk1880v2_tensor_lmem_t* tl_0_idx_table, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt) { + + return bf16_emit_mask(ctx, + tl_ifmap, + tl_buf, + tl_buf2, + tl_buf3, + tl_pos_neg_table, + tl_0_idx_table, + tl_ofmap_bf16, fmt, BF16_MASK_TYPE_GT_0); +} + +/** + * \brief return >= 0 mask + * e.g.: [1 2 -1 0 -3 -4 ] -> [1 1 0 1 0 0] + */ +int bf16_emit_mask_ge0(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, + bmk1880v2_tensor_lmem_t* tl_pos_neg_table, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt) { + + return bf16_emit_mask(ctx, + tl_ifmap, + tl_buf, + tl_buf, // fake + tl_buf, // fake + tl_pos_neg_table, + tl_pos_neg_table, //fake + tl_ofmap_bf16, fmt, BF16_MASK_TYPE_GE_0); +} + + +/** + * \brief return <= 0 mask + * e.g.: [1 2 -1 0 -3 -4 ] -> [0 0 1 1 0 0] + */ +int bf16_emit_mask_le0(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, + bmk1880v2_tensor_lmem_t* tl_pos_neg_table, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt) { + + return bf16_emit_mask(ctx, + tl_ifmap, + tl_buf, + tl_buf, // fake + tl_buf, // fake + tl_pos_neg_table, + tl_pos_neg_table, //fake + tl_ofmap_bf16, fmt, BF16_MASK_TYPE_LE_0); +} + + +/** + * \brief return = 0 mask + * e.g.: [1 2 -1 0 -3 -4 ] -> [0 0 0 1 0 0] + */ +int bf16_emit_mask_eq0(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, + bmk1880v2_tensor_lmem_t* tl_0_idx_table, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt) { + + return bf16_emit_mask(ctx, + tl_ifmap, + tl_buf, + tl_buf, // fake + tl_buf, // fake + tl_0_idx_table, // fake + tl_0_idx_table, + tl_ofmap_bf16, fmt, BF16_MASK_TYPE_EQ_0); +} + + +/** + * \brief return < 0 mask + * e.g.: [1 2 -1 0 -3 -4 ] -> [0 0 1 0 1 1] + */ +int bf16_emit_mask_lt0(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, + bmk1880v2_tensor_lmem_t* tl_pos_neg_table, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt) { + + return bf16_emit_mask(ctx, + tl_ifmap, + tl_buf, + tl_buf, // fake + tl_buf, // fake + tl_pos_neg_table, + tl_pos_neg_table, // fake + tl_ofmap_bf16, fmt, BF16_MASK_TYPE_LT_0); +} + +int bf16_emit_mask(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, + bmk1880v2_tensor_lmem_t *tl_buf2, + bmk1880v2_tensor_lmem_t *tl_buf3, + bmk1880v2_tensor_lmem_t* tl_pos_neg_table, + bmk1880v2_tensor_lmem_t* tl_0_idx_table, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt, enum BF16_MASK_TYPE mask) { + + return _bf16_emit_mask(ctx, + tl_ifmap, + tl_buf, + tl_buf2, + tl_buf3, + tl_pos_neg_table, + tl_0_idx_table, + tl_ofmap_bf16, fmt, mask, false); +} + +// return x >=0 to 1, x < 0 is -1 +void bf16_emit_mask_ge0_lt0( + ctx_t *ctx, + bmk1880v2_tensor_lmem_t* y, + bmk1880v2_tensor_lmem_t* index_i8, + bmk1880v2_tensor_lmem_t* OUT tl_buf3, + fmt_t fmt + ) { + + bmk1880v2_tiu_element_wise_mul_param_t p; + 
bmk1880v2_tdma_l2l_tensor_copy_param_t p1; + memset(&p, 0, sizeof(p)); + memset(&p1, 0, sizeof(p1)); + + // get y < 0, bf16->int8 and mul 0xff to get -128 and righshift to get 1 + bf16_emit_mul_const(ctx, y, tl_buf3, fmt, pow(2,64)); + p1.src = tl_buf3; + p1.dst = index_i8; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p1); + + + p.res_high = 0; + p.res_low = index_i8; + p.a = index_i8; + p.b_is_const = 1; + p.b_const.val = -128; + p.b_const.is_signed = 1; + p.rshift_bits = 0; + p.relu_enable = 1; + bmk1880v2_tiu_element_wise_mul(ctx, &p); + + p.res_high = 0; + p.res_low = index_i8; + p.a = index_i8; + p.b_is_const = 1; + p.b_const.val = 1; + p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 1; + bmk1880v2_tiu_element_wise_mul(ctx, &p); + + // get y < 0 indicate 1 + p1.src = index_i8; + p1.dst = tl_buf3; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p1); + + // merge, y >= 0 is 1, y < 0 is -1 + bf16_emit_mul_const(ctx, tl_buf3, tl_buf3, fmt, -2.0); + bf16_emit_add_const(ctx, tl_buf3, tl_buf3, fmt, 1.0); + +#if 0 + bf16_emit_mul_const(ctx, tl_buf3, tl_buf3, fmt, -1.0); + + // get y > 0 + // y * (-1) + 1 get 0/1 map, 1 indicate xy == 0 + bf16_emit_add_const(ctx, tl_buf3, tl_buf2, fmt, 1.0); + + // reduce y == 0 + if (0) + { + bmk1880v2_tiu_element_wise_max_param_t p3; + bmk1880v2_tensor_lmem_t index_i8; + bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(ctx, index_i8, tl_ofmap_bf16, FMT_I8); + bf16_emit_mul_const(ctx, y, tl_buf, fmt, -1); + p3.max = tl_buf; + p3.a = y; + p3.b_is_const = 0; + p3.b = tl_buf; + bmk1880v2_tiu_element_wise_max(ctx, &p3); + bf16_emit_mul_const(ctx, tl_buf, tl_buf, fmt, convert_bf16_fp32(0x7f00)); + //bf16_emit_mul_const(ctx, tl_buf, tl_buf, fmt, pow(2, 64)); + + p1.src = tl_buf; + p1.dst = index_i8; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p1); + + p.res_high = NULL; + p.res_low = index_i8; + p.a = index_i8; + p.b_is_const = 1; + p.b_const.val = -1; + p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(ctx, &p); + + + p1.src = index_i8; + p1.dst = tl_buf3; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p1); + + //revert it + bf16_emit_mul_const(ctx, tl_buf3, tl_buf3, fmt, -1.0); + //bf16_emit_add_const(ctx, tl_buf3, tl_buf3, fmt, 1); + bf16_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + } + + bf16_emit_add(ctx, tl_buf2, tl_buf3, tl_buf3, fmt); +#endif +} + +/* + * \return -1 means others, 0 indicate 0 + */ +void bf16_emit_mask_eq_0(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* y, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t* index_i8, + bmk1880v2_tensor_lmem_t* OUT tl_buf3, + fmt_t fmt + ) { + + bmk1880v2_tdma_l2l_tensor_copy_param_t p1; + bmk1880v2_tiu_element_wise_mul_param_t p; + memset(&p1, 0, sizeof(p1)); + memset(&p, 0, sizeof(p)); + + bf16_emit_abs(ctx, y, tl_buf, fmt); + //bf16_emit_mul_const(ctx, y, tl_buf, fmt, -1); + //bmk1880v2_tiu_element_wise_max_param_t p3; + //p3.max = tl_buf; + //p3.a = y; + //p3.b_is_const = 0; + //p3.b = tl_buf; + //bmk1880v2_tiu_element_wise_max(ctx, &p3); + bf16_emit_mul_const(ctx, tl_buf, tl_buf, fmt, convert_bf16_fp32(0x7f00)); + + p1.src = tl_buf; + p1.dst = index_i8; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p1); + + p.res_high = NULL; + p.res_low = index_i8; + p.a = index_i8; + p.b_is_const = 1; + p.b_const.val = -1; + p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(ctx, &p); + + p1.src = index_i8; + p1.dst = tl_buf3; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p1); +} + diff --git 
a/cvikernel/src/bm1880v2/non_atomic/fp32_bf16_kernel.c b/cvikernel/src/bm1880v2/non_atomic/fp32_bf16_kernel.c new file mode 100644 index 000000000..3c50dd114 --- /dev/null +++ b/cvikernel/src/bm1880v2/non_atomic/fp32_bf16_kernel.c @@ -0,0 +1,49 @@ +#include "../kernel_1880v2.h" + +// only fill base_reg_index/int8_rnd_mode +static void init_tgmem(bmk1880v2_tensor_tgmem_t* t) { + t->base_reg_index = 0; + t->int8_rnd_mode = 0; +} + +int bf16_s2s_fp32_bf16(bmk1880v2_context_t* ctx, uint64_t gaddr_fp32, + bmk1880v2_tensor_tgmem_shape_t fp32_shape, uint64_t gaddr_bf16, + bmk1880v2_tensor_tgmem_shape_t bf16_shape, fmt_t fmt) { + int ret = 0; + ASSERT(fmt == FMT_BF16 && "only support FMT_BF16"); + ASSERT(fp32_shape.w % 2 == 0 && "fp32's w MUST align with 2"); + + bmk1880v2_tdma_tg2tg_tensor_copy_param_t p; + + bmk1880v2_tensor_tgmem_t src, dst; + + init_tgmem(&src); + init_tgmem(&dst); + + int fp32_w = 2; + src.fmt = fmt; + src.start_address = gaddr_fp32 + fp32_w; // copy from high part + src.shape = fp32_shape; + src.shape.h = fp32_shape.w * fp32_shape.h / fp32_w; + src.shape.w = 1; + + int fmt_sz = ceiling_bytesize_of(bitsize_of_fmt(fmt)); + src.stride.n = fp32_shape.w * fp32_shape.h * fp32_shape.c * fmt_sz; + src.stride.c = fp32_shape.w * fp32_shape.h * fmt_sz; + src.stride.h = fp32_w * fmt_sz; + + dst.fmt = fmt; + dst.start_address = gaddr_bf16; + dst.shape = bf16_shape; + dst.shape.h = bf16_shape.w * bf16_shape.h / fp32_w; + dst.shape.w = 1; + dst.stride = bmk1880v2_tensor_tgmem_default_stride(dst.shape, fmt); + + memset(&p, 0, sizeof(p)); + p.src = &src; + p.dst = &dst; + + bmk1880v2_tdma_tg2tg_bf16_tensor_copy(ctx, &p); + + return ret; +} diff --git a/cvikernel/src/bm1880v2/non_atomic/gen_lut.h b/cvikernel/src/bm1880v2/non_atomic/gen_lut.h new file mode 100644 index 000000000..e73763936 --- /dev/null +++ b/cvikernel/src/bm1880v2/non_atomic/gen_lut.h @@ -0,0 +1,182 @@ +#ifndef GEN_LUT_1880v2_H +#define GEN_LUT_1880v2_H + +#include "../kernel_1880v2.h" + +#include + +#define IN +#define OUT +static inline int bf16_exp_start() +{ + return -62; +} +static inline int bf16_exp_end() +{ + return 63; +} +static inline int bf16_table_h() +{ + return 32; +} +static inline int bf16_table_w() +{ + return 8; +} +static inline int bf16_table_hw() +{ + return bf16_table_h() * bf16_table_w(); +} +static inline int half_h_table() +{ + return bf16_table_h() * bf16_table_w() / 2; +} +static inline uint8_t is_1880v2_tbl_shape(bmk1880v2_tensor_lmem_shape_t *s) +{ + // FIXME: h could be reduce less than 32 + assert(s->h == (uint32_t)bf16_table_h() && s->w == (uint32_t)bf16_table_w() && + "table h/w should be 32/8"); + + return s->h == (uint32_t)bf16_table_h() && s->w == (uint32_t)bf16_table_w(); +} + +// n * s->c * s->h * s->w; +} + +// copy bmk1880v2_tensor_lmem_t structure +static inline void bmk1880v2_tensor_lmem_s_copy(bmk1880v2_tensor_lmem_t *dst, + bmk1880v2_tensor_lmem_t *src) +{ + + dst->start_address = src->start_address; + dst->fmt = src->fmt; + dst->shape = src->shape; + dst->stride = src->stride; + dst->int8_rnd_mode = src->int8_rnd_mode; +} + +static inline void +bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx_t *ctx, bmk1880v2_tensor_lmem_t *dst, + bmk1880v2_tensor_lmem_t *src, fmt_t fmt) +{ + assert(src->fmt == FMT_BF16 && (fmt == FMT_I8 || fmt == FMT_U8) && + "only support bf16->i8/uint8_t, plz check fmt\n"); + + dst->start_address = src->start_address; + dst->fmt = fmt; + dst->shape = src->shape; + dst->shape.w *= 2; + dst->stride = bmk1880v2_tensor_lmem_default_stride(ctx, dst->shape, + fmt, 
CTRL_NULL); + // dst->shape.h *= 2; + // dst->stride = bmk1880v2_tensor_lmem_default_stride(ctx, dst->shape, + // /*eu_align*/ 1, + // fmt); + // dst->shape.h = src->shape.h; + dst->int8_rnd_mode = src->int8_rnd_mode; +} + +// l2l means we keep the same shape between bf16/(u)int8 +static inline void +bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(ctx_t *ctx, + bmk1880v2_tensor_lmem_t *dst, + bmk1880v2_tensor_lmem_t *src, fmt_t fmt) +{ + assert(src->fmt == FMT_BF16 && (fmt == FMT_I8 || fmt == FMT_U8) && + "only support bf16->i8/uint8_t, plz check fmt\n"); + + dst->start_address = src->start_address; + dst->fmt = fmt; + dst->shape = src->shape; + dst->stride = bmk1880v2_tensor_lmem_default_stride(ctx, dst->shape, + fmt, CTRL_NULL); + dst->int8_rnd_mode = src->int8_rnd_mode; +} + +int bf16_emit_square(ctx_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16, fmt_t fmt); + +void bf16_table_check(bmk1880v2_tensor_lmem_t *IN tl_ifmap, + bmk1880v2_tensor_lmem_t *tbl_answer, + bmk1880v2_tensor_lmem_t *tbl_answer_mantissa, + bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16); + +int bf16_lut_exp_mantissa(ctx_t *ctx, bmk1880v2_tensor_lmem_t *IN tl_ifmap, + bmk1880v2_tensor_lmem_t *IN tl_buf, + bmk1880v2_tensor_lmem_t *tbl_answer, + bmk1880v2_tensor_lmem_t *tbl_answer_mantissa, + bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16); + +void bf16_get_u8_tbl_idx(ctx_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16); + +void bf16_get_dec(ctx_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, + bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16); + +void bf16_get_dec_fractions(ctx_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *OUT buf, + bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16); + +int bf16_emit_abs(ctx_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16, fmt_t fmt); + +int _bf16_lut_exp_mantissa(ctx_t *ctx, bmk1880v2_tensor_lmem_t *IN tl_ifmap, + bmk1880v2_tensor_lmem_t *IN tl_buf, + bmk1880v2_tensor_lmem_t *tbl_answer, + bmk1880v2_tensor_lmem_t *tbl_answer_mantissa, + bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16, + uint8_t is_dirty_ifmap); + +int _bf16_atan_fast_emit(ctx_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, + bmk1880v2_tensor_lmem_t *tl_buf2, + bmk1880v2_tensor_lmem_t *tl_y0_buf, + bmk1880v2_tensor_lmem_t *tl_invert_buf, + bmk1880v2_tensor_lmem_t *tl_pos_neg_buf, + bmk1880v2_tensor_lmem_t *tl_table_answer, + bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16, fmt_t fmt, + float b, uint8_t is_dirty_ifmap); + +int bf16_emit_x_over_y(ctx_t *ctx, bmk1880v2_tensor_lmem_t *IN x, + bmk1880v2_tensor_lmem_t *IN y, + bmk1880v2_tensor_lmem_t *IN tl_buf, + bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16, + bmk1880v2_tensor_lmem_t *tl_table_answer, + bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa, + fmt_t fmt, uint8_t is_dirty_ifmap); + +int _bf16_emit_mask(ctx_t *ctx, bmk1880v2_tensor_lmem_t *IN tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, + bmk1880v2_tensor_lmem_t *tl_buf2, + bmk1880v2_tensor_lmem_t *tl_buf3, + bmk1880v2_tensor_lmem_t *tl_pos_neg_table, + bmk1880v2_tensor_lmem_t *tl_0_idx_table, + bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16, fmt_t fmt, + enum BF16_MASK_TYPE mask, uint8_t is_dirty_ifmap); + +void _bf16_get_tbl_idx(ctx_t *ctx, bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16, + fmt_t src_fmt, int int8_rnd_mode); +int __bf16_atan_fast_emit(ctx_t *ctx, 
bmk1880v2_tensor_lmem_t *tl_ifmap, + bmk1880v2_tensor_lmem_t *tl_buf, + bmk1880v2_tensor_lmem_t *tl_buf2, + bmk1880v2_tensor_lmem_t *tl_y0_buf, + bmk1880v2_tensor_lmem_t *tl_invert_buf, + bmk1880v2_tensor_lmem_t *tl_pos_neg_buf, + bmk1880v2_tensor_lmem_t *tl_table_answer, + bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t *OUT tl_ofmap_bf16, + fmt_t fmt); + +#endif /* GEN_LUT_1880v2_H */ diff --git a/cvikernel/src/bm1880v2/non_atomic/hists_svm_kernel.c b/cvikernel/src/bm1880v2/non_atomic/hists_svm_kernel.c new file mode 100644 index 000000000..f167b1c9e --- /dev/null +++ b/cvikernel/src/bm1880v2/non_atomic/hists_svm_kernel.c @@ -0,0 +1,929 @@ +#include "../kernel_1880v2.h" +#define LLVM_DEBUG(...) +#define SPLIT_FAILED 0xFFFF + +// only fill base_reg_index/int8_rnd_mode +static void init_tgmem(bmk1880v2_tensor_tgmem_t* t) { + t->base_reg_index = 0; + t->int8_rnd_mode = 0; +} + +static void copy_tg_tl_tensor_shape(bmk1880v2_tensor_lmem_shape_t* dst, + const bmk1880v2_tensor_tgmem_shape_t* src) { + dst->n = src->n; + dst->c = src->c; + dst->h = src->h; + dst->w = src->w; +} + +static void copy_tl_tg_tensor_shape(bmk1880v2_tensor_tgmem_shape_t* dst, + const bmk1880v2_tensor_lmem_shape_t* src) { + dst->n = src->n; + dst->c = src->c; + dst->h = src->h; + dst->w = src->w; +} + +static int conv_out(int conv_in_ext, int conv_kernel_ext, int stride) { + return conv_in_ext - conv_kernel_ext / stride + 1; +} + +static void conv_output(int on, bmk1880v2_tensor_lmem_shape_t* out_shape, + const bmk1880v2_tensor_tgmem_shape_t* image_shape, + const bmk1880v2_tensor_tgmem_shape_t* svm_shape) { + int ins_h = 0, ins_h_last = 0, pad_top = 0, pad_bot = 0, dh = 1; + int conv_ih_ext = (image_shape->h - 1) * (ins_h + 1) + ins_h_last + 1 + pad_top + pad_bot; + int conv_kh_ext = (svm_shape->h - 1) * dh + 1; + int stride_h = 1; + + int ins_w = 0, ins_w_last = 0, pad_left = 0, pad_right = 0, dw = 1; + int conv_kw_ext = (svm_shape->w - 1) * dw + 1; + int conv_iw_ext = (image_shape->w - 1) * (ins_w + 1) + ins_w_last + 1 + pad_left + pad_right; + int stride_w = 1; + + int oh = conv_out(conv_ih_ext, conv_kh_ext, stride_h); + int ow = conv_out(conv_iw_ext, conv_kw_ext, stride_w); + + out_shape->n = on; + out_shape->c = svm_shape->n; + out_shape->h = oh; + out_shape->w = ow; +} + +typedef struct { + int n; + int oc; + int ic; + int h; + int w; + int ic_step; + int oc_step; + int oh_step; + int ow_step; + int ih_step; + int iw_step; +} SLICES; + +SLICES slices; + +static int is_split_ic() { return 0; } +static int is_split_oc() { return 1; } +static int is_reuse_weight() { return 1; } + +static bmk1880v2_tensor_lmem_shape_t _shape_t4(int n, int c, int h, int w) { + bmk1880v2_tensor_lmem_shape_t s; + s.n = n; + s.c = c; + s.h = h; + s.w = w; + return s; +} + +static bmk1880v2_tensor_tgmem_shape_t _tg_shape_t4(int n, int c, int h, int w) { + bmk1880v2_tensor_tgmem_shape_t s; + s.n = n; + s.c = c; + s.h = h; + s.w = w; + return s; +} + +#define NPU_SHIFT (get_num_shift(ctx->chip_info.npu_num)) +static int _split(bmk1880v2_context_t* ctx, int input_n, int input_c, int input_h, int input_w, + int groups, int output_c, uint16_t kh, uint16_t kw, uint16_t dilation_h, uint16_t dilation_w, + uint8_t pad_top, uint8_t pad_bottom, uint8_t pad_left, uint8_t pad_right, uint8_t stride_h, uint8_t stride_w) { + int do_bias = 0; + int duplicate_weights = 2; // force duplicate weight to speed up + + int ic = input_c / groups; + int oc = output_c / groups; + int kh_extent = dilation_h * (kh - 1) + 1; + int kw_extent 
= dilation_w * (kw - 1) + 1; + int oh = (input_h + pad_top + pad_bottom - kh_extent) / stride_h + 1; + int ow = (input_w + pad_left + pad_right - kw_extent) / stride_w + 1; + int ih = input_h; + int iw = input_w; + int n = input_n; + + // Depthwise + uint8_t isDepthWise = (input_c == groups && output_c == groups && 1 != groups) ? true : false; + if (isDepthWise) { + ic = input_c; + oc = output_c; + } + + LLVM_DEBUG(llvm::errs() << llvm::format( + "BM1880v2ConvBF16::split =>\n" + " groups %d, ifmap (%d, %d, %d, %d), ofmap(%d, %d, %d, %d)\n" + " kernel (%d, %d), pad (top=%d, bot=%d, left=%d, right=%d)\n" + " stride (%d, %d), dilation (%d, %d)\n", + groups, input_n, input_c, input_h, input_w, input_n, oc, oh, ow, kh, kw, pad_top, + pad_bottom, pad_left, pad_right, stride_h, stride_w, dilation_h, dilation_w)); + + slices.n = 1; + slices.oc = oc / ctx->chip_info.npu_num; // lane parallelism + // slices.ic = isDepthWise ? ic : 1; + slices.ic = 1; + slices.h = (ih + (4095 - 32 - 1)) / (4095 - 32); // 12bit, max 4095-32(lanes) + slices.w = (iw + (4095 - 32 - 1)) / (4095 - 32); // 12bit, max 4095-32(lanes) + + // int oc_step = (oc >= (int)ctx->chip_info.npu_num) ? (int)ctx->chip_info.npu_num : oc; // use + // all lanes int ic_step = isDepthWise ? 1 : ic; + int ic_step = ic; + int num_oc_step = 1; + + // + // Slices may not be a good way to find size + // We may try to increase or decrease width in aligned with 4, 8, 16 ... + // or specific height/width (8, 8), (16, 16) ... + // + // Split ow + if (is_split_ic()) { + LLVM_DEBUG(llvm::errs() << "<= slice ic(" << ic << ")\n";); + ASSERT(0); + // return split_ic(ctx); + } + + if (is_split_oc()) { + LLVM_DEBUG(llvm::errs() << "<= slice oc\n";); + num_oc_step = (oc + ctx->chip_info.npu_num - 1) / ctx->chip_info.npu_num; + } + + // TODO: suppot slice kernel + // 'iw / slices.w >= kw_extent' means we CANT slice kernel + for (slices.w = 1; slices.w <= ow && iw / slices.w >= kw_extent; ++slices.w) { + int ow_step = ceiling_func(ow, slices.w); + int iw_step = math_min((ow_step - 1) * stride_w + kw_extent, iw); + + if ((slices.w == 1) && (stride_w > 1)) { + // For better DMA transfer efficiency, use whole width. + // E.g. + // ifmap (1, 512, 28, 28), kernel (1, 1), stride 2 + // + // input (27, 27) needed, but (27, 28) is better + iw_step = math_min(iw_step + stride_w - 1, iw); + slices.iw_step = iw_step; + } + + // Split oh + // TODO: support slice kernel + for (slices.h = 1; slices.h <= oh && ih / slices.h >= kh_extent; ++slices.h) { + // Split oc + // TODO: config not split it + for (int slice_oc = 0; slice_oc < num_oc_step; ++slice_oc) { + // Downward, align lanes + // E.g. 
oc = 48, oc_step: 48, 32 + int oc_step = math_min((num_oc_step - slice_oc) * (int)ctx->chip_info.npu_num, oc); + if (num_oc_step == 1) { + // FIXME: not check every loop + oc_step = oc; + slices.oc = 1; + } + + uint32_t coeff_oc_step_size = 0; + + if (do_bias) { + // 2x 16bit + coeff_oc_step_size += bmk1880v2_lmem_tensor_to_size(ctx, _shape_t4(2, oc_step, 1, 1), + FMT_BF16, /*eu_align=*/0); + } + + // TODO: handle prelu + + // Add weight size + coeff_oc_step_size += bmk1880v2_lmem_tensor_to_size( + ctx, _shape_t4(ic_step, oc_step, kh, kw), FMT_BF16, /*eu_align=*/0); + + // split n + for (slices.n = 1; slices.n <= n; ++slices.n) { + int n_step = ceiling_func(n, slices.n); + + int oh_step = ceiling_func(oh, slices.h); + int ih_step = math_min((oh_step - 1) * stride_h + kh_extent, ih); + + uint32_t total_needed = 0; + + uint32_t ofmap_size = bmk1880v2_lmem_tensor_to_size( + ctx, _shape_t4(n_step, oc_step, oh_step, ow_step), FMT_BF16, /*eu_align=*/1); + + total_needed += ofmap_size; + + uint32_t ifmap_size = bmk1880v2_lmem_tensor_to_size( + ctx, _shape_t4(n_step, ic_step, ih_step, iw_step), FMT_BF16, /*eu_align=*/1); + total_needed += ifmap_size; + + total_needed += coeff_oc_step_size; + + // Double buffers so that TDMA load and store can run during TIU executes. + total_needed *= duplicate_weights; + + // TODO: handle prelu, leaky relu + // Both prelu and leaky relu need tl_neg, tl_relu. + // tl_relu, tl_neg are not from tmda and not final output. + // One copy is enough. + // if (do_activation && ((activation_method == PRELU) || + // (activation_method == RELU && activation_arg && activation_arg[0] + // != 0.0f))) { + // total_needed += 2 * ofmap_size; // tl_relu + tl_neg + // } + + if (total_needed < BM1880V2_HW_LMEM_SIZE) { + slices.ic_step = ic_step; + slices.oc_step = oc_step; + slices.oh_step = oh_step; + slices.ow_step = ow_step; + slices.ih_step = ih_step; + slices.iw_step = iw_step; + + LLVM_DEBUG( + llvm::errs() << llvm::format( + " Slices(n=%d, oc=%d, ic=%d, h=%d, w=%d), n_step %d, oh_step %d, ih_step %d" + ", coeff_oc_step_size %d, total_needed %d\n", + slices.n, slices.oc, slices.ic, slices.h, slices.w, n_step, oh_step, ih_step, + coeff_oc_step_size, total_needed);); + LLVM_DEBUG(llvm::errs() << "<= BM1880v2ConvFixedParallelv2_qdm::split succeed" + << "/n"); + return total_needed; + } + + } // for (slices.n = 1; slices.n < n; ++slices.n) + + } // for (int slice_oc = 0; slice_oc < num_oc_step; ++slice_oc) + + } // for (slices.h = 1; slices.h <= oh; ++slices.h) + + } // for (slices.w = 1; slices.w <= ow; ++slices.ow) + + LLVM_DEBUG(llvm::errs() << "<= BM1880v2ConvBF16::split fail" + << "\n"); + + return SPLIT_FAILED; +} + +void tdma_load_stride_bf16(bmk1880v2_context_t* ctx, bmk1880v2_tensor_lmem_t* tlp, uint64_t ga_src, + bmk1880v2_tensor_tgmem_stride_t ts_stride, ctrl_t ctrl) { + ASSERT(tlp != NULL); + + uint8_t DoTranspose = (ctrl & CTRL_TP) ? 
true : false; + + // tensor in system memory + // Global shape use local shape + bmk1880v2_tensor_tgmem_t ts_data; + ts_data.base_reg_index = 0; + ts_data.fmt = tlp->fmt; + ts_data.start_address = ga_src; + ts_data.shape = _tg_shape_t4(tlp->shape.n, tlp->shape.c, tlp->shape.h, tlp->shape.w); + ts_data.stride = ts_stride; + + if (DoTranspose) { + bmk1880v2_tdma_tg2l_tensor_copy_nc_transposed_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.src = &ts_data; + p1.dst = tlp; + bmk1880v2_tdma_g2l_bf16_tensor_copy_nc_transposed(ctx, &p1); + } else { + bmk1880v2_tdma_tg2l_tensor_copy_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.src = &ts_data; + p1.dst = tlp; + bmk1880v2_tdma_g2l_bf16_tensor_copy(ctx, &p1); + } +} + +void tdma_store_stride_bf16(bmk1880v2_context_t* ctx, bmk1880v2_tensor_lmem_t* tlp, uint64_t ga_dst, + bmk1880v2_tensor_tgmem_stride_t ts_stride, ctrl_t ctrl) { + ASSERT(tlp != NULL); + + uint8_t DoTranspose = (ctrl & CTRL_TP) ? true : false; + + // tensor in system memory + // Global shape use local shape + // Global shape used for stride calculation + bmk1880v2_tensor_tgmem_t ts_data; + ts_data.base_reg_index = 0; + ts_data.fmt = tlp->fmt; + ts_data.start_address = ga_dst; + ts_data.shape = _tg_shape_t4(tlp->shape.n, tlp->shape.c, tlp->shape.h, tlp->shape.w); + ts_data.stride = ts_stride; + + if (DoTranspose) { + bmk1880v2_tdma_l2tg_tensor_copy_nc_transposed_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.src = tlp; + p1.dst = &ts_data; + bmk1880v2_tdma_l2g_bf16_tensor_copy_nc_transposed(ctx, &p1); + } else { + bmk1880v2_tdma_l2tg_tensor_copy_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.src = tlp; + p1.dst = &ts_data; + bmk1880v2_tdma_l2g_bf16_tensor_copy(ctx, &p1); + } +} + +static void ConvReuseWeight(bmk1880v2_context_t* ctx, gaddr_t ga_ifmap, gaddr_t ga_ofmap, + gaddr_t ga_weight, int input_n, int input_c, int input_h, int input_w, + int groups, int output_c, uint16_t kh, uint16_t kw, uint16_t dilation_h, + uint16_t dilation_w, uint8_t pad_top, uint8_t pad_bottom, uint8_t pad_left, uint8_t pad_right, + uint8_t stride_h, uint8_t stride_w) { +#define RELU (0) + int do_scale = 0; + int do_bn = 0; + int do_activation = 0; + int activation_method = 0; + int* activation_arg = NULL; + int do_bias = 0; + int ga_bias = -1; // not support + int layer_id = 2; // debug + + int ic = input_c / groups; + int oc = output_c / groups; + int kh_ext = dilation_h * (kh - 1) + 1; + int kw_ext = dilation_w * (kw - 1) + 1; + int oh = (input_h + pad_top + pad_bottom - kh_ext) / stride_h + 1; + int ow = (input_w + pad_left + pad_right - kw_ext) / stride_w + 1; + + int n_step = ceiling_func(input_n, slices.n); + // int ic_step = ceiling_func(ic, slices.ic); + // ic_step = slices.ic_step; + int oh_step = slices.oh_step; + int ow_step = slices.ow_step; + int ih_step = slices.ih_step; + int iw_step = slices.iw_step; + int oc_step = slices.oc_step; + + // Always use all lanes. + // Not divided by slices.oc. + // E.g. mtcnn_det2_cic oc = 48, slices.oc = 2 + // It is better to store step. 
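+ // (when oc is sliced, each step handles exactly npu_num output channels so every lane is used)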
+ if (slices.oc > 1) { + ASSERT(oc > (int)ctx->chip_info.npu_num); + oc_step = ctx->chip_info.npu_num; + } + + if (slices.h > 1) { + // max input height inside feature map + ih_step = (oh_step - 1) * stride_h + kh_ext; + } + if (slices.w > 1) { + // max input width inside feature map + iw_step = (ow_step - 1) * stride_w + kw_ext; + } + + LLVM_DEBUG(llvm::errs() << llvm::format( + "ConvReuseWeight =>\n" + " groups %d, ifmap (%d, %d, %d, %d), ofmap(%d, %d, %d, %d)\n" + " kernel (%d, %d), pad (top=%d, bot=%d, left=%d, right=%d)\n" + " stride (%d, %d), dilation (%d, %d)\n" + " Slices (n=%d, oc=%d, ic=%d, h=%d, w=%d)\n", + groups, input_n, input_c, input_h, input_w, input_n, oc, oh, ow, kh, kw, pad_top, + pad_bottom, pad_left, pad_right, stride_h, stride_w, dilation_h, dilation_w, + slices.n, slices.oc, slices.ic, slices.h, slices.w)); + + uint8_t fused_conv_relu = (!do_scale && !do_bn && + (do_activation && activation_method == RELU && + (!activation_arg || (activation_arg[0] == 0.0f)))) + ? true + : false; + + // uint8_t fused_conv_bn_relu = + // (!do_scale && do_bn && + // (do_activation && activation_method == RELU && (!activation_arg || (activation_arg[0] == + // 0.0f)))) + // ? true + // : false; + + // bmk1880v2_tensor_lmem_shape_t oc_shape_ = _shape_t4(1, oc_step, 1, 1); + // bmk1880v2_tensor_lmem_shape_t ifmap_shape_ = _shape_t4(n_step, ic_step, ih_step, input_w); + // bmk1880v2_tensor_lmem_shape_t ofmap_shape_ = _shape_t4(n_step, oc_step, oh_step, ow); + + bmk1880v2_tensor_lmem_t *tl_weight[2] = {NULL, NULL}, *tl_bias[2] = {NULL, NULL}; + bmk1880v2_tensor_lmem_t* tl_ifmap[2] = {NULL}; + bmk1880v2_tensor_lmem_t* tl_ofmap[2] = {NULL}; + + // Global memory stride from global memory shape + // input_c, output_c, not ic, oc + // bmk1880v2_tensor_tgmem_stride_t ofmap_gstride = {static_cast(output_c) * oh * ow, + // static_cast(oh) * ow, + // static_cast(ow)}; + // bmk1880v2_tensor_tgmem_stride_t ifmap_gstride = {static_cast(input_c) * input_h * input_w, + // static_cast(input_h) * input_w, + // static_cast(input_w)}; + // bmk1880v2_tensor_tgmem_stride_t bias_gstride = {static_cast(output_c), 1, 1}; + // bmk1880v2_tensor_tgmem_stride_t weight_gstride = { + // static_cast(oc) * kh * kw * ic, static_cast(kh) * kw * ic, static_cast(ic)}; + bmk1880v2_tensor_tgmem_stride_t ofmap_gstride = + bmk1880v2_tensor_tgmem_default_stride(_tg_shape_t4(1, output_c, oh, ow), FMT_BF16); + bmk1880v2_tensor_tgmem_stride_t ifmap_gstride = bmk1880v2_tensor_tgmem_default_stride( + _tg_shape_t4(1, input_c, input_h, input_w), FMT_BF16); + bmk1880v2_tensor_tgmem_stride_t bias_gstride = + bmk1880v2_tensor_tgmem_default_stride(_tg_shape_t4(1, output_c, 1, 1), FMT_BF16); + bmk1880v2_tensor_tgmem_stride_t weight_gstride = + bmk1880v2_tensor_tgmem_default_stride(_tg_shape_t4(1, oc, kh * kw, ic), FMT_BF16); + + // + // Pre-alloc maximum one-step size + // + // Need vector to track the order of local memory. + // The local memory release must be in reverse order. 
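+ // Two ping-pong copies ([0]/[1]) of weight/ifmap/ofmap (and bias) are allocated only when
+ // is_reuse_weight() is true, so that a TDMA load/store can overlap the TIU convolution;
+ // otherwise only index 0 is used. The frees at the end of this function must mirror this
+ // allocation order in reverse.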
+ // + tl_weight[0] = + bmk1880v2_lmem_alloc_tensor(ctx, _shape_t4(ic, oc_step, kh, kw), FMT_BF16, CTRL_NULL); + if (is_reuse_weight()) { + tl_weight[1] = + bmk1880v2_lmem_alloc_tensor(ctx, _shape_t4(ic, oc_step, kh, kw), FMT_BF16, CTRL_NULL); + } else { + // tl_weight[1] = tl_weight[0]; + } + + tl_ifmap[0] = bmk1880v2_lmem_alloc_tensor(ctx, _shape_t4(n_step, ic, ih_step, iw_step), + FMT_BF16, CTRL_AL); + + if (is_reuse_weight()) { + tl_ifmap[1] = bmk1880v2_lmem_alloc_tensor(ctx, _shape_t4(n_step, ic, ih_step, iw_step), + FMT_BF16, CTRL_AL); + } else { + // tl_ifmap[1] = tl_ifmap[0]; + } + + tl_ofmap[0] = bmk1880v2_lmem_alloc_tensor(ctx, _shape_t4(n_step, oc_step, oh_step, ow_step), + FMT_BF16, CTRL_AL); + + if (is_reuse_weight()) { + tl_ofmap[1] = bmk1880v2_lmem_alloc_tensor( + ctx, _shape_t4(n_step, oc_step, oh_step, ow_step), FMT_BF16, CTRL_AL); + } else { + // tl_ofmap[1] = tl_ofmap[0]; + } + + ASSERT(tl_weight[0] && tl_ifmap[0] && tl_ofmap[0]); + + if (is_reuse_weight()) { + ASSERT(tl_weight[1] && tl_ifmap[1] && tl_ofmap[1]); + } + + if (do_bias) { + // 16 bit + tl_bias[0] = bmk1880v2_lmem_alloc_tensor(ctx, _shape_t4(2, oc_step, 1, 1), FMT_BF16, + /*eu_align=*/0); + if (is_reuse_weight()) { + tl_bias[1] = bmk1880v2_lmem_alloc_tensor(ctx, _shape_t4(2, oc_step, 1, 1), FMT_BF16, + /*eu_align=*/0); + } else { + // tl_bias[1] = tl_bias[0]; + } + ASSERT(tl_bias[0]); + if (is_reuse_weight()) { + ASSERT(tl_bias[1]); + } + } + + // split groups + for (int ig = 0; ig < groups; ++ig) { + int first = 1; + int flip = 0; + int coeff_flip = 0; + gaddr_t ga_ofmap_cur[2] = {0}; + + bmk1880v2_parallel_disable(ctx); + + // split oc + for (int oc_pos = 0; oc_pos < oc; oc_pos += oc_step) { + int cur_oc = math_min(oc - oc_pos, oc_step); + + uint64_t coeff_offset = (ig * oc + oc_pos) * sizeof(uint16_t); + + if (do_bias) { + // 2x 16 bit + // bmk does not keep eu-align info, user need to update stride if shape changed + tl_bias[coeff_flip]->shape = _shape_t4(2, cur_oc, 1, 1); + tl_bias[coeff_flip]->stride = bmk1880v2_tensor_lmem_default_stride( + ctx, tl_bias[coeff_flip]->shape, FMT_BF16, /*eu_align=*/0); + + LLVM_DEBUG(llvm::errs() << llvm::format( + " [ig=%d][oc_pos=%d] tdma_load_stride_bf16:\n" + " tl_bias gaddr 0x%lx, laddr 0x%x, shape (%d, %d, " + "%d, %d), stride (%d, %d, %d)\n", + ig, oc_pos, ga_bias + coeff_offset, tl_bias[coeff_flip]->start_address, + tl_bias[coeff_flip]->shape.n, tl_bias[coeff_flip]->shape.c, + tl_bias[coeff_flip]->shape.h, tl_bias[coeff_flip]->shape.w, bias_gstride.n, + bias_gstride.c, bias_gstride.h)); + tdma_load_stride_bf16(ctx, tl_bias[coeff_flip], ga_bias + coeff_offset, bias_gstride, + CTRL_WEIGHT); + } + + // Weight shape for load != shape for tiu + // bmk does not keep eu-align info, user need to update stride if shape changed + tl_weight[coeff_flip]->shape = _shape_t4(ic, cur_oc, kh, kw); + tl_weight[coeff_flip]->stride = bmk1880v2_tensor_lmem_default_stride( + ctx, tl_weight[coeff_flip]->shape, FMT_BF16, /*eu_align*/ 0); + + uint64_t weight_offset = (ig * oc * ic * kh * kw + oc_pos * ic * kh * kw) * sizeof(uint16_t); + { + // Same local address, different shape, stride + bmk1880v2_tensor_lmem_t tl_tmp; + tl_tmp.start_address = tl_weight[coeff_flip]->start_address; + tl_tmp.fmt = FMT_BF16; + tl_tmp.shape = _shape_t4(1, cur_oc, kh * kw, ic); + tl_tmp.stride = + bmk1880v2_tensor_lmem_default_stride(ctx, tl_tmp.shape, FMT_BF16, /*eu_align=*/0); + + LLVM_DEBUG(llvm::errs() << llvm::format( + " [ig=%d][oc_pos=%d] tdma_load_stride_bf16:\n" + " tl_weight gaddr 0x%lx, laddr 0x%x, 
shape (%d, %d, " + "%d, %d), stride (%d, %d, %d)\n", + ig, oc_pos, weight_offset, tl_tmp.start_address, tl_tmp.shape.n, + tl_tmp.shape.c, tl_tmp.shape.h, tl_tmp.shape.w, tl_tmp.stride.n, + tl_tmp.stride.c, tl_tmp.stride.h, tl_tmp.stride.w)); + tdma_load_stride_bf16(ctx, &tl_tmp, ga_weight + weight_offset, weight_gstride, CTRL_WEIGHT); + } + + // bmk1880v2_tensor_lmem_shape_t ifmap_shape[2] = {0}; + // bmk1880v2_tensor_lmem_shape_t ofmap_shape[2] = {0}; + // gaddr_t ga_ifmap_cur[2] = {0}; + + // split n + for (int n_pos = 0; n_pos < input_n; n_pos += n_step) { + int cur_n = math_min(input_n - n_pos, n_step); + + // split h + for (int oh_pos = 0; oh_pos < oh; oh_pos += oh_step) { + int cur_oh = math_min(oh - oh_pos, oh_step); + + int oh_top = oh_pos; + int oh_bot = oh_top + cur_oh; + int ih_top = math_max(oh_top * stride_h - pad_top, 0); + int ih_bot = math_min((oh_bot - 1) * stride_h + kh_ext - pad_top, input_h); + int cur_ih = ih_bot - ih_top; + + int ph_top = 0; + if (ih_top == 0) { + ph_top = pad_top - oh_top * stride_h; + } + + int ph_bot = 0; + if (ih_bot == input_h) { + ph_bot = (oh_bot - 1) * stride_h + kh_ext - pad_top - input_h; + } + + // split w + for (int ow_pos = 0; ow_pos < ow; ow_pos += ow_step) { + int cur_ow = math_min(ow - ow_pos, ow_step); + + int ow_left = ow_pos; + int ow_right = ow_left + cur_ow; + int iw_left = math_max(ow_left * stride_w - pad_left, 0); + int iw_right = math_min((ow_right - 1) * stride_w + kw_ext - pad_left, input_w); + int cur_iw = iw_right - iw_left; + + int pw_left = 0; + if (iw_left == 0) { + pw_left = pad_left - ow_left * stride_w; + } + + int pw_right = 0; + if (iw_right == input_w) { + pw_right = (ow_right - 1) * stride_w + kw_ext - pad_left - input_w; + } + + LLVM_DEBUG(llvm::errs() + << llvm::format(" [ig=%d][oc_pos=%d][n_pos=%d][oh_pos=%d][ow_pos=%d]" + " cur_oh %d, cur_ih %d, ih_top %d, ih_bot %d" + ", cur_ow %d, cur_iw %d, iw_left %d, iw_right %d\n", + ig, oc_pos, n_pos, oh_pos, ow_pos, cur_oh, cur_ih, ih_top, + ih_bot, cur_ow, cur_iw, iw_left, iw_right)); + + // Adjust current shape and stride + // bmk does not keep eu-align info, user needs to update stride if shape changed + tl_ofmap[flip]->shape = _shape_t4(cur_n, cur_oc, cur_oh, cur_ow); + tl_ofmap[flip]->stride = bmk1880v2_tensor_lmem_default_stride( + ctx, tl_ofmap[flip]->shape, FMT_BF16, /*eu_align=*/1); + + // bmk does not keep eu-align info, user needs to update stride if shape changed + tl_ifmap[flip]->shape = _shape_t4(cur_n, ic, cur_ih, cur_iw); + tl_ifmap[flip]->stride = bmk1880v2_tensor_lmem_default_stride( + ctx, tl_ifmap[flip]->shape, FMT_BF16, /*eu_align=*/1); + + uint64_t ifmap_offset = (ig * ic * input_h * input_w + n_pos * input_c * input_h * input_w + + ih_top * input_w + iw_left) * + sizeof(uint16_t); + + LLVM_DEBUG( + llvm::errs() << llvm::format( + " [ig=%d][oc_pos=%d][n_pos=%d][oh_pos=%d][ow_pos=%d] tdma_load_stride_bf16:\n" + " tl_ifmap gaddr 0x%lx, laddr 0x%x, shape (%d, %d, " + "%d, %d), stride (%d, %d, %d)\n", + ig, oc_pos, n_pos, oh_pos, ow_pos, ifmap_offset, tl_ifmap[flip]->start_address, + tl_ifmap[flip]->shape.n, tl_ifmap[flip]->shape.c, tl_ifmap[flip]->shape.h, + tl_ifmap[flip]->shape.w, tl_ifmap[flip]->stride.n, tl_ifmap[flip]->stride.c, + tl_ifmap[flip]->stride.h, tl_ifmap[flip]->stride.w)); + + tdma_load_stride_bf16(ctx, tl_ifmap[flip], ga_ifmap + ifmap_offset, ifmap_gstride, + CTRL_NEURON); + + bmk1880v2_parallel_disable(ctx); + bmk1880v2_parallel_enable(ctx); + + { + bmk1880v2_tiu_convolution_param_t param; + memset(&param, 0, sizeof(param)); + 
param.ofmap = tl_ofmap[flip]; + param.ifmap = tl_ifmap[flip]; + param.weight = tl_weight[coeff_flip]; + param.bias = tl_bias[coeff_flip]; + param.ins_h = param.ins_last_h = 0; + param.ins_w = param.ins_last_w = 0; + param.pad_top = ph_top; + param.pad_bottom = ph_bot; + param.pad_left = pw_left; + param.pad_right = pw_right; + param.stride_h = stride_h; + param.stride_w = stride_w; + param.dilation_h = dilation_h; + param.dilation_w = dilation_w; + param.relu_enable = fused_conv_relu; + param.ps32_mode = 0; + param.w_is_const = 0; + param.layer_id = layer_id; + + LLVM_DEBUG(llvm::errs() << llvm::format( + " [ig=%d][oc_pos=%d][n_pos=%d][oh_pos=%d][ow_pos=%d] conv:\n" + " ifmap la_addr 0x%x, shape (%d, %d, %d, %d)\n" + " weight la_addr 0x%x, shape (%d, %d, %d, %d)\n" + " ofmap la_addr 0x%x, shape (%d, %d, %d, %d)\n", + ig, oc_pos, n_pos, oh_pos, ow_pos, param.ifmap->start_address, + param.ifmap->shape.n, param.ifmap->shape.c, param.ifmap->shape.h, + param.ifmap->shape.w, param.weight->start_address, + param.weight->shape.n, param.weight->shape.c, param.weight->shape.h, + param.weight->shape.w, param.ofmap->start_address, + param.ofmap->shape.n, param.ofmap->shape.c, param.ofmap->shape.h, + param.ofmap->shape.w)); + + bmk1880v2_tiu_convolution(ctx, &param); + } + + ga_ofmap_cur[flip] = ga_ofmap + (ig * oc * oh * ow + n_pos * output_c * oh * ow + + oc_pos * oh * ow + oh_top * ow + ow_left) * + sizeof(uint16_t); + + if (!is_reuse_weight()) { + flip = 1; + first = 0; + } + + if (first) { + // postpone first result to next loop + // loop0: LD0 TIU0 + // loop1: LD1 TIU1 SD0 + // loop2: LD2 TIU2 SD1 + first = 0; + } else { + int flip_back = 1 - flip; + + // Store back to global memory + LLVM_DEBUG(llvm::errs() << llvm::format( + " [ig=%d][oc_pos=%d][n_pos=%d][oh_pos=%d][ow_pos=%d] " + "tdma_store_stride_bf16:\n" + " tl_ofmap gaddr 0x%lx, laddr 0x%x, shape (%d, %d, " + "%d, %d), stride (%d, %d, %d)\n", + ig, oc_pos, n_pos, oh_pos, ow_pos, ga_ofmap_cur[flip_back], + tl_ofmap[flip_back]->start_address, tl_ofmap[flip_back]->shape.n, + tl_ofmap[flip_back]->shape.c, tl_ofmap[flip_back]->shape.h, + tl_ofmap[flip_back]->shape.w, tl_ofmap[flip_back]->stride.n, + tl_ofmap[flip_back]->stride.c, tl_ofmap[flip_back]->stride.h, + tl_ofmap[flip_back]->stride.w)); + + tdma_store_stride_bf16(ctx, tl_ofmap[flip_back], ga_ofmap_cur[flip_back], + ofmap_gstride, CTRL_NEURON); + } + + flip = 1 - flip; + + } // for (int ow_pos = 0; ow_pos < ow; ow_pos += ow_step) + + } // for (int oh_pos = 0; oh_pos < oh; oh_pos += oh_step) + + } // for (int n_pos = 0; n_pos < input_n; n_pos += n_step) + + if (!is_reuse_weight()) { + coeff_flip = 1; + } + + coeff_flip = 1 - coeff_flip; + + } // for (int oc_pos = 0; oc_pos < oc; oc_pos += oc_step) + + bmk1880v2_parallel_disable(ctx); + + // the last iteration stored the other buffer; this side has not been stored yet + if (!is_reuse_weight()) { + // TODO: no need to store last one cuz we store every loop + flip = 1; + } else { + int flip_back = 1 - flip; + + // Store back to global memory + LLVM_DEBUG(llvm::errs() << llvm::format( + " [ig=%d] tdma_store_stride_bf16:\n" + " tl_ofmap gaddr 0x%lx, laddr 0x%x, shape (%d, %d, " + "%d, %d), stride (%d, %d, %d)\n", + ig, ga_ofmap_cur[flip_back], tl_ofmap[flip_back]->start_address, + tl_ofmap[flip_back]->shape.n, tl_ofmap[flip_back]->shape.c, + tl_ofmap[flip_back]->shape.h, tl_ofmap[flip_back]->shape.w, + tl_ofmap[flip_back]->stride.n, tl_ofmap[flip_back]->stride.c, + tl_ofmap[flip_back]->stride.h, tl_ofmap[flip_back]->stride.w)); + + tdma_store_stride_bf16(ctx, 
tl_ofmap[flip_back], ga_ofmap_cur[flip_back], ofmap_gstride, + CTRL_NEURON); + } + + } // for (int group_i = 0; group_i < groups; ++group_i) + + // + // Release resource in reverse order + // + if (do_bias) { + if (is_reuse_weight()) bmk1880v2_lmem_free_tensor(ctx, tl_bias[1]); + + bmk1880v2_lmem_free_tensor(ctx, tl_bias[0]); + } + if (is_reuse_weight()) bmk1880v2_lmem_free_tensor(ctx, tl_ofmap[1]); + + bmk1880v2_lmem_free_tensor(ctx, tl_ofmap[0]); + + if (is_reuse_weight()) bmk1880v2_lmem_free_tensor(ctx, tl_ifmap[1]); + + bmk1880v2_lmem_free_tensor(ctx, tl_ifmap[0]); + + if (is_reuse_weight()) bmk1880v2_lmem_free_tensor(ctx, tl_weight[1]); + + bmk1880v2_lmem_free_tensor(ctx, tl_weight[0]); + + LLVM_DEBUG(llvm::errs() << "<=ConvReuseWeight" + << "\n"); +} + +int bf16_hists_svm(bmk1880v2_context_t* ctx, uint64_t gaddr_image, uint64_t gaddr_nc_image, + bmk1880v2_tensor_tgmem_shape_t image_shape, uint64_t re_order_gaddr_svm, + bmk1880v2_tensor_tgmem_shape_t svm_shape, // (oc, ic, kh, kw) + uint64_t gaddr_output, int unit_size, fmt_t fmt) { + int ret = 0; + ASSERT(image_shape.n == 1 && image_shape.c == 1 && "image_shape should be 2 dims"); + // ASSERT(svm_shape.n == unit_size && "svm_shape channel MUST eq unit_size"); + ASSERT(fmt == FMT_BF16 && "only supports FMT_BF16"); + // 1. nc load transpose, for split unit + // 2. store back for load to channel + // 3. load c by unit + // 4. weight MUST re-order for step 2 + // 5. conv + + bmk1880v2_tensor_tgmem_t src; + init_tgmem(&src); + + // 1. nc load transpose, for split unit + bmk1880v2_tdma_tg2l_tensor_copy_nc_transposed_param_t p1; + bmk1880v2_tensor_tgmem_shape_t image_shape_expend_unit; + image_shape_expend_unit.n = image_shape.h * image_shape.w; + image_shape_expend_unit.c = unit_size; + image_shape_expend_unit.h = 1; + image_shape_expend_unit.w = 1; + + src.fmt = fmt; + src.start_address = gaddr_image; + src.shape = image_shape_expend_unit; + src.stride = bmk1880v2_tensor_tgmem_default_stride(src.shape, src.fmt); + + bmk1880v2_tensor_lmem_shape_t l_image_shape_expend_unit; + l_image_shape_expend_unit.n = image_shape_expend_unit.c; + l_image_shape_expend_unit.c = image_shape_expend_unit.n; + l_image_shape_expend_unit.h = 1; + l_image_shape_expend_unit.w = 1; + + bmk1880v2_tensor_lmem_t* dst = + bmk1880v2_lmem_alloc_tensor(ctx, l_image_shape_expend_unit, fmt, CTRL_NULL); + + memset(&p1, 0, sizeof(p1)); + p1.src = &src; + p1.dst = dst; + bmk1880v2_tdma_g2l_bf16_tensor_copy_nc_transposed(ctx, &p1); + + // 2.
store back for load to channel + bmk1880v2_tdma_l2tg_tensor_copy_param_t p2; + memset(&p2, 0, sizeof(p2)); + copy_tl_tg_tensor_shape(&image_shape_expend_unit, &l_image_shape_expend_unit); + + src.start_address = gaddr_nc_image; + src.shape = image_shape_expend_unit; + src.stride = bmk1880v2_tensor_tgmem_default_stride(src.shape, src.fmt); + + p2.src = dst; + p2.dst = &src; + bmk1880v2_tdma_l2g_bf16_tensor_copy(ctx, &p2); + + bmk1880v2_lmem_free_tensor(ctx, dst); + + // tiling conv, copy from backend + if (1) { + int input_n = 1; + int groups = 1; + uint16_t dilation_h = 1, dilation_w = 1; + uint8_t pad_top = 0; + uint8_t pad_bottom = 0, pad_left = 0, pad_right = 0, stride_h = 1, stride_w = 1; + + _split(ctx, input_n, unit_size, image_shape.h, image_shape.w, groups, svm_shape.n, svm_shape.h, + svm_shape.w, dilation_h, dilation_w, pad_top, pad_bottom, pad_left, pad_right, stride_h, + stride_w); + + ConvReuseWeight(ctx, gaddr_nc_image, gaddr_output, re_order_gaddr_svm, input_n, unit_size, + image_shape.h, image_shape.w, groups, svm_shape.n, svm_shape.h, svm_shape.w, + dilation_h, dilation_w, pad_top, pad_bottom, pad_left, pad_right, stride_h, + stride_w); + } else { + // 3. load c by unit + bmk1880v2_tdma_tg2l_tensor_copy_param_t p3; + memset(&p3, 0, sizeof(p3)); + image_shape_expend_unit.n = 1; + image_shape_expend_unit.c = unit_size; + image_shape_expend_unit.h = image_shape.h; + image_shape_expend_unit.w = image_shape.w; + + copy_tg_tl_tensor_shape(&l_image_shape_expend_unit, &image_shape_expend_unit); + + bmk1880v2_tensor_lmem_t* tl_ifmap = + bmk1880v2_lmem_alloc_tensor(ctx, l_image_shape_expend_unit, fmt, CTRL_AL); + + p3.src = &src; + p3.dst = tl_ifmap; + bmk1880v2_tdma_g2l_bf16_tensor_copy(ctx, &p3); + + // 4. weight MUST re-order for step 2 + // bmk1880v2_tensor_lmem_t bmk1880v2_lmem_alloc_tensor(ctx, _shape_t4(ic, oc_step, kh, kw), + // FMT_BF16, CTRL_NULL); + // weight from origin layout (oc, ic, kh, kw) transform to (1, oc, kh*kw, ic) + bmk1880v2_tensor_tgmem_shape_t transpose_svm_shape; + transpose_svm_shape.n = 1; + transpose_svm_shape.c = svm_shape.n; + transpose_svm_shape.h = svm_shape.h * svm_shape.w; + transpose_svm_shape.w = svm_shape.c; + + src.start_address = re_order_gaddr_svm; + src.shape = image_shape_expend_unit; + src.base_reg_index = 1; + + bmk1880v2_tensor_lmem_shape_t l_transpose_svm_shape; + copy_tg_tl_tensor_shape(&l_transpose_svm_shape, &transpose_svm_shape); + bmk1880v2_tensor_lmem_t* tl_weight = + bmk1880v2_lmem_alloc_tensor(ctx, l_transpose_svm_shape, fmt, CTRL_NULL); + + p3.src = &src; + p3.dst = tl_weight; + bmk1880v2_tdma_g2l_bf16_tensor_copy(ctx, &p3); + + // 5. 
conv + // alloc output + bmk1880v2_tensor_lmem_shape_t l_out_shape; + conv_output(1, &l_out_shape, &image_shape, &svm_shape); + bmk1880v2_tensor_lmem_t* tl_ofmap = + bmk1880v2_lmem_alloc_tensor(ctx, l_out_shape, fmt, CTRL_AL); + + bmk1880v2_tiu_convolution_param_t param; + memset(¶m, 0, sizeof(param)); + param.ofmap = tl_ofmap; + param.ifmap = tl_ifmap; + param.weight = tl_weight; + param.bias = NULL; + param.ins_h = param.ins_last_h = 0; + param.ins_w = param.ins_last_w = 0; + param.pad_top = 0; + param.pad_bottom = 0; + param.pad_left = 0; + param.pad_right = 0; + param.stride_h = 1; + param.stride_w = 1; + param.dilation_h = 1; + param.dilation_w = 1; + param.relu_enable = 0; + param.ps32_mode = 0; + param.w_is_const = 0; + param.layer_id = 0; + + bmk1880v2_tiu_convolution(ctx, ¶m); + + bmk1880v2_tensor_tgmem_shape_t out_shape; + copy_tl_tg_tensor_shape(&out_shape, &l_out_shape); + + src.start_address = gaddr_output; + src.shape = out_shape; + src.base_reg_index = 0; + + p2.src = tl_ofmap; + p2.dst = &src; + bmk1880v2_tdma_l2g_bf16_tensor_copy(ctx, &p2); + + bmk1880v2_lmem_free_tensor(ctx, tl_ofmap); + bmk1880v2_lmem_free_tensor(ctx, tl_weight); + bmk1880v2_lmem_free_tensor(ctx, tl_ifmap); + } + + return ret; +} diff --git a/cvikernel/src/bm1880v2/non_atomic/tiu_lut_atan.c b/cvikernel/src/bm1880v2/non_atomic/tiu_lut_atan.c new file mode 100644 index 000000000..2bda3bf1e --- /dev/null +++ b/cvikernel/src/bm1880v2/non_atomic/tiu_lut_atan.c @@ -0,0 +1,1105 @@ +/** + * plz refer [git](https://github.com/xiezhq-hermann/atan_lookup) + * input range is `all real numbers` and output range is -pi/2 < x < pi/2, + * you can refer [here](https://www.mathopenref.com/arctan.html) for more details + */ +// +// xiezhq@shanghaitech.edu.cn && wanghe@shanghaitech.edu.cn +/* Reference: + [1] Abhisek Ukil, Vishal H Shah, Bernhard Deck, + "Fast Computation of arctangent Functions for Embedded Applications: A + Comparative Analysis" IEEE International Symposium on Industrial Electronics, +Pages: 1206 - 1211, DOI: 10.1109/ISIE.2011.5984330, 2011 +[2] Sreeraman Rajan, Sichun Wang, Robert Inkol, and Alain Joyal +"Efficient Approximations for the Arctangent Function" +IEEE SIGNAL PROCESSING MAGAZINE [108] MAY 2006 +*/ + +#include "gen_lut.h" +#include + +//#define DBG + +static double LUT_d[102] = { + 0, 0.00999966668666524, 0.0199973339731505, 0.0299910048568779, 0.0399786871232900, + 0.0499583957219428, 0.0599281551212079, 0.0698860016346425, 0.0798299857122373, 0.0897581741899505, + 0.0996686524911620, 0.109559526773944, 0.119428926018338, 0.129275004048143, 0.139095941482071, + 0.148889947609497, 0.158655262186401, 0.168390157147530, 0.178092938231198, 0.187761946513593, + 0.197395559849881, 0.206992194219821, 0.216550304976089, 0.226068387993884, 0.235544980720863, + 0.244978663126864, 0.254368058553266, 0.263711834462266, 0.273008703086711, 0.282257421981491, + 0.291456794477867, 0.300605670042395, 0.309702944542456, 0.318747560420644, 0.327738506780556, + 0.336674819386727, 0.345555580581712, 0.354379919123438, 0.363147009946176, 0.371856073848581, + 0.380506377112365, 0.389097231055278, 0.397627991522129, 0.406098058317616, 0.414506874584786, + 0.422853926132941, 0.431138740718782, 0.439360887284591, 0.447519975157170, 0.455615653211225, + 0.463647609000806, 0.471615567862328, 0.479519291992596, 0.487358579505190, 0.495133263468404, + 0.502843210927861, 0.510488321916776, 0.518068528456721, 0.525583793551610, 0.533034110177490, + 0.540419500270584, 0.547740013715902, 0.554995727338587, 0.562186743900029, 
0.569313191100662, + 0.576375220591184, 0.583373006993856, 0.590306746935372, 0.597176658092678, 0.603982978252998, + 0.610725964389209, 0.617405891751573, 0.624023052976757, 0.630577757214935, 0.637070329275684, + 0.643501108793284, 0.649870449411948, 0.656178717991395, 0.662426293833151, 0.668613567927821, + 0.674740942223553, 0.680808828915828, 0.686817649758645, 0.692767835397122, 0.698659824721463, + 0.704494064242218, 0.710271007486686, 0.715991114416300, 0.721654850864761, 0.727262687996690, + 0.732815101786507, 0.738312572517228, 0.743755584298860, 0.749144624606017, 0.754480183834406, + 0.759762754875771, 0.764992832710910, 0.770170914020331, 0.775297496812126, 0.780373080066636, + 0.785398163397448, 0.790373246728302 +}; + + +void bf16_atan_y0(uint16_t *table_data_y0, bmk1880v2_tensor_lmem_shape_t* table_shape) { + + assert(is_1880v2_tbl_shape(table_shape)); + + int table_hw = bf16_table_hw(); + + /** + * index 0 1 2 3 60 61 62 63 64 65 123 124 125 126 + *-------- + * exp (2) x -62 -61 -60 ... -3 -2 -1 0 1 2 .... 60 61 62 63 + * + * index 128 129 130 131 188 189 190 191 192 193 251 252 253 254 255 + *-------- + * exp (-2)x -62 -61 -60 ... -3 -2 -1 0 1 2 ... 60 61 62 63 x + * + */ + + // [0 102) for > 1 + int lut_sz = sizeof(LUT_d) / sizeof(LUT_d[0]); + for (int i = 0; i < lut_sz; i++) { + table_data_y0[i] = convert_fp32_bf16(M_PI_2 - LUT_d[i]); + } + + // [102 204) for [0 1] + for (int i = lut_sz; i < lut_sz * 2; i++) { + table_data_y0[i] = convert_fp32_bf16(LUT_d[i - lut_sz]); + } + +#ifdef DBG + for (int i = 0; i < lut_sz * 2; i++) { + printf("y0[%d] is %f(0x%x)\n", i, convert_bf16_fp32(table_data_y0[i]), table_data_y0[i]); + } +#endif + // duplicate channel #1 to #31 + //TODO: tensor copy + for (uint32_t i = 1; i < table_shape->c; i++) { + memcpy(&table_data_y0[i * table_hw], &table_data_y0[0], sizeof(uint16_t) * table_hw); + } +} + +void bf16_atan_fast_degree_y0(uint16_t *table_data_y0, bmk1880v2_tensor_lmem_shape_t* table_shape) { + + assert(is_1880v2_tbl_shape(table_shape)); + + int table_hw = bf16_table_hw(); + + /** + * index 0 1 2 3 60 61 62 63 64 65 123 124 125 126 + *-------- + * exp (2) x -62 -61 -60 ... -3 -2 -1 0 1 2 .... 60 61 62 63 + * + * index 128 129 130 131 188 189 190 191 192 193 251 252 253 254 255 + *-------- + * exp (-2)x -62 -61 -60 ... -3 -2 -1 0 1 2 ... 
60 61 62 63 x + * + */ + + // [0 102) for > 1 + int lut_sz = sizeof(LUT_d) / sizeof(LUT_d[0]); + for (int i = 0; i < lut_sz; i++) { + table_data_y0[i] = convert_fp32_bf16((M_PI_2 - LUT_d[i]) * 180 / M_PI); + } + + // [102 204) for [0 1] + for (int i = lut_sz; i < lut_sz * 2; i++) { + table_data_y0[i] = convert_fp32_bf16(LUT_d[i - lut_sz] * 180 / M_PI); + } + +#ifdef DBG + for (int i = 0; i < lut_sz * 2; i++) { + printf("y0[%d] is %f(0x%x)\n", i, convert_bf16_fp32(table_data_y0[i]), table_data_y0[i]); + } +#endif + // duplicate channel #1 to #31 + //TODO: tensor copy + for (uint32_t i = 1; i < table_shape->c; i++) { + memcpy(&table_data_y0[i * table_hw], &table_data_y0[0], sizeof(uint16_t) * table_hw); + } +} + +void bf16_atan_slope(uint16_t* OUT table_slope, bmk1880v2_tensor_lmem_shape_t* table_shape) { + + int table_hw = bf16_table_hw(); + + int lut_sz = sizeof(LUT_d) / sizeof(LUT_d[0]) - 1; + for (volatile int i = 0; i < lut_sz; i++) { + table_slope[i] = convert_fp32_bf16(LUT_d[i+1] - LUT_d[i]); + } + + // duplicate channel #1 to #31 + //TODO: tensor copy + for (uint64_t i = 1; i < table_shape->c; i++) { + memcpy(&table_slope[table_hw * i], &table_slope[0], sizeof(uint16_t) * table_hw); + } +} + +// 'bf16_atan_s_01' means atan split [0 1] and (1, +// data in [0-1] mutilply 1, > 1 mutiply with -1 +void bf16_atan_s_01(uint16_t* OUT table_invert, bmk1880v2_tensor_lmem_shape_t* table_shape) { + int half = half_h_table(); + int table_hw = bf16_table_hw(); + + // data in [0, 1], mutilply 1 +#if 1 + for (uint32_t i = 0; i < 63; i++) { + table_invert[i] = convert_fp32_bf16(1.0); + table_invert[i+half] = convert_fp32_bf16(1.0); + } + + // data > 1 + for (int i = 63; i < half; i++) { + table_invert[i] = convert_fp32_bf16(-1.0); + table_invert[i+half] = convert_fp32_bf16(-1.0); + } +#endif + + // duplicate channel #1 to #31 + //TODO: tensor copy + for (uint64_t i = 1; i < table_shape->c; i++) { + memcpy(&table_invert[table_hw * i], &table_invert[0], sizeof(uint16_t) * table_hw); + } +} + +// 'pos_neg' means data is positive(>=0) is 1 or negtive(<0) is -1 +void bf16_atan_pos_neg(uint16_t* OUT table_pos_neg, bmk1880v2_tensor_lmem_shape_t* table_shape) { + + uint32_t half = half_h_table(); + int table_hw = bf16_table_hw(); + + // data >= 0 + for (uint32_t i = 0; i < half; i++) { + table_pos_neg[i] = convert_fp32_bf16(1.0); + } + + // data < 0 + for (uint32_t i = half; i < half * 2; i++) { + table_pos_neg[i] = convert_fp32_bf16(-1.0); + } + + // duplicate channel #1 to #31 + //TODO: tensor copy + for (uint64_t i = 1; i < table_shape->c; i++) { + memcpy(&table_pos_neg[table_hw * i], &table_pos_neg[0], sizeof(uint16_t) * table_hw); + } +} + +/* Syntactic sugar for get more precision + * raw implement code : + + double re_x = 1 / x; + int index = round(re_x * 100); + return (M_PI_2 - (LUT_d[index] + (re_x * 100 - index) * (LUT_d[index + 1] - LUT_d[index]))); + and we want to get `(LUT_d[index] + (re_x * 100 - index)` part + */ +int bf16_atan_slope_multipilier(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t* tl_buf2, + bmk1880v2_tensor_lmem_t* tl_buf3, + bmk1880v2_tensor_lmem_t* tl_ifmap, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt) { + + (void)fmt; + + bf16_get_dec(ctx, tl_buf, tl_buf2, tl_buf3); + // z = (min(x,y) * 100 - index) * slope(index) + + // fill to 100 + bmk1880v2_tiu_element_wise_mul_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.res_high = NULL; + p1.res_low = tl_buf2; + p1.a = tl_buf; + p1.b_is_const = 1; + p1.b_const.val = convert_fp32_bf16(0); + 
p1.rshift_bits = 0; + p1.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(ctx, &p1); + + // add + bmk1880v2_tiu_element_wise_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_buf2; + p4.a_high = 0; + p4.a_low = tl_buf2; + p4.b_is_const = 1; + p4.b_high = 0; + p4.b_const.val = convert_fp32_bf16(-100.0); + p4.rshift_bits = 0; + p4.relu_enable = 0; + bmk1880v2_tiu_element_wise_add(ctx, &p4); + + bmk1880v2_tiu_element_wise_mac_param_t p2; + p2.res_high = 0; + p2.res_low = tl_buf3; + p2.res_is_int8 = 0; + p2.a = tl_ifmap; + p2.b_is_const = 0; + p2.b = tl_buf2; + p2.lshift_bits = 0;//lshift_bits; + p2.rshift_bits = 0;//rshift_bits; + p2.relu_enable = 0; + bmk1880v2_tiu_element_wise_mac(ctx, &p2); + + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_buf3; + p1.b_is_const = 1; + p1.b_const.val = convert_fp32_bf16(-1.0); + p1.rshift_bits = 0; + p1.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(ctx, &p1); + + return 0; +} + +/** issue atan >= 0 + * \b for more precision, we use mac for atan2 + * if (x > 1) { + * x = 1 / x + * } + * int index = round(x * 100); + * double r = (x * 100 - index) * (LUT_d[index + 1] - LUT_d[index]); + * double shift = LUT_d[index]; + * if (x > 1) { + * shift = M_PI_2 - LUT_d[index]; + * } + * return r + shift; + * FIXME: reduce temp buffer count + */ +int _bf16_atan_emit(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* tl_ifmap, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t* tl_buf2, + bmk1880v2_tensor_lmem_t* tl_buf3, + bmk1880v2_tensor_lmem_t* tl_y0_buf, + bmk1880v2_tensor_lmem_t* tl_slope_buf, + bmk1880v2_tensor_lmem_t* tl_invert_buf, + bmk1880v2_tensor_lmem_t* tl_pos_neg_buf, + bmk1880v2_tensor_lmem_t *tl_table_answer, + bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt, float b) { + + bf16_table_check(tl_ifmap, tl_y0_buf, tl_slope_buf, tl_ifmap); + bf16_table_check(tl_buf, tl_invert_buf, tl_pos_neg_buf, tl_buf2); + bf16_table_check(tl_buf3, tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16); + + // x = abs(x0) + // y = 1 / x + // index = 100 * min(x, y) + // z = (min(x,y) * 100 - index) * slope(index) + // invert = table_0_102(x) ([0-1] return 1, >1 return -1) + // invert = invert * z + // t = 64 * (table_0_102 + 1) + // shift_index = t(index) ([0-1] return 102, >1 return 0) + // shift = y0(shift_index + index) ([0-1] return LUT_d[index], >1 return M_PI_2 - LUT_d[index] + // p = table_pos_neg(x0) (>= 0 return 1, < 0 return -1) + // p(shift + invert * z) + + bf16_emit_abs(ctx, tl_ifmap, tl_buf, fmt); + + // y = 1 / x + bf16_emit_reciprocal(ctx, + tl_buf, + tl_buf2, + tl_table_answer, + tl_table_answer_mantissa, + tl_ofmap_bf16 + ); + + bmk1880v2_tiu_element_wise_min_param_t p7; + p7.min = tl_ofmap_bf16; + p7.a = tl_buf; + p7.b_is_const = 0; + p7.b = tl_ofmap_bf16; + bmk1880v2_tiu_element_wise_min(ctx, &p7); + + // get index + bmk1880v2_tiu_element_wise_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_buf; + p1.a = tl_ofmap_bf16; + p1.b_is_const = 1; + p1.b_const.val = convert_fp32_bf16(100.0); + p1.rshift_bits = 0; + p1.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(ctx, &p1); + + bf16_atan_slope_multipilier(ctx, tl_buf, tl_buf2, tl_buf3, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // get int8 index of x2 + bf16_get_u8_tbl_idx(ctx, tl_buf, tl_buf2); + + // x0 = base[x2] + (0.x * (slope[x2]) + // TODO: use mac + + // get slope[x2] + bmk1880v2_tiu_lookup_table_param_t p12; + p12.ofmap = tl_buf3; + p12.ifmap = tl_buf2; + p12.table = tl_slope_buf; + 
bmk1880v2_tiu_lookup_table(ctx, &p12); + + // z = (min(x,y) * 100 - index) * slope(index) + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_ofmap_bf16; + p1.b_is_const = 0; + p1.b = tl_buf3; + p1.rshift_bits = 0; + p1.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(ctx, &p1); + + // get index from exp, + // mv_lut_base get exp as index, remove mantissa + bmk1880v2_tdma_l2l_tensor_copy_param_t p10; + p10.dst = tl_buf3; + p10.src = tl_ifmap; + p10.mv_lut_base = false; + p10.mv_lut_idx = true; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; + + // invert = table_0_102(x) ([0-1] return 1, >1 return -1) + p12.ofmap = tl_buf3; + p12.ifmap = tl_buf3; + p12.table = tl_invert_buf; + bmk1880v2_tiu_lookup_table(ctx, &p12); + + // z = invert * z + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_ofmap_bf16; + p1.b_is_const = 0; + p1.b = tl_buf3; + p1.rshift_bits = 0; + p1.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(ctx, &p1); + + // t = 51 * (invert + 1), -> invert + 1 + bmk1880v2_tiu_element_wise_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_buf3; + p4.a_high = 0; + p4.a_low = tl_buf3; + p4.b_is_const = 1; + p4.b_high = 0; + p4.b_const.val = convert_fp32_bf16(1.0); + p4.rshift_bits = 0; + p4.relu_enable = 0; + bmk1880v2_tiu_element_wise_add(ctx, &p4); + + // t = 51 * (invert + 1) + p1.res_high = NULL; + p1.res_low = tl_buf3; + p1.a = tl_buf3; + p1.b_is_const = 1; + p1.b_const.val = convert_fp32_bf16(51.0); + p1.rshift_bits = 0; + p1.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(ctx, &p1); + +#if 1 + // avoid rounding, we first round org index + bf16_get_u8_tbl_idx(ctx, tl_buf, tl_buf2); + tl_buf2->fmt = FMT_U8; + tl_shape_t t = tl_buf2->shape; + bmk1880v2_tensor_lmem_stride_t s = tl_buf2->stride; + tl_buf2->shape.h = tl_buf2->shape.h * tl_buf2->shape.w; + tl_buf2->shape.w = 1; + tl_buf2->stride.h = 2; + tl_buf2->stride.c = tl_buf2->shape.h * tl_buf2->shape.w; + tl_buf2->stride.c = tl_buf2->shape.c * tl_buf2->shape.h * tl_buf2->shape.w; + p10.dst = tl_buf; + p10.src = tl_buf2; + p10.mv_lut_base = false; + p10.mv_lut_idx = false; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p10); + tl_buf2->fmt = FMT_BF16; + tl_buf2->shape = t; + tl_buf2->stride = s; +#else +#endif + // t = t + index + p4.res_high = 0; + p4.res_low = tl_buf3; + p4.a_high = 0; + p4.a_low = tl_buf3; + p4.b_is_const = 0; + p4.b_high = 0; + p4.b_low = tl_buf; + p4.rshift_bits = 0; + p4.relu_enable = 0; + bmk1880v2_tiu_element_wise_add(ctx, &p4); + + // get int8 index for lut + bf16_get_u8_tbl_idx(ctx, tl_buf3, tl_buf); + + // shift = y0(t) ([0-1] return LUT_d[index], >1 return M_PI_2 - LUT_d[index] + p12.ofmap = tl_buf3; + p12.ifmap = tl_buf; + p12.table = tl_y0_buf; + bmk1880v2_tiu_lookup_table(ctx, &p12); + + // z = base[x2] + (0.x * (slope[x2]) + p4.res_high = 0; + p4.res_low = tl_buf2; + p4.a_high = 0; + p4.a_low = tl_ofmap_bf16; + p4.b_is_const = 0; + p4.b_high = 0; + p4.b_low = tl_buf3; + p4.rshift_bits = 0; + p4.relu_enable = 0; + bmk1880v2_tiu_element_wise_add(ctx, &p4); + + // get pos neg, use mv_lut_idx + p10.dst = tl_buf3; + p10.src = tl_ifmap; + p10.mv_lut_base = false; + p10.mv_lut_idx = true; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; + + // p = table_pos_neg(x0) (>= 0 return 1, < 0 return -1) + p12.ofmap = tl_buf3; + p12.ifmap = tl_buf3; + p12.table = tl_pos_neg_buf; + bmk1880v2_tiu_lookup_table(ctx, &p12); +#if 0 + // p * z + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_buf2; + p1.b_is_const = 0; + p1.b 
= tl_buf3; + p1.rshift_bits = 0; + p1.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(ctx, &p1); +#else + + // add pi/-pi for atan2 + bf16_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 0.0); + bf16_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, b); + + // p * z + pi + bmk1880v2_tiu_element_wise_mac_param_t p2; + p2.res_high = 0; + p2.res_low = tl_ofmap_bf16; + p2.res_is_int8 = 0; + p2.a = tl_buf2; + p2.b_is_const = 0; + p2.b = tl_buf3; + p2.lshift_bits = 0;//lshift_bits; + p2.rshift_bits = 0;//rshift_bits; + p2.relu_enable = 0; + bmk1880v2_tiu_element_wise_mac(ctx, &p2); +#endif + + return 0; +} + +int bf16_atan_emit(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* tl_ifmap, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t* tl_buf2, + bmk1880v2_tensor_lmem_t* tl_buf3, + bmk1880v2_tensor_lmem_t* tl_y0_buf, + bmk1880v2_tensor_lmem_t* tl_slope_buf, + bmk1880v2_tensor_lmem_t* tl_invert_buf, + bmk1880v2_tensor_lmem_t* tl_pos_neg_buf, + bmk1880v2_tensor_lmem_t *tl_table_answer, + bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt) { + return _bf16_atan_emit(ctx, + tl_ifmap, + tl_buf, + tl_buf2, + tl_buf3, + tl_y0_buf, + tl_slope_buf, + tl_invert_buf, + tl_pos_neg_buf, + tl_table_answer, + tl_table_answer_mantissa, + tl_ofmap_bf16, + fmt, 0.0); +} + +/** + * \table_data_atan_slope is optional, NULL for not assign it + */ +void bf16_atan_tbl(uint16_t *table_data_atan_y0, + uint16_t* table_data_atan_slope, uint16_t* table_data_atan_invert, uint16_t* table_data_atan_pos_neg, + bmk1880v2_tensor_lmem_shape_t* table_shape) { + + assert(table_data_atan_y0); + //assert(table_data_atan_slope); + assert(table_data_atan_invert); + assert(table_data_atan_pos_neg); + assert(table_shape); + + bf16_atan_y0(table_data_atan_y0, table_shape); + if (table_data_atan_slope) { + bf16_atan_slope(table_data_atan_slope, table_shape); + } + bf16_atan_s_01(table_data_atan_invert, table_shape); + bf16_atan_pos_neg(table_data_atan_pos_neg, table_shape); +} + +void bf16_atan_fast_degree_tbl(uint16_t *table_data_atan_y0, + uint16_t* table_data_atan_invert, uint16_t* table_data_atan_pos_neg, + bmk1880v2_tensor_lmem_shape_t* table_shape) { + + assert(table_data_atan_y0); + assert(table_data_atan_invert); + assert(table_data_atan_pos_neg); + assert(table_shape); + + bf16_atan_fast_degree_y0(table_data_atan_y0, table_shape); + bf16_atan_s_01(table_data_atan_invert, table_shape); + bf16_atan_pos_neg(table_data_atan_pos_neg, table_shape); +} + +/** issue atan >= 0 + * for fast version, we discard slope + * tl_y0_buf[0-102) put 'LUT[index]', [102-204) for 'M_PI_2 - LUT[index]' + */ +int _bf16_atan_fast_emit(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* tl_ifmap, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t* tl_buf2, + bmk1880v2_tensor_lmem_t* tl_y0_buf, + bmk1880v2_tensor_lmem_t* tl_invert_buf, + bmk1880v2_tensor_lmem_t* tl_pos_neg_buf, + bmk1880v2_tensor_lmem_t *tl_table_answer, + bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt, float b, uint8_t is_dirty_ifmap) { + + bf16_table_check(tl_ifmap, tl_y0_buf, tl_y0_buf, tl_ifmap); + bf16_table_check(tl_buf, tl_invert_buf, tl_pos_neg_buf, tl_buf); + bf16_table_check(tl_buf, tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16); + + bmk1880v2_tiu_lookup_table_param_t p12; + bmk1880v2_tdma_l2l_tensor_copy_param_t p10; + + // plz refer https://github.com/xiezhq-hermann/atan_lookup/blob/master/atan.cpp + // for faster version + 
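+ // Scalar sketch of what the tensor-level sequence below approximates (this fast variant
+ // drops the slope interpolation term of the precise version above; 'b' is the extra
+ // offset used by atan2):
+ //   x     = fabs(x0);
+ //   r     = min(x, 1 / x);            (fold the input into [0, 1])
+ //   idx   = round(r * 100);           (index into LUT_d)
+ //   shift = (x <= 1) ? LUT_d[idx] : M_PI_2 - LUT_d[idx];
+ //   out   = sign(x0) * shift + b;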
bf16_emit_abs(ctx, tl_ifmap, tl_buf, fmt); + + // y = 1 / x + _bf16_lut_exp_mantissa(ctx, + tl_buf, + NULL, + tl_table_answer, + tl_table_answer_mantissa, + tl_ofmap_bf16, + true + ); + + // once again cuz recipical's input dirtied + bf16_emit_abs(ctx, tl_ifmap, tl_buf, fmt); + + bmk1880v2_tiu_element_wise_min_param_t p7; + p7.min = tl_buf; + p7.a = tl_buf; + p7.b_is_const = 0; + p7.b = tl_ofmap_bf16; + bmk1880v2_tiu_element_wise_min(ctx, &p7); + + // get index + bf16_emit_mul_const(ctx, tl_buf, tl_buf, fmt, 100.0); + + // get index from exp, + // mv_lut_base get exp as index, remove mantissa + p10.dst = tl_ofmap_bf16; + p10.src = tl_ifmap; + p10.mv_lut_base = false; + p10.mv_lut_idx = true; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; + + bmk1880v2_tensor_lmem_t* tmp = tl_buf2; + if (is_dirty_ifmap) { + tmp = tl_ifmap; + } + + // get pos neg, use mv_lut_idx + p10.dst = tmp; + p10.src = tl_ifmap; + p10.mv_lut_base = false; + p10.mv_lut_idx = true; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; + + // p = table_pos_neg(x0) (>= 0 return 1, < 0 return -1) + p12.ofmap = tmp; + p12.ifmap = tmp; + p12.table = tl_pos_neg_buf; + bmk1880v2_tiu_lookup_table(ctx, &p12); + + + // get index of LUT[index] or (M_PI_2 - LUT[index]) + { + + // invert = table_0_102(x) ([0-1] return 1, >1 return -1) + p12.ofmap = tl_ofmap_bf16; + p12.ifmap = tl_ofmap_bf16; + p12.table = tl_invert_buf; + bmk1880v2_tiu_lookup_table(ctx, &p12); + + bmk1880v2_tensor_lmem_t *out = tl_buf; +#if 1 + // t = 51 * (invert + 1), -> invert + 1 + bf16_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 1.0); + + // t = 51 * (invert + 1) + bf16_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 51.0); + + // t = t + index + bf16_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // get int8 index for lut + //bf16_get_u8_tbl_idx(ctx, tl_ofmap_bf16, tl_buf); + //_bf16_get_tbl_idx(ctx, tl_ofmap_bf16, tl_buf, FMT_U8, 0); + + //// shift = y0(t) ([0-1] return LUT_d[index], >1 return M_PI_2 - LUT_d[index] + //p12.ofmap = tl_buf; + //p12.ifmap = tl_buf; + //p12.table = tl_y0_buf; + //bmk1880v2_tiu_lookup_table(ctx, &p12); + + _bf16_get_tbl_idx(ctx, tl_ofmap_bf16, tl_buf, FMT_U8, 0); + +#else + // index, output is uint8 format + _bf16_get_tbl_idx(ctx, tl_buf, tl_buf, FMT_U8, 0); + + // mask value from bf16 -> int8, we add as bf16 + // int8 format (51*(mask + 1) + index) is real remap index for table + // mask = mask + 1 + bf16_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 1.0); + + // mask = 51 * mask + bf16_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 51.0); + + // mask value change to int8 format for lut + _bf16_get_tbl_idx(ctx, tl_ofmap_bf16, tl_ofmap_bf16, FMT_U8, 0); + + // int8 format (51*(mask) + index) is real remap index for table + if (1) + { + bmk1880v2_tensor_lmem_t index_u8, mask_u8, fake_u8, out_u8; + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &index_u8, tl_buf, FMT_U8); + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &out_u8, tl_buf, FMT_U8); + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &mask_u8, tl_ofmap_bf16, FMT_U8); + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &fake_u8, tmp, FMT_U8); + //fake_u8.start_address = + // ctx->chip_info.lmem_size - (fake_u8.shape.h * fake_u8.shape.w); + // //tl_buf->start_address + 1; + // //tl_ifmap->start_address; + + // mask + index + // its safe we only need low part value, so we give fake high part + + bmk1880v2_tensor_lmem_t * a = bmk1880v2_lmem_alloc_tensor( + ctx, + out_u8.shape, + FMT_U8, CTRL_NULL); +#if 1 
+ bmk1880v2_tiu_element_wise_add_param_t p4; + p4.res_high = 0; + //p4.res_low = &mask_u8; + p4.res_low = &index_u8; + p4.a_high = a; + p4.a_low = &index_u8; + p4.b_is_const = 0; + p4.b_high = a; + p4.b_low = &mask_u8; + p4.rshift_bits = 0; + p4.relu_enable = 0; + bmk1880v2_tiu_element_wise_add(ctx, &p4); + //out = tl_ofmap_bf16; +#else + { + bmk1880v2_tiu_element_wise_mul_param_t p; + p.res_high = NULL; + p.res_low = a; + p.a = a; + p.b_is_const = 1; + p.b_const.val = 0; + p.b_const.is_signed = 0; + p.rshift_bits = 0; + p.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(ctx, &p); + } + + out = tl_ofmap_bf16; + bmk1880v2_tiu_element_wise_mac_param_t p2; + p2.res_high = a; + p2.res_low = &mask_u8; + p2.res_is_int8 = 0; + p2.a = &index_u8; + p2.b_is_const = 1; + p2.b = 0; + p2.b_const.val = 1; + p2.b_const.is_signed = 0; + p2.lshift_bits = 0; + p2.rshift_bits = 0; + p2.relu_enable = 0; + bmk1880v2_tiu_element_wise_mac(ctx, &p2); +#endif + bmk1880v2_lmem_free_tensor( + ctx, a); + } + else { + // move bak to bf16 + //bmk1880v2_tensor_lmem_t index_u8, mask_u8; + //bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &index_u8, tl_buf, FMT_U8); + //bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &mask_u8, tl_ofmap_bf16, FMT_U8); + + //p10.dst = tl_buf; + //p10.src = &index_u8; + //p10.mv_lut_base = false; + //p10.mv_lut_idx = false; + //bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p10); + + //p10.dst = tl_ofmap_bf16; + //p10.src = &mask_u8; + //bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p10); + + //bf16_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + //_bf16_get_tbl_idx(ctx, tl_ofmap_bf16, tl_buf, FMT_U8, 0); + + } +#endif + + // shift = y0(t) ([0-1] return LUT_d[index], >1 return M_PI_2 - LUT_d[index] + p12.ofmap = out; + p12.ifmap = out; + p12.table = tl_y0_buf; + bmk1880v2_tiu_lookup_table(ctx, &p12); + } + +#if 0 + // add pi/-pi for atan2 + bf16_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 0.0); + bf16_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, b); + + // p * z + pi + bmk1880v2_tiu_element_wise_mac_param_t p2; + p2.res_high = 0; + p2.res_low = tl_ofmap_bf16; + p2.res_is_int8 = 0; + p2.a = tl_buf; + p2.b_is_const = 0; + p2.b = tmp; + p2.lshift_bits = 0;//lshift_bits; + p2.rshift_bits = 0;//rshift_bits; + p2.relu_enable = 0; + bmk1880v2_tiu_element_wise_mac(ctx, &p2); +#else + bf16_emit_mul(ctx, tl_buf, tmp, tl_ofmap_bf16, fmt); + bf16_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, b); +#endif + + return 0; +} + +/** + * \brief using \tl_buf2 as temp buffer for uint8_t add + * \NOTICE: it dirties input: \tl_ifmap + */ +int __bf16_atan_fast_emit(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* tl_ifmap, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t* tl_buf2, + bmk1880v2_tensor_lmem_t* tl_y0_buf, + bmk1880v2_tensor_lmem_t* tl_invert_buf, + bmk1880v2_tensor_lmem_t* tl_pos_neg_buf, + bmk1880v2_tensor_lmem_t *tl_table_answer, + bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt) { + + bf16_table_check(tl_ifmap, tl_y0_buf, tl_y0_buf, tl_ifmap); + bf16_table_check(tl_buf, tl_invert_buf, tl_pos_neg_buf, tl_buf); + bf16_table_check(tl_buf, tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16); + + bmk1880v2_tiu_lookup_table_param_t p12; + + // plz refer https://github.com/xiezhq-hermann/atan_lookup/blob/master/atan.cpp + // for faster version + bf16_emit_abs(ctx, tl_ifmap, tl_buf, fmt); + + // y = 1 / x + _bf16_lut_exp_mantissa(ctx, + tl_buf, + NULL, + tl_table_answer, + tl_table_answer_mantissa, + tl_ofmap_bf16, + 
true + ); + + // once again cuz recipical's input dirtied + bf16_emit_abs(ctx, tl_ifmap, tl_buf, fmt); + + bmk1880v2_tiu_element_wise_min_param_t p7; + p7.min = tl_buf; + p7.a = tl_buf; + p7.b_is_const = 0; + p7.b = tl_ofmap_bf16; + bmk1880v2_tiu_element_wise_min(ctx, &p7); + + // get index + bf16_emit_mul_const(ctx, tl_buf, tl_buf, fmt, 100.0); + + // get index from exp, + // mv_lut_base get exp as index, remove mantissa +#if 1 + bmk1880v2_tdma_l2l_tensor_copy_param_t p10; + p10.dst = tl_ofmap_bf16; + p10.src = tl_ifmap; + p10.mv_lut_base = false; + p10.mv_lut_idx = true; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; +#else + bf16_emit_abs(ctx, tl_ifmap, tl_ofmap_bf16, fmt); + bf16_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 0.5); +#endif + + bmk1880v2_tensor_lmem_t* tmp = tl_buf2; + tmp = tl_ifmap; + +#if 0 + // get pos neg, use mv_lut_idx + p10.dst = tmp; + p10.src = tl_ifmap; + p10.mv_lut_base = false; + p10.mv_lut_idx = true; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; + + // p = table_pos_neg(x0) (>= 0 return 1, < 0 return -1) + p12.ofmap = tmp; + p12.ifmap = tmp; + p12.table = tl_pos_neg_buf; + bmk1880v2_tiu_lookup_table(ctx, &p12); + //p12.ofmap = tl_ofmap_bf16; + //p12.ifmap = tmp; + //p12.table = tl_pos_neg_buf; + //bmk1880v2_tiu_lookup_table(ctx, &p12); + //return 0; +#else + // dirty input is ok + bmk1880v2_tensor_lmem_t index_i8; + bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(ctx, &index_i8, tl_buf2, FMT_I8); + bf16_emit_mask_ge0_lt0(ctx, tmp, &index_i8, tmp, fmt); + //bf16_emit_mask_ge0_lt0(ctx, tmp, &index_i8, tl_ofmap_bf16, fmt); + //return 0; +#endif + + // get index of LUT[index] or (M_PI_2 - LUT[index]) + { + +#if 1 + // invert = table_0_102(x) ([0-1] return 1, >1 return -1) + p12.ofmap = tl_ofmap_bf16; + p12.ifmap = tl_ofmap_bf16; + p12.table = tl_invert_buf; + bmk1880v2_tiu_lookup_table(ctx, &p12); +#else + { + bmk1880v2_tensor_lmem_t index_i8; + bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(ctx, &index_i8, tl_buf2, FMT_I8); + // 1. abs + // 2. add 0.5 to round bf16->int8 + // 3. leave (0,1) and others, rightshift 1 to get 0, others + // 4. 
saturate to int max, and transform from int8 to bf16 + + //bf16_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, convert_bf16_fp32(0x7f00)); + bmk1880v2_tdma_l2l_tensor_copy_param_t p1; + p1.src = tl_ofmap_bf16; + p1.dst = &index_i8; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p1); + + bmk1880v2_tiu_element_wise_mul_param_t p; + p.res_high = NULL; + p.res_low = &index_i8; + p.a = &index_i8; + p.b_is_const = 1; + p.b_const.val = 1; + p.b_const.is_signed = 0; + p.rshift_bits = 1; + p.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(ctx, &p); + + //p.res_high = NULL; + //p.res_low = &index_i8; + //p.a = &index_i8; + //p.b_is_const = 1; + //p.b_const.val = -1; + //p.b_const.is_signed = 1; + //p.rshift_bits = 7; + //p.relu_enable = 0; + //bmk1880v2_tiu_element_wise_mul(ctx, &p); + + p1.src = &index_i8; + p1.dst = tl_ofmap_bf16; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p1); + return 0; + + //bf16_emit_mask_eq_0(ctx, tl_ofmap_bf16, tl_ofmap_bf16, &index_i8, tl_ofmap_bf16, fmt); + bf16_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 2.0); + bf16_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 1.0); + } +#endif + + bmk1880v2_tensor_lmem_t *out = tl_buf; +#if 0 + // t = 51 * (invert + 1), -> invert + 1 + bf16_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 1.0); + + // t = 51 * (invert + 1) + bf16_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 51.0); + + // t = t + index + bf16_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // get int8 index for lut + //bf16_get_u8_tbl_idx(ctx, tl_ofmap_bf16, tl_buf); + //_bf16_get_tbl_idx(ctx, tl_ofmap_bf16, tl_buf, FMT_U8, 0); + + //// shift = y0(t) ([0-1] return LUT_d[index], >1 return M_PI_2 - LUT_d[index] + //p12.ofmap = tl_buf; + //p12.ifmap = tl_buf; + //p12.table = tl_y0_buf; + //bmk1880v2_tiu_lookup_table(ctx, &p12); + + _bf16_get_tbl_idx(ctx, tl_ofmap_bf16, tl_buf, FMT_U8, 0); + +#else + // index, output is uint8 format + _bf16_get_tbl_idx(ctx, tl_buf, tl_buf, FMT_U8, 0); + + // mask value from bf16 -> int8, we add as bf16 + // int8 format (51*(mask + 1) + index) is real remap index for table + // mask = mask + 1 + bf16_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 1.0); + + // mask = 51 * mask + bf16_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 51.0); + + // mask value change to int8 format for lut + _bf16_get_tbl_idx(ctx, tl_ofmap_bf16, tl_ofmap_bf16, FMT_U8, 0); + + // int8 format (51*(mask) + index) is real remap index for table + if (1) + { + bmk1880v2_tensor_lmem_t index_u8, mask_u8, fake_u8, out_u8; + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &index_u8, tl_buf, FMT_U8); + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &out_u8, tl_buf, FMT_U8); + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &mask_u8, tl_ofmap_bf16, FMT_U8); + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &fake_u8, tl_buf2, FMT_U8); +#if 0 + // mask + index + // its safe we only need low part value, so we give fake high part + bmk1880v2_tiu_element_wise_add_param_t p4; + p4.res_high = 0; + p4.res_low = &index_u8; + p4.a_high = &fake_u8; + p4.a_low = &index_u8; + p4.b_is_const = 0; + p4.b_high = &fake_u8; + p4.b_low = &mask_u8; + p4.rshift_bits = 0; + p4.relu_enable = 0; + bmk1880v2_tiu_element_wise_add(ctx, &p4); +#else + bmk1880v2_tiu_element_wise_mac_param_t p2; + p2.res_high = &fake_u8; + p2.res_low = &index_u8; + p2.res_is_int8 = 0; + p2.a = &mask_u8; + p2.b_is_const = 1; + p2.b = 0; + p2.b_const.val = 1; + p2.b_const.is_signed = 0; + p2.lshift_bits = 0; + p2.rshift_bits = 0; + p2.relu_enable = 0; + 
bmk1880v2_tiu_element_wise_mac(ctx, &p2); +#endif + + } + else { + // move bak to bf16 + //bmk1880v2_tensor_lmem_t index_u8, mask_u8; + //bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &index_u8, tl_buf, FMT_U8); + //bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &mask_u8, tl_ofmap_bf16, FMT_U8); + + //p10.dst = tl_buf; + //p10.src = &index_u8; + //p10.mv_lut_base = false; + //p10.mv_lut_idx = false; + //bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p10); + + //p10.dst = tl_ofmap_bf16; + //p10.src = &mask_u8; + //bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p10); + + //bf16_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + //_bf16_get_tbl_idx(ctx, tl_ofmap_bf16, tl_buf, FMT_U8, 0); + + } +#endif + + // shift = y0(t) ([0-1] return LUT_d[index], >1 return M_PI_2 - LUT_d[index] + p12.ofmap = out; + p12.ifmap = out; + p12.table = tl_y0_buf; + bmk1880v2_tiu_lookup_table(ctx, &p12); + } + + bf16_emit_mul(ctx, tl_buf, tmp, tl_ofmap_bf16, fmt); + + return 0; +} + +int bf16_atan_fast_emit(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* tl_ifmap, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t* tl_buf2, + bmk1880v2_tensor_lmem_t* tl_y0_buf, + bmk1880v2_tensor_lmem_t* tl_invert_buf, + bmk1880v2_tensor_lmem_t* tl_pos_neg_buf, + bmk1880v2_tensor_lmem_t *tl_table_answer, + bmk1880v2_tensor_lmem_t *tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt, uint8_t is_dirty_ifmap) { + + return _bf16_atan_fast_emit(ctx, + tl_ifmap, + tl_buf, + tl_buf2, + tl_y0_buf, + tl_invert_buf, + tl_pos_neg_buf, + tl_table_answer, + tl_table_answer_mantissa, + tl_ofmap_bf16, + fmt, 0.0, is_dirty_ifmap); +} diff --git a/cvikernel/src/bm1880v2/non_atomic/tiu_lut_atan2.c b/cvikernel/src/bm1880v2/non_atomic/tiu_lut_atan2.c new file mode 100644 index 000000000..56612cdea --- /dev/null +++ b/cvikernel/src/bm1880v2/non_atomic/tiu_lut_atan2.c @@ -0,0 +1,1015 @@ +/** + * \brirf implement with atan, plz refer https://en.wikipedia.org/wiki/Atan2 + * NOTICE: current epsilon set to 0.1 + */ +#include "gen_lut.h" +#include + +//#define DBG + +static void _bf16_atan2_emit_case_3(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* y, + bmk1880v2_tensor_lmem_t* x, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t* tl_buf2, + bmk1880v2_tensor_lmem_t* tl_buf3, + bmk1880v2_tensor_lmem_t* tl_buf4, + bmk1880v2_tensor_lmem_t* tl_y0_buf, + bmk1880v2_tensor_lmem_t* tl_slope_buf, + bmk1880v2_tensor_lmem_t* tl_invert_buf, + bmk1880v2_tensor_lmem_t* tl_pos_neg_table, + bmk1880v2_tensor_lmem_t* tl_table_answer, + bmk1880v2_tensor_lmem_t* tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt, float b) { + // case 3 + // atan( y / x) + + // x0 = reciprocal(x) + bf16_emit_reciprocal(ctx, + x, + tl_buf2, + tl_table_answer, + tl_table_answer_mantissa, + tl_buf + ); + + // y0 = x0 * y + bmk1880v2_tiu_element_wise_mul_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.res_high = NULL; + p1.res_low = tl_buf4; + p1.a = y; + p1.b_is_const = 0; + p1.b = tl_buf; + p1.rshift_bits = 0; + p1.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(ctx, &p1); + + // x0 = atan(y0) + _bf16_atan_emit(ctx, + tl_buf4, + tl_buf, + tl_buf2, + tl_buf3, + tl_y0_buf, + tl_slope_buf, + tl_invert_buf, + tl_pos_neg_table, + tl_table_answer, + tl_table_answer_mantissa, + OUT tl_ofmap_bf16, + fmt, b); + +} + +static void bf16_atan2_emit_case_3(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* y, + bmk1880v2_tensor_lmem_t* x, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t* tl_buf2, + bmk1880v2_tensor_lmem_t* tl_buf3, + 
bmk1880v2_tensor_lmem_t* tl_buf4, + bmk1880v2_tensor_lmem_t* tl_y0_buf, + bmk1880v2_tensor_lmem_t* tl_slope_buf, + bmk1880v2_tensor_lmem_t* tl_invert_buf, + bmk1880v2_tensor_lmem_t* tl_pos_neg_table, + bmk1880v2_tensor_lmem_t* tl_table_answer, + bmk1880v2_tensor_lmem_t* tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt) { + + _bf16_atan2_emit_case_3(ctx, + y, + x, + tl_buf, + tl_buf2, + tl_buf3, + tl_buf4, + tl_y0_buf, + tl_slope_buf, + tl_invert_buf, + tl_pos_neg_table, + tl_table_answer, + tl_table_answer_mantissa, + tl_ofmap_bf16, + fmt, 0.0); +} + +//NOTICE: it could dirty \y +/** + * atan2(y, x) should express 4 condition using atan express from [here](https://en.wikipedia.org/wiki/Atan2) + */ +void bf16_atan2_emit(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* y, + bmk1880v2_tensor_lmem_t* x, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t* tl_buf2, + bmk1880v2_tensor_lmem_t* tl_buf3, + bmk1880v2_tensor_lmem_t* tl_buf4, + bmk1880v2_tensor_lmem_t* tl_buf5, + bmk1880v2_tensor_lmem_t* tl_buf6, + bmk1880v2_tensor_lmem_t* tl_y0_buf, + bmk1880v2_tensor_lmem_t* tl_slope_buf, + bmk1880v2_tensor_lmem_t* tl_invert_buf, + bmk1880v2_tensor_lmem_t* tl_pos_neg_table, + bmk1880v2_tensor_lmem_t* tl_table_answer, + bmk1880v2_tensor_lmem_t* tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t* tl_sqrt_table_answer, + bmk1880v2_tensor_lmem_t* tl_sqrt_table_answer_mantissa, + bmk1880v2_tensor_lmem_t* tl_0_idx_table, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt) +{ + bf16_table_check(y, tl_y0_buf, tl_slope_buf, x); + bf16_table_check(tl_buf, tl_invert_buf, tl_pos_neg_table, tl_buf2); + bf16_table_check(tl_buf3, tl_table_answer, tl_table_answer_mantissa, tl_buf4); + bf16_table_check(tl_buf6, tl_table_answer, tl_0_idx_table, tl_buf5); + bf16_table_check(y, tl_sqrt_table_answer, tl_sqrt_table_answer_mantissa, x); + + // atan(y/x), x > 0 + // atan(y/x) + PI , x < 0 and y >= 0 + // atan(y/x) - PI , x < 0 and y < 0 + // pi / 2, x = 0 and y > 0 + // -pi / 2, x = 0 and y < 0 + // 0, x = 0 and y = 0 + + // atan(y/x), x > 0 + bf16_emit_max_const(ctx, x, tl_buf4, fmt, 0.0); + bf16_atan2_emit_case_3(ctx, + y, + tl_buf4, + tl_buf, + tl_buf2, + tl_buf3, + tl_buf5, + tl_y0_buf, + tl_slope_buf, + tl_invert_buf, + tl_pos_neg_table, + tl_table_answer, + tl_table_answer_mantissa, + tl_ofmap_bf16, + fmt); + + // x > 0 + bf16_emit_mask_gt0(ctx, x, tl_buf, tl_buf3, tl_buf4, + tl_pos_neg_table, tl_0_idx_table, tl_buf2, fmt); + + bf16_emit_mul(ctx, tl_ofmap_bf16, tl_buf2, tl_ofmap_bf16, fmt); + + // atan(y/x) + PI , x < 0 and y >= 0 + bf16_emit_min_const(ctx, x, tl_buf4, fmt, 0.0); + _bf16_atan2_emit_case_3(ctx, + y, + tl_buf4, + tl_buf, + tl_buf2, + tl_buf3, + tl_buf5, + tl_y0_buf, + tl_slope_buf, + tl_invert_buf, + tl_pos_neg_table, + tl_table_answer, + tl_table_answer_mantissa, + tl_buf6, + fmt, M_PI); + //bf16_emit_add_const(ctx, tl_buf6, tl_buf6, fmt, M_PI); + + // get index map that x < 0 and y >= 0 + // !(y >= 0) = !(y < 0) +#if 0 + bf16_emit_pos_idx(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // y == 0 + bf16_emit_0_idx(ctx, y, tl_buf, tl_0_idx_table, tl_buf2, fmt); + bf16_emit_add(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); +#else + // y >= 0 + bf16_emit_mask_ge0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf2, fmt); +#endif + // x < 0 + bf16_emit_mask_lt0(ctx, x, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // x < 0 && y >= 0 + bf16_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + + bf16_emit_mul(ctx, tl_buf6, tl_buf2, tl_buf, fmt); + bf16_emit_add(ctx, tl_buf, 
tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // atan(y/x) - PI , x < 0 and y < 0 + bf16_emit_min_const(ctx, x, tl_buf4, fmt, 0.0); + bf16_atan2_emit_case_3(ctx, + y, + tl_buf4, + tl_buf, + tl_buf2, + tl_buf3, + tl_buf5, + tl_y0_buf, + tl_slope_buf, + tl_invert_buf, + tl_pos_neg_table, + tl_table_answer, + tl_table_answer_mantissa, + tl_buf6, + fmt); + bf16_emit_add_const(ctx, tl_buf6, tl_buf6, fmt, -1.0 * M_PI); + // x < 0 and y < 0 + + // we leverage x <= 0 and y <= 0 cuz we filter out x = 0 case, speed up it + // x < 0 + bf16_emit_mask_lt0(ctx, x, tl_buf, tl_pos_neg_table, tl_buf2, fmt); + // y < 0 + bf16_emit_mask_lt0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // x < 0 && y < 0 + bf16_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + + bf16_emit_mul(ctx, tl_buf6, tl_buf2, tl_buf, fmt); + bf16_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // pi / 2, x = 0 and y > 0 + // x = 0 + bf16_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1 + + // y > 0 + bf16_emit_mask_gt0(ctx, y, tl_buf, tl_buf5, tl_buf4, tl_pos_neg_table, tl_0_idx_table, tl_buf3, fmt); + // x = 0 && y > 0 + bf16_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + bf16_emit_mul_const(ctx, tl_buf2, tl_buf2, fmt, M_PI / 2.0); + + bf16_emit_add(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // -pi / 2, x = 0 and y < 0 + // x = 0 + bf16_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1 + // y < 0 + bf16_emit_mask_lt0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // x = 0 && y < 0 + bf16_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + + bf16_emit_mul_const(ctx, tl_buf2, tl_buf2, fmt, -1.0 * M_PI / 2.0); + + bf16_emit_add(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // 0, x = 0 and y = 0 + // x = 0 + bf16_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1 + // y = 0 + bf16_emit_mask_eq0(ctx, y, tl_buf, tl_0_idx_table, tl_buf3, fmt); // 0.003 could consider 1 + + // x = 0 && y = 0 + bf16_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf3, fmt); + + // !(x = 0 and y = 0) keep it + bf16_emit_0_1_revert_input(ctx, tl_buf3, tl_buf, tl_buf2, fmt); + bf16_emit_mul(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + +} + +// ==== fast version === +static void __bf16_atan2_fast_emit_case_3(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* y, + bmk1880v2_tensor_lmem_t* x, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t* tl_y0_buf, + bmk1880v2_tensor_lmem_t* tl_invert_buf, + bmk1880v2_tensor_lmem_t* tl_pos_neg_table, + bmk1880v2_tensor_lmem_t* tl_table_answer, + bmk1880v2_tensor_lmem_t* tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + bmk1880v2_tensor_lmem_t* OUT y_over_x, + fmt_t fmt, float b) { + // case 3 + // atan( y / x) + +#if 0 + // x0 = reciprocal(x) + _bf16_lut_exp_mantissa(ctx, + x, + NULL, + tl_table_answer, + tl_table_answer_mantissa, + tl_buf, + true + ); + + // y0 = x0 * y + bf16_emit_mul(ctx, y, tl_buf, tl_buf, fmt); +#else + bf16_emit_x_over_y(ctx, y, x, NULL, tl_buf, + tl_table_answer, tl_table_answer_mantissa, fmt, true); + + if (y_over_x) { + bf16_emit_add_const(ctx, tl_buf, y_over_x, fmt, 0); + } +#endif + + // x0 = atan(y0) + _bf16_atan_fast_emit(ctx, + tl_buf, + x, + NULL, + tl_y0_buf, + tl_invert_buf, + tl_pos_neg_table, + tl_table_answer, + tl_table_answer_mantissa, + OUT tl_ofmap_bf16, + fmt, b, true); +} + +#if 0 +static void _bf16_atan2_fast_emit(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* y, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t* tl_buf2, 
+ bmk1880v2_tensor_lmem_t* tl_buf4, + bmk1880v2_tensor_lmem_t* tl_y0_buf, + bmk1880v2_tensor_lmem_t* tl_invert_buf, + bmk1880v2_tensor_lmem_t* tl_pos_neg_table, + bmk1880v2_tensor_lmem_t* tl_table_answer, + bmk1880v2_tensor_lmem_t* tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + bmk1880v2_tensor_lmem_t* OUT tl_buf3, + fmt_t fmt) { + // case 3 + // atan( y / x) + +#if 0 + // x0 = reciprocal(tl_buf) + _bf16_lut_exp_mantissa(ctx, + tl_buf, + NULL, + tl_table_answer, + tl_table_answer_mantissa, + tl_buf2, + true + ); + + // y0 = x0 * y + bf16_emit_mul(ctx, y, tl_buf2, tl_buf2, fmt); +#else +#if 0 + bf16_emit_x_over_y(ctx, y, tl_buf, NULL, tl_buf2, + tl_table_answer, tl_table_answer_mantissa, fmt, true); + + if (tl_buf3) { + bf16_emit_add_const(ctx, tl_buf2, tl_buf3, fmt, 0); + } +#else + //if (tl_buf3) { + // bf16_emit_add_const(ctx, tl_buf, tl_buf3, fmt, 0); + //} + + // get xy == 0 and y < 0, add pi + // using xy to depend x = 0 or y = 0 + // recipical y < 0 get 0xFEFF, y > 0 get 0x7F7F, + // 1. b = xy to get other/(x = 0 or y = 0) + // 2. c = b * 2^64 to saturate it + // 3. c(bf16) = c(int8) >> 10 to get 1/0 map, 1 indicate xy > 0 + // 4. c = c * -1 + 1 to invert map, 1 indicate x = 0 or y = 0 + // 5. d = b(int8) - 0x7f, 0 means y > 0 + // 6. d = d(int8) + 0xff to get inf + bf16_emit_mul(ctx, y, tl_buf, tl_buf2, fmt); + // get 7f7f / 0 + bf16_emit_mul_const(ctx, tl_buf2, tl_ofmap_bf16, fmt, convert_bf16_fp32(0x7f00)); + //// 1 = 0x3f80 + //bf16_emit_mul_const(ctx, tl_buf2, tl_ofmap_bf16, fmt, 0); + //bf16_emit_add_const(ctx, tl_ofmap_bf16, tl_buf4, fmt, 1.0); + // bf16->uint8_t and back uint8_t->bf16 to get 0/1 map + +#if 1 + bmk1880v2_tensor_lmem_t index_u8; + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &index_u8, tl_buf2, FMT_U8); + + index_u8.shape.w = index_u8.shape.w / 2; + index_u8.stride = bmk1880v2_tensor_lmem_default_stride(ctx, index_u8.shape, + CTRL_NULL, FMT_I8); + + index_u8.fmt = FMT_I8; + + bmk1880v2_tdma_l2l_tensor_copy_param_t p1; + p1.src = tl_ofmap_bf16; + p1.dst = &index_u8; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p1); + bmk1880v2_tiu_element_wise_mul_param_t p; + +#if 0 + + p.res_high = NULL; + p.res_low = &index_u8; + p.a = &index_u8; + p.b_is_const = 1; + p.b_const.val = -1; + p.b_const.is_signed = 1; + p.rshift_bits = 0; + p.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(ctx, &p); +#else + p.res_high = NULL; + p.res_low = &index_u8; + p.a = &index_u8; + p.b_is_const = 1; + p.b_const.val = -1; + p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(ctx, &p); + +#endif + + // get -1/0 map, -1 indicate xy != 0 + p1.src = &index_u8; + p1.dst = tl_ofmap_bf16; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p1); + + // x * (-1) + 1 get 0/1 map, 1 indicate xy == 0 + //bf16_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, -1.0); + bf16_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 1.0); + + // get y < 0, bf16->int8 and mul 0xff to get -128 and righshift to get 1 + bf16_emit_mul_const(ctx, y, tl_buf3, fmt, pow(2,64)); + p1.src = tl_buf3; + p1.dst = &index_u8; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p1); + + + p.res_high = 0; + p.res_low = &index_u8; + p.a = &index_u8; + p.b_is_const = 1; + p.b_const.val = -128; + p.b_const.is_signed = 1; + p.rshift_bits = 0; + p.relu_enable = 1; + bmk1880v2_tiu_element_wise_mul(ctx, &p); + + p.res_high = 0; + p.res_low = &index_u8; + p.a = &index_u8; + p.b_is_const = 1; + p.b_const.val = 1; + p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable 
= 1; + bmk1880v2_tiu_element_wise_mul(ctx, &p); + + // get y < 0 + p1.src = &index_u8; + p1.dst = tl_buf4; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p1); + bf16_emit_mul_const(ctx, tl_buf4, tl_buf4, fmt, -1.0); + + // get y > 0 + // y * (-1) + 1 get 0/1 map, 1 indicate xy == 0 + bf16_emit_add_const(ctx, tl_buf4, tl_buf2, fmt, 1.0); + bf16_emit_add(ctx, tl_buf2, tl_buf4, tl_buf2, fmt); + + // merge y > 0 && y < 0 && x == 0 + bf16_emit_mul(ctx, tl_ofmap_bf16, tl_buf2, tl_buf3, fmt); + //bf16_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 0); + //bf16_emit_mul_const(ctx, tl_ofmap_bf16, tl_buf3, fmt, M_PI); + +#endif + + + bf16_emit_x_over_y(ctx, y, tl_buf, NULL, tl_buf2, + tl_table_answer, tl_table_answer_mantissa, fmt, true); +#endif +#endif + + // x0 = atan(y0) + __bf16_atan_fast_emit(ctx, + tl_buf2, + tl_buf, + tl_buf4, + tl_y0_buf, + tl_invert_buf, + tl_pos_neg_table, + tl_table_answer, + tl_table_answer_mantissa, + OUT tl_ofmap_bf16, + fmt); + + // abs tl_buf3 + // revert and mul to clean !(x == 0 && (y != 0) case + // add pi/2 + bf16_emit_mul_const(ctx, tl_buf3, tl_buf2, fmt, -1); + bmk1880v2_tiu_element_wise_min_param_t p3; + p3.min = tl_buf2; + p3.a = tl_buf3; + p3.b_is_const = 0; + p3.b = tl_buf2; + bmk1880v2_tiu_element_wise_min(ctx, &p3); + bf16_emit_add_const(ctx, tl_buf2, tl_buf2, fmt, 1.0); + bf16_emit_mul(ctx, tl_ofmap_bf16, tl_buf2, tl_ofmap_bf16, fmt); + + bf16_emit_mul_const(ctx, tl_buf3, tl_buf3, fmt, M_PI_2); + bf16_emit_add(ctx, tl_buf3, tl_ofmap_bf16, tl_ofmap_bf16, fmt); +} +#endif + +static void _bf16_atan2_fast_emit_case_3(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* y, + bmk1880v2_tensor_lmem_t* x, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t* tl_y0_buf, + bmk1880v2_tensor_lmem_t* tl_invert_buf, + bmk1880v2_tensor_lmem_t* tl_pos_neg_table, + bmk1880v2_tensor_lmem_t* tl_table_answer, + bmk1880v2_tensor_lmem_t* tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt, float b) { + // case 3 + // atan( y / x) + return __bf16_atan2_fast_emit_case_3(ctx, + y, + x, + tl_buf, + tl_y0_buf, + tl_invert_buf, + tl_pos_neg_table, + tl_table_answer, + tl_table_answer_mantissa, + tl_ofmap_bf16, + NULL, + fmt, b); +} + +void bf16_atan2_fast_emit(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* y, + bmk1880v2_tensor_lmem_t* x, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t* tl_buf2, + bmk1880v2_tensor_lmem_t* tl_buf3, + bmk1880v2_tensor_lmem_t* tl_buf4, + bmk1880v2_tensor_lmem_t* tl_y0_buf, + bmk1880v2_tensor_lmem_t* tl_slope_buf, + bmk1880v2_tensor_lmem_t* tl_invert_buf, + bmk1880v2_tensor_lmem_t* tl_pos_neg_table, + bmk1880v2_tensor_lmem_t* tl_table_answer, + bmk1880v2_tensor_lmem_t* tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t* tl_0_idx_table, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt) +{ + bf16_table_check(y, tl_y0_buf, tl_slope_buf, x); + bf16_table_check(tl_buf, tl_invert_buf, tl_pos_neg_table, tl_buf2); + bf16_table_check(tl_buf3, tl_table_answer, tl_table_answer_mantissa, tl_buf4); + bf16_table_check(tl_buf4, tl_table_answer, tl_0_idx_table, tl_buf4); + + // atan(y/x), x > 0 + // atan(y/x) + PI , x < 0 and y >= 0 + // atan(y/x) - PI , x < 0 and y < 0 + // pi / 2, x = 0 and y > 0 + // -pi / 2, x = 0 and y < 0 + // 0, x = 0 and y = 0 + + // atan(y/x), x > 0 + bf16_emit_max_const(ctx, x, tl_buf, fmt, 0.0); + _bf16_atan2_fast_emit_case_3(ctx, + y, + tl_buf, + tl_buf2, + tl_y0_buf, + tl_invert_buf, + tl_pos_neg_table, + tl_table_answer, + tl_table_answer_mantissa, + tl_ofmap_bf16, + fmt, 0.0); + + // x > 0 
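+  /* The six regions listed above are composed without branches: each branch
+   * value is computed over the whole tensor, multiplied by a 0/1 mask for its
+   * region, and accumulated into tl_ofmap_bf16. A scalar model of the scheme
+   * (reference only, assuming <math.h> float semantics):
+   *
+   *   r  = (x > 0)             * atan(y / x)
+   *      + (x < 0 && y >= 0)   * (atan(y / x) + M_PI)
+   *      + (x < 0 && y < 0)    * (atan(y / x) - M_PI)
+   *      + (x == 0 && y > 0)   *  M_PI_2
+   *      + (x == 0 && y < 0)   * -M_PI_2;    // x == 0 && y == 0 contributes 0
+   *
+   * In the kernel each branch is evaluated on a clamped copy of x (max/min
+   * against 0) and the x == 0 && y == 0 case is cleared explicitly at the end.
+   */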
+ bf16_emit_mask_gt0(ctx, x, tl_buf, tl_buf2, tl_buf3, + tl_pos_neg_table, tl_0_idx_table, tl_buf, fmt); + + bf16_emit_mul(ctx, tl_ofmap_bf16, tl_buf, tl_ofmap_bf16, fmt); + + // atan(y/x) + PI , x < 0 and y >= 0 + bf16_emit_min_const(ctx, x, tl_buf, fmt, 0.0); + _bf16_atan2_fast_emit_case_3(ctx, + y, + tl_buf, + tl_buf2, + tl_y0_buf, + tl_invert_buf, + tl_pos_neg_table, + tl_table_answer, + tl_table_answer_mantissa, + tl_buf4, + fmt, M_PI); + //bf16_emit_add_const(ctx, tl_buf4, tl_buf4, fmt, M_PI); + + // get index map that x < 0 and y >= 0 + // !(y >= 0) = !(y < 0) +#if 0 + bf16_emit_pos_idx(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // y == 0 + bf16_emit_0_idx(ctx, y, tl_buf, tl_0_idx_table, tl_buf2, fmt); + bf16_emit_add(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); +#else + // y >= 0 + bf16_emit_mask_ge0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf2, fmt); +#endif + // x < 0 + bf16_emit_mask_lt0(ctx, x, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // x < 0 && y >= 0 + bf16_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + + bf16_emit_mul(ctx, tl_buf4, tl_buf2, tl_buf, fmt); + bf16_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // atan(y/x) - PI , x < 0 and y < 0 + bf16_emit_min_const(ctx, x, tl_buf, fmt, 0.0); + _bf16_atan2_fast_emit_case_3(ctx, + y, + tl_buf, + tl_buf2, + tl_y0_buf, + tl_invert_buf, + tl_pos_neg_table, + tl_table_answer, + tl_table_answer_mantissa, + tl_buf4, + fmt, 0.0); + bf16_emit_add_const(ctx, tl_buf4, tl_buf4, fmt, -1.0 * M_PI); + // x < 0 and y < 0 + + // we leverage x <= 0 and y <= 0 cuz we filter out x = 0 case, speed up it + // x < 0 + bf16_emit_mask_lt0(ctx, x, tl_buf, tl_pos_neg_table, tl_buf2, fmt); + // y < 0 + bf16_emit_mask_lt0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // x < 0 && y < 0 + bf16_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + + bf16_emit_mul(ctx, tl_buf4, tl_buf2, tl_buf, fmt); + bf16_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // pi / 2, x = 0 and y > 0 + // x = 0 + bf16_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1 + + // y > 0 + //bf16_emit_mask_gt0(ctx, y, tl_buf, tl_buf5, tl_buf4, tl_pos_neg_table, tl_0_idx_table, tl_buf3, fmt); + _bf16_emit_mask(ctx, y, tl_buf, tl_buf4, NULL, tl_pos_neg_table, tl_0_idx_table, tl_buf3, fmt, BF16_MASK_TYPE_GT_0, true); + // x = 0 && y > 0 + bf16_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + bf16_emit_mul_const(ctx, tl_buf2, tl_buf2, fmt, M_PI / 2.0); + + bf16_emit_add(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // -pi / 2, x = 0 and y < 0 + // x = 0 + bf16_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1 + // y < 0 + bf16_emit_mask_lt0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // x = 0 && y < 0 + bf16_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + + bf16_emit_mul_const(ctx, tl_buf2, tl_buf2, fmt, -1.0 * M_PI / 2.0); + + bf16_emit_add(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // 0, x = 0 and y = 0 + // x = 0 + bf16_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1 + // y = 0 + bf16_emit_mask_eq0(ctx, y, tl_buf, tl_0_idx_table, tl_buf3, fmt); // 0.003 could consider 1 + + // x = 0 && y = 0 + bf16_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf3, fmt); + + // !(x = 0 and y = 0) keep it + bf16_emit_0_1_revert_input(ctx, tl_buf3, tl_buf, tl_buf2, fmt); + bf16_emit_mul(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt); +} + +static void _x_lt_0(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* x, + bmk1880v2_tensor_lmem_t* tl_buf, + 
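+                    /* Builds the x < 0 mask without a compare op: min(x, 0)
+                     * keeps only the negative part, the 2^64 multiply blows any
+                     * negative value up so the bf16 -> int8 copy saturates, and
+                     * the two fixed-point multiplies (by -128 with relu, then by
+                     * 1 with a 7-bit right shift) squash that into a per-element
+                     * 0/1 value in tl_buf2. Scalar equivalent: mask = (x < 0) ? 1 : 0.
+                     */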
bmk1880v2_tensor_lmem_t* index_i8, + fmt_t fmt, + bmk1880v2_tensor_lmem_t* OUT tl_buf2 + ) { + + bmk1880v2_tiu_element_wise_min_param_t p7; + bmk1880v2_tiu_element_wise_mul_param_t p; + bmk1880v2_tdma_l2l_tensor_copy_param_t p1; + + memset(&p7, 0, sizeof(p7)); + memset(&p, 0, sizeof(p)); + memset(&p1, 0, sizeof(p1)); + + // x < 0 + p7.min = tl_buf; + p7.a = x; + p7.b_is_const = 1; + p7.b_const.val = 0; + p7.b_const.is_signed = 1; + bmk1880v2_tiu_element_wise_min(ctx, &p7); + bf16_emit_mul_const(ctx, tl_buf, tl_buf, fmt, pow(2, 64)); + + p1.src = tl_buf; + p1.dst = index_i8; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p1); + + p.res_high = 0; + p.res_low = index_i8; + p.a = index_i8; + p.b_is_const = 1; + p.b_const.val = -128; + p.b_const.is_signed = 1; + p.rshift_bits = 0; + p.relu_enable = 1; + bmk1880v2_tiu_element_wise_mul(ctx, &p); + + p.res_high = 0; + p.res_low = index_i8; + p.a = index_i8; + p.b_is_const = 1; + p.b_const.val = 1; + p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 1; + bmk1880v2_tiu_element_wise_mul(ctx, &p); + + // get x < 0 + p1.src = index_i8; + p1.dst = tl_buf2; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p1); + +} + +static void _bf16_atan2_merge_emit(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* y, + bmk1880v2_tensor_lmem_t* x, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t* tl_buf2, + bmk1880v2_tensor_lmem_t* tl_buf3, + bmk1880v2_tensor_lmem_t* tl_y0_buf, + bmk1880v2_tensor_lmem_t* tl_invert_buf, + bmk1880v2_tensor_lmem_t* tl_pos_neg_table, + bmk1880v2_tensor_lmem_t* tl_table_answer, + bmk1880v2_tensor_lmem_t* tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt, float degree_factor) +{ + bf16_table_check(y, tl_y0_buf, tl_invert_buf, x); + bf16_table_check(tl_buf, tl_invert_buf, tl_pos_neg_table, tl_buf2); + bf16_table_check(tl_buf3, tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16); + + + bmk1880v2_tensor_lmem_t index_i8; + bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(ctx, &index_i8, tl_buf2, FMT_I8); + + /** + * step 1. 
atan(y/x) + */ + bf16_emit_mul_const(ctx, tl_buf, tl_buf, fmt, 0.0); + bf16_emit_add(ctx, x, tl_buf, tl_buf, fmt); + +#if 0 + // get y < 0, bf16->int8 and mul 0xff to get -128 and righshift to get 1 + bmk1880v2_tiu_element_wise_mul_param_t p; + bmk1880v2_tdma_l2l_tensor_copy_param_t p1; + bf16_emit_mul_const(ctx, y, tl_buf3, fmt, pow(2,64)); + p1.src = tl_buf3; + p1.dst = &index_i8; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p1); + + + p.res_high = 0; + p.res_low = &index_i8; + p.a = &index_i8; + p.b_is_const = 1; + p.b_const.val = -128; + p.b_const.is_signed = 1; + p.rshift_bits = 0; + p.relu_enable = 1; + bmk1880v2_tiu_element_wise_mul(ctx, &p); + + p.res_high = 0; + p.res_low = &index_i8; + p.a = &index_i8; + p.b_is_const = 1; + p.b_const.val = 1; + p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 1; + bmk1880v2_tiu_element_wise_mul(ctx, &p); + + // get y < 0 + p1.src = &index_i8; + p1.dst = tl_buf3; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p1); + bf16_emit_mul_const(ctx, tl_buf3, tl_buf3, fmt, -1.0); + + // get y > 0 + // y * (-1) + 1 get 0/1 map, 1 indicate xy == 0 + bf16_emit_add_const(ctx, tl_buf3, tl_buf2, fmt, 1.0); + + // reduce y == 0 + if (0) + { + bmk1880v2_tiu_element_wise_max_param_t p3; + bmk1880v2_tensor_lmem_t index_i8; + bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(ctx, &index_i8, tl_ofmap_bf16, FMT_I8); + bf16_emit_mul_const(ctx, y, tl_buf, fmt, -1); + p3.max = tl_buf; + p3.a = y; + p3.b_is_const = 0; + p3.b = tl_buf; + bmk1880v2_tiu_element_wise_max(ctx, &p3); + bf16_emit_mul_const(ctx, tl_buf, tl_buf, fmt, convert_bf16_fp32(0x7f00)); + //bf16_emit_mul_const(ctx, tl_buf, tl_buf, fmt, pow(2, 64)); + + p1.src = tl_buf; + p1.dst = &index_i8; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p1); + + p.res_high = NULL; + p.res_low = &index_i8; + p.a = &index_i8; + p.b_is_const = 1; + p.b_const.val = -1; + p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(ctx, &p); + + + p1.src = &index_i8; + p1.dst = tl_buf3; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p1); + + //revert it + bf16_emit_mul_const(ctx, tl_buf3, tl_buf3, fmt, -1.0); + //bf16_emit_add_const(ctx, tl_buf3, tl_buf3, fmt, 1); + bf16_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + } + + bf16_emit_add(ctx, tl_buf2, tl_buf3, tl_buf3, fmt); +#endif + + bf16_emit_x_over_y(ctx, y, tl_buf, NULL, tl_buf2, + tl_table_answer, tl_table_answer_mantissa, fmt, true); + + // x0 = atan(y0) + __bf16_atan_fast_emit(ctx, + tl_buf2, + tl_buf, + tl_buf3, + tl_y0_buf, + tl_invert_buf, + tl_pos_neg_table, + tl_table_answer, + tl_table_answer_mantissa, + OUT tl_ofmap_bf16, + fmt); + + bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(ctx, &index_i8, tl_buf, FMT_I8); + + // seperate y >= 0 or < 0 to handle 0 degree / 180 degree + bf16_emit_mask_ge0_lt0( + ctx, + y, + &index_i8, + tl_buf3, + fmt + ); + + + /** + * step 2. 
set x == 0, y >=0 to pi/2, y < 0 to -pi/2 + * FIXME: atan(0) not eq PI/2 + */ + + // x = 0 and y != 0 + // reset all x = 0 + // y >= 0 as pi/2, y < 0 as -pi/2 + // merge + + bf16_emit_mask_eq_0(ctx, x, tl_buf, &index_i8, tl_buf2, fmt); + + // clear x = 0 + bf16_emit_mul_const(ctx, tl_buf2, tl_buf, fmt, -1); + bf16_emit_mul(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // get revert map, x = -x + 1 cuz original -1 menas x != 0 + bf16_emit_mul_const(ctx, tl_buf3, tl_buf, fmt, M_PI_2 * degree_factor); + bf16_emit_add_const(ctx, tl_buf2, tl_buf2, fmt, 1); + + bf16_emit_mul(ctx, tl_buf, tl_buf2, tl_buf, fmt); + + bf16_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + +// return; + /** + * step 3. handle x < 0 && y != 0 + */ + + // x < 0 + _x_lt_0(ctx, x, tl_buf, &index_i8, fmt, tl_buf2); + + // x < 0 && (y >= 1 && y < 1) + bf16_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf, fmt); + bf16_emit_mul_const(ctx, tl_buf, tl_buf, fmt, M_PI * degree_factor); + bf16_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + + /** + * 4. handle x != 0 && y == 0, x>0: 0, x<0: PI, tpu atan default all pi/2 + */ + // tl_buf2 as x < 0 + // get y == 0, tl_buf3 keep y>=0 is 1, y<1 = -1 + bf16_emit_mask_eq_0(ctx, y, tl_buf, &index_i8, tl_buf3, fmt); + // revert + bf16_emit_mul_const(ctx, tl_buf3, tl_buf, fmt, -1.0); + + // reset y = 0 x = ? as 0, other case leave to step 5 + bf16_emit_mul(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + /** + * 5. set y == 0 and x < 0 as pi + */ + + // get y == 0 + bf16_emit_add_const(ctx, tl_buf3, tl_buf, fmt, 1.0); + // y == 0 && x < 0 + bf16_emit_mul(ctx, tl_buf, tl_buf2, tl_buf, fmt); + bf16_emit_mul_const(ctx, tl_buf, tl_buf, fmt, M_PI * degree_factor); + + // merge + bf16_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + return; +} + +/** + * \brief reduce lut table with following step + * 1. atan(y/x) + * 2. handle x = 0 && y != 0, directly set pi/2, -pi/2 + * 3. handle x < 0 && y != 0 + * => y>0: PI/2, y <0: -PI/2, tpu atan default y>0: -PI/2, y <0: PI/2 + * 4. handle x != 0 && y == 0, x>0: 0, x<0: PI, tpu atan default all pi/2 + * 5. 
handle x = 0 && y = 0 => PI + */ +void bf16_atan2_merge_emit(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* y, + bmk1880v2_tensor_lmem_t* x, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t* tl_buf2, + bmk1880v2_tensor_lmem_t* tl_buf3, + bmk1880v2_tensor_lmem_t* tl_y0_buf, + bmk1880v2_tensor_lmem_t* tl_invert_buf, + bmk1880v2_tensor_lmem_t* tl_pos_neg_table, + bmk1880v2_tensor_lmem_t* tl_table_answer, + bmk1880v2_tensor_lmem_t* tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt) +{ + return + _bf16_atan2_merge_emit(ctx, + y, + x, + tl_buf, + tl_buf2, + tl_buf3, + tl_y0_buf, + tl_invert_buf, + tl_pos_neg_table, + tl_table_answer, + tl_table_answer_mantissa, + tl_ofmap_bf16, + fmt, 1.0); +} + +void bf16_atan2_fast_degree_emit(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* y, + bmk1880v2_tensor_lmem_t* x, + bmk1880v2_tensor_lmem_t* tl_buf, + bmk1880v2_tensor_lmem_t* tl_buf2, + bmk1880v2_tensor_lmem_t* tl_buf3, + bmk1880v2_tensor_lmem_t* tl_y0_buf, + bmk1880v2_tensor_lmem_t* tl_invert_buf, + bmk1880v2_tensor_lmem_t* tl_pos_neg_table, + bmk1880v2_tensor_lmem_t* tl_table_answer, + bmk1880v2_tensor_lmem_t* tl_table_answer_mantissa, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + fmt_t fmt) +{ + return + _bf16_atan2_merge_emit(ctx, + y, + x, + tl_buf, + tl_buf2, + tl_buf3, + tl_y0_buf, + tl_invert_buf, + tl_pos_neg_table, + tl_table_answer, + tl_table_answer_mantissa, + tl_ofmap_bf16, + fmt, 180/M_PI); +} diff --git a/cvikernel/src/bm1880v2/non_atomic/tiu_reciprocal.c b/cvikernel/src/bm1880v2/non_atomic/tiu_reciprocal.c new file mode 100644 index 000000000..449476264 --- /dev/null +++ b/cvikernel/src/bm1880v2/non_atomic/tiu_reciprocal.c @@ -0,0 +1,167 @@ +/** + */ +#include "gen_lut.h" +#include + +//#define DBG + +/* + * NOTICE: it could occupy 2 lookup table size which shape is <1,32,32,8> with bf16 data type + * + * \tl_buf tmp buffer, the shape MUST be same with \tl_ifmap + * \tl_ofmap_bf16 result as bf16, MUST given for tmp buffer used + */ +int bf16_emit_reciprocal(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t* IN tl_buf, + bmk1880v2_tensor_lmem_t *tbl_answer, + bmk1880v2_tensor_lmem_t *tbl_answer_mantissa, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16 + ) { + + return bf16_lut_exp_mantissa(ctx, + tl_ifmap, + tl_buf, + tbl_answer, + tbl_answer_mantissa, + tl_ofmap_bf16 + ); +} + +// 0, exp from 0 -62 -61 .. 
62 63 + for (int i = 0; i < half - 1; i++) { + int shift = (exp_start + i); + uint8_t is_odd = (shift % 2); + float exp = shift; + if (is_odd) { + exp = exp - 1; + } + + double s = _gen_reciprocal(2, exp); + table_data[idx] = convert_fp32_bf16(s); +#ifdef DBG + printf("t [%lu] is %f [idx:%f][2^%f] bf %x\n", idx, + convert_bf16_fp32(table_data[idx]), + (float)(exp_start + i), -1 * exp, + table_data[idx]); +#endif + idx++; + } + + s = _gen_reciprocal(2, -0); + table_data[idx] = convert_fp32_bf16(s); + table_data[idx] = 0x7F80; //c; i++) { + memcpy(&table_data[i * table_hw], &table_data[0], sizeof(uint16_t) * table_hw); + } +} + +void bf16_gen_reciprocal_mantissa(uint16_t* OUT table_mantissa, + bmk1880v2_tensor_lmem_shape_t* table_shape) { + + assert(is_1880v2_tbl_shape(table_shape)); + + uint32_t half = half_h_table(); + int table_hw = bf16_table_hw(); + + int idx = 0; + double d; + for (uint32_t i = 0; i < half; i++) { + d = 1 + i * 1 / 128.0; + d = (double) pow(d, -1); + table_mantissa[128+idx] = convert_fp32_bf16(d); + + //13=2^3x1.625=(2^2)x(2^1x1.625) + d = 2 * (1 + i * 1 / 128.0); + d = (double) pow(d, -1); + table_mantissa[idx] = convert_fp32_bf16(d); + idx++; + } + +#ifdef DBG + for (uint32_t i = 0; i < 2 * half; i++) { + printf("mantissa [%u] is %lf, 0x%x\n", i, convert_bf16_fp32(table_mantissa[i]), + table_mantissa[i]); + } +#endif /* ifdef DBG */ + + // duplicate channel #1 to #31 + //TODO: tensor copy + for (uint64_t i = 1; i < table_shape->c; i++) { + memcpy(&table_mantissa[table_hw * i], &table_mantissa[0], sizeof(uint16_t) * table_hw); + } +} + +void bf16_reciprocal_tbl(uint16_t *table_data, uint16_t* table_mantissa, + bmk1880v2_tensor_lmem_shape_t* table_shape) { + + assert(table_data); + assert(table_mantissa); + assert(table_shape); + + bf16_gen_reciprocal(table_data, table_shape); + bf16_gen_reciprocal_mantissa(table_mantissa, table_shape); +} diff --git a/cvikernel/src/bm1880v2/non_atomic/tiu_reshape_c.c b/cvikernel/src/bm1880v2/non_atomic/tiu_reshape_c.c new file mode 100644 index 000000000..d717e9b2f --- /dev/null +++ b/cvikernel/src/bm1880v2/non_atomic/tiu_reshape_c.c @@ -0,0 +1,438 @@ +/** + * reshape channel under depthwise + */ +// + +#include "gen_lut.h" +#include + +//#define DBG +// copy from \1880v2_test_util.h +static int calc_dilute_hw(int h, int ins_h, int ins_h_l, int pad_h_b, int pad_h_t) +{ + return (h - 1) * (ins_h + 1) + ins_h_l + + 1 + pad_h_t + pad_h_b; +} + +// get padding as 'SAME' mode in tensorflow +// https://www.jianshu.com/p/05c4f1621c7e +static int get_same_pad(int ih, int sh, int kh) { + return (((ih + sh - 1) / sh) - 1) * sh + kh - ih; +} + +// get real 'h' with pad/ins +static int pooling_ih_ext(int ins_h, int ins_last_h, int pad_top, int pad_bottom, int ih) +{ + int ins = ins_h; + int ins_last = ins_last_h; + int pad = pad_top + pad_bottom; + return (ih - 1) * (ins + 1) + ins_last + 1 + pad; +} + +// get real 'w' with pad/ins +static int pooling_iw_ext(int ins_w, int ins_last_w, int pad_left, int pad_right, int iw) +{ + int ins = ins_w; + int ins_last = ins_last_w; + int pad = pad_left + pad_right; + return (iw - 1) * (ins + 1) + ins_last + 1 + pad; +} + +// get output h with parameter +static int pooling_oh( + int ins_h, int ins_last_h, int pad_top, int pad_bottom, + int stride_h, int ih, int kh, int dh) +{ + int ih_ext = pooling_ih_ext(ins_h, ins_last_h, pad_top, pad_bottom, ih); + int d_h = (kh -1) * dh + 1; + return (ih_ext - d_h) / stride_h + 1; +} + +// get output w with parameter +static int pooling_ow( + int ins_w, int ins_last_w, 
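+                       /* Worked example of the helpers above, e.g. ih = 10,
+                        * kh = 3, stride_h = 2 with no dilation/insertion:
+                        *   pad    = (((10 + 2 - 1) / 2) - 1) * 2 + 3 - 10 = 1
+                        *   ih_ext = 10 + 1 = 11
+                        *   oh     = (11 - 3) / 2 + 1 = 5   (= ceil(10 / 2), i.e. 'SAME')
+                        */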
int pad_left, int pad_right, + int stride_w, int iw, int kw, int dw) +{ + int iw_ext = pooling_iw_ext(ins_w, ins_last_w, pad_left, pad_right, iw); + int d_w = (kw -1) * dw +1; + return (iw_ext - d_w) / stride_w + 1; +} + +/** + * \brief get extended bias + * \return allocated new bias + */ +uint32_t* bm1880v2_reshape_channel_bias(uint8_t* bias, + int ni, int ci, int hi, int wi, + int old_bias_c, fmt_t fmt + ) { + + assert(bias); + assert((ni == 2 || ni == 1) && "not support bias batch > 1"); + assert(ci / old_bias_c > 0 && ci % old_bias_c == 0); + int sz = fmt == FMT_BF16 ? 4 : 2; + + int d_c_bias_sz = ni * ci * hi * wi; + uint8_t *new_bias = (uint8_t *)malloc(d_c_bias_sz * sz); + int bias_hw = hi * wi; + int duplicat_c = ci / old_bias_c; + + for (int c = 0; c < old_bias_c; c++) { + int shift = (c * bias_hw) * sz; + for (int i = 0; i < duplicat_c; i++) { + int new_bias_shift = (c * duplicat_c + i) * bias_hw * sz; + memcpy(&new_bias[new_bias_shift], &bias[shift], bias_hw * sz); + } + } + return (uint32_t* )new_bias; +} + +/* + * \brief prepare load shape/stride + * \return -1 means fail to reshape, 0 means success + * \TODO check memory usage + */ +static inline int _get_dup_shape( + bmk1880v2_context_t *bk_ctx, + int in, int ic, int ih, int iw, + int d_kh, int stride_h, int npu_num, + bmk1880v2_tensor_lmem_shape_t* tl_shape, bmk1880v2_tensor_lmem_stride_t* tl_load_stride, + bmk1880v2_tensor_tgmem_shape_t* tg_shape, bmk1880v2_tensor_tgmem_stride_t* tg_stride, + fmt_t src_tg_fmt, fmt_t dst_tl_fmt + ) { + + assert(in > 0 && ic > 0 && ih > 0 && iw > 0 && d_kh > 0 && stride_h > 0); + assert(tl_shape && tl_load_stride && tg_shape && tg_stride); + + // 1. reshape and extend c, h axis in order + int ch = ic * ih; + int oc; + int oh; + + // FIXME: check kernel setting + oh = 0; + + for (int i = npu_num/ic; i > 0; i--) { +#if 0 + int hw = ih * iw; + int _oh = hw / i / iw; + if (hw % i == 0 && (hw / i) % stride_h == 0 && _oh >= stride_h) { + oh = _oh; + break; + } +#else + int _oh = ih / i; + if (ih % i == 0 && (_oh) % stride_h == 0 && _oh >= stride_h /*&& _oh >= d_kh*/) { + oh = _oh; + break; + } +#endif + } + + + if (!oh) { + // FIXME: check terminal condition + return -1; + } + + oc = ch / oh; + +#ifdef DBG + printf ("ic:ih is %d %d, oc:oh is %d:%d\n", ic, ih, oc, oh); +#endif + + // tg/tl MUST be same shape size + tl_shape->n = tg_shape->n = 1; + tl_shape->c = tg_shape->c = oc; + tl_shape->h = tg_shape->h = oh; + tl_shape->w = tg_shape->w = iw; + + // init tl + bmk1880v2_tensor_lmem_stride_t s = + bmk1880v2_tensor_lmem_default_stride(bk_ctx, *tl_shape, dst_tl_fmt, CTRL_NULL); + tl_load_stride->n = s.n; + tl_load_stride->c = s.c; + tl_load_stride->h = s.h; + tl_load_stride->w = s.w; + + // init tg + bmk1880v2_tensor_tgmem_stride_t gs = + bmk1880v2_tensor_tgmem_default_stride(*tg_shape, src_tg_fmt); + + tg_stride->n = gs.n; + tg_stride->c = gs.c; + tg_stride->h = gs.h; + + return 0; +} + + +/** + * \brief get proper reshape size for depthwise conv with 'same' mode in h direction + * \return -1 means alloc fail + * \NOTICE: not support batch/ins_x/dilated_x/pad_top/pad_bottom + */ +int bm1880v2_reshape_channel_same( + bmk1880v2_context_t *bk_ctx, + int ic, int ih, int iw, int kh, int kw, + int pad_right, int pad_left, int stride_h, int stride_w, + bmk1880v2_tensor_lmem_shape_t* tl_load_shape, + bmk1880v2_tensor_lmem_stride_t* new_tl_ifmap_stride, + bmk1880v2_tensor_tgmem_shape_t* new_tg_ifmap_shape, + bmk1880v2_tensor_tgmem_stride_t* new_tg_ifmap_stride, + bmk1880v2_tensor_lmem_shape_t* 
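+    /* _get_dup_shape() trades h for c so a depthwise tile occupies more NPU
+     * lanes. Example, assuming npu_num = 32: ic = 4, ih = 16, stride_h = 2
+     * starts the search at i = 32 / 4 = 8; 16 % 8 == 0, (16 / 8) % 2 == 0 and
+     * 16 / 8 >= 2 all hold, so oh = 2 and oc = (4 * 16) / 2 = 32. A
+     * <1,4,16,w> tile is therefore loaded as <1,32,2,w>, two rows per lane.
+     */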
new_tl_weight_shape, + bmk1880v2_tensor_lmem_shape_t* new_tl_bias_shape, + bmk1880v2_tensor_lmem_shape_t* new_tl_ofmap_shape, + fmt_t fmt, int eu_align) { + + assert(eu_align == 0 || eu_align == 1); + + bmk1880v2_chip_info_t chip_info = bmk1880v2_chip_info(); + // TODO: verify dilation_h/dilation_w + int dilation_h = 1; + int dilation_w = 1; + // TODO: verify p->ins_h, p->ins_last_h + int d_kh = calc_dilute_hw(kh, dilation_h - 1, 0, 0, 0); + int h_after = calc_dilute_hw(ih, 0, 0, 0, 0); + int in = 1; + //int h_after = calc_dilute_hw(ih, p->ins_h, p->ins_last_h, p->pad_top, p->pad_bottom); + //int w_after = calc_dilute_hw(iw, p->ins_w, p->ins_last_w, p->pad_left, p->pad_right); + int ret = _get_dup_shape(bk_ctx, in, ic, h_after, iw, d_kh, stride_h, chip_info.npu_num, + tl_load_shape, new_tl_ifmap_stride, new_tg_ifmap_shape, new_tg_ifmap_stride, + fmt, fmt); + + if (ret == -1) { + return ret; + } + + new_tl_weight_shape->n = 1; + new_tl_weight_shape->c = tl_load_shape->c; + new_tl_weight_shape->h = kh; + new_tl_weight_shape->w = kw; + + new_tl_bias_shape->n = 2; + new_tl_bias_shape->c = tl_load_shape->c; + new_tl_bias_shape->h = 1; + new_tl_bias_shape->w = 1; + + int pad_h = get_same_pad(tl_load_shape->h, stride_h, kh); + //int no_pad_h = tl_load_shape->h; + + // reserve for padding + new_tg_ifmap_shape->h += pad_h; + tl_load_shape->h += pad_h; + + bmk1880v2_tensor_lmem_stride_t s = + bmk1880v2_tensor_lmem_default_stride(bk_ctx, *tl_load_shape, fmt, eu_align); + + new_tl_ifmap_stride->n = s.n; + new_tl_ifmap_stride->c = s.c; + new_tl_ifmap_stride->h = s.h; + new_tl_ifmap_stride->w = s.w; + + // TODO: verity ins_x + int oh = pooling_oh(0, 0, 0, 0, + stride_h, tl_load_shape->h, kh, dilation_h); + int ow = pooling_ow(0, 0, pad_left, pad_right, + stride_w, tl_load_shape->w, kw, dilation_w); + +#ifdef DBG + printf("new oh/ow pad_h is %d/%d %d\n", oh, ow, pad_h); +#endif + new_tl_ofmap_shape->n = in; + new_tl_ofmap_shape->c = tl_load_shape->c; + new_tl_ofmap_shape->h = oh; + new_tl_ofmap_shape->w = ow; + + return ret; +} + +/* + * \brief duplicate weight for reshaped c + */ +uint8_t* bm1880v2_reshape_channel_weight(uint8_t* weight, + int ni, int ci, int hi, int wi, + int old_weight_c, + fmt_t fmt) { + + assert(weight); + assert(ci / old_weight_c > 0 && ci % old_weight_c == 0); + + int sz = fmt == FMT_BF16 ? 
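+                 /* The loop below copies each original kh*kw filter
+                  * duplicat_c = ci / old_weight_c times, back to back, so the
+                  * duplicated weights line up channel-for-channel with the
+                  * input reshaped by _get_dup_shape(). E.g. old_weight_c = 4,
+                  * ci = 32: filter c lands in new channels c*8 .. c*8+7.
+                  */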
2 : 1; + + int new_weight_hw_shape_size = hi * wi; + int new_weight_shape_size = ni * ci * hi * wi; + int duplicat_c = ci / old_weight_c; + uint8_t *new_weight = (uint8_t *)malloc(new_weight_shape_size * sz); + + + for (int n = 0; n < ni; n++) { + for (int c = 0; c < old_weight_c; c++) { + int index = (n * old_weight_c + c) * new_weight_hw_shape_size * sz; + for (int i = 0; i < duplicat_c; i++) { + int new_weight_index = (n * old_weight_c * duplicat_c + + c * duplicat_c + i) * new_weight_hw_shape_size * sz; + memcpy(&new_weight[new_weight_index], &weight[index], new_weight_hw_shape_size * sz); + } + } + } + + return new_weight; +} + +/* + * \brief prepare load shape/stride with pad + * \return -1 means fail to reshape, 0 means success + * \TODO check memory usage + */ +static inline int _get_dup_shape_same_pad( + bmk1880v2_context_t *bk_ctx, + int in, int ic, int ih, int iw, + int d_kh, int stride_h, int npu_num, + bmk1880v2_tensor_lmem_shape_t* tl_load_shape, + bmk1880v2_tensor_lmem_stride_t* tl_load_stride, + bmk1880v2_tensor_tgmem_shape_t* tg_shape, + bmk1880v2_tensor_tgmem_stride_t* tg_stride, + fmt_t src_tg_fmt, fmt_t dst_tl_fmt + ) { + + assert(in > 0 && ic > 0 && ih > 0 && iw > 0 && d_kh > 0 && stride_h > 0); + assert(tl_load_shape && tl_load_stride && tg_shape && tg_stride); + + // 1. reshape and extend c, h axis in order + int oc; + int oh; + + // FIXME: check kernel setting + oh = 0; + + // 2. get total output + // 3. slice output + assert((ih - d_kh) % stride_h == 0); + int ih_ext = pooling_ih_ext(0, 0, 0, 0, ih); + int _oh = (ih_ext - d_kh) / stride_h + 1; + + for (int i = npu_num/ic; i > 0; i--) { + if (_oh % i == 0) { + // add 1 for later padding + oh = stride_h * (_oh / i - 1) + 1; + oc = i * ic; + break; + } + } + + if (!oh) { + // FIXME: check terminal condition + return -1; + } + +#ifdef DBG + printf ("ic:ih is %d %d, oc:oh is %d:%d\n", ic, ih, oc, oh); +#endif + + // tg/tl MUST be same shape size + tl_load_shape->n = tg_shape->n = 1; + tl_load_shape->c = tg_shape->c = oc; + tl_load_shape->h = tg_shape->h = oh; + tl_load_shape->w = tg_shape->w = iw; + + // init tl + bmk1880v2_tensor_lmem_stride_t s = + bmk1880v2_tensor_lmem_default_stride(bk_ctx, *tl_load_shape, dst_tl_fmt, CTRL_NULL); + tl_load_stride->n = s.n; + tl_load_stride->c = s.c; + tl_load_stride->h = s.h; + tl_load_stride->w = s.w; + + // init tg + bmk1880v2_tensor_tgmem_stride_t gs = + bmk1880v2_tensor_tgmem_default_stride(*tg_shape, src_tg_fmt); + + tg_stride->n = gs.n; + tg_stride->c = gs.c; + tg_stride->h = gs.h; + + return 0; +} + +/** + * \brief get proper reshape size for depthwise conv with 'same' mode in h direction + * 'pad' means \ih is padded + * \return -1 means alloc fail + * \NOTICE: not support batch/ins_x/dilated_x/pad_top/pad_bottom + */ +int bm1880v2_reshape_channel_same_pad( + bmk1880v2_context_t *bk_ctx, + int ic, int ih, int iw, int kh, int kw, + int pad_right, int pad_left, int stride_h, int stride_w, + bmk1880v2_tensor_lmem_shape_t* tl_load_shape, + bmk1880v2_tensor_lmem_stride_t* new_tl_ifmap_stride, + bmk1880v2_tensor_tgmem_shape_t* new_tg_ifmap_shape, + bmk1880v2_tensor_tgmem_stride_t* new_tg_ifmap_stride, + bmk1880v2_tensor_lmem_shape_t* new_tl_weight_shape, + bmk1880v2_tensor_lmem_shape_t* new_tl_bias_shape, + bmk1880v2_tensor_lmem_shape_t* new_tl_ofmap_shape, + fmt_t fmt, int eu_align) { + + assert(eu_align == 0 || eu_align == 1); + + bmk1880v2_chip_info_t chip_info = bmk1880v2_chip_info(); + // TODO: verify dilation_h/dilation_w + int dilation_h = 1; + int dilation_w = 1; + // 
TODO: verify p->ins_h, p->ins_last_h + int d_kh = calc_dilute_hw(kh, dilation_h - 1, 0, 0, 0); + int h_after = calc_dilute_hw(ih, 0, 0, 0, 0); + int in = 1; + //int h_after = calc_dilute_hw(ih, p->ins_h, p->ins_last_h, p->pad_top, p->pad_bottom); + //int w_after = calc_dilute_hw(iw, p->ins_w, p->ins_last_w, p->pad_left, p->pad_right); + int ret = _get_dup_shape_same_pad(bk_ctx, in, ic, + h_after, iw, d_kh, stride_h, chip_info.npu_num, + tl_load_shape, new_tl_ifmap_stride, new_tg_ifmap_shape, new_tg_ifmap_stride, + fmt, fmt); + + if (ret == -1) { + return ret; + } + + new_tl_weight_shape->n = 1; + new_tl_weight_shape->c = tl_load_shape->c; + new_tl_weight_shape->h = kh; + new_tl_weight_shape->w = kw; + + new_tl_bias_shape->n = 2; + new_tl_bias_shape->c = tl_load_shape->c; + new_tl_bias_shape->h = 1; + new_tl_bias_shape->w = 1; + + int pad_h = get_same_pad(tl_load_shape->h, stride_h, kh); + //int no_pad_h = tl_load_shape->h; + + // reserve for padding + new_tg_ifmap_shape->h += pad_h; + tl_load_shape->h += pad_h; + + bmk1880v2_tensor_lmem_stride_t s = + bmk1880v2_tensor_lmem_default_stride(bk_ctx, *tl_load_shape, fmt, eu_align); + + new_tl_ifmap_stride->n = s.n; + new_tl_ifmap_stride->c = s.c; + new_tl_ifmap_stride->h = s.h; + new_tl_ifmap_stride->w = s.w; + + // TODO: verity ins_x + int oh = pooling_oh(0, 0, 0, 0, + stride_h, tl_load_shape->h, kh, dilation_h); + int ow = pooling_ow(0, 0, pad_left, pad_right, + stride_w, tl_load_shape->w, kw, dilation_w); + +#ifdef DBG + printf("new oh/ow pad_h is %d/%d %d\n", oh, ow, pad_h); +#endif + new_tl_ofmap_shape->n = in; + new_tl_ofmap_shape->c = tl_load_shape->c; + new_tl_ofmap_shape->h = oh; + new_tl_ofmap_shape->w = ow; + + return ret; +} diff --git a/cvikernel/src/bm1880v2/non_atomic/tiu_sigmoid.c b/cvikernel/src/bm1880v2/non_atomic/tiu_sigmoid.c new file mode 100644 index 000000000..71910b217 --- /dev/null +++ b/cvikernel/src/bm1880v2/non_atomic/tiu_sigmoid.c @@ -0,0 +1,277 @@ +/** + * implement Linear interpolation search + * + * we need to pass 2 table, one is answer(lut_answer), another is slope with anwser(lut_answer_slope), + * + * for example, we want to get x value + * +------+----+ + * x0 x x1 + * + * the [Linear interpolation defined] (https://en.wikipedia.org/wiki/Linear_interpolation) as flowing: + * + * part C part A part B + * +--+ +---+ +----------------------------------------+ + * + * p(x) = f(x0) + ( (f(x1) - f(x0)) / (x1 - x0) ) * (x - x0) + * + * +---+ +-----------------------------+ + * lut_answer lut_answer_slope + */ + +#include "gen_lut.h" +#include + +//#define DBG +/* + * NOTICE: it could occupy 2 lookup table size which shape is <1,32,32,8> with bf16 data type + * + * \tl_buf tmp buffer, the shape MUST be same with \tl_ifmap + * \tl_ofmap_bf16 result as bf16, MUST given for tmp buffer used + */ +int bf16_emit_sigmoid(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t* IN tl_buf, + bmk1880v2_tensor_lmem_t *tl_table_answer, + bmk1880v2_tensor_lmem_t *tl_table_answer_slope, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16, + float scale + ) { + bf16_table_check(tl_ifmap, tl_table_answer, tl_table_answer_slope, tl_buf); + assert_same_shape_3(tl_ifmap, tl_buf, tl_ofmap_bf16); + + fmt_t fmt = FMT_BF16; + + tl_shape_t tl_ofmap_A_idx_int8_shape = {1, tl_buf->shape.c, tl_buf->shape.h * tl_buf->shape.w, 1}; + + bmk1880v2_tdma_l2l_tensor_copy_param_t p10; + + // scale input for remap its idx(-x~x) to (-127~127), dirty tl_ifmap + bmk1880v2_tiu_element_wise_mul_param_t p1; + memset(&p1, 0, sizeof(p1)); + 
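+  /* Scalar model of the interpolated lookup implemented below (reference
+   * only): e.g. with range [-8, 8] the scale is 256 / 16 = 16, so after the
+   * multiply one LUT step equals one int8 index:
+   *   idx  = round(v * scale);                  // bf16 -> int8 copy, rnd mode 1
+   *   frac = v * scale - idx;
+   *   out  = answer[idx] + slope[idx] * frac;   // p(x) = f(x0) + slope * (x - x0)
+   */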
p1.res_high = NULL; + p1.res_low = tl_ifmap; + p1.a = tl_ifmap; + p1.b_is_const = 1; + p1.b_const.val = convert_fp32_bf16(scale); + p1.rshift_bits = 0; + p1.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(ctx, &p1); + + + // int8 + // save by stride + memset(&p10, 0x00, sizeof(bmk1880v2_tdma_l2l_tensor_copy_param_t)); + bmk1880v2_tensor_lmem_t dst; + memcpy(&dst, tl_ofmap_bf16, sizeof(bmk1880v2_tensor_lmem_t)); + dst.fmt = FMT_I8; + dst.shape = tl_ofmap_A_idx_int8_shape; + //dst.stride = bmk1880v2_tensor_lmem_default_stride(ctx, dst.shape, /*eu_align*/ 1, dst.fmt); + dst.stride = bmk1880v2_tensor_lmem_default_stride(ctx, dst.shape, dst.fmt, CTRL_NULL); + dst.stride.h = dst.stride.h * 2; + dst.int8_rnd_mode = 1; + p10.dst = &dst; + p10.src = tl_ifmap; + bmk1880v2_tdma_l2l_bf16_tensor_copy(ctx, &p10); + dst.int8_rnd_mode = 0; // reset + + // 16 + dst.fmt = fmt; + dst.shape = tl_buf->shape; + dst.stride = tl_buf->stride; + + // c; i++) { + memcpy(&table_data[i * table_hw], &table_data[0], sizeof(uint16_t) * table_hw); + } +} + +float bf16_sigmoid_scale(int range_start, int range_end) { + int table_hw = bf16_table_hw(); + return table_hw / (1.0 * abs(range_start - range_end)); // 256 / 16 = 16 +} + +void bf16_gen_sigmoid_slope(uint16_t* OUT table_slope, + bmk1880v2_tensor_lmem_shape_t* table_shape, + double *sigmode_hw, float scale, + int range_start, int range_end) { + + assert(is_1880v2_tbl_shape(table_shape)); + + int half = half_h_table(); + int table_hw = bf16_table_hw(); + + for (int i = 0; i < table_hw; i++) { + double x0 = sigmode_hw[i]; + double x1 = sigmode_hw[i+1]; + double delta = 1.0; + if (i == half - 1) { + // half) { + x0 = sigmode_hw[i]; + x1 = sigmode_hw[i-1]; + delta = -1.0; + } + double s = (x1 - x0) / delta; // x1 already scale up + table_slope[i] = convert_fp32_bf16((float)s); +#ifdef GDB + printf ("slope table [%u] = (bf16 %f double %.8lf float %f), 0x%x, %.8lf - %.8lf(%.8lf)\n", + i, convert_bf16_fp32(table_slope[i]), s, (float)s, table_slope[i], x1, x0, x1-x0); +#endif + } + + // duplicate channel #1 to #31 + + //TODO: tensor copy + for (uint64_t i = 1; i < table_shape->c; i++) { + memcpy(&table_slope[table_hw * i], &table_slope[0], sizeof(uint16_t) * table_hw); + } +} + +void bf16_sigmoid_tbl(uint16_t *sigmoid_table_data, uint16_t* sigmoid_table_data_slope, + bmk1880v2_tensor_lmem_shape_t* table_shape, + int range_start, int range_end + ) { + + assert(sigmoid_table_data); + assert(sigmoid_table_data_slope); + assert(table_shape); + + double* sigmode_hw = bf16_gen_sigmoid_double(); + + float scale = bf16_sigmoid_scale(range_start, range_end); + + bf16_gen_sigmoid(sigmoid_table_data, table_shape, sigmode_hw, scale, range_start); + + bf16_gen_sigmoid_slope(sigmoid_table_data_slope, + table_shape, sigmode_hw, scale, + range_start, range_end); + + bf16_free_sigmoid_double(sigmode_hw); +} diff --git a/cvikernel/src/bm1880v2/non_atomic/tiu_sqrt.c b/cvikernel/src/bm1880v2/non_atomic/tiu_sqrt.c new file mode 100644 index 000000000..412469958 --- /dev/null +++ b/cvikernel/src/bm1880v2/non_atomic/tiu_sqrt.c @@ -0,0 +1,138 @@ +/** + */ +#include "gen_lut.h" +#include + +//#define DBG +/* + * NOTICE: it could occupy 2 lookup table size which shape is <1,32,32,8> with bf16 data type + * + * \tl_buf tmp buffer, the shape MUST be same with \tl_ifmap + * \tl_ofmap_bf16 result as bf16, MUST given for tmp buffer used + */ +int bf16_emit_sqrt(ctx_t *ctx, + bmk1880v2_tensor_lmem_t* IN tl_ifmap, + bmk1880v2_tensor_lmem_t* IN tl_buf, + bmk1880v2_tensor_lmem_t *tbl_answer, + 
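+                    /* The answer/mantissa pair splits the argument as
+                     * x = 2^e * m with m in [1, 2): the answer table holds the
+                     * per-exponent factor and the mantissa table holds sqrt(m)
+                     * (plus sqrt(2m) entries so odd exponents can be folded,
+                     * since sqrt(2^e * m) = 2^((e-1)/2) * sqrt(2m) for odd e).
+                     * E.g. x = 8 = 2^3 * 1.0: 2^1 * sqrt(2) ~= 2.8284 = sqrt(8).
+                     * The reciprocal tables in tiu_reciprocal.c use the same
+                     * scheme with pow(., -1) in place of the square root.
+                     */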
bmk1880v2_tensor_lmem_t *tbl_answer_mantissa, + bmk1880v2_tensor_lmem_t* OUT tl_ofmap_bf16 +) { + + return bf16_lut_exp_mantissa(ctx, + tl_ifmap, + tl_buf, + tbl_answer, + tbl_answer_mantissa, + tl_ofmap_bf16 + ); +} + +static double _gen_sqrt(int base, int p) { + // y = x ^ 0.5 + double f = (double) (pow(base, p * 0.5)); + + if (isnan(f)) { + assert(0); + } + return f; +} + +void bf16_gen_sqrt(uint16_t *table_data, bmk1880v2_tensor_lmem_shape_t* table_shape) { + + assert(is_1880v2_tbl_shape(table_shape)); + + int exp_start = bf16_exp_start(); + int half = half_h_table(); + int table_hw = bf16_table_hw(); + uint64_t idx = 0; + + // prepare channel 0 + double s = 0.0; + table_data[idx] = convert_fp32_bf16(s); // 0^0.5 = 0 +#ifdef DBG + printf("t [%lu] is %f(%.8lf)[idx:%f][2^%f] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), s, (float)exp_start, (float)(exp_start/2), table_data[idx]); +#endif + idx++; + + // > 0, exp from 0 -62 -61 .. 62 63 + for (int i = 0; i < half; i++) { + int shift = (exp_start + i); + uint8_t is_odd = (shift % 2); + float exp = shift; + if (is_odd) { + exp = exp - 1; + } + + double s = _gen_sqrt(2, exp); + table_data[idx] = convert_fp32_bf16(s); +#ifdef DBG + printf("t [%lu] is %f [idx:%f][2^%f(%f)] bf %x\n", idx, + convert_bf16_fp32(table_data[idx]), + (float)(exp_start + i), exp/2, (exp_start + i) / 2.0, + table_data[idx]); +#endif + idx++; + } + + //// idx = 127 dont care + // duplicate channel #1 to #channel + //TODO: tensor copy + + for (uint32_t i = 1; i < table_shape->c; i++) { + memcpy(&table_data[i * table_hw], &table_data[0], sizeof(uint16_t) * table_hw); + } +} + +void bf16_gen_sqrt_mantissa(uint16_t* OUT table_mantissa, + bmk1880v2_tensor_lmem_shape_t* table_shape) { + + assert(is_1880v2_tbl_shape(table_shape)); + + uint32_t half = half_h_table(); + int table_hw = bf16_table_hw(); + + int idx = 0; + double d; + for (uint32_t i = 0; i < half; i++) { + d = 1 + i * 1 / 128.0; + d = (double) pow(d, 0.5); + table_mantissa[128+idx] = convert_fp32_bf16(d); +#ifdef DBG + //printf(", [%u] is %lf\n", i+128, d); +#endif /* ifdef DBG */ + + //13=2^3x1.625=(2^2)x(2^1x1.625) + d = 2 * (1 + i * 1 / 128.0); + d = (double) pow(d, 0.5); + table_mantissa[idx] = convert_fp32_bf16(d); +#ifdef DBG + //printf("mantissa [%u] is %lf", i, d); +#endif /* ifdef DBG */ + idx++; + } +#ifdef DBG + for (uint32_t i = 0; i < 2 * half; i++) { + printf("mantissa [%u] is %lf, 0x%x\n", i, convert_bf16_fp32(table_mantissa[i]), + table_mantissa[i]); + } +#endif /* ifdef DBG */ + + // duplicate channel #1 to #31 + //TODO: tensor copy + for (uint64_t i = 1; i < table_shape->c; i++) { + memcpy(&table_mantissa[table_hw * i], &table_mantissa[0], sizeof(uint16_t) * table_hw); + } +} + + +void bf16_sqrt_tbl(uint16_t *sqrt_table_data, uint16_t* sqrt_table_data_mantissa, + bmk1880v2_tensor_lmem_shape_t* table_shape + ) { + + assert(sqrt_table_data); + assert(sqrt_table_data_mantissa); + assert(table_shape); + + bf16_gen_sqrt (sqrt_table_data, table_shape); + bf16_gen_sqrt_mantissa(sqrt_table_data_mantissa, table_shape); +} diff --git a/cvikernel/src/bm1880v2/tdma.c b/cvikernel/src/bm1880v2/tdma.c new file mode 100644 index 000000000..fa7569b69 --- /dev/null +++ b/cvikernel/src/bm1880v2/tdma.c @@ -0,0 +1,1960 @@ +#include "kernel_1880v2.h" +#include "bmkernel/bm1880v2/1880v2_fp_convert.h" + +//n < 0x10000); + ASSERT(s->c < 0x10000); + ASSERT(s->h < 0x10000); + ASSERT(s->w < 0x10000); + + ASSERT(s->n > 0x0); + ASSERT(s->c > 0x0); + ASSERT(s->h > 0x0); + ASSERT(s->w > 0x0); +} + +static void 
check_tdma_tl_bf16_shape(const tl_shape_t *s, fmt_t fmt) +{ + uint8_t fmt_type = (fmt == FMT_BF16 ? 2 : 1); + ASSERT(s->n < 0x10000); + ASSERT(s->c < 0x10000); + ASSERT(s->h < 0x10000); + ASSERT(s->w < 0x10000 / fmt_type); + + ASSERT(s->n > 0x0); + ASSERT(s->c > 0x0); + ASSERT(s->h > 0x0); + ASSERT(s->w > 0x0); +} + +static void check_tdma_tg_shape(const tg_shape_t *s) +{ + ASSERT(s->n < 0x10000); + ASSERT(s->c < 0x10000); + ASSERT(s->h < 0x10000); + ASSERT(s->w < 0x10000); + + ASSERT(s->n > 0x0); + ASSERT(s->c > 0x0); + ASSERT(s->h > 0x0); + ASSERT(s->w > 0x0); +} + +static void check_tdma_tg_bf16_shape(const tg_shape_t *s, fmt_t fmt) +{ + uint8_t fmt_type = (fmt == FMT_BF16 ? 2 : 1); + ASSERT(s->n < 0x10000); + ASSERT(s->c < 0x10000); + ASSERT(s->h < 0x10000); + ASSERT(s->w < 0x10000 / fmt_type); + + ASSERT(s->n > 0x0); + ASSERT(s->c > 0x0); + ASSERT(s->h > 0x0); + ASSERT(s->w > 0x0); +} + + +static void check_tdma_ml_shape(const ml_shape_t *s) +{ + ASSERT(s->n < 0x10000); + ASSERT(s->c < 0x10000); + ASSERT(s->w < 0x10000); + ASSERT(s->col < 0x10000); + + ASSERT(s->n > 0); + ASSERT(s->c > 0); + ASSERT(s->w > 0); + ASSERT(s->col > 0); +} + +static void check_tdma_ml_bf16_shape(const ml_shape_t *s, fmt_t fmt) +{ + uint8_t fmt_type = (fmt == FMT_BF16 ? 2 : 1); + ASSERT(s->n < 0x10000); + ASSERT(s->c < 0x10000); + ASSERT(s->w < 0x10000 / fmt_type); + ASSERT(s->col < 0x10000); + + ASSERT(s->n > 0); + ASSERT(s->c > 0); + ASSERT(s->w > 0); + ASSERT(s->col > 0); +} + +static void check_tdma_mg_shape(const mg_shape_t *s) +{ + ASSERT(s->row < 0x10000); + ASSERT(s->col < 0x10000); + + ASSERT(s->row > 0x0); + ASSERT(s->col > 0x0); +} + +static void check_tdma_mg_bf16_shape(const mg_shape_t *s, fmt_t fmt) +{ + uint8_t fmt_type = (fmt == FMT_BF16 ? 2 : 1); + ASSERT(s->row < 0x10000); + ASSERT(s->col < 0x10000 / fmt_type); + + ASSERT(s->row > 0x0); + ASSERT(s->col > 0x0); +} + +static void check_tdma_tl(const tl_t *t) +{ + ASSERT(t); + ASSERT(t->fmt == FMT_I8 || t->fmt == FMT_U8 || t->fmt == FMT_BF16); + check_tdma_tl_shape(&t->shape); +} + +static void check_tdma_tl_bf16(const tl_t *t) +{ + ASSERT(t); + ASSERT(t->fmt == FMT_I8 || t->fmt == FMT_U8 || t->fmt == FMT_BF16); + check_tdma_tl_bf16_shape(&t->shape, t->fmt); +} + +static void check_tdma_tg(const tg_t *t) +{ + ASSERT(t); + ASSERT(t->base_reg_index < TDMA_NUM_BASE_REGS); + ASSERT(t->fmt == FMT_I8 || t->fmt == FMT_U8 || t->fmt == FMT_BF16); + check_tdma_tg_shape(&t->shape); +} + +static void check_tdma_tg_bf16(const tg_t *t) +{ + ASSERT(t); + ASSERT(t->base_reg_index < TDMA_NUM_BASE_REGS); + ASSERT(t->fmt == FMT_I8 || t->fmt == FMT_U8 || t->fmt == FMT_BF16); + check_tdma_tg_bf16_shape(&t->shape, t->fmt); +} + +static void check_tdma_compressed_tg(const compressed_tg_t *t) +{ + uint32_t stride_w = t->t.fmt == FMT_BF16 ? 
2 : 1; + + ASSERT(t); + ASSERT(t->t.base_reg_index < TDMA_NUM_BASE_REGS); + check_tdma_tg_shape(&t->t.shape); + ASSERT(!(t->t.start_address%0x10)); + + // Enable after backend fix + //ASSERT(t->t.stride.n == + // (t->t.shape.w * t->t.shape.h * t->t.shape.c * stride_w)); + + ASSERT(t->t.stride.c == (t->t.shape.w * t->t.shape.h * stride_w)); + ASSERT(t->t.stride.h == (t->t.shape.w * stride_w)); + // m.base_reg_index < TDMA_NUM_BASE_REGS); + ASSERT(!(t->m.start_address%0x10)); + + // the data should be continuous + if (t->m.fmt == FMT_BF16) { + ASSERT(t->m.stride.row == t->m.shape.col * 2); + } + else if (t->m.fmt == FMT_I8 || t->m.fmt == FMT_U8) { + ASSERT(t->m.stride.row == t->m.shape.col); + } + else { + ASSERT(0); //fmt == FMT_I8 || m->fmt == FMT_U8 || m->fmt == FMT_BF16); + check_tdma_ml_shape(&m->shape); +} + +static void check_tdma_ml_bf16(const ml_t *m) +{ + ASSERT(m); + ASSERT(m->fmt == FMT_I8 || m->fmt == FMT_U8 || m->fmt == FMT_BF16); + check_tdma_ml_bf16_shape(&m->shape, m->fmt); +} + +static void check_tdma_mg(const mg_t *m) +{ + ASSERT(m); + ASSERT(m->base_reg_index < TDMA_NUM_BASE_REGS); + check_tdma_mg_shape(&m->shape); +} + +static void check_tdma_mg_bf16(const mg_t *m) +{ + ASSERT(m); + ASSERT(m->base_reg_index < TDMA_NUM_BASE_REGS); + check_tdma_mg_bf16_shape(&m->shape, m->fmt); +} + +static void check_tdma_compress_mg(const compressed_mg_t *m) +{ + ASSERT(m); + ASSERT(m->m.base_reg_index < TDMA_NUM_BASE_REGS); + check_tdma_mg_shape(&m->m.shape); +} + +static void assert_tl_same_size(const tl_t *a, const tl_t *b) +{ + uint32_t a_size = a->shape.n * a->shape.c * a->shape.h * a->shape.w; + uint32_t b_size = b->shape.n * b->shape.c * b->shape.h * b->shape.w; + + ASSERT(a_size == b_size); +} + +static void assert_tl_tg_same_size(const tl_t *tl, const tg_t *tg) +{ + uint32_t tl_size = tl->shape.n * tl->shape.c * tl->shape.h * tl->shape.w; + uint32_t tg_size = tg->shape.n * tg->shape.c * tg->shape.h * tg->shape.w; + ASSERT(tl_size == tg_size); +} + +static void assert_ml_mg_same_size(const ml_t *ml, const mg_t *mg) +{ + uint32_t ml_size = ml->shape.n * ml->shape.col; + uint32_t mg_size = mg->shape.row * mg->shape.col; + + ASSERT(ml_size == mg_size); +} + +static uint64_t absolute_gmem_addr(uint64_t addr) +{ + return (addr & 0x0FFFFFFFFFF) + BM1880V2_GLOBAL_MEM_START_ADDR; +} + +static ec_desc_t * emit_tdma_cmdbuf(ctx_t *ctx, tdma_reg_t *reg) +{ + desc_pair_t *dp = bm1880v2_get_desc_pair(ctx, BMK1880v2_TDMA); + + reg->layer_ID = ctx->layer_id; + //ASSERT(reg->rsv5 != 0x0);// "this is debug use, it's fine for skip"; + + uint32_t *cmdbuf = (uint32_t *)dp->cmd_hdr->cmd; + emit_tdma_reg(reg, cmdbuf); + + return dp->ec_desc; +} + +static void fill_l2tg_fmt(tdma_reg_t *reg, fmt_t src_fmt, fmt_t dst_fmt) +{ + reg->dst_fmt = (dst_fmt == FMT_BF16) ? 2 : 1; + reg->src_fmt = (src_fmt == FMT_BF16) ? 2 : 1; + // check and decide bf16->int8 or bf16->uint8_t + reg->int8_sign = (dst_fmt == FMT_I8 ? 1 : 0);// | (dst_fmt == FMT_U8 ? 1 : 0); +} + +static void fill_tg2l_fmt(tdma_reg_t *reg, fmt_t src_fmt, fmt_t dst_fmt) +{ + reg->dst_fmt = (dst_fmt == FMT_BF16) ? 2 : 1; + reg->src_fmt = (src_fmt == FMT_BF16) ? 2 : 1; + // check and decide int8->bf16 or uint8_t->bf16 + reg->int8_sign = (src_fmt == FMT_I8 ? 1 : 0) ;//| (src_fmt == FMT_U8 ? 1 : 0); +} + +static void fill_l2l_fmt(tdma_reg_t *reg, fmt_t src_fmt, fmt_t dst_fmt) +{ + reg->dst_fmt = (dst_fmt == FMT_BF16) ? 2 : 1; + reg->src_fmt = (src_fmt == FMT_BF16) ? 
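+                   /* 2 and 1 are the TDMA descriptor type codes for bf16 and
+                    * 8-bit fixed point (compare set_int8_rnd_mode(), which
+                    * tests FMT_BF16_TYP / FMT_FIX8B_TYP); int8_sign marks the
+                    * 8-bit side as signed whenever it is FMT_I8.
+                    */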
2 : 1; + // check and decide bf16->int8 or bf16->uint8_t or int8->bf16 or uint8_t->bf16 + reg->int8_sign = (dst_fmt == FMT_I8 ? 1 : 0) | (src_fmt == FMT_I8 ? 1 : 0); +} + +static void fill_src_addr(tdma_reg_t *r, uint64_t addr) +{ + r->src_base_addr_low = (uint32_t)addr; + r->src_base_addr_high = (addr >> 32); +} + +static void fill_dst_addr(tdma_reg_t *r, uint64_t addr) +{ + r->dst_base_addr_low = (uint32_t)addr; + r->dst_base_addr_high = (addr >> 32); +} + +static void fill_src_c_stride(tdma_reg_t *r, uint32_t str) +{ + r->src_c_stride_low = (uint16_t)str; + r->src_c_stride_high = (str >> 16); +} + +static void fill_dst_c_stride(tdma_reg_t *r, uint32_t str) +{ + r->dst_c_stride_low = (uint16_t)str; + r->dst_c_stride_high = (str >> 16); +} + +static void set_int8_rnd_mode(tdma_reg_t *r, uint32_t int8_rnd_mode) +{ + if (int8_rnd_mode == 1) { + // int8 + if (r->src_fmt == FMT_BF16_TYP && r->dst_fmt == FMT_FIX8B_TYP) { + r->int8_rnd_mode = int8_rnd_mode; + } + } +} + + +/* + * Direction: L2L + */ + +bmk1880v2_op_t * bmk1880v2_tdma_l2l_tensor_copy( + ctx_t *ctx, + const bmk1880v2_tdma_l2l_tensor_copy_param_t *p) +{ + check_tdma_tl(p->src); + check_tdma_tl(p->dst); + ASSERT(p->src->shape.n == p->dst->shape.n); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 3; + reg.spec_func = 0; + reg.sys_dtype = 0; + + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + reg.outstanding_en = p->outstanding; + + return emit_tdma_cmdbuf(ctx, ®); +} + +bmk1880v2_op_t * bmk1880v2_tdma_l2l_bf16_tensor_copy( + ctx_t *ctx, + const bmk1880v2_tdma_l2l_tensor_copy_param_t *p) +{ + check_tdma_tl_bf16(p->src); + check_tdma_tl_bf16(p->dst); + ASSERT(p->src->shape.n == p->dst->shape.n); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 3; + reg.spec_func = 0; + reg.sys_dtype = 0; + fill_l2l_fmt(®, p->src->fmt, p->dst->fmt); + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + // does not allow open `mv_lut_idx and `mv_lut_basemv_lut_base at same time + if (p->mv_lut_idx == 1) { + reg.mv_lut_idx = p->mv_lut_idx; + } + + if (p->mv_lut_base == 1) { + reg.mv_lut_base = p->mv_lut_base; + } + + if (reg.mv_lut_idx == 1 && reg.mv_lut_base == 1) { + ASSERT(0); + } + + set_int8_rnd_mode(®, p->dst->int8_rnd_mode); + + reg.outstanding_en = p->outstanding; + + //trace_tdma_reg(®, __func__); + + return emit_tdma_cmdbuf(ctx, ®); +} + +static uint32_t addr_after_right_shift( + ctx_t *ctx, int addr, uint32_t step, int c_str) +{ + uint32_t npu_num = ctx->chip_info.npu_num; + uint32_t lmem_size = 
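+      /* Local memory is addressed lane-major: lane = addr / lmem_size,
+       * byte offset = addr % lmem_size. Shifting by `step` channels moves to
+       * lane (lane + step) % npu_num and, for every wrap past the last lane,
+       * advances the per-lane offset by one c stride. E.g. npu_num = 32,
+       * lane 30, step 5: new lane = 35 % 32 = 3, offset += (35 / 32) * c_str.
+       */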
ctx->chip_info.lmem_size;; + + uint32_t lmem_i = (addr / lmem_size + step) % npu_num; + uint32_t offset = addr % lmem_size + (addr / lmem_size + step) / npu_num * c_str; + return lmem_i * lmem_size + offset; +} + +bmk1880v2_op_t * bmk1880v2_tdma_l2l_tensor_lrn_shift( + ctx_t *ctx, + const bmk1880v2_tdma_l2l_tensor_lrn_shift_param_t *p) +{ + check_tdma_tl(p->src); + check_tdma_tl(p->dst); + assert_tl_same_size(p->src, p->dst); + ASSERT(p->src->shape.n == p->dst->shape.n); + ASSERT(p->src->shape.c == p->dst->shape.c); + ASSERT(p->src->shape.c > p->lrn_step); + ASSERT(p->src->shape.h * p->src->shape.w == + p->dst->shape.h * p->dst->shape.w); + ASSERT(p->lrn_step < 16); + + ASSERT(p->src->fmt == p->dst->fmt); + + int is_bf16 = (p->src->fmt == FMT_BF16) ? 1 : 0; + if (is_bf16) { + check_tdma_tl_bf16(p->src); + check_tdma_tl_bf16(p->dst); + } + + /* L2L lrn copy */ + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 3; + reg.spec_func = 0; + reg.sys_dtype = 0; + fill_l2l_fmt(®, p->src->fmt, p->dst->fmt); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c - p->lrn_step; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c - p->lrn_step; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (p->right_shift) { + uint32_t dst_addr = addr_after_right_shift( + ctx, p->dst->start_address, p->lrn_step, p->dst->stride.c); + + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + } else { + uint32_t src_addr = addr_after_right_shift( + ctx, p->src->start_address, p->lrn_step, p->src->stride.c); + + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + } + + if (is_bf16) + set_int8_rnd_mode(®, p->dst->int8_rnd_mode); + + emit_tdma_cmdbuf(ctx, ®); + + /* Constant fill with zero */ + reg.trans_dir = 0; + reg.spec_func = 4; + reg.const_val = is_bf16 ? 
convert_fp32_bf16(0.0): 0; + + reg.dst_c = p->lrn_step; + if (p->right_shift) { + uint32_t dst_addr = addr_after_right_shift( + ctx, p->dst->start_address, p->lrn_step, p->dst->stride.c); + + uint32_t lmem_size = ctx->chip_info.lmem_size;; + uint32_t npu_num = ctx->chip_info.npu_num; + uint32_t sht_num = p->lrn_step; + + uint32_t lmem_i = (dst_addr / lmem_size - sht_num) % npu_num; + uint32_t offset = (lmem_i + sht_num) / npu_num * p->dst->stride.c; + uint32_t zero_addr = lmem_i * lmem_size + dst_addr % lmem_size - offset; + + // printf(" lmem_i 0x%x, offset 0x%x, zero_addr 0x%x\n", + // lmem_i, offset, zero_addr); + + fill_dst_addr(®, zero_addr); + + } else { + uint32_t start_mem = p->dst->start_address / ctx->chip_info.lmem_size; + uint32_t cur_mem = (start_mem + (p->dst->shape.c - p->lrn_step)) % ctx->chip_info.npu_num; + uint32_t offset = + (p->dst->start_address % ctx->chip_info.lmem_size) + + ((start_mem + (p->dst->shape.c - p->lrn_step)) / ctx->chip_info.npu_num) * p->dst->stride.c; + uint32_t zero_addr = cur_mem * ctx->chip_info.lmem_size + offset; + + // printf(" start_mem 0x%x, cur_mem 0x%x, offset 0x%x, zero_addr 0x%x\n", + // start_mem, cur_mem, offset, zero_addr); + + fill_dst_addr(®, zero_addr); + } + + return emit_tdma_cmdbuf(ctx, ®); +} + +/* + * Direction: L2TG + */ + +static bmk1880v2_op_t * tdma_l2tg_tensor_copy( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_tensor_copy_param_t *p, + uint64_t dst_addr) +{ + check_tdma_tl(p->src); + check_tdma_tg(p->dst); + assert_tl_tg_same_size(p->src, p->dst); + ASSERT(p->src->shape.n == p->dst->shape.n); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + return emit_tdma_cmdbuf(ctx, ®); +} + + +static bmk1880v2_op_t * tdma_l2tg_bf16_tensor_copy( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_tensor_copy_param_t *p, + uint64_t dst_addr) +{ + check_tdma_tl_bf16(p->src); + check_tdma_tg_bf16(p->dst); + assert_tl_tg_same_size(p->src, p->dst); + ASSERT(p->src->shape.n == p->dst->shape.n); + ASSERT(!(p->src->fmt == FMT_I8 && p->dst->fmt == FMT_BF16)); // not support tl(int8)->tg(bf16) + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + + fill_l2tg_fmt(®, p->src->fmt, p->dst->fmt); + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + set_int8_rnd_mode(®, p->dst->int8_rnd_mode); + //trace_tdma_reg(®, __func__); 
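+  // Note: emit_tdma_cmdbuf() stamps ctx->layer_id into the descriptor and appends it to the command buffer (see its definition above).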
+ return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_l2tg_tensor_copy_nc_transposed( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_tensor_copy_nc_transposed_param_t *p, + uint64_t dst_addr) +{ + check_tdma_tl(p->src); + check_tdma_tg(p->dst); + ASSERT(p->dst->shape.n == p->src->shape.c); + ASSERT(p->dst->shape.c == p->src->shape.n); + ASSERT(p->dst->shape.h * p->dst->shape.w == + p->src->shape.h * p->src->shape.w); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 0; + reg.spec_func = 1; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_l2tg_bf16_tensor_copy_nc_transposed( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_tensor_copy_nc_transposed_param_t *p, + uint64_t dst_addr) +{ + check_tdma_tl_bf16(p->src); + check_tdma_tg_bf16(p->dst); + ASSERT(p->dst->shape.n == p->src->shape.c); + ASSERT(p->dst->shape.c == p->src->shape.n); + ASSERT(p->dst->shape.h * p->dst->shape.w == + p->src->shape.h * p->src->shape.w); + ASSERT(!(p->src->fmt == FMT_I8 && p->dst->fmt == FMT_BF16)); // not support tl(int8)->tg(bf16) + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 0; + reg.spec_func = 1; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + fill_l2tg_fmt(®, p->src->fmt, p->dst->fmt); + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + set_int8_rnd_mode(®, p->dst->int8_rnd_mode); + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_l2tg_tensor_copy_cw_transposed( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_tensor_copy_cw_transposed_param_t *p, + uint64_t dst_addr) +{ + check_tdma_tl(p->src); + check_tdma_tg(p->dst); + ASSERT(p->src->shape.n == p->dst->shape.n); + ASSERT(p->src->shape.c == p->dst->shape.w); + ASSERT(p->src->shape.h == p->dst->shape.h); + ASSERT(p->src->shape.w == p->dst->shape.c); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.spec_func = 1; + reg.transpose_md = 3; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + 
fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_l2tg_bf16_tensor_copy_cw_transposed( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_tensor_copy_cw_transposed_param_t *p, + uint64_t dst_addr) +{ + check_tdma_tl_bf16(p->src); + check_tdma_tg_bf16(p->dst); + ASSERT(p->src->shape.n == p->dst->shape.n); + ASSERT(p->src->shape.c == p->dst->shape.w); + ASSERT(p->src->shape.h == p->dst->shape.h); + ASSERT(p->src->shape.w == p->dst->shape.c); + + /*not support bf16 mode*/ + ASSERT(!(p->src->fmt == FMT_BF16 || p->dst->fmt == FMT_BF16)); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.spec_func = 1; + reg.transpose_md = 3; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + fill_l2tg_fmt(®, p->src->fmt, p->dst->fmt); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_l2tg_tensor_copy_compressed( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_tensor_copy_compressed_param_t *p, + uint64_t dst_addr) +{ + check_tdma_tl(p->src); + check_tdma_compressed_tg(p->dst); + assert_tl_tg_same_size(p->src, &p->dst->t); + + tdma_reg_t reg; + reset_tdma_reg(®); + + //src->fmt == FMT_BF16 || p->src->fmt == FMT_I8 || p->src->fmt == FMT_U8); + + ASSERT(p->dst->bias1 == 0); + if (p->src->fmt == FMT_BF16) { + ASSERT(p->dst->bias0 == 127); + } + else { + //p->src->fmt == FMT_I8 || p->src->fmt == FMT_U8); + ASSERT(p->dst->bias0 == 0); + ASSERT(p->dst->zero_guard_en == 0); + } + + reg.src_fmt = (p->src->fmt == FMT_BF16) ? FMT_BF16_TYP : FMT_FIX8B_TYP; + reg.dst_fmt = reg.src_fmt; + + reg.vld = 1; + reg.trans_dir = 1; + reg.compress_en = 1; + + // VLC constraint under hw compress + //1. in int8/uint8, bias0/bias should be 0/0 + //2. in bf16, signed should be 0 and bias0 set to 127, bias1 set to 0 + reg.cmprs_fmt = (p->src->fmt == FMT_I8); + + // NOTICE: it recommand set to 1 once data contain '0' under bf16 + reg.compress_zero_guard = p->dst->zero_guard_en ? 
1 : 0; + reg.compress_bias0 = p->dst->bias0; + reg.compress_bias1 = p->dst->bias1; + + reg.dst_base_reg_sel = p->dst->t.base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->t.shape.c; + reg.dst_h = p->dst->t.shape.h; + reg.dst_w = p->dst->t.shape.w; + reg.dst_n_stride = p->dst->t.stride.n; + fill_dst_c_stride(®, p->dst->t.stride.c); + reg.dst_h_stride = p->dst->t.stride.h; + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_l2tg_tensor_fill_constant( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_tensor_fill_constant_param_t *p, + uint64_t dst_addr) +{ + check_tdma_tg_bf16(p->dst); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.spec_func = 4; + reg.const_val = p->constant; + + // only support tl(bf16)->tg(bf16) or tl(fix8b)->tg(fix8b) + fill_l2tg_fmt(®, p->dst->fmt, p->dst->fmt); + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_dst_addr(®, dst_addr); + + reg.src_n = p->dst->shape.n; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_l2tg_matrix_copy( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_matrix_copy_param_t *p, + uint64_t dst_addr) +{ + check_tdma_ml(p->src); + check_tdma_mg(p->dst); + assert_ml_mg_same_size(p->src, p->dst); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 1; + reg.spec_func = 0; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = 1; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.row; + reg.dst_w = p->dst->shape.col; + fill_dst_c_stride(®, p->dst->stride.row); + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_l2tg_matrix_copy_compressed( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_matrix_copy_compressed_param_t *p, + uint64_t dst_addr) +{ + check_tdma_ml(p->src); + check_tdma_compress_mg(p->dst); + check_tdma_vlc_matrix_compressed_mg(p->dst); + assert_ml_mg_same_size(p->src, &p->dst->m); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.compress_en = 1; + reg.sys_dtype = 1; + reg.spec_func = 0; + + // vlc setting + reg.cmprs_fmt = (p->src->fmt == FMT_I8); + + ASSERT(p->dst->bias1 == 0); + if (p->src->fmt == FMT_BF16) { + ASSERT(p->dst->bias0 == 127); + } + else { + //p->src->fmt == FMT_I8 || p->src->fmt == FMT_U8); + ASSERT(p->dst->bias0 == 0); + ASSERT(p->dst->zero_guard_en == 0); + } + + // NOTICE: it should be 1 once data contain '0' under bf16 + reg.compress_zero_guard = p->dst->zero_guard_en ? 
1 : 0; + reg.compress_bias0 = p->dst->bias0; + reg.compress_bias1 = p->dst->bias1; + + reg.dst_base_reg_sel = p->dst->m.base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + fill_l2tg_fmt(®, p->src->fmt, p->dst->m.fmt); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = 1; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->m.shape.row; + reg.dst_w = p->dst->m.shape.col; + fill_dst_c_stride(®, p->dst->m.stride.row); + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_l2tg_bf16_matrix_copy( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_matrix_copy_param_t *p, + uint64_t dst_addr) +{ + check_tdma_ml_bf16(p->src); + check_tdma_mg_bf16(p->dst); + assert_ml_mg_same_size(p->src, p->dst); + ASSERT(!((p->src->fmt == FMT_I8 || p->src->fmt == FMT_U8) && p->dst->fmt == FMT_BF16)); // not support tl(i8/uint8_t)->tg(bf16) + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 1; + reg.spec_func = 0; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + fill_l2tg_fmt(®, p->src->fmt, p->dst->fmt); + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = 1; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.row; + reg.dst_w = p->dst->shape.col; + fill_dst_c_stride(®, p->dst->stride.row); + set_int8_rnd_mode(®, p->dst->int8_rnd_mode); + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_l2tg_general_copy( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_general_copy_param_t *p, + uint64_t dst_addr) +{ + ASSERT(p->dst_base_reg_index < TDMA_NUM_BASE_REGS); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.trans_fmt = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + + reg.dst_base_reg_sel = p->dst_base_reg_index; + fill_src_addr(®, p->src_address); + fill_dst_addr(®, dst_addr); + reg.src_n_stride = p->bytes; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_l2tg_bf16_general_copy( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_bf16_general_copy_param_t *p, + uint64_t dst_addr) +{ + ASSERT(p->dst_base_reg_index < TDMA_NUM_BASE_REGS); + + tdma_reg_t reg; + reset_tdma_reg(®); + // only support fix8b->fix8b or bf16->bf16 + ASSERT(p->src_fmt == p->dst_fmt); + + reg.vld = 1; + reg.trans_dir = 1; + reg.trans_fmt = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + fill_l2tg_fmt(®, p->src_fmt, p->dst_fmt); + + reg.dst_base_reg_sel = p->dst_base_reg_index; + fill_src_addr(®, p->src_address); + fill_dst_addr(®, dst_addr); + reg.src_n_stride = p->src_bytes; + return emit_tdma_cmdbuf(ctx, ®); +} + +bmk1880v2_op_t * bmk1880v2_tdma_l2g_tensor_copy( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_tensor_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + return tdma_l2tg_tensor_copy(ctx, p, dst_addr); +} + +bmk1880v2_op_t * bmk1880v2_tdma_l2g_bf16_tensor_copy( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_tensor_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + return tdma_l2tg_bf16_tensor_copy(ctx, p, dst_addr); +} +bmk1880v2_op_t * bmk1880v2_tdma_l2g_tensor_copy_nc_transposed( + ctx_t *ctx, + const 
bmk1880v2_tdma_l2tg_tensor_copy_nc_transposed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + return tdma_l2tg_tensor_copy_nc_transposed(ctx, p, dst_addr); +} + +bmk1880v2_op_t * bmk1880v2_tdma_l2g_bf16_tensor_copy_nc_transposed( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_tensor_copy_nc_transposed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + return tdma_l2tg_bf16_tensor_copy_nc_transposed(ctx, p, dst_addr); +} + +bmk1880v2_op_t * bmk1880v2_tdma_l2g_tensor_copy_cw_transposed( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_tensor_copy_cw_transposed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + return tdma_l2tg_tensor_copy_cw_transposed(ctx, p, dst_addr); +} + +bmk1880v2_op_t * bmk1880v2_tdma_l2g_bf16_tensor_copy_cw_transposed( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_tensor_copy_cw_transposed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + return tdma_l2tg_bf16_tensor_copy_cw_transposed(ctx, p, dst_addr); +} + +bmk1880v2_op_t * bmk1880v2_tdma_l2g_tensor_copy_compressed( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_tensor_copy_compressed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->t.start_address); + return tdma_l2tg_tensor_copy_compressed(ctx, p, dst_addr); +} + +bmk1880v2_op_t * bmk1880v2_tdma_l2g_tensor_fill_constant( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_tensor_fill_constant_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + return tdma_l2tg_tensor_fill_constant(ctx, p, dst_addr); +} + +bmk1880v2_op_t * bmk1880v2_tdma_l2g_matrix_copy( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_matrix_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + return tdma_l2tg_matrix_copy(ctx, p, dst_addr); +} + +bmk1880v2_op_t * bmk1880v2_tdma_l2g_bf16_matrix_copy( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_matrix_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + return tdma_l2tg_bf16_matrix_copy(ctx, p, dst_addr); +} + +bmk1880v2_op_t * bmk1880v2_tdma_l2g_matrix_copy_compressed( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_matrix_copy_compressed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->m.start_address); + return tdma_l2tg_matrix_copy_compressed(ctx, p, dst_addr); +} + +bmk1880v2_op_t * bmk1880v2_tdma_l2g_general_copy( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_general_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst_address); + return tdma_l2tg_general_copy(ctx, p, dst_addr); +} + +bmk1880v2_op_t * bmk1880v2_tdma_l2g_bf16_general_copy( + ctx_t *ctx, + const bmk1880v2_tdma_l2tg_bf16_general_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst_address); + return tdma_l2tg_bf16_general_copy(ctx, p, dst_addr); +} +/* + * Direction: TG2L + */ + +static bmk1880v2_op_t * tdma_tg2l_tensor_copy( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_tensor_copy_param_t *p, + uint64_t src_addr) +{ + check_tdma_tg(p->src); + check_tdma_tl(p->dst); + ASSERT(p->src->shape.n == p->dst->shape.n); + assert_tl_tg_same_size(p->dst, p->src); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 0; + reg.spec_func = 0; + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = 
p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_tg2l_bf16_tensor_copy( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_tensor_copy_param_t *p, + uint64_t src_addr) +{ + check_tdma_tg_bf16(p->src); + check_tdma_tl_bf16(p->dst); + ASSERT(p->src->shape.n == p->dst->shape.n); + ASSERT(!(p->src->fmt == FMT_BF16 && p->dst->fmt == FMT_I8)); // not support tg(bf16)->tl(int8) + assert_tl_tg_same_size(p->dst, p->src); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 0; + reg.spec_func = 0; + + reg.src_base_reg_sel = p->src->base_reg_index; + + fill_tg2l_fmt(®, p->src->fmt, p->dst->fmt); + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + //trace_tdma_reg(®, __func__); + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_tg2l_tensor_copy_nc_transposed( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_tensor_copy_nc_transposed_param_t *p, + uint64_t src_addr) +{ + check_tdma_tg(p->src); + check_tdma_tl(p->dst); + ASSERT(p->dst->shape.n == p->src->shape.c); + ASSERT(p->dst->shape.c == p->src->shape.n); + ASSERT(p->dst->shape.h * p->dst->shape.w == + p->src->shape.h * p->src->shape.w); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 0; + reg.spec_func = 1; + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_tg2l_bf16_tensor_copy_nc_transposed( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_tensor_copy_nc_transposed_param_t *p, + uint64_t src_addr) +{ + check_tdma_tg_bf16(p->src); + check_tdma_tl_bf16(p->dst); + ASSERT(p->dst->shape.n == p->src->shape.c); + ASSERT(p->dst->shape.c == p->src->shape.n); + ASSERT(p->dst->shape.h * p->dst->shape.w == + p->src->shape.h * p->src->shape.w); + + ASSERT(!(p->src->fmt == FMT_BF16 && p->dst->fmt == FMT_I8)); // not support tg(bf16)->tl(int8) + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 0; + reg.spec_func = 1; + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + fill_tg2l_fmt(®, p->src->fmt, p->dst->fmt); + reg.src_n = p->src->shape.n; + reg.src_c = 
p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_tg2l_tensor_copy_chw_rotated( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_tensor_copy_chw_rotated_param_t *p, + uint64_t src_addr) +{ + check_tdma_tg(p->src); + check_tdma_tl(p->dst); + + ASSERT(p->src->shape.c == 3 || p->src->shape.c == 4); + ASSERT(p->src->shape.n == p->dst->shape.n); + ASSERT(p->src->shape.c == p->dst->shape.c); + ASSERT(p->src->shape.h == p->dst->shape.h); + ASSERT(p->src->shape.w == p->dst->shape.w); + + ASSERT(p->dst->start_address % ctx->chip_info.eu_num == 0); + ASSERT(p->dst->stride.n % ctx->chip_info.eu_num == 0); + ASSERT(p->dst->stride.c % ctx->chip_info.eu_num == 0); + ASSERT(p->dst->stride.h == p->dst->shape.w); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 0; + reg.spec_func = 1; + + if (p->dst->shape.c == 3) + reg.transpose_md = 1; + else if(p->dst->shape.c == 4) + reg.transpose_md = 2; + else + ASSERT(0); + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, 1); + reg.src_h_stride = p->src->shape.c * p->src->shape.w; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_tg2l_tensor_copy_decompressed( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_tensor_copy_decompressed_param_t *p, + uint64_t src_addr) +{ + check_tdma_compressed_tg(p->src); + check_tdma_tl(p->dst); + assert_tl_tg_same_size(p->dst, &p->src->t); + + tdma_reg_t reg; + reset_tdma_reg(®); + + //dst->fmt == FMT_BF16 || p->dst->fmt == FMT_I8 || p->dst->fmt == FMT_U8); + fill_tg2l_fmt(®, p->src->t.fmt, p->dst->fmt); + + reg.vld = 1; + reg.trans_dir = 0; + reg.compress_en = 1; + reg.cmprs_fmt = (p->src->t.fmt == FMT_I8); + + reg.src_base_reg_sel = p->src->t.base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->t.shape.n; + reg.src_c = p->src->t.shape.c; + reg.src_h = p->src->t.shape.h; + reg.src_w = p->src->t.shape.w; + reg.src_n_stride = p->src->t.stride.n; + fill_src_c_stride(®, p->src->t.stride.c); + reg.src_h_stride = p->src->t.stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + // trace_tdma_reg(®, __FUNCTION__); + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_tg2l_tensor_fill_constant( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_tensor_fill_constant_param_t *p) +{ + check_tdma_tl(p->dst); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.spec_func = 4; + reg.const_val = p->constant; + + reg.dst_fmt = 
(p->dst->fmt == FMT_BF16) ? 2 : 1; + + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->dst->shape.n; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_tg2l_bf16_tensor_fill_constant( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_tensor_fill_constant_param_t *p) +{ + check_tdma_tl_bf16(p->dst); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.spec_func = 4; + reg.const_val = p->constant; + + /*only suppoert fix8b->fix8b or bf16->bf16*/ + fill_tg2l_fmt(®, p->dst->fmt, p->dst->fmt); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->dst->shape.n; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_tg2l_matrix_copy( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_matrix_copy_param_t *p, + uint64_t src_addr) +{ + check_tdma_mg(p->src); + check_tdma_ml(p->dst); + ASSERT(p->dst->shape.n == p->src->shape.row); + assert_ml_mg_same_size(p->dst, p->src); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 1; + reg.spec_func = 0; + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.row; + reg.src_c = p->src->shape.row; + reg.src_w = p->src->shape.col; + fill_src_c_stride(®, p->src->stride.row); + + reg.dst_c = p->dst->shape.c; + reg.dst_h = 1; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_tg2l_matrix_copy_decompressed( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_matrix_copy_decompressed_param_t *p, + uint64_t src_addr) +{ + check_tdma_vlc_matrix_compressed_mg(p->src); + check_tdma_mg(&p->src->m); + check_tdma_ml(p->dst); + ASSERT(p->dst->shape.n == p->src->m.shape.row); + assert_ml_mg_same_size(p->dst, &p->src->m); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 1; + reg.spec_func = 0; + reg.compress_en = 1; + reg.cmprs_fmt = (p->src->m.fmt == FMT_I8); + + fill_tg2l_fmt(®, p->src->m.fmt, p->dst->fmt); + reg.src_base_reg_sel = p->src->m.base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->m.shape.row; + reg.src_c = p->src->m.shape.row; + reg.src_w = p->src->m.shape.col; + fill_src_c_stride(®, p->src->m.stride.row); + + reg.dst_c = p->dst->shape.c; + reg.dst_h = 1; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_tg2l_bf16_matrix_copy( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_matrix_copy_param_t *p, + uint64_t src_addr) +{ + check_tdma_mg_bf16(p->src); + check_tdma_ml_bf16(p->dst); + ASSERT(p->dst->shape.n == p->src->shape.row); + assert_ml_mg_same_size(p->dst, p->src); + ASSERT(!(p->src->fmt == FMT_BF16 && p->dst->fmt == FMT_I8)); // not support tg(bf16)->tl(int8) + tdma_reg_t reg; + reset_tdma_reg(®); 
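+  // Matrix path: sys_dtype is set to 1 below, whereas the plain tensor copies in this file use 0.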
+ + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 1; + reg.spec_func = 0; + + fill_tg2l_fmt(®, p->src->fmt, p->dst->fmt); + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.row; + reg.src_c = p->src->shape.row; + reg.src_w = p->src->shape.col; + fill_src_c_stride(®, p->src->stride.row); + + reg.dst_c = p->dst->shape.c; + reg.dst_h = 1; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_tg2l_matrix_copy_row_col_transposed( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_matrix_copy_row_col_transposed_param_t *p, + uint64_t src_addr) +{ + check_tdma_mg(p->src); + check_tdma_ml(p->dst); + ASSERT(p->dst->shape.n == p->src->shape.col); + ASSERT(p->dst->shape.col == p->src->shape.row); + assert_ml_mg_same_size(p->dst, p->src); + + ASSERT(p->src->shape.row >= p->dst->shape.w); + ASSERT(p->dst->shape.c == + (uint32_t) ceiling_func(p->src->shape.row, p->dst->shape.w)); + + ASSERT(p->dst->start_address % ctx->chip_info.eu_num == 0); + ASSERT(p->dst->stride.n % ctx->chip_info.eu_num == 0); + ASSERT(p->dst->stride.c % ctx->chip_info.eu_num == 0); + ASSERT(p->dst->stride.h == p->dst->shape.w); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 1; + reg.spec_func = 1; + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.row; + reg.src_c = p->src->shape.row; + reg.src_w = p->src->shape.col; + fill_src_c_stride(®, p->src->stride.row); + + reg.dst_c = p->dst->shape.c; + reg.dst_h = 1; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_tg2l_general_copy( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_general_copy_param_t *p, + uint64_t src_addr) +{ + ASSERT(p->src_base_reg_index < TDMA_NUM_BASE_REGS); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.trans_fmt = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + + reg.src_base_reg_sel = p->src_base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst_address); + reg.src_n_stride = p->bytes; + + return emit_tdma_cmdbuf(ctx, ®); +} + +static bmk1880v2_op_t * tdma_tg2l_bf16_general_copy( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_bf16_general_copy_param_t *p, + uint64_t src_addr) +{ + ASSERT(p->src_base_reg_index < TDMA_NUM_BASE_REGS); + // only support fix8b->fix8b or bf16->bf16 + ASSERT(p->dst_fmt == p->src_fmt); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.trans_fmt = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + + fill_tg2l_fmt(®, p->src_fmt, p->dst_fmt); + + reg.src_base_reg_sel = p->src_base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst_address); + reg.src_n_stride = p->src_bytes; + + return emit_tdma_cmdbuf(ctx, ®); +} + +bmk1880v2_op_t * bmk1880v2_tdma_g2l_tensor_copy( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_tensor_copy_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + return tdma_tg2l_tensor_copy(ctx, p, src_addr); +} + +bmk1880v2_op_t * bmk1880v2_tdma_g2l_bf16_tensor_copy( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_tensor_copy_param_t *p) 
+{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + return tdma_tg2l_bf16_tensor_copy(ctx, p, src_addr); +} + +bmk1880v2_op_t * bmk1880v2_tdma_g2l_tensor_copy_nc_transposed( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_tensor_copy_nc_transposed_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + return tdma_tg2l_tensor_copy_nc_transposed(ctx, p, src_addr); +} + +bmk1880v2_op_t * bmk1880v2_tdma_g2l_bf16_tensor_copy_nc_transposed( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_tensor_copy_nc_transposed_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + return tdma_tg2l_bf16_tensor_copy_nc_transposed(ctx, p, src_addr); +} + +bmk1880v2_op_t * bmk1880v2_tdma_g2l_tensor_copy_chw_rotated( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_tensor_copy_chw_rotated_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + return tdma_tg2l_tensor_copy_chw_rotated(ctx, p, src_addr); +} + +bmk1880v2_op_t * bmk1880v2_tdma_g2l_tensor_copy_decompressed( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_tensor_copy_decompressed_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->t.start_address); + return tdma_tg2l_tensor_copy_decompressed(ctx, p, src_addr); +} + +bmk1880v2_op_t * bmk1880v2_tdma_tg2l_tensor_fill_constant( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_tensor_fill_constant_param_t *p) +{ + return tdma_tg2l_tensor_fill_constant(ctx, p); +} + +bmk1880v2_op_t * bmk1880v2_tdma_tg2l_bf16_tensor_fill_constant( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_tensor_fill_constant_param_t *p) +{ + return tdma_tg2l_bf16_tensor_fill_constant(ctx, p); +} + +bmk1880v2_op_t * bmk1880v2_tdma_g2l_matrix_copy( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_matrix_copy_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + return tdma_tg2l_matrix_copy(ctx, p, src_addr); +} + +bmk1880v2_op_t * bmk1880v2_tdma_g2l_matrix_copy_decompressed( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_matrix_copy_decompressed_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->m.start_address); + return tdma_tg2l_matrix_copy_decompressed(ctx, p, src_addr); +} + +bmk1880v2_op_t * bmk1880v2_tdma_g2l_bf16_matrix_copy( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_matrix_copy_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + return tdma_tg2l_bf16_matrix_copy(ctx, p, src_addr); +} + +bmk1880v2_op_t * bmk1880v2_tdma_g2l_matrix_copy_row_col_transposed( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_matrix_copy_row_col_transposed_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + return tdma_tg2l_matrix_copy_row_col_transposed(ctx, p, src_addr); +} + +bmk1880v2_op_t * bmk1880v2_tdma_g2l_general_copy( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_general_copy_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src_address); + return tdma_tg2l_general_copy(ctx, p, src_addr); +} + +bmk1880v2_op_t * bmk1880v2_tdma_g2l_bf16_general_copy( + ctx_t *ctx, + const bmk1880v2_tdma_tg2l_bf16_general_copy_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src_address); + return tdma_tg2l_bf16_general_copy(ctx, p, src_addr); +} + +/* + * Direction: TG2TG + */ + +static bmk1880v2_op_t * bmk1880v2_gdma_copy_gmem( + bmk1880v2_context_t *ctx, + const bmk1880v2_tdma_tg2tg_tensor_copy_param_t *p, + uint8_t u8_trans_fmt) +{ + tdma_reg_t reg; + + reset_tdma_reg(®); + + uint64_t u64_src_addr; + uint64_t u64_dst_addr; + + reg.vld = 1; + reg.trans_dir = 2; // 0:tg2l, 1:l2tg, 2:g2g, 3:l2l + 
reg.trans_fmt = u8_trans_fmt; // 1:general copy, 2:tensor copy + reg.sys_dtype = 0; // + reg.spec_func = 0; // + + u64_src_addr = absolute_gmem_addr(p->src->start_address); + u64_dst_addr = absolute_gmem_addr(p->dst->start_address); + fill_src_addr(®, u64_src_addr); + fill_dst_addr(®, u64_dst_addr); + + reg.src_base_reg_sel = p->src->base_reg_index; + reg.dst_base_reg_sel = p->dst->base_reg_index; + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p-> dst->stride.h; + + return emit_tdma_cmdbuf( ctx, ®); +} + +static bmk1880v2_op_t * bmk1880v2_gdma_bf16_copy_gmem( + bmk1880v2_context_t *ctx, + const bmk1880v2_tdma_tg2tg_tensor_copy_param_t *p, + uint8_t u8_trans_fmt) +{ + tdma_reg_t reg; + + reset_tdma_reg(®); + + uint64_t u64_src_addr; + uint64_t u64_dst_addr; + + reg.vld = 1; + reg.trans_dir = 2; // 0:tg2l, 1:l2tg, 2:g2g, 3:l2l + reg.trans_fmt = u8_trans_fmt; // 1:general copy, 2:tensor copy + reg.sys_dtype = 0; // + reg.spec_func = 0; // + ASSERT(p->src->fmt == p->dst->fmt); + + reg.dst_fmt = (p->dst->fmt == FMT_BF16) ? 2 : 1; + reg.src_fmt = (p->src->fmt == FMT_BF16) ? 2 : 1; + + u64_src_addr = absolute_gmem_addr(p->src->start_address); + u64_dst_addr = absolute_gmem_addr(p->dst->start_address); + fill_src_addr(®, u64_src_addr); + fill_dst_addr(®, u64_dst_addr); + + reg.src_base_reg_sel = p->src->base_reg_index; + reg.dst_base_reg_sel = p->dst->base_reg_index; + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p-> dst->stride.h; + + return emit_tdma_cmdbuf( ctx, ®); +} + +/* + * Direction: G2G + */ +bmk1880v2_op_t * bmk1880v2_tdma_tg2tg_tensor_copy( + bmk1880v2_context_t *ctx, + const bmk1880v2_tdma_tg2tg_tensor_copy_param_t *p) +{ + bmk1880v2_gdma_copy_gmem(ctx, p, 2); + return NULL; +} + +bmk1880v2_op_t * bmk1880v2_tdma_tg2tg_bf16_tensor_copy( + bmk1880v2_context_t *ctx, + const bmk1880v2_tdma_tg2tg_tensor_copy_param_t *p) +{ + bmk1880v2_gdma_bf16_copy_gmem(ctx, p, 2); + return NULL; +} + +bmk1880v2_op_t * bmk1880v2_tdma_tg2tg_general_copy( + bmk1880v2_context_t *ctx, + const bmk1880v2_tdma_tg2tg_tensor_copy_param_t *p) +{ + bmk1880v2_gdma_copy_gmem(ctx, p, 1); + return NULL; +} + +bmk1880v2_op_t * bmk1880v2_tdma_tg2tg_bf16_general_copy( + bmk1880v2_context_t *ctx, + const bmk1880v2_tdma_tg2tg_tensor_copy_param_t *p) +{ + bmk1880v2_gdma_bf16_copy_gmem(ctx, p, 1); + return NULL; +} diff --git a/cvikernel/src/bm1880v2/tiu_average_pooling.c b/cvikernel/src/bm1880v2/tiu_average_pooling.c new file mode 100644 index 000000000..64624eed7 --- /dev/null +++ b/cvikernel/src/bm1880v2/tiu_average_pooling.c @@ -0,0 +1,79 @@ +#include "kernel_1880v2.h" +#include + +bmk1880v2_op_t * bmk1880v2_tiu_average_pooling( + ctx_t *ctx, + const bmk1880v2_tiu_average_pooling_param_t *p) +{ + int bf16_enable = (p->ifmap->fmt == 
FMT_BF16) ? 1 : 0;
+
+  ASSERT(p->ifmap->shape.n == p->ofmap->shape.n);
+  ASSERT(p->ifmap->shape.c == p->ofmap->shape.c);
+  check_tiu_tensor_2(p->ifmap, p->ofmap);
+  if (bf16_enable) {
+    assert_bf16_stride_type_0(ctx, p->ifmap);
+    assert_bf16_stride_type_0(ctx, p->ofmap);
+  } else {
+    assert_stride_type_0(ctx, p->ifmap);
+    assert_stride_type_0(ctx, p->ofmap);
+  }
+
+  int opd0_sign = tensor_is_signed(p->ifmap);
+
+  tiu_reg_t reg;
+  reset_tiu_reg(&reg);
+  reg.cmd_en = 1;
+  reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B;
+  reg.tsk_eu_typ = 1;
+  reg.opt_shift_typ = opd0_sign;
+  reg.opt_right_shift = p->rshift_bits;
+  reg.opt_relu = 0; /* hardware relu function not verified. */
+  reg.tsk_opd_num = 2;
+  reg.opd_typ = bf16_enable ? 1 : 0;
+
+  reg.res0_addr = p->ofmap->start_address;
+  reg.opt_res0_sign = opd0_sign;
+  reg.opt_res0_int8 = 1;
+  reg.res0_n = p->ofmap->shape.n;
+  reg.res0_c = p->ofmap->shape.c;
+  reg.res0_h = p->ofmap->shape.h;
+  reg.res0_w = p->ofmap->shape.w;
+
+  reg.opd0_addr = p->ifmap->start_address;
+  reg.opt_opd0_sign = opd0_sign;
+  reg.opt_opd0_int8 = 1;
+  reg.opd0_n = p->ifmap->shape.n;
+  reg.opd0_c = p->ifmap->shape.c;
+  reg.opd0_h = p->ifmap->shape.h;
+  reg.opd0_w = p->ifmap->shape.w;
+  reg.opd0_ins_val = bf16_enable ? 0 : (uint32_t)p->ins_val;
+  reg.opd0_ins_fp = bf16_enable ? (uint32_t)p->ins_fp : 0;
+  reg.conv_opd0_up_pad = p->pad_top;
+  reg.conv_opd0_dn_pad = p->pad_bottom;
+  reg.conv_opd0_lf_pad = p->pad_left;
+  reg.conv_opd0_rt_pad = p->pad_right;
+  reg.conv_opd0_x_ins0 = p->ins_w;
+  reg.conv_opd0_y_ins0 = p->ins_h;
+  reg.conv_opd0_x_ins0_last = p->ins_last_w;
+  reg.conv_opd0_y_ins0_last = p->ins_last_h;
+
+  reg.opt_opd1_const = 1;
+  /* HW does not have divide, so we need to calculate the value here */
+  if (bf16_enable)
+    reg.opd1_addr =
+        convert_fp32_bf16(
+            (float)(convert_bf16_fp32(p->avg_pooling_const) / (p->kh * p->kw)));
+  else
+    reg.opd1_addr = p->avg_pooling_const;
+
+  reg.opd1_h = p->kh;
+  reg.opd1_w = p->kw;
+  reg.opt_opd1_sign = 0;
+  reg.opt_opd1_int8 = 1;
+  reg.conv_op_x_str = p->stride_w;
+  reg.conv_op_y_str = p->stride_h;
+
+  /* [15:0] layer id */
+  reg.layer_info = p->layer_id;
+  return emit_tiu_cmdbuf(ctx, &reg);
+}
diff --git a/cvikernel/src/bm1880v2/tiu_convolution.c b/cvikernel/src/bm1880v2/tiu_convolution.c
new file mode 100644
index 000000000..a02f5d4bc
--- /dev/null
+++ b/cvikernel/src/bm1880v2/tiu_convolution.c
@@ -0,0 +1,170 @@
+#include "kernel_1880v2.h"
+
+typedef bmk1880v2_tiu_convolution_param_t param_t;
+
+static int can_do_double_conv(ctx_t *ctx, const param_t *p)
+{
+  uint8_t bf16_enable = (p->ifmap->fmt == FMT_BF16) ? 1 : 0;
+
+  if (((p->ifmap->start_address % ctx->chip_info.lmem_size) % 2 == 0 &&
+       p->ifmap->shape.c % 2 == 0 &&
+       p->ifmap->shape.c >= 4 &&
+       p->weight->start_address % 2 == 0) && !bf16_enable)
+    return 1;
+
+  return 0;
+}
+
+static void check_conv_param(ctx_t *ctx, const param_t *p)
+{
+  uint32_t eu_num = ctx->chip_info.eu_num;
+  uint8_t bf16_enable = (p->ifmap->fmt == FMT_BF16) ?
1 : 0; + + check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight); + if (bf16_enable) { + assert_bf16_stride_type_0(ctx, p->ifmap); + } else { + assert_stride_type_0(ctx, p->ifmap); + } + //assert_stride_type_1(ctx, p->weight); + if (p->bias) { + check_tiu_tensor(p->bias); + if (bf16_enable) + assert_bf16_stride_type_2(ctx, p->bias); + else + assert_stride_type_2(ctx, p->bias); + } + + // n stride must align 16B + ASSERT((p->ofmap->stride.n % 16) == 0); + + ASSERT(p->ifmap->start_address % eu_num == 0); + ASSERT(p->ofmap->start_address % eu_num == 0); + ASSERT(p->ifmap->shape.n == p->ofmap->shape.n); + ASSERT(!(p->ifmap->shape.h == 1 && p->ins_h > 0)); + ASSERT(p->weight->shape.n == p->ifmap->shape.c); + ASSERT(p->weight->shape.c == p->ofmap->shape.c); + if (can_do_double_conv(ctx, p)) { + uint32_t lmem_i = p->ifmap->start_address % ctx->chip_info.lmem_size; + ASSERT(lmem_i % 2 == 0); + ASSERT(p->ifmap->shape.c % 2 == 0); + ASSERT(p->ifmap->shape.c >= 4); /* Otherwise performance will suffer */ + ASSERT(p->weight->start_address % 2 == 0); + } + if(p->ps32_mode & 0x2) + { + ASSERT(!p->relu_enable); + ASSERT(!p->bias); + ASSERT(!p->rshift_bits); + } + ASSERT(p->stride_h < 16); + ASSERT(p->stride_w < 16); + ASSERT(p->pad_top < 16); + ASSERT(p->pad_bottom < 16); + ASSERT(p->pad_left < 16); + ASSERT(p->pad_right < 16); + ASSERT(p->ins_h < 16); + ASSERT(p->ins_last_h < 16); + ASSERT(p->ins_w < 16); + ASSERT(p->ins_last_w < 16); + ASSERT(p->dilation_h >= 1); + ASSERT(p->dilation_w >= 1); +} + +bmk1880v2_op_t * bmk1880v2_tiu_convolution(ctx_t *ctx, const param_t *p) +{ + check_conv_param(ctx, p); + + uint32_t npu_num = ctx->chip_info.npu_num; + int opd0_sign = tensor_is_signed(p->ifmap); + int opd1_sign = tensor_is_signed(p->weight); + int opd2_sign = p->bias? tensor_is_signed(p->bias): 1; + int arith_shift = opd0_sign || opd1_sign || opd2_sign; + int bf16_enable = (p->ifmap->fmt == FMT_BF16) ? 1 : 0; + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_CONV_FIX8B; + reg.opt_shift_typ = arith_shift; + reg.opt_right_shift = p->rshift_bits; + reg.opt_relu = !!(p->relu_enable); + reg.tsk_opd_num = 2; + + reg.opd_typ = bf16_enable ? 1: 0; + + /*always automatically enabel double conv at those situations*/ + if (can_do_double_conv(ctx, p)) + reg.double_conv = 1; + + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = tensor_is_signed(p->ofmap); + reg.opt_res0_int8 = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + reg.res0_n_str = p->ofmap->stride.n; + reg.res0_c_str = p->ofmap->stride.c; + reg.res0_h_str = p->ofmap->stride.h; + reg.res0_w_str = p->ofmap->stride.w; + reg.short_res0_str = 3; // Manual instead of h/w + reg.ps32_md = p->ps32_mode; + if (p->ps32_mode > 0) + reg.res0_b_str = p->ofmap->shape.n * p->ofmap->stride.n; + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_int8 = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.opd0_ins_val = bf16_enable ? 0 : (uint32_t)p->ins_val; + reg.opd0_ins_fp = bf16_enable ? 
(uint32_t)p->ins_fp : 0; + reg.short_opd0_str = 0; + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + reg.conv_opd0_x_ins0 = p->ins_w; + reg.conv_opd0_y_ins0 = p->ins_h; + reg.conv_opd0_x_ins0_last = p->ins_last_w; + reg.conv_opd0_y_ins0_last = p->ins_last_h; + + reg.opd1_addr = p->weight->start_address; + reg.opt_opd1_sign = opd1_sign; + reg.opt_opd1_int8 = 1; + reg.opt_opd1_const = p->w_is_const; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + reg.short_opd1_str = 1; + reg.conv_opd1_x_ins0 = p->dilation_w - 1; + reg.conv_opd1_y_ins0 = p->dilation_h - 1; + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + + if (p->bias) { + ASSERT(p->bias->shape.n == 2); + ASSERT(p->bias->shape.c == p->ofmap->shape.c); + ASSERT(p->bias->shape.h == 1); + ASSERT(p->bias->shape.w == 1); + + reg.tsk_opd_num = 3; + reg.opt_opd2_sign = opd2_sign; + reg.opt_opd2_int8 = 0; + reg.opd2_addr = p->bias->start_address; + reg.opd2_n = 1; + reg.opd2_c = p->bias->shape.c; + reg.opd2_h = 1; + reg.opd2_w = 1; + reg.short_opd2_str = 2; + reg.opd2_b_str = ceiling_func(p->bias->shape.c, npu_num) * (bf16_enable ? 2 : 1); + } + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1880v2/tiu_convolution_qdm.c b/cvikernel/src/bm1880v2/tiu_convolution_qdm.c new file mode 100644 index 000000000..c625d1b34 --- /dev/null +++ b/cvikernel/src/bm1880v2/tiu_convolution_qdm.c @@ -0,0 +1,160 @@ +#include "kernel_1880v2.h" + +typedef bmk1880v2_tiu_convolution_qdm_param_t param_t; + +static int can_do_double_conv(ctx_t *ctx, const param_t *p) +{ + if ((p->ifmap->start_address % ctx->chip_info.lmem_size) % 2 == 0 && + p->ifmap->shape.c % 2 == 0 && + p->ifmap->shape.c >= 4 && + p->weight->start_address % 2 == 0) + return 1; + + return 0; +} + +static void check_conv_param(ctx_t *ctx, const param_t *p) +{ + uint32_t eu_num = ctx->chip_info.eu_num; + + check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight); + assert_stride_type_0(ctx, p->ifmap); + + ASSERT((p->ofmap->stride.n % eu_num) == 0); + ASSERT(p->ifmap->start_address % eu_num == 0); + ASSERT(p->ofmap->start_address % eu_num == 0); + ASSERT(p->ifmap->shape.n == p->ofmap->shape.n); + ASSERT(!(p->ifmap->shape.h == 1 && p->ins_h > 0)); + ASSERT(p->weight->shape.n == p->ifmap->shape.c); + ASSERT(p->weight->shape.c == p->ofmap->shape.c); + + if (p->chl_quan_param) { + check_tiu_tensor(p->chl_quan_param); + assert_stride_type_2(ctx, p->chl_quan_param); + ASSERT(p->chl_quan_param->start_address % eu_num == 0); + } + if (can_do_double_conv(ctx, p)) { + uint32_t lmem_i = p->ifmap->start_address % ctx->chip_info.lmem_size; + ASSERT(lmem_i % 2 == 0); + ASSERT(p->ifmap->shape.c % 2 == 0); + ASSERT(p->ifmap->shape.c >= 4); /* Otherwise performance will suffer */ + ASSERT(p->weight->start_address % 2 == 0); + } + if(p->ps32_mode & 0x2) + { + ASSERT(!p->relu_enable); + ASSERT(!p->has_bias); + } + ASSERT(p->stride_h < 16); + ASSERT(p->stride_w < 16); + ASSERT(p->pad_top < 16); + ASSERT(p->pad_bottom < 16); + ASSERT(p->pad_left < 16); + ASSERT(p->pad_right < 16); + ASSERT(p->ins_h < 16); + ASSERT(p->ins_last_h < 16); + ASSERT(p->ins_w < 16); + ASSERT(p->ins_last_w < 16); + ASSERT(p->dilation_h >= 1); + ASSERT(p->dilation_w >= 1); +} + +bmk1880v2_op_t * bmk1880v2_tiu_convolution_qdm(ctx_t *ctx, const param_t *p) +{ + 
check_conv_param(ctx, p); + + int opd0_sign = tensor_is_signed(p->ifmap); + int opd1_sign = tensor_is_signed(p->weight); + int arith_shift = opd0_sign || opd1_sign; + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_CONV_FIX8B; + reg.opt_shift_typ = arith_shift; + reg.opt_relu = !!(p->relu_enable); + reg.tsk_opd_num = 2; + + /*always automatically enabel double conv at those situations*/ + if (can_do_double_conv(ctx, p)) + reg.double_conv = 1; + + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = tensor_is_signed(p->ofmap); + reg.opt_res0_int8 = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + reg.res0_n_str = p->ofmap->stride.n; + reg.res0_c_str = p->ofmap->stride.c; + reg.res0_h_str = p->ofmap->stride.h; + reg.res0_w_str = p->ofmap->stride.w; + reg.short_res0_str = 3; // Manual instead of h/w + reg.ps32_md = p->ps32_mode; + if (p->ps32_mode > 0) { + reg.res0_b_str = p->ofmap->shape.n * p->ofmap->stride.n; + + // Per-channel parameter does not has right shift (default is 10). + // Set zero. + reg.opt_right_shift = 0; + } + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_int8 = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.opd0_ins_val = (uint32_t)p->ins_val; + reg.short_opd0_str = 0; + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + reg.conv_opd0_x_ins0 = p->ins_w; + reg.conv_opd0_y_ins0 = p->ins_h; + reg.conv_opd0_x_ins0_last = p->ins_last_w; + reg.conv_opd0_y_ins0_last = p->ins_last_h; + + reg.opd1_addr = p->weight->start_address; + reg.opt_opd1_sign = opd1_sign; + reg.opt_opd1_int8 = 1; + reg.opt_opd1_const = p->w_is_const; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + reg.short_opd1_str = 1; + reg.conv_opd1_x_ins0 = p->dilation_w - 1; + reg.conv_opd1_y_ins0 = p->dilation_h - 1; + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + + if (p->chl_quan_param) { + ASSERT(p->chl_quan_param->shape.n == 1); + ASSERT(p->chl_quan_param->shape.c == p->ofmap->shape.c); + ASSERT(p->chl_quan_param->shape.h == 1); + ASSERT(p->chl_quan_param->shape.w == 1); + reg.opt_chl_quan = 1; + reg.opt_right_shift = 0; // useless + reg.opd2_addr = p->chl_quan_param->start_address; + reg.opd2_n = p->chl_quan_param->shape.n; + reg.opd2_c = p->chl_quan_param->shape.c; + reg.opd2_h = p->chl_quan_param->shape.h; + reg.opd2_w = p->chl_quan_param->shape.w; + } + reg.opt_opd2_int8 = 1; // useless, force to 1 to skip b_stride check + reg.short_opd2_str = 2; // useless + reg.opd2_b_str = 0; // useless + + if (p->has_bias) { + reg.tsk_opd_num = 3; + reg.opt_opd2_sign = 1; + } + + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1880v2/tiu_depthwise_convolution.c b/cvikernel/src/bm1880v2/tiu_depthwise_convolution.c new file mode 100644 index 000000000..61d90092b --- /dev/null +++ b/cvikernel/src/bm1880v2/tiu_depthwise_convolution.c @@ -0,0 +1,148 @@ +#include "kernel_1880v2.h" + +bmk1880v2_op_t * bmk1880v2_tiu_depthwise_convolution( + ctx_t *ctx, + const bmk1880v2_tiu_depthwise_convolution_param_t *p) +{ + int bf16_enable = (p->ifmap->fmt == FMT_BF16) ? 
1 : 0; + bool isMulConst = (p->weight_is_const == 1) ? 1 : 0; + + if(isMulConst) { + check_tiu_tensor_2(p->ifmap, p->ofmap); + } else { + check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight); + } + if (bf16_enable) { + assert_bf16_stride_type_0(ctx, p->ifmap); + if(!isMulConst) + assert_bf16_stride_type_0(ctx, p->weight); + if (p->bias) { + check_tiu_tensor(p->bias); + assert_bf16_stride_type_2(ctx, p->bias); + } + } else { + assert_stride_type_0(ctx, p->ifmap); + if(!isMulConst) + assert_stride_type_0(ctx, p->weight); + if (p->bias) { + check_tiu_tensor(p->bias); + assert_stride_type_2(ctx, p->bias); + } + } + + // n stride must align 16B + ASSERT((p->ofmap->stride.n % 16) == 0); + + // Support fp32 result in bf16 + uint32_t res0_n = p->ofmap->shape.n; + int ps32_mode = 0; + if (bf16_enable && (p->ifmap->shape.n != p->ofmap->shape.n)) { + ASSERT((2 * p->ifmap->shape.n) == p->ofmap->shape.n); + ASSERT(p->ps32_mode == 2); // bit[1]: write + ps32_mode = 2; + res0_n = p->ifmap->shape.n; + } else { + ASSERT(p->ifmap->shape.n == p->ofmap->shape.n); + } + + ASSERT(p->ifmap->shape.c == p->ofmap->shape.c); + if(!isMulConst){ + ASSERT(p->ifmap->shape.c == p->weight->shape.c); + ASSERT(p->weight->shape.n == 1); + } + + int opd0_sign = tensor_is_signed(p->ifmap); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B; + reg.tsk_eu_typ = 2; + reg.opt_relu = p->relu_enable; + reg.opt_shift_typ = 1; + reg.opt_right_shift = p->rshift_bits; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 1: 0; + + int res0_sign = tensor_is_signed(p->ofmap); + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = res0_sign; + reg.opt_res0_int8 = 1; + reg.res0_n = res0_n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + reg.res0_n_str = p->ofmap->stride.n; + reg.res0_c_str = p->ofmap->stride.c; + reg.res0_h_str = p->ofmap->stride.h; + reg.res0_w_str = p->ofmap->stride.w; + reg.res0_b_str = reg.res0_n_str * reg.res0_n; + reg.short_res0_str = 3; // Manual instead of h/w + reg.ps32_md = ps32_mode; + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_int8 = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.opd0_n_str = p->ifmap->stride.n; + reg.opd0_c_str = p->ifmap->stride.c; + reg.opd0_h_str = p->ifmap->stride.h; + reg.opd0_w_str = p->ifmap->stride.w; + reg.opd0_ins_val = bf16_enable ? 0 : (uint32_t)p->ins_val; + reg.opd0_ins_fp = bf16_enable ? 
(uint32_t)p->ins_fp : 0; + reg.short_opd0_str = 3; // Manual instead of h/w + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + reg.conv_opd0_x_ins0 = p->ins_w; + reg.conv_opd0_y_ins0 = p->ins_h; + reg.conv_opd0_x_ins0_last = p->ins_last_w; + reg.conv_opd0_y_ins0_last = p->ins_last_h; + + reg.opt_opd1_sign = 1; + reg.opt_opd1_int8 = 1; + reg.conv_opd1_x_ins0 = p->dilation_w - 1; + reg.conv_opd1_y_ins0 = p->dilation_h - 1; + if (isMulConst) { + reg.opt_opd1_const = 1; + reg.opt_opd1_sign = p->weight_const.is_signed; + reg.opd1_addr = p->weight_const.val; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + } else { + reg.opd1_addr = p->weight->start_address; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + } + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + + if (p->bias) { + ASSERT(p->bias->shape.n == 2); + ASSERT(p->bias->shape.c == p->ofmap->shape.c); + ASSERT(p->bias->shape.h == 1); + ASSERT(p->bias->shape.w == 1); + + reg.tsk_opd_num = 3; + reg.opd2_addr = p->bias->start_address; + reg.opt_opd2_int8 = 0; + reg.opd2_n = 1; + reg.opd2_c = p->bias->shape.c; + reg.opd2_h = 1; + reg.opd2_w = 1; + reg.short_opd2_str = 2; + reg.opd2_b_str = p->bias->stride.n; + } + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1880v2/tiu_depthwise_convolution_qdm.c b/cvikernel/src/bm1880v2/tiu_depthwise_convolution_qdm.c new file mode 100644 index 000000000..6aa048d88 --- /dev/null +++ b/cvikernel/src/bm1880v2/tiu_depthwise_convolution_qdm.c @@ -0,0 +1,122 @@ +#include "kernel_1880v2.h" + +bmk1880v2_op_t * bmk1880v2_tiu_depthwise_convolution_qdm( + ctx_t *ctx, + const bmk1880v2_tiu_depthwise_convolution_qdm_param_t *p) +{ + uint32_t eu_num = ctx->chip_info.eu_num; + bool isMulConst = (p->weight_is_const == 1) ? 
1 : 0; + + if(isMulConst) { + check_tiu_tensor_2(p->ifmap, p->ofmap); + } else { + check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight); + } + assert_stride_type_0(ctx, p->ifmap); + if(!isMulConst) + assert_stride_type_0(ctx, p->weight); + check_tiu_tensor(p->chl_quan_param); + assert_stride_type_2(ctx, p->chl_quan_param); + + ASSERT((p->ofmap->stride.n % eu_num) == 0); + ASSERT(p->chl_quan_param->start_address % eu_num == 0); + ASSERT(p->ifmap->shape.n == p->ofmap->shape.n); + ASSERT(p->ifmap->shape.c == p->ofmap->shape.c); + if(!isMulConst){ + ASSERT(p->ifmap->shape.c == p->weight->shape.c); + ASSERT(p->weight->shape.n == 1); + } + + int opd0_sign = tensor_is_signed(p->ifmap); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B; + reg.tsk_eu_typ = 2; + reg.opt_relu = p->relu_enable; + reg.opt_shift_typ = 1; + reg.tsk_opd_num = 2; + + int res0_sign = tensor_is_signed(p->ofmap); + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = res0_sign; + reg.opt_res0_int8 = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + reg.res0_n_str = p->ofmap->stride.n; + reg.res0_c_str = p->ofmap->stride.c; + reg.res0_h_str = p->ofmap->stride.h; + reg.res0_w_str = p->ofmap->stride.w; + reg.short_res0_str = 3; // Manual instead of h/w + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_int8 = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.opd0_ins_val = (uint32_t)p->ins_val; + reg.opd0_n_str = p->ifmap->stride.n; + reg.opd0_c_str = p->ifmap->stride.c; + reg.opd0_h_str = p->ifmap->stride.h; + reg.opd0_w_str = p->ifmap->stride.w; + reg.short_opd0_str = 3; // Manual instead of h/w + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + reg.conv_opd0_x_ins0 = p->ins_w; + reg.conv_opd0_y_ins0 = p->ins_h; + reg.conv_opd0_x_ins0_last = p->ins_last_w; + reg.conv_opd0_y_ins0_last = p->ins_last_h; + + reg.opt_opd1_sign = 1; + reg.opt_opd1_int8 = 1; + reg.conv_opd1_x_ins0 = p->dilation_w - 1; + reg.conv_opd1_y_ins0 = p->dilation_h - 1; + if (isMulConst) { + reg.opt_opd1_const = 1; + reg.opt_opd1_sign = p->weight_const.is_signed; + reg.opd1_addr = p->weight_const.val; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + } else { + reg.opd1_addr = p->weight->start_address; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + } + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + + ASSERT(p->chl_quan_param->shape.n == 1); + ASSERT(p->chl_quan_param->shape.c == p->ofmap->shape.c); + ASSERT(p->chl_quan_param->shape.h == 1); + ASSERT(p->chl_quan_param->shape.w == 1); + reg.opt_chl_quan = 1; + reg.opt_right_shift = 0; // useless + reg.opd2_addr = p->chl_quan_param->start_address; + reg.opd2_n = p->chl_quan_param->shape.n; + reg.opd2_c = p->chl_quan_param->shape.c; + reg.opd2_h = p->chl_quan_param->shape.h; + reg.opd2_w = p->chl_quan_param->shape.w; + reg.opt_opd2_int8 = 1; // useless, force to 1 to skip b_stride check + reg.short_opd2_str = 2; // useless + reg.opd2_b_str = 0; // useless + + if (p->has_bias) { + reg.tsk_opd_num = 3; + reg.opt_opd2_sign = 
1; + } + + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1880v2/tiu_element_wise_add.c b/cvikernel/src/bm1880v2/tiu_element_wise_add.c new file mode 100644 index 000000000..cb1dddc9e --- /dev/null +++ b/cvikernel/src/bm1880v2/tiu_element_wise_add.c @@ -0,0 +1,79 @@ +#include "kernel_1880v2.h" + +bmk1880v2_op_t * bmk1880v2_tiu_element_wise_add( + ctx_t *k, + const bmk1880v2_tiu_element_wise_add_param_t *p) +{ + int bf16_enable = (p->a_low->fmt == FMT_BF16) ? 1 : 0; + + if (bf16_enable) { + /*bf16 only support 16 bit*/ + ASSERT(!p->a_high); + ASSERT(!(p->b_high && !p->b_is_const)); + ASSERT(!p->res_high); + check_tiu_tensor(p->a_low); + check_tiu_tensor(p->res_low); + assert_same_shape(p->res_low, p->a_low); + if (!p->b_is_const) { + check_tiu_tensor(p->b_low); + assert_same_shape(p->res_low, p->b_low); + } + } else { + check_16bit_tiu_tensor(p->a_low, p->a_high); + check_tiu_tensor(p->res_low); + assert_same_shape(p->res_low, p->a_low); + if (!p->b_is_const) { + check_16bit_tiu_tensor(p->b_low, p->b_high); + assert_same_shape(p->res_low, p->b_low); + } + } + if (p->res_high) + check_16bit_tiu_tensor(p->res_low, p->res_high); + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_ADD_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 1: 0; + reg.opt_right_shift = 0; + reg.opt_relu = p->relu_enable; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + int arith_shift = tensor_is_signed(p->res_low); + reg.opt_shift_typ = arith_shift; + reg.opt_right_shift = p->rshift_bits; + + reg.opd0_addr = p->a_low->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a_low); + reg.opt_opd0_int8 = (p->a_high == NULL); + reg.opd0_b_str = bf16_enable ? 0 : (p->a_high->start_address - p->a_low->start_address); + fill_opd0_stride(®, &p->a_low->stride); + + reg.opt_opd1_int8 = bf16_enable ? 1 : 0; //(p->b_high == NULL); b_high is the same as b_val + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opt_opd1_sign = !!p->b_const.is_signed; + reg.opd1_addr = p->b_const.val; + } else { + reg.opt_opd1_const = 0; + reg.opt_opd1_sign = tensor_is_signed(p->b_low); + reg.opd1_addr = p->b_low->start_address; + reg.opd1_b_str = bf16_enable ? 
0 : (p->b_high->start_address - p->b_low->start_address); + fill_opd1_stride(®, &p->b_low->stride); + } + + reg.res0_addr = p->res_low->start_address; + reg.opt_res0_sign = tensor_is_signed(p->res_low); + reg.opt_res0_int8 = (p->res_high == NULL); + fill_res0_stride(®, &p->res_low->stride); + if (p->res_high) + reg.res0_b_str = p->res_high->start_address - p->res_low->start_address; + if (p->relu_enable) + ASSERT(reg.opt_res0_int8); + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + return emit_tiu_cmdbuf(k, ®); +} diff --git a/cvikernel/src/bm1880v2/tiu_element_wise_and.c b/cvikernel/src/bm1880v2/tiu_element_wise_and.c new file mode 100644 index 000000000..305443fd0 --- /dev/null +++ b/cvikernel/src/bm1880v2/tiu_element_wise_and.c @@ -0,0 +1,100 @@ +#include "kernel_1880v2.h" + +bmk1880v2_op_t * bmk1880v2_tiu_element_wise_and_int8( + ctx_t *ctx, + const bmk1880v2_tiu_element_wise_and_int8_param_t *p) +{ + check_tiu_tensor_3(p->res, p->a, p->b); + assert_same_shape_3(p->res, p->a, p->b); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_AND_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_right_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = 0; + reg.opt_opd0_int8 = 1; + fill_opd0_stride(®, &p->a->stride); + + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = 0; + reg.opt_opd1_int8 = 1; + fill_opd1_stride(®, &p->b->stride); + + reg.res0_addr = p->res->start_address; + reg.opt_res0_sign = 0; + reg.opt_res0_int8 = 1; + fill_res0_stride(®, &p->res->stride); + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} + +bmk1880v2_op_t * bmk1880v2_tiu_element_wise_and_int16( + ctx_t *ctx, + const bmk1880v2_tiu_element_wise_and_int16_param_t *p) +{ + check_16bit_tiu_tensor(p->a_low, p->a_high); + check_16bit_tiu_tensor(p->b_low, p->b_high); + check_16bit_tiu_tensor(p->res_low, p->res_high); + assert_same_shape_3(p->res_low, p->a_low, p->b_low); + + int res_high_addr = p->res_high->start_address; + int res_low_addr = p->res_low->start_address; + ASSERT(res_high_addr > res_low_addr); + int res_b_stride = res_high_addr - res_low_addr; + + int a_high_addr = p->a_high->start_address; + int a_low_addr = p->a_low->start_address; + ASSERT(a_high_addr > a_low_addr); + int a_b_stride = a_high_addr - a_low_addr; + + int b_high_addr = p->b_high->start_address; + int b_low_addr = p->b_low->start_address; + ASSERT(b_high_addr > b_low_addr); + int b_b_stride = b_high_addr - b_low_addr; + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_AND_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_right_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu = 0; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = a_low_addr; + reg.opt_opd0_sign = 0; + reg.opt_opd0_int8 = 0; + reg.opd0_b_str = a_b_stride; + fill_opd0_stride(®, &p->a_low->stride); + + reg.opd1_addr = b_low_addr; + reg.opt_opd1_sign = 0; + reg.opt_opd1_int8 = 0; + reg.opd1_b_str = b_b_stride; + fill_opd1_stride(®, &p->b_low->stride); + + reg.res0_addr = res_low_addr; + reg.opt_res0_sign = 0; + reg.opt_res0_int8 = 0; + reg.res0_b_str = res_b_stride; + fill_res0_stride(®, &p->res_low->stride); + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git 
a/cvikernel/src/bm1880v2/tiu_element_wise_copy.c b/cvikernel/src/bm1880v2/tiu_element_wise_copy.c new file mode 100644 index 000000000..bd71e6436 --- /dev/null +++ b/cvikernel/src/bm1880v2/tiu_element_wise_copy.c @@ -0,0 +1,42 @@ +#include "kernel_1880v2.h" + +bmk1880v2_op_t * bmk1880v2_tiu_element_wise_copy( + ctx_t *ctx, + const bmk1880v2_tiu_element_wise_copy_param_t *p) +{ + int bf16_enable = (p->src->fmt == FMT_BF16) ? 1 : 0; + + check_tiu_tensor_2(p->dst, p->src); + assert_same_shape(p->dst, p->src); + assert_stride_range(p->dst->stride); + assert_stride_range(p->src->stride); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_COPY_FIX8B; + reg.tsk_opd_num = 1; + reg.opd_typ = bf16_enable ? 1: 0; + reg.opt_right_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu = 0; + fill_same_tensor_shape(®, p->dst->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->src->start_address; + reg.opt_opd0_sign = 0; + reg.opt_opd0_int8 = 1; + fill_opd0_stride(®, &p->src->stride); + + reg.res0_addr = p->dst->start_address; + reg.opt_res0_sign = 0; + reg.opt_res0_int8 = 1; + fill_res0_stride(®, &p->dst->stride); + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1880v2/tiu_element_wise_mac.c b/cvikernel/src/bm1880v2/tiu_element_wise_mac.c new file mode 100644 index 000000000..a1b99d9aa --- /dev/null +++ b/cvikernel/src/bm1880v2/tiu_element_wise_mac.c @@ -0,0 +1,67 @@ +#include "kernel_1880v2.h" + +bmk1880v2_op_t * bmk1880v2_tiu_element_wise_mac( + ctx_t *ctx, + const bmk1880v2_tiu_element_wise_mac_param_t *p) +{ + int bf16_enable = (p->a->fmt == FMT_BF16) ? 1 : 0; + + check_tiu_tensor(p->a); + assert_same_shape(p->res_low, p->a); + if (!bf16_enable) { + check_16bit_tiu_tensor(p->res_low, p->res_high); + ASSERT(p->lshift_bits < 32); + ASSERT(p->rshift_bits < 16); + } + if (!p->b_is_const) { + check_tiu_tensor(p->b); + assert_same_shape(p->res_low, p->b); + } + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_MAC_FIX8B; + reg.opt_res_add = 1; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 1: 0; + reg.opt_left_shift = p->lshift_bits; + reg.opt_relu = p->relu_enable; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + int arith_shift = tensor_is_signed(p->res_low); + reg.opt_shift_typ = arith_shift; + reg.opt_right_shift = p->rshift_bits; + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = bf16_enable ? p->b_const.val : (p->b_const.val & 0xFF); + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->res_low->start_address; + reg.opt_res0_sign = tensor_is_signed(p->res_low); + reg.opt_res0_int8 = bf16_enable ? 1 : !!p->res_is_int8; + fill_res0_stride(®, &p->res_low->stride); + reg.res0_b_str = bf16_enable ? 
0 : (p->res_high->start_address - p->res_low->start_address); + + if (p->relu_enable) + ASSERT(reg.opt_res0_int8); + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1880v2/tiu_element_wise_max.c b/cvikernel/src/bm1880v2/tiu_element_wise_max.c new file mode 100644 index 000000000..5717b9f4e --- /dev/null +++ b/cvikernel/src/bm1880v2/tiu_element_wise_max.c @@ -0,0 +1,56 @@ +#include "kernel_1880v2.h" + +bmk1880v2_op_t * bmk1880v2_tiu_element_wise_max( + ctx_t *ctx, + const bmk1880v2_tiu_element_wise_max_param_t *p) +{ + int bf16_enable = (p->a->fmt == FMT_BF16) ? 1 : 0; + + check_tiu_tensor_2(p->max, p->a); + assert_same_shape(p->max, p->a); + + if (p->b_is_const && !bf16_enable) { + if (tensor_is_signed(p->a)) + ASSERT(p->b_const.is_signed); + else + ASSERT(!p->b_const.is_signed); + } else if (!p->b_is_const) { + check_tiu_tensor(p->b); + assert_same_shape(p->max, p->b); + ASSERT(p->a->fmt == p->b->fmt); + } + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_MAX_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 1: 0; + reg.opt_right_shift = 0; + reg.opt_relu = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = bf16_enable ? p->b_const.val : (p->b_const.val & 0xFF); + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->max->start_address; + reg.opt_res0_sign = tensor_is_signed(p->max); + fill_res0_stride(®, &p->max->stride); + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1880v2/tiu_element_wise_min.c b/cvikernel/src/bm1880v2/tiu_element_wise_min.c new file mode 100644 index 000000000..ee29a5ed7 --- /dev/null +++ b/cvikernel/src/bm1880v2/tiu_element_wise_min.c @@ -0,0 +1,58 @@ +#include "kernel_1880v2.h" + +bmk1880v2_op_t * bmk1880v2_tiu_element_wise_min( + ctx_t *ctx, + const bmk1880v2_tiu_element_wise_min_param_t *p) +{ + int bf16_enable = (p->a->fmt == FMT_BF16) ? 1 : 0; + + check_tiu_tensor_2(p->min, p->a); + assert_same_shape(p->min, p->a); + if (p->b_is_const && !bf16_enable) { + if (tensor_is_signed(p->a)) + ASSERT(p->b_const.is_signed); + else + ASSERT(!p->b_const.is_signed); + } else if (!p->b_is_const) { + check_tiu_tensor(p->b); + assert_same_shape(p->min, p->b); + ASSERT(p->a->fmt == p->b->fmt); + } + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_MIN_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 1: 0; + reg.opt_right_shift = 0; + reg.opt_relu = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = bf16_enable ? 
p->b_const.val : (p->b_const.val & 0xFF); + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->min->start_address; + reg.opt_res0_sign = tensor_is_signed(p->min); + fill_res0_stride(®, &p->min->stride); + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1880v2/tiu_element_wise_mul.c b/cvikernel/src/bm1880v2/tiu_element_wise_mul.c new file mode 100644 index 000000000..de7a0800c --- /dev/null +++ b/cvikernel/src/bm1880v2/tiu_element_wise_mul.c @@ -0,0 +1,66 @@ +#include "kernel_1880v2.h" + +bmk1880v2_op_t * bmk1880v2_tiu_element_wise_mul( + ctx_t *ctx, + const bmk1880v2_tiu_element_wise_mul_param_t *p) +{ + int bf16_enable = (p->a->fmt == FMT_BF16) ? 1 : 0; + + check_tiu_tensor_2(p->res_low, p->a); + assert_same_shape(p->res_low, p->a); + if (!p->b_is_const) { + check_tiu_tensor(p->b); + assert_same_shape(p->res_low, p->b); + } + if (p->res_high) + check_16bit_tiu_tensor(p->res_low, p->res_high); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_MUL_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 1: 0; + int arith_shift = tensor_is_signed(p->res_low); + reg.opt_shift_typ = arith_shift; + reg.opt_right_shift = p->rshift_bits; + reg.opt_relu = p->relu_enable; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = bf16_enable ? 
p->b_const.val : (p->b_const.val & 0xFF); + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->res_low->start_address; + reg.opt_res0_sign = tensor_is_signed(p->res_low); + reg.opt_res0_int8 = (p->res_high == NULL); + fill_res0_stride(®, &p->res_low->stride); + if (p->res_high) + reg.res0_b_str = (p->res_high->start_address - p->res_low->start_address); + if (p->relu_enable) + ASSERT(reg.opt_res0_int8); + + ASSERT(( + p->b_is_const || (!reg.opt_opd1_sign && !reg.opt_opd0_sign && !reg.opt_shift_typ) || + ((reg.opt_opd1_sign || reg.opt_opd0_sign) && reg.opt_shift_typ) + )); + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1880v2/tiu_element_wise_mul_qdm.c b/cvikernel/src/bm1880v2/tiu_element_wise_mul_qdm.c new file mode 100644 index 000000000..95c0cdd7a --- /dev/null +++ b/cvikernel/src/bm1880v2/tiu_element_wise_mul_qdm.c @@ -0,0 +1,66 @@ +#include "kernel_1880v2.h" + +bmk1880v2_op_t * bmk1880v2_tiu_element_wise_mul_qdm( + ctx_t *ctx, + const bmk1880v2_tiu_element_wise_mul_qdm_param_t *p) +{ + check_tiu_tensor_2(p->res_low, p->a); + assert_same_shape(p->res_low, p->a); + if (!p->b_is_const) { + check_tiu_tensor(p->b); + assert_same_shape(p->res_low, p->b); + } + if (p->res_high) + check_16bit_tiu_tensor(p->res_low, p->res_high); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_MUL_FIX8B; + reg.tsk_opd_num = 2; + int arith_shift = tensor_is_signed(p->res_low); + reg.opt_shift_typ = arith_shift; + reg.opt_right_shift = p->rshift_bits; + reg.opt_relu = p->relu_enable; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = p->b_const.val; + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->res_low->start_address; + reg.opt_res0_sign = tensor_is_signed(p->res_low); + reg.opt_res0_int8 = (p->res_high == NULL); + fill_res0_stride(®, &p->res_low->stride); + if (p->res_high) + reg.res0_b_str = p->res_high->start_address - p->res_low->start_address; + if (p->relu_enable) + ASSERT(reg.opt_res0_int8); + + ASSERT(( + (!reg.opt_opd1_sign && !reg.opt_opd0_sign && !reg.opt_shift_typ) || + ((reg.opt_opd1_sign || reg.opt_opd0_sign) && reg.opt_shift_typ) + )); + + reg.opt_chl_quan = 1; + reg.quan_m = p->multiplier; + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1880v2/tiu_element_wise_or.c b/cvikernel/src/bm1880v2/tiu_element_wise_or.c new file mode 100644 index 000000000..e8a401768 --- /dev/null +++ b/cvikernel/src/bm1880v2/tiu_element_wise_or.c @@ -0,0 +1,100 @@ +#include "kernel_1880v2.h" + +bmk1880v2_op_t * bmk1880v2_tiu_element_wise_or_int8( + ctx_t *ctx, + const bmk1880v2_tiu_element_wise_or_int8_param_t *p) +{ + check_tiu_tensor_3(p->res, p->a, p->b); + assert_same_shape_3(p->res, p->a, p->b); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + 
reg.tsk_eu_typ = TENSOR_OR_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_right_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = 0; + reg.opt_opd0_int8 = 1; + fill_opd0_stride(®, &p->a->stride); + + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = 0; + reg.opt_opd1_int8 = 1; + fill_opd1_stride(®, &p->b->stride); + + reg.res0_addr = p->res->start_address; + reg.opt_res0_sign = 0; + reg.opt_res0_int8 = 1; + fill_res0_stride(®, &p->res->stride); + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} + +bmk1880v2_op_t * bmk1880v2_tiu_element_wise_or_int16( + ctx_t *ctx, + const bmk1880v2_tiu_element_wise_or_int16_param_t *p) +{ + check_16bit_tiu_tensor(p->a_low, p->a_high); + check_16bit_tiu_tensor(p->b_low, p->b_high); + check_16bit_tiu_tensor(p->res_low, p->res_high); + assert_same_shape_3(p->res_low, p->a_low, p->b_low); + + int res_high_addr = p->res_high->start_address; + int res_low_addr = p->res_low->start_address; + ASSERT(res_high_addr > res_low_addr); + int res_b_stride = res_high_addr - res_low_addr; + + int a_high_addr = p->a_high->start_address; + int a_low_addr = p->a_low->start_address; + ASSERT(a_high_addr > a_low_addr); + int a_b_stride = a_high_addr - a_low_addr; + + int b_high_addr = p->b_high->start_address; + int b_low_addr = p->b_low->start_address; + ASSERT(b_high_addr > b_low_addr); + int b_b_stride = b_high_addr - b_low_addr; + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_OR_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_right_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu = 0; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = a_low_addr; + reg.opt_opd0_sign = 0; + reg.opt_opd0_int8 = 0; + reg.opd0_b_str = a_b_stride; + fill_opd0_stride(®, &p->a_low->stride); + + reg.opd1_addr = b_low_addr; + reg.opt_opd1_sign = 0; + reg.opt_opd1_int8 = 0; + reg.opd1_b_str = b_b_stride; + fill_opd1_stride(®, &p->b_low->stride); + + reg.res0_addr = res_low_addr; + reg.opt_res0_sign = 0; + reg.opt_res0_int8 = 0; + reg.res0_b_str = res_b_stride; + fill_res0_stride(®, &p->res_low->stride); + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1880v2/tiu_element_wise_shift.c b/cvikernel/src/bm1880v2/tiu_element_wise_shift.c new file mode 100644 index 000000000..081e779f0 --- /dev/null +++ b/cvikernel/src/bm1880v2/tiu_element_wise_shift.c @@ -0,0 +1,58 @@ +#include "kernel_1880v2.h" + +bmk1880v2_op_t * bmk1880v2_tiu_element_wise_arith_shift( + ctx_t *ctx, + const bmk1880v2_tiu_element_wise_arith_shift_param_t *p) +{ + check_16bit_tiu_tensor(p->a_low, p->a_high); + check_16bit_tiu_tensor(p->res_low, p->res_high); + check_tiu_tensor(p->bits); + assert_same_shape_3(p->res_low, p->a_low, p->bits); + ASSERT(tensor_is_signed(p->a_low)); + ASSERT(tensor_is_signed(p->bits)); + + int res_high_addr = p->res_high->start_address; + int res_low_addr = p->res_low->start_address; + ASSERT(res_high_addr > res_low_addr); + int res_b_stride = res_high_addr - res_low_addr; + + int a_high_addr = p->a_high->start_address; + int a_low_addr = p->a_low->start_address; + ASSERT(a_high_addr > a_low_addr); + int a_b_stride = a_high_addr - a_low_addr; + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = 
TENSOR_SHIFT_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_right_shift = 0; + reg.opt_rshift_typ = 0; + reg.opt_relu = 0; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = a_low_addr; + reg.opt_opd0_sign = 1; + reg.opt_opd0_int8 = 0; + reg.opd0_b_str = a_b_stride; + fill_opd0_stride(®, &p->a_low->stride); + + reg.opd1_addr = p->bits->start_address; + reg.opt_opd1_sign = 1; + reg.opt_opd1_int8 = 1; + fill_opd1_stride(®, &p->bits->stride); + + reg.res0_addr = res_low_addr; + reg.opt_res0_sign = 1; + reg.opt_res0_int8 = 0; + reg.res0_b_str = res_b_stride; + fill_res0_stride(®, &p->res_low->stride); + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1880v2/tiu_element_wise_sub.c b/cvikernel/src/bm1880v2/tiu_element_wise_sub.c new file mode 100644 index 000000000..6f19a57f4 --- /dev/null +++ b/cvikernel/src/bm1880v2/tiu_element_wise_sub.c @@ -0,0 +1,68 @@ +#include "kernel_1880v2.h" + +bmk1880v2_op_t * bmk1880v2_tiu_element_wise_sub( + ctx_t *ctx, + const bmk1880v2_tiu_element_wise_sub_param_t *p) +{ + int bf16_enable = (p->a_low->fmt == FMT_BF16) ? 1 : 0; + + if (bf16_enable) { + /*bf16 only support 16 bit*/ + ASSERT(!p->a_high); + ASSERT(!p->b_high); + ASSERT(!p->res_high); + check_tiu_tensor(p->a_low); + check_tiu_tensor(p->b_low); + check_tiu_tensor(p->res_low); + assert_same_shape_3(p->res_low, p->a_low, p->b_low); + } else { + check_16bit_tiu_tensor(p->a_low, p->a_high); + check_16bit_tiu_tensor(p->b_low, p->b_high); + check_tiu_tensor(p->res_low); + assert_same_shape_3(p->res_low, p->a_low, p->b_low); + ASSERT(tensor_is_signed(p->res_low)); + } + if (p->res_high) + check_16bit_tiu_tensor(p->res_low, p->res_high); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_SUB_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 1: 0; + reg.opt_right_shift = 0; + reg.opt_relu = 0; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + int arith_shift = tensor_is_signed(p->res_low); + reg.opt_shift_typ = arith_shift; + reg.opt_right_shift = p->rshift_bits; + + reg.opd0_addr = p->a_low->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a_low); + reg.opt_opd0_int8 = (p->a_high == NULL); + reg.opd0_b_str = bf16_enable ? 0 : (p->a_high->start_address - p->a_low->start_address); + fill_opd0_stride(®, &p->a_low->stride); + + reg.opd1_addr = p->b_low->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b_low);; + reg.opt_opd1_int8 = (p->b_high == NULL); + reg.opd1_b_str = bf16_enable ? 0 : (p->b_high->start_address - p->b_low->start_address); + fill_opd1_stride(®, &p->b_low->stride); + + reg.res0_addr = p->res_low->start_address; + reg.opt_res0_sign = 1; + reg.opt_res0_int8 = (p->res_high == NULL); + fill_res0_stride(®, &p->res_low->stride); + if (p->res_high) + reg.res0_b_str = bf16_enable ? 
0 : (p->res_high->start_address - p->res_low->start_address); + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1880v2/tiu_element_wise_xor.c b/cvikernel/src/bm1880v2/tiu_element_wise_xor.c new file mode 100644 index 000000000..9ae625ed0 --- /dev/null +++ b/cvikernel/src/bm1880v2/tiu_element_wise_xor.c @@ -0,0 +1,100 @@ +#include "kernel_1880v2.h" + +bmk1880v2_op_t * bmk1880v2_tiu_element_wise_xor_int8( + ctx_t *ctx, + const bmk1880v2_tiu_element_wise_xor_int8_param_t *p) +{ + check_tiu_tensor_3(p->res, p->a, p->b); + assert_same_shape_3(p->res, p->a, p->b); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_XOR_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_right_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = 0; + reg.opt_opd0_int8 = 1; + fill_opd0_stride(®, &p->a->stride); + + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = 0; + reg.opt_opd1_int8 = 1; + fill_opd1_stride(®, &p->b->stride); + + reg.res0_addr = p->res->start_address; + reg.opt_res0_sign = 0; + reg.opt_res0_int8 = 1; + fill_res0_stride(®, &p->res->stride); + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} + +bmk1880v2_op_t * bmk1880v2_tiu_element_wise_xor_int16( + ctx_t *ctx, + const bmk1880v2_tiu_element_wise_xor_int16_param_t *p) +{ + check_16bit_tiu_tensor(p->a_low, p->a_high); + check_16bit_tiu_tensor(p->b_low, p->b_high); + check_16bit_tiu_tensor(p->res_low, p->res_high); + assert_same_shape_3(p->res_low, p->a_low, p->b_low); + + int res_high_addr = p->res_high->start_address; + int res_low_addr = p->res_low->start_address; + ASSERT(res_high_addr > res_low_addr); + int res_b_stride = res_high_addr - res_low_addr; + + int a_high_addr = p->a_high->start_address; + int a_low_addr = p->a_low->start_address; + ASSERT(a_high_addr > a_low_addr); + int a_b_stride = a_high_addr - a_low_addr; + + int b_high_addr = p->b_high->start_address; + int b_low_addr = p->b_low->start_address; + ASSERT(b_high_addr > b_low_addr); + int b_b_stride = b_high_addr - b_low_addr; + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_XOR_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_right_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu = 0; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = a_low_addr; + reg.opt_opd0_sign = 0; + reg.opt_opd0_int8 = 0; + reg.opd0_b_str = a_b_stride; + fill_opd0_stride(®, &p->a_low->stride); + + reg.opd1_addr = b_low_addr; + reg.opt_opd1_sign = 0; + reg.opt_opd1_int8 = 0; + reg.opd1_b_str = b_b_stride; + fill_opd1_stride(®, &p->b_low->stride); + + reg.res0_addr = res_low_addr; + reg.opt_res0_sign = 0; + reg.opt_res0_int8 = 0; + reg.res0_b_str = res_b_stride; + fill_res0_stride(®, &p->res_low->stride); + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1880v2/tiu_lookup_table.c b/cvikernel/src/bm1880v2/tiu_lookup_table.c new file mode 100644 index 000000000..983aa1517 --- /dev/null +++ b/cvikernel/src/bm1880v2/tiu_lookup_table.c @@ -0,0 +1,113 @@ +#include "kernel_1880v2.h" + +bmk1880v2_op_t * bmk1880v2_tiu_lookup_table( + ctx_t *ctx, + const bmk1880v2_tiu_lookup_table_param_t *p) +{ + uint32_t eu_num = 
ctx->chip_info.eu_num; + uint32_t npu_num = ctx->chip_info.npu_num; + + check_tiu_tensor_3(p->ofmap, p->ifmap, p->table); + assert_stride_type_0(ctx, p->ofmap); + assert_stride_type_0(ctx, p->ifmap); + assert_stride_type_0(ctx, p->table); + + uint8_t is_bf16 = (p->ofmap->fmt == FMT_BF16 && p->ifmap->fmt == FMT_BF16); + + ASSERT(p->table->shape.n == 1); + ASSERT(p->table->shape.c == npu_num); + + if (is_bf16) { + ASSERT(p->table->shape.h == 32); + ASSERT(p->table->shape.w == 8); + } + else { + ASSERT(p->table->shape.h == 16); + ASSERT(p->table->shape.w == 16); + } + + ASSERT(p->ifmap->start_address % eu_num == 0); + ASSERT(p->ofmap->start_address % eu_num == 0); + ASSERT(p->table->start_address % eu_num == 0); + + // fmt MUST be same under bf16 + if (p->ofmap->fmt == FMT_BF16) { + ASSERT(p->ifmap->fmt == FMT_BF16); + } + ASSERT(p->ofmap->fmt == FMT_I8 || p->ofmap->fmt == FMT_U8 || p->ofmap->fmt == FMT_BF16); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tens_lookup = 1; + reg.tsk_opd_num = 2; + reg.opt_shift_typ = 0; + reg.opt_right_shift = 0; + reg.opt_relu = 0; + reg.opd_typ = is_bf16; + + reg.res0_addr = p->ofmap->start_address; + if (is_bf16) { + reg.opt_res0_sign = 1; + reg.opt_res0_int8 = 1; + } + else { + reg.opt_res0_sign = 0; + reg.opt_res0_int8 = 1; + } + + // ifmap->shape.n == p->ofmap->shape.n); + ASSERT(p->ifmap->shape.c == p->ofmap->shape.c); + ASSERT(p->ifmap->shape.h == p->ofmap->shape.h); + ASSERT(p->ifmap->shape.w == p->ofmap->shape.w); + + reg.res0_n = p->ifmap->shape.n; + reg.res0_c = p->ifmap->shape.c; + reg.res0_h = p->ifmap->shape.h; + reg.res0_w = p->ifmap->shape.w; + reg.short_res0_str = 0; + + reg.opd0_addr = p->ifmap->start_address; + if (is_bf16) { + reg.opt_opd0_sign = 1; + reg.opt_opd0_int8 = 1; + } + else { + reg.opt_opd0_sign = 0; + reg.opt_opd0_int8 = 1; + } + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.short_opd0_str = 0; + + reg.opd1_addr = p->table->start_address; + if (is_bf16) { + reg.opt_opd1_sign = 1; + reg.opt_opd1_int8 = 1; + } + else { + reg.opt_opd1_sign = 0; + reg.opt_opd1_int8 = 1; + } + reg.opd1_n = p->table->shape.n; + reg.opd1_c = p->table->shape.c; + reg.opd1_h = p->table->shape.h; + reg.opd1_w = p->table->shape.w; + reg.short_opd1_str = 0; + + if (is_bf16) { + reg.opt_opd2_int8 = 1; // hw check + reg.tsk_eu_typ = 12; // 12 means lut + // dont care once short_xxx_str set to 0 + } + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + //trace_tiu_reg(®, __FUNCTION__); + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1880v2/tiu_matrix_multiplication.c b/cvikernel/src/bm1880v2/tiu_matrix_multiplication.c new file mode 100644 index 000000000..2f97e542a --- /dev/null +++ b/cvikernel/src/bm1880v2/tiu_matrix_multiplication.c @@ -0,0 +1,149 @@ +#include "kernel_1880v2.h" + +typedef bmk1880v2_tiu_matrix_multiplication_param_t param_t; + +static void check_matrix(ctx_t *ctx, const ml_t *m) +{ + bmk1880v2_tensor_lmem_t t; + t.start_address = m->start_address; + t.fmt = m->fmt; + t.shape.n = m->shape.n; + t.shape.c = m->shape.c; + t.shape.h = 1; + t.shape.w = m->shape.w; + t.stride.n = m->stride.n; + t.stride.c = m->stride.c; + t.stride.h = m->stride.h; + t.stride.w = 1 * (m->fmt == FMT_BF16 ? 
2 : 1); + + check_tiu_tensor(&t); + assert_stride_type_0(ctx, &t); + + uint32_t eu_num = ctx->chip_info.eu_num; + ASSERT(m->start_address % eu_num == 0); +} + +static int is_arith_shift(const param_t *p) +{ + if (p->left->fmt == FMT_I8) + return 1; + if (p->right->fmt == FMT_I8) + return 1; + if (p->bias && p->bias->fmt == FMT_I8) + return 1; + + return 0; +} + +bmk1880v2_op_t * bmk1880v2_tiu_matrix_multiplication(ctx_t *ctx, const param_t *p) +{ + const bmk1880v2_matrix_lmem_t *res = p->res; + const bmk1880v2_matrix_lmem_t *left = p->left; + const bmk1880v2_matrix_lmem_t *right = p->right; + const bmk1880v2_matrix_lmem_t *bias = p->bias; + int bf16_enable = (p->res->fmt == FMT_BF16) ? 1 : 0; + + check_matrix(ctx, res); + check_matrix(ctx, left); + check_matrix(ctx, right); + if (bias) + check_matrix(ctx, bias); + + ASSERT(p->lshift_bits < 32); + if (bf16_enable) /* bf16 does not support add_result*/ + ASSERT(!p->add_result); + else + ASSERT(!(p->relu_enable && p->add_result)); + + if(p->ps32_mode & 0x2) + { + ASSERT(!p->relu_enable); + ASSERT(!p->bias); + ASSERT(!p->rshift_bits); + } + + uint32_t left_row = left->shape.n; + uint32_t left_col = left->shape.col; + uint32_t right_row = right->shape.n; + uint32_t right_col = right->shape.col; + uint32_t res_row = res->shape.n; + uint32_t res_col = res->shape.col; + ASSERT(left_col == right_row); + ASSERT(res_col == right_col); + + if(p->ps32_mode) + { + ASSERT(!p->add_result); + } else if ((p->add_result || !p->res_is_int8) && !bf16_enable) { + ASSERT(res_row == left_row * 2); + res_row = left_row; + } else { + ASSERT(res_row == left_row); + } + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_FC_FIX8B; + reg.tsk_opd_num = bias? 3: 2; + reg.opd_typ = bf16_enable ? 1 : 0; + reg.opt_shift_typ = is_arith_shift(p); + reg.opt_right_shift = p->rshift_bits; + reg.opt_left_shift = p->lshift_bits; + reg.opt_relu = p->relu_enable; + reg.opt_res_add = p->add_result; + + reg.res0_addr = res->start_address; + reg.opt_res0_int8 = (bf16_enable ? 
1 : p->res_is_int8); + + reg.opt_res0_sign = matrix_is_signed(res); + reg.res0_n = res_row; + reg.res0_c = res->shape.c; + reg.res0_h = 1; + reg.res0_w = res->shape.w; + reg.short_res0_str = 0; // stride, b_stride calculated by H/W + + reg.opd0_addr = left->start_address; + reg.opt_opd0_int8 = 1; + reg.opt_opd0_sign = (left->fmt == FMT_I8); + reg.opd0_n = left_row; + reg.opd0_c = left->shape.c; + reg.opd0_h = 1; + reg.opd0_w = left->shape.w; + reg.short_opd0_str = 0; + + reg.opd1_addr = right->start_address; + reg.opt_opd1_int8 = 1; + reg.opt_opd1_sign = (right->fmt == FMT_I8); + reg.opd1_n = right_row; + reg.opd1_c = right->shape.c; + reg.opd1_h = 1; + reg.opd1_w = left_col - left->shape.w * (left->shape.c - 1); + reg.short_opd1_str = 0; + + reg.ps32_md = p->ps32_mode; + if (p->ps32_mode > 0) + reg.res0_b_str = p->res->shape.n * p->res->stride.n; + if(reg.opd0_c == 1) + ASSERT(reg.opd0_w == reg.opd1_w); + + if (bias) { + ASSERT(bias->shape.n == 2); + ASSERT(bias->shape.c == right->shape.c); + ASSERT(bias->shape.w == right->shape.w); + ASSERT(bias->shape.col == right->shape.col); + + reg.opd2_addr = bias->start_address; + reg.opt_opd2_int8 = 0; + reg.opt_opd2_sign = (bias->fmt == FMT_I8); + reg.opd2_n = 1; + reg.opd2_c = bias->shape.c; + reg.opd2_h = 1; + reg.opd2_w = bias->shape.w; + reg.short_opd2_str = 0; + } + + reg.layer_info = p->layer_id; + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1880v2/tiu_matrix_multiplication_qdm.c b/cvikernel/src/bm1880v2/tiu_matrix_multiplication_qdm.c new file mode 100644 index 000000000..307a6189a --- /dev/null +++ b/cvikernel/src/bm1880v2/tiu_matrix_multiplication_qdm.c @@ -0,0 +1,150 @@ +#include "kernel_1880v2.h" + +typedef bmk1880v2_tiu_matrix_multiplication_qdm_param_t param_t; + +static void check_matrix(ctx_t *ctx, const ml_t *m) +{ + bmk1880v2_tensor_lmem_t t; + t.start_address = m->start_address; + t.fmt = m->fmt; + t.shape.n = m->shape.n; + t.shape.c = m->shape.c; + t.shape.h = 1; + t.shape.w = m->shape.w; + t.stride.n = m->stride.n; + t.stride.c = m->stride.c; + t.stride.h = m->stride.h; + t.stride.w = 1; + + check_tiu_tensor(&t); + assert_stride_type_0(ctx, &t); + + uint32_t eu_num = ctx->chip_info.eu_num; + ASSERT(m->start_address % eu_num == 0); +} + +static int is_arith_shift(const param_t *p) +{ + if (p->left->fmt == FMT_I8) + return 1; + if (p->right->fmt == FMT_I8) + return 1; + if (p->bias && p->bias->fmt == FMT_I8) + return 1; + + return 0; +} + +bmk1880v2_op_t * bmk1880v2_tiu_matrix_multiplication_qdm(ctx_t *ctx, const param_t *p) +{ + const bmk1880v2_matrix_lmem_t *res = p->res; + const bmk1880v2_matrix_lmem_t *left = p->left; + const bmk1880v2_matrix_lmem_t *right = p->right; + const bmk1880v2_matrix_lmem_t *bias = p->bias; + + check_matrix(ctx, res); + check_matrix(ctx, left); + check_matrix(ctx, right); + if (bias) + check_matrix(ctx, bias); + + ASSERT(p->lshift_bits < 32); + ASSERT(!(p->relu_enable && p->add_result)); + if(p->ps32_mode & 0x2) + { + ASSERT(!p->relu_enable); + ASSERT(!p->bias); + ASSERT(!p->rshift_bits); + } + + uint32_t left_row = left->shape.n; + uint32_t left_col = left->shape.col; + uint32_t right_row = right->shape.n; + uint32_t right_col = right->shape.col; + uint32_t res_row = res->shape.n; + uint32_t res_col = res->shape.col; + ASSERT(left_col == right_row); + ASSERT(res_col == right_col); + ASSERT(p->res_is_int8 == 1); + + if(p->ps32_mode) + { + ASSERT(!p->add_result); + } + else if (p->add_result) { + ASSERT(res_row == left_row * 2); + res_row = left_row; + } else { + ASSERT(res_row == 
left_row); + } + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_FC_FIX8B; + reg.tsk_opd_num = bias? 3: 2; + reg.opt_shift_typ = is_arith_shift(p); + reg.opt_right_shift = p->rshift_bits; + reg.opt_left_shift = p->lshift_bits; + reg.opt_relu = p->relu_enable; + reg.opt_res_add = p->add_result; + + reg.res0_addr = res->start_address; + reg.opt_res0_int8 = 1; + reg.opt_res0_sign = matrix_is_signed(res); + reg.res0_n = res_row; + reg.res0_c = res->shape.c; + reg.res0_h = 1; + reg.res0_w = res->shape.w; + reg.short_res0_str = 0; // stride, b_stride calculated by H/W + + reg.opd0_addr = left->start_address; + reg.opt_opd0_int8 = 1; + reg.opt_opd0_sign = (left->fmt == FMT_I8); + reg.opd0_n = left_row; + reg.opd0_c = left->shape.c; + reg.opd0_h = 1; + reg.opd0_w = left->shape.w; + reg.short_opd0_str = 0; + + reg.opd1_addr = right->start_address; + reg.opt_opd1_int8 = 1; + reg.opt_opd1_sign = (right->fmt == FMT_I8); + reg.opd1_n = right_row; + reg.opd1_c = right->shape.c; + reg.opd1_h = 1; + reg.opd1_w = left_col - left->shape.w * (left->shape.c - 1); + reg.short_opd1_str = 0; + + reg.ps32_md = p->ps32_mode; + if (p->ps32_mode > 0) + reg.res0_b_str = p->res->shape.n * p->res->stride.n; + if(reg.opd0_c == 1) + ASSERT(reg.opd0_w == reg.opd1_w); + + // Only enable 32-bit multipler at the final post processing stage + reg.opt_chl_quan = ((p->ps32_mode == 0) || (p->ps32_mode == 1)) ? 1 : 0; + reg.quan_m = p->quan_m; + + // 32b bias, determined by b_stride + if (bias) { + ASSERT(bias->shape.n == 4); + ASSERT(bias->shape.c == right->shape.c); + ASSERT(bias->shape.w == right->shape.w); + ASSERT(bias->shape.col == right->shape.col); + + reg.opd2_addr = bias->start_address; + reg.opt_opd2_int8 = 0; + reg.opt_opd2_sign = (bias->fmt == FMT_I8); + reg.opd2_n = 1; + reg.opd2_c = bias->shape.c; + reg.opd2_h = 1; + reg.opd2_w = bias->shape.w; + reg.short_opd2_str = 0; + } + + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1880v2/tiu_max_pooling.c b/cvikernel/src/bm1880v2/tiu_max_pooling.c new file mode 100644 index 000000000..070691a8b --- /dev/null +++ b/cvikernel/src/bm1880v2/tiu_max_pooling.c @@ -0,0 +1,64 @@ +#include "kernel_1880v2.h" + +bmk1880v2_op_t * bmk1880v2_tiu_max_pooling( + ctx_t *ctx, + const bmk1880v2_tiu_max_pooling_param_t *p) +{ + int bf16_enable = (p->ifmap->fmt == FMT_BF16) ? 1 : 0; + + check_tiu_tensor_2(p->ifmap, p->ofmap); + ASSERT(p->kh * p->kw >= 1); + ASSERT(p->ifmap->shape.n == p->ofmap->shape.n); + ASSERT(p->ifmap->shape.c == p->ofmap->shape.c); + if (bf16_enable) { + assert_bf16_stride_type_0(ctx, p->ifmap); + assert_bf16_stride_type_0(ctx, p->ofmap); + } else { + assert_stride_type_0(ctx, p->ifmap); + assert_stride_type_0(ctx, p->ofmap); + } + int opd0_sign = tensor_is_signed(p->ifmap); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B; + reg.tsk_eu_typ = 0; + reg.opt_relu = 0; /* Hardware relu function not validated. */ + reg.opt_right_shift = 0; + reg.opt_shift_typ = opd0_sign; + reg.tsk_opd_num = 1; + reg.opd_typ = bf16_enable ? 1: 0; + + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = opd0_sign; + reg.opt_res0_int8 = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + //reg.opd0_ins_val = bf16_enable ? 0 : (uint32_t)p->ins_val; + reg.opd0_ins_val = (!p->ins_val && opd0_sign) ? 
-128 : p->ins_val; // backend not set yet + reg.opd0_ins_fp = bf16_enable ? p->ins_fp : 0; + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_int8 = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + + reg.opt_opd1_int8 = 1; + reg.opd1_h = p->kh; + reg.opd1_w = p->kw; + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm1880v2/tiu_mdsum.c b/cvikernel/src/bm1880v2/tiu_mdsum.c new file mode 100644 index 000000000..238615dfb --- /dev/null +++ b/cvikernel/src/bm1880v2/tiu_mdsum.c @@ -0,0 +1,60 @@ +#include "kernel_1880v2.h" + +bmk1880v2_op_t * bmk1880v2_tiu_mdsum( + ctx_t *ctx, + const bmk1880v2_tiu_mdsum_param_t *p) +{ + const bmk1880v2_tensor_lmem_t *res = p->res; + const bmk1880v2_tensor_lmem_t *input = p->input; + + check_tiu_tensor_2(res, input); + ASSERT(res->fmt == input->fmt); + if (p->res_is_int8) + ASSERT(res->shape.n == 1); + else + ASSERT(res->shape.n == 2); + ASSERT(res->shape.c == input->shape.c); + ASSERT(res->shape.h == 1); + ASSERT(res->shape.w == 1); + + int res_addr = res->start_address; + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tens_mdsum = 1; + reg.tsk_opd_num = 1; + reg.opt_relu = 0; + + int arith_shift = tensor_is_signed(res); + reg.opt_shift_typ = arith_shift; + reg.opt_right_shift = p->rshift_bits; + + reg.opd0_addr = input->start_address; + reg.opt_opd0_sign = tensor_is_signed(input); + reg.opt_opd0_int8 = 1; + reg.opd0_n = input->shape.n; + reg.opd0_c = input->shape.c; + reg.opd0_h = input->shape.h; + reg.opd0_w = input->shape.w; + reg.opd0_n_str = input->stride.n; + reg.opd0_c_str = input->stride.c; + reg.opd0_h_str = input->stride.h; + reg.opd0_w_str = 1; + + reg.res0_addr = res_addr; + reg.opt_res0_sign = tensor_is_signed(res); + reg.opt_res0_int8 = p->res_is_int8; + reg.res0_n = 1; + reg.res0_c = res->shape.c; + reg.res0_h = 1; + reg.res0_w = 1; + reg.short_res0_str = 0b01; + + /* [15:0] layer id */ + reg.layer_info = p->layer_id; + + return emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/bm_kernel.c b/cvikernel/src/bm_kernel.c new file mode 100644 index 000000000..5b88c3785 --- /dev/null +++ b/cvikernel/src/bm_kernel.c @@ -0,0 +1,67 @@ +#include "kernel_internal.h" + +shape_t shape_t4(int n, int c, int h, int w) +{ + shape_t s; + s.n = n; + s.c = c; + s.h = h; + s.w = w; + s.dim = 4; + return s; +} + +shape_t shape_t3(int c, int h, int w) +{ + shape_t s; + s.n = 1; + s.c = c; + s.h = h; + s.w = w; + s.dim = 3; + return s; +} + +shape_t shape_t2(int row, int col) +{ + shape_t s; + s.n = 1; + s.c = 1; + s.h = row; + s.w = col; + s.dim = 2; + return s; +} + +shape_t shape_t1(int len) +{ + int row = 1, col = len; + while (col >= 65536) { + ASSERT(col % 2 == 0); + col /= 2; + row *= 2; + } + shape_t s = { + .dim = 2, + .n = 1, + .c = 1, + .h = row, + .w = col, + }; + return s; +} + +uint8_t shape_equal(shape_t s1, shape_t s2) +{ + return (s1.dim == s2.dim) && + (s1.n == s2.n) && + (s1.c == s2.c) && + (s1.h == s2.h) && + (s1.w == s2.w); +} + +void tl_reshape(tensor_lmem *tlp, shape_t shape) +{ + ASSERT(tlp); + tlp->shape = shape; +} diff --git 
a/cvikernel/src/bmkernel_standard.h b/cvikernel/src/bmkernel_standard.h new file mode 100644 index 000000000..230fbb2a7 --- /dev/null +++ b/cvikernel/src/bmkernel_standard.h @@ -0,0 +1,24 @@ +#ifndef BMKERNEL_STANDARD_H +#define BMKERNEL_STANDARD_H +#include +#include "kernel_internal.h" +#include + +typedef struct bmk_context { + bmk_info_t info; + cvk_chip_info_t chip_info; + + ec_t ec; + mode_manager_t mode_manager; + + uint32_t cmdbuf_ptr; + uint32_t max_nr_desc; + uint32_t cur_nr_desc; + desc_pair_t *desc_pairs; + + uint32_t lmem_ptr; + uint16_t layer_id; + void* op; // +#include + +static inline int bitsize_of_fmt(cvk_fmt_t fmt) +{ + switch (fmt) { + case CVK_FMT_F32: + case CVK_FMT_I32: + return 32; + case CVK_FMT_F16: + case CVK_FMT_I16: + case CVK_FMT_U16: + case CVK_FMT_BF16: + return 16; + case CVK_FMT_I8: + case CVK_FMT_U8: + return 8; + default: + return 32; + } +} + +static void cvkcv180x_replace_cmd_id(uint32_t *desc, uint32_t eng_id, uint16_t ids[]) +{ + if (eng_id == CV180X_TIU) { + tiu_reg_t reg; + parse_tiu_reg(®, desc); + reg.cmd_id_en = 1; + reg.cmd_id_tpu = ids[eng_id]; + reg.cmd_id_gdma = ids[CV180X_TDMA]; + emit_tiu_reg(®, desc); + } else if (eng_id == CV180X_TDMA) { + tdma_reg_t tdma_reg; + parse_tdma_reg(&tdma_reg, desc); + tdma_reg.cmd_id = ids[eng_id]; + tdma_reg.wait_id_tpu = ids[CV180X_TIU]; + tdma_reg.bar_en = 1; + emit_tdma_reg(&tdma_reg, desc); + } +} + +static int cvkcv180x_get_engine_desc_length(uint32_t engine_id) +{ + switch (engine_id) { + case CV180X_TIU: + return TIU_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t); + case CV180X_TDMA: + return TDMA_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t); + //case CV180X_CPU: + // return CPU_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t); + default: + //ASSERT(0); + break; + } + + return 0; +} + +// Estimate the number of command descriptor based on buffer size provided +// by the user. +static uint32_t cvkcv180x_estimate_nr_desc(uint32_t cmdbuf_size) +{ + uint32_t tiu_desc_len = cvkcv180x_get_engine_desc_length(CV180X_TIU); + uint32_t tdma_desc_len = cvkcv180x_get_engine_desc_length(CV180X_TDMA); + uint32_t hdr_len = sizeof(cmd_hdr_t); + + uint32_t desc_len = + (tiu_desc_len > tdma_desc_len) ? 
tiu_desc_len : tdma_desc_len; + + return cmdbuf_size / (desc_len + hdr_len); +} + +static cmd_hdr_t *kernel_alloc_cmd_hdr( + cvk_context_t *ctx, uint8_t eng_id, uint32_t desc_len) +{ + cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data; + uint32_t free_len = prv_data->cmdbuf_size - prv_data->cmdbuf_ptr; + uint32_t hdr_len = sizeof(cmd_hdr_t); + uint32_t total_len = hdr_len + desc_len; + + if (total_len > free_len) + return NULL; + + cmd_hdr_t *hdr = (cmd_hdr_t *)&prv_data->cmdbuf[prv_data->cmdbuf_ptr]; + hdr->magic = 0xA8; // CMDBUF_HDR_MAGIC_180X + hdr->len = desc_len; + hdr->engine_id = eng_id; + hdr->__deprecated = 0; // for valgrind + hdr->flags = 0; + hdr->mask = 0; + + prv_data->cmdbuf_ptr += total_len; + return hdr; +} + +static desc_pair_t *kernel_alloc_desc_pair(cvk_context_t *ctx, uint8_t eng_id) +{ + cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data; + + if (eng_id >= CV180X_ENGINE_NUM || prv_data->cur_nr_desc >= prv_data->max_nr_desc) + return NULL; + + uint32_t desc_len = cvkcv180x_get_engine_desc_length(eng_id); + desc_pair_t *dp = &prv_data->desc_pairs[prv_data->cur_nr_desc++]; + dp->cmd_hdr = kernel_alloc_cmd_hdr(ctx, eng_id, desc_len); + dp->ec_desc = ec_alloc_desc(&prv_data->ec, eng_id); + + mode_manager_record_ec_desc(&prv_data->mode_manager, dp->ec_desc); + return dp; +} + +static void cvkcv180x_update_sync_id(cvk_context_t *ctx) +{ + cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data; + ec_compute_sync_ids(&prv_data->ec); + + for (uint32_t di = 0; di < prv_data->cur_nr_desc; di++) { + desc_pair_t *dp = &prv_data->desc_pairs[di]; + uint8_t eng_id = dp->ec_desc->engine_id; + uint32_t *desc = (uint32_t *)dp->cmd_hdr->cmd; + cvkcv180x_replace_cmd_id(desc, eng_id, dp->ec_desc->sync_ids); + } +} + +desc_pair_t *cvkcv180x_get_desc_pair(cvk_context_t *ctx, uint8_t eng_id) +{ +#if 0 + if (eng_id == BMK1822_CPU) { + kernel_update_sync_id(k); + k->cur_nr_desc = 0; + + ec_reset(&k->ec); + mode_manager_restart_sync_id(&k->mode_manager); + } +#endif + + return kernel_alloc_desc_pair(ctx, eng_id); +} + +void cvkcv180x_cleanup(cvk_context_t *ctx) +{ + cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data; + + free(prv_data->desc_pairs); + ec_destroy(&prv_data->ec); + mode_manager_destroy(&prv_data->mode_manager); +} + +void cvkcv180x_reset(cvk_context_t *ctx) +{ + cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data; + + prv_data->cur_nr_desc = 0; + prv_data->cmdbuf_ptr = 0; + + ec_reset(&prv_data->ec); + mode_manager_reset(&prv_data->mode_manager); +} + +static uint8_t *cvkcv180x_acquire_cmdbuf(cvk_context_t *ctx, uint32_t *size) +{ + cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data; + + *size = prv_data->cmdbuf_ptr; + cvkcv180x_update_sync_id(ctx); + return prv_data->cmdbuf; +} + +void cvkcv180x_set_layer_id( + struct cvikernel_context *ctx, + uint16_t layer_id) +{ + cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data; + + prv_data->layer_id = layer_id; +} + +void cvkcv180x_parallel_enable(struct cvikernel_context *ctx) +{ + cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data; + + mode_manager_enable_parallel(&prv_data->mode_manager); +} + +void cvkcv180x_parallel_disable(struct cvikernel_context *ctx) +{ + cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data; + + mode_manager_disable_parallel(&prv_data->mode_manager); +} + +cvk_tl_stride_t cvkcv180x_tl_default_stride( + cvk_context_t *ctx, + cvk_tl_shape_t s, + cvk_fmt_t fmt_type, + int eu_align) +{ + cvk_tl_stride_t stride; + uint32_t eu_num = 
ctx->info.eu_num; + uint32_t npu_num = ctx->info.npu_num; + uint32_t fmt = (fmt_type == CVK_FMT_BF16) ? 2 : 1; + stride.w = fmt; + stride.h = s.w * fmt; + if (eu_align) + stride.c = align_up(s.h * s.w * fmt, eu_num); + else + stride.c = s.h * s.w * fmt; + + stride.n = stride.c * ceiling_func(s.c, npu_num); + + return stride; +} + +void cvkcv180x_lmem_init_tensor( + struct cvikernel_context *ctx, + cvk_tl_t *tl, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + memset(tl, 0, sizeof(*tl)); + tl->fmt = fmt; + tl->shape = shape; + tl->eu_align = eu_align; + tl->stride = cvkcv180x_tl_default_stride(ctx, shape, fmt, eu_align); +} + +uint32_t cvkcv180x_lmem_tensor_to_size( + struct cvikernel_context *ctx, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + uint32_t eu_num = ctx->info.eu_num; + + cvk_tl_stride_t stride; + stride = cvkcv180x_tl_default_stride(ctx, shape, fmt, eu_align); + + uint32_t needed = align_up(shape.n * stride.n, eu_num); + + return needed; +} + +cvk_tl_t *cvkcv180x_lmem_alloc_tensor( + cvk_context_t *ctx, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data; + uint32_t lmem_size = ctx->info.lmem_size; + uint32_t eu_num = ctx->info.eu_num; + + cvk_tl_t *t = malloc(sizeof(*t)); + if (!t) + return NULL; + + memset(t, 0, sizeof(*t)); + t->start_address = prv_data->lmem_ptr; + t->fmt = fmt; + t->cmprs_fmt = fmt; + t->shape = shape; + t->eu_align = eu_align; + t->stride = cvkcv180x_tl_default_stride(ctx, shape, fmt, eu_align); + + uint32_t needed = align_up(t->shape.n * t->stride.n, eu_num); + if ((lmem_size - prv_data->lmem_ptr < needed) || !needed) { + free(t); + return NULL; + } + + prv_data->lmem_ptr += needed; + return t; +} + +void cvkcv180x_lmem_free_tensor( + struct cvikernel_context *ctx, + const cvk_tl_t *tl) +{ + cvk_prv_data_t *prv_data; + + if (!ctx || !tl) + return; + + prv_data = (cvk_prv_data_t *)ctx->priv_data; + + if (tl->start_address >= prv_data->lmem_ptr) + printf("cvkcv180x lm free tensor: ptr out of range\n"); + + prv_data->lmem_ptr = tl->start_address; + + free((void *)tl); +} + +static void try_optimize_matrix_shape(cvk_context_t *ctx, cvk_ml_shape_t *s, + cvk_fmt_t fmt_type) { + uint32_t eu_num = ctx->info.eu_num; + uint32_t npu_num = ctx->info.npu_num; + uint32_t col = s->col; + uint8_t isBf16 = (fmt_type == CVK_FMT_BF16); + uint32_t workingNumber = isBf16 ? eu_num / 2 : eu_num; + + if (col >= workingNumber) { + int num_eu = ceiling_func(col, workingNumber * npu_num); + s->w = workingNumber * num_eu; + s->c = ceiling_func(col, s->w); + } else { + // col < EU_NUM + // Only transfer needed data + // We still change tensor shape in TIU mac op + s->w = col; + s->c = 1; + } +} + +cvk_ml_shape_t cvkcv180x_ml_default_shape( + struct cvikernel_context *ctx, + uint32_t row, + uint32_t col, + cvk_fmt_t fmt_type) +{ + cvk_ml_shape_t shape = {0}; + shape.n = row; + shape.col = col; + + try_optimize_matrix_shape(ctx, &shape, fmt_type); + + return shape; +} + +cvk_ml_stride_t cvkcv180x_ml_default_stride( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + uint32_t npu_num = ctx->info.npu_num; + uint32_t eu_num = ctx->info.eu_num; + uint32_t val = (fmt == CVK_FMT_BF16) ? 
2 : 1; + + cvk_ml_stride_t stride; + stride.h = shape.w * val; + if (eu_align) + stride.c = align_up(shape.w * val, eu_num); + else + stride.c = shape.w * val; + stride.n = stride.c * ceiling_func(shape.c, npu_num); + + return stride; +} + +cvk_ml_shape_t cvkcv180x_ml_shape_t1( + struct cvikernel_context *ctx, + uint32_t len, + cvk_fmt_t fmt_type) +{ + uint32_t lmem_size = ctx->info.lmem_size; + cvk_ml_shape_t shape = {0}; + + uint32_t row = 1; + uint32_t col = len; + + while (col >= lmem_size) { + if (col % 2) + return shape; + + col /= 2; + row *= 2; + } + + shape.n = row; + shape.col = col; + + try_optimize_matrix_shape(ctx, &shape, fmt_type); + return shape; +} + +void cvkcv180x_lmem_init_matrix( + struct cvikernel_context *ctx, + cvk_ml_t *ml, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + memset(ml, 0, sizeof(*ml)); + ml->fmt = fmt; + ml->shape = shape; + ml->stride = cvkcv180x_ml_default_stride(ctx, shape, fmt, eu_align); + ml->eu_align = eu_align; +} + + +uint32_t cvkcv180x_lmem_matrix_to_size( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + uint32_t npu_num = ctx->info.npu_num; + uint32_t eu_num = ctx->info.eu_num; + uint32_t val = (fmt == CVK_FMT_BF16) ? 2 : 1; + + cvk_ml_t t; + t.fmt = fmt; + t.shape = shape; + t.stride.h = shape.w * val; + if (eu_align) + t.stride.c = align_up(shape.w * val, eu_num); + else + t.stride.c = shape.w * val; + t.stride.n = t.stride.c * ceiling_func(shape.c, npu_num); + + uint32_t needed = align_up(t.shape.n * t.stride.n, eu_num); + + return needed; +} + +uint32_t cvkcv180x_lmem_ps32_matrix_to_size( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + /* Partial sums are kept in lmem in 32-bit format, so we multiply n by 4 to + * reserve space for them. + */ + + shape.n = shape.n * (bitsize_of_fmt(CVK_FMT_I32) / bitsize_of_fmt(fmt)); + + return cvkcv180x_lmem_matrix_to_size(ctx, shape, fmt, eu_align); + +} + +cvk_ml_t *cvkcv180x_lmem_alloc_matrix( + cvk_context_t *ctx, + cvk_ml_shape_t s, + cvk_fmt_t fmt, + int eu_align) +{ + cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data; + uint32_t lmem_size = ctx->info.lmem_size; + uint32_t npu_num = ctx->info.npu_num; + uint32_t eu_num = ctx->info.eu_num; + uint32_t val = (fmt == CVK_FMT_BF16) ? 2 : 1; + + cvk_ml_t *t = malloc(sizeof(*t)); + if (!t) + return NULL; + + memset(t, 0, sizeof(*t)); + t->start_address = prv_data->lmem_ptr; + t->fmt = fmt; + t->shape = s; + t->stride.h = s.w * val; + if (eu_align) + t->stride.c = align_up(s.w * val, eu_num); + else + t->stride.c = s.w * val; + t->stride.n = t->stride.c * ceiling_func(s.c, npu_num); + t->eu_align = eu_align; + + uint32_t needed = align_up(t->shape.n * t->stride.n, eu_num); + if (lmem_size - prv_data->lmem_ptr < needed) { + free(t); + return NULL; + } + prv_data->lmem_ptr += needed; + + return t; +} + +void cvkcv180x_lmem_free_matrix( + struct cvikernel_context *ctx, + const cvk_ml_t *ml) +{ + cvk_prv_data_t *prv_data; + + if (!ctx || !ml) + return; + + prv_data = (cvk_prv_data_t *)ctx->priv_data; + + if (ml->start_address >= prv_data->lmem_ptr) + printf("cvkcv180x lm free matrix: ptr out of range\n"); + + prv_data->lmem_ptr = ml->start_address; + free((void *)ml); +} + +cvk_ml_t *cvkcv180x_lmem_alloc_ps32_matrix( + cvk_context_t *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + /* Partial sums are kept in lmem in 32-bit format, so we multiply n by 4 to + * reserve space for them. 
+ */ + + uint32_t prev_n; + + prev_n = shape.n; + shape.n = shape.n * (bitsize_of_fmt(CVK_FMT_I32) / bitsize_of_fmt(fmt)); + cvk_ml_t *res = cvkcv180x_lmem_alloc_matrix(ctx, shape, fmt, eu_align); + + if(res == NULL) { + printf("cvkcv180x: alloc ps32 matrix fail\n"); + return NULL; + } + + res->shape.n = prev_n; + return res; +} + +cvk_tg_stride_t cvkcv180x_tg_default_stride( + struct cvikernel_context *ctx, + cvk_tg_shape_t shape, + cvk_fmt_t fmt) +{ + uint32_t data_type_size = (fmt == CVK_FMT_BF16) ? 2 : 1; + cvk_tg_stride_t stride; + stride.h = shape.w * data_type_size; + stride.c = shape.h * stride.h; + stride.n = shape.c * stride.c; + stride.w = (fmt == CVK_FMT_BF16) ? 2 : 1; + + (void)ctx; + + return stride; +} + +void cvkcv180x_tiu_bf16_lookup_interp_table( + cvk_context_t *ctx, + const cvk_tiu_bf16_lookup_interp_table_param_t *param) +{ + if (param->is_scientific) { + // issue lut cmd + cvk_tdma_l2l_tensor_copy_param_t p10; + // remove low 8 bits by int8 copy with stride + // get index(pow) + memset(&p10, 0x00, sizeof(cvk_tdma_l2l_tensor_copy_param_t)); + p10.dst = param->ofmap; + p10.src = param->ifmap; + p10.mv_lut_base = 0; // MUST init by ifself in soc + p10.mv_lut_idx = 1; + p10.layer_id = param->layer_id; + cvkcv180x_tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = 0; + + // get f(x0) = 2^(x0*-0.5) + cvk_tiu_lookup_table_param_t p12; + p12.ofmap = param->ofmap; + p12.ifmap = param->ofmap; + p12.table = param->tbl_answer; + p12.layer_id = param->layer_id; + cvkcv180x_tiu_lookup_table(ctx, &p12); + + // get mantissa value + p12.ofmap = param->buf; + p12.ifmap = param->ifmap; + p12.table = param->tbl_answer_mantissa; + cvkcv180x_tiu_lookup_table(ctx, &p12); + + // (2^exp) * mantissa + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = param->ofmap; + p1.a = param->ofmap; + p1.b_is_const = 0; + p1.b = param->buf; + p1.rshift_bits = 0; + p1.relu_enable = 0; + p1.layer_id = param->layer_id; + cvkcv180x_tiu_mul(ctx, &p1); + } + else { + // duplicate from cvikernel_1880v2.c + const cvk_tl_t *tl_ifmap = param->ifmap; + const cvk_tl_t *tl_ofmap_slope = param->buf; + const cvk_tl_t *tl_table_answer = param->tbl_answer; + const cvk_tl_t *tl_table_answer_slope = param->tbl_answer_mantissa; + const cvk_tl_t *tl_ofmap_y0 = param->ofmap; + float min = param->min; + float max = param->max; + float scale = 256 / (max - min); // 256 means hw support lut index size + uint8_t eu_align = param->eu_align; + cvk_fmt_t fmt = CVK_FMT_BF16; + + cvk_tl_shape_t tl_ofmap_x0_int8_shape = { + 1, tl_ifmap->shape.c, tl_ifmap->shape.h * tl_ifmap->shape.w, 1}; + + // filter y = max(range_min, x) + cvk_tiu_max_param_t p1 = {0}; + p1.max = tl_ifmap; + p1.a = tl_ifmap; + p1.b_is_const = 1; + p1.b_const.is_signed = 1; + p1.b_const.val = cvk_convert_fp32_bf16(min); + p1.layer_id = param->layer_id; + ctx->ops->tiu_max(ctx, &p1); + + // filter y = min(8, x) + cvk_tiu_min_param_t p2 = {0}; + p2.min = tl_ifmap; + p2.a = tl_ifmap; + p2.b_is_const = 1; + p2.b_const.val = cvk_convert_fp32_bf16(max - 1 / scale); // corner + p2.b_const.is_signed = 1; + p2.layer_id = param->layer_id; + ctx->ops->tiu_min(ctx, &p2); + + cvk_tdma_l2l_tensor_copy_param_t p3 = {0}; + // scale input for remap its idx(-x~x) to (-127~127), dirty tl_ifmap + cvk_tiu_mul_param_t p4 = {0}; + p4.res_high = NULL; + p4.res_low = tl_ifmap; + p4.a = tl_ifmap; + p4.b_is_const = 1; + p4.b_const.val = cvk_convert_fp32_bf16(scale); + p4.rshift_bits = 0; + p4.relu_enable = 0; + p4.layer_id = param->layer_id; + ctx->ops->tiu_mul(ctx, &p4); + + // 
int8 + memset(&p3, 0x00, sizeof(cvk_tdma_l2l_tensor_copy_param_t)); + cvk_tl_t dst; + memcpy(&dst, tl_ofmap_y0, sizeof(cvk_tl_t)); + + dst.shape = tl_ofmap_x0_int8_shape; + dst.fmt = CVK_FMT_I8; + dst.stride = + ctx->ops->tl_default_stride(ctx, tl_ofmap_x0_int8_shape, CVK_FMT_I8, eu_align); + dst.stride.h = dst.stride.h * 2; + dst.int8_rnd_mode = 1; + p3.dst = &dst; + p3.src = tl_ifmap; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p3); + dst.int8_rnd_mode = 0; // reset + + // ops->tdma_l2l_bf16_tensor_copy(ctx, &p3); + + // ops->tiu_sub(ctx, &p5); + + // get f(x0) and slope(x) + // reshape, 16->16 + dst.fmt = fmt; + dst.shape = tl_ofmap_slope->shape; + dst.stride = tl_ofmap_slope->stride; + + // layer_id; + ctx->ops->tiu_lookup_table(ctx, &p6); + + // base f(x0) + memset(&p6, 0x0, sizeof(cvk_tiu_lookup_table_param_t)); + p6.ofmap = tl_ofmap_y0; + p6.ifmap = &dst; + p6.table = tl_table_answer; + p6.layer_id = param->layer_id; + ctx->ops->tiu_lookup_table(ctx, &p6); + + // layer_id; + ctx->ops->tiu_mac(ctx, &p7); + } +} + +void cvkcv180x_gmem_init_tensor( + struct cvikernel_context *ctx, + cvk_tg_t *tg, + cvk_tg_shape_t shape, + cvk_fmt_t fmt) { + memset(tg, 0, sizeof(*tg)); + tg->fmt = fmt; + tg->shape = shape; + tg->stride = cvkcv180x_tg_default_stride(ctx, tg->shape, tg->fmt); +} + +static uint16_t cvkcv180x_float_to_bfloat16( + cvk_context_t *ctx, + float data) +{ + (void)ctx; + + return cvk_convert_fp32_bf16(data); +} + +static void cvkcv180x_bf16_table_shape( + cvk_context_t *ctx, + cvk_tl_shape_t *shape) +{ + if (!ctx || !shape) + return; + + shape->n = 1; + shape->c = ctx->info.npu_num; + shape->h = 32; // hard-coded in cv180x + shape->w = 8; // hard-coded in cv180x +} + +static cvk_operations_t cvk_cv180x_ops = { + .cleanup = cvkcv180x_cleanup, + .reset = cvkcv180x_reset, + .acquire_cmdbuf = cvkcv180x_acquire_cmdbuf, + .set_layer_id = cvkcv180x_set_layer_id, + .parallel_enable = cvkcv180x_parallel_enable, + .parallel_disable = cvkcv180x_parallel_disable, + .lmem_alloc_tensor = cvkcv180x_lmem_alloc_tensor, + .lmem_alloc_matrix = cvkcv180x_lmem_alloc_matrix, + .lmem_alloc_ps32_matrix = cvkcv180x_lmem_alloc_ps32_matrix, + .lmem_free_tensor = cvkcv180x_lmem_free_tensor, + .lmem_free_matrix = cvkcv180x_lmem_free_matrix, + .lmem_init_tensor = cvkcv180x_lmem_init_tensor, + .lmem_init_matrix = cvkcv180x_lmem_init_matrix, + .tl_default_stride = cvkcv180x_tl_default_stride, + .tg_default_stride = cvkcv180x_tg_default_stride, + .ml_default_shape = cvkcv180x_ml_default_shape, + .ml_default_stride = cvkcv180x_ml_default_stride, + .ml_shape_t1 = cvkcv180x_ml_shape_t1, + .lmem_tensor_to_size = cvkcv180x_lmem_tensor_to_size, + .lmem_matrix_to_size = cvkcv180x_lmem_matrix_to_size, + .lmem_ps32_matrix_to_size = cvkcv180x_lmem_ps32_matrix_to_size, + .gmem_init_tensor = cvkcv180x_gmem_init_tensor, + .tdma_l2l_tensor_copy = cvkcv180x_tdma_l2l_bf16_tensor_copy, + .tdma_l2l_bf16_tensor_copy = cvkcv180x_tdma_l2l_bf16_tensor_copy, + .tdma_l2l_tensor_lrn_shift = cvkcv180x_tdma_l2l_tensor_lrn_shift, + .tdma_l2g_tensor_copy = cvkcv180x_tdma_l2g_bf16_tensor_copy, + .tdma_l2g_bf16_tensor_copy = cvkcv180x_tdma_l2g_bf16_tensor_copy, + .tdma_l2g_tensor_copy_nc_transposed = cvkcv180x_tdma_l2g_bf16_tensor_copy_nc_transposed, + .tdma_l2g_bf16_tensor_copy_nc_transposed = cvkcv180x_tdma_l2g_bf16_tensor_copy_nc_transposed, + .tdma_l2g_tensor_copy_compressed = cvkcv180x_tdma_l2g_tensor_copy_compressed, + .tdma_l2g_tensor_fill_constant = cvkcv180x_tdma_l2g_tensor_fill_constant, + .tdma_l2g_tensor_copy_cw_transposed = 
cvkcv180x_tdma_l2g_bf16_tensor_copy_cw_transposed, + .tdma_l2g_bf16_tensor_copy_cw_transposed = cvkcv180x_tdma_l2g_bf16_tensor_copy_cw_transposed, + .tdma_l2g_matrix_copy = cvkcv180x_tdma_l2g_bf16_matrix_copy, + .tdma_l2g_bf16_matrix_copy = cvkcv180x_tdma_l2g_bf16_matrix_copy, + .tdma_l2g_matrix_copy_compressed = cvkcv180x_tdma_l2g_matrix_copy_compressed, + .tdma_l2g_general_copy = cvkcv180x_tdma_l2g_general_copy, + .tdma_l2g_bf16_general_copy = cvkcv180x_tdma_l2g_bf16_general_copy, + .tdma_g2l_tensor_copy = cvkcv180x_tdma_g2l_bf16_tensor_copy, + .tdma_g2l_bf16_tensor_copy = cvkcv180x_tdma_g2l_bf16_tensor_copy, + .tdma_g2l_tensor_copy_nc_transposed = cvkcv180x_tdma_g2l_bf16_tensor_copy_nc_transposed, + .tdma_g2l_bf16_tensor_copy_nc_transposed = cvkcv180x_tdma_g2l_bf16_tensor_copy_nc_transposed, + .tdma_g2l_tensor_copy_chw_rotated = cvkcv180x_tdma_g2l_tensor_copy_chw_rotated, + .tdma_g2l_tensor_copy_decompressed = cvkcv180x_tdma_g2l_tensor_copy_decompressed, + .tdma_g2l_tensor_fill_constant = cvkcv180x_tdma_g2l_bf16_tensor_fill_constant, + .tdma_g2l_bf16_tensor_fill_constant = cvkcv180x_tdma_g2l_bf16_tensor_fill_constant, + .tdma_g2l_matrix_copy_decompressed = cvkcv180x_tdma_g2l_matrix_copy_decompressed, + .tdma_g2l_matrix_copy = cvkcv180x_tdma_g2l_bf16_matrix_copy, + .tdma_g2l_bf16_matrix_copy = cvkcv180x_tdma_g2l_bf16_matrix_copy, + .tdma_g2l_matrix_copy_row_col_transposed = cvkcv180x_tdma_g2l_matrix_copy_row_col_transposed, + .tdma_g2l_general_copy = cvkcv180x_tdma_g2l_general_copy, + .tdma_g2l_bf16_general_copy = cvkcv180x_tdma_g2l_bf16_general_copy, + .tdma_g2g_tensor_copy = cvkcv180x_tdma_g2g_tensor_copy, + .tdma_g2g_general_copy = cvkcv180x_tdma_g2g_general_copy, + .tdma_g2g_bf16_general_copy = cvkcv180x_tdma_g2g_bf16_general_copy, + .tdma_g2g_bf16_tensor_copy = cvkcv180x_tdma_g2g_bf16_tensor_copy, + .tiu_mul = cvkcv180x_tiu_mul, + .tiu_mul_qm = cvkcv180x_tiu_mul_qm, + .tiu_mac = cvkcv180x_tiu_mac, + .tiu_add = cvkcv180x_tiu_add, + .tiu_sub = cvkcv180x_tiu_sub, + .tiu_max = cvkcv180x_tiu_max, + .tiu_min = cvkcv180x_tiu_min, + .tiu_and_int8 = cvkcv180x_tiu_and_int8, + .tiu_arith_shift = cvkcv180x_tiu_arith_shift, + .tiu_and_int16 = cvkcv180x_tiu_and_int16, + .tiu_or_int8 = cvkcv180x_tiu_or_int8, + .tiu_or_int16 = cvkcv180x_tiu_or_int16, + .tiu_xor_int8 = cvkcv180x_tiu_xor_int8, + .tiu_xor_int16 = cvkcv180x_tiu_xor_int16, + .tiu_copy = cvkcv180x_tiu_copy, + .tiu_lookup_table = cvkcv180x_tiu_lookup_table, + .tiu_bf16_lookup_interp_table = cvkcv180x_tiu_bf16_lookup_interp_table, + .tiu_pt_convolution = cvkcv180x_tiu_pt_convolution, + .tiu_convolution = cvkcv180x_tiu_convolution, + .tiu_max_pooling = cvkcv180x_tiu_max_pooling, + .tiu_average_pooling = cvkcv180x_tiu_average_pooling, + .tiu_pt_depthwise_convolution = cvkcv180x_tiu_pt_depthwise_convolution, + .tiu_depthwise_convolution = cvkcv180x_tiu_depthwise_convolution, + .tiu_matrix_multiplication = cvkcv180x_tiu_matrix_multiplication, + .tiu_matrix_multiplication_qm = cvkcv180x_tiu_matrix_multiplication_qm, + .tiu_ge = cvkcv180x_tiu_ge, + .tiu_min_pooling = cvkcv180x_tiu_min_pooling, +}; + +static cvk_misc_operations_t cvk_cv180x_misc_ops = { + .float_to_bfloat16 = cvkcv180x_float_to_bfloat16, + .bf16_table_shape = cvkcv180x_bf16_table_shape, +}; + +char *cvikernel_get_chip_info_cv180x(void) +{ + return CVI_TPU_VERSION_180X; +} + +void cvikernel_init_cv180x( + cvk_reg_info_t *req_info, + cvk_context_t *ctx) +{ + uint32_t max_nr_desc = cvkcv180x_estimate_nr_desc(req_info->cmdbuf_size); + cvk_prv_data_t *prv_data; + desc_pair_t 
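+/*
+ * Editorial usage sketch (not part of the original source): the typical
+ * bring-up sequence for this chip backend. A real application normally goes
+ * through the generic registration entry point declared in cvikernel.h
+ * rather than calling the chip-specific init directly, and the command
+ * buffer size below is an arbitrary example value.
+ *
+ *   uint8_t cmdbuf[0x10000];
+ *   cvk_reg_info_t req = {0};
+ *   req.cmdbuf = cmdbuf;
+ *   req.cmdbuf_size = sizeof(cmdbuf);
+ *
+ *   cvk_context_t ctx = {0};
+ *   cvikernel_init_cv180x(&req, &ctx);   // fills ctx.info, ctx.ops, priv_data
+ *
+ *   cvk_tl_shape_t s = {1, 32, 16, 16};  // n, c, h, w
+ *   cvk_tl_t *tl = ctx.ops->lmem_alloc_tensor(&ctx, s, CVK_FMT_I8, 1);
+ *   // ... emit TIU/TDMA operations through ctx.ops ...
+ *
+ *   uint32_t len = 0;
+ *   uint8_t *buf = ctx.ops->acquire_cmdbuf(&ctx, &len); // sync ids resolved here
+ *   // hand buf/len to the runtime, then release resources:
+ *   ctx.ops->lmem_free_tensor(&ctx, tl);
+ *   ctx.ops->reset(&ctx);
+ *   ctx.ops->cleanup(&ctx);
+ */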
*desc_pairs; + + prv_data = malloc(sizeof(cvk_prv_data_t)); + desc_pairs = malloc(max_nr_desc * sizeof(desc_pair_t)); + if (!req_info || !ctx || !prv_data || !desc_pairs) { + if (prv_data) + free(prv_data); + if (desc_pairs) + free(desc_pairs); + return; + } + + ctx->info.version = CV180X_VER; + ctx->info.node_num = CV180X_HW_NODE_CHIP_NUM; + ctx->info.node_shift = CV180X_HW_NODE_CHIP_SHIFT; + ctx->info.npu_num = CV180X_HW_NPU_NUM; + ctx->info.npu_shift = CV180X_HW_NPU_SHIFT; + ctx->info.eu_num = CV180X_HW_EU_NUM; + ctx->info.eu_shift = CV180X_HW_EU_SHIFT; + ctx->info.lmem_size = CV180X_HW_LMEM_SIZE; + ctx->info.lmem_shift = CV180X_HW_LMEM_SHIFT; + ctx->info.lmem_banks = CV180X_HW_LMEM_BANKS; + ctx->info.lmem_bank_size = CV180X_HW_LMEM_BANK_SIZE; + ctx->info.gmem_start = CV180X_GLOBAL_MEM_START_ADDR; + ctx->info.features = CVK_HWF_FC_OP1_CONST | CVK_HWF_8B_ADD_SUB | + CVK_HWF_MIN_POOL | CVK_HWF_M_BRADCAST | + CVK_HWF_QM_LSHIFT | CVK_HWF_GE | CVK_HWF_CMD_PRE_EXE; + ctx->info.gmem_size = CV180X_GLOBAL_MEM_SIZE; + + ctx->ops = &cvk_cv180x_ops; + ctx->misc_ops = &cvk_cv180x_misc_ops; + + prv_data->cmdbuf_ptr = 0; + prv_data->max_nr_desc = max_nr_desc; + prv_data->cur_nr_desc = 0; + prv_data->desc_pairs = desc_pairs; + prv_data->lmem_ptr = 0; + + if (!prv_data->desc_pairs) { + printf("cvkcv180x init: fail to allocate internal data\n"); + free(prv_data); + return; + } + + ec_init(&prv_data->ec, CV180X_ENGINE_NUM, max_nr_desc); + mode_manager_init(&prv_data->mode_manager, &prv_data->ec, CV180X_ENGINE_NUM); + + prv_data->cmdbuf = req_info->cmdbuf; + prv_data->cmdbuf_size = req_info->cmdbuf_size; + ctx->priv_data = prv_data; +} diff --git a/cvikernel/src/cv180x/cvkcv180x.h b/cvikernel/src/cv180x/cvkcv180x.h new file mode 100644 index 000000000..1cbcec1d4 --- /dev/null +++ b/cvikernel/src/cv180x/cvkcv180x.h @@ -0,0 +1,753 @@ +#ifndef CVKCV180X_H +#define CVKCV180X_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "engine_conductor.h" +#include "engine_state.h" +#include "mode_manager.h" +#include +#include +#include "../../include/cvikernel/cv180x/cv180x_tiu_reg.h" +#include "../../include/cvikernel/cv180x/cv180x_tdma_reg.h" +#include "../../include/cvikernel/cv180x/cv180x_tpu_cfg.h" + +#define CV180X_TIU 0 // Tensor Instruction Unit +#define CV180X_CPU 1 // CPU, Reserved for common cpu op +#define CV180X_TDMA 2 // TPU DMA +#define CV180X_ENGINE_NUM 3 // Number of Engines + +typedef struct __cmd_hdr_s { + uint8_t magic; // 0xA5 + uint8_t len; // lens in bytes + uint8_t engine_id: 4; // TPU, TDMA + uint8_t __deprecated: 4; + uint8_t flags; // CMD_ID, sync flags, etc. TBD + uint32_t mask; // bit mask for which register need to write + uint8_t cmd[0]; +} __attribute__((packed)) cmd_hdr_t; + +typedef struct { + cmd_hdr_t *cmd_hdr; + ec_desc_t *ec_desc; +} desc_pair_t; + +typedef struct cvk_prv_data { + ec_t ec; + mode_manager_t mode_manager; + + uint32_t cmdbuf_ptr; + uint32_t max_nr_desc; + uint32_t cur_nr_desc; + desc_pair_t *desc_pairs; + + uint32_t lmem_ptr; + uint16_t layer_id; + + uint32_t cmdbuf_size; + uint8_t *cmdbuf; +} cvk_prv_data_t; + +desc_pair_t *cvkcv180x_get_desc_pair(cvk_context_t *ctx, uint8_t eng_id); + +#define CHECK(_status, _cond) \ + do { \ + (_status) |= (_cond) ? 
0 : -1; \ + } while (0) + +static inline int ceiling_func(int numerator, int denominator) +{ + return (numerator + denominator - 1) / denominator; +} + +static inline int ceiling_func_shift(int numerator, int shift) +{ + return (numerator + (1 << shift) - 1) >> shift; +} + +static inline uint64_t align_up(uint64_t x, uint64_t n) +{ + return (x + n - 1) / n * n; +} + +static inline int8_t check_same_stride(const cvk_tl_t *a, const cvk_tl_t *b) +{ + int8_t status = 0; + + CHECK(status, a->stride.n == b->stride.n); + CHECK(status, a->stride.c == b->stride.c); + CHECK(status, a->stride.h == b->stride.h); + CHECK(status, a->stride.w == b->stride.w); + + return status; +} + +static inline int8_t check_same_shape(const cvk_tl_t *a, const cvk_tl_t *b) +{ + int8_t status = 0; + + CHECK(status, a->shape.n == b->shape.n); + CHECK(status, a->shape.c == b->shape.c); + CHECK(status, a->shape.h == b->shape.h); + CHECK(status, a->shape.w == b->shape.w); + + return status; +} + +static inline int8_t check_same_shape_3( + const cvk_tl_t *a, + const cvk_tl_t *b, + const cvk_tl_t *c) +{ + int8_t status = 0; + status |= check_same_shape(a, b); + status |= check_same_shape(a, c); + + return status; +} + +static inline int8_t check_same_shape_4( + const cvk_tl_t *a, + const cvk_tl_t *b, + const cvk_tl_t *c, + const cvk_tl_t *d) +{ + int8_t status = 0; + status |= check_same_shape_3(a, b, c); + status |= check_same_shape(a, d); + + return status; +} + +static inline int8_t check_same_shape_5( + const cvk_tl_t *t0, + const cvk_tl_t *t1, + const cvk_tl_t *t2, + const cvk_tl_t *t3, + const cvk_tl_t *t4) +{ + int8_t status = 0; + status |= check_same_shape_3(t0, t1, t2); + status |= check_same_shape_3(t0, t3, t4); + + return status; +} + +static inline int8_t check_same_shape_6( + const cvk_tl_t *t0, + const cvk_tl_t *t1, + const cvk_tl_t *t2, + const cvk_tl_t *t3, + const cvk_tl_t *t4, + const cvk_tl_t *t5) +{ + int8_t status = 0; + status |= check_same_shape_5(t0, t1, t2, t3, t4); + status |=check_same_shape(t0, t5); + + return status; +} + + +static inline int8_t check_tiu_tensor_shape(const cvk_tl_t *t) +{ + int8_t status = 0; + CHECK(status, t->shape.n > 0); + CHECK(status, t->shape.c > 0); + CHECK(status, t->shape.h > 0); + CHECK(status, t->shape.w > 0); + + CHECK(status, t->shape.n < 0x1000); + CHECK(status, t->shape.c < 0x1000); + CHECK(status, t->shape.h <= (4095-32)); // 12bit, max 4095-32(lanes) + CHECK(status, t->shape.w <= (4095-32)); // 12bit, max 4095-32(lanes) + + return status; +} + +static inline int8_t check_tiu_tensor(const cvk_tl_t *t) +{ + int8_t status = 0; + + if (!t) + return -1; + + status |= check_tiu_tensor_shape(t); + CHECK(status, t->fmt == CVK_FMT_I8 || t->fmt == CVK_FMT_U8 || t->fmt == CVK_FMT_BF16); + + return status; +} + +static inline int8_t check_tiu_tensor_2( + const cvk_tl_t *t0, + const cvk_tl_t *t1) +{ + int8_t status = 0; + status |= check_tiu_tensor(t0); + status |= check_tiu_tensor(t1); + + return status; +} + +static inline int8_t check_tiu_tensor_3( + const cvk_tl_t *t0, + const cvk_tl_t *t1, + const cvk_tl_t *t2) +{ + int8_t status = 0; + status |= check_tiu_tensor(t0); + status |= check_tiu_tensor_2(t1, t2); + + return status; +} + +static inline int8_t check_tiu_tensor_4( + const cvk_tl_t *t0, + const cvk_tl_t *t1, + const cvk_tl_t *t2, + const cvk_tl_t *t3) +{ + int8_t status = 0; + status |= check_tiu_tensor_3(t0, t1, t2); + status |= check_tiu_tensor(t3); + + return status; +} + +static inline int8_t check_tiu_tensor_5( + const cvk_tl_t *t0, + const cvk_tl_t *t1, + 
const cvk_tl_t *t2, + const cvk_tl_t *t3, + const cvk_tl_t *t4) +{ + int8_t status = 0; + status |= check_tiu_tensor_3(t0, t1, t2); + status |= check_tiu_tensor_2(t3, t4); + + return status; +} + +static inline int8_t check_tiu_tensor_6( + const cvk_tl_t *t0, + const cvk_tl_t *t1, + const cvk_tl_t *t2, + const cvk_tl_t *t3, + const cvk_tl_t *t4, + const cvk_tl_t *t5) +{ + int8_t status = 0; + status |= check_tiu_tensor_3(t0, t1, t2); + status |= check_tiu_tensor_3(t3, t4, t5); + + return status; +} + +static inline int8_t check_16bit_tiu_tensor(const cvk_tl_t *low, const cvk_tl_t *high) +{ + int8_t status = 0; + + status |= check_tiu_tensor_2(low, high); + status |= check_same_shape(low, high); + status |= check_same_stride(low, high); + CHECK(status, low->fmt == high->fmt); + CHECK(status, low->start_address < high->start_address); + + return status; +} + +static inline int8_t check_stride_type_0(cvk_context_t *ctx, const cvk_tl_t *t) +{ + int8_t status = 0; + uint32_t eu_num = ctx->info.eu_num; + uint32_t fmt = (t->fmt == CVK_FMT_BF16) ? 2 : 1; + + uint32_t h = t->shape.h; + uint32_t w = t->shape.w * fmt; + uint32_t c_stride = align_up(h * w, eu_num); + + CHECK(status, t->stride.c == c_stride); + CHECK(status, t->stride.h == w); + CHECK(status, t->stride.w == fmt); + + return status; +} + +static inline int8_t check_bf16_stride_type_0(cvk_context_t *ctx, const cvk_tl_t *t) +{ + int8_t status = 0; + uint32_t eu_num = ctx->info.eu_num; + uint32_t fmt = (t->fmt == CVK_FMT_BF16) ? 2 : 1; + + CHECK(status, t->stride.c % eu_num == 0); + CHECK(status, t->stride.w == fmt); + + return status; +} + +static inline int8_t check_stride_type_2(cvk_context_t *ctx, const cvk_tl_t *t) +{ + int8_t status = 0; + + CHECK(status, t->shape.h == 1); + CHECK(status, t->shape.w == 1); + + uint32_t fmt = (t->fmt == CVK_FMT_BF16) ? 2 : 1; + uint32_t c = t->shape.c; + uint32_t npu_num = ctx->info.npu_num; + + CHECK(status, t->stride.n == fmt * align_up(c, npu_num) / npu_num); + CHECK(status, t->stride.c == 1 * fmt); + CHECK(status, t->stride.h == 1 * fmt); + CHECK(status, t->stride.w == 1 * fmt); + + return status; +} + +static inline int8_t check_bf16_stride_type_2(cvk_context_t *ctx, const cvk_tl_t *t) +{ + int8_t status = 0; + CHECK(status, t->shape.h == 1); + CHECK(status, t->shape.w == 1); + + uint32_t fmt = (t->fmt == CVK_FMT_BF16) ? 
2 : 1; + uint32_t c = t->shape.c; + uint32_t npu_num = ctx->info.npu_num; + + CHECK(status, t->stride.n == fmt * align_up(c, npu_num) / npu_num); + CHECK(status, t->stride.c == 1 * fmt); + CHECK(status, t->stride.h == 1 * fmt); + CHECK(status, t->stride.w == 1 * fmt); + + return status; +} + +static inline int tensor_is_signed(const cvk_tl_t *t) +{ + switch (t->fmt) { + case CVK_FMT_I8: + return 1; + case CVK_FMT_U8: + case CVK_FMT_BF16: //does not matter, so set to default 0 + return 0; + default: + break; + } + + return 1; +} + +static inline int matrix_is_signed(const cvk_ml_t *t) +{ + switch (t->fmt) { + case CVK_FMT_I8: + return 1; + case CVK_FMT_U8: + case CVK_FMT_BF16: //does not matter, so set to default 0 + return 0; + default: + break; + } + + return 1; +} + +static inline void fill_same_tensor_shape(tiu_reg_t *r, cvk_tl_shape_t s) +{ + uint32_t n = s.n; + uint32_t c = s.c; + uint32_t h = s.h; + uint32_t w = s.w; + + r->opd0_n = n; + r->opd0_c = c; + r->opd0_h = h; + r->opd0_w = w; + + r->opd1_n = n; + r->opd1_c = c; + r->opd1_h = h; + r->opd1_w = w; + + r->opd2_n = n; + r->opd2_c = c; + r->opd2_h = h; + r->opd2_w = w; + + r->res0_n = n; + r->res0_c = c; + r->res0_h = h; + r->res0_w = w; +} + +static inline int8_t check_stride_range(cvk_tl_stride_t s) +{ + int8_t status = 0; + + CHECK(status, s.n < 0x10000); + CHECK(status, s.c < 0x10000); + CHECK(status, s.h < 0x10000); + + return status; +} + +static inline void fill_same_tensor_stride(tiu_reg_t *r, cvk_tl_stride_t s) +{ + uint32_t n = s.n; + uint32_t c = s.c; + uint32_t h = s.h; + uint32_t w = 1; + + r->opd0_n_str = n; + r->opd0_c_str = c; + r->opd0_h_str = h; + r->opd0_w_str = w; + + r->opd1_n_str = n; + r->opd1_c_str = c; + r->opd1_h_str = h; + r->opd1_w_str = w; + + r->opd2_n_str = n; + r->opd2_c_str = c; + r->opd2_h_str = h; + r->opd2_w_str = w; + + r->res0_n_str = n; + r->res0_c_str = c; + r->res0_h_str = h; + r->res0_w_str = w; +} + +#define fill_stride_code(r, op, str) \ + do { \ + r->op##_n_str = str->n; \ + r->op##_c_str = str->c; \ + r->op##_h_str = str->h; \ + r->op##_w_str = str->w; \ + } while (0) + +static inline void fill_opd0_stride(tiu_reg_t *r, const cvk_tl_stride_t *str) +{ + fill_stride_code(r, opd0, str); +} + +static inline void fill_opd1_stride(tiu_reg_t *r, const cvk_tl_stride_t *str) +{ + fill_stride_code(r, opd1, str); +} + +static inline void fill_opd2_stride(tiu_reg_t *r, const cvk_tl_stride_t *str) +{ + fill_stride_code(r, opd2, str); +} + +static inline void fill_res0_stride(tiu_reg_t *r, const cvk_tl_stride_t *str) +{ + fill_stride_code(r, res0, str); +} + +static inline void fill_same_tensor_stride_type(tiu_reg_t *r, int type) +{ + r->short_opd0_str = type & 0b11; + r->short_opd1_str = type & 0b11; + r->short_opd2_str = type & 0b11; + r->short_res0_str = type & 0b11; +} + +static inline ec_desc_t * emit_tiu_cmdbuf(cvk_context_t *ctx, tiu_reg_t *r) +{ + int engine_id = CV180X_TIU; + + desc_pair_t *dp = cvkcv180x_get_desc_pair(ctx, engine_id); + uint32_t *cmdbuf = (uint32_t *)dp->cmd_hdr->cmd; + emit_tiu_reg(r, cmdbuf); + + return dp->ec_desc; +} + +void cvkcv180x_cleanup(struct cvikernel_context *ctx); +void cvkcv180x_reset(struct cvikernel_context *ctx); + +void cvkcv180x_parallel_enable(struct cvikernel_context *ctx); +void cvkcv180x_parallel_disable(struct cvikernel_context *ctx); +void cvkcv180x_set_layer_id( + struct cvikernel_context *ctx, + uint16_t layer_id); +cvk_tl_t *cvkcv180x_lmem_alloc_tensor( + struct cvikernel_context *ctx, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align); 
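+/*
+ * Editorial worked example (not part of the original header): how the
+ * default stride and size helpers lay out an eu-aligned CVK_FMT_I8 tensor.
+ * The hardware constants are hypothetical here (assume eu_num = 16 and
+ * npu_num = 32; the real values come from cv180x_tpu_cfg.h via ctx->info).
+ *
+ *   cvk_tl_shape_t s = {1, 64, 28, 28};                // n, c, h, w
+ *   // stride.w = 1                                       one byte per element
+ *   // stride.h = 28 * 1                     = 28
+ *   // stride.c = align_up(28 * 28, 16)      = 784        per-lane slice
+ *   // stride.n = 784 * ceiling_func(64, 32) = 1568
+ *   // lmem_tensor_to_size() = align_up(1 * 1568, 16) = 1568 bytes per lane
+ *   cvk_tl_t *tl = cvkcv180x_lmem_alloc_tensor(ctx, s, CVK_FMT_I8, 1);
+ */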
+cvk_ml_t *cvkcv180x_lmem_alloc_matrix( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align); +cvk_ml_t *cvkcv180x_lmem_alloc_ps32_matrix( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align); +void cvkcv180x_lmem_free_tensor( + struct cvikernel_context *ctx, + const cvk_tl_t *tl); +void cvkcv180x_lmem_free_matrix( + struct cvikernel_context *ctx, + const cvk_ml_t *ml); +void cvkcv180x_lmem_init_tensor( + struct cvikernel_context *ctx, + cvk_tl_t *tl, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align); +void cvkcv180x_lmem_init_matrix( + struct cvikernel_context *ctx, + cvk_ml_t *ml, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align); +cvk_tl_stride_t cvkcv180x_tl_default_stride( + struct cvikernel_context *ctx, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align); +cvk_tg_stride_t cvkcv180x_tg_default_stride( + struct cvikernel_context *ctx, + cvk_tg_shape_t shape, + cvk_fmt_t fmt); +cvk_ml_shape_t cvkcv180x_ml_default_shape( + struct cvikernel_context *ctx, + uint32_t row, + uint32_t col, + cvk_fmt_t fmt); +cvk_ml_stride_t cvkcv180x_ml_default_stride( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align); +cvk_ml_shape_t cvkcv180x_ml_shape_t1( + struct cvikernel_context *ctx, + uint32_t len, + cvk_fmt_t fmt); +uint32_t cvkcv180x_lmem_tensor_to_size( + struct cvikernel_context *ctx, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align); +uint32_t cvkcv180x_lmem_matrix_to_size( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align); +uint32_t cvkcv180x_lmem_ps32_matrix_to_size( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align); +void cvkcv180x_gmem_init_tensor( + struct cvikernel_context *ctx, + cvk_tg_t *tg, + cvk_tg_shape_t shape, + cvk_fmt_t fmt); + +/* Local to Local DMA API */ +void cvkcv180x_tdma_l2l_tensor_copy( + struct cvikernel_context *ctx, + const cvk_tdma_l2l_tensor_copy_param_t *param); +void cvkcv180x_tdma_l2l_bf16_tensor_copy( + struct cvikernel_context *ctx, + const cvk_tdma_l2l_tensor_copy_param_t *param); +void cvkcv180x_tdma_l2l_tensor_lrn_shift( + struct cvikernel_context *ctx, + const cvk_tdma_l2l_tensor_lrn_shift_param_t *param); + +/* Local to Global DMA API */ +void cvkcv180x_tdma_l2g_tensor_copy( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_copy_param_t *param); +void cvkcv180x_tdma_l2g_bf16_tensor_copy( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_copy_param_t *param); +void cvkcv180x_tdma_l2g_tensor_copy_nc_transposed( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_copy_nc_transposed_param_t *param); +void cvkcv180x_tdma_l2g_bf16_tensor_copy_nc_transposed( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_copy_nc_transposed_param_t *param); +void cvkcv180x_tdma_l2g_tensor_copy_compressed( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_copy_compressed_param_t *param); +void cvkcv180x_tdma_l2g_tensor_fill_constant( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_fill_constant_param_t *param); +void cvkcv180x_tdma_l2g_tensor_copy_cw_transposed( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_copy_cw_transposed_param_t *param); +void cvkcv180x_tdma_l2g_bf16_tensor_copy_cw_transposed( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_copy_cw_transposed_param_t *param); +void cvkcv180x_tdma_l2g_matrix_copy( + struct cvikernel_context *ctx, + const 
cvk_tdma_l2g_matrix_copy_param_t *param); +void cvkcv180x_tdma_l2g_bf16_matrix_copy( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_matrix_copy_param_t *param); +void cvkcv180x_tdma_l2g_general_copy( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_general_copy_param_t *param); +void cvkcv180x_tdma_l2g_bf16_general_copy( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_bf16_general_copy_param_t *param); + +/* Global to Local DMA API */ +void cvkcv180x_tdma_g2l_tensor_copy( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_copy_param_t *param); +void cvkcv180x_tdma_g2l_bf16_tensor_copy( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_copy_param_t *param); +void cvkcv180x_tdma_g2l_tensor_copy_nc_transposed( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_copy_nc_transposed_param_t *param); +void cvkcv180x_tdma_g2l_bf16_tensor_copy_nc_transposed( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_copy_nc_transposed_param_t *param); +void cvkcv180x_tdma_g2l_tensor_copy_chw_rotated( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_copy_chw_rotated_param_t *param); +void cvkcv180x_tdma_g2l_tensor_copy_decompressed( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_copy_decompressed_param_t *param); +void cvkcv180x_tdma_g2l_tensor_fill_constant( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_fill_constant_param_t *param); +void cvkcv180x_tdma_g2l_bf16_tensor_fill_constant( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_fill_constant_param_t *param); +void cvkcv180x_tdma_g2l_matrix_copy_decompressed( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_matrix_copy_decompressed_param_t *param); +void cvkcv180x_tdma_l2g_matrix_copy_compressed( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_matrix_copy_compressed_param_t *param); +void cvkcv180x_tdma_g2l_matrix_copy( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_matrix_copy_param_t *param); +void cvkcv180x_tdma_g2l_bf16_matrix_copy( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_matrix_copy_param_t *param); +void cvkcv180x_tdma_g2l_matrix_copy_row_col_transposed( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_matrix_copy_row_col_transposed_param_t *param); +void cvkcv180x_tdma_g2l_general_copy( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_general_copy_param_t *param); +void cvkcv180x_tdma_g2l_bf16_general_copy( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_bf16_general_copy_param_t *param); + +/* Global to Global DMA API */ +void cvkcv180x_tdma_g2g_tensor_copy( + struct cvikernel_context *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *param); +void cvkcv180x_tdma_g2g_general_copy( + struct cvikernel_context *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *param); +void cvkcv180x_tdma_g2g_bf16_general_copy( + struct cvikernel_context *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *param); +void cvkcv180x_tdma_g2g_bf16_tensor_copy( + struct cvikernel_context *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *param); + +/* TIU API */ +void cvkcv180x_tiu_mul( + struct cvikernel_context *ctx, + const cvk_tiu_mul_param_t *param); +void cvkcv180x_tiu_mul_qm( + struct cvikernel_context *ctx, + const cvk_tiu_mul_qm_param_t *param); +void cvkcv180x_tiu_mac( + struct cvikernel_context *ctx, + const cvk_tiu_mac_param_t *param); +void cvkcv180x_tiu_add( + struct cvikernel_context *ctx, + const cvk_tiu_add_param_t *param); +void cvkcv180x_tiu_sub( + struct cvikernel_context *ctx, + const cvk_tiu_sub_param_t 
*param); +void cvkcv180x_tiu_max( + struct cvikernel_context *ctx, + const cvk_tiu_max_param_t *param); +void cvkcv180x_tiu_min( + struct cvikernel_context *ctx, + const cvk_tiu_min_param_t *param); +void cvkcv180x_tiu_and_int8( + struct cvikernel_context *ctx, + const cvk_tiu_and_int8_param_t *param); +void cvkcv180x_tiu_arith_shift( + struct cvikernel_context *ctx, + const cvk_tiu_arith_shift_param_t *param); +void cvkcv180x_tiu_and_int16( + struct cvikernel_context *ctx, + const cvk_tiu_and_int16_param_t *param); +void cvkcv180x_tiu_or_int8( + struct cvikernel_context *ctx, + const cvk_tiu_or_int8_param_t *param); +void cvkcv180x_tiu_or_int16( + struct cvikernel_context *ctx, + const cvk_tiu_or_int16_param_t *param); +void cvkcv180x_tiu_xor_int8( + struct cvikernel_context *ctx, + const cvk_tiu_xor_int8_param_t *param); +void cvkcv180x_tiu_xor_int16( + struct cvikernel_context *ctx, + const cvk_tiu_xor_int16_param_t *param); +void cvkcv180x_tiu_copy( + struct cvikernel_context *ctx, + const cvk_tiu_copy_param_t *param); +void cvkcv180x_tiu_lookup_table( + struct cvikernel_context *ctx, + const cvk_tiu_lookup_table_param_t *param); +void cvkcv180x_tiu_bf16_lookup_interp_table( + struct cvikernel_context *ctx, + const cvk_tiu_bf16_lookup_interp_table_param_t *param); +void cvkcv180x_tiu_pt_convolution( + struct cvikernel_context *ctx, + const cvk_tiu_pt_convolution_param_t *param); +void cvkcv180x_tiu_convolution( + struct cvikernel_context *ctx, + const cvk_tiu_convolution_param_t *param); +void cvkcv180x_tiu_max_pooling( + struct cvikernel_context *ctx, + const cvk_tiu_max_pooling_param_t *param); +void cvkcv180x_tiu_average_pooling( + struct cvikernel_context *ctx, + const cvk_tiu_average_pooling_param_t *param); +void cvkcv180x_tiu_pt_depthwise_convolution( + struct cvikernel_context *ctx, + const cvk_tiu_depthwise_pt_convolution_param_t *param); +void cvkcv180x_tiu_depthwise_convolution( + struct cvikernel_context *ctx, + const cvk_tiu_depthwise_convolution_param_t *param); +void cvkcv180x_tiu_matrix_multiplication( + struct cvikernel_context *ctx, + const cvk_tiu_matrix_multiplication_param_t *param); +void cvkcv180x_tiu_matrix_multiplication_qm( + struct cvikernel_context *ctx, + const cvk_tiu_matrix_multiplication_qm_param_t *param); +void cvkcv180x_tiu_ge( + cvk_context_t *ctx, + const cvk_tiu_ge_param_t *p); +void cvkcv180x_tiu_min_pooling( + cvk_context_t *ctx, + const cvk_tiu_min_pooling_param_t *p); + +#ifdef __cplusplus +} +#endif + +#endif /* CVKCV180X_H */ diff --git a/cvikernel/src/cv180x/tdma.c b/cvikernel/src/cv180x/tdma.c new file mode 100644 index 000000000..4596d07fd --- /dev/null +++ b/cvikernel/src/cv180x/tdma.c @@ -0,0 +1,2267 @@ +#include "cvkcv180x.h" + +//n < 0x10000); + CHECK(status, s->c < 0x10000); + CHECK(status, s->h < 0x10000); + CHECK(status, s->w < 0x10000); + + CHECK(status, s->n > 0x0); + CHECK(status, s->c > 0x0); + CHECK(status, s->h > 0x0); + CHECK(status, s->w > 0x0); + + return status; +} + +static int8_t check_tdma_tl_bf16_shape(const cvk_tl_shape_t *s, cvk_fmt_t fmt) +{ + int8_t status = 0; + uint8_t fmt_type = (fmt == CVK_FMT_BF16 ? 
2 : 1); + + CHECK(status, s->n < 0x10000); + CHECK(status, s->c < 0x10000); + CHECK(status, s->h < 0x10000); + CHECK(status, s->w < 0x10000 / fmt_type); + + CHECK(status, s->n > 0x0); + CHECK(status, s->c > 0x0); + CHECK(status, s->h > 0x0); + CHECK(status, s->w > 0x0); + + return status; +} + +static int8_t check_tdma_tg_shape(const cvk_tg_shape_t *s) +{ + int8_t status = 0; + + CHECK(status, s->n < 0x10000); + CHECK(status, s->c < 0x10000); + CHECK(status, s->h < 0x10000); + CHECK(status, s->w < 0x10000); + + CHECK(status, s->n > 0x0); + CHECK(status, s->c > 0x0); + CHECK(status, s->h > 0x0); + CHECK(status, s->w > 0x0); + + return status; +} + +static int8_t check_tdma_tg_bf16_shape(const cvk_tg_shape_t *s, cvk_fmt_t fmt) +{ + int8_t status = 0; + uint8_t fmt_type = (fmt == CVK_FMT_BF16 ? 2 : 1); + + CHECK(status, s->n < 0x10000); + CHECK(status, s->c < 0x10000); + CHECK(status, s->h < 0x10000); + CHECK(status, s->w < 0x10000 / fmt_type); + + CHECK(status, s->n > 0x0); + CHECK(status, s->c > 0x0); + CHECK(status, s->h > 0x0); + CHECK(status, s->w > 0x0); + + return status; +} + + +static int8_t check_tdma_ml_shape(const cvk_ml_shape_t *s) +{ + int8_t status = 0; + + CHECK(status, s->n < 0x10000); + CHECK(status, s->c < 0x10000); + CHECK(status, s->w < 0x10000); + CHECK(status, s->col < 0x10000); + + CHECK(status, s->n > 0); + CHECK(status, s->c > 0); + CHECK(status, s->w > 0); + CHECK(status, s->col > 0); + + return status; +} + +static int8_t check_tdma_ml_bf16_shape(const cvk_ml_shape_t *s, cvk_fmt_t fmt) +{ + int8_t status = 0; + uint8_t fmt_type = (fmt == CVK_FMT_BF16 ? 2 : 1); + + CHECK(status, s->n < 0x10000); + CHECK(status, s->c < 0x10000); + CHECK(status, s->w < 0x10000 / fmt_type); + CHECK(status, s->col < 0x10000); + + CHECK(status, s->n > 0); + CHECK(status, s->c > 0); + CHECK(status, s->w > 0); + CHECK(status, s->col > 0); + + return status; +} + +static int8_t check_tdma_mg_shape(const cvk_mg_shape_t *s) +{ + int8_t status = 0; + + CHECK(status, s->row < 0x10000); + CHECK(status, s->col < 0x10000); + + CHECK(status, s->row > 0x0); + CHECK(status, s->col > 0x0); + + return status; +} + +static int8_t check_tdma_mg_bf16_shape(const cvk_mg_shape_t *s, cvk_fmt_t fmt) +{ + int8_t status = 0; + uint8_t fmt_type = (fmt == CVK_FMT_BF16 ? 
2 : 1); + + CHECK(status, s->row < 0x10000); + CHECK(status, s->col < 0x10000 / fmt_type); + + CHECK(status, s->row > 0x0); + CHECK(status, s->col > 0x0); + + return status; +} + +static int8_t check_tdma_tl(const cvk_tl_t *t) +{ + int8_t status = 0; + + CHECK(status, t); + CHECK(status, t->fmt == CVK_FMT_I8 || t->fmt == CVK_FMT_U8 || t->fmt == CVK_FMT_BF16); + status |= check_tdma_tl_shape(&t->shape); + + return status; +} + +static int8_t check_tdma_tl_bf16(const cvk_tl_t *t) +{ + int8_t status = 0; + + CHECK(status, t); + CHECK(status, t->fmt == CVK_FMT_I8 || t->fmt == CVK_FMT_U8 || t->fmt == CVK_FMT_BF16); + status |= check_tdma_tl_bf16_shape(&t->shape, t->fmt); + + return status; +} + +static int8_t check_tdma_tg(const cvk_tg_t *t) +{ + int8_t status = 0; + + CHECK(status, t); + CHECK(status, t->base_reg_index < TDMA_NUM_BASE_REGS); + CHECK(status, t->fmt == CVK_FMT_I8 || t->fmt == CVK_FMT_U8 || t->fmt == CVK_FMT_BF16); + status |= check_tdma_tg_shape(&t->shape); + + return status; +} + +static int8_t check_tdma_tg_bf16(const cvk_tg_t *t) +{ + int8_t status = 0; + + CHECK(status, t); + CHECK(status, t->base_reg_index < TDMA_NUM_BASE_REGS); + CHECK(status, t->fmt == CVK_FMT_I8 || t->fmt == CVK_FMT_U8 || t->fmt == CVK_FMT_BF16); + status |= check_tdma_tg_bf16_shape(&t->shape, t->fmt); + + return status; +} + +static int8_t check_tdma_compressed_tg(const cvk_cmpr_tg_t *t) +{ + int8_t status = 0; + uint32_t stride_w = t->t.fmt == CVK_FMT_BF16 ? 2 : 1; + + CHECK(status, t); + CHECK(status, t->t.base_reg_index < TDMA_NUM_BASE_REGS); + status |= check_tdma_tg_shape(&t->t.shape); + CHECK(status, !(t->t.start_address%0x10)); + + // Enable after backend fix + //CHECK(status, t->t.stride.n == + // (t->t.shape.w * t->t.shape.h * t->t.shape.c * stride_w)); + + CHECK(status, t->t.stride.c == (t->t.shape.w * t->t.shape.h * stride_w)); + CHECK(status, t->t.stride.h == (t->t.shape.w * stride_w)); + // m.base_reg_index < TDMA_NUM_BASE_REGS); + CHECK(status, !(t->m.start_address%0x10)); + + // the data should be continuous + if (t->m.fmt == CVK_FMT_BF16) { + CHECK(status, t->m.stride.row == t->m.shape.col * 2); + } + else if (t->m.fmt == CVK_FMT_I8 || t->m.fmt == CVK_FMT_U8) { + CHECK(status, t->m.stride.row == t->m.shape.col); + } + else { + CHECK(status, 0); //fmt == CVK_FMT_I8 || m->fmt == CVK_FMT_U8 || m->fmt == CVK_FMT_BF16); + status |= check_tdma_ml_shape(&m->shape); + + return status; +} + +static int8_t check_tdma_ml_bf16(const cvk_ml_t *m) +{ + int8_t status = 0; + + CHECK(status, m); + CHECK(status, m->fmt == CVK_FMT_I8 || m->fmt == CVK_FMT_U8 || m->fmt == CVK_FMT_BF16); + status |= check_tdma_ml_bf16_shape(&m->shape, m->fmt); + + return status; +} + +static int8_t check_tdma_mg(const cvk_mg_t *m) +{ + int8_t status = 0; + + CHECK(status, m); + CHECK(status, m->base_reg_index < TDMA_NUM_BASE_REGS); + status |= check_tdma_mg_shape(&m->shape); + + return status; +} + +static int8_t check_tdma_mg_bf16(const cvk_mg_t *m) +{ + int8_t status = 0; + + CHECK(status, m); + CHECK(status, m->base_reg_index < TDMA_NUM_BASE_REGS); + status |= check_tdma_mg_bf16_shape(&m->shape, m->fmt); + + return status; +} + +static int8_t check_tdma_compress_mg(const cvk_cmpr_mg_t *m) +{ + int8_t status = 0; + + CHECK(status, m); + CHECK(status, m->m.base_reg_index < TDMA_NUM_BASE_REGS); + status |= check_tdma_mg_shape(&m->m.shape); + + return status; +} + +static int8_t check_tl_same_size(const cvk_tl_t *a, const cvk_tl_t *b) +{ + int8_t status = 0; + uint32_t a_size = a->shape.n * a->shape.c * a->shape.h * 
a->shape.w; + uint32_t b_size = b->shape.n * b->shape.c * b->shape.h * b->shape.w; + + CHECK(status, a_size == b_size); + + return status; +} + +static int8_t check_tl_tg_same_size(const cvk_tl_t *tl, const cvk_tg_t *tg) +{ + int8_t status = 0; + uint32_t tl_size = tl->shape.n * tl->shape.c * tl->shape.h * tl->shape.w; + uint32_t tg_size = tg->shape.n * tg->shape.c * tg->shape.h * tg->shape.w; + + CHECK(status, tl_size == tg_size); + + return status; +} + +static int8_t check_ml_mg_same_size(const cvk_ml_t *ml, const cvk_mg_t *mg) +{ + int8_t status = 0; + uint32_t ml_size = ml->shape.n * ml->shape.col; + uint32_t mg_size = mg->shape.row * mg->shape.col; + + CHECK(status, ml_size == mg_size); + + return status; +} + +#if 0 +static uint64_t absolute_gmem_addr(uint64_t addr) +{ + return (addr & 0x0FFFFFFFFFF) + BM1822_GLOBAL_MEM_START_ADDR; +} +#else +//global memory start = 0x0 from 1822 kernel view, we can use it directlly +//cmdbuf descriptor content dram address does not need offset either +#define absolute_gmem_addr(addr) (addr & 0x0FFFFFFFFFF) +#endif + +static ec_desc_t * emit_tdma_cmdbuf(cvk_context_t *ctx, tdma_reg_t *reg) +{ + cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data; + desc_pair_t *dp = cvkcv180x_get_desc_pair(ctx, CV180X_TDMA); + + reg->layer_ID = prv_data->layer_id; + //CHECK(status, reg->rsv5 != 0x0);// "this is debug use, it's fine for skip"; + + uint32_t *cmdbuf = (uint32_t *)dp->cmd_hdr->cmd; + emit_tdma_reg(reg, cmdbuf); + + return dp->ec_desc; +} + +static void fill_l2g_fmt(tdma_reg_t *reg, cvk_fmt_t src_fmt, cvk_fmt_t dst_fmt) +{ + reg->dst_fmt = (dst_fmt == CVK_FMT_BF16) ? 2 : 1; + reg->src_fmt = (src_fmt == CVK_FMT_BF16) ? 2 : 1; + // check and decide bf16->int8 or bf16->uint8_t + reg->int8_sign = (dst_fmt == CVK_FMT_I8 ? 1 : 0);// | (dst_fmt == CVK_FMT_U8 ? 1 : 0); +} + +static void fill_g2l_fmt(tdma_reg_t *reg, cvk_fmt_t src_fmt, cvk_fmt_t dst_fmt) +{ + reg->dst_fmt = (dst_fmt == CVK_FMT_BF16) ? 2 : 1; + reg->src_fmt = (src_fmt == CVK_FMT_BF16) ? 2 : 1; + // check and decide int8->bf16 or uint8_t->bf16 + reg->int8_sign = (src_fmt == CVK_FMT_I8 ? 1 : 0) ;//| (src_fmt == CVK_FMT_U8 ? 1 : 0); +} + +static void fill_l2l_fmt(tdma_reg_t *reg, cvk_fmt_t src_fmt, cvk_fmt_t dst_fmt) +{ + reg->dst_fmt = (dst_fmt == CVK_FMT_BF16) ? 2 : 1; + reg->src_fmt = (src_fmt == CVK_FMT_BF16) ? 2 : 1; + // check and decide bf16->int8 or bf16->uint8_t or int8->bf16 or uint8_t->bf16 + reg->int8_sign = (dst_fmt == CVK_FMT_I8 ? 1 : 0) | (src_fmt == CVK_FMT_I8 ? 
1 : 0); +} + +static void fill_src_addr(tdma_reg_t *r, uint64_t addr) +{ + r->src_base_addr_low = (uint32_t)addr; + r->src_base_addr_high = (addr >> 32); +} + +static void fill_dst_addr(tdma_reg_t *r, uint64_t addr) +{ + r->dst_base_addr_low = (uint32_t)addr; + r->dst_base_addr_high = (addr >> 32); +} + +static void fill_src_c_stride(tdma_reg_t *r, uint32_t str) +{ + r->src_c_stride_low = (uint16_t)str; + r->src_c_stride_high = (str >> 16); +} + +static void fill_dst_c_stride(tdma_reg_t *r, uint32_t str) +{ + r->dst_c_stride_low = (uint16_t)str; + r->dst_c_stride_high = (str >> 16); +} + +static void set_int8_rnd_mode(tdma_reg_t *r, uint32_t int8_rnd_mode) +{ + if (int8_rnd_mode == 1) { + // int8 + if (r->src_fmt == FMT_BF16_TYP && r->dst_fmt == FMT_FIX8B_TYP) { + r->int8_rnd_mode = int8_rnd_mode; + } + } +} + + +/* + * Direction: L2L + */ +void cvkcv180x_tdma_l2l_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_l2l_tensor_copy_param_t *p) +{ + int8_t status = 0; + + status |= check_tdma_tl(p->src); + status |= check_tdma_tl(p->dst); + CHECK(status, p->src->shape.n == p->dst->shape.n); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 3; + reg.spec_func = 0; + reg.sys_dtype = 0; + + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + reg.outstanding_en = p->outstanding; + + if (status) { + printf("cvkcv180x l2l: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +void cvkcv180x_tdma_l2l_bf16_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_l2l_tensor_copy_param_t *p) +{ + int8_t status = 0; + + status |= check_tdma_tl_bf16(p->src); + status |= check_tdma_tl_bf16(p->dst); + CHECK(status, p->src->shape.n == p->dst->shape.n); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 3; + reg.spec_func = 0; + reg.sys_dtype = 0; + fill_l2l_fmt(®, p->src->fmt, p->dst->fmt); + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + // does not allow open `mv_lut_idx and `mv_lut_basemv_lut_base at same time + if (p->mv_lut_idx == 1) { + reg.mv_lut_idx = p->mv_lut_idx; + } + + if (p->mv_lut_base == 1) { + reg.mv_lut_base = p->mv_lut_base; + } + + if (reg.mv_lut_idx == 1 && reg.mv_lut_base == 1) { + CHECK(status, 0); + } + + set_int8_rnd_mode(®, p->dst->int8_rnd_mode); + + reg.outstanding_en = p->outstanding; + + if (status) { + printf("cvkcv180x l2l bf16: wrong parameter\n"); + return; + } + + //trace_tdma_reg(®, __func__); + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static uint32_t addr_after_right_shift( + cvk_context_t *ctx, int addr, uint32_t step, 
int c_str) +{ + uint32_t npu_num = ctx->info.npu_num; + uint32_t lmem_size = ctx->info.lmem_size;; + + uint32_t lmem_i = (addr / lmem_size + step) % npu_num; + uint32_t offset = addr % lmem_size + (addr / lmem_size + step) / npu_num * c_str; + return lmem_i * lmem_size + offset; +} + +void cvkcv180x_tdma_l2l_tensor_lrn_shift( + cvk_context_t *ctx, + const cvk_tdma_l2l_tensor_lrn_shift_param_t *p) +{ + int8_t status = 0; + status |= check_tdma_tl(p->src); + status |= check_tdma_tl(p->dst); + status |= check_tl_same_size(p->src, p->dst); + CHECK(status, p->src->shape.n == p->dst->shape.n); + CHECK(status, p->src->shape.c == p->dst->shape.c); + CHECK(status, p->src->shape.c > p->lrn_step); + CHECK(status, p->src->shape.h * p->src->shape.w == + p->dst->shape.h * p->dst->shape.w); + CHECK(status, p->lrn_step < 16); + + CHECK(status, p->src->fmt == p->dst->fmt); + + int is_bf16 = (p->src->fmt == CVK_FMT_BF16) ? 1 : 0; + if (is_bf16) { + check_tdma_tl_bf16(p->src); + check_tdma_tl_bf16(p->dst); + } + + /* L2L lrn copy */ + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 3; + reg.spec_func = 0; + reg.sys_dtype = 0; + fill_l2l_fmt(®, p->src->fmt, p->dst->fmt); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c - p->lrn_step; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c - p->lrn_step; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (p->right_shift) { + uint32_t dst_addr = addr_after_right_shift( + ctx, p->dst->start_address, p->lrn_step, p->dst->stride.c); + + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + } else { + uint32_t src_addr = addr_after_right_shift( + ctx, p->src->start_address, p->lrn_step, p->src->stride.c); + + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + } + + if (is_bf16) + set_int8_rnd_mode(®, p->dst->int8_rnd_mode); + + emit_tdma_cmdbuf(ctx, ®); + + /* Constant fill with zero */ + reg.trans_dir = 0; + reg.spec_func = 4; + reg.const_val = is_bf16 ? 
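+/*
+ * Editorial note (not part of the original source): a local-memory address
+ * encodes the lane as addr / lmem_size and the offset inside that lane as
+ * addr % lmem_size. addr_after_right_shift() above moves an address `step`
+ * channels forward, wrapping across lanes and adding one c-stride per wrap.
+ * Worked example with hypothetical lmem_size = 0x8000, npu_num = 8:
+ *
+ *   addr = 6 * 0x8000 + 0x100;           // lane 6, offset 0x100
+ *   step = 3, c_str = 0x40;
+ *   lane = (6 + 3) % 8 = 1;              // wraps past the last lane
+ *   off  = 0x100 + (9 / 8) * 0x40 = 0x140;
+ *   result = 1 * 0x8000 + 0x140;
+ *
+ * The constant-fill pass below uses the same lane/offset arithmetic to
+ * locate the p->lrn_step channels left vacant by the shift.
+ */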
cvk_convert_fp32_bf16(0.0): 0; + + reg.dst_c = p->lrn_step; + if (p->right_shift) { + uint32_t dst_addr = addr_after_right_shift( + ctx, p->dst->start_address, p->lrn_step, p->dst->stride.c); + + uint32_t lmem_size = ctx->info.lmem_size;; + uint32_t npu_num = ctx->info.npu_num; + uint32_t sht_num = p->lrn_step; + + uint32_t lmem_i = (dst_addr / lmem_size - sht_num) % npu_num; + uint32_t offset = (lmem_i + sht_num) / npu_num * p->dst->stride.c; + uint32_t zero_addr = lmem_i * lmem_size + dst_addr % lmem_size - offset; + + // printf(" lmem_i 0x%x, offset 0x%x, zero_addr 0x%x\n", + // lmem_i, offset, zero_addr); + + fill_dst_addr(®, zero_addr); + + } else { + uint32_t start_mem = p->dst->start_address / ctx->info.lmem_size; + uint32_t cur_mem = (start_mem + (p->dst->shape.c - p->lrn_step)) % ctx->info.npu_num; + uint32_t offset = + (p->dst->start_address % ctx->info.lmem_size) + + ((start_mem + (p->dst->shape.c - p->lrn_step)) / ctx->info.npu_num) * p->dst->stride.c; + uint32_t zero_addr = cur_mem * ctx->info.lmem_size + offset; + + // printf(" start_mem 0x%x, cur_mem 0x%x, offset 0x%x, zero_addr 0x%x\n", + // start_mem, cur_mem, offset, zero_addr); + + fill_dst_addr(®, zero_addr); + } + + if (status) { + printf("cvkcv180x tdma l2l lrn shift: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} +/* + * Direction: L2G + */ + +static void tdma_l2g_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + status |= check_tdma_tl(p->src); + status |= check_tdma_tg(p->dst); + status |= check_tl_tg_same_size(p->src, p->dst); + CHECK(status, p->src->shape.n == p->dst->shape.n); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + reg.intra_cmd_paral = p->intra_cmd_paral; + + if (status) { + printf("cvkcv180x l2g: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + + +static void tdma_l2g_bf16_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + status |= check_tdma_tl_bf16(p->src); + status |= check_tdma_tg_bf16(p->dst); + status |= check_tl_tg_same_size(p->src, p->dst); + CHECK(status, p->src->shape.n == p->dst->shape.n); + CHECK(status, !(p->src->fmt == CVK_FMT_I8 && p->dst->fmt == CVK_FMT_BF16)); // not support tl(int8)->tg(bf16) + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + + fill_l2g_fmt(®, p->src->fmt, p->dst->fmt); + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + 
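+/*
+ * Editorial caller-side sketch (not part of the original source): minimal
+ * parameter setup for a local-to-global BF16 tensor copy through this path.
+ * The global shape and start address are placeholder values; `tl` is a
+ * cvk_tl_t already laid out in local memory.
+ *
+ *   cvk_tg_t tg;
+ *   cvk_tg_shape_t gs = {1, 32, 16, 16};            // n, c, h, w
+ *   ctx->ops->gmem_init_tensor(ctx, &tg, gs, CVK_FMT_BF16);
+ *   tg.start_address = 0x100000;                    // offset in global memory
+ *
+ *   cvk_tdma_l2g_tensor_copy_param_t p = {0};
+ *   p.src = tl;
+ *   p.dst = &tg;
+ *   ctx->ops->tdma_l2g_bf16_tensor_copy(ctx, &p);
+ */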
reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + reg.intra_cmd_paral = p->intra_cmd_paral; + + set_int8_rnd_mode(®, p->dst->int8_rnd_mode); + //trace_tdma_reg(®, __func__); + + if (status) { + printf("cvkcv180x l2g bf16: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_l2g_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_nc_transposed_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + status |= check_tdma_tl(p->src); + status |= check_tdma_tg(p->dst); + CHECK(status, p->dst->shape.n == p->src->shape.c); + CHECK(status, p->dst->shape.c == p->src->shape.n); + CHECK(status, p->dst->shape.h * p->dst->shape.w == + p->src->shape.h * p->src->shape.w); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 0; + reg.spec_func = 1; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvcv180x l2g nc tp: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_l2g_bf16_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_nc_transposed_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + status |= check_tdma_tl_bf16(p->src); + status |= check_tdma_tg_bf16(p->dst); + CHECK(status, p->dst->shape.n == p->src->shape.c); + CHECK(status, p->dst->shape.c == p->src->shape.n); + CHECK(status, p->dst->shape.h * p->dst->shape.w == + p->src->shape.h * p->src->shape.w); + CHECK(status, !(p->src->fmt == CVK_FMT_I8 && p->dst->fmt == CVK_FMT_BF16)); // not support tl(int8)->tg(bf16) + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 0; + reg.spec_func = 1; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + fill_l2g_fmt(®, p->src->fmt, p->dst->fmt); + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + set_int8_rnd_mode(®, p->dst->int8_rnd_mode); + + if (status) { + printf("cvkcv180x: l2g bf16 nc tp: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_l2g_tensor_copy_cw_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_cw_transposed_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + status |= check_tdma_tl(p->src); + status |= check_tdma_tg(p->dst); + CHECK(status, p->src->shape.n == p->dst->shape.n); + CHECK(status, 
p->src->shape.c == p->dst->shape.w); + CHECK(status, p->src->shape.h == p->dst->shape.h); + CHECK(status, p->src->shape.w == p->dst->shape.c); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.spec_func = 1; + reg.transpose_md = 3; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv180x l2g cw tp: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_l2g_bf16_tensor_copy_cw_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_cw_transposed_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + status |= check_tdma_tl_bf16(p->src); + status |= check_tdma_tg_bf16(p->dst); + CHECK(status, p->src->shape.n == p->dst->shape.n); + CHECK(status, p->src->shape.c == p->dst->shape.w); + CHECK(status, p->src->shape.h == p->dst->shape.h); + CHECK(status, p->src->shape.w == p->dst->shape.c); + + /*not support bf16 mode*/ + CHECK(status, !(p->src->fmt == CVK_FMT_BF16 || p->dst->fmt == CVK_FMT_BF16)); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.spec_func = 1; + reg.transpose_md = 3; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + fill_l2g_fmt(®, p->src->fmt, p->dst->fmt); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv180x l2g bf16 cw tp: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_l2g_tensor_copy_compressed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_compressed_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + status |= check_tdma_tl(p->src); + status |= check_tdma_compressed_tg(p->dst); + status |= check_tl_tg_same_size(p->src, &p->dst->t); + + tdma_reg_t reg; + reset_tdma_reg(®); + + //src->fmt == CVK_FMT_BF16 || p->src->fmt == CVK_FMT_I8 || p->src->fmt == CVK_FMT_U8); + + CHECK(status, p->dst->bias1 == 0); + if (p->src->fmt == CVK_FMT_BF16) { + CHECK(status, p->dst->bias0 == 127); + } + else { + //p->src->fmt == CVK_FMT_I8 || p->src->fmt == CVK_FMT_U8); + CHECK(status, p->dst->bias0 == 0); + CHECK(status, p->dst->zero_guard_en == 0); + } + + reg.src_fmt = (p->src->fmt == CVK_FMT_BF16) ? FMT_BF16_TYP : FMT_FIX8B_TYP; + reg.dst_fmt = reg.src_fmt; + + reg.vld = 1; + reg.trans_dir = 1; + reg.compress_en = 1; + + // VLC constraint under hw compress + //1. in int8/uint8, bias0/bias should be 0/0 + //2. 
in bf16, signed should be 0 and bias0 set to 127, bias1 set to 0 + reg.cmprs_fmt = (p->src->fmt == CVK_FMT_I8); + + // NOTICE: it is recommended to set this to 1 once the data contains '0' under bf16 + reg.compress_zero_guard = p->dst->zero_guard_en ? 1 : 0; + reg.compress_bias0 = p->dst->bias0; + reg.compress_bias1 = p->dst->bias1; + + reg.dst_base_reg_sel = p->dst->t.base_reg_index; + fill_src_addr(&reg, p->src->start_address); + fill_dst_addr(&reg, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(&reg, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->t.shape.c; + reg.dst_h = p->dst->t.shape.h; + reg.dst_w = p->dst->t.shape.w; + reg.dst_n_stride = p->dst->t.stride.n; + fill_dst_c_stride(&reg, p->dst->t.stride.c); + reg.dst_h_stride = p->dst->t.stride.h; + + reg.intra_cmd_paral = p->intra_cmd_paral; + + if (status) { + printf("cvkcv180x: l2g cmpr: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, &reg); +} + +static void tdma_l2g_tensor_fill_constant( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_fill_constant_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + status |= check_tdma_tg_bf16(p->dst); + + tdma_reg_t reg; + reset_tdma_reg(&reg); + + reg.vld = 1; + reg.trans_dir = 1; + reg.spec_func = 4; + reg.const_val = p->constant; + + // only support tl(bf16)->tg(bf16) or tl(fix8b)->tg(fix8b) + fill_l2g_fmt(&reg, p->dst->fmt, p->dst->fmt); + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_dst_addr(&reg, dst_addr); + + reg.src_n = p->dst->shape.n; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(&reg, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv180x l2g fill const: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, &reg); +} + +static void tdma_l2g_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_matrix_copy_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + status |= check_tdma_ml(p->src); + status |= check_tdma_mg(p->dst); + status |= check_ml_mg_same_size(p->src, p->dst); + + tdma_reg_t reg; + reset_tdma_reg(&reg); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 1; + reg.spec_func = 0; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(&reg, p->src->start_address); + fill_dst_addr(&reg, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = 1; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(&reg, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.row; + reg.dst_w = p->dst->shape.col; + fill_dst_c_stride(&reg, p->dst->stride.row); + + if (status) { + printf("cvkcv180x l2g matrix: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, &reg); +} + +static void tdma_l2g_matrix_copy_compressed( + cvk_context_t *ctx, + const cvk_tdma_l2g_matrix_copy_compressed_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + + status |= check_tdma_ml(p->src); + status |= check_tdma_compress_mg(p->dst); + status |= check_tdma_vlc_matrix_compressed_mg(p->dst); + status |= check_ml_mg_same_size(p->src, &p->dst->m); + + tdma_reg_t reg; + reset_tdma_reg(&reg); + + reg.vld = 1; + reg.trans_dir = 1; + reg.compress_en = 1; + reg.sys_dtype = 1; + reg.spec_func = 0; + + // vlc setting + reg.cmprs_fmt = (p->src->fmt == CVK_FMT_I8); + +
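+  // The bias checks below mirror the tensor compress path above: bias1 must be
+  // 0, BF16 sources require bias0 == 127 (presumably the BF16/FP32 exponent
+  // bias), and int8/uint8 sources require bias0 == 0 with the zero guard
+  // disabled; zero_guard_en is only meaningful for BF16 data that may
+  // contain zeros.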
CHECK(status, p->dst->bias1 == 0); + if (p->src->fmt == CVK_FMT_BF16) { + CHECK(status, p->dst->bias0 == 127); + } + else { + //p->src->fmt == CVK_FMT_I8 || p->src->fmt == CVK_FMT_U8); + CHECK(status, p->dst->bias0 == 0); + CHECK(status, p->dst->zero_guard_en == 0); + } + + // NOTICE: it should be 1 once data contain '0' under bf16 + reg.compress_zero_guard = p->dst->zero_guard_en ? 1 : 0; + reg.compress_bias0 = p->dst->bias0; + reg.compress_bias1 = p->dst->bias1; + + reg.dst_base_reg_sel = p->dst->m.base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + fill_l2g_fmt(®, p->src->fmt, p->dst->m.fmt); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = 1; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->m.shape.row; + reg.dst_w = p->dst->m.shape.col; + fill_dst_c_stride(®, p->dst->m.stride.row); + + if (status) { + printf("cvkcv180x l2g matrix cmpr: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_l2g_bf16_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_matrix_copy_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + + status |= check_tdma_ml_bf16(p->src); + status |= check_tdma_mg_bf16(p->dst); + status |= check_ml_mg_same_size(p->src, p->dst); + CHECK(status, !((p->src->fmt == CVK_FMT_I8 || p->src->fmt == CVK_FMT_U8) && p->dst->fmt == CVK_FMT_BF16)); // not support tl(i8/uint8_t)->tg(bf16) + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 1; + reg.spec_func = 0; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + fill_l2g_fmt(®, p->src->fmt, p->dst->fmt); + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = 1; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.row; + reg.dst_w = p->dst->shape.col; + fill_dst_c_stride(®, p->dst->stride.row); + set_int8_rnd_mode(®, p->dst->int8_rnd_mode); + + if (status) { + printf("cvkcv180x l2g bf16 matrix: wrong paramter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_l2g_general_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_general_copy_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + + CHECK(status, p->dst_base_reg_index < TDMA_NUM_BASE_REGS); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.trans_fmt = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + + reg.dst_base_reg_sel = p->dst_base_reg_index; + fill_src_addr(®, p->src_address); + fill_dst_addr(®, dst_addr); + reg.src_n_stride = p->bytes; + + if (status) { + printf("cvkcv180x l2g general: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_l2g_bf16_general_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_bf16_general_copy_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + + CHECK(status, p->dst_base_reg_index < TDMA_NUM_BASE_REGS); + + tdma_reg_t reg; + reset_tdma_reg(®); + // only support fix8b->fix8b or bf16->bf16 + CHECK(status, p->src_fmt == p->dst_fmt); + + reg.vld = 1; + reg.trans_dir = 1; + reg.trans_fmt = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + fill_l2g_fmt(®, p->src_fmt, p->dst_fmt); + + reg.dst_base_reg_sel = p->dst_base_reg_index; + 
fill_src_addr(®, p->src_address); + fill_dst_addr(®, dst_addr); + reg.src_n_stride = p->src_bytes; + + if (status) { + printf("cvkcv180x l2g bf16 general: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +void cvkcv180x_tdma_l2g_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + tdma_l2g_tensor_copy(ctx, p, dst_addr); +} + +void cvkcv180x_tdma_l2g_bf16_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + tdma_l2g_bf16_tensor_copy(ctx, p, dst_addr); +} +void cvkcv180x_tdma_l2g_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_nc_transposed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + tdma_l2g_tensor_copy_nc_transposed(ctx, p, dst_addr); +} + +void cvkcv180x_tdma_l2g_bf16_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_nc_transposed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + tdma_l2g_bf16_tensor_copy_nc_transposed(ctx, p, dst_addr); +} + +void cvkcv180x_tdma_l2g_tensor_copy_cw_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_cw_transposed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + tdma_l2g_tensor_copy_cw_transposed(ctx, p, dst_addr); +} + +void cvkcv180x_tdma_l2g_bf16_tensor_copy_cw_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_cw_transposed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + tdma_l2g_bf16_tensor_copy_cw_transposed(ctx, p, dst_addr); +} + +void cvkcv180x_tdma_l2g_tensor_copy_compressed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_compressed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->t.start_address); + tdma_l2g_tensor_copy_compressed(ctx, p, dst_addr); +} + +void cvkcv180x_tdma_l2g_tensor_fill_constant( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_fill_constant_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + tdma_l2g_tensor_fill_constant(ctx, p, dst_addr); +} + +void cvkcv180x_tdma_l2g_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_matrix_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + tdma_l2g_matrix_copy(ctx, p, dst_addr); +} + +void cvkcv180x_tdma_l2g_bf16_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_matrix_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + tdma_l2g_bf16_matrix_copy(ctx, p, dst_addr); +} + +void cvkcv180x_tdma_l2g_matrix_copy_compressed( + cvk_context_t *ctx, + const cvk_tdma_l2g_matrix_copy_compressed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->m.start_address); + tdma_l2g_matrix_copy_compressed(ctx, p, dst_addr); +} + +void cvkcv180x_tdma_l2g_general_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_general_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst_address); + tdma_l2g_general_copy(ctx, p, dst_addr); +} + +void cvkcv180x_tdma_l2g_bf16_general_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_bf16_general_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst_address); + tdma_l2g_bf16_general_copy(ctx, p, dst_addr); +} + +/* + * Direction: G2L + */ + +static void tdma_g2l_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_param_t *p, + uint64_t 
src_addr) +{ + int8_t status = 0; + status |= check_tdma_tg(p->src); + status |= check_tdma_tl(p->dst); + CHECK(status, p->src->shape.n == p->dst->shape.n); + status |= check_tl_tg_same_size(p->dst, p->src); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 0; + reg.spec_func = 0; + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + reg.intra_cmd_paral = p->intra_cmd_paral; + + if (status) { + printf("cvkcv180x g2l: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_bf16_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_param_t *p, + uint64_t src_addr) +{ + int8_t status = 0; + + status |= check_tdma_tg_bf16(p->src); + status |= check_tdma_tl_bf16(p->dst); + CHECK(status, p->src->shape.n == p->dst->shape.n); + CHECK(status, !(p->src->fmt == CVK_FMT_BF16 && p->dst->fmt == CVK_FMT_I8)); // not support tg(bf16)->tl(int8) + status |= check_tl_tg_same_size(p->dst, p->src); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 0; + reg.spec_func = 0; + + reg.src_base_reg_sel = p->src->base_reg_index; + + fill_g2l_fmt(®, p->src->fmt, p->dst->fmt); + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + reg.intra_cmd_paral = p->intra_cmd_paral; + + if (status) { + printf("cvkcv180x g2l bf16: wrong parameter\n"); + return; + } + + //trace_tdma_reg(®, __func__); + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_nc_transposed_param_t *p, + uint64_t src_addr) +{ + int8_t status = 0; + + status |= check_tdma_tg(p->src); + status |= check_tdma_tl(p->dst); + CHECK(status, p->dst->shape.n == p->src->shape.c); + CHECK(status, p->dst->shape.c == p->src->shape.n); + CHECK(status, p->dst->shape.h * p->dst->shape.w == + p->src->shape.h * p->src->shape.w); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 0; + reg.spec_func = 1; + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = 
p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv180x g2l nc tp: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_bf16_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_nc_transposed_param_t *p, + uint64_t src_addr) +{ + int8_t status = 0; + + status |= check_tdma_tg_bf16(p->src); + status |= check_tdma_tl_bf16(p->dst); + CHECK(status, p->dst->shape.n == p->src->shape.c); + CHECK(status, p->dst->shape.c == p->src->shape.n); + CHECK(status, p->dst->shape.h * p->dst->shape.w == + p->src->shape.h * p->src->shape.w); + + CHECK(status, !(p->src->fmt == CVK_FMT_BF16 && p->dst->fmt == CVK_FMT_I8)); // not support tg(bf16)->tl(int8) + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 0; + reg.spec_func = 1; + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + fill_g2l_fmt(®, p->src->fmt, p->dst->fmt); + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv180x g2l bf16 nc tp: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_tensor_copy_chw_rotated( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_chw_rotated_param_t *p, + uint64_t src_addr) +{ + int8_t status = 0; + + status |= check_tdma_tg(p->src); + status |= check_tdma_tl(p->dst); + + CHECK(status, p->src->shape.c == 3 || p->src->shape.c == 4); + CHECK(status, p->src->shape.n == p->dst->shape.n); + CHECK(status, p->src->shape.c == p->dst->shape.c); + CHECK(status, p->src->shape.h == p->dst->shape.h); + CHECK(status, p->src->shape.w == p->dst->shape.w); + + CHECK(status, p->dst->start_address % ctx->info.eu_num == 0); + CHECK(status, p->dst->stride.n % ctx->info.eu_num == 0); + CHECK(status, p->dst->stride.c % ctx->info.eu_num == 0); + CHECK(status, p->dst->stride.h == p->dst->shape.w); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 0; + reg.spec_func = 1; + + if (p->dst->shape.c == 3) + reg.transpose_md = 1; + else if(p->dst->shape.c == 4) + reg.transpose_md = 2; + else + CHECK(status, 0); + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, 1); + reg.src_h_stride = p->src->shape.c * p->src->shape.w; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv180x g2l chw: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_tensor_copy_decompressed( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_decompressed_param_t *p, + uint64_t 
src_addr) +{ + int8_t status = 0; + + status |= check_tdma_compressed_tg(p->src); + status |= check_tdma_tl(p->dst); + status |= check_tl_tg_same_size(p->dst, &p->src->t); + + tdma_reg_t reg; + reset_tdma_reg(&reg); + + //dst->fmt == CVK_FMT_BF16 || p->dst->fmt == CVK_FMT_I8 || p->dst->fmt == CVK_FMT_U8); + fill_g2l_fmt(&reg, p->src->t.fmt, p->dst->fmt); + + reg.vld = 1; + reg.trans_dir = 0; + reg.compress_en = 1; + reg.cmprs_fmt = (p->src->t.fmt == CVK_FMT_I8); + + reg.src_base_reg_sel = p->src->t.base_reg_index; + fill_src_addr(&reg, src_addr); + fill_dst_addr(&reg, p->dst->start_address); + + reg.src_n = p->src->t.shape.n; + reg.src_c = p->src->t.shape.c; + reg.src_h = p->src->t.shape.h; + reg.src_w = p->src->t.shape.w; + reg.src_n_stride = p->src->t.stride.n; + fill_src_c_stride(&reg, p->src->t.stride.c); + reg.src_h_stride = p->src->t.stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(&reg, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + reg.intra_cmd_paral = p->intra_cmd_paral; + + if (status) { + printf("cvkcv180x g2l cmpr: wrong parameter\n"); + return; + } + + // trace_tdma_reg(&reg, __FUNCTION__); + + (void *)emit_tdma_cmdbuf(ctx, &reg); +} + +static void tdma_g2l_tensor_fill_constant( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_fill_constant_param_t *p) +{ + int8_t status = 0; + + status |= check_tdma_tl(p->dst); + + tdma_reg_t reg; + reset_tdma_reg(&reg); + + reg.vld = 1; + reg.trans_dir = 0; + reg.spec_func = 4; + reg.const_val = p->constant; + + reg.dst_fmt = (p->dst->fmt == CVK_FMT_BF16) ? 2 : 1; + + fill_dst_addr(&reg, p->dst->start_address); + + reg.src_n = p->dst->shape.n; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(&reg, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv180x g2l fill const: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, &reg); +} + +static void tdma_g2l_bf16_tensor_fill_constant( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_fill_constant_param_t *p) +{ + int8_t status = 0; + + status |= check_tdma_tl_bf16(p->dst); + + tdma_reg_t reg; + reset_tdma_reg(&reg); + + reg.vld = 1; + reg.trans_dir = 0; + reg.spec_func = 4; + reg.const_val = p->constant; + + /*only support fix8b->fix8b or bf16->bf16*/ + fill_g2l_fmt(&reg, p->dst->fmt, p->dst->fmt); + fill_dst_addr(&reg, p->dst->start_address); + + reg.src_n = p->dst->shape.n; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(&reg, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv180x g2l bf16 fill const: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, &reg); +} + +static void tdma_g2l_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_param_t *p, + uint64_t src_addr) +{ + int8_t status = 0; + + status |= check_tdma_mg(p->src); + status |= check_tdma_ml(p->dst); + CHECK(status, p->dst->shape.n == p->src->shape.row); + status |= check_ml_mg_same_size(p->dst, p->src); + + tdma_reg_t reg; + reset_tdma_reg(&reg); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 1; + reg.spec_func = 0; + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(&reg, src_addr); + fill_dst_addr(&reg, p->dst->start_address); + + reg.src_n = p->src->shape.row; + reg.src_c = p->src->shape.row; + 
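+  // As in the other matrix copy paths in this file, the global matrix row
+  // count is programmed into both src_n and src_c, and the column count into
+  // src_w below; the local destination is then described through its
+  // (c, 1, w) layout.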
reg.src_w = p->src->shape.col; + fill_src_c_stride(®, p->src->stride.row); + + reg.dst_c = p->dst->shape.c; + reg.dst_h = 1; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv180x g2l matrix: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_matrix_copy_decompressed( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_decompressed_param_t *p, + uint64_t src_addr) +{ + int8_t status = 0; + + status |= check_tdma_vlc_matrix_compressed_mg(p->src); + status |= check_tdma_mg(&p->src->m); + status |= check_tdma_ml(p->dst); + CHECK(status, p->dst->shape.n == p->src->m.shape.row); + status |= check_ml_mg_same_size(p->dst, &p->src->m); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 1; + reg.spec_func = 0; + reg.compress_en = 1; + reg.cmprs_fmt = (p->src->m.fmt == CVK_FMT_I8); + + fill_g2l_fmt(®, p->src->m.fmt, p->dst->fmt); + reg.src_base_reg_sel = p->src->m.base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->m.shape.row; + reg.src_c = p->src->m.shape.row; + reg.src_w = p->src->m.shape.col; + fill_src_c_stride(®, p->src->m.stride.row); + + reg.dst_c = p->dst->shape.c; + reg.dst_h = 1; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv180x g2l matrix cmpr: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_bf16_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_param_t *p, + uint64_t src_addr) +{ + int8_t status = 0; + + status |= check_tdma_mg_bf16(p->src); + status |= check_tdma_ml_bf16(p->dst); + CHECK(status, p->dst->shape.n == p->src->shape.row); + status |= check_ml_mg_same_size(p->dst, p->src); + CHECK(status, !(p->src->fmt == CVK_FMT_BF16 && p->dst->fmt == CVK_FMT_I8)); // not support tg(bf16)->tl(int8) + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 1; + reg.spec_func = 0; + + fill_g2l_fmt(®, p->src->fmt, p->dst->fmt); + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.row; + reg.src_c = p->src->shape.row; + reg.src_w = p->src->shape.col; + fill_src_c_stride(®, p->src->stride.row); + + reg.dst_c = p->dst->shape.c; + reg.dst_h = 1; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv180x g2l bf16 matrix: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_matrix_copy_row_col_transposed( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_row_col_transposed_param_t *p, + uint64_t src_addr) +{ + int8_t status = 0; + + status |= check_tdma_mg(p->src); + status |= check_tdma_ml(p->dst); + CHECK(status, p->dst->shape.n == p->src->shape.col); + CHECK(status, p->dst->shape.col == p->src->shape.row); + status |= check_ml_mg_same_size(p->dst, p->src); + + CHECK(status, p->src->shape.row >= p->dst->shape.w); + CHECK(status, p->dst->shape.c == + (uint32_t) ceiling_func(p->src->shape.row, p->dst->shape.w)); + + CHECK(status, p->dst->start_address % ctx->info.eu_num == 0); + CHECK(status, 
p->dst->stride.n % ctx->info.eu_num == 0); + CHECK(status, p->dst->stride.c % ctx->info.eu_num == 0); + CHECK(status, p->dst->stride.h == p->dst->shape.w); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 1; + reg.spec_func = 1; + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.row; + reg.src_c = p->src->shape.row; + reg.src_w = p->src->shape.col; + fill_src_c_stride(®, p->src->stride.row); + + reg.dst_c = p->dst->shape.c; + reg.dst_h = 1; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv180x g2l matrix tp: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_general_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_general_copy_param_t *p, + uint64_t src_addr) +{ + int8_t status = 0; + CHECK(status, p->src_base_reg_index < TDMA_NUM_BASE_REGS); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.trans_fmt = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + + reg.src_base_reg_sel = p->src_base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst_address); + reg.src_n_stride = p->bytes; + + if (status) { + printf("cvkcv180x g2l general: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_bf16_general_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_bf16_general_copy_param_t *p, + uint64_t src_addr) +{ + int8_t status = 0; + + CHECK(status, p->src_base_reg_index < TDMA_NUM_BASE_REGS); + // only support fix8b->fix8b or bf16->bf16 + CHECK(status, p->dst_fmt == p->src_fmt); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.trans_fmt = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + + fill_g2l_fmt(®, p->src_fmt, p->dst_fmt); + + reg.src_base_reg_sel = p->src_base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst_address); + reg.src_n_stride = p->src_bytes; + + if (status) { + printf("cvkcv180x g2l bf16 general: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +void cvkcv180x_tdma_g2l_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + tdma_g2l_tensor_copy(ctx, p, src_addr); +} + +void cvkcv180x_tdma_g2l_bf16_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + tdma_g2l_bf16_tensor_copy(ctx, p, src_addr); +} + +void cvkcv180x_tdma_g2l_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_nc_transposed_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + tdma_g2l_tensor_copy_nc_transposed(ctx, p, src_addr); +} + +void cvkcv180x_tdma_g2l_bf16_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_nc_transposed_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + tdma_g2l_bf16_tensor_copy_nc_transposed(ctx, p, src_addr); +} + +void cvkcv180x_tdma_g2l_tensor_copy_chw_rotated( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_chw_rotated_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + tdma_g2l_tensor_copy_chw_rotated(ctx, p, src_addr); +} + +void 
cvkcv180x_tdma_g2l_tensor_copy_decompressed( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_decompressed_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->t.start_address); + tdma_g2l_tensor_copy_decompressed(ctx, p, src_addr); +} + +void cvkcv180x_tdma_g2l_tensor_fill_constant( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_fill_constant_param_t *p) +{ + tdma_g2l_tensor_fill_constant(ctx, p); +} + +void cvkcv180x_tdma_g2l_bf16_tensor_fill_constant( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_fill_constant_param_t *p) +{ + tdma_g2l_bf16_tensor_fill_constant(ctx, p); +} + +void cvkcv180x_tdma_g2l_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + tdma_g2l_matrix_copy(ctx, p, src_addr); +} + +void cvkcv180x_tdma_g2l_matrix_copy_decompressed( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_decompressed_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->m.start_address); + tdma_g2l_matrix_copy_decompressed(ctx, p, src_addr); +} + +void cvkcv180x_tdma_g2l_bf16_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + tdma_g2l_bf16_matrix_copy(ctx, p, src_addr); +} + +void cvkcv180x_tdma_g2l_matrix_copy_row_col_transposed( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_row_col_transposed_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + tdma_g2l_matrix_copy_row_col_transposed(ctx, p, src_addr); +} + +void cvkcv180x_tdma_g2l_general_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_general_copy_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src_address); + tdma_g2l_general_copy(ctx, p, src_addr); +} + +void cvkcv180x_tdma_g2l_bf16_general_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_bf16_general_copy_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src_address); + tdma_g2l_bf16_general_copy(ctx, p, src_addr); +} +/* + * Direction: TG2TG + */ +static void cvkcv180x_tdma_copy_gmem( + cvk_context_t *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *p, + uint8_t u8_trans_fmt) +{ + tdma_reg_t reg; + + reset_tdma_reg(®); + + uint64_t u64_src_addr; + uint64_t u64_dst_addr; + + reg.vld = 1; + reg.trans_dir = 2; // 0:g2l, 1:l2g, 2:g2g, 3:l2l + reg.trans_fmt = u8_trans_fmt; // 1:general copy, 2:tensor copy + reg.sys_dtype = 0; // + reg.spec_func = 0; // + + u64_src_addr = absolute_gmem_addr(p->src->start_address); + u64_dst_addr = absolute_gmem_addr(p->dst->start_address); + fill_src_addr(®, u64_src_addr); + fill_dst_addr(®, u64_dst_addr); + + reg.src_base_reg_sel = p->src->base_reg_index; + reg.dst_base_reg_sel = p->dst->base_reg_index; + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p-> dst->stride.h; + + (void *)emit_tdma_cmdbuf( ctx, ®); +} + +static void cvkcv180x_tdma_bf16_copy_gmem( + cvk_context_t *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *p, + uint8_t u8_trans_fmt) +{ + int8_t status = 0; + tdma_reg_t reg; + + reset_tdma_reg(®); + + uint64_t u64_src_addr; + uint64_t u64_dst_addr; + + reg.vld = 1; + 
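+  // This bf16 G2G path differs from cvkcv180x_tdma_copy_gmem() above only in
+  // that it programs src_fmt/dst_fmt explicitly and requires the two formats
+  // to match (see the CHECK below), so no format conversion happens here.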
reg.trans_dir = 2; // 0:g2l, 1:l2g, 2:g2g, 3:l2l + reg.trans_fmt = u8_trans_fmt; // 1:general copy, 2:tensor copy + reg.sys_dtype = 0; // + reg.spec_func = 0; // + CHECK(status, p->src->fmt == p->dst->fmt); + + reg.dst_fmt = (p->dst->fmt == CVK_FMT_BF16) ? 2 : 1; + reg.src_fmt = (p->src->fmt == CVK_FMT_BF16) ? 2 : 1; + + u64_src_addr = absolute_gmem_addr(p->src->start_address); + u64_dst_addr = absolute_gmem_addr(p->dst->start_address); + fill_src_addr(®, u64_src_addr); + fill_dst_addr(®, u64_dst_addr); + + reg.src_base_reg_sel = p->src->base_reg_index; + reg.dst_base_reg_sel = p->dst->base_reg_index; + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p-> dst->stride.h; + + if (status) { + printf("cvkcv180x bf16 gmem: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +/* + * Direction: G2G + */ +void cvkcv180x_tdma_g2g_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *p) +{ + cvkcv180x_tdma_copy_gmem(ctx, p, 2); +} + +void cvkcv180x_tdma_g2g_bf16_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *p) +{ + cvkcv180x_tdma_bf16_copy_gmem(ctx, p, 2); +} + +void cvkcv180x_tdma_g2g_general_copy( + cvk_context_t *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *p) +{ + cvkcv180x_tdma_copy_gmem(ctx, p, 1); +} + +void cvkcv180x_tdma_g2g_bf16_general_copy( + cvk_context_t *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *p) +{ + cvkcv180x_tdma_bf16_copy_gmem(ctx, p, 1); +} diff --git a/cvikernel/src/cv180x/tiu_add.c b/cvikernel/src/cv180x/tiu_add.c new file mode 100644 index 000000000..e4db3fa14 --- /dev/null +++ b/cvikernel/src/cv180x/tiu_add.c @@ -0,0 +1,88 @@ +#include "cvkcv180x.h" + +void cvkcv180x_tiu_add( + cvk_context_t *ctx, + const cvk_tiu_add_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->a_low->fmt == CVK_FMT_BF16) ? 1 : 0; + + if (bf16_enable) { + /*bf16 only support 16 bit*/ + CHECK(status, !p->a_high); + CHECK(status, !(p->b.high && !p->b_is_const)); + CHECK(status, !p->res_high); + status |= check_tiu_tensor(p->a_low); + status |= check_tiu_tensor(p->res_low); + status |= check_same_shape(p->res_low, p->a_low); + if (!p->b_is_const) { + status |= check_tiu_tensor(p->b.low); + status |= check_same_shape(p->res_low, p->b.low); + } + } else { + status |= check_16bit_tiu_tensor(p->a_low, p->a_high); + status |= check_tiu_tensor(p->res_low); + status |= check_same_shape(p->res_low, p->a_low); + if (!p->b_is_const) { + status |= check_16bit_tiu_tensor(p->b.low, p->b.high); + status |= check_same_shape(p->res_low, p->b.low); + } + } + if (p->res_high) + status |= check_16bit_tiu_tensor(p->res_low, p->res_high); + CHECK(status, p->relu_enable == 0 || p->relu_enable == 1); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_ADD_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 
1: 0; + reg.opt_res_shift = 0; + reg.opt_relu_typ = p->relu_enable; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + int arith_shift = tensor_is_signed(p->res_low); + reg.opt_shift_typ = arith_shift; + reg.opt_res_shift = p->rshift_bits; + + reg.opd0_addr = p->a_low->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a_low); + reg.opt_opd0_seg = (p->a_high == NULL); + reg.opd0_b_str = bf16_enable ? 0 : (p->a_high->start_address - p->a_low->start_address); + fill_opd0_stride(®, &p->a_low->stride); + + reg.opt_opd1_seg = bf16_enable ? 1 : 0; //(p->b_high == NULL); b_high is the same as b_val + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opt_opd1_sign = !!p->b_const.is_signed; + reg.opd1_addr = p->b_const.val; + } else { + reg.opt_opd1_const = 0; + reg.opt_opd1_sign = tensor_is_signed(p->b.low); + reg.opd1_addr = p->b.low->start_address; + reg.opd1_b_str = bf16_enable ? 0 : (p->b.high->start_address - p->b.low->start_address); + fill_opd1_stride(®, &p->b.low->stride); + } + + reg.res0_addr = p->res_low->start_address; + reg.opt_res0_sign = tensor_is_signed(p->res_low); + reg.opt_res0_seg = (p->res_high == NULL); + fill_res0_stride(®, &p->res_low->stride); + if (p->res_high) + reg.res0_b_str = p->res_high->start_address - p->res_low->start_address; + if (p->relu_enable) + CHECK(status, reg.opt_res0_seg); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv180x tiu_add: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv180x/tiu_and.c b/cvikernel/src/cv180x/tiu_and.c new file mode 100644 index 000000000..8ead53205 --- /dev/null +++ b/cvikernel/src/cv180x/tiu_and.c @@ -0,0 +1,111 @@ +#include "cvkcv180x.h" + +void cvkcv180x_tiu_and_int8( + cvk_context_t *ctx, + const cvk_tiu_and_int8_param_t *p) +{ + int8_t status = 0; + status |= check_tiu_tensor_3(p->res, p->a, p->b); + status |= check_same_shape_3(p->res, p->a, p->b); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_AND_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 1; + fill_opd0_stride(®, &p->a->stride); + + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 1; + fill_opd1_stride(®, &p->b->stride); + + reg.res0_addr = p->res->start_address; + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 1; + fill_res0_stride(®, &p->res->stride); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv180x tiu and: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} + +void cvkcv180x_tiu_and_int16( + cvk_context_t *ctx, + const cvk_tiu_and_int16_param_t *p) +{ + int8_t status = 0; + status |= check_16bit_tiu_tensor(p->a_low, p->a_high); + status |= check_16bit_tiu_tensor(p->b_low, p->b_high); + status |= check_16bit_tiu_tensor(p->res_low, p->res_high); + status |= check_same_shape_3(p->res_low, p->a_low, p->b_low); + + int res_high_addr = p->res_high->start_address; + int res_low_addr = p->res_low->start_address; + CHECK(status, res_high_addr > res_low_addr); + int res_b_stride = res_high_addr - res_low_addr; + + int a_high_addr = p->a_high->start_address; + int a_low_addr = p->a_low->start_address; + CHECK(status, a_high_addr > a_low_addr); + int a_b_stride = 
a_high_addr - a_low_addr; + + int b_high_addr = p->b_high->start_address; + int b_low_addr = p->b_low->start_address; + CHECK(status, b_high_addr > b_low_addr); + int b_b_stride = b_high_addr - b_low_addr; + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_AND_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = a_low_addr; + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 0; + reg.opd0_b_str = a_b_stride; + fill_opd0_stride(®, &p->a_low->stride); + + reg.opd1_addr = b_low_addr; + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 0; + reg.opd1_b_str = b_b_stride; + fill_opd1_stride(®, &p->b_low->stride); + + reg.res0_addr = res_low_addr; + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 0; + reg.res0_b_str = res_b_stride; + fill_res0_stride(®, &p->res_low->stride); + + if (status) { + printf("cvkcv180x tiu and: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv180x/tiu_average_pooling.c b/cvikernel/src/cv180x/tiu_average_pooling.c new file mode 100644 index 000000000..2f11199c6 --- /dev/null +++ b/cvikernel/src/cv180x/tiu_average_pooling.c @@ -0,0 +1,94 @@ +#include "cvkcv180x.h" + +void cvkcv180x_tiu_average_pooling( + cvk_context_t *ctx, + const cvk_tiu_average_pooling_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->ifmap->fmt == CVK_FMT_BF16) ? 1 : 0; + + CHECK(status, p->ifmap->shape.n == p->ofmap->shape.n); + CHECK(status, p->ifmap->shape.c == p->ofmap->shape.c); + CHECK(status, p->stride_h < 32 && p->stride_h > 0); + CHECK(status, p->stride_w < 32 && p->stride_w > 0); + CHECK(status, p->pad_top < 16); + CHECK(status, p->pad_bottom < 16); + CHECK(status, p->pad_left < 16); + CHECK(status, p->pad_right < 16); + CHECK(status, p->ins_h < 15); + CHECK(status, p->ins_last_h < 15); + CHECK(status, p->ins_w < 15); + CHECK(status, p->ins_last_w < 15); + + status |= check_tiu_tensor_2(p->ifmap, p->ofmap); + if (bf16_enable) { + status |= check_bf16_stride_type_0(ctx, p->ifmap); + status |= check_bf16_stride_type_0(ctx, p->ofmap); + } else { + status |= check_stride_type_0(ctx, p->ifmap); + status |= check_stride_type_0(ctx, p->ofmap); + } + + int opd0_sign = tensor_is_signed(p->ifmap); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B; + reg.tsk_eu_typ = 1; + reg.opt_shift_typ = opd0_sign; + reg.opt_res_shift = p->rshift_bits; + reg.opt_relu_typ = 0; /* hardware relu function not verified. */ + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 
1: 0; + + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = opd0_sign; + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + reg.conv_opd0_x_ins0 = p->ins_w; + reg.conv_opd0_y_ins0 = p->ins_h; + reg.conv_opd0_x_ins0_last = p->ins_last_w; + reg.conv_opd0_y_ins0_last = p->ins_last_h; + + reg.opt_opd1_const = 1; + /*HW does not have divide, so we need to calculate the value here*/ + if (bf16_enable) + reg.opd1_addr = + cvk_convert_fp32_bf16( + (float)(cvk_convert_bf16_fp32(p->avg_pooling_const) / (p->kh * p->kw))); + else + reg.opd1_addr = p->avg_pooling_const; + + reg.opd1_h = p->kh; + reg.opd1_w = p->kw; + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 1; + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + reg.opd0_ins_val = bf16_enable ? + (uint32_t)p->ins_fp : (uint32_t)p->ins_val; + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv180x tiu avg pool: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, &reg); +} diff --git a/cvikernel/src/cv180x/tiu_convolution.c b/cvikernel/src/cv180x/tiu_convolution.c new file mode 100644 index 000000000..904b50a8f --- /dev/null +++ b/cvikernel/src/cv180x/tiu_convolution.c @@ -0,0 +1,175 @@ +#include "cvkcv180x.h" + +typedef cvk_tiu_convolution_param_t param_t; + +static int can_do_double_conv(cvk_context_t *ctx, const param_t *p) +{ + if ((p->ifmap->start_address % ctx->info.lmem_size) % 2 == 0 && + p->ifmap->shape.c % 2 == 0 && + p->ifmap->shape.c >= 4 && + p->weight->start_address % 2 == 0) + return 1; + + return 0; +} + +static int8_t check_conv_param(cvk_context_t *ctx, const param_t *p) +{ + int8_t status = 0; + uint32_t eu_num = ctx->info.eu_num; + + status |= check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight); + status |= check_stride_type_0(ctx, p->ifmap); + + CHECK(status, (p->ofmap->stride.n % eu_num) == 0); + CHECK(status, p->ifmap->start_address % eu_num == 0); + CHECK(status, p->ofmap->start_address % eu_num == 0); + CHECK(status, p->ifmap->shape.n == p->ofmap->shape.n); + CHECK(status, !(p->ifmap->shape.h == 1 && p->ins_h > 0)); + CHECK(status, p->weight->shape.n == p->ifmap->shape.c); + CHECK(status, p->weight->shape.c == p->ofmap->shape.c); + + if (p->chl_quan_param) { + status |= check_tiu_tensor(p->chl_quan_param); + status |= check_stride_type_2(ctx, p->chl_quan_param); + CHECK(status, p->chl_quan_param->start_address % eu_num == 0); + } + if (can_do_double_conv(ctx, p)) { + uint32_t lmem_i = p->ifmap->start_address % ctx->info.lmem_size; + CHECK(status, lmem_i % 2 == 0); + CHECK(status, p->ifmap->shape.c % 2 == 0); + CHECK(status, p->ifmap->shape.c >= 4); /* Otherwise performance will suffer */ + CHECK(status, p->weight->start_address % 2 == 0); + } + if(p->ps32_mode & 0x2) + { + CHECK(status, !p->relu_enable); + CHECK(status, !p->has_bias); + + CHECK(status, p->cmd_pre_exe <= 1); + } + CHECK(status, p->stride_h < 32 && p->stride_h > 0); + CHECK(status, p->stride_w < 32 && p->stride_w > 0); + CHECK(status, p->pad_top < 16); + CHECK(status, p->pad_bottom < 16); + CHECK(status, p->pad_left < 16); + 
CHECK(status, p->pad_right < 16); + CHECK(status, p->ins_h < 15); + CHECK(status, p->ins_last_h < 15); + CHECK(status, p->ins_w < 15); + CHECK(status, p->ins_last_w < 15); + CHECK(status, p->dilation_h >= 1); + CHECK(status, p->dilation_w >= 1); + CHECK(status, p->relu_enable == 0 || p->relu_enable == 1); + + return status; +} + +void cvkcv180x_tiu_convolution(cvk_context_t *ctx, const param_t *p) +{ + int8_t status = 0; + + status |= check_conv_param(ctx, p); + + int opd0_sign = tensor_is_signed(p->ifmap); + int opd1_sign = tensor_is_signed(p->weight); + int arith_shift = opd0_sign || opd1_sign; + + tiu_reg_t reg; + reset_tiu_reg(&reg); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_CONV_FIX8B; + reg.opt_shift_typ = arith_shift; + reg.opt_relu_typ = !!(p->relu_enable); + reg.tsk_opd_num = 2; + + /*always automatically enable double conv in those situations*/ + if (can_do_double_conv(ctx, p)) + reg.double_conv = 1; + + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = tensor_is_signed(p->ofmap); + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + reg.res0_n_str = p->ofmap->stride.n; + reg.res0_c_str = p->ofmap->stride.c; + reg.res0_h_str = p->ofmap->stride.h; + reg.res0_w_str = p->ofmap->stride.w; + reg.short_res0_str = 3; // Manual instead of h/w + reg.ps32_md = p->ps32_mode; + if (p->ps32_mode > 0) { + reg.res0_b_str = p->ofmap->shape.n * p->ofmap->stride.n; + + // Per-channel parameter does not have right shift (default is 10). + // Set to zero. + reg.opt_res_shift = 0; + } + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.opd0_ins_val = (uint32_t)p->ins_val; + reg.short_opd0_str = 0; + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + reg.conv_opd0_x_ins0 = p->ins_w; + reg.conv_opd0_y_ins0 = p->ins_h; + reg.conv_opd0_x_ins0_last = p->ins_last_w; + reg.conv_opd0_y_ins0_last = p->ins_last_h; + + reg.opd1_addr = p->weight->start_address; + reg.opt_opd1_sign = opd1_sign; + reg.opt_opd1_seg = 1; + reg.opt_opd1_const = p->w_is_const; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + reg.short_opd1_str = 1; + reg.conv_opd1_x_ins0 = p->dilation_w - 1; + reg.conv_opd1_y_ins0 = p->dilation_h - 1; + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + + if (p->chl_quan_param) { + CHECK(status, p->chl_quan_param->shape.n == 1); + CHECK(status, p->chl_quan_param->shape.c == p->ofmap->shape.c); + CHECK(status, p->chl_quan_param->shape.h == 1); + CHECK(status, p->chl_quan_param->shape.w == 1); + reg.opt_chl_quan = 1; + reg.opt_res_shift = 0; // useless + reg.opd2_addr = p->chl_quan_param->start_address; + reg.opd2_n = p->chl_quan_param->shape.n; + reg.opd2_c = p->chl_quan_param->shape.c; + reg.opd2_h = p->chl_quan_param->shape.h; + reg.opd2_w = p->chl_quan_param->shape.w; + } + reg.opt_opd2_seg = 1; // useless, force to 1 to skip b_stride check + reg.short_opd2_str = 2; // useless + reg.opd2_b_str = 0; // useless + + if (p->has_bias) { + reg.tsk_opd_num = 3; + reg.opt_opd2_sign = 1; + } + + reg.layer_info = p->layer_id; + reg.cmd_pre_exe_typ = p->cmd_pre_exe_typ; + reg.cmd_pre_exe = 
p->cmd_pre_exe; + + if (status) { + printf("cvkcv180x tiu conv: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv180x/tiu_copy.c b/cvikernel/src/cv180x/tiu_copy.c new file mode 100644 index 000000000..5d627eeee --- /dev/null +++ b/cvikernel/src/cv180x/tiu_copy.c @@ -0,0 +1,47 @@ +#include "cvkcv180x.h" + +void cvkcv180x_tiu_copy( + cvk_context_t *ctx, + const cvk_tiu_copy_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->src->fmt == CVK_FMT_BF16) ? 1 : 0; + + status |= check_tiu_tensor_2(p->dst, p->src); + status |= check_same_shape(p->dst, p->src); + status |= check_stride_range(p->dst->stride); + status |= check_stride_range(p->src->stride); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_COPY_FIX8B; + reg.tsk_opd_num = 1; + reg.opd_typ = bf16_enable ? 1: 0; + reg.opt_res_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->dst->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->src->start_address; + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 1; + fill_opd0_stride(®, &p->src->stride); + + reg.res0_addr = p->dst->start_address; + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 1; + fill_res0_stride(®, &p->dst->stride); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv180x tiu copy: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv180x/tiu_depthwise_convolution.c b/cvikernel/src/cv180x/tiu_depthwise_convolution.c new file mode 100644 index 000000000..5d3f4420a --- /dev/null +++ b/cvikernel/src/cv180x/tiu_depthwise_convolution.c @@ -0,0 +1,147 @@ +#include "cvkcv180x.h" + +void cvkcv180x_tiu_depthwise_convolution( + cvk_context_t *ctx, + const cvk_tiu_depthwise_convolution_param_t *p) +{ + int8_t status = 0; + uint32_t eu_num = ctx->info.eu_num; + + int8_t isMulConst = (p->weight_is_const == 1) ? 
1 : 0; + + if(isMulConst) { + status |= check_tiu_tensor_2(p->ifmap, p->ofmap); + } else { + status |= check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight); + } + status |= check_stride_type_0(ctx, p->ifmap); + if(!isMulConst){ + status |= check_stride_type_0(ctx, p->weight); + } + status |= check_tiu_tensor(p->chl_quan_param); + status |= check_stride_type_2(ctx, p->chl_quan_param); + + CHECK(status, (p->ofmap->stride.n % eu_num) == 0); + CHECK(status, p->chl_quan_param->start_address %eu_num == 0); + CHECK(status, p->ifmap->shape.n == p->ofmap->shape.n); + CHECK(status, p->ifmap->shape.c == p->ofmap->shape.c); + if (!isMulConst) { + CHECK(status, p->ifmap->shape.c == p->weight->shape.c); + CHECK(status, p->weight->shape.n == 1); + } + CHECK(status, p->relu_enable == 0 || p->relu_enable == 1); + CHECK(status, p->stride_h < 32 && p->stride_h > 0); + CHECK(status, p->stride_w < 32 && p->stride_w > 0); + CHECK(status, p->pad_top < 16); + CHECK(status, p->pad_bottom < 16); + CHECK(status, p->pad_left < 16); + CHECK(status, p->pad_right < 16); + CHECK(status, p->ins_h < 15); + CHECK(status, p->ins_last_h < 15); + CHECK(status, p->ins_w < 15); + CHECK(status, p->ins_last_w < 15); + CHECK(status, p->dilation_h >= 1); + CHECK(status, p->dilation_w >= 1); + + int opd0_sign = tensor_is_signed(p->ifmap); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B; + reg.tsk_eu_typ = 2; + reg.opt_relu_typ = p->relu_enable; + reg.opt_shift_typ = 1; + reg.tsk_opd_num = 2; + + int res0_sign = tensor_is_signed(p->ofmap); + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = res0_sign; + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + reg.res0_n_str = p->ofmap->stride.n; + reg.res0_c_str = p->ofmap->stride.c; + reg.res0_h_str = p->ofmap->stride.h; + reg.res0_w_str = p->ofmap->stride.w; + reg.short_res0_str = 3; // Manual instead of h/w + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.opd0_n_str = p->ifmap->stride.n; + reg.opd0_c_str = p->ifmap->stride.c; + reg.opd0_h_str = p->ifmap->stride.h; + reg.opd0_w_str = p->ifmap->stride.w; + reg.opd0_ins_val = (uint32_t)p->ins_val; + reg.short_opd0_str = 3; // Manual instead of h/w + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + reg.conv_opd0_x_ins0 = p->ins_w; + reg.conv_opd0_y_ins0 = p->ins_h; + reg.conv_opd0_x_ins0_last = p->ins_last_w; + reg.conv_opd0_y_ins0_last = p->ins_last_h; + + reg.opt_opd1_sign = 1; + reg.opt_opd1_seg = 1; + reg.conv_opd1_x_ins0 = p->dilation_w - 1; + reg.conv_opd1_y_ins0 = p->dilation_h - 1; + if (isMulConst) { + reg.opt_opd1_const = 1; + reg.opt_opd1_sign = p->weight_const.is_signed; + reg.opd1_addr = p->weight_const.val; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + } else { + reg.opd1_addr = p->weight->start_address; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + } + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + reg.cmd_pre_exe_typ = p->cmd_pre_exe_typ; + 
reg.cmd_pre_exe = p->cmd_pre_exe; + + CHECK(status, p->chl_quan_param->shape.n == 1); + CHECK(status, p->chl_quan_param->shape.c == p->ofmap->shape.c); + CHECK(status, p->chl_quan_param->shape.h == 1); + CHECK(status, p->chl_quan_param->shape.w == 1); + reg.opt_chl_quan = 1; + reg.opt_res_shift = 0; // useless + reg.opd2_addr = p->chl_quan_param->start_address; + reg.opd2_n = p->chl_quan_param->shape.n; + reg.opd2_c = p->chl_quan_param->shape.c; + reg.opd2_h = p->chl_quan_param->shape.h; + reg.opd2_w = p->chl_quan_param->shape.w; + reg.opt_opd2_seg = 1; // useless, force to 1 to skip b_stride check + reg.short_opd2_str = 2; // useless + reg.opd2_b_str = 0; // useless + + if (p->has_bias) { + reg.tsk_opd_num = 3; + reg.opt_opd2_sign = 1; + } + + reg.layer_info = p->layer_id; + reg.cmd_pre_exe_typ = p->cmd_pre_exe_typ; + reg.cmd_pre_exe = p->cmd_pre_exe; + + if (status) { + printf("cvkcv180x tiu_dw_conv: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv180x/tiu_ge.c b/cvikernel/src/cv180x/tiu_ge.c new file mode 100644 index 000000000..d041daeb0 --- /dev/null +++ b/cvikernel/src/cv180x/tiu_ge.c @@ -0,0 +1,123 @@ +#include "cvkcv180x.h" + +#if 0 +void cvkcv180x_tiu_ge( + cvk_context_t *ctx, + const cvk_tiu_ge_param_t *p) +{ + int8_t status = 0; + status |= check_tiu_tensor_2(p->ge, p->a); + status |= check_same_shape(p->ge, p->a); + if (p->b_is_const) { + if (tensor_is_signed(p->a)) + CHECK(status, p->b_const.is_signed); + else + CHECK(status, !p->b_const.is_signed); + } else { + status |= check_tiu_tensor(p->b); + status |= check_same_shape(p->ge, p->b); + CHECK(status, p->a->fmt == p->b->fmt); + } + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_GE_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = p->b_const.val; + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->ge->start_address; + reg.opt_res0_sign = tensor_is_signed(p->ge); + fill_res0_stride(®, &p->ge->stride); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv180x tiu ge: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} +#endif + +void cvkcv180x_tiu_ge( + cvk_context_t *ctx, + const cvk_tiu_ge_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->a->fmt == CVK_FMT_BF16) ? 1 : 0; + + status |= check_tiu_tensor_2(p->ge, p->a); + status |= check_same_shape(p->ge, p->a); + + if (p->b_is_const && !bf16_enable) { + if (tensor_is_signed(p->a)) + CHECK(status, p->b_const.is_signed); + else + CHECK(status, !p->b_const.is_signed); + } else if (!p->b_is_const) { + status |= check_tiu_tensor(p->b); + status |= check_same_shape(p->ge, p->b); + CHECK(status, p->a->fmt == p->b->fmt); + } + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_GE_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 
1: 0; + reg.opt_res_shift = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = bf16_enable ? p->b_const.val : (p->b_const.val & 0xFF); + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->ge->start_address; + reg.opt_res0_sign = tensor_is_signed(p->ge); + fill_res0_stride(®, &p->ge->stride); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv180x tiu ge: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv180x/tiu_lookup_table.c b/cvikernel/src/cv180x/tiu_lookup_table.c new file mode 100644 index 000000000..f52270969 --- /dev/null +++ b/cvikernel/src/cv180x/tiu_lookup_table.c @@ -0,0 +1,118 @@ +#include "cvkcv180x.h" + +void cvkcv180x_tiu_lookup_table( + cvk_context_t *ctx, + const cvk_tiu_lookup_table_param_t *p) +{ + int8_t status = 0; + uint32_t eu_num = ctx->info.eu_num; + uint32_t npu_num = ctx->info.npu_num; + + status |= check_tiu_tensor_3(p->ofmap, p->ifmap, p->table); + status |= check_stride_type_0(ctx, p->ofmap); + status |= check_stride_type_0(ctx, p->ifmap); + status |= check_stride_type_0(ctx, p->table); + + uint8_t is_bf16 = (p->ofmap->fmt == CVK_FMT_BF16 && p->ifmap->fmt == CVK_FMT_BF16); + + CHECK(status, p->table->shape.n == 1); + CHECK(status, p->table->shape.c == npu_num); + + if (is_bf16) { + CHECK(status, p->table->shape.h == 32); + CHECK(status, p->table->shape.w == 8); + } + else { + CHECK(status, p->table->shape.h == 16); + CHECK(status, p->table->shape.w == 16); + } + + CHECK(status, p->ifmap->start_address % eu_num == 0); + CHECK(status, p->ofmap->start_address % eu_num == 0); + CHECK(status, p->table->start_address % eu_num == 0); + + // fmt MUST be same under bf16 + if (p->ofmap->fmt == CVK_FMT_BF16) { + CHECK(status, p->ifmap->fmt == CVK_FMT_BF16); + } + CHECK(status, p->ofmap->fmt == CVK_FMT_I8 || p->ofmap->fmt == CVK_FMT_U8 || p->ofmap->fmt == CVK_FMT_BF16); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + //reg.tens_lookup = 1; + reg.tsk_opd_num = 2; + reg.opt_shift_typ = 0; + reg.opt_res_shift = 0; + reg.opt_relu_typ = 0; + reg.opd_typ = is_bf16; + + reg.res0_addr = p->ofmap->start_address; + if (is_bf16) { + reg.opt_res0_sign = 1; + reg.opt_res0_seg = 1; + } + else { + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 1; + } + + // ifmap->shape.n == p->ofmap->shape.n); + CHECK(status, p->ifmap->shape.c == p->ofmap->shape.c); + CHECK(status, p->ifmap->shape.h == p->ofmap->shape.h); + CHECK(status, p->ifmap->shape.w == p->ofmap->shape.w); + + reg.res0_n = p->ifmap->shape.n; + reg.res0_c = p->ifmap->shape.c; + reg.res0_h = p->ifmap->shape.h; + reg.res0_w = p->ifmap->shape.w; + reg.short_res0_str = 0; + + reg.opd0_addr = p->ifmap->start_address; + if (is_bf16) { + reg.opt_opd0_sign = 1; + reg.opt_opd0_seg = 1; + } + else { + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 1; + } + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.short_opd0_str = 0; + + reg.opd1_addr = p->table->start_address; + if (is_bf16) { + reg.opt_opd1_sign = 1; 
+    reg.opt_opd1_seg = 1;
+  }
+  else {
+    reg.opt_opd1_sign = 0;
+    reg.opt_opd1_seg = 1;
+  }
+  reg.opd1_n = p->table->shape.n;
+  reg.opd1_c = p->table->shape.c;
+  reg.opd1_h = p->table->shape.h;
+  reg.opd1_w = p->table->shape.w;
+  reg.short_opd1_str = 0;
+  reg.tsk_eu_typ = 12; // 12 means lut
+  if (is_bf16) {
+    reg.opt_opd2_seg = 1; // hw check
+    // don't care once short_xxx_str set to 0
+  }
+
+  reg.layer_info = p->layer_id;
+
+  if (status) {
+    printf("cvkcv180x tiu lookup: wrong parameter\n");
+    return;
+  }
+
+  //trace_tiu_reg(&reg, __FUNCTION__);
+
+  (void *)emit_tiu_cmdbuf(ctx, &reg);
+}
diff --git a/cvikernel/src/cv180x/tiu_mac.c b/cvikernel/src/cv180x/tiu_mac.c
new file mode 100644
index 000000000..71f41d605
--- /dev/null
+++ b/cvikernel/src/cv180x/tiu_mac.c
@@ -0,0 +1,73 @@
+#include "cvkcv180x.h"
+
+void cvkcv180x_tiu_mac(
+    cvk_context_t *ctx,
+    const cvk_tiu_mac_param_t *p)
+{
+  int8_t status = 0;
+  int bf16_enable = (p->a->fmt == CVK_FMT_BF16) ? 1 : 0;
+
+  status |= check_tiu_tensor(p->a);
+  status |= check_same_shape(p->res_low, p->a);
+  if(!bf16_enable) {
+    status |= check_16bit_tiu_tensor(p->res_low, p->res_high);
+    CHECK(status, p->lshift_bits < 32);
+    CHECK(status, p->rshift_bits < 16);
+  }
+  if (!p->b_is_const) {
+    status |= check_tiu_tensor(p->b);
+    status |= check_same_shape(p->res_low, p->b);
+  }
+  CHECK(status, p->relu_enable == 0 || p->relu_enable == 1);
+
+  tiu_reg_t reg;
+  reset_tiu_reg(&reg);
+
+  reg.cmd_en = 1;
+  reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B;
+  reg.tsk_eu_typ = TENSOR_MAC_FIX8B;
+  reg.opt_res_add = 1;
+  reg.tsk_opd_num = 2;
+  reg.opd_typ = bf16_enable ? 1: 0;
+  reg.opt_left_shift = p->lshift_bits;
+  reg.opt_relu_typ = p->relu_enable;
+  fill_same_tensor_shape(&reg, p->a->shape);
+  fill_same_tensor_stride_type(&reg, 0b11);
+
+  int arith_shift = tensor_is_signed(p->res_low);
+  reg.opt_shift_typ = arith_shift;
+  reg.opt_res_shift = p->rshift_bits;
+
+  reg.opd0_addr = p->a->start_address;
+  reg.opt_opd0_sign = tensor_is_signed(p->a);
+  fill_opd0_stride(&reg, &p->a->stride);
+
+  if (p->b_is_const) {
+    reg.opt_opd1_const = 1;
+    reg.opd1_addr = bf16_enable ? p->b_const.val : (p->b_const.val & 0xFF);
+    reg.opt_opd1_sign = !!p->b_const.is_signed;
+  } else {
+    reg.opt_opd1_const = 0;
+    reg.opd1_addr = p->b->start_address;
+    reg.opt_opd1_sign = tensor_is_signed(p->b);
+    fill_opd1_stride(&reg, &p->b->stride);
+  }
+
+  reg.res0_addr = p->res_low->start_address;
+  reg.opt_res0_sign = tensor_is_signed(p->res_low);
+  reg.opt_res0_seg = bf16_enable ? 1 : !!p->res_is_int8;
+  fill_res0_stride(&reg, &p->res_low->stride);
+  reg.res0_b_str = bf16_enable ? 0 : (p->res_high->start_address - p->res_low->start_address);
+
+  if (p->relu_enable)
+    CHECK(status, reg.opt_res0_seg);
+
+  reg.layer_info = p->layer_id;
+
+  if (status) {
+    printf("cvkcv180x tiu mac: wrong parameter\n");
+    return;
+  }
+
+  (void *)emit_tiu_cmdbuf(ctx, &reg);
+}
diff --git a/cvikernel/src/cv180x/tiu_matrix_multiplication.c b/cvikernel/src/cv180x/tiu_matrix_multiplication.c
new file mode 100644
index 000000000..cd25fe762
--- /dev/null
+++ b/cvikernel/src/cv180x/tiu_matrix_multiplication.c
@@ -0,0 +1,160 @@
+#include "cvkcv180x.h"
+#include "assert.h"
+
+static int8_t check_matrix(cvk_context_t *ctx, const cvk_ml_t *m)
+{
+  int8_t status = 0;
+  cvk_tl_t t;
+  t.start_address = m->start_address;
+  t.fmt = m->fmt;
+  t.shape.n = m->shape.n;
+  t.shape.c = m->shape.c;
+  t.shape.h = 1;
+  t.shape.w = m->shape.w;
+  t.stride.n = m->stride.n;
+  t.stride.c = m->stride.c;
+  t.stride.h = m->stride.h;
+  t.stride.w = 1 * (m->fmt == CVK_FMT_BF16 ?
2 : 1); + + status |= check_tiu_tensor(&t); + status |= check_stride_type_0(ctx, &t); + + uint32_t eu_num = ctx->info.eu_num; + CHECK(status, m->start_address % eu_num == 0); + + return status; +} + +static int is_arith_shift(const cvk_tiu_matrix_multiplication_param_t *p) +{ + if (p->left->fmt == CVK_FMT_I8) + return 1; + if (p->right->fmt == CVK_FMT_I8) + return 1; + if (p->bias && p->bias->fmt == CVK_FMT_I8) + return 1; + + return 0; +} + +void cvkcv180x_tiu_matrix_multiplication(cvk_context_t *ctx, const cvk_tiu_matrix_multiplication_param_t *p) +{ + int8_t status = 0; + const cvk_ml_t *res = p->res; + const cvk_ml_t *left = p->left; + const cvk_ml_t *right = p->right; + const cvk_ml_t *bias = p->bias; + int bf16_enable = (res->fmt == CVK_FMT_BF16) ? 1 : 0; + + status |= check_matrix(ctx, res); + status |= check_matrix(ctx, left); + status |= check_matrix(ctx, right); + if (bias) + status |= check_matrix(ctx, bias); + + CHECK(status, p->lshift_bits < 32); + if (bf16_enable) /* bf16 does not support add_result*/ + CHECK(status, !p->add_result); + else + CHECK(status, !(p->relu_enable && p->add_result)); + + if(p->ps32_mode & 0x2) + { + CHECK(status, !p->relu_enable); + CHECK(status, !p->bias); + CHECK(status, !p->rshift_bits); + } + CHECK(status, p->relu_enable == 0 || p->relu_enable == 1); + + uint32_t left_row = left->shape.n; + uint32_t left_col = left->shape.col; + uint32_t right_row = right->shape.n; + uint32_t right_col = right->shape.col; + uint32_t res_row = res->shape.n; + uint32_t res_col = res->shape.col; + CHECK(status, left_col == right_row); + CHECK(status, res_col == right_col); + + if(p->ps32_mode) + { + CHECK(status, !p->add_result); + } else if ((p->add_result || !p->res_is_int8) && !bf16_enable) { + CHECK(status, res_row == left_row * 2); + res_row = left_row; + } else { + CHECK(status, res_row == left_row); + } + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_FC_FIX8B; + reg.tsk_opd_num = bias? 3: 2; + reg.opd_typ = bf16_enable ? 1 : 0; + reg.opt_shift_typ = is_arith_shift(p); + reg.opt_res_shift = p->rshift_bits; + reg.opt_left_shift = p->lshift_bits; + reg.opt_relu_typ = p->relu_enable; + reg.opt_res_add = p->add_result; + + reg.res0_addr = res->start_address; + reg.opt_res0_seg = (bf16_enable ? 
1 : p->res_is_int8); + + reg.opt_res0_sign = matrix_is_signed(res); + reg.res0_n = res_row; + reg.res0_c = res->shape.c; + reg.res0_h = 1; + reg.res0_w = res->shape.w; + reg.short_res0_str = 0; // stride, b_stride calculated by H/W + + reg.opd0_addr = left->start_address; + reg.opt_opd0_seg = 1; + reg.opt_opd0_sign = (left->fmt == CVK_FMT_I8); + reg.opd0_n = left_row; + reg.opd0_c = left->shape.c; + reg.opd0_h = 1; + reg.opd0_w = left->shape.w; + reg.short_opd0_str = 0; + + reg.opd1_addr = right->start_address; + reg.opt_opd1_seg = 1; + reg.opt_opd1_sign = (right->fmt == CVK_FMT_I8); + reg.opd1_n = right_row; + reg.opd1_c = right->shape.c; + reg.opd1_h = 1; + reg.opd1_w = left_col - left->shape.w * (left->shape.c - 1); + reg.short_opd1_str = 0; + + reg.ps32_md = p->ps32_mode; + if (p->ps32_mode > 0) + reg.res0_b_str = p->res->shape.n * p->res->stride.n; + if(reg.opd0_c == 1) + CHECK(status, reg.opd0_w == reg.opd1_w); + + if (bias) { + CHECK(status, bias->shape.n == 2); + CHECK(status, bias->shape.c == right->shape.c); + CHECK(status, bias->shape.w == right->shape.w); + CHECK(status, bias->shape.col == right->shape.col); + + reg.opd2_addr = bias->start_address; + reg.opt_opd2_seg = 0; + reg.opt_opd2_sign = (bias->fmt == CVK_FMT_I8); + reg.opd2_n = 1; + reg.opd2_c = bias->shape.c; + reg.opd2_h = 1; + reg.opd2_w = bias->shape.w; + reg.short_opd2_str = 0; + } + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv180x tiu matrix: wrong parameter"); + assert(0); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv180x/tiu_matrix_multiplication_qm.c b/cvikernel/src/cv180x/tiu_matrix_multiplication_qm.c new file mode 100644 index 000000000..5be935b51 --- /dev/null +++ b/cvikernel/src/cv180x/tiu_matrix_multiplication_qm.c @@ -0,0 +1,153 @@ +#include "cvkcv180x.h" + +static int8_t check_matrix(cvk_context_t *ctx, const cvk_ml_t *m) +{ + int8_t status = 0; + cvk_tl_t t; + t.start_address = m->start_address; + t.fmt = m->fmt; + t.shape.n = m->shape.n; + t.shape.c = m->shape.c; + t.shape.h = 1; + t.shape.w = m->shape.w; + t.stride.n = m->stride.n; + t.stride.c = m->stride.c; + t.stride.h = m->stride.h; + t.stride.w = 1; + + status |= check_tiu_tensor(&t); + status |= check_stride_type_0(ctx, &t); + + uint32_t eu_num = ctx->info.eu_num; + CHECK(status, m->start_address % eu_num == 0); + + return status; +} + +static int is_arith_shift(const cvk_tiu_matrix_multiplication_qm_param_t *p) +{ + if (p->left->fmt == CVK_FMT_I8) + return 1; + if (p->right->fmt == CVK_FMT_I8) + return 1; + if (p->bias && p->bias->fmt == CVK_FMT_I8) + return 1; + + return 0; +} + +void cvkcv180x_tiu_matrix_multiplication_qm(cvk_context_t *ctx, const cvk_tiu_matrix_multiplication_qm_param_t *p) +{ + int8_t status = 0; + const cvk_ml_t *res = p->res; + const cvk_ml_t *left = p->left; + const cvk_ml_t *right = p->right; + const cvk_ml_t *bias = p->bias; + + status |= check_matrix(ctx, res); + status |= check_matrix(ctx, left); + status |= check_matrix(ctx, right); + if (bias) + status |= check_matrix(ctx, bias); + + CHECK(status, p->lshift_bits < 32); + CHECK(status, !(p->relu_enable && p->add_result)); + if(p->ps32_mode & 0x2) + { + CHECK(status, !p->relu_enable); + CHECK(status, !p->bias); + CHECK(status, !p->rshift_bits); + } + CHECK(status, p->relu_enable == 0 || p->relu_enable == 1); + + uint32_t left_row = left->shape.n; + uint32_t left_col = left->shape.col; + uint32_t right_row = right->shape.n; + uint32_t right_col = right->shape.col; + uint32_t res_row = res->shape.n; + uint32_t 
res_col = res->shape.col; + CHECK(status, left_col == right_row); + CHECK(status, res_col == right_col); + CHECK(status, p->res_is_int8 == 1); + + if(p->ps32_mode) + { + CHECK(status, !p->add_result); + } + else if (p->add_result) { + CHECK(status, res_row == left_row * 2); + res_row = left_row; + } else { + CHECK(status, res_row == left_row); + } + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_FC_FIX8B; + reg.tsk_opd_num = bias? 3: 2; + reg.opt_shift_typ = is_arith_shift(p); + reg.opt_res_shift = p->rshift_bits; + reg.opt_left_shift = p->lshift_bits; + reg.opt_relu_typ = p->relu_enable; + reg.opt_res_add = p->add_result; + + reg.res0_addr = res->start_address; + reg.opt_res0_seg = 1; + reg.opt_res0_sign = matrix_is_signed(res); + reg.res0_n = res_row; + reg.res0_c = res->shape.c; + reg.res0_h = 1; + reg.res0_w = res->shape.w; + reg.short_res0_str = 0; // stride, b_stride calculated by H/W + + reg.opd0_addr = left->start_address; + reg.opt_opd0_seg = 1; + reg.opt_opd0_sign = (left->fmt == CVK_FMT_I8); + reg.opd0_n = left_row; + reg.opd0_c = left->shape.c; + reg.opd0_h = 1; + reg.opd0_w = left->shape.w; + reg.short_opd0_str = 0; + + reg.opd1_addr = right->start_address; + reg.opt_opd1_seg = 1; + reg.opt_opd1_sign = (right->fmt == CVK_FMT_I8); + reg.opd1_n = right_row; + reg.opd1_c = right->shape.c; + reg.opd1_h = 1; + reg.opd1_w = left_col - left->shape.w * (left->shape.c - 1); + reg.short_opd1_str = 0; + + reg.ps32_md = p->ps32_mode; + if (p->ps32_mode > 0) + reg.res0_b_str = p->res->shape.n * p->res->stride.n; + if(reg.opd0_c == 1) + CHECK(status, reg.opd0_w == reg.opd1_w); + + // Only enable 32-bit multiplier at the final post processing stage + reg.opt_chl_quan = ((p->ps32_mode == 0) || (p->ps32_mode == 1)) ? 1 : 0; + reg.quan_m = p->quan_m; + + // 32b bias, determined by b_stride + if (bias) { + CHECK(status, bias->shape.n == 4); + CHECK(status, bias->shape.c == right->shape.c); + CHECK(status, bias->shape.w == right->shape.w); + CHECK(status, bias->shape.col == right->shape.col); + + reg.opd2_addr = bias->start_address; + reg.opt_opd2_seg = 0; + reg.opt_opd2_sign = (bias->fmt == CVK_FMT_I8); + reg.opd2_n = 1; + reg.opd2_c = bias->shape.c; + reg.opd2_h = 1; + reg.opd2_w = bias->shape.w; + reg.short_opd2_str = 0; + } + + reg.layer_info = p->layer_id; + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv180x/tiu_max.c b/cvikernel/src/cv180x/tiu_max.c new file mode 100644 index 000000000..c1ad211ba --- /dev/null +++ b/cvikernel/src/cv180x/tiu_max.c @@ -0,0 +1,62 @@ +#include "cvkcv180x.h" + +void cvkcv180x_tiu_max( + cvk_context_t *ctx, + const cvk_tiu_max_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->a->fmt == CVK_FMT_BF16) ? 1 : 0; + + status |= check_tiu_tensor_2(p->max, p->a); + status |= check_same_shape(p->max, p->a); + + if (p->b_is_const && !bf16_enable) { + if (tensor_is_signed(p->a)) + CHECK(status, p->b_const.is_signed); + else + CHECK(status, !p->b_const.is_signed); + } else if (!p->b_is_const) { + status |= check_tiu_tensor(p->b); + status |= check_same_shape(p->max, p->b); + CHECK(status, p->a->fmt == p->b->fmt); + } + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_MAX_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 
1: 0; + reg.opt_res_shift = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = bf16_enable ? p->b_const.val : (p->b_const.val & 0xFF); + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->max->start_address; + reg.opt_res0_sign = tensor_is_signed(p->max); + fill_res0_stride(®, &p->max->stride); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv180x tiu max: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv180x/tiu_max_pooling.c b/cvikernel/src/cv180x/tiu_max_pooling.c new file mode 100644 index 000000000..ee3c92ba0 --- /dev/null +++ b/cvikernel/src/cv180x/tiu_max_pooling.c @@ -0,0 +1,74 @@ +#include "cvkcv180x.h" + +void cvkcv180x_tiu_max_pooling( + cvk_context_t *ctx, + const cvk_tiu_max_pooling_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->ifmap->fmt == CVK_FMT_BF16); + + status |= check_tiu_tensor_2(p->ifmap, p->ofmap); + CHECK(status, p->kh * p->kw >= 1); + CHECK(status, p->ifmap->shape.n == p->ofmap->shape.n); + CHECK(status, p->ifmap->shape.c == p->ofmap->shape.c); + CHECK(status, p->stride_h < 32 && p->stride_h > 0 && "stride_h should be in [1, 31] range"); + CHECK(status, p->stride_w < 32 && p->stride_w > 0 && "stride_w should be in [1, 31] range"); + if (bf16_enable) { + status |= check_bf16_stride_type_0(ctx, p->ifmap); + status |= check_bf16_stride_type_0(ctx, p->ofmap); + } else { + status |= check_stride_type_0(ctx, p->ifmap); + status |= check_stride_type_0(ctx, p->ofmap); + } + int opd0_sign = tensor_is_signed(p->ifmap); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B; + reg.tsk_eu_typ = 0; + reg.opt_relu_typ = 0; /* Hardware relu function not validated. */ + reg.opt_res_shift = 0; + reg.opt_shift_typ = opd0_sign; + reg.tsk_opd_num = 1; + reg.opd_typ = bf16_enable ? 1: 0; + + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = opd0_sign; + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + if (bf16_enable) { + reg.opd0_ins_val = p->ins_fp; + } else { + //reg.opd0_ins_val = bf16_enable ? 0 : (uint32_t)p->ins_val; + reg.opd0_ins_val = (!p->ins_val && opd0_sign) ? 
-128 : p->ins_val; // backend not set yet + } + + reg.opt_opd1_seg = 1; + reg.opd1_h = p->kh; + reg.opd1_w = p->kw; + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv180x tiu max pool: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv180x/tiu_min.c b/cvikernel/src/cv180x/tiu_min.c new file mode 100644 index 000000000..554220c59 --- /dev/null +++ b/cvikernel/src/cv180x/tiu_min.c @@ -0,0 +1,63 @@ +#include "cvkcv180x.h" + +void cvkcv180x_tiu_min( + cvk_context_t *ctx, + const cvk_tiu_min_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->a->fmt == CVK_FMT_BF16) ? 1 : 0; + + status |= check_tiu_tensor_2(p->min, p->a); + status |= check_same_shape(p->min, p->a); + if (p->b_is_const && !bf16_enable) { + if (tensor_is_signed(p->a)) + CHECK(status, p->b_const.is_signed); + else + CHECK(status, !p->b_const.is_signed); + } else if (!p->b_is_const) { + status |= check_tiu_tensor(p->b); + status |= check_same_shape(p->min, p->b); + CHECK(status, p->a->fmt == p->b->fmt); + } + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_MIN_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 1: 0; + reg.opt_res_shift = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = bf16_enable ? p->b_const.val : (p->b_const.val & 0xFF); + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->min->start_address; + reg.opt_res0_sign = tensor_is_signed(p->min); + fill_res0_stride(®, &p->min->stride); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv180x tiu min: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv180x/tiu_min_pooling.c b/cvikernel/src/cv180x/tiu_min_pooling.c new file mode 100644 index 000000000..5279e9dee --- /dev/null +++ b/cvikernel/src/cv180x/tiu_min_pooling.c @@ -0,0 +1,140 @@ +#include "cvkcv180x.h" + +#if 0 +void cvkcv180x_tiu_min_pooling( + cvk_context_t *ctx, + const cvk_tiu_min_pooling_param_t *p) +{ + int8_t status = 0; + status |= check_tiu_tensor_2(p->ifmap, p->ofmap); + CHECK(status, p->kh * p->kw > 1); + CHECK(status, p->ifmap->shape.n == p->ofmap->shape.n); + CHECK(status, p->ifmap->shape.c == p->ofmap->shape.c); + status |= check_stride_type_0(ctx, p->ifmap); + status |= check_stride_type_0(ctx, p->ofmap); + + int opd0_sign = tensor_is_signed(p->ifmap); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B; + reg.tsk_eu_typ = 3; + reg.opt_relu_typ = 0; /* Hardware relu function not validated. 
*/ + reg.opt_res_shift = 0; + reg.opt_shift_typ = opd0_sign; + reg.tsk_opd_num = 1; + + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = opd0_sign; + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + if (opd0_sign) + reg.opd0_ins_val = (uint16_t)127; + else + reg.opd0_ins_val = (uint16_t)255; + reg.opt_opd1_seg = 1; + reg.opd1_h = p->kh; + reg.opd1_w = p->kw; + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv180x tiu min pool: wrong parameter\n"); + return; + } + + (void *) emit_tiu_cmdbuf(ctx, ®); +} +#endif + +void cvkcv180x_tiu_min_pooling( + cvk_context_t *ctx, + const cvk_tiu_min_pooling_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->ifmap->fmt == CVK_FMT_BF16) ? 1 : 0; + + status |= check_tiu_tensor_2(p->ifmap, p->ofmap); + CHECK(status, p->kh * p->kw > 1); + CHECK(status, p->ifmap->shape.n == p->ofmap->shape.n); + CHECK(status, p->ifmap->shape.c == p->ofmap->shape.c); + if (bf16_enable) { + status |= check_bf16_stride_type_0(ctx, p->ifmap); + status |= check_bf16_stride_type_0(ctx, p->ofmap); + } else { + status |= check_stride_type_0(ctx, p->ifmap); + status |= check_stride_type_0(ctx, p->ofmap); + } + int opd0_sign = tensor_is_signed(p->ifmap); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B; + reg.tsk_eu_typ = 3; + reg.opt_relu_typ = 0; /* Hardware relu function not validated. */ + reg.opt_res_shift = 0; + reg.opt_shift_typ = opd0_sign; + reg.tsk_opd_num = 1; + reg.opd_typ = bf16_enable ? 
1: 0; + + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = opd0_sign; + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + + if (!bf16_enable) { + if (opd0_sign) + reg.opd0_ins_val = (uint16_t)127; + else + reg.opd0_ins_val = (uint16_t)255; + } else + reg.opd0_ins_val = p->ins_fp; + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + + reg.opt_opd1_seg = 1; + reg.opd1_h = p->kh; + reg.opd1_w = p->kw; + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv180x tiu min pool: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv180x/tiu_mul.c b/cvikernel/src/cv180x/tiu_mul.c new file mode 100644 index 000000000..3ed8912f3 --- /dev/null +++ b/cvikernel/src/cv180x/tiu_mul.c @@ -0,0 +1,72 @@ +#include "cvkcv180x.h" + +void cvkcv180x_tiu_mul( + cvk_context_t *ctx, + const cvk_tiu_mul_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->res_low->fmt == CVK_FMT_BF16) ? 1 : 0; + + status |= check_tiu_tensor_2(p->res_low, p->a); + status |= check_same_shape(p->res_low, p->a); + if (!p->b_is_const) { + status |= check_tiu_tensor(p->b); + status |= check_same_shape(p->res_low, p->b); + } + if (p->res_high) + status |= check_16bit_tiu_tensor(p->res_low, p->res_high); + CHECK(status, p->relu_enable == 0 || p->relu_enable == 1); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_MUL_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 1: 0; + int arith_shift = tensor_is_signed(p->res_low); + reg.opt_shift_typ = arith_shift; + reg.opt_res_shift = p->rshift_bits; + reg.opt_relu_typ = p->relu_enable; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = bf16_enable ? 
p->b_const.val : (p->b_const.val & 0xFF); + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->res_low->start_address; + reg.opt_res0_sign = tensor_is_signed(p->res_low); + reg.opt_res0_seg = (p->res_high == NULL); + fill_res0_stride(®, &p->res_low->stride); + if (p->res_high) + reg.res0_b_str = (p->res_high->start_address - p->res_low->start_address); + if (p->relu_enable) + CHECK(status, reg.opt_res0_seg); + + CHECK(status, ( + p->b_is_const || (!reg.opt_opd1_sign && !reg.opt_opd0_sign && !reg.opt_shift_typ) || + ((reg.opt_opd1_sign || reg.opt_opd0_sign) && reg.opt_shift_typ) + )); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv180x tiu mul: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv180x/tiu_mul_qm.c b/cvikernel/src/cv180x/tiu_mul_qm.c new file mode 100644 index 000000000..eb7b37add --- /dev/null +++ b/cvikernel/src/cv180x/tiu_mul_qm.c @@ -0,0 +1,71 @@ +#include "cvkcv180x.h" + +void cvkcv180x_tiu_mul_qm( + cvk_context_t *ctx, + const cvk_tiu_mul_qm_param_t *p) +{ + int8_t status = 0; + status |= check_tiu_tensor_2(p->res_low, p->a); + status |= check_same_shape(p->res_low, p->a); + if (!p->b_is_const) { + status |= check_tiu_tensor(p->b); + status |= check_same_shape(p->res_low, p->b); + } + if (p->res_high) + status |= check_16bit_tiu_tensor(p->res_low, p->res_high); + CHECK(status, p->relu_enable == 0 || p->relu_enable == 1); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_MUL_FIX8B; + reg.tsk_opd_num = 2; + int arith_shift = tensor_is_signed(p->res_low); + reg.opt_shift_typ = arith_shift; + reg.opt_res_shift = p->rshift_bits; + reg.opt_relu_typ = p->relu_enable; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = p->b_const.val; + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->res_low->start_address; + reg.opt_res0_sign = tensor_is_signed(p->res_low); + reg.opt_res0_seg = (p->res_high == NULL); + fill_res0_stride(®, &p->res_low->stride); + if (p->res_high) + reg.res0_b_str = p->res_high->start_address - p->res_low->start_address; + if (p->relu_enable) + CHECK(status, reg.opt_res0_seg); + + CHECK(status, ( + (!reg.opt_opd1_sign && !reg.opt_opd0_sign && !reg.opt_shift_typ) || + ((reg.opt_opd1_sign || reg.opt_opd0_sign) && reg.opt_shift_typ) + )); + + reg.opt_chl_quan = 1; + reg.quan_m = p->multiplier; + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv180x tiu mul qm: wrong parameter\n"); + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv180x/tiu_or.c b/cvikernel/src/cv180x/tiu_or.c new file mode 100644 index 000000000..f9ba53a77 --- /dev/null +++ b/cvikernel/src/cv180x/tiu_or.c @@ -0,0 +1,112 @@ +#include "cvkcv180x.h" + +void cvkcv180x_tiu_or_int8( + cvk_context_t *ctx, + const cvk_tiu_or_int8_param_t *p) +{ + int8_t status = 0; + + status |= check_tiu_tensor_3(p->res, p->a, p->b); + status |= 
check_same_shape_3(p->res, p->a, p->b); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_OR_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 1; + fill_opd0_stride(®, &p->a->stride); + + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 1; + fill_opd1_stride(®, &p->b->stride); + + reg.res0_addr = p->res->start_address; + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 1; + fill_res0_stride(®, &p->res->stride); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv180x tiu or: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} + +void cvkcv180x_tiu_or_int16( + cvk_context_t *ctx, + const cvk_tiu_or_int16_param_t *p) +{ + int8_t status = 0; + status |= check_16bit_tiu_tensor(p->a_low, p->a_high); + status |= check_16bit_tiu_tensor(p->b_low, p->b_high); + status |= check_16bit_tiu_tensor(p->res_low, p->res_high); + status |= check_same_shape_3(p->res_low, p->a_low, p->b_low); + + int res_high_addr = p->res_high->start_address; + int res_low_addr = p->res_low->start_address; + CHECK(status, res_high_addr > res_low_addr); + int res_b_stride = res_high_addr - res_low_addr; + + int a_high_addr = p->a_high->start_address; + int a_low_addr = p->a_low->start_address; + CHECK(status, a_high_addr > a_low_addr); + int a_b_stride = a_high_addr - a_low_addr; + + int b_high_addr = p->b_high->start_address; + int b_low_addr = p->b_low->start_address; + CHECK(status, b_high_addr > b_low_addr); + int b_b_stride = b_high_addr - b_low_addr; + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_OR_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = a_low_addr; + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 0; + reg.opd0_b_str = a_b_stride; + fill_opd0_stride(®, &p->a_low->stride); + + reg.opd1_addr = b_low_addr; + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 0; + reg.opd1_b_str = b_b_stride; + fill_opd1_stride(®, &p->b_low->stride); + + reg.res0_addr = res_low_addr; + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 0; + reg.res0_b_str = res_b_stride; + fill_res0_stride(®, &p->res_low->stride); + + if (status) { + printf("cvkcv180x tiu or: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv180x/tiu_pt_convolution.c b/cvikernel/src/cv180x/tiu_pt_convolution.c new file mode 100644 index 000000000..1b630d321 --- /dev/null +++ b/cvikernel/src/cv180x/tiu_pt_convolution.c @@ -0,0 +1,183 @@ +#include "cvkcv180x.h" + +static int can_do_double_conv(cvk_context_t *ctx, const cvk_tiu_pt_convolution_param_t *p) +{ + uint8_t bf16_enable = (p->ifmap->fmt == CVK_FMT_BF16) ? 
1 : 0; + + if (((p->ifmap->start_address % ctx->info.lmem_size) % 2 == 0 && + p->ifmap->shape.c % 2 == 0 && + p->ifmap->shape.c >= 4 && + p->weight->start_address % 2 == 0) && !bf16_enable) + return 1; + + return 0; +} + +static int8_t check_conv_param(cvk_context_t *ctx, const cvk_tiu_pt_convolution_param_t *p) +{ + int8_t status = 0; + uint32_t eu_num = ctx->info.eu_num; + uint8_t bf16_enable = (p->ifmap->fmt == CVK_FMT_BF16) ? 1 : 0; + + status |= check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight); + if (bf16_enable) { + status |= check_bf16_stride_type_0(ctx, p->ifmap); + } else { + status |= check_stride_type_0(ctx, p->ifmap); + } + //assert_stride_type_1(ctx, p->weight); + if (p->bias) { + status |= check_tiu_tensor(p->bias); + if (bf16_enable) + status |= check_bf16_stride_type_2(ctx, p->bias); + else + status |= check_stride_type_2(ctx, p->bias); + } + + // n stride must align 16B + CHECK(status, (p->ofmap->stride.n % 16) == 0); + + CHECK(status, p->ifmap->start_address % eu_num == 0); + CHECK(status, p->ofmap->start_address % eu_num == 0); + CHECK(status, p->ifmap->shape.n == p->ofmap->shape.n); + CHECK(status, !(p->ifmap->shape.h == 1 && p->ins_h > 0)); + CHECK(status, p->weight->shape.n == p->ifmap->shape.c); + CHECK(status, p->weight->shape.c == p->ofmap->shape.c); + if (can_do_double_conv(ctx, p)) { + uint32_t lmem_i = p->ifmap->start_address % ctx->info.lmem_size; + CHECK(status, lmem_i % 2 == 0); + CHECK(status, p->ifmap->shape.c % 2 == 0); + CHECK(status, p->ifmap->shape.c >= 4); /* Otherwise performance will suffer */ + CHECK(status, p->weight->start_address % 2 == 0); + } + if(p->ps32_mode & 0x2) + { + CHECK(status, !p->relu_enable); + CHECK(status, !p->bias); + CHECK(status, !p->rshift_bits); + + CHECK(status, p->cmd_pre_exe <= 1); + } + CHECK(status, p->stride_h < 32 && p->stride_h > 0); + CHECK(status, p->stride_w < 32 && p->stride_w > 0); + CHECK(status, p->pad_top < 16); + CHECK(status, p->pad_bottom < 16); + CHECK(status, p->pad_left < 16); + CHECK(status, p->pad_right < 16); + CHECK(status, p->ins_h < 15); + CHECK(status, p->ins_last_h < 15); + CHECK(status, p->ins_w < 15); + CHECK(status, p->ins_last_w < 15); + CHECK(status, p->dilation_h >= 1); + CHECK(status, p->dilation_w >= 1); + CHECK(status, p->relu_enable == 0 || p->relu_enable == 1); + + return status; +} + +void cvkcv180x_tiu_pt_convolution(cvk_context_t *ctx, const cvk_tiu_pt_convolution_param_t *p) +{ + int8_t status = 0; + + status |= check_conv_param(ctx, p); + + uint32_t npu_num = ctx->info.npu_num; + int opd0_sign = tensor_is_signed(p->ifmap); + int opd1_sign = tensor_is_signed(p->weight); + int opd2_sign = p->bias? tensor_is_signed(p->bias): 1; + int arith_shift = opd0_sign || opd1_sign || opd2_sign; + int bf16_enable = (p->ifmap->fmt == CVK_FMT_BF16) ? 
1 : 0; + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_CONV_FIX8B; + reg.opt_shift_typ = arith_shift; + reg.opt_res_shift = p->rshift_bits; + reg.opt_relu_typ = !!(p->relu_enable); + reg.tsk_opd_num = 2; + + reg.opd_typ = bf16_enable; + + /*always automatically enabel double conv at those situations*/ + if (can_do_double_conv(ctx, p)) + reg.double_conv = 1; + + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = tensor_is_signed(p->ofmap); + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + reg.res0_n_str = p->ofmap->stride.n; + reg.res0_c_str = p->ofmap->stride.c; + reg.res0_h_str = p->ofmap->stride.h; + reg.res0_w_str = p->ofmap->stride.w; + reg.short_res0_str = 3; // Manual instead of h/w + reg.ps32_md = p->ps32_mode; + if (p->ps32_mode > 0) + reg.res0_b_str = p->ofmap->shape.n * p->ofmap->stride.n; + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.short_opd0_str = 0; + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + reg.conv_opd0_x_ins0 = p->ins_w; + reg.conv_opd0_y_ins0 = p->ins_h; + reg.conv_opd0_x_ins0_last = p->ins_last_w; + reg.conv_opd0_y_ins0_last = p->ins_last_h; + + reg.opd1_addr = p->weight->start_address; + reg.opt_opd1_sign = opd1_sign; + reg.opt_opd1_seg = 1; + reg.opt_opd1_const = p->w_is_const; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + reg.short_opd1_str = 1; + reg.conv_opd1_x_ins0 = p->dilation_w - 1; + reg.conv_opd1_y_ins0 = p->dilation_h - 1; + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + reg.opd0_ins_val = bf16_enable ? + (uint32_t)p->ins_fp : (uint32_t)p->ins_val; + + if (p->bias) { + CHECK(status, p->bias->shape.n == 2); + CHECK(status, p->bias->shape.c == p->ofmap->shape.c); + CHECK(status, p->bias->shape.h == 1); + CHECK(status, p->bias->shape.w == 1); + + reg.tsk_opd_num = 3; + reg.opt_opd2_sign = opd2_sign; + reg.opt_opd2_seg = 0; + reg.opd2_addr = p->bias->start_address; + reg.opd2_n = 1; + reg.opd2_c = p->bias->shape.c; + reg.opd2_h = 1; + reg.opd2_w = 1; + reg.short_opd2_str = 2; + reg.opd2_b_str = ceiling_func(p->bias->shape.c, npu_num) * (bf16_enable ? 2 : 1); + } + + reg.cmd_pre_exe_typ = p->cmd_pre_exe_typ; + reg.cmd_pre_exe = p->cmd_pre_exe; + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv180x tiu_pt_conv: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv180x/tiu_pt_depthwise_convolution.c b/cvikernel/src/cv180x/tiu_pt_depthwise_convolution.c new file mode 100644 index 000000000..3860f4a9c --- /dev/null +++ b/cvikernel/src/cv180x/tiu_pt_depthwise_convolution.c @@ -0,0 +1,158 @@ +#include "cvkcv180x.h" + +void cvkcv180x_tiu_pt_depthwise_convolution( + cvk_context_t *ctx, + const cvk_tiu_depthwise_pt_convolution_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->ifmap->fmt == CVK_FMT_BF16) ? 1 : 0; + + int8_t isMulConst = (p->weight_is_const == 1) ? 
1 : 0; + + if(isMulConst) { + status |= check_tiu_tensor_2(p->ifmap, p->ofmap); + } else { + status |= check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight); + } + if (bf16_enable) { + status |= check_bf16_stride_type_0(ctx, p->ifmap); + if(!isMulConst) + status |= check_bf16_stride_type_0(ctx, p->weight); + if (p->bias) { + status |= check_tiu_tensor(p->bias); + status |= check_bf16_stride_type_2(ctx, p->bias); + } + } else { + status |= check_stride_type_0(ctx, p->ifmap); + if(!isMulConst) + status |= check_stride_type_0(ctx, p->weight); + if (p->bias) { + status |= check_tiu_tensor(p->bias); + status |= check_stride_type_2(ctx, p->bias); + } + } + + // n stride must align 16B + CHECK(status, (p->ofmap->stride.n % 16) == 0); + + CHECK(status, p->ifmap->shape.n == p->ofmap->shape.n); + CHECK(status, p->ifmap->shape.c == p->ofmap->shape.c); + if (!isMulConst){ + CHECK(status, p->ifmap->shape.c == p->weight->shape.c); + CHECK(status, p->weight->shape.n == 1); + } + CHECK(status, p->relu_enable == 0 || p->relu_enable == 1); + CHECK(status, p->stride_h < 32 && p->stride_h > 0); + CHECK(status, p->stride_w < 32 && p->stride_w > 0); + CHECK(status, p->pad_top < 16); + CHECK(status, p->pad_bottom < 16); + CHECK(status, p->pad_left < 16); + CHECK(status, p->pad_right < 16); + CHECK(status, p->ins_h < 15); + CHECK(status, p->ins_last_h < 15); + CHECK(status, p->ins_w < 15); + CHECK(status, p->ins_last_w < 15); + CHECK(status, p->dilation_h >= 1); + CHECK(status, p->dilation_w >= 1); + + int opd0_sign = tensor_is_signed(p->ifmap); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B; + reg.tsk_eu_typ = 2; + reg.opt_relu_typ = p->relu_enable; + reg.opt_shift_typ = 1; + reg.opt_res_shift = p->rshift_bits; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 
1: 0; + + int res0_sign = tensor_is_signed(p->ofmap); + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = res0_sign; + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + reg.res0_n_str = p->ofmap->stride.n; + reg.res0_c_str = p->ofmap->stride.c; + reg.res0_h_str = p->ofmap->stride.h; + reg.res0_w_str = p->ofmap->stride.w; + reg.short_res0_str = 3; // Manual instead of h/w + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.opd0_n_str = p->ifmap->stride.n; + reg.opd0_c_str = p->ifmap->stride.c; + reg.opd0_h_str = p->ifmap->stride.h; + reg.opd0_w_str = p->ifmap->stride.w; + reg.short_opd0_str = 3; // Manual instead of h/w + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + reg.conv_opd0_x_ins0 = p->ins_w; + reg.conv_opd0_y_ins0 = p->ins_h; + reg.conv_opd0_x_ins0_last = p->ins_last_w; + reg.conv_opd0_y_ins0_last = p->ins_last_h; + + reg.opt_opd1_sign = 1; + reg.opt_opd1_seg = 1; + reg.conv_opd1_x_ins0 = p->dilation_w - 1; + reg.conv_opd1_y_ins0 = p->dilation_h - 1; + if (isMulConst) { + reg.opt_opd1_const = 1; + reg.opt_opd1_sign = p->weight_const.is_signed; + reg.opd1_addr = p->weight_const.val; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + } else { + reg.opd1_addr = p->weight->start_address; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + } + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + reg.opd0_ins_val = bf16_enable ? 
+ (uint32_t)p->ins_fp : (uint32_t)p->ins_val; + + if (p->bias) { + CHECK(status, p->bias->shape.n == 2); + CHECK(status, p->bias->shape.c == p->ofmap->shape.c); + CHECK(status, p->bias->shape.h == 1); + CHECK(status, p->bias->shape.w == 1); + + reg.tsk_opd_num = 3; + reg.opd2_addr = p->bias->start_address; + reg.opt_opd2_seg = 0; + reg.opd2_n = 1; + reg.opd2_c = p->bias->shape.c; + reg.opd2_h = 1; + reg.opd2_w = 1; + reg.short_opd2_str = 2; + reg.opd2_b_str = p->bias->stride.n; + } + + reg.layer_info = p->layer_id; + + reg.cmd_pre_exe_typ = p->cmd_pre_exe_typ; + reg.cmd_pre_exe = p->cmd_pre_exe; + + if (status) { + printf("cvkcv180x pt dw-conv: invalid param\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv180x/tiu_shift.c b/cvikernel/src/cv180x/tiu_shift.c new file mode 100644 index 000000000..85bc57df8 --- /dev/null +++ b/cvikernel/src/cv180x/tiu_shift.c @@ -0,0 +1,63 @@ +#include "cvkcv180x.h" + +void cvkcv180x_tiu_arith_shift( + cvk_context_t *ctx, + const cvk_tiu_arith_shift_param_t *p) +{ + int8_t status = 0; + status |= check_16bit_tiu_tensor(p->a_low, p->a_high); + status |= check_16bit_tiu_tensor(p->res_low, p->res_high); + status |= check_tiu_tensor(p->bits); + status |= check_same_shape_3(p->res_low, p->a_low, p->bits); + CHECK(status, tensor_is_signed(p->a_low)); + CHECK(status, tensor_is_signed(p->bits)); + + int res_high_addr = p->res_high->start_address; + int res_low_addr = p->res_low->start_address; + CHECK(status, res_high_addr > res_low_addr); + int res_b_stride = res_high_addr - res_low_addr; + + int a_high_addr = p->a_high->start_address; + int a_low_addr = p->a_low->start_address; + CHECK(status, a_high_addr > a_low_addr); + int a_b_stride = a_high_addr - a_low_addr; + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_SHIFT_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_rshift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = a_low_addr; + reg.opt_opd0_sign = 1; + reg.opt_opd0_seg = 0; + reg.opd0_b_str = a_b_stride; + fill_opd0_stride(®, &p->a_low->stride); + + reg.opd1_addr = p->bits->start_address; + reg.opt_opd1_sign = 1; + reg.opt_opd1_seg = 1; + fill_opd1_stride(®, &p->bits->stride); + + reg.res0_addr = res_low_addr; + reg.opt_res0_sign = 1; + reg.opt_res0_seg = 0; + reg.res0_b_str = res_b_stride; + fill_res0_stride(®, &p->res_low->stride); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv180x tiu shift: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv180x/tiu_sub.c b/cvikernel/src/cv180x/tiu_sub.c new file mode 100644 index 000000000..e0b01136d --- /dev/null +++ b/cvikernel/src/cv180x/tiu_sub.c @@ -0,0 +1,73 @@ +#include "cvkcv180x.h" + +void cvkcv180x_tiu_sub( + cvk_context_t *ctx, + const cvk_tiu_sub_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->a_low->fmt == CVK_FMT_BF16) ? 
1 : 0; + + if (bf16_enable) { + /*bf16 only support 16 bit*/ + CHECK(status, !p->a_high); + CHECK(status, !p->b_high); + CHECK(status, !p->res_high); + status |= check_tiu_tensor(p->a_low); + status |= check_tiu_tensor(p->b_low); + status |= check_tiu_tensor(p->res_low); + status |= check_same_shape_3(p->res_low, p->a_low, p->b_low); + } else { + status |= check_16bit_tiu_tensor(p->a_low, p->a_high); + status |= check_16bit_tiu_tensor(p->b_low, p->b_high); + status |= check_tiu_tensor(p->res_low); + status |= check_same_shape_3(p->res_low, p->a_low, p->b_low); + CHECK(status, tensor_is_signed(p->res_low)); + } + if (p->res_high) + status |= check_16bit_tiu_tensor(p->res_low, p->res_high); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_SUB_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 1: 0; + reg.opt_res_shift = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + int arith_shift = tensor_is_signed(p->res_low); + reg.opt_shift_typ = arith_shift; + reg.opt_res_shift = p->rshift_bits; + + reg.opd0_addr = p->a_low->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a_low); + reg.opt_opd0_seg = (p->a_high == NULL); + reg.opd0_b_str = bf16_enable ? 0 : (p->a_high->start_address - p->a_low->start_address); + fill_opd0_stride(®, &p->a_low->stride); + + reg.opd1_addr = p->b_low->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b_low);; + reg.opt_opd1_seg = (p->b_high == NULL); + reg.opd1_b_str = bf16_enable ? 0 : (p->b_high->start_address - p->b_low->start_address); + fill_opd1_stride(®, &p->b_low->stride); + + reg.res0_addr = p->res_low->start_address; + reg.opt_res0_sign = 1; + reg.opt_res0_seg = (p->res_high == NULL); + fill_res0_stride(®, &p->res_low->stride); + if (p->res_high) + reg.res0_b_str = bf16_enable ? 
0 : (p->res_high->start_address - p->res_low->start_address); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv180x tiu sub: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv180x/tiu_xor.c b/cvikernel/src/cv180x/tiu_xor.c new file mode 100644 index 000000000..77f946292 --- /dev/null +++ b/cvikernel/src/cv180x/tiu_xor.c @@ -0,0 +1,111 @@ +#include "cvkcv180x.h" + +void cvkcv180x_tiu_xor_int8( + cvk_context_t *ctx, + const cvk_tiu_xor_int8_param_t *p) +{ + int8_t status = 0; + status |= check_tiu_tensor_3(p->res, p->a, p->b); + status |= check_same_shape_3(p->res, p->a, p->b); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_XOR_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 1; + fill_opd0_stride(®, &p->a->stride); + + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 1; + fill_opd1_stride(®, &p->b->stride); + + reg.res0_addr = p->res->start_address; + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 1; + fill_res0_stride(®, &p->res->stride); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv180x tiu xor: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} + +void cvkcv180x_tiu_xor_int16( + cvk_context_t *ctx, + const cvk_tiu_xor_int16_param_t *p) +{ + int8_t status = 0; + status |= check_16bit_tiu_tensor(p->a_low, p->a_high); + status |= check_16bit_tiu_tensor(p->b_low, p->b_high); + status |= check_16bit_tiu_tensor(p->res_low, p->res_high); + status |= check_same_shape_3(p->res_low, p->a_low, p->b_low); + + int res_high_addr = p->res_high->start_address; + int res_low_addr = p->res_low->start_address; + CHECK(status, res_high_addr > res_low_addr); + int res_b_stride = res_high_addr - res_low_addr; + + int a_high_addr = p->a_high->start_address; + int a_low_addr = p->a_low->start_address; + CHECK(status, a_high_addr > a_low_addr); + int a_b_stride = a_high_addr - a_low_addr; + + int b_high_addr = p->b_high->start_address; + int b_low_addr = p->b_low->start_address; + CHECK(status, b_high_addr > b_low_addr); + int b_b_stride = b_high_addr - b_low_addr; + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_XOR_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = a_low_addr; + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 0; + reg.opd0_b_str = a_b_stride; + fill_opd0_stride(®, &p->a_low->stride); + + reg.opd1_addr = b_low_addr; + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 0; + reg.opd1_b_str = b_b_stride; + fill_opd1_stride(®, &p->b_low->stride); + + reg.res0_addr = res_low_addr; + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 0; + reg.res0_b_str = res_b_stride; + fill_res0_stride(®, &p->res_low->stride); + + if (status) { + printf("cvkcv180x tiu xor: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv181x/cvkcv181x.c b/cvikernel/src/cv181x/cvkcv181x.c new file mode 100644 index 000000000..153d6f7fc --- /dev/null +++ b/cvikernel/src/cv181x/cvkcv181x.c @@ -0,0 +1,885 @@ 
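+/*
+ * CV181x kernel context helpers: command-buffer/descriptor bookkeeping and
+ * local-memory (LMEM) tensor/matrix allocation.
+ *
+ * LMEM allocation in this file is a simple bump allocator:
+ * cvkcv181x_lmem_alloc_tensor() advances lmem_ptr by the eu_num-aligned
+ * tensor size, and cvkcv181x_lmem_free_tensor() rewinds lmem_ptr to the
+ * tensor's start address, so tensors must be released in reverse order of
+ * allocation.  A minimal usage sketch follows (illustrative only; it assumes
+ * a cvk_context_t whose priv_data was initialized by the registration code,
+ * and the shape values are chosen purely for the example):
+ *
+ *   cvk_tl_shape_t s = { .n = 1, .c = 32, .h = 28, .w = 28 };
+ *   cvk_tl_t *a = cvkcv181x_lmem_alloc_tensor(ctx, s, CVK_FMT_I8, 1);
+ *   cvk_tl_t *b = cvkcv181x_lmem_alloc_tensor(ctx, s, CVK_FMT_BF16, 1);
+ *   if (a && b) {
+ *     ... emit TIU/TDMA operations referencing a and b ...
+ *   }
+ *   cvkcv181x_lmem_free_tensor(ctx, b);  // LIFO: free b before a
+ *   cvkcv181x_lmem_free_tensor(ctx, a);
+ */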
+#include "cvkcv181x.h"
+#include <stdlib.h>  /* malloc/free */
+#include <string.h>  /* memset */
+
+static inline int bitsize_of_fmt(cvk_fmt_t fmt)
+{
+  switch (fmt) {
+  case CVK_FMT_F32:
+  case CVK_FMT_I32:
+    return 32;
+  case CVK_FMT_F16:
+  case CVK_FMT_I16:
+  case CVK_FMT_U16:
+  case CVK_FMT_BF16:
+    return 16;
+  case CVK_FMT_I8:
+  case CVK_FMT_U8:
+    return 8;
+  default:
+    return 32;
+  }
+}
+
+static void cvkcv181x_replace_cmd_id(uint32_t *desc, uint32_t eng_id, uint16_t ids[])
+{
+  if (eng_id == CV181X_TIU) {
+    tiu_reg_t reg;
+    parse_tiu_reg(&reg, desc);
+    reg.cmd_id_en = 1;
+    reg.cmd_id_tpu = ids[eng_id];
+    reg.cmd_id_gdma = ids[CV181X_TDMA];
+    emit_tiu_reg(&reg, desc);
+  } else if (eng_id == CV181X_TDMA) {
+    tdma_reg_t tdma_reg;
+    parse_tdma_reg(&tdma_reg, desc);
+    tdma_reg.cmd_id = ids[eng_id];
+    tdma_reg.wait_id_tpu = ids[CV181X_TIU];
+    tdma_reg.bar_en = 1;
+    emit_tdma_reg(&tdma_reg, desc);
+  }
+}
+
+static int cvkcv181x_get_engine_desc_length(uint32_t engine_id)
+{
+  switch (engine_id) {
+  case CV181X_TIU:
+    return TIU_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t);
+  case CV181X_TDMA:
+    return TDMA_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t);
+  //case CV181X_CPU:
+  //  return CPU_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t);
+  default:
+    //ASSERT(0);
+    break;
+  }
+
+  return 0;
+}
+
+// Estimate the number of command descriptors based on the buffer size provided
+// by the user.
+static uint32_t cvkcv181x_estimate_nr_desc(uint32_t cmdbuf_size)
+{
+  uint32_t tiu_desc_len = cvkcv181x_get_engine_desc_length(CV181X_TIU);
+  uint32_t tdma_desc_len = cvkcv181x_get_engine_desc_length(CV181X_TDMA);
+  uint32_t hdr_len = sizeof(cmd_hdr_t);
+
+  uint32_t desc_len =
+      (tiu_desc_len > tdma_desc_len) ? tiu_desc_len : tdma_desc_len;
+
+  return cmdbuf_size / (desc_len + hdr_len);
+}
+
+static cmd_hdr_t *kernel_alloc_cmd_hdr(
+    cvk_context_t *ctx, uint8_t eng_id, uint32_t desc_len)
+{
+  cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data;
+  uint32_t free_len = prv_data->cmdbuf_size - prv_data->cmdbuf_ptr;
+  uint32_t hdr_len = sizeof(cmd_hdr_t);
+  uint32_t total_len = hdr_len + desc_len;
+
+  if (total_len > free_len)
+    return NULL;
+
+  cmd_hdr_t *hdr = (cmd_hdr_t *)&prv_data->cmdbuf[prv_data->cmdbuf_ptr];
+  hdr->magic = 0xA7; // CMDBUF_HDR_MAGIC_181X
+  hdr->len = desc_len;
+  hdr->engine_id = eng_id;
+  hdr->__deprecated = 0; // for valgrind
+  hdr->flags = 0;
+  hdr->mask = 0;
+
+  prv_data->cmdbuf_ptr += total_len;
+  return hdr;
+}
+
+static desc_pair_t *kernel_alloc_desc_pair(cvk_context_t *ctx, uint8_t eng_id)
+{
+  cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data;
+
+  if (eng_id >= CV181X_ENGINE_NUM || prv_data->cur_nr_desc >= prv_data->max_nr_desc)
+    return NULL;
+
+  uint32_t desc_len = cvkcv181x_get_engine_desc_length(eng_id);
+  desc_pair_t *dp = &prv_data->desc_pairs[prv_data->cur_nr_desc++];
+  dp->cmd_hdr = kernel_alloc_cmd_hdr(ctx, eng_id, desc_len);
+  dp->ec_desc = ec_alloc_desc(&prv_data->ec, eng_id);
+
+  mode_manager_record_ec_desc(&prv_data->mode_manager, dp->ec_desc);
+  return dp;
+}
+
+static void cvkcv181x_update_sync_id(cvk_context_t *ctx)
+{
+  cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data;
+  ec_compute_sync_ids(&prv_data->ec);
+
+  for (uint32_t di = 0; di < prv_data->cur_nr_desc; di++) {
+    desc_pair_t *dp = &prv_data->desc_pairs[di];
+    uint8_t eng_id = dp->ec_desc->engine_id;
+    uint32_t *desc = (uint32_t *)dp->cmd_hdr->cmd;
+    cvkcv181x_replace_cmd_id(desc, eng_id, dp->ec_desc->sync_ids);
+  }
+}
+
+desc_pair_t *cvkcv181x_get_desc_pair(cvk_context_t *ctx, uint8_t eng_id)
+{
+#if 0
+  if (eng_id == BMK1822_CPU) {
kernel_update_sync_id(k); + k->cur_nr_desc = 0; + + ec_reset(&k->ec); + mode_manager_restart_sync_id(&k->mode_manager); + } +#endif + + return kernel_alloc_desc_pair(ctx, eng_id); +} + +void cvkcv181x_cleanup(cvk_context_t *ctx) +{ + cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data; + + free(prv_data->desc_pairs); + ec_destroy(&prv_data->ec); + mode_manager_destroy(&prv_data->mode_manager); +} + +void cvkcv181x_reset(cvk_context_t *ctx) +{ + cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data; + + prv_data->cur_nr_desc = 0; + prv_data->cmdbuf_ptr = 0; + + ec_reset(&prv_data->ec); + mode_manager_reset(&prv_data->mode_manager); +} + +static uint8_t *cvkcv181x_acquire_cmdbuf(cvk_context_t *ctx, uint32_t *size) +{ + cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data; + + *size = prv_data->cmdbuf_ptr; + cvkcv181x_update_sync_id(ctx); + return prv_data->cmdbuf; +} + +void cvkcv181x_set_layer_id( + struct cvikernel_context *ctx, + uint16_t layer_id) +{ + cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data; + + prv_data->layer_id = layer_id; +} + +void cvkcv181x_parallel_enable(struct cvikernel_context *ctx) +{ + cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data; + + mode_manager_enable_parallel(&prv_data->mode_manager); +} + +void cvkcv181x_parallel_disable(struct cvikernel_context *ctx) +{ + cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data; + + mode_manager_disable_parallel(&prv_data->mode_manager); +} + +cvk_tl_stride_t cvkcv181x_tl_default_stride( + cvk_context_t *ctx, + cvk_tl_shape_t s, + cvk_fmt_t fmt_type, + int eu_align) +{ + cvk_tl_stride_t stride; + uint32_t eu_num = ctx->info.eu_num; + uint32_t npu_num = ctx->info.npu_num; + uint32_t fmt = (fmt_type == CVK_FMT_BF16) ? 2 : 1; + stride.w = fmt; + stride.h = s.w * fmt; + if (eu_align) + stride.c = align_up(s.h * s.w * fmt, eu_num); + else + stride.c = s.h * s.w * fmt; + + stride.n = stride.c * ceiling_func(s.c, npu_num); + + return stride; +} + +void cvkcv181x_lmem_init_tensor( + struct cvikernel_context *ctx, + cvk_tl_t *tl, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + memset(tl, 0, sizeof(*tl)); + tl->fmt = fmt; + tl->shape = shape; + tl->eu_align = eu_align; + tl->stride = cvkcv181x_tl_default_stride(ctx, shape, fmt, eu_align); +} + +uint32_t cvkcv181x_lmem_tensor_to_size( + struct cvikernel_context *ctx, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + uint32_t eu_num = ctx->info.eu_num; + + cvk_tl_stride_t stride; + stride = cvkcv181x_tl_default_stride(ctx, shape, fmt, eu_align); + + uint32_t needed = align_up(shape.n * stride.n, eu_num); + + return needed; +} + +cvk_tl_t *cvkcv181x_lmem_alloc_tensor( + cvk_context_t *ctx, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data; + uint32_t lmem_size = ctx->info.lmem_size; + uint32_t eu_num = ctx->info.eu_num; + + cvk_tl_t *t = malloc(sizeof(*t)); + if (!t) + return NULL; + + memset(t, 0, sizeof(*t)); + t->start_address = prv_data->lmem_ptr; + t->fmt = fmt; + t->cmprs_fmt = fmt; + t->shape = shape; + t->eu_align = eu_align; + t->stride = cvkcv181x_tl_default_stride(ctx, shape, fmt, eu_align); + + uint32_t needed = align_up(t->shape.n * t->stride.n, eu_num); + if ((lmem_size - prv_data->lmem_ptr < needed) || !needed) { + free(t); + return NULL; + } + + prv_data->lmem_ptr += needed; + return t; +} + +void cvkcv181x_lmem_free_tensor( + struct cvikernel_context *ctx, + const cvk_tl_t *tl) +{ + cvk_prv_data_t *prv_data; + + if (!ctx || 
!tl) + return; + + prv_data = (cvk_prv_data_t *)ctx->priv_data; + + if (tl->start_address >= prv_data->lmem_ptr) + printf("cvkcv181x lm free tensor: ptr out of range\n"); + + prv_data->lmem_ptr = tl->start_address; + + free((void *)tl); +} + +static void try_optimize_matrix_shape(cvk_context_t *ctx, cvk_ml_shape_t *s, + cvk_fmt_t fmt_type) { + uint32_t eu_num = ctx->info.eu_num; + uint32_t npu_num = ctx->info.npu_num; + uint32_t col = s->col; + uint8_t isBf16 = (fmt_type == CVK_FMT_BF16); + uint32_t workingNumber = isBf16 ? eu_num / 2 : eu_num; + + if (col >= workingNumber) { + int num_eu = ceiling_func(col, workingNumber * npu_num); + s->w = workingNumber * num_eu; + s->c = ceiling_func(col, s->w); + } else { + // col < EU_NUM + // Only transfer needed data + // We still change tensor shape in TIU mac op + s->w = col; + s->c = 1; + } +} + +cvk_ml_shape_t cvkcv181x_ml_default_shape( + struct cvikernel_context *ctx, + uint32_t row, + uint32_t col, + cvk_fmt_t fmt_type) +{ + cvk_ml_shape_t shape = {0}; + shape.n = row; + shape.col = col; + + try_optimize_matrix_shape(ctx, &shape, fmt_type); + + return shape; +} + +cvk_ml_stride_t cvkcv181x_ml_default_stride( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + uint32_t npu_num = ctx->info.npu_num; + uint32_t eu_num = ctx->info.eu_num; + uint32_t val = (fmt == CVK_FMT_BF16) ? 2 : 1; + + cvk_ml_stride_t stride; + stride.h = shape.w * val; + if (eu_align) + stride.c = align_up(shape.w * val, eu_num); + else + stride.c = shape.w * val; + stride.n = stride.c * ceiling_func(shape.c, npu_num); + + return stride; +} + +cvk_ml_shape_t cvkcv181x_ml_shape_t1( + struct cvikernel_context *ctx, + uint32_t len, + cvk_fmt_t fmt_type) +{ + uint32_t lmem_size = ctx->info.lmem_size; + cvk_ml_shape_t shape = {0}; + + uint32_t row = 1; + uint32_t col = len; + + while (col >= lmem_size) { + if (col % 2) + return shape; + + col /= 2; + row *= 2; + } + + shape.n = row; + shape.col = col; + + try_optimize_matrix_shape(ctx, &shape, fmt_type); + return shape; +} + +void cvkcv181x_lmem_init_matrix( + struct cvikernel_context *ctx, + cvk_ml_t *ml, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + memset(ml, 0, sizeof(*ml)); + ml->fmt = fmt; + ml->shape = shape; + ml->stride = cvkcv181x_ml_default_stride(ctx, shape, fmt, eu_align); + ml->eu_align = eu_align; +} + + +uint32_t cvkcv181x_lmem_matrix_to_size( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + uint32_t npu_num = ctx->info.npu_num; + uint32_t eu_num = ctx->info.eu_num; + uint32_t val = (fmt == CVK_FMT_BF16) ? 2 : 1; + + cvk_ml_t t; + t.fmt = fmt; + t.shape = shape; + t.stride.h = shape.w * val; + if (eu_align) + t.stride.c = align_up(shape.w * val, eu_num); + else + t.stride.c = shape.w * val; + t.stride.n = t.stride.c * ceiling_func(shape.c, npu_num); + + uint32_t needed = align_up(t.shape.n * t.stride.n, eu_num); + + return needed; +} + +uint32_t cvkcv181x_lmem_ps32_matrix_to_size( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + /* Partial sum is located in lmem in 32-bit format, so we times n to 4 to + * spare a sapce for it. 
+ */ + + shape.n = shape.n * (bitsize_of_fmt(CVK_FMT_I32) / bitsize_of_fmt(fmt)); + + return cvkcv181x_lmem_matrix_to_size(ctx, shape, fmt, eu_align); + +} + +cvk_ml_t *cvkcv181x_lmem_alloc_matrix( + cvk_context_t *ctx, + cvk_ml_shape_t s, + cvk_fmt_t fmt, + int eu_align) +{ + cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data; + uint32_t lmem_size = ctx->info.lmem_size; + uint32_t npu_num = ctx->info.npu_num; + uint32_t eu_num = ctx->info.eu_num; + uint32_t val = (fmt == CVK_FMT_BF16) ? 2 : 1; + + cvk_ml_t *t = malloc(sizeof(*t)); + if (!t) + return NULL; + + memset(t, 0, sizeof(*t)); + t->start_address = prv_data->lmem_ptr; + t->fmt = fmt; + t->shape = s; + t->stride.h = s.w * val; + if (eu_align) + t->stride.c = align_up(s.w * val, eu_num); + else + t->stride.c = s.w * val; + t->stride.n = t->stride.c * ceiling_func(s.c, npu_num); + t->eu_align = eu_align; + + uint32_t needed = align_up(t->shape.n * t->stride.n, eu_num); + if (lmem_size - prv_data->lmem_ptr < needed) { + free(t); + return NULL; + } + prv_data->lmem_ptr += needed; + + return t; +} + +void cvkcv181x_lmem_free_matrix( + struct cvikernel_context *ctx, + const cvk_ml_t *ml) +{ + cvk_prv_data_t *prv_data; + + if (!ctx || !ml) + return; + + prv_data = (cvk_prv_data_t *)ctx->priv_data; + + if (ml->start_address >= prv_data->lmem_ptr) + printf("cvkcv181x lm free matrix: ptr out of range\n"); + + prv_data->lmem_ptr = ml->start_address; + free((void *)ml); +} + +cvk_ml_t *cvkcv181x_lmem_alloc_ps32_matrix( + cvk_context_t *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + /* Partial sum is located in lmem in 32-bit format, so we times n to 4 to + * spare a space for it. + */ + + uint32_t prev_n; + + prev_n = shape.n; + shape.n = shape.n * (bitsize_of_fmt(CVK_FMT_I32) / bitsize_of_fmt(fmt)); + cvk_ml_t *res = cvkcv181x_lmem_alloc_matrix(ctx, shape, fmt, eu_align); + + if(res == NULL) { + printf("cvkcv181x: alloc ps32 matrix fail\n"); + return NULL; + } + + res->shape.n = prev_n; + return res; +} + +cvk_tg_stride_t cvkcv181x_tg_default_stride( + struct cvikernel_context *ctx, + cvk_tg_shape_t shape, + cvk_fmt_t fmt) +{ + uint32_t data_type_size = (fmt == CVK_FMT_BF16) ? 2 : 1; + cvk_tg_stride_t stride; + stride.h = shape.w * data_type_size; + stride.c = shape.h * stride.h; + stride.n = shape.c * stride.c; + stride.w = (fmt == CVK_FMT_BF16) ? 
2 : 1; + + (void)ctx; + + return stride; +} + +void cvkcv181x_tiu_bf16_lookup_interp_table( + cvk_context_t *ctx, + const cvk_tiu_bf16_lookup_interp_table_param_t *param) +{ + if (param->is_scientific) { + // issue lut cmd + cvk_tdma_l2l_tensor_copy_param_t p10; + // remove low 8 bits by int8 copy with stride + // get index(pow) + memset(&p10, 0x00, sizeof(cvk_tdma_l2l_tensor_copy_param_t)); + p10.dst = param->ofmap; + p10.src = param->ifmap; + p10.mv_lut_base = 0; // MUST init by ifself in soc + p10.mv_lut_idx = 1; + p10.layer_id = param->layer_id; + cvkcv181x_tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = 0; + + // get f(x0) = 2^(x0*-0.5) + cvk_tiu_lookup_table_param_t p12; + p12.ofmap = param->ofmap; + p12.ifmap = param->ofmap; + p12.table = param->tbl_answer; + p12.layer_id = param->layer_id; + cvkcv181x_tiu_lookup_table(ctx, &p12); + + // get mantissa value + p12.ofmap = param->buf; + p12.ifmap = param->ifmap; + p12.table = param->tbl_answer_mantissa; + cvkcv181x_tiu_lookup_table(ctx, &p12); + + // (2^exp) * mantissa + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = param->ofmap; + p1.a = param->ofmap; + p1.b_is_const = 0; + p1.b = param->buf; + p1.rshift_bits = 0; + p1.relu_enable = 0; + p1.layer_id = param->layer_id; + cvkcv181x_tiu_mul(ctx, &p1); + } + else { + // duplicate from cvikernel_1880v2.c + const cvk_tl_t *tl_ifmap = param->ifmap; + const cvk_tl_t *tl_ofmap_slope = param->buf; + const cvk_tl_t *tl_table_answer = param->tbl_answer; + const cvk_tl_t *tl_table_answer_slope = param->tbl_answer_mantissa; + const cvk_tl_t *tl_ofmap_y0 = param->ofmap; + float min = param->min; + float max = param->max; + float scale = 256 / (max - min); // 256 means hw support lut index size + uint8_t eu_align = param->eu_align; + cvk_fmt_t fmt = CVK_FMT_BF16; + + cvk_tl_shape_t tl_ofmap_x0_int8_shape = { + 1, tl_ifmap->shape.c, tl_ifmap->shape.h * tl_ifmap->shape.w, 1}; + + // filter y = max(range_min, x) + cvk_tiu_max_param_t p1 = {0}; + p1.max = tl_ifmap; + p1.a = tl_ifmap; + p1.b_is_const = 1; + p1.b_const.is_signed = 1; + p1.b_const.val = cvk_convert_fp32_bf16(min); + p1.layer_id = param->layer_id; + ctx->ops->tiu_max(ctx, &p1); + + // filter y = min(8, x) + cvk_tiu_min_param_t p2 = {0}; + p2.min = tl_ifmap; + p2.a = tl_ifmap; + p2.b_is_const = 1; + p2.b_const.val = cvk_convert_fp32_bf16(max - 1 / scale); // corner + p2.b_const.is_signed = 1; + p2.layer_id = param->layer_id; + ctx->ops->tiu_min(ctx, &p2); + + cvk_tdma_l2l_tensor_copy_param_t p3 = {0}; + // scale input for remap its idx(-x~x) to (-127~127), dirty tl_ifmap + cvk_tiu_mul_param_t p4 = {0}; + p4.res_high = NULL; + p4.res_low = tl_ifmap; + p4.a = tl_ifmap; + p4.b_is_const = 1; + p4.b_const.val = cvk_convert_fp32_bf16(scale); + p4.rshift_bits = 0; + p4.relu_enable = 0; + p4.layer_id = param->layer_id; + ctx->ops->tiu_mul(ctx, &p4); + + // int8 + memset(&p3, 0x00, sizeof(cvk_tdma_l2l_tensor_copy_param_t)); + cvk_tl_t dst; + memcpy(&dst, tl_ofmap_y0, sizeof(cvk_tl_t)); + + dst.shape = tl_ofmap_x0_int8_shape; + dst.fmt = CVK_FMT_I8; + dst.stride = + ctx->ops->tl_default_stride(ctx, tl_ofmap_x0_int8_shape, CVK_FMT_I8, eu_align); + dst.stride.h = dst.stride.h * 2; + dst.int8_rnd_mode = 1; + p3.dst = &dst; + p3.src = tl_ifmap; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p3); + dst.int8_rnd_mode = 0; // reset + + // ops->tdma_l2l_bf16_tensor_copy(ctx, &p3); + + // ops->tiu_sub(ctx, &p5); + + // get f(x0) and slope(x) + // reshape, 16->16 + dst.fmt = fmt; + dst.shape = tl_ofmap_slope->shape; + dst.stride = 
tl_ofmap_slope->stride; + + // layer_id; + ctx->ops->tiu_lookup_table(ctx, &p6); + + // base f(x0) + memset(&p6, 0x0, sizeof(cvk_tiu_lookup_table_param_t)); + p6.ofmap = tl_ofmap_y0; + p6.ifmap = &dst; + p6.table = tl_table_answer; + p6.layer_id = param->layer_id; + ctx->ops->tiu_lookup_table(ctx, &p6); + + // layer_id; + ctx->ops->tiu_mac(ctx, &p7); + } +} + +void cvkcv181x_gmem_init_tensor( + struct cvikernel_context *ctx, + cvk_tg_t *tg, + cvk_tg_shape_t shape, + cvk_fmt_t fmt) { + memset(tg, 0, sizeof(*tg)); + tg->fmt = fmt; + tg->shape = shape; + tg->stride = cvkcv181x_tg_default_stride(ctx, tg->shape, tg->fmt); +} + +static uint16_t cvkcv181x_float_to_bfloat16( + cvk_context_t *ctx, + float data) +{ + (void)ctx; + + return cvk_convert_fp32_bf16(data); +} + +static void cvkcv181x_bf16_table_shape( + cvk_context_t *ctx, + cvk_tl_shape_t *shape) +{ + if (!ctx || !shape) + return; + + shape->n = 1; + shape->c = ctx->info.npu_num; + shape->h = 32; // hard-coded in cv181x + shape->w = 8; // hard-coded in cv181x +} + +static cvk_operations_t cvk_cv181x_ops = { + .cleanup = cvkcv181x_cleanup, + .reset = cvkcv181x_reset, + .acquire_cmdbuf = cvkcv181x_acquire_cmdbuf, + .set_layer_id = cvkcv181x_set_layer_id, + .parallel_enable = cvkcv181x_parallel_enable, + .parallel_disable = cvkcv181x_parallel_disable, + .lmem_alloc_tensor = cvkcv181x_lmem_alloc_tensor, + .lmem_alloc_matrix = cvkcv181x_lmem_alloc_matrix, + .lmem_alloc_ps32_matrix = cvkcv181x_lmem_alloc_ps32_matrix, + .lmem_free_tensor = cvkcv181x_lmem_free_tensor, + .lmem_free_matrix = cvkcv181x_lmem_free_matrix, + .lmem_init_tensor = cvkcv181x_lmem_init_tensor, + .lmem_init_matrix = cvkcv181x_lmem_init_matrix, + .tl_default_stride = cvkcv181x_tl_default_stride, + .tg_default_stride = cvkcv181x_tg_default_stride, + .ml_default_shape = cvkcv181x_ml_default_shape, + .ml_default_stride = cvkcv181x_ml_default_stride, + .ml_shape_t1 = cvkcv181x_ml_shape_t1, + .lmem_tensor_to_size = cvkcv181x_lmem_tensor_to_size, + .lmem_matrix_to_size = cvkcv181x_lmem_matrix_to_size, + .lmem_ps32_matrix_to_size = cvkcv181x_lmem_ps32_matrix_to_size, + .gmem_init_tensor = cvkcv181x_gmem_init_tensor, + .tdma_l2l_tensor_copy = cvkcv181x_tdma_l2l_bf16_tensor_copy, + .tdma_l2l_bf16_tensor_copy = cvkcv181x_tdma_l2l_bf16_tensor_copy, + .tdma_l2l_tensor_lrn_shift = cvkcv181x_tdma_l2l_tensor_lrn_shift, + .tdma_l2g_tensor_copy = cvkcv181x_tdma_l2g_bf16_tensor_copy, + .tdma_l2g_bf16_tensor_copy = cvkcv181x_tdma_l2g_bf16_tensor_copy, + .tdma_l2g_tensor_copy_nc_transposed = cvkcv181x_tdma_l2g_bf16_tensor_copy_nc_transposed, + .tdma_l2g_bf16_tensor_copy_nc_transposed = cvkcv181x_tdma_l2g_bf16_tensor_copy_nc_transposed, + .tdma_l2g_tensor_copy_compressed = cvkcv181x_tdma_l2g_tensor_copy_compressed, + .tdma_l2g_tensor_fill_constant = cvkcv181x_tdma_l2g_tensor_fill_constant, + .tdma_l2g_tensor_copy_cw_transposed = cvkcv181x_tdma_l2g_bf16_tensor_copy_cw_transposed, + .tdma_l2g_bf16_tensor_copy_cw_transposed = cvkcv181x_tdma_l2g_bf16_tensor_copy_cw_transposed, + .tdma_l2g_matrix_copy = cvkcv181x_tdma_l2g_bf16_matrix_copy, + .tdma_l2g_bf16_matrix_copy = cvkcv181x_tdma_l2g_bf16_matrix_copy, + .tdma_l2g_matrix_copy_compressed = cvkcv181x_tdma_l2g_matrix_copy_compressed, + .tdma_l2g_general_copy = cvkcv181x_tdma_l2g_general_copy, + .tdma_l2g_bf16_general_copy = cvkcv181x_tdma_l2g_bf16_general_copy, + .tdma_g2l_tensor_copy = cvkcv181x_tdma_g2l_bf16_tensor_copy, + .tdma_g2l_bf16_tensor_copy = cvkcv181x_tdma_g2l_bf16_tensor_copy, + .tdma_g2l_tensor_copy_nc_transposed = 
cvkcv181x_tdma_g2l_bf16_tensor_copy_nc_transposed, + .tdma_g2l_bf16_tensor_copy_nc_transposed = cvkcv181x_tdma_g2l_bf16_tensor_copy_nc_transposed, + .tdma_g2l_tensor_copy_chw_rotated = cvkcv181x_tdma_g2l_tensor_copy_chw_rotated, + .tdma_g2l_tensor_copy_decompressed = cvkcv181x_tdma_g2l_tensor_copy_decompressed, + .tdma_g2l_tensor_fill_constant = cvkcv181x_tdma_g2l_bf16_tensor_fill_constant, + .tdma_g2l_bf16_tensor_fill_constant = cvkcv181x_tdma_g2l_bf16_tensor_fill_constant, + .tdma_g2l_matrix_copy_decompressed = cvkcv181x_tdma_g2l_matrix_copy_decompressed, + .tdma_g2l_matrix_copy = cvkcv181x_tdma_g2l_bf16_matrix_copy, + .tdma_g2l_bf16_matrix_copy = cvkcv181x_tdma_g2l_bf16_matrix_copy, + .tdma_g2l_matrix_copy_row_col_transposed = cvkcv181x_tdma_g2l_matrix_copy_row_col_transposed, + .tdma_g2l_general_copy = cvkcv181x_tdma_g2l_general_copy, + .tdma_g2l_bf16_general_copy = cvkcv181x_tdma_g2l_bf16_general_copy, + .tdma_g2g_tensor_copy = cvkcv181x_tdma_g2g_tensor_copy, + .tdma_g2g_general_copy = cvkcv181x_tdma_g2g_general_copy, + .tdma_g2g_bf16_general_copy = cvkcv181x_tdma_g2g_bf16_general_copy, + .tdma_g2g_bf16_tensor_copy = cvkcv181x_tdma_g2g_bf16_tensor_copy, + .tiu_mul = cvkcv181x_tiu_mul, + .tiu_mul_qm = cvkcv181x_tiu_mul_qm, + .tiu_mac = cvkcv181x_tiu_mac, + .tiu_add = cvkcv181x_tiu_add, + .tiu_sub = cvkcv181x_tiu_sub, + .tiu_max = cvkcv181x_tiu_max, + .tiu_min = cvkcv181x_tiu_min, + .tiu_and_int8 = cvkcv181x_tiu_and_int8, + .tiu_arith_shift = cvkcv181x_tiu_arith_shift, + .tiu_and_int16 = cvkcv181x_tiu_and_int16, + .tiu_or_int8 = cvkcv181x_tiu_or_int8, + .tiu_or_int16 = cvkcv181x_tiu_or_int16, + .tiu_xor_int8 = cvkcv181x_tiu_xor_int8, + .tiu_xor_int16 = cvkcv181x_tiu_xor_int16, + .tiu_copy = cvkcv181x_tiu_copy, + .tiu_lookup_table = cvkcv181x_tiu_lookup_table, + .tiu_bf16_lookup_interp_table = cvkcv181x_tiu_bf16_lookup_interp_table, + .tiu_pt_convolution = cvkcv181x_tiu_pt_convolution, + .tiu_convolution = cvkcv181x_tiu_convolution, + .tiu_max_pooling = cvkcv181x_tiu_max_pooling, + .tiu_average_pooling = cvkcv181x_tiu_average_pooling, + .tiu_pt_depthwise_convolution = cvkcv181x_tiu_pt_depthwise_convolution, + .tiu_depthwise_convolution = cvkcv181x_tiu_depthwise_convolution, + .tiu_matrix_multiplication = cvkcv181x_tiu_matrix_multiplication, + .tiu_matrix_multiplication_qm = cvkcv181x_tiu_matrix_multiplication_qm, + .tiu_ge = cvkcv181x_tiu_ge, + .tiu_min_pooling = cvkcv181x_tiu_min_pooling, +}; + +static cvk_misc_operations_t cvk_cv181x_misc_ops = { + .float_to_bfloat16 = cvkcv181x_float_to_bfloat16, + .bf16_table_shape = cvkcv181x_bf16_table_shape, +}; + +char *cvikernel_get_chip_info_cv181x(void) +{ + return CVI_TPU_VERSION_181X; +} + +void cvikernel_init_cv181x( + cvk_reg_info_t *req_info, + cvk_context_t *ctx) +{ + uint32_t max_nr_desc = cvkcv181x_estimate_nr_desc(req_info->cmdbuf_size); + cvk_prv_data_t *prv_data; + desc_pair_t *desc_pairs; + + prv_data = malloc(sizeof(cvk_prv_data_t)); + desc_pairs = malloc(max_nr_desc * sizeof(desc_pair_t)); + if (!req_info || !ctx || !prv_data || !desc_pairs) { + if (prv_data) + free(prv_data); + if (desc_pairs) + free(desc_pairs); + return; + } + + ctx->info.version = CV181X_VER; + ctx->info.node_num = CV181X_HW_NODE_CHIP_NUM; + ctx->info.node_shift = CV181X_HW_NODE_CHIP_SHIFT; + ctx->info.npu_num = CV181X_HW_NPU_NUM; + ctx->info.npu_shift = CV181X_HW_NPU_SHIFT; + ctx->info.eu_num = CV181X_HW_EU_NUM; + ctx->info.eu_shift = CV181X_HW_EU_SHIFT; + ctx->info.lmem_size = CV181X_HW_LMEM_SIZE; + ctx->info.lmem_shift = CV181X_HW_LMEM_SHIFT; + 
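  /*
   * (Illustrative note.)  These info fields describe the per-lane geometry
   * that the stride and size helpers earlier in this file depend on.  For an
   * eu-aligned tensor of shape (n, c, h, w) with `elt` bytes per element
   * (1 for int8/uint8, 2 for bf16), cvkcv181x_tl_default_stride() gives
   *
   *   stride.w = elt
   *   stride.h = w * elt
   *   stride.c = align_up(h * w * elt, eu_num)
   *   stride.n = stride.c * ceil(c / npu_num)
   *
   * and cvkcv181x_lmem_tensor_to_size() then reserves
   * align_up(n * stride.n, eu_num) bytes of the per-lane local memory
   * (ctx->info.lmem_size).  With purely hypothetical values eu_num = 16 and
   * npu_num = 32, a bf16 tensor of shape (1, 64, 8, 8) would get
   * stride.c = align_up(8*8*2, 16) = 128 and
   * stride.n = 128 * ceil(64/32) = 256, i.e. 256 bytes per lane.
   */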
ctx->info.lmem_banks = CV181X_HW_LMEM_BANKS; + ctx->info.lmem_bank_size = CV181X_HW_LMEM_BANK_SIZE; + ctx->info.gmem_start = CV181X_GLOBAL_MEM_START_ADDR; + ctx->info.features = CVK_HWF_FC_OP1_CONST | CVK_HWF_8B_ADD_SUB | + CVK_HWF_MIN_POOL | CVK_HWF_M_BRADCAST | + CVK_HWF_QM_LSHIFT | CVK_HWF_GE | CVK_HWF_CMD_PRE_EXE; + ctx->info.gmem_size = CV181X_GLOBAL_MEM_SIZE; + + ctx->ops = &cvk_cv181x_ops; + ctx->misc_ops = &cvk_cv181x_misc_ops; + + prv_data->cmdbuf_ptr = 0; + prv_data->max_nr_desc = max_nr_desc; + prv_data->cur_nr_desc = 0; + prv_data->desc_pairs = desc_pairs; + prv_data->lmem_ptr = 0; + + if (!prv_data->desc_pairs) { + printf("cvkcv181x init: fail to allocate internal data\n"); + free(prv_data); + return; + } + + ec_init(&prv_data->ec, CV181X_ENGINE_NUM, max_nr_desc); + mode_manager_init(&prv_data->mode_manager, &prv_data->ec, CV181X_ENGINE_NUM); + + prv_data->cmdbuf = req_info->cmdbuf; + prv_data->cmdbuf_size = req_info->cmdbuf_size; + ctx->priv_data = prv_data; +} diff --git a/cvikernel/src/cv181x/cvkcv181x.h b/cvikernel/src/cv181x/cvkcv181x.h new file mode 100644 index 000000000..f64351845 --- /dev/null +++ b/cvikernel/src/cv181x/cvkcv181x.h @@ -0,0 +1,753 @@ +#ifndef CVKCV181X_H +#define CVKCV181X_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "engine_conductor.h" +#include "engine_state.h" +#include "mode_manager.h" +#include +#include +#include "../../include/cvikernel/cv181x/cv181x_tiu_reg.h" +#include "../../include/cvikernel/cv181x/cv181x_tdma_reg.h" +#include "../../include/cvikernel/cv181x/cv181x_tpu_cfg.h" + +#define CV181X_TIU 0 // Tensor Instruction Unit +#define CV181X_CPU 1 // CPU, Reserved for common cpu op +#define CV181X_TDMA 2 // TPU DMA +#define CV181X_ENGINE_NUM 3 // Number of Engines + +typedef struct __cmd_hdr_s { + uint8_t magic; // 0xA5 + uint8_t len; // lens in bytes + uint8_t engine_id: 4; // TPU, TDMA + uint8_t __deprecated: 4; + uint8_t flags; // CMD_ID, sync flags, etc. TBD + uint32_t mask; // bit mask for which register need to write + uint8_t cmd[0]; +} __attribute__((packed)) cmd_hdr_t; + +typedef struct { + cmd_hdr_t *cmd_hdr; + ec_desc_t *ec_desc; +} desc_pair_t; + +typedef struct cvk_prv_data { + ec_t ec; + mode_manager_t mode_manager; + + uint32_t cmdbuf_ptr; + uint32_t max_nr_desc; + uint32_t cur_nr_desc; + desc_pair_t *desc_pairs; + + uint32_t lmem_ptr; + uint16_t layer_id; + + uint32_t cmdbuf_size; + uint8_t *cmdbuf; +} cvk_prv_data_t; + +desc_pair_t *cvkcv181x_get_desc_pair(cvk_context_t *ctx, uint8_t eng_id); + +#define CHECK(_status, _cond) \ + do { \ + (_status) |= (_cond) ? 
0 : -1; \ + } while (0) + +static inline int ceiling_func(int numerator, int denominator) +{ + return (numerator + denominator - 1) / denominator; +} + +static inline int ceiling_func_shift(int numerator, int shift) +{ + return (numerator + (1 << shift) - 1) >> shift; +} + +static inline uint64_t align_up(uint64_t x, uint64_t n) +{ + return (x + n - 1) / n * n; +} + +static inline int8_t check_same_stride(const cvk_tl_t *a, const cvk_tl_t *b) +{ + int8_t status = 0; + + CHECK(status, a->stride.n == b->stride.n); + CHECK(status, a->stride.c == b->stride.c); + CHECK(status, a->stride.h == b->stride.h); + CHECK(status, a->stride.w == b->stride.w); + + return status; +} + +static inline int8_t check_same_shape(const cvk_tl_t *a, const cvk_tl_t *b) +{ + int8_t status = 0; + + CHECK(status, a->shape.n == b->shape.n); + CHECK(status, a->shape.c == b->shape.c); + CHECK(status, a->shape.h == b->shape.h); + CHECK(status, a->shape.w == b->shape.w); + + return status; +} + +static inline int8_t check_same_shape_3( + const cvk_tl_t *a, + const cvk_tl_t *b, + const cvk_tl_t *c) +{ + int8_t status = 0; + status |= check_same_shape(a, b); + status |= check_same_shape(a, c); + + return status; +} + +static inline int8_t check_same_shape_4( + const cvk_tl_t *a, + const cvk_tl_t *b, + const cvk_tl_t *c, + const cvk_tl_t *d) +{ + int8_t status = 0; + status |= check_same_shape_3(a, b, c); + status |= check_same_shape(a, d); + + return status; +} + +static inline int8_t check_same_shape_5( + const cvk_tl_t *t0, + const cvk_tl_t *t1, + const cvk_tl_t *t2, + const cvk_tl_t *t3, + const cvk_tl_t *t4) +{ + int8_t status = 0; + status |= check_same_shape_3(t0, t1, t2); + status |= check_same_shape_3(t0, t3, t4); + + return status; +} + +static inline int8_t check_same_shape_6( + const cvk_tl_t *t0, + const cvk_tl_t *t1, + const cvk_tl_t *t2, + const cvk_tl_t *t3, + const cvk_tl_t *t4, + const cvk_tl_t *t5) +{ + int8_t status = 0; + status |= check_same_shape_5(t0, t1, t2, t3, t4); + status |=check_same_shape(t0, t5); + + return status; +} + + +static inline int8_t check_tiu_tensor_shape(const cvk_tl_t *t) +{ + int8_t status = 0; + CHECK(status, t->shape.n > 0); + CHECK(status, t->shape.c > 0); + CHECK(status, t->shape.h > 0); + CHECK(status, t->shape.w > 0); + + CHECK(status, t->shape.n < 0x1000); + CHECK(status, t->shape.c < 0x1000); + CHECK(status, t->shape.h <= (4095-32)); // 12bit, max 4095-32(lanes) + CHECK(status, t->shape.w <= (4095-32)); // 12bit, max 4095-32(lanes) + + return status; +} + +static inline int8_t check_tiu_tensor(const cvk_tl_t *t) +{ + int8_t status = 0; + + if (!t) + return -1; + + status |= check_tiu_tensor_shape(t); + CHECK(status, t->fmt == CVK_FMT_I8 || t->fmt == CVK_FMT_U8 || t->fmt == CVK_FMT_BF16); + + return status; +} + +static inline int8_t check_tiu_tensor_2( + const cvk_tl_t *t0, + const cvk_tl_t *t1) +{ + int8_t status = 0; + status |= check_tiu_tensor(t0); + status |= check_tiu_tensor(t1); + + return status; +} + +static inline int8_t check_tiu_tensor_3( + const cvk_tl_t *t0, + const cvk_tl_t *t1, + const cvk_tl_t *t2) +{ + int8_t status = 0; + status |= check_tiu_tensor(t0); + status |= check_tiu_tensor_2(t1, t2); + + return status; +} + +static inline int8_t check_tiu_tensor_4( + const cvk_tl_t *t0, + const cvk_tl_t *t1, + const cvk_tl_t *t2, + const cvk_tl_t *t3) +{ + int8_t status = 0; + status |= check_tiu_tensor_3(t0, t1, t2); + status |= check_tiu_tensor(t3); + + return status; +} + +static inline int8_t check_tiu_tensor_5( + const cvk_tl_t *t0, + const cvk_tl_t *t1, + 
const cvk_tl_t *t2, + const cvk_tl_t *t3, + const cvk_tl_t *t4) +{ + int8_t status = 0; + status |= check_tiu_tensor_3(t0, t1, t2); + status |= check_tiu_tensor_2(t3, t4); + + return status; +} + +static inline int8_t check_tiu_tensor_6( + const cvk_tl_t *t0, + const cvk_tl_t *t1, + const cvk_tl_t *t2, + const cvk_tl_t *t3, + const cvk_tl_t *t4, + const cvk_tl_t *t5) +{ + int8_t status = 0; + status |= check_tiu_tensor_3(t0, t1, t2); + status |= check_tiu_tensor_3(t3, t4, t5); + + return status; +} + +static inline int8_t check_16bit_tiu_tensor(const cvk_tl_t *low, const cvk_tl_t *high) +{ + int8_t status = 0; + + status |= check_tiu_tensor_2(low, high); + status |= check_same_shape(low, high); + status |= check_same_stride(low, high); + CHECK(status, low->fmt == high->fmt); + CHECK(status, low->start_address < high->start_address); + + return status; +} + +static inline int8_t check_stride_type_0(cvk_context_t *ctx, const cvk_tl_t *t) +{ + int8_t status = 0; + uint32_t eu_num = ctx->info.eu_num; + uint32_t fmt = (t->fmt == CVK_FMT_BF16) ? 2 : 1; + + uint32_t h = t->shape.h; + uint32_t w = t->shape.w * fmt; + uint32_t c_stride = align_up(h * w, eu_num); + + CHECK(status, t->stride.c == c_stride); + CHECK(status, t->stride.h == w); + CHECK(status, t->stride.w == fmt); + + return status; +} + +static inline int8_t check_bf16_stride_type_0(cvk_context_t *ctx, const cvk_tl_t *t) +{ + int8_t status = 0; + uint32_t eu_num = ctx->info.eu_num; + uint32_t fmt = (t->fmt == CVK_FMT_BF16) ? 2 : 1; + + CHECK(status, t->stride.c % eu_num == 0); + CHECK(status, t->stride.w == fmt); + + return status; +} + +static inline int8_t check_stride_type_2(cvk_context_t *ctx, const cvk_tl_t *t) +{ + int8_t status = 0; + + CHECK(status, t->shape.h == 1); + CHECK(status, t->shape.w == 1); + + uint32_t fmt = (t->fmt == CVK_FMT_BF16) ? 2 : 1; + uint32_t c = t->shape.c; + uint32_t npu_num = ctx->info.npu_num; + + CHECK(status, t->stride.n == fmt * align_up(c, npu_num) / npu_num); + CHECK(status, t->stride.c == 1 * fmt); + CHECK(status, t->stride.h == 1 * fmt); + CHECK(status, t->stride.w == 1 * fmt); + + return status; +} + +static inline int8_t check_bf16_stride_type_2(cvk_context_t *ctx, const cvk_tl_t *t) +{ + int8_t status = 0; + CHECK(status, t->shape.h == 1); + CHECK(status, t->shape.w == 1); + + uint32_t fmt = (t->fmt == CVK_FMT_BF16) ? 
2 : 1; + uint32_t c = t->shape.c; + uint32_t npu_num = ctx->info.npu_num; + + CHECK(status, t->stride.n == fmt * align_up(c, npu_num) / npu_num); + CHECK(status, t->stride.c == 1 * fmt); + CHECK(status, t->stride.h == 1 * fmt); + CHECK(status, t->stride.w == 1 * fmt); + + return status; +} + +static inline int tensor_is_signed(const cvk_tl_t *t) +{ + switch (t->fmt) { + case CVK_FMT_I8: + return 1; + case CVK_FMT_U8: + case CVK_FMT_BF16: //does not matter, so set to default 0 + return 0; + default: + break; + } + + return 1; +} + +static inline int matrix_is_signed(const cvk_ml_t *t) +{ + switch (t->fmt) { + case CVK_FMT_I8: + return 1; + case CVK_FMT_U8: + case CVK_FMT_BF16: //does not matter, so set to default 0 + return 0; + default: + break; + } + + return 1; +} + +static inline void fill_same_tensor_shape(tiu_reg_t *r, cvk_tl_shape_t s) +{ + uint32_t n = s.n; + uint32_t c = s.c; + uint32_t h = s.h; + uint32_t w = s.w; + + r->opd0_n = n; + r->opd0_c = c; + r->opd0_h = h; + r->opd0_w = w; + + r->opd1_n = n; + r->opd1_c = c; + r->opd1_h = h; + r->opd1_w = w; + + r->opd2_n = n; + r->opd2_c = c; + r->opd2_h = h; + r->opd2_w = w; + + r->res0_n = n; + r->res0_c = c; + r->res0_h = h; + r->res0_w = w; +} + +static inline int8_t check_stride_range(cvk_tl_stride_t s) +{ + int8_t status = 0; + + CHECK(status, s.n < 0x10000); + CHECK(status, s.c < 0x10000); + CHECK(status, s.h < 0x10000); + + return status; +} + +static inline void fill_same_tensor_stride(tiu_reg_t *r, cvk_tl_stride_t s) +{ + uint32_t n = s.n; + uint32_t c = s.c; + uint32_t h = s.h; + uint32_t w = 1; + + r->opd0_n_str = n; + r->opd0_c_str = c; + r->opd0_h_str = h; + r->opd0_w_str = w; + + r->opd1_n_str = n; + r->opd1_c_str = c; + r->opd1_h_str = h; + r->opd1_w_str = w; + + r->opd2_n_str = n; + r->opd2_c_str = c; + r->opd2_h_str = h; + r->opd2_w_str = w; + + r->res0_n_str = n; + r->res0_c_str = c; + r->res0_h_str = h; + r->res0_w_str = w; +} + +#define fill_stride_code(r, op, str) \ + do { \ + r->op##_n_str = str->n; \ + r->op##_c_str = str->c; \ + r->op##_h_str = str->h; \ + r->op##_w_str = str->w; \ + } while (0) + +static inline void fill_opd0_stride(tiu_reg_t *r, const cvk_tl_stride_t *str) +{ + fill_stride_code(r, opd0, str); +} + +static inline void fill_opd1_stride(tiu_reg_t *r, const cvk_tl_stride_t *str) +{ + fill_stride_code(r, opd1, str); +} + +static inline void fill_opd2_stride(tiu_reg_t *r, const cvk_tl_stride_t *str) +{ + fill_stride_code(r, opd2, str); +} + +static inline void fill_res0_stride(tiu_reg_t *r, const cvk_tl_stride_t *str) +{ + fill_stride_code(r, res0, str); +} + +static inline void fill_same_tensor_stride_type(tiu_reg_t *r, int type) +{ + r->short_opd0_str = type & 0b11; + r->short_opd1_str = type & 0b11; + r->short_opd2_str = type & 0b11; + r->short_res0_str = type & 0b11; +} + +static inline ec_desc_t * emit_tiu_cmdbuf(cvk_context_t *ctx, tiu_reg_t *r) +{ + int engine_id = CV181X_TIU; + + desc_pair_t *dp = cvkcv181x_get_desc_pair(ctx, engine_id); + uint32_t *cmdbuf = (uint32_t *)dp->cmd_hdr->cmd; + emit_tiu_reg(r, cmdbuf); + + return dp->ec_desc; +} + +void cvkcv181x_cleanup(struct cvikernel_context *ctx); +void cvkcv181x_reset(struct cvikernel_context *ctx); + +void cvkcv181x_parallel_enable(struct cvikernel_context *ctx); +void cvkcv181x_parallel_disable(struct cvikernel_context *ctx); +void cvkcv181x_set_layer_id( + struct cvikernel_context *ctx, + uint16_t layer_id); +cvk_tl_t *cvkcv181x_lmem_alloc_tensor( + struct cvikernel_context *ctx, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align); 
+cvk_ml_t *cvkcv181x_lmem_alloc_matrix( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align); +cvk_ml_t *cvkcv181x_lmem_alloc_ps32_matrix( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align); +void cvkcv181x_lmem_free_tensor( + struct cvikernel_context *ctx, + const cvk_tl_t *tl); +void cvkcv181x_lmem_free_matrix( + struct cvikernel_context *ctx, + const cvk_ml_t *ml); +void cvkcv181x_lmem_init_tensor( + struct cvikernel_context *ctx, + cvk_tl_t *tl, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align); +void cvkcv181x_lmem_init_matrix( + struct cvikernel_context *ctx, + cvk_ml_t *ml, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align); +cvk_tl_stride_t cvkcv181x_tl_default_stride( + struct cvikernel_context *ctx, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align); +cvk_tg_stride_t cvkcv181x_tg_default_stride( + struct cvikernel_context *ctx, + cvk_tg_shape_t shape, + cvk_fmt_t fmt); +cvk_ml_shape_t cvkcv181x_ml_default_shape( + struct cvikernel_context *ctx, + uint32_t row, + uint32_t col, + cvk_fmt_t fmt); +cvk_ml_stride_t cvkcv181x_ml_default_stride( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align); +cvk_ml_shape_t cvkcv181x_ml_shape_t1( + struct cvikernel_context *ctx, + uint32_t len, + cvk_fmt_t fmt); +uint32_t cvkcv181x_lmem_tensor_to_size( + struct cvikernel_context *ctx, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align); +uint32_t cvkcv181x_lmem_matrix_to_size( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align); +uint32_t cvkcv181x_lmem_ps32_matrix_to_size( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align); +void cvkcv181x_gmem_init_tensor( + struct cvikernel_context *ctx, + cvk_tg_t *tg, + cvk_tg_shape_t shape, + cvk_fmt_t fmt); + +/* Local to Local DMA API */ +void cvkcv181x_tdma_l2l_tensor_copy( + struct cvikernel_context *ctx, + const cvk_tdma_l2l_tensor_copy_param_t *param); +void cvkcv181x_tdma_l2l_bf16_tensor_copy( + struct cvikernel_context *ctx, + const cvk_tdma_l2l_tensor_copy_param_t *param); +void cvkcv181x_tdma_l2l_tensor_lrn_shift( + struct cvikernel_context *ctx, + const cvk_tdma_l2l_tensor_lrn_shift_param_t *param); + +/* Local to Global DMA API */ +void cvkcv181x_tdma_l2g_tensor_copy( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_copy_param_t *param); +void cvkcv181x_tdma_l2g_bf16_tensor_copy( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_copy_param_t *param); +void cvkcv181x_tdma_l2g_tensor_copy_nc_transposed( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_copy_nc_transposed_param_t *param); +void cvkcv181x_tdma_l2g_bf16_tensor_copy_nc_transposed( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_copy_nc_transposed_param_t *param); +void cvkcv181x_tdma_l2g_tensor_copy_compressed( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_copy_compressed_param_t *param); +void cvkcv181x_tdma_l2g_tensor_fill_constant( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_fill_constant_param_t *param); +void cvkcv181x_tdma_l2g_tensor_copy_cw_transposed( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_copy_cw_transposed_param_t *param); +void cvkcv181x_tdma_l2g_bf16_tensor_copy_cw_transposed( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_tensor_copy_cw_transposed_param_t *param); +void cvkcv181x_tdma_l2g_matrix_copy( + struct cvikernel_context *ctx, + const 
cvk_tdma_l2g_matrix_copy_param_t *param); +void cvkcv181x_tdma_l2g_bf16_matrix_copy( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_matrix_copy_param_t *param); +void cvkcv181x_tdma_l2g_general_copy( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_general_copy_param_t *param); +void cvkcv181x_tdma_l2g_bf16_general_copy( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_bf16_general_copy_param_t *param); + +/* Global to Local DMA API */ +void cvkcv181x_tdma_g2l_tensor_copy( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_copy_param_t *param); +void cvkcv181x_tdma_g2l_bf16_tensor_copy( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_copy_param_t *param); +void cvkcv181x_tdma_g2l_tensor_copy_nc_transposed( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_copy_nc_transposed_param_t *param); +void cvkcv181x_tdma_g2l_bf16_tensor_copy_nc_transposed( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_copy_nc_transposed_param_t *param); +void cvkcv181x_tdma_g2l_tensor_copy_chw_rotated( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_copy_chw_rotated_param_t *param); +void cvkcv181x_tdma_g2l_tensor_copy_decompressed( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_copy_decompressed_param_t *param); +void cvkcv181x_tdma_g2l_tensor_fill_constant( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_fill_constant_param_t *param); +void cvkcv181x_tdma_g2l_bf16_tensor_fill_constant( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_tensor_fill_constant_param_t *param); +void cvkcv181x_tdma_g2l_matrix_copy_decompressed( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_matrix_copy_decompressed_param_t *param); +void cvkcv181x_tdma_l2g_matrix_copy_compressed( + struct cvikernel_context *ctx, + const cvk_tdma_l2g_matrix_copy_compressed_param_t *param); +void cvkcv181x_tdma_g2l_matrix_copy( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_matrix_copy_param_t *param); +void cvkcv181x_tdma_g2l_bf16_matrix_copy( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_matrix_copy_param_t *param); +void cvkcv181x_tdma_g2l_matrix_copy_row_col_transposed( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_matrix_copy_row_col_transposed_param_t *param); +void cvkcv181x_tdma_g2l_general_copy( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_general_copy_param_t *param); +void cvkcv181x_tdma_g2l_bf16_general_copy( + struct cvikernel_context *ctx, + const cvk_tdma_g2l_bf16_general_copy_param_t *param); + +/* Global to Global DMA API */ +void cvkcv181x_tdma_g2g_tensor_copy( + struct cvikernel_context *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *param); +void cvkcv181x_tdma_g2g_general_copy( + struct cvikernel_context *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *param); +void cvkcv181x_tdma_g2g_bf16_general_copy( + struct cvikernel_context *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *param); +void cvkcv181x_tdma_g2g_bf16_tensor_copy( + struct cvikernel_context *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *param); + +/* TIU API */ +void cvkcv181x_tiu_mul( + struct cvikernel_context *ctx, + const cvk_tiu_mul_param_t *param); +void cvkcv181x_tiu_mul_qm( + struct cvikernel_context *ctx, + const cvk_tiu_mul_qm_param_t *param); +void cvkcv181x_tiu_mac( + struct cvikernel_context *ctx, + const cvk_tiu_mac_param_t *param); +void cvkcv181x_tiu_add( + struct cvikernel_context *ctx, + const cvk_tiu_add_param_t *param); +void cvkcv181x_tiu_sub( + struct cvikernel_context *ctx, + const cvk_tiu_sub_param_t 
*param); +void cvkcv181x_tiu_max( + struct cvikernel_context *ctx, + const cvk_tiu_max_param_t *param); +void cvkcv181x_tiu_min( + struct cvikernel_context *ctx, + const cvk_tiu_min_param_t *param); +void cvkcv181x_tiu_and_int8( + struct cvikernel_context *ctx, + const cvk_tiu_and_int8_param_t *param); +void cvkcv181x_tiu_arith_shift( + struct cvikernel_context *ctx, + const cvk_tiu_arith_shift_param_t *param); +void cvkcv181x_tiu_and_int16( + struct cvikernel_context *ctx, + const cvk_tiu_and_int16_param_t *param); +void cvkcv181x_tiu_or_int8( + struct cvikernel_context *ctx, + const cvk_tiu_or_int8_param_t *param); +void cvkcv181x_tiu_or_int16( + struct cvikernel_context *ctx, + const cvk_tiu_or_int16_param_t *param); +void cvkcv181x_tiu_xor_int8( + struct cvikernel_context *ctx, + const cvk_tiu_xor_int8_param_t *param); +void cvkcv181x_tiu_xor_int16( + struct cvikernel_context *ctx, + const cvk_tiu_xor_int16_param_t *param); +void cvkcv181x_tiu_copy( + struct cvikernel_context *ctx, + const cvk_tiu_copy_param_t *param); +void cvkcv181x_tiu_lookup_table( + struct cvikernel_context *ctx, + const cvk_tiu_lookup_table_param_t *param); +void cvkcv181x_tiu_bf16_lookup_interp_table( + struct cvikernel_context *ctx, + const cvk_tiu_bf16_lookup_interp_table_param_t *param); +void cvkcv181x_tiu_pt_convolution( + struct cvikernel_context *ctx, + const cvk_tiu_pt_convolution_param_t *param); +void cvkcv181x_tiu_convolution( + struct cvikernel_context *ctx, + const cvk_tiu_convolution_param_t *param); +void cvkcv181x_tiu_max_pooling( + struct cvikernel_context *ctx, + const cvk_tiu_max_pooling_param_t *param); +void cvkcv181x_tiu_average_pooling( + struct cvikernel_context *ctx, + const cvk_tiu_average_pooling_param_t *param); +void cvkcv181x_tiu_pt_depthwise_convolution( + struct cvikernel_context *ctx, + const cvk_tiu_depthwise_pt_convolution_param_t *param); +void cvkcv181x_tiu_depthwise_convolution( + struct cvikernel_context *ctx, + const cvk_tiu_depthwise_convolution_param_t *param); +void cvkcv181x_tiu_matrix_multiplication( + struct cvikernel_context *ctx, + const cvk_tiu_matrix_multiplication_param_t *param); +void cvkcv181x_tiu_matrix_multiplication_qm( + struct cvikernel_context *ctx, + const cvk_tiu_matrix_multiplication_qm_param_t *param); +void cvkcv181x_tiu_ge( + cvk_context_t *ctx, + const cvk_tiu_ge_param_t *p); +void cvkcv181x_tiu_min_pooling( + cvk_context_t *ctx, + const cvk_tiu_min_pooling_param_t *p); + +#ifdef __cplusplus +} +#endif + +#endif /* CVKCV181X_H */ diff --git a/cvikernel/src/cv181x/tdma.c b/cvikernel/src/cv181x/tdma.c new file mode 100644 index 000000000..896ed4e2a --- /dev/null +++ b/cvikernel/src/cv181x/tdma.c @@ -0,0 +1,2267 @@ +#include "cvkcv181x.h" + +//n < 0x10000); + CHECK(status, s->c < 0x10000); + CHECK(status, s->h < 0x10000); + CHECK(status, s->w < 0x10000); + + CHECK(status, s->n > 0x0); + CHECK(status, s->c > 0x0); + CHECK(status, s->h > 0x0); + CHECK(status, s->w > 0x0); + + return status; +} + +static int8_t check_tdma_tl_bf16_shape(const cvk_tl_shape_t *s, cvk_fmt_t fmt) +{ + int8_t status = 0; + uint8_t fmt_type = (fmt == CVK_FMT_BF16 ? 
2 : 1); + + CHECK(status, s->n < 0x10000); + CHECK(status, s->c < 0x10000); + CHECK(status, s->h < 0x10000); + CHECK(status, s->w < 0x10000 / fmt_type); + + CHECK(status, s->n > 0x0); + CHECK(status, s->c > 0x0); + CHECK(status, s->h > 0x0); + CHECK(status, s->w > 0x0); + + return status; +} + +static int8_t check_tdma_tg_shape(const cvk_tg_shape_t *s) +{ + int8_t status = 0; + + CHECK(status, s->n < 0x10000); + CHECK(status, s->c < 0x10000); + CHECK(status, s->h < 0x10000); + CHECK(status, s->w < 0x10000); + + CHECK(status, s->n > 0x0); + CHECK(status, s->c > 0x0); + CHECK(status, s->h > 0x0); + CHECK(status, s->w > 0x0); + + return status; +} + +static int8_t check_tdma_tg_bf16_shape(const cvk_tg_shape_t *s, cvk_fmt_t fmt) +{ + int8_t status = 0; + uint8_t fmt_type = (fmt == CVK_FMT_BF16 ? 2 : 1); + + CHECK(status, s->n < 0x10000); + CHECK(status, s->c < 0x10000); + CHECK(status, s->h < 0x10000); + CHECK(status, s->w < 0x10000 / fmt_type); + + CHECK(status, s->n > 0x0); + CHECK(status, s->c > 0x0); + CHECK(status, s->h > 0x0); + CHECK(status, s->w > 0x0); + + return status; +} + + +static int8_t check_tdma_ml_shape(const cvk_ml_shape_t *s) +{ + int8_t status = 0; + + CHECK(status, s->n < 0x10000); + CHECK(status, s->c < 0x10000); + CHECK(status, s->w < 0x10000); + CHECK(status, s->col < 0x10000); + + CHECK(status, s->n > 0); + CHECK(status, s->c > 0); + CHECK(status, s->w > 0); + CHECK(status, s->col > 0); + + return status; +} + +static int8_t check_tdma_ml_bf16_shape(const cvk_ml_shape_t *s, cvk_fmt_t fmt) +{ + int8_t status = 0; + uint8_t fmt_type = (fmt == CVK_FMT_BF16 ? 2 : 1); + + CHECK(status, s->n < 0x10000); + CHECK(status, s->c < 0x10000); + CHECK(status, s->w < 0x10000 / fmt_type); + CHECK(status, s->col < 0x10000); + + CHECK(status, s->n > 0); + CHECK(status, s->c > 0); + CHECK(status, s->w > 0); + CHECK(status, s->col > 0); + + return status; +} + +static int8_t check_tdma_mg_shape(const cvk_mg_shape_t *s) +{ + int8_t status = 0; + + CHECK(status, s->row < 0x10000); + CHECK(status, s->col < 0x10000); + + CHECK(status, s->row > 0x0); + CHECK(status, s->col > 0x0); + + return status; +} + +static int8_t check_tdma_mg_bf16_shape(const cvk_mg_shape_t *s, cvk_fmt_t fmt) +{ + int8_t status = 0; + uint8_t fmt_type = (fmt == CVK_FMT_BF16 ? 
2 : 1); + + CHECK(status, s->row < 0x10000); + CHECK(status, s->col < 0x10000 / fmt_type); + + CHECK(status, s->row > 0x0); + CHECK(status, s->col > 0x0); + + return status; +} + +static int8_t check_tdma_tl(const cvk_tl_t *t) +{ + int8_t status = 0; + + CHECK(status, t); + CHECK(status, t->fmt == CVK_FMT_I8 || t->fmt == CVK_FMT_U8 || t->fmt == CVK_FMT_BF16); + status |= check_tdma_tl_shape(&t->shape); + + return status; +} + +static int8_t check_tdma_tl_bf16(const cvk_tl_t *t) +{ + int8_t status = 0; + + CHECK(status, t); + CHECK(status, t->fmt == CVK_FMT_I8 || t->fmt == CVK_FMT_U8 || t->fmt == CVK_FMT_BF16); + status |= check_tdma_tl_bf16_shape(&t->shape, t->fmt); + + return status; +} + +static int8_t check_tdma_tg(const cvk_tg_t *t) +{ + int8_t status = 0; + + CHECK(status, t); + CHECK(status, t->base_reg_index < TDMA_NUM_BASE_REGS); + CHECK(status, t->fmt == CVK_FMT_I8 || t->fmt == CVK_FMT_U8 || t->fmt == CVK_FMT_BF16); + status |= check_tdma_tg_shape(&t->shape); + + return status; +} + +static int8_t check_tdma_tg_bf16(const cvk_tg_t *t) +{ + int8_t status = 0; + + CHECK(status, t); + CHECK(status, t->base_reg_index < TDMA_NUM_BASE_REGS); + CHECK(status, t->fmt == CVK_FMT_I8 || t->fmt == CVK_FMT_U8 || t->fmt == CVK_FMT_BF16); + status |= check_tdma_tg_bf16_shape(&t->shape, t->fmt); + + return status; +} + +static int8_t check_tdma_compressed_tg(const cvk_cmpr_tg_t *t) +{ + int8_t status = 0; + uint32_t stride_w = t->t.fmt == CVK_FMT_BF16 ? 2 : 1; + + CHECK(status, t); + CHECK(status, t->t.base_reg_index < TDMA_NUM_BASE_REGS); + status |= check_tdma_tg_shape(&t->t.shape); + CHECK(status, !(t->t.start_address%0x10)); + + // Enable after backend fix + //CHECK(status, t->t.stride.n == + // (t->t.shape.w * t->t.shape.h * t->t.shape.c * stride_w)); + + CHECK(status, t->t.stride.c == (t->t.shape.w * t->t.shape.h * stride_w)); + CHECK(status, t->t.stride.h == (t->t.shape.w * stride_w)); + // m.base_reg_index < TDMA_NUM_BASE_REGS); + CHECK(status, !(t->m.start_address%0x10)); + + // the data should be continuous + if (t->m.fmt == CVK_FMT_BF16) { + CHECK(status, t->m.stride.row == t->m.shape.col * 2); + } + else if (t->m.fmt == CVK_FMT_I8 || t->m.fmt == CVK_FMT_U8) { + CHECK(status, t->m.stride.row == t->m.shape.col); + } + else { + CHECK(status, 0); //fmt == CVK_FMT_I8 || m->fmt == CVK_FMT_U8 || m->fmt == CVK_FMT_BF16); + status |= check_tdma_ml_shape(&m->shape); + + return status; +} + +static int8_t check_tdma_ml_bf16(const cvk_ml_t *m) +{ + int8_t status = 0; + + CHECK(status, m); + CHECK(status, m->fmt == CVK_FMT_I8 || m->fmt == CVK_FMT_U8 || m->fmt == CVK_FMT_BF16); + status |= check_tdma_ml_bf16_shape(&m->shape, m->fmt); + + return status; +} + +static int8_t check_tdma_mg(const cvk_mg_t *m) +{ + int8_t status = 0; + + CHECK(status, m); + CHECK(status, m->base_reg_index < TDMA_NUM_BASE_REGS); + status |= check_tdma_mg_shape(&m->shape); + + return status; +} + +static int8_t check_tdma_mg_bf16(const cvk_mg_t *m) +{ + int8_t status = 0; + + CHECK(status, m); + CHECK(status, m->base_reg_index < TDMA_NUM_BASE_REGS); + status |= check_tdma_mg_bf16_shape(&m->shape, m->fmt); + + return status; +} + +static int8_t check_tdma_compress_mg(const cvk_cmpr_mg_t *m) +{ + int8_t status = 0; + + CHECK(status, m); + CHECK(status, m->m.base_reg_index < TDMA_NUM_BASE_REGS); + status |= check_tdma_mg_shape(&m->m.shape); + + return status; +} + +static int8_t check_tl_same_size(const cvk_tl_t *a, const cvk_tl_t *b) +{ + int8_t status = 0; + uint32_t a_size = a->shape.n * a->shape.c * a->shape.h * 
a->shape.w; + uint32_t b_size = b->shape.n * b->shape.c * b->shape.h * b->shape.w; + + CHECK(status, a_size == b_size); + + return status; +} + +static int8_t check_tl_tg_same_size(const cvk_tl_t *tl, const cvk_tg_t *tg) +{ + int8_t status = 0; + uint32_t tl_size = tl->shape.n * tl->shape.c * tl->shape.h * tl->shape.w; + uint32_t tg_size = tg->shape.n * tg->shape.c * tg->shape.h * tg->shape.w; + + CHECK(status, tl_size == tg_size); + + return status; +} + +static int8_t check_ml_mg_same_size(const cvk_ml_t *ml, const cvk_mg_t *mg) +{ + int8_t status = 0; + uint32_t ml_size = ml->shape.n * ml->shape.col; + uint32_t mg_size = mg->shape.row * mg->shape.col; + + CHECK(status, ml_size == mg_size); + + return status; +} + +#if 0 +static uint64_t absolute_gmem_addr(uint64_t addr) +{ + return (addr & 0x0FFFFFFFFFF) + BM1822_GLOBAL_MEM_START_ADDR; +} +#else +//global memory start = 0x0 from 1822 kernel view, we can use it directlly +//cmdbuf descriptor content dram address does not need offset either +#define absolute_gmem_addr(addr) (addr & 0x0FFFFFFFFFF) +#endif + +static ec_desc_t * emit_tdma_cmdbuf(cvk_context_t *ctx, tdma_reg_t *reg) +{ + cvk_prv_data_t *prv_data = (cvk_prv_data_t *)ctx->priv_data; + desc_pair_t *dp = cvkcv181x_get_desc_pair(ctx, CV181X_TDMA); + + reg->layer_ID = prv_data->layer_id; + //CHECK(status, reg->rsv5 != 0x0);// "this is debug use, it's fine for skip"; + + uint32_t *cmdbuf = (uint32_t *)dp->cmd_hdr->cmd; + emit_tdma_reg(reg, cmdbuf); + + return dp->ec_desc; +} + +static void fill_l2g_fmt(tdma_reg_t *reg, cvk_fmt_t src_fmt, cvk_fmt_t dst_fmt) +{ + reg->dst_fmt = (dst_fmt == CVK_FMT_BF16) ? 2 : 1; + reg->src_fmt = (src_fmt == CVK_FMT_BF16) ? 2 : 1; + // check and decide bf16->int8 or bf16->uint8_t + reg->int8_sign = (dst_fmt == CVK_FMT_I8 ? 1 : 0);// | (dst_fmt == CVK_FMT_U8 ? 1 : 0); +} + +static void fill_g2l_fmt(tdma_reg_t *reg, cvk_fmt_t src_fmt, cvk_fmt_t dst_fmt) +{ + reg->dst_fmt = (dst_fmt == CVK_FMT_BF16) ? 2 : 1; + reg->src_fmt = (src_fmt == CVK_FMT_BF16) ? 2 : 1; + // check and decide int8->bf16 or uint8_t->bf16 + reg->int8_sign = (src_fmt == CVK_FMT_I8 ? 1 : 0) ;//| (src_fmt == CVK_FMT_U8 ? 1 : 0); +} + +static void fill_l2l_fmt(tdma_reg_t *reg, cvk_fmt_t src_fmt, cvk_fmt_t dst_fmt) +{ + reg->dst_fmt = (dst_fmt == CVK_FMT_BF16) ? 2 : 1; + reg->src_fmt = (src_fmt == CVK_FMT_BF16) ? 2 : 1; + // check and decide bf16->int8 or bf16->uint8_t or int8->bf16 or uint8_t->bf16 + reg->int8_sign = (dst_fmt == CVK_FMT_I8 ? 1 : 0) | (src_fmt == CVK_FMT_I8 ? 
1 : 0); +} + +static void fill_src_addr(tdma_reg_t *r, uint64_t addr) +{ + r->src_base_addr_low = (uint32_t)addr; + r->src_base_addr_high = (addr >> 32); +} + +static void fill_dst_addr(tdma_reg_t *r, uint64_t addr) +{ + r->dst_base_addr_low = (uint32_t)addr; + r->dst_base_addr_high = (addr >> 32); +} + +static void fill_src_c_stride(tdma_reg_t *r, uint32_t str) +{ + r->src_c_stride_low = (uint16_t)str; + r->src_c_stride_high = (str >> 16); +} + +static void fill_dst_c_stride(tdma_reg_t *r, uint32_t str) +{ + r->dst_c_stride_low = (uint16_t)str; + r->dst_c_stride_high = (str >> 16); +} + +static void set_int8_rnd_mode(tdma_reg_t *r, uint32_t int8_rnd_mode) +{ + if (int8_rnd_mode == 1) { + // int8 + if (r->src_fmt == FMT_BF16_TYP && r->dst_fmt == FMT_FIX8B_TYP) { + r->int8_rnd_mode = int8_rnd_mode; + } + } +} + + +/* + * Direction: L2L + */ +void cvkcv181x_tdma_l2l_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_l2l_tensor_copy_param_t *p) +{ + int8_t status = 0; + + status |= check_tdma_tl(p->src); + status |= check_tdma_tl(p->dst); + CHECK(status, p->src->shape.n == p->dst->shape.n); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 3; + reg.spec_func = 0; + reg.sys_dtype = 0; + + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + reg.outstanding_en = p->outstanding; + + if (status) { + printf("cvkcv181x l2l: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +void cvkcv181x_tdma_l2l_bf16_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_l2l_tensor_copy_param_t *p) +{ + int8_t status = 0; + + status |= check_tdma_tl_bf16(p->src); + status |= check_tdma_tl_bf16(p->dst); + CHECK(status, p->src->shape.n == p->dst->shape.n); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 3; + reg.spec_func = 0; + reg.sys_dtype = 0; + fill_l2l_fmt(®, p->src->fmt, p->dst->fmt); + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + // does not allow open `mv_lut_idx and `mv_lut_basemv_lut_base at same time + if (p->mv_lut_idx == 1) { + reg.mv_lut_idx = p->mv_lut_idx; + } + + if (p->mv_lut_base == 1) { + reg.mv_lut_base = p->mv_lut_base; + } + + if (reg.mv_lut_idx == 1 && reg.mv_lut_base == 1) { + CHECK(status, 0); + } + + set_int8_rnd_mode(®, p->dst->int8_rnd_mode); + + reg.outstanding_en = p->outstanding; + + if (status) { + printf("cvkcv181x l2l bf16: wrong parameter\n"); + return; + } + + //trace_tdma_reg(®, __func__); + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static uint32_t addr_after_right_shift( + cvk_context_t *ctx, int addr, uint32_t step, 
int c_str) +{ + uint32_t npu_num = ctx->info.npu_num; + uint32_t lmem_size = ctx->info.lmem_size;; + + uint32_t lmem_i = (addr / lmem_size + step) % npu_num; + uint32_t offset = addr % lmem_size + (addr / lmem_size + step) / npu_num * c_str; + return lmem_i * lmem_size + offset; +} + +void cvkcv181x_tdma_l2l_tensor_lrn_shift( + cvk_context_t *ctx, + const cvk_tdma_l2l_tensor_lrn_shift_param_t *p) +{ + int8_t status = 0; + status |= check_tdma_tl(p->src); + status |= check_tdma_tl(p->dst); + status |= check_tl_same_size(p->src, p->dst); + CHECK(status, p->src->shape.n == p->dst->shape.n); + CHECK(status, p->src->shape.c == p->dst->shape.c); + CHECK(status, p->src->shape.c > p->lrn_step); + CHECK(status, p->src->shape.h * p->src->shape.w == + p->dst->shape.h * p->dst->shape.w); + CHECK(status, p->lrn_step < 16); + + CHECK(status, p->src->fmt == p->dst->fmt); + + int is_bf16 = (p->src->fmt == CVK_FMT_BF16) ? 1 : 0; + if (is_bf16) { + check_tdma_tl_bf16(p->src); + check_tdma_tl_bf16(p->dst); + } + + /* L2L lrn copy */ + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 3; + reg.spec_func = 0; + reg.sys_dtype = 0; + fill_l2l_fmt(®, p->src->fmt, p->dst->fmt); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c - p->lrn_step; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c - p->lrn_step; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (p->right_shift) { + uint32_t dst_addr = addr_after_right_shift( + ctx, p->dst->start_address, p->lrn_step, p->dst->stride.c); + + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + } else { + uint32_t src_addr = addr_after_right_shift( + ctx, p->src->start_address, p->lrn_step, p->src->stride.c); + + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + } + + if (is_bf16) + set_int8_rnd_mode(®, p->dst->int8_rnd_mode); + + emit_tdma_cmdbuf(ctx, ®); + + /* Constant fill with zero */ + reg.trans_dir = 0; + reg.spec_func = 4; + reg.const_val = is_bf16 ? 
cvk_convert_fp32_bf16(0.0): 0; + + reg.dst_c = p->lrn_step; + if (p->right_shift) { + uint32_t dst_addr = addr_after_right_shift( + ctx, p->dst->start_address, p->lrn_step, p->dst->stride.c); + + uint32_t lmem_size = ctx->info.lmem_size;; + uint32_t npu_num = ctx->info.npu_num; + uint32_t sht_num = p->lrn_step; + + uint32_t lmem_i = (dst_addr / lmem_size - sht_num) % npu_num; + uint32_t offset = (lmem_i + sht_num) / npu_num * p->dst->stride.c; + uint32_t zero_addr = lmem_i * lmem_size + dst_addr % lmem_size - offset; + + // printf(" lmem_i 0x%x, offset 0x%x, zero_addr 0x%x\n", + // lmem_i, offset, zero_addr); + + fill_dst_addr(®, zero_addr); + + } else { + uint32_t start_mem = p->dst->start_address / ctx->info.lmem_size; + uint32_t cur_mem = (start_mem + (p->dst->shape.c - p->lrn_step)) % ctx->info.npu_num; + uint32_t offset = + (p->dst->start_address % ctx->info.lmem_size) + + ((start_mem + (p->dst->shape.c - p->lrn_step)) / ctx->info.npu_num) * p->dst->stride.c; + uint32_t zero_addr = cur_mem * ctx->info.lmem_size + offset; + + // printf(" start_mem 0x%x, cur_mem 0x%x, offset 0x%x, zero_addr 0x%x\n", + // start_mem, cur_mem, offset, zero_addr); + + fill_dst_addr(®, zero_addr); + } + + if (status) { + printf("cvkcv181x tdma l2l lrn shift: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} +/* + * Direction: L2G + */ + +static void tdma_l2g_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + status |= check_tdma_tl(p->src); + status |= check_tdma_tg(p->dst); + status |= check_tl_tg_same_size(p->src, p->dst); + CHECK(status, p->src->shape.n == p->dst->shape.n); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + reg.intra_cmd_paral = p->intra_cmd_paral; + + if (status) { + printf("cvkcv181x l2g: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + + +static void tdma_l2g_bf16_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + status |= check_tdma_tl_bf16(p->src); + status |= check_tdma_tg_bf16(p->dst); + status |= check_tl_tg_same_size(p->src, p->dst); + CHECK(status, p->src->shape.n == p->dst->shape.n); + CHECK(status, !(p->src->fmt == CVK_FMT_I8 && p->dst->fmt == CVK_FMT_BF16)); // not support tl(int8)->tg(bf16) + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + + fill_l2g_fmt(®, p->src->fmt, p->dst->fmt); + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + 
reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + reg.intra_cmd_paral = p->intra_cmd_paral; + + set_int8_rnd_mode(®, p->dst->int8_rnd_mode); + //trace_tdma_reg(®, __func__); + + if (status) { + printf("cvkcv181x l2g bf16: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_l2g_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_nc_transposed_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + status |= check_tdma_tl(p->src); + status |= check_tdma_tg(p->dst); + CHECK(status, p->dst->shape.n == p->src->shape.c); + CHECK(status, p->dst->shape.c == p->src->shape.n); + CHECK(status, p->dst->shape.h * p->dst->shape.w == + p->src->shape.h * p->src->shape.w); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 0; + reg.spec_func = 1; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvcv181x l2g nc tp: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_l2g_bf16_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_nc_transposed_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + status |= check_tdma_tl_bf16(p->src); + status |= check_tdma_tg_bf16(p->dst); + CHECK(status, p->dst->shape.n == p->src->shape.c); + CHECK(status, p->dst->shape.c == p->src->shape.n); + CHECK(status, p->dst->shape.h * p->dst->shape.w == + p->src->shape.h * p->src->shape.w); + CHECK(status, !(p->src->fmt == CVK_FMT_I8 && p->dst->fmt == CVK_FMT_BF16)); // not support tl(int8)->tg(bf16) + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 0; + reg.spec_func = 1; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + fill_l2g_fmt(®, p->src->fmt, p->dst->fmt); + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + set_int8_rnd_mode(®, p->dst->int8_rnd_mode); + + if (status) { + printf("cvkcv181x: l2g bf16 nc tp: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_l2g_tensor_copy_cw_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_cw_transposed_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + status |= check_tdma_tl(p->src); + status |= check_tdma_tg(p->dst); + CHECK(status, p->src->shape.n == p->dst->shape.n); + CHECK(status, 
p->src->shape.c == p->dst->shape.w); + CHECK(status, p->src->shape.h == p->dst->shape.h); + CHECK(status, p->src->shape.w == p->dst->shape.c); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.spec_func = 1; + reg.transpose_md = 3; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv181x l2g cw tp: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_l2g_bf16_tensor_copy_cw_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_cw_transposed_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + status |= check_tdma_tl_bf16(p->src); + status |= check_tdma_tg_bf16(p->dst); + CHECK(status, p->src->shape.n == p->dst->shape.n); + CHECK(status, p->src->shape.c == p->dst->shape.w); + CHECK(status, p->src->shape.h == p->dst->shape.h); + CHECK(status, p->src->shape.w == p->dst->shape.c); + + /*not support bf16 mode*/ + CHECK(status, !(p->src->fmt == CVK_FMT_BF16 || p->dst->fmt == CVK_FMT_BF16)); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.spec_func = 1; + reg.transpose_md = 3; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + fill_l2g_fmt(®, p->src->fmt, p->dst->fmt); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv181x l2g bf16 cw tp: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_l2g_tensor_copy_compressed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_compressed_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + status |= check_tdma_tl(p->src); + status |= check_tdma_compressed_tg(p->dst); + status |= check_tl_tg_same_size(p->src, &p->dst->t); + + tdma_reg_t reg; + reset_tdma_reg(®); + + //src->fmt == CVK_FMT_BF16 || p->src->fmt == CVK_FMT_I8 || p->src->fmt == CVK_FMT_U8); + + CHECK(status, p->dst->bias1 == 0); + if (p->src->fmt == CVK_FMT_BF16) { + CHECK(status, p->dst->bias0 == 127); + } + else { + //p->src->fmt == CVK_FMT_I8 || p->src->fmt == CVK_FMT_U8); + CHECK(status, p->dst->bias0 == 0); + CHECK(status, p->dst->zero_guard_en == 0); + } + + reg.src_fmt = (p->src->fmt == CVK_FMT_BF16) ? FMT_BF16_TYP : FMT_FIX8B_TYP; + reg.dst_fmt = reg.src_fmt; + + reg.vld = 1; + reg.trans_dir = 1; + reg.compress_en = 1; + + // VLC constraint under hw compress + //1. in int8/uint8, bias0/bias should be 0/0 + //2. 
in bf16, signed should be 0 and bias0 set to 127, bias1 set to 0 + reg.cmprs_fmt = (p->src->fmt == CVK_FMT_I8); + + // NOTICE: it recommend set to 1 once data contain '0' under bf16 + reg.compress_zero_guard = p->dst->zero_guard_en ? 1 : 0; + reg.compress_bias0 = p->dst->bias0; + reg.compress_bias1 = p->dst->bias1; + + reg.dst_base_reg_sel = p->dst->t.base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->t.shape.c; + reg.dst_h = p->dst->t.shape.h; + reg.dst_w = p->dst->t.shape.w; + reg.dst_n_stride = p->dst->t.stride.n; + fill_dst_c_stride(®, p->dst->t.stride.c); + reg.dst_h_stride = p->dst->t.stride.h; + + reg.intra_cmd_paral = p->intra_cmd_paral; + + if (status) { + printf("cvkcv181x: l2g cmpr: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_l2g_tensor_fill_constant( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_fill_constant_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + status |= check_tdma_tg_bf16(p->dst); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.spec_func = 4; + reg.const_val = p->constant; + + // only support tl(bf16)->tg(bf16) or tl(fix8b)->tg(fix8b) + fill_l2g_fmt(®, p->dst->fmt, p->dst->fmt); + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_dst_addr(®, dst_addr); + + reg.src_n = p->dst->shape.n; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv181x l2g fill const: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_l2g_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_matrix_copy_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + status |= check_tdma_ml(p->src); + status |= check_tdma_mg(p->dst); + status |= check_ml_mg_same_size(p->src, p->dst); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 1; + reg.spec_func = 0; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = 1; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.row; + reg.dst_w = p->dst->shape.col; + fill_dst_c_stride(®, p->dst->stride.row); + + if (status) { + printf("cvkcv181x l2g matrix: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_l2g_matrix_copy_compressed( + cvk_context_t *ctx, + const cvk_tdma_l2g_matrix_copy_compressed_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + + status |= check_tdma_ml(p->src); + status |= check_tdma_compress_mg(p->dst); + status |= check_tdma_vlc_matrix_compressed_mg(p->dst); + status |= check_ml_mg_same_size(p->src, &p->dst->m); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.compress_en = 1; + reg.sys_dtype = 1; + reg.spec_func = 0; + + // vlc setting + reg.cmprs_fmt = (p->src->fmt == CVK_FMT_I8); + + 
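/* Illustrative note (added, not part of the original patch): the bias /
 * zero-guard convention checked below matches the compressed tensor path
 * above -- bf16 sources use bias0 = 127 (zero_guard_en as needed), while
 * int8/uint8 sources use bias0 = 0, bias1 = 0 and no zero guard.  A minimal
 * caller-side sketch, with `ml` and `cmp_mg` as hypothetical handles for a
 * local int8 matrix and its compressed global descriptor:
 *
 *   cvk_tdma_l2g_matrix_copy_compressed_param_t q = {0};
 *   cmp_mg.bias0 = 0;            // int8: bias0/bias1 must be 0/0
 *   cmp_mg.bias1 = 0;
 *   cmp_mg.zero_guard_en = 0;    // zero guard is only meaningful for bf16
 *   q.src = ml;
 *   q.dst = &cmp_mg;
 *   cvkcv181x_tdma_l2g_matrix_copy_compressed(ctx, &q);
 */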
CHECK(status, p->dst->bias1 == 0); + if (p->src->fmt == CVK_FMT_BF16) { + CHECK(status, p->dst->bias0 == 127); + } + else { + //p->src->fmt == CVK_FMT_I8 || p->src->fmt == CVK_FMT_U8); + CHECK(status, p->dst->bias0 == 0); + CHECK(status, p->dst->zero_guard_en == 0); + } + + // NOTICE: it should be 1 once data contain '0' under bf16 + reg.compress_zero_guard = p->dst->zero_guard_en ? 1 : 0; + reg.compress_bias0 = p->dst->bias0; + reg.compress_bias1 = p->dst->bias1; + + reg.dst_base_reg_sel = p->dst->m.base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + fill_l2g_fmt(®, p->src->fmt, p->dst->m.fmt); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = 1; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->m.shape.row; + reg.dst_w = p->dst->m.shape.col; + fill_dst_c_stride(®, p->dst->m.stride.row); + + if (status) { + printf("cvkcv181x l2g matrix cmpr: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_l2g_bf16_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_matrix_copy_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + + status |= check_tdma_ml_bf16(p->src); + status |= check_tdma_mg_bf16(p->dst); + status |= check_ml_mg_same_size(p->src, p->dst); + CHECK(status, !((p->src->fmt == CVK_FMT_I8 || p->src->fmt == CVK_FMT_U8) && p->dst->fmt == CVK_FMT_BF16)); // not support tl(i8/uint8_t)->tg(bf16) + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.sys_dtype = 1; + reg.spec_func = 0; + + reg.dst_base_reg_sel = p->dst->base_reg_index; + fill_src_addr(®, p->src->start_address); + fill_dst_addr(®, dst_addr); + + fill_l2g_fmt(®, p->src->fmt, p->dst->fmt); + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = 1; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.row; + reg.dst_w = p->dst->shape.col; + fill_dst_c_stride(®, p->dst->stride.row); + set_int8_rnd_mode(®, p->dst->int8_rnd_mode); + + if (status) { + printf("cvkcv181x l2g bf16 matrix: wrong paramter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_l2g_general_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_general_copy_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + + CHECK(status, p->dst_base_reg_index < TDMA_NUM_BASE_REGS); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 1; + reg.trans_fmt = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + + reg.dst_base_reg_sel = p->dst_base_reg_index; + fill_src_addr(®, p->src_address); + fill_dst_addr(®, dst_addr); + reg.src_n_stride = p->bytes; + + if (status) { + printf("cvkcv181x l2g general: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_l2g_bf16_general_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_bf16_general_copy_param_t *p, + uint64_t dst_addr) +{ + int8_t status = 0; + + CHECK(status, p->dst_base_reg_index < TDMA_NUM_BASE_REGS); + + tdma_reg_t reg; + reset_tdma_reg(®); + // only support fix8b->fix8b or bf16->bf16 + CHECK(status, p->src_fmt == p->dst_fmt); + + reg.vld = 1; + reg.trans_dir = 1; + reg.trans_fmt = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + fill_l2g_fmt(®, p->src_fmt, p->dst_fmt); + + reg.dst_base_reg_sel = p->dst_base_reg_index; + 
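/* Added note (not in the original patch): the "general copy" path is a flat
 * byte move -- trans_fmt = 1 and the length is carried in src_n_stride, so no
 * tensor shape is programmed.  The bf16 variant only adds the format fields
 * and requires src_fmt == dst_fmt (fix8b->fix8b or bf16->bf16).  A minimal
 * caller-side sketch (addresses and length are hypothetical):
 *
 *   cvk_tdma_l2g_bf16_general_copy_param_t q = {0};
 *   q.src_address = tl_byte_addr;          // local memory byte address
 *   q.dst_address = gmem_byte_addr;        // global memory byte address
 *   q.src_bytes   = copy_len;              // byte count, per the src_n_stride use above
 *   q.src_fmt = q.dst_fmt = CVK_FMT_BF16;
 *   cvkcv181x_tdma_l2g_bf16_general_copy(ctx, &q);
 */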
fill_src_addr(®, p->src_address); + fill_dst_addr(®, dst_addr); + reg.src_n_stride = p->src_bytes; + + if (status) { + printf("cvkcv181x l2g bf16 general: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +void cvkcv181x_tdma_l2g_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + tdma_l2g_tensor_copy(ctx, p, dst_addr); +} + +void cvkcv181x_tdma_l2g_bf16_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + tdma_l2g_bf16_tensor_copy(ctx, p, dst_addr); +} +void cvkcv181x_tdma_l2g_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_nc_transposed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + tdma_l2g_tensor_copy_nc_transposed(ctx, p, dst_addr); +} + +void cvkcv181x_tdma_l2g_bf16_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_nc_transposed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + tdma_l2g_bf16_tensor_copy_nc_transposed(ctx, p, dst_addr); +} + +void cvkcv181x_tdma_l2g_tensor_copy_cw_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_cw_transposed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + tdma_l2g_tensor_copy_cw_transposed(ctx, p, dst_addr); +} + +void cvkcv181x_tdma_l2g_bf16_tensor_copy_cw_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_cw_transposed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + tdma_l2g_bf16_tensor_copy_cw_transposed(ctx, p, dst_addr); +} + +void cvkcv181x_tdma_l2g_tensor_copy_compressed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_compressed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->t.start_address); + tdma_l2g_tensor_copy_compressed(ctx, p, dst_addr); +} + +void cvkcv181x_tdma_l2g_tensor_fill_constant( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_fill_constant_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + tdma_l2g_tensor_fill_constant(ctx, p, dst_addr); +} + +void cvkcv181x_tdma_l2g_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_matrix_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + tdma_l2g_matrix_copy(ctx, p, dst_addr); +} + +void cvkcv181x_tdma_l2g_bf16_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_matrix_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->start_address); + tdma_l2g_bf16_matrix_copy(ctx, p, dst_addr); +} + +void cvkcv181x_tdma_l2g_matrix_copy_compressed( + cvk_context_t *ctx, + const cvk_tdma_l2g_matrix_copy_compressed_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst->m.start_address); + tdma_l2g_matrix_copy_compressed(ctx, p, dst_addr); +} + +void cvkcv181x_tdma_l2g_general_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_general_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst_address); + tdma_l2g_general_copy(ctx, p, dst_addr); +} + +void cvkcv181x_tdma_l2g_bf16_general_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_bf16_general_copy_param_t *p) +{ + uint64_t dst_addr = absolute_gmem_addr(p->dst_address); + tdma_l2g_bf16_general_copy(ctx, p, dst_addr); +} + +/* + * Direction: G2L + */ + +static void tdma_g2l_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_param_t *p, + uint64_t 
src_addr) +{ + int8_t status = 0; + status |= check_tdma_tg(p->src); + status |= check_tdma_tl(p->dst); + CHECK(status, p->src->shape.n == p->dst->shape.n); + status |= check_tl_tg_same_size(p->dst, p->src); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 0; + reg.spec_func = 0; + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + reg.intra_cmd_paral = p->intra_cmd_paral; + + if (status) { + printf("cvkcv181x g2l: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_bf16_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_param_t *p, + uint64_t src_addr) +{ + int8_t status = 0; + + status |= check_tdma_tg_bf16(p->src); + status |= check_tdma_tl_bf16(p->dst); + CHECK(status, p->src->shape.n == p->dst->shape.n); + CHECK(status, !(p->src->fmt == CVK_FMT_BF16 && p->dst->fmt == CVK_FMT_I8)); // not support tg(bf16)->tl(int8) + status |= check_tl_tg_same_size(p->dst, p->src); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 0; + reg.spec_func = 0; + + reg.src_base_reg_sel = p->src->base_reg_index; + + fill_g2l_fmt(®, p->src->fmt, p->dst->fmt); + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + reg.intra_cmd_paral = p->intra_cmd_paral; + + if (status) { + printf("cvkcv181x g2l bf16: wrong parameter\n"); + return; + } + + //trace_tdma_reg(®, __func__); + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_nc_transposed_param_t *p, + uint64_t src_addr) +{ + int8_t status = 0; + + status |= check_tdma_tg(p->src); + status |= check_tdma_tl(p->dst); + CHECK(status, p->dst->shape.n == p->src->shape.c); + CHECK(status, p->dst->shape.c == p->src->shape.n); + CHECK(status, p->dst->shape.h * p->dst->shape.w == + p->src->shape.h * p->src->shape.w); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 0; + reg.spec_func = 1; + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = 
p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv181x g2l nc tp: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_bf16_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_nc_transposed_param_t *p, + uint64_t src_addr) +{ + int8_t status = 0; + + status |= check_tdma_tg_bf16(p->src); + status |= check_tdma_tl_bf16(p->dst); + CHECK(status, p->dst->shape.n == p->src->shape.c); + CHECK(status, p->dst->shape.c == p->src->shape.n); + CHECK(status, p->dst->shape.h * p->dst->shape.w == + p->src->shape.h * p->src->shape.w); + + CHECK(status, !(p->src->fmt == CVK_FMT_BF16 && p->dst->fmt == CVK_FMT_I8)); // not support tg(bf16)->tl(int8) + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 0; + reg.spec_func = 1; + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + fill_g2l_fmt(®, p->src->fmt, p->dst->fmt); + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv181x g2l bf16 nc tp: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_tensor_copy_chw_rotated( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_chw_rotated_param_t *p, + uint64_t src_addr) +{ + int8_t status = 0; + + status |= check_tdma_tg(p->src); + status |= check_tdma_tl(p->dst); + + CHECK(status, p->src->shape.c == 3 || p->src->shape.c == 4); + CHECK(status, p->src->shape.n == p->dst->shape.n); + CHECK(status, p->src->shape.c == p->dst->shape.c); + CHECK(status, p->src->shape.h == p->dst->shape.h); + CHECK(status, p->src->shape.w == p->dst->shape.w); + + CHECK(status, p->dst->start_address % ctx->info.eu_num == 0); + CHECK(status, p->dst->stride.n % ctx->info.eu_num == 0); + CHECK(status, p->dst->stride.c % ctx->info.eu_num == 0); + CHECK(status, p->dst->stride.h == p->dst->shape.w); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 0; + reg.spec_func = 1; + + if (p->dst->shape.c == 3) + reg.transpose_md = 1; + else if(p->dst->shape.c == 4) + reg.transpose_md = 2; + else + CHECK(status, 0); + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, 1); + reg.src_h_stride = p->src->shape.c * p->src->shape.w; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv181x g2l chw: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_tensor_copy_decompressed( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_decompressed_param_t *p, + uint64_t 
src_addr) +{ + int8_t status = 0; + + status |= check_tdma_compressed_tg(p->src); + status |= check_tdma_tl(p->dst); + status |= check_tl_tg_same_size(p->dst, &p->src->t); + + tdma_reg_t reg; + reset_tdma_reg(®); + + //dst->fmt == CVK_FMT_BF16 || p->dst->fmt == CVK_FMT_I8 || p->dst->fmt == CVK_FMT_U8); + fill_g2l_fmt(®, p->src->t.fmt, p->dst->fmt); + + reg.vld = 1; + reg.trans_dir = 0; + reg.compress_en = 1; + reg.cmprs_fmt = (p->src->t.fmt == CVK_FMT_I8); + + reg.src_base_reg_sel = p->src->t.base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->t.shape.n; + reg.src_c = p->src->t.shape.c; + reg.src_h = p->src->t.shape.h; + reg.src_w = p->src->t.shape.w; + reg.src_n_stride = p->src->t.stride.n; + fill_src_c_stride(®, p->src->t.stride.c); + reg.src_h_stride = p->src->t.stride.h; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + reg.intra_cmd_paral = p->intra_cmd_paral; + + if (status) { + printf("cvkcv181x g2l cmpr: wrong parameter\n"); + return; + } + + // trace_tdma_reg(®, __FUNCTION__); + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_tensor_fill_constant( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_fill_constant_param_t *p) +{ + int8_t status = 0; + + status |= check_tdma_tl(p->dst); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.spec_func = 4; + reg.const_val = p->constant; + + reg.dst_fmt = (p->dst->fmt == CVK_FMT_BF16) ? 2 : 1; + + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->dst->shape.n; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv181x g2l fill const: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_bf16_tensor_fill_constant( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_fill_constant_param_t *p) +{ + int8_t status = 0; + + status |= check_tdma_tl_bf16(p->dst); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.spec_func = 4; + reg.const_val = p->constant; + + /*only suppoert fix8b->fix8b or bf16->bf16*/ + fill_g2l_fmt(®, p->dst->fmt, p->dst->fmt); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->dst->shape.n; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv181x g2l bf16 fill const: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_param_t *p, + uint64_t src_addr) +{ + int8_t status = 0; + + status |= check_tdma_mg(p->src); + status |= check_tdma_ml(p->dst); + CHECK(status, p->dst->shape.n == p->src->shape.row); + status |= check_ml_mg_same_size(p->dst, p->src); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 1; + reg.spec_func = 0; + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.row; + reg.src_c = p->src->shape.row; + 
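/* Clarifying note (added, not part of the original patch): in the matrix G2L
 * path the global matrix is addressed as row x col, so both src_n and src_c
 * carry the row count and the row stride is programmed as the channel stride;
 * the local matrix is then written as (n, c, 1, w), which is why the check
 * above requires p->dst->shape.n == p->src->shape.row. */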
reg.src_w = p->src->shape.col; + fill_src_c_stride(®, p->src->stride.row); + + reg.dst_c = p->dst->shape.c; + reg.dst_h = 1; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv181x g2l matrix: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_matrix_copy_decompressed( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_decompressed_param_t *p, + uint64_t src_addr) +{ + int8_t status = 0; + + status |= check_tdma_vlc_matrix_compressed_mg(p->src); + status |= check_tdma_mg(&p->src->m); + status |= check_tdma_ml(p->dst); + CHECK(status, p->dst->shape.n == p->src->m.shape.row); + status |= check_ml_mg_same_size(p->dst, &p->src->m); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 1; + reg.spec_func = 0; + reg.compress_en = 1; + reg.cmprs_fmt = (p->src->m.fmt == CVK_FMT_I8); + + fill_g2l_fmt(®, p->src->m.fmt, p->dst->fmt); + reg.src_base_reg_sel = p->src->m.base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->m.shape.row; + reg.src_c = p->src->m.shape.row; + reg.src_w = p->src->m.shape.col; + fill_src_c_stride(®, p->src->m.stride.row); + + reg.dst_c = p->dst->shape.c; + reg.dst_h = 1; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv181x g2l matrix cmpr: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_bf16_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_param_t *p, + uint64_t src_addr) +{ + int8_t status = 0; + + status |= check_tdma_mg_bf16(p->src); + status |= check_tdma_ml_bf16(p->dst); + CHECK(status, p->dst->shape.n == p->src->shape.row); + status |= check_ml_mg_same_size(p->dst, p->src); + CHECK(status, !(p->src->fmt == CVK_FMT_BF16 && p->dst->fmt == CVK_FMT_I8)); // not support tg(bf16)->tl(int8) + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 1; + reg.spec_func = 0; + + fill_g2l_fmt(®, p->src->fmt, p->dst->fmt); + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.row; + reg.src_c = p->src->shape.row; + reg.src_w = p->src->shape.col; + fill_src_c_stride(®, p->src->stride.row); + + reg.dst_c = p->dst->shape.c; + reg.dst_h = 1; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv181x g2l bf16 matrix: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_matrix_copy_row_col_transposed( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_row_col_transposed_param_t *p, + uint64_t src_addr) +{ + int8_t status = 0; + + status |= check_tdma_mg(p->src); + status |= check_tdma_ml(p->dst); + CHECK(status, p->dst->shape.n == p->src->shape.col); + CHECK(status, p->dst->shape.col == p->src->shape.row); + status |= check_ml_mg_same_size(p->dst, p->src); + + CHECK(status, p->src->shape.row >= p->dst->shape.w); + CHECK(status, p->dst->shape.c == + (uint32_t) ceiling_func(p->src->shape.row, p->dst->shape.w)); + + CHECK(status, p->dst->start_address % ctx->info.eu_num == 0); + CHECK(status, 
p->dst->stride.n % ctx->info.eu_num == 0); + CHECK(status, p->dst->stride.c % ctx->info.eu_num == 0); + CHECK(status, p->dst->stride.h == p->dst->shape.w); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.sys_dtype = 1; + reg.spec_func = 1; + + reg.src_base_reg_sel = p->src->base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst->start_address); + + reg.src_n = p->src->shape.row; + reg.src_c = p->src->shape.row; + reg.src_w = p->src->shape.col; + fill_src_c_stride(®, p->src->stride.row); + + reg.dst_c = p->dst->shape.c; + reg.dst_h = 1; + reg.dst_w = p->dst->shape.w; + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p->dst->stride.h; + + if (status) { + printf("cvkcv181x g2l matrix tp: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_general_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_general_copy_param_t *p, + uint64_t src_addr) +{ + int8_t status = 0; + CHECK(status, p->src_base_reg_index < TDMA_NUM_BASE_REGS); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.trans_fmt = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + + reg.src_base_reg_sel = p->src_base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst_address); + reg.src_n_stride = p->bytes; + + if (status) { + printf("cvkcv181x g2l general: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +static void tdma_g2l_bf16_general_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_bf16_general_copy_param_t *p, + uint64_t src_addr) +{ + int8_t status = 0; + + CHECK(status, p->src_base_reg_index < TDMA_NUM_BASE_REGS); + // only support fix8b->fix8b or bf16->bf16 + CHECK(status, p->dst_fmt == p->src_fmt); + + tdma_reg_t reg; + reset_tdma_reg(®); + + reg.vld = 1; + reg.trans_dir = 0; + reg.trans_fmt = 1; + reg.sys_dtype = 0; + reg.spec_func = 0; + + fill_g2l_fmt(®, p->src_fmt, p->dst_fmt); + + reg.src_base_reg_sel = p->src_base_reg_index; + fill_src_addr(®, src_addr); + fill_dst_addr(®, p->dst_address); + reg.src_n_stride = p->src_bytes; + + if (status) { + printf("cvkcv181x g2l bf16 general: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +void cvkcv181x_tdma_g2l_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + tdma_g2l_tensor_copy(ctx, p, src_addr); +} + +void cvkcv181x_tdma_g2l_bf16_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + tdma_g2l_bf16_tensor_copy(ctx, p, src_addr); +} + +void cvkcv181x_tdma_g2l_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_nc_transposed_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + tdma_g2l_tensor_copy_nc_transposed(ctx, p, src_addr); +} + +void cvkcv181x_tdma_g2l_bf16_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_nc_transposed_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + tdma_g2l_bf16_tensor_copy_nc_transposed(ctx, p, src_addr); +} + +void cvkcv181x_tdma_g2l_tensor_copy_chw_rotated( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_chw_rotated_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + tdma_g2l_tensor_copy_chw_rotated(ctx, p, src_addr); +} + +void 
cvkcv181x_tdma_g2l_tensor_copy_decompressed( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_decompressed_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->t.start_address); + tdma_g2l_tensor_copy_decompressed(ctx, p, src_addr); +} + +void cvkcv181x_tdma_g2l_tensor_fill_constant( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_fill_constant_param_t *p) +{ + tdma_g2l_tensor_fill_constant(ctx, p); +} + +void cvkcv181x_tdma_g2l_bf16_tensor_fill_constant( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_fill_constant_param_t *p) +{ + tdma_g2l_bf16_tensor_fill_constant(ctx, p); +} + +void cvkcv181x_tdma_g2l_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + tdma_g2l_matrix_copy(ctx, p, src_addr); +} + +void cvkcv181x_tdma_g2l_matrix_copy_decompressed( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_decompressed_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->m.start_address); + tdma_g2l_matrix_copy_decompressed(ctx, p, src_addr); +} + +void cvkcv181x_tdma_g2l_bf16_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + tdma_g2l_bf16_matrix_copy(ctx, p, src_addr); +} + +void cvkcv181x_tdma_g2l_matrix_copy_row_col_transposed( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_row_col_transposed_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src->start_address); + tdma_g2l_matrix_copy_row_col_transposed(ctx, p, src_addr); +} + +void cvkcv181x_tdma_g2l_general_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_general_copy_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src_address); + tdma_g2l_general_copy(ctx, p, src_addr); +} + +void cvkcv181x_tdma_g2l_bf16_general_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_bf16_general_copy_param_t *p) +{ + uint64_t src_addr = absolute_gmem_addr(p->src_address); + tdma_g2l_bf16_general_copy(ctx, p, src_addr); +} +/* + * Direction: TG2TG + */ +static void cvkcv181x_tdma_copy_gmem( + cvk_context_t *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *p, + uint8_t u8_trans_fmt) +{ + tdma_reg_t reg; + + reset_tdma_reg(®); + + uint64_t u64_src_addr; + uint64_t u64_dst_addr; + + reg.vld = 1; + reg.trans_dir = 2; // 0:g2l, 1:l2g, 2:g2g, 3:l2l + reg.trans_fmt = u8_trans_fmt; // 1:general copy, 2:tensor copy + reg.sys_dtype = 0; // + reg.spec_func = 0; // + + u64_src_addr = absolute_gmem_addr(p->src->start_address); + u64_dst_addr = absolute_gmem_addr(p->dst->start_address); + fill_src_addr(®, u64_src_addr); + fill_dst_addr(®, u64_dst_addr); + + reg.src_base_reg_sel = p->src->base_reg_index; + reg.dst_base_reg_sel = p->dst->base_reg_index; + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p-> dst->stride.h; + + (void *)emit_tdma_cmdbuf( ctx, ®); +} + +static void cvkcv181x_tdma_bf16_copy_gmem( + cvk_context_t *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *p, + uint8_t u8_trans_fmt) +{ + int8_t status = 0; + tdma_reg_t reg; + + reset_tdma_reg(®); + + uint64_t u64_src_addr; + uint64_t u64_dst_addr; + + reg.vld = 1; + 
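/* Added note (not part of the original patch): the settings below differ from
 * cvkcv181x_tdma_copy_gmem above mainly in that the source and destination
 * formats must match and src_fmt/dst_fmt are programmed explicitly (2 for
 * bf16, 1 for fix8b); both helpers are dispatched by the g2g wrappers with
 * trans_fmt = 2 for tensor copy and trans_fmt = 1 for general copy. */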
reg.trans_dir = 2; // 0:g2l, 1:l2g, 2:g2g, 3:l2l + reg.trans_fmt = u8_trans_fmt; // 1:general copy, 2:tensor copy + reg.sys_dtype = 0; // + reg.spec_func = 0; // + CHECK(status, p->src->fmt == p->dst->fmt); + + reg.dst_fmt = (p->dst->fmt == CVK_FMT_BF16) ? 2 : 1; + reg.src_fmt = (p->src->fmt == CVK_FMT_BF16) ? 2 : 1; + + u64_src_addr = absolute_gmem_addr(p->src->start_address); + u64_dst_addr = absolute_gmem_addr(p->dst->start_address); + fill_src_addr(®, u64_src_addr); + fill_dst_addr(®, u64_dst_addr); + + reg.src_base_reg_sel = p->src->base_reg_index; + reg.dst_base_reg_sel = p->dst->base_reg_index; + + reg.src_n = p->src->shape.n; + reg.src_c = p->src->shape.c; + reg.src_h = p->src->shape.h; + reg.src_w = p->src->shape.w; + + reg.dst_c = p->dst->shape.c; + reg.dst_h = p->dst->shape.h; + reg.dst_w = p->dst->shape.w; + + reg.src_n_stride = p->src->stride.n; + fill_src_c_stride(®, p->src->stride.c); + reg.src_h_stride = p->src->stride.h; + + reg.dst_n_stride = p->dst->stride.n; + fill_dst_c_stride(®, p->dst->stride.c); + reg.dst_h_stride = p-> dst->stride.h; + + if (status) { + printf("cvkcv181x bf16 gmem: wrong parameter\n"); + return; + } + + (void *)emit_tdma_cmdbuf(ctx, ®); +} + +/* + * Direction: G2G + */ +void cvkcv181x_tdma_g2g_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *p) +{ + cvkcv181x_tdma_copy_gmem(ctx, p, 2); +} + +void cvkcv181x_tdma_g2g_bf16_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *p) +{ + cvkcv181x_tdma_bf16_copy_gmem(ctx, p, 2); +} + +void cvkcv181x_tdma_g2g_general_copy( + cvk_context_t *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *p) +{ + cvkcv181x_tdma_copy_gmem(ctx, p, 1); +} + +void cvkcv181x_tdma_g2g_bf16_general_copy( + cvk_context_t *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *p) +{ + cvkcv181x_tdma_bf16_copy_gmem(ctx, p, 1); +} diff --git a/cvikernel/src/cv181x/tiu_add.c b/cvikernel/src/cv181x/tiu_add.c new file mode 100644 index 000000000..653881250 --- /dev/null +++ b/cvikernel/src/cv181x/tiu_add.c @@ -0,0 +1,88 @@ +#include "cvkcv181x.h" + +void cvkcv181x_tiu_add( + cvk_context_t *ctx, + const cvk_tiu_add_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->a_low->fmt == CVK_FMT_BF16) ? 1 : 0; + + if (bf16_enable) { + /*bf16 only support 16 bit*/ + CHECK(status, !p->a_high); + CHECK(status, !(p->b.high && !p->b_is_const)); + CHECK(status, !p->res_high); + status |= check_tiu_tensor(p->a_low); + status |= check_tiu_tensor(p->res_low); + status |= check_same_shape(p->res_low, p->a_low); + if (!p->b_is_const) { + status |= check_tiu_tensor(p->b.low); + status |= check_same_shape(p->res_low, p->b.low); + } + } else { + status |= check_16bit_tiu_tensor(p->a_low, p->a_high); + status |= check_tiu_tensor(p->res_low); + status |= check_same_shape(p->res_low, p->a_low); + if (!p->b_is_const) { + status |= check_16bit_tiu_tensor(p->b.low, p->b.high); + status |= check_same_shape(p->res_low, p->b.low); + } + } + if (p->res_high) + status |= check_16bit_tiu_tensor(p->res_low, p->res_high); + CHECK(status, p->relu_enable == 0 || p->relu_enable == 1); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_ADD_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 
1: 0; + reg.opt_res_shift = 0; + reg.opt_relu_typ = p->relu_enable; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + int arith_shift = tensor_is_signed(p->res_low); + reg.opt_shift_typ = arith_shift; + reg.opt_res_shift = p->rshift_bits; + + reg.opd0_addr = p->a_low->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a_low); + reg.opt_opd0_seg = (p->a_high == NULL); + reg.opd0_b_str = bf16_enable ? 0 : (p->a_high->start_address - p->a_low->start_address); + fill_opd0_stride(®, &p->a_low->stride); + + reg.opt_opd1_seg = bf16_enable ? 1 : 0; //(p->b_high == NULL); b_high is the same as b_val + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opt_opd1_sign = !!p->b_const.is_signed; + reg.opd1_addr = p->b_const.val; + } else { + reg.opt_opd1_const = 0; + reg.opt_opd1_sign = tensor_is_signed(p->b.low); + reg.opd1_addr = p->b.low->start_address; + reg.opd1_b_str = bf16_enable ? 0 : (p->b.high->start_address - p->b.low->start_address); + fill_opd1_stride(®, &p->b.low->stride); + } + + reg.res0_addr = p->res_low->start_address; + reg.opt_res0_sign = tensor_is_signed(p->res_low); + reg.opt_res0_seg = (p->res_high == NULL); + fill_res0_stride(®, &p->res_low->stride); + if (p->res_high) + reg.res0_b_str = p->res_high->start_address - p->res_low->start_address; + if (p->relu_enable) + CHECK(status, reg.opt_res0_seg); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv181x tiu_add: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv181x/tiu_and.c b/cvikernel/src/cv181x/tiu_and.c new file mode 100644 index 000000000..9208661fd --- /dev/null +++ b/cvikernel/src/cv181x/tiu_and.c @@ -0,0 +1,111 @@ +#include "cvkcv181x.h" + +void cvkcv181x_tiu_and_int8( + cvk_context_t *ctx, + const cvk_tiu_and_int8_param_t *p) +{ + int8_t status = 0; + status |= check_tiu_tensor_3(p->res, p->a, p->b); + status |= check_same_shape_3(p->res, p->a, p->b); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_AND_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 1; + fill_opd0_stride(®, &p->a->stride); + + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 1; + fill_opd1_stride(®, &p->b->stride); + + reg.res0_addr = p->res->start_address; + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 1; + fill_res0_stride(®, &p->res->stride); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv181x tiu and: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} + +void cvkcv181x_tiu_and_int16( + cvk_context_t *ctx, + const cvk_tiu_and_int16_param_t *p) +{ + int8_t status = 0; + status |= check_16bit_tiu_tensor(p->a_low, p->a_high); + status |= check_16bit_tiu_tensor(p->b_low, p->b_high); + status |= check_16bit_tiu_tensor(p->res_low, p->res_high); + status |= check_same_shape_3(p->res_low, p->a_low, p->b_low); + + int res_high_addr = p->res_high->start_address; + int res_low_addr = p->res_low->start_address; + CHECK(status, res_high_addr > res_low_addr); + int res_b_stride = res_high_addr - res_low_addr; + + int a_high_addr = p->a_high->start_address; + int a_low_addr = p->a_low->start_address; + CHECK(status, a_high_addr > a_low_addr); + int a_b_stride = 
a_high_addr - a_low_addr; + + int b_high_addr = p->b_high->start_address; + int b_low_addr = p->b_low->start_address; + CHECK(status, b_high_addr > b_low_addr); + int b_b_stride = b_high_addr - b_low_addr; + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_AND_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = a_low_addr; + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 0; + reg.opd0_b_str = a_b_stride; + fill_opd0_stride(®, &p->a_low->stride); + + reg.opd1_addr = b_low_addr; + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 0; + reg.opd1_b_str = b_b_stride; + fill_opd1_stride(®, &p->b_low->stride); + + reg.res0_addr = res_low_addr; + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 0; + reg.res0_b_str = res_b_stride; + fill_res0_stride(®, &p->res_low->stride); + + if (status) { + printf("cvkcv181x tiu and: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv181x/tiu_average_pooling.c b/cvikernel/src/cv181x/tiu_average_pooling.c new file mode 100644 index 000000000..cbe54d08b --- /dev/null +++ b/cvikernel/src/cv181x/tiu_average_pooling.c @@ -0,0 +1,94 @@ +#include "cvkcv181x.h" + +void cvkcv181x_tiu_average_pooling( + cvk_context_t *ctx, + const cvk_tiu_average_pooling_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->ifmap->fmt == CVK_FMT_BF16) ? 1 : 0; + + CHECK(status, p->ifmap->shape.n == p->ofmap->shape.n); + CHECK(status, p->ifmap->shape.c == p->ofmap->shape.c); + CHECK(status, p->stride_h < 32 && p->stride_h > 0); + CHECK(status, p->stride_w < 32 && p->stride_w > 0); + CHECK(status, p->pad_top < 16); + CHECK(status, p->pad_bottom < 16); + CHECK(status, p->pad_left < 16); + CHECK(status, p->pad_right < 16); + CHECK(status, p->ins_h < 15); + CHECK(status, p->ins_last_h < 15); + CHECK(status, p->ins_w < 15); + CHECK(status, p->ins_last_w < 15); + + status |= check_tiu_tensor_2(p->ifmap, p->ofmap); + if (bf16_enable) { + status |= check_bf16_stride_type_0(ctx, p->ifmap); + status |= check_bf16_stride_type_0(ctx, p->ofmap); + } else { + status |= check_stride_type_0(ctx, p->ifmap); + status |= check_stride_type_0(ctx, p->ofmap); + } + + int opd0_sign = tensor_is_signed(p->ifmap); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B; + reg.tsk_eu_typ = 1; + reg.opt_shift_typ = opd0_sign; + reg.opt_res_shift = p->rshift_bits; + reg.opt_relu_typ = 0; /* hardware relu function not verified. */ + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 
1: 0; + + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = opd0_sign; + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + reg.conv_opd0_x_ins0 = p->ins_w; + reg.conv_opd0_y_ins0 = p->ins_h; + reg.conv_opd0_x_ins0_last = p->ins_last_w; + reg.conv_opd0_y_ins0_last = p->ins_last_h; + + reg.opt_opd1_const = 1; + /*HW does not have a divide instruction, so we calculate the value here*/ + if (bf16_enable) + reg.opd1_addr = + cvk_convert_fp32_bf16( + (float)(cvk_convert_bf16_fp32(p->avg_pooling_const) / (p->kh * p->kw))); + else + reg.opd1_addr = p->avg_pooling_const; + + reg.opd1_h = p->kh; + reg.opd1_w = p->kw; + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 1; + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + reg.opd0_ins_val = bf16_enable ? + (uint32_t)p->ins_fp : (uint32_t)p->ins_val; + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv181x tiu avg pool: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv181x/tiu_convolution.c b/cvikernel/src/cv181x/tiu_convolution.c new file mode 100644 index 000000000..3309948f1 --- /dev/null +++ b/cvikernel/src/cv181x/tiu_convolution.c @@ -0,0 +1,175 @@ +#include "cvkcv181x.h" + +typedef cvk_tiu_convolution_param_t param_t; + +static int can_do_double_conv(cvk_context_t *ctx, const param_t *p) +{ + if ((p->ifmap->start_address % ctx->info.lmem_size) % 2 == 0 && + p->ifmap->shape.c % 2 == 0 && + p->ifmap->shape.c >= 4 && + p->weight->start_address % 2 == 0) + return 1; + + return 0; +} + +static int8_t check_conv_param(cvk_context_t *ctx, const param_t *p) +{ + int8_t status = 0; + uint32_t eu_num = ctx->info.eu_num; + + status |= check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight); + status |= check_stride_type_0(ctx, p->ifmap); + + CHECK(status, (p->ofmap->stride.n % eu_num) == 0); + CHECK(status, p->ifmap->start_address % eu_num == 0); + CHECK(status, p->ofmap->start_address % eu_num == 0); + CHECK(status, p->ifmap->shape.n == p->ofmap->shape.n); + CHECK(status, !(p->ifmap->shape.h == 1 && p->ins_h > 0)); + CHECK(status, p->weight->shape.n == p->ifmap->shape.c); + CHECK(status, p->weight->shape.c == p->ofmap->shape.c); + + if (p->chl_quan_param) { + status |= check_tiu_tensor(p->chl_quan_param); + status |= check_stride_type_2(ctx, p->chl_quan_param); + CHECK(status, p->chl_quan_param->start_address % eu_num == 0); + } + if (can_do_double_conv(ctx, p)) { + uint32_t lmem_i = p->ifmap->start_address % ctx->info.lmem_size; + CHECK(status, lmem_i % 2 == 0); + CHECK(status, p->ifmap->shape.c % 2 == 0); + CHECK(status, p->ifmap->shape.c >= 4); /* Otherwise performance will suffer */ + CHECK(status, p->weight->start_address % 2 == 0); + } + if(p->ps32_mode & 0x2) + { + CHECK(status, !p->relu_enable); + CHECK(status, !p->has_bias); + + CHECK(status, p->cmd_pre_exe <= 1); + } + CHECK(status, p->stride_h < 32 && p->stride_h > 0); + CHECK(status, p->stride_w < 32 && p->stride_w > 0); + CHECK(status, p->pad_top < 16); + CHECK(status, p->pad_bottom < 16); + CHECK(status, p->pad_left < 16); + 
CHECK(status, p->pad_right < 16); + CHECK(status, p->ins_h < 15); + CHECK(status, p->ins_last_h < 15); + CHECK(status, p->ins_w < 15); + CHECK(status, p->ins_last_w < 15); + CHECK(status, p->dilation_h >= 1); + CHECK(status, p->dilation_w >= 1); + CHECK(status, p->relu_enable == 0 || p->relu_enable == 1); + + return status; +} + +void cvkcv181x_tiu_convolution(cvk_context_t *ctx, const param_t *p) +{ + int8_t status = 0; + + status |= check_conv_param(ctx, p); + + int opd0_sign = tensor_is_signed(p->ifmap); + int opd1_sign = tensor_is_signed(p->weight); + int arith_shift = opd0_sign || opd1_sign; + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_CONV_FIX8B; + reg.opt_shift_typ = arith_shift; + reg.opt_relu_typ = !!(p->relu_enable); + reg.tsk_opd_num = 2; + + /*always automatically enabel double conv at those situations*/ + if (can_do_double_conv(ctx, p)) + reg.double_conv = 1; + + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = tensor_is_signed(p->ofmap); + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + reg.res0_n_str = p->ofmap->stride.n; + reg.res0_c_str = p->ofmap->stride.c; + reg.res0_h_str = p->ofmap->stride.h; + reg.res0_w_str = p->ofmap->stride.w; + reg.short_res0_str = 3; // Manual instead of h/w + reg.ps32_md = p->ps32_mode; + if (p->ps32_mode > 0) { + reg.res0_b_str = p->ofmap->shape.n * p->ofmap->stride.n; + + // Per-channel parameter does not has right shift (default is 10). + // Set zero. + reg.opt_res_shift = 0; + } + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.opd0_ins_val = (uint32_t)p->ins_val; + reg.short_opd0_str = 0; + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + reg.conv_opd0_x_ins0 = p->ins_w; + reg.conv_opd0_y_ins0 = p->ins_h; + reg.conv_opd0_x_ins0_last = p->ins_last_w; + reg.conv_opd0_y_ins0_last = p->ins_last_h; + + reg.opd1_addr = p->weight->start_address; + reg.opt_opd1_sign = opd1_sign; + reg.opt_opd1_seg = 1; + reg.opt_opd1_const = p->w_is_const; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + reg.short_opd1_str = 1; + reg.conv_opd1_x_ins0 = p->dilation_w - 1; + reg.conv_opd1_y_ins0 = p->dilation_h - 1; + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + + if (p->chl_quan_param) { + CHECK(status, p->chl_quan_param->shape.n == 1); + CHECK(status, p->chl_quan_param->shape.c == p->ofmap->shape.c); + CHECK(status, p->chl_quan_param->shape.h == 1); + CHECK(status, p->chl_quan_param->shape.w == 1); + reg.opt_chl_quan = 1; + reg.opt_res_shift = 0; // useless + reg.opd2_addr = p->chl_quan_param->start_address; + reg.opd2_n = p->chl_quan_param->shape.n; + reg.opd2_c = p->chl_quan_param->shape.c; + reg.opd2_h = p->chl_quan_param->shape.h; + reg.opd2_w = p->chl_quan_param->shape.w; + } + reg.opt_opd2_seg = 1; // useless, force to 1 to skip b_stride check + reg.short_opd2_str = 2; // useless + reg.opd2_b_str = 0; // useless + + if (p->has_bias) { + reg.tsk_opd_num = 3; + reg.opt_opd2_sign = 1; + } + + reg.layer_info = p->layer_id; + reg.cmd_pre_exe_typ = p->cmd_pre_exe_typ; + reg.cmd_pre_exe = 
p->cmd_pre_exe; + + if (status) { + printf("cvkcv181x tiu conv: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv181x/tiu_copy.c b/cvikernel/src/cv181x/tiu_copy.c new file mode 100644 index 000000000..e30def4d3 --- /dev/null +++ b/cvikernel/src/cv181x/tiu_copy.c @@ -0,0 +1,47 @@ +#include "cvkcv181x.h" + +void cvkcv181x_tiu_copy( + cvk_context_t *ctx, + const cvk_tiu_copy_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->src->fmt == CVK_FMT_BF16) ? 1 : 0; + + status |= check_tiu_tensor_2(p->dst, p->src); + status |= check_same_shape(p->dst, p->src); + status |= check_stride_range(p->dst->stride); + status |= check_stride_range(p->src->stride); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_COPY_FIX8B; + reg.tsk_opd_num = 1; + reg.opd_typ = bf16_enable ? 1: 0; + reg.opt_res_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->dst->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->src->start_address; + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 1; + fill_opd0_stride(®, &p->src->stride); + + reg.res0_addr = p->dst->start_address; + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 1; + fill_res0_stride(®, &p->dst->stride); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv181x tiu copy: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv181x/tiu_depthwise_convolution.c b/cvikernel/src/cv181x/tiu_depthwise_convolution.c new file mode 100644 index 000000000..4096a2b11 --- /dev/null +++ b/cvikernel/src/cv181x/tiu_depthwise_convolution.c @@ -0,0 +1,147 @@ +#include "cvkcv181x.h" + +void cvkcv181x_tiu_depthwise_convolution( + cvk_context_t *ctx, + const cvk_tiu_depthwise_convolution_param_t *p) +{ + int8_t status = 0; + uint32_t eu_num = ctx->info.eu_num; + + int8_t isMulConst = (p->weight_is_const == 1) ? 
1 : 0; + + if(isMulConst) { + status |= check_tiu_tensor_2(p->ifmap, p->ofmap); + } else { + status |= check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight); + } + status |= check_stride_type_0(ctx, p->ifmap); + if(!isMulConst){ + status |= check_stride_type_0(ctx, p->weight); + } + status |= check_tiu_tensor(p->chl_quan_param); + status |= check_stride_type_2(ctx, p->chl_quan_param); + + CHECK(status, (p->ofmap->stride.n % eu_num) == 0); + CHECK(status, p->chl_quan_param->start_address %eu_num == 0); + CHECK(status, p->ifmap->shape.n == p->ofmap->shape.n); + CHECK(status, p->ifmap->shape.c == p->ofmap->shape.c); + if (!isMulConst) { + CHECK(status, p->ifmap->shape.c == p->weight->shape.c); + CHECK(status, p->weight->shape.n == 1); + } + CHECK(status, p->relu_enable == 0 || p->relu_enable == 1); + CHECK(status, p->stride_h < 32 && p->stride_h > 0); + CHECK(status, p->stride_w < 32 && p->stride_w > 0); + CHECK(status, p->pad_top < 16); + CHECK(status, p->pad_bottom < 16); + CHECK(status, p->pad_left < 16); + CHECK(status, p->pad_right < 16); + CHECK(status, p->ins_h < 15); + CHECK(status, p->ins_last_h < 15); + CHECK(status, p->ins_w < 15); + CHECK(status, p->ins_last_w < 15); + CHECK(status, p->dilation_h >= 1); + CHECK(status, p->dilation_w >= 1); + + int opd0_sign = tensor_is_signed(p->ifmap); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B; + reg.tsk_eu_typ = 2; + reg.opt_relu_typ = p->relu_enable; + reg.opt_shift_typ = 1; + reg.tsk_opd_num = 2; + + int res0_sign = tensor_is_signed(p->ofmap); + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = res0_sign; + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + reg.res0_n_str = p->ofmap->stride.n; + reg.res0_c_str = p->ofmap->stride.c; + reg.res0_h_str = p->ofmap->stride.h; + reg.res0_w_str = p->ofmap->stride.w; + reg.short_res0_str = 3; // Manual instead of h/w + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.opd0_n_str = p->ifmap->stride.n; + reg.opd0_c_str = p->ifmap->stride.c; + reg.opd0_h_str = p->ifmap->stride.h; + reg.opd0_w_str = p->ifmap->stride.w; + reg.opd0_ins_val = (uint32_t)p->ins_val; + reg.short_opd0_str = 3; // Manual instead of h/w + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + reg.conv_opd0_x_ins0 = p->ins_w; + reg.conv_opd0_y_ins0 = p->ins_h; + reg.conv_opd0_x_ins0_last = p->ins_last_w; + reg.conv_opd0_y_ins0_last = p->ins_last_h; + + reg.opt_opd1_sign = 1; + reg.opt_opd1_seg = 1; + reg.conv_opd1_x_ins0 = p->dilation_w - 1; + reg.conv_opd1_y_ins0 = p->dilation_h - 1; + if (isMulConst) { + reg.opt_opd1_const = 1; + reg.opt_opd1_sign = p->weight_const.is_signed; + reg.opd1_addr = p->weight_const.val; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + } else { + reg.opd1_addr = p->weight->start_address; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + } + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + reg.cmd_pre_exe_typ = p->cmd_pre_exe_typ; + 
reg.cmd_pre_exe = p->cmd_pre_exe; + + CHECK(status, p->chl_quan_param->shape.n == 1); + CHECK(status, p->chl_quan_param->shape.c == p->ofmap->shape.c); + CHECK(status, p->chl_quan_param->shape.h == 1); + CHECK(status, p->chl_quan_param->shape.w == 1); + reg.opt_chl_quan = 1; + reg.opt_res_shift = 0; // useless + reg.opd2_addr = p->chl_quan_param->start_address; + reg.opd2_n = p->chl_quan_param->shape.n; + reg.opd2_c = p->chl_quan_param->shape.c; + reg.opd2_h = p->chl_quan_param->shape.h; + reg.opd2_w = p->chl_quan_param->shape.w; + reg.opt_opd2_seg = 1; // useless, force to 1 to skip b_stride check + reg.short_opd2_str = 2; // useless + reg.opd2_b_str = 0; // useless + + if (p->has_bias) { + reg.tsk_opd_num = 3; + reg.opt_opd2_sign = 1; + } + + reg.layer_info = p->layer_id; + reg.cmd_pre_exe_typ = p->cmd_pre_exe_typ; + reg.cmd_pre_exe = p->cmd_pre_exe; + + if (status) { + printf("cvkcv181x tiu_dw_conv: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv181x/tiu_ge.c b/cvikernel/src/cv181x/tiu_ge.c new file mode 100644 index 000000000..2b58e3add --- /dev/null +++ b/cvikernel/src/cv181x/tiu_ge.c @@ -0,0 +1,123 @@ +#include "cvkcv181x.h" + +#if 0 +void cvkcv181x_tiu_ge( + cvk_context_t *ctx, + const cvk_tiu_ge_param_t *p) +{ + int8_t status = 0; + status |= check_tiu_tensor_2(p->ge, p->a); + status |= check_same_shape(p->ge, p->a); + if (p->b_is_const) { + if (tensor_is_signed(p->a)) + CHECK(status, p->b_const.is_signed); + else + CHECK(status, !p->b_const.is_signed); + } else { + status |= check_tiu_tensor(p->b); + status |= check_same_shape(p->ge, p->b); + CHECK(status, p->a->fmt == p->b->fmt); + } + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_GE_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = p->b_const.val; + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->ge->start_address; + reg.opt_res0_sign = tensor_is_signed(p->ge); + fill_res0_stride(®, &p->ge->stride); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv181x tiu ge: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} +#endif + +void cvkcv181x_tiu_ge( + cvk_context_t *ctx, + const cvk_tiu_ge_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->a->fmt == CVK_FMT_BF16) ? 1 : 0; + + status |= check_tiu_tensor_2(p->ge, p->a); + status |= check_same_shape(p->ge, p->a); + + if (p->b_is_const && !bf16_enable) { + if (tensor_is_signed(p->a)) + CHECK(status, p->b_const.is_signed); + else + CHECK(status, !p->b_const.is_signed); + } else if (!p->b_is_const) { + status |= check_tiu_tensor(p->b); + status |= check_same_shape(p->ge, p->b); + CHECK(status, p->a->fmt == p->b->fmt); + } + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_GE_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 
1: 0; + reg.opt_res_shift = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = bf16_enable ? p->b_const.val : (p->b_const.val & 0xFF); + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->ge->start_address; + reg.opt_res0_sign = tensor_is_signed(p->ge); + fill_res0_stride(®, &p->ge->stride); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv181x tiu ge: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv181x/tiu_lookup_table.c b/cvikernel/src/cv181x/tiu_lookup_table.c new file mode 100644 index 000000000..ac3d0fc76 --- /dev/null +++ b/cvikernel/src/cv181x/tiu_lookup_table.c @@ -0,0 +1,118 @@ +#include "cvkcv181x.h" + +void cvkcv181x_tiu_lookup_table( + cvk_context_t *ctx, + const cvk_tiu_lookup_table_param_t *p) +{ + int8_t status = 0; + uint32_t eu_num = ctx->info.eu_num; + uint32_t npu_num = ctx->info.npu_num; + + status |= check_tiu_tensor_3(p->ofmap, p->ifmap, p->table); + status |= check_stride_type_0(ctx, p->ofmap); + status |= check_stride_type_0(ctx, p->ifmap); + status |= check_stride_type_0(ctx, p->table); + + uint8_t is_bf16 = (p->ofmap->fmt == CVK_FMT_BF16 && p->ifmap->fmt == CVK_FMT_BF16); + + CHECK(status, p->table->shape.n == 1); + CHECK(status, p->table->shape.c == npu_num); + + if (is_bf16) { + CHECK(status, p->table->shape.h == 32); + CHECK(status, p->table->shape.w == 8); + } + else { + CHECK(status, p->table->shape.h == 16); + CHECK(status, p->table->shape.w == 16); + } + + CHECK(status, p->ifmap->start_address % eu_num == 0); + CHECK(status, p->ofmap->start_address % eu_num == 0); + CHECK(status, p->table->start_address % eu_num == 0); + + // fmt MUST be same under bf16 + if (p->ofmap->fmt == CVK_FMT_BF16) { + CHECK(status, p->ifmap->fmt == CVK_FMT_BF16); + } + CHECK(status, p->ofmap->fmt == CVK_FMT_I8 || p->ofmap->fmt == CVK_FMT_U8 || p->ofmap->fmt == CVK_FMT_BF16); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + //reg.tens_lookup = 1; + reg.tsk_opd_num = 2; + reg.opt_shift_typ = 0; + reg.opt_res_shift = 0; + reg.opt_relu_typ = 0; + reg.opd_typ = is_bf16; + + reg.res0_addr = p->ofmap->start_address; + if (is_bf16) { + reg.opt_res0_sign = 1; + reg.opt_res0_seg = 1; + } + else { + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 1; + } + + // ifmap->shape.n == p->ofmap->shape.n); + CHECK(status, p->ifmap->shape.c == p->ofmap->shape.c); + CHECK(status, p->ifmap->shape.h == p->ofmap->shape.h); + CHECK(status, p->ifmap->shape.w == p->ofmap->shape.w); + + reg.res0_n = p->ifmap->shape.n; + reg.res0_c = p->ifmap->shape.c; + reg.res0_h = p->ifmap->shape.h; + reg.res0_w = p->ifmap->shape.w; + reg.short_res0_str = 0; + + reg.opd0_addr = p->ifmap->start_address; + if (is_bf16) { + reg.opt_opd0_sign = 1; + reg.opt_opd0_seg = 1; + } + else { + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 1; + } + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.short_opd0_str = 0; + + reg.opd1_addr = p->table->start_address; + if (is_bf16) { + reg.opt_opd1_sign = 1; 
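+ // Descriptive note (editor-added): in bf16 mode the lookup-table operand is marked signed here, while the int8/uint8 branch below clears opt_opd1_sign.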
+ reg.opt_opd1_seg = 1; + } + else { + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 1; + } + reg.opd1_n = p->table->shape.n; + reg.opd1_c = p->table->shape.c; + reg.opd1_h = p->table->shape.h; + reg.opd1_w = p->table->shape.w; + reg.short_opd1_str = 0; + reg.tsk_eu_typ = 12; // 12 means lut + if (is_bf16) { + reg.opt_opd2_seg = 1; // hw check + // don't care once short_xxx_str is set to 0 + } + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv181x tiu lookup: wrong parameter\n"); + return; + } + + //trace_tiu_reg(&reg, __FUNCTION__); + + (void *)emit_tiu_cmdbuf(ctx, &reg); +} diff --git a/cvikernel/src/cv181x/tiu_mac.c b/cvikernel/src/cv181x/tiu_mac.c new file mode 100644 index 000000000..ea607b579 --- /dev/null +++ b/cvikernel/src/cv181x/tiu_mac.c @@ -0,0 +1,73 @@ +#include "cvkcv181x.h" + +void cvkcv181x_tiu_mac( + cvk_context_t *ctx, + const cvk_tiu_mac_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->a->fmt == CVK_FMT_BF16) ? 1 : 0; + + status |= check_tiu_tensor(p->a); + status |= check_same_shape(p->res_low, p->a); + if(!bf16_enable) { + status |= check_16bit_tiu_tensor(p->res_low, p->res_high); + CHECK(status, p->lshift_bits < 32); + CHECK(status, p->rshift_bits < 16); + } + if (!p->b_is_const) { + status |= check_tiu_tensor(p->b); + status |= check_same_shape(p->res_low, p->b); + } + CHECK(status, p->relu_enable == 0 || p->relu_enable == 1); + + tiu_reg_t reg; + reset_tiu_reg(&reg); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_MAC_FIX8B; + reg.opt_res_add = 1; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 1: 0; + reg.opt_left_shift = p->lshift_bits; + reg.opt_relu_typ = p->relu_enable; + fill_same_tensor_shape(&reg, p->a->shape); + fill_same_tensor_stride_type(&reg, 0b11); + + int arith_shift = tensor_is_signed(p->res_low); + reg.opt_shift_typ = arith_shift; + reg.opt_res_shift = p->rshift_bits; + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(&reg, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = bf16_enable ? p->b_const.val : (p->b_const.val & 0xFF); + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(&reg, &p->b->stride); + } + + reg.res0_addr = p->res_low->start_address; + reg.opt_res0_sign = tensor_is_signed(p->res_low); + reg.opt_res0_seg = bf16_enable ? 1 : !!p->res_is_int8; + fill_res0_stride(&reg, &p->res_low->stride); + reg.res0_b_str = bf16_enable ? 0 : (p->res_high->start_address - p->res_low->start_address); + + if (p->relu_enable) + CHECK(status, reg.opt_res0_seg); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv181x tiu mac: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, &reg); +} diff --git a/cvikernel/src/cv181x/tiu_matrix_multiplication.c b/cvikernel/src/cv181x/tiu_matrix_multiplication.c new file mode 100644 index 000000000..a6b3feac6 --- /dev/null +++ b/cvikernel/src/cv181x/tiu_matrix_multiplication.c @@ -0,0 +1,160 @@ +#include "cvkcv181x.h" +#include <assert.h> + +static int8_t check_matrix(cvk_context_t *ctx, const cvk_ml_t *m) +{ + int8_t status = 0; + cvk_tl_t t; + t.start_address = m->start_address; + t.fmt = m->fmt; + t.shape.n = m->shape.n; + t.shape.c = m->shape.c; + t.shape.h = 1; + t.shape.w = m->shape.w; + t.stride.n = m->stride.n; + t.stride.c = m->stride.c; + t.stride.h = m->stride.h; + t.stride.w = 1 * (m->fmt == CVK_FMT_BF16 ?
2 : 1); + + status |= check_tiu_tensor(&t); + status |= check_stride_type_0(ctx, &t); + + uint32_t eu_num = ctx->info.eu_num; + CHECK(status, m->start_address % eu_num == 0); + + return status; +} + +static int is_arith_shift(const cvk_tiu_matrix_multiplication_param_t *p) +{ + if (p->left->fmt == CVK_FMT_I8) + return 1; + if (p->right->fmt == CVK_FMT_I8) + return 1; + if (p->bias && p->bias->fmt == CVK_FMT_I8) + return 1; + + return 0; +} + +void cvkcv181x_tiu_matrix_multiplication(cvk_context_t *ctx, const cvk_tiu_matrix_multiplication_param_t *p) +{ + int8_t status = 0; + const cvk_ml_t *res = p->res; + const cvk_ml_t *left = p->left; + const cvk_ml_t *right = p->right; + const cvk_ml_t *bias = p->bias; + int bf16_enable = (res->fmt == CVK_FMT_BF16) ? 1 : 0; + + status |= check_matrix(ctx, res); + status |= check_matrix(ctx, left); + status |= check_matrix(ctx, right); + if (bias) + status |= check_matrix(ctx, bias); + + CHECK(status, p->lshift_bits < 32); + if (bf16_enable) /* bf16 does not support add_result*/ + CHECK(status, !p->add_result); + else + CHECK(status, !(p->relu_enable && p->add_result)); + + if(p->ps32_mode & 0x2) + { + CHECK(status, !p->relu_enable); + CHECK(status, !p->bias); + CHECK(status, !p->rshift_bits); + } + CHECK(status, p->relu_enable == 0 || p->relu_enable == 1); + + uint32_t left_row = left->shape.n; + uint32_t left_col = left->shape.col; + uint32_t right_row = right->shape.n; + uint32_t right_col = right->shape.col; + uint32_t res_row = res->shape.n; + uint32_t res_col = res->shape.col; + CHECK(status, left_col == right_row); + CHECK(status, res_col == right_col); + + if(p->ps32_mode) + { + CHECK(status, !p->add_result); + } else if ((p->add_result || !p->res_is_int8) && !bf16_enable) { + CHECK(status, res_row == left_row * 2); + res_row = left_row; + } else { + CHECK(status, res_row == left_row); + } + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_FC_FIX8B; + reg.tsk_opd_num = bias? 3: 2; + reg.opd_typ = bf16_enable ? 1 : 0; + reg.opt_shift_typ = is_arith_shift(p); + reg.opt_res_shift = p->rshift_bits; + reg.opt_left_shift = p->lshift_bits; + reg.opt_relu_typ = p->relu_enable; + reg.opt_res_add = p->add_result; + + reg.res0_addr = res->start_address; + reg.opt_res0_seg = (bf16_enable ? 
1 : p->res_is_int8); + + reg.opt_res0_sign = matrix_is_signed(res); + reg.res0_n = res_row; + reg.res0_c = res->shape.c; + reg.res0_h = 1; + reg.res0_w = res->shape.w; + reg.short_res0_str = 0; // stride, b_stride calculated by H/W + + reg.opd0_addr = left->start_address; + reg.opt_opd0_seg = 1; + reg.opt_opd0_sign = (left->fmt == CVK_FMT_I8); + reg.opd0_n = left_row; + reg.opd0_c = left->shape.c; + reg.opd0_h = 1; + reg.opd0_w = left->shape.w; + reg.short_opd0_str = 0; + + reg.opd1_addr = right->start_address; + reg.opt_opd1_seg = 1; + reg.opt_opd1_sign = (right->fmt == CVK_FMT_I8); + reg.opd1_n = right_row; + reg.opd1_c = right->shape.c; + reg.opd1_h = 1; + reg.opd1_w = left_col - left->shape.w * (left->shape.c - 1); + reg.short_opd1_str = 0; + + reg.ps32_md = p->ps32_mode; + if (p->ps32_mode > 0) + reg.res0_b_str = p->res->shape.n * p->res->stride.n; + if(reg.opd0_c == 1) + CHECK(status, reg.opd0_w == reg.opd1_w); + + if (bias) { + CHECK(status, bias->shape.n == 2); + CHECK(status, bias->shape.c == right->shape.c); + CHECK(status, bias->shape.w == right->shape.w); + CHECK(status, bias->shape.col == right->shape.col); + + reg.opd2_addr = bias->start_address; + reg.opt_opd2_seg = 0; + reg.opt_opd2_sign = (bias->fmt == CVK_FMT_I8); + reg.opd2_n = 1; + reg.opd2_c = bias->shape.c; + reg.opd2_h = 1; + reg.opd2_w = bias->shape.w; + reg.short_opd2_str = 0; + } + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv181x tiu matrix: wrong parameter"); + assert(0); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv181x/tiu_matrix_multiplication_qm.c b/cvikernel/src/cv181x/tiu_matrix_multiplication_qm.c new file mode 100644 index 000000000..5afad1192 --- /dev/null +++ b/cvikernel/src/cv181x/tiu_matrix_multiplication_qm.c @@ -0,0 +1,153 @@ +#include "cvkcv181x.h" + +static int8_t check_matrix(cvk_context_t *ctx, const cvk_ml_t *m) +{ + int8_t status = 0; + cvk_tl_t t; + t.start_address = m->start_address; + t.fmt = m->fmt; + t.shape.n = m->shape.n; + t.shape.c = m->shape.c; + t.shape.h = 1; + t.shape.w = m->shape.w; + t.stride.n = m->stride.n; + t.stride.c = m->stride.c; + t.stride.h = m->stride.h; + t.stride.w = 1; + + status |= check_tiu_tensor(&t); + status |= check_stride_type_0(ctx, &t); + + uint32_t eu_num = ctx->info.eu_num; + CHECK(status, m->start_address % eu_num == 0); + + return status; +} + +static int is_arith_shift(const cvk_tiu_matrix_multiplication_qm_param_t *p) +{ + if (p->left->fmt == CVK_FMT_I8) + return 1; + if (p->right->fmt == CVK_FMT_I8) + return 1; + if (p->bias && p->bias->fmt == CVK_FMT_I8) + return 1; + + return 0; +} + +void cvkcv181x_tiu_matrix_multiplication_qm(cvk_context_t *ctx, const cvk_tiu_matrix_multiplication_qm_param_t *p) +{ + int8_t status = 0; + const cvk_ml_t *res = p->res; + const cvk_ml_t *left = p->left; + const cvk_ml_t *right = p->right; + const cvk_ml_t *bias = p->bias; + + status |= check_matrix(ctx, res); + status |= check_matrix(ctx, left); + status |= check_matrix(ctx, right); + if (bias) + status |= check_matrix(ctx, bias); + + CHECK(status, p->lshift_bits < 32); + CHECK(status, !(p->relu_enable && p->add_result)); + if(p->ps32_mode & 0x2) + { + CHECK(status, !p->relu_enable); + CHECK(status, !p->bias); + CHECK(status, !p->rshift_bits); + } + CHECK(status, p->relu_enable == 0 || p->relu_enable == 1); + + uint32_t left_row = left->shape.n; + uint32_t left_col = left->shape.col; + uint32_t right_row = right->shape.n; + uint32_t right_col = right->shape.col; + uint32_t res_row = res->shape.n; + uint32_t 
res_col = res->shape.col; + CHECK(status, left_col == right_row); + CHECK(status, res_col == right_col); + CHECK(status, p->res_is_int8 == 1); + + if(p->ps32_mode) + { + CHECK(status, !p->add_result); + } + else if (p->add_result) { + CHECK(status, res_row == left_row * 2); + res_row = left_row; + } else { + CHECK(status, res_row == left_row); + } + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_FC_FIX8B; + reg.tsk_opd_num = bias? 3: 2; + reg.opt_shift_typ = is_arith_shift(p); + reg.opt_res_shift = p->rshift_bits; + reg.opt_left_shift = p->lshift_bits; + reg.opt_relu_typ = p->relu_enable; + reg.opt_res_add = p->add_result; + + reg.res0_addr = res->start_address; + reg.opt_res0_seg = 1; + reg.opt_res0_sign = matrix_is_signed(res); + reg.res0_n = res_row; + reg.res0_c = res->shape.c; + reg.res0_h = 1; + reg.res0_w = res->shape.w; + reg.short_res0_str = 0; // stride, b_stride calculated by H/W + + reg.opd0_addr = left->start_address; + reg.opt_opd0_seg = 1; + reg.opt_opd0_sign = (left->fmt == CVK_FMT_I8); + reg.opd0_n = left_row; + reg.opd0_c = left->shape.c; + reg.opd0_h = 1; + reg.opd0_w = left->shape.w; + reg.short_opd0_str = 0; + + reg.opd1_addr = right->start_address; + reg.opt_opd1_seg = 1; + reg.opt_opd1_sign = (right->fmt == CVK_FMT_I8); + reg.opd1_n = right_row; + reg.opd1_c = right->shape.c; + reg.opd1_h = 1; + reg.opd1_w = left_col - left->shape.w * (left->shape.c - 1); + reg.short_opd1_str = 0; + + reg.ps32_md = p->ps32_mode; + if (p->ps32_mode > 0) + reg.res0_b_str = p->res->shape.n * p->res->stride.n; + if(reg.opd0_c == 1) + CHECK(status, reg.opd0_w == reg.opd1_w); + + // Only enable 32-bit multiplier at the final post processing stage + reg.opt_chl_quan = ((p->ps32_mode == 0) || (p->ps32_mode == 1)) ? 1 : 0; + reg.quan_m = p->quan_m; + + // 32b bias, determined by b_stride + if (bias) { + CHECK(status, bias->shape.n == 4); + CHECK(status, bias->shape.c == right->shape.c); + CHECK(status, bias->shape.w == right->shape.w); + CHECK(status, bias->shape.col == right->shape.col); + + reg.opd2_addr = bias->start_address; + reg.opt_opd2_seg = 0; + reg.opt_opd2_sign = (bias->fmt == CVK_FMT_I8); + reg.opd2_n = 1; + reg.opd2_c = bias->shape.c; + reg.opd2_h = 1; + reg.opd2_w = bias->shape.w; + reg.short_opd2_str = 0; + } + + reg.layer_info = p->layer_id; + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv181x/tiu_max.c b/cvikernel/src/cv181x/tiu_max.c new file mode 100644 index 000000000..e722b58d0 --- /dev/null +++ b/cvikernel/src/cv181x/tiu_max.c @@ -0,0 +1,62 @@ +#include "cvkcv181x.h" + +void cvkcv181x_tiu_max( + cvk_context_t *ctx, + const cvk_tiu_max_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->a->fmt == CVK_FMT_BF16) ? 1 : 0; + + status |= check_tiu_tensor_2(p->max, p->a); + status |= check_same_shape(p->max, p->a); + + if (p->b_is_const && !bf16_enable) { + if (tensor_is_signed(p->a)) + CHECK(status, p->b_const.is_signed); + else + CHECK(status, !p->b_const.is_signed); + } else if (!p->b_is_const) { + status |= check_tiu_tensor(p->b); + status |= check_same_shape(p->max, p->b); + CHECK(status, p->a->fmt == p->b->fmt); + } + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_MAX_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 
1: 0; + reg.opt_res_shift = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = bf16_enable ? p->b_const.val : (p->b_const.val & 0xFF); + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->max->start_address; + reg.opt_res0_sign = tensor_is_signed(p->max); + fill_res0_stride(®, &p->max->stride); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv181x tiu max: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv181x/tiu_max_pooling.c b/cvikernel/src/cv181x/tiu_max_pooling.c new file mode 100644 index 000000000..f1c011aaf --- /dev/null +++ b/cvikernel/src/cv181x/tiu_max_pooling.c @@ -0,0 +1,74 @@ +#include "cvkcv181x.h" + +void cvkcv181x_tiu_max_pooling( + cvk_context_t *ctx, + const cvk_tiu_max_pooling_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->ifmap->fmt == CVK_FMT_BF16); + + status |= check_tiu_tensor_2(p->ifmap, p->ofmap); + CHECK(status, p->kh * p->kw >= 1); + CHECK(status, p->ifmap->shape.n == p->ofmap->shape.n); + CHECK(status, p->ifmap->shape.c == p->ofmap->shape.c); + CHECK(status, p->stride_h < 32 && p->stride_h > 0 && "stride_h should be in [1, 31] range"); + CHECK(status, p->stride_w < 32 && p->stride_w > 0 && "stride_w should be in [1, 31] range"); + if (bf16_enable) { + status |= check_bf16_stride_type_0(ctx, p->ifmap); + status |= check_bf16_stride_type_0(ctx, p->ofmap); + } else { + status |= check_stride_type_0(ctx, p->ifmap); + status |= check_stride_type_0(ctx, p->ofmap); + } + int opd0_sign = tensor_is_signed(p->ifmap); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B; + reg.tsk_eu_typ = 0; + reg.opt_relu_typ = 0; /* Hardware relu function not validated. */ + reg.opt_res_shift = 0; + reg.opt_shift_typ = opd0_sign; + reg.tsk_opd_num = 1; + reg.opd_typ = bf16_enable ? 1: 0; + + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = opd0_sign; + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + if (bf16_enable) { + reg.opd0_ins_val = p->ins_fp; + } else { + //reg.opd0_ins_val = bf16_enable ? 0 : (uint32_t)p->ins_val; + reg.opd0_ins_val = (!p->ins_val && opd0_sign) ? 
-128 : p->ins_val; // backend not set yet + } + + reg.opt_opd1_seg = 1; + reg.opd1_h = p->kh; + reg.opd1_w = p->kw; + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv181x tiu max pool: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv181x/tiu_min.c b/cvikernel/src/cv181x/tiu_min.c new file mode 100644 index 000000000..30ab338f0 --- /dev/null +++ b/cvikernel/src/cv181x/tiu_min.c @@ -0,0 +1,63 @@ +#include "cvkcv181x.h" + +void cvkcv181x_tiu_min( + cvk_context_t *ctx, + const cvk_tiu_min_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->a->fmt == CVK_FMT_BF16) ? 1 : 0; + + status |= check_tiu_tensor_2(p->min, p->a); + status |= check_same_shape(p->min, p->a); + if (p->b_is_const && !bf16_enable) { + if (tensor_is_signed(p->a)) + CHECK(status, p->b_const.is_signed); + else + CHECK(status, !p->b_const.is_signed); + } else if (!p->b_is_const) { + status |= check_tiu_tensor(p->b); + status |= check_same_shape(p->min, p->b); + CHECK(status, p->a->fmt == p->b->fmt); + } + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_MIN_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 1: 0; + reg.opt_res_shift = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = bf16_enable ? p->b_const.val : (p->b_const.val & 0xFF); + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->min->start_address; + reg.opt_res0_sign = tensor_is_signed(p->min); + fill_res0_stride(®, &p->min->stride); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv181x tiu min: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv181x/tiu_min_pooling.c b/cvikernel/src/cv181x/tiu_min_pooling.c new file mode 100644 index 000000000..80b93c1a3 --- /dev/null +++ b/cvikernel/src/cv181x/tiu_min_pooling.c @@ -0,0 +1,140 @@ +#include "cvkcv181x.h" + +#if 0 +void cvkcv181x_tiu_min_pooling( + cvk_context_t *ctx, + const cvk_tiu_min_pooling_param_t *p) +{ + int8_t status = 0; + status |= check_tiu_tensor_2(p->ifmap, p->ofmap); + CHECK(status, p->kh * p->kw > 1); + CHECK(status, p->ifmap->shape.n == p->ofmap->shape.n); + CHECK(status, p->ifmap->shape.c == p->ofmap->shape.c); + status |= check_stride_type_0(ctx, p->ifmap); + status |= check_stride_type_0(ctx, p->ofmap); + + int opd0_sign = tensor_is_signed(p->ifmap); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B; + reg.tsk_eu_typ = 3; + reg.opt_relu_typ = 0; /* Hardware relu function not validated. 
*/ + reg.opt_res_shift = 0; + reg.opt_shift_typ = opd0_sign; + reg.tsk_opd_num = 1; + + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = opd0_sign; + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + if (opd0_sign) + reg.opd0_ins_val = (uint16_t)127; + else + reg.opd0_ins_val = (uint16_t)255; + reg.opt_opd1_seg = 1; + reg.opd1_h = p->kh; + reg.opd1_w = p->kw; + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv181x tiu min pool: wrong parameter\n"); + return; + } + + (void *) emit_tiu_cmdbuf(ctx, ®); +} +#endif + +void cvkcv181x_tiu_min_pooling( + cvk_context_t *ctx, + const cvk_tiu_min_pooling_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->ifmap->fmt == CVK_FMT_BF16) ? 1 : 0; + + status |= check_tiu_tensor_2(p->ifmap, p->ofmap); + CHECK(status, p->kh * p->kw > 1); + CHECK(status, p->ifmap->shape.n == p->ofmap->shape.n); + CHECK(status, p->ifmap->shape.c == p->ofmap->shape.c); + if (bf16_enable) { + status |= check_bf16_stride_type_0(ctx, p->ifmap); + status |= check_bf16_stride_type_0(ctx, p->ofmap); + } else { + status |= check_stride_type_0(ctx, p->ifmap); + status |= check_stride_type_0(ctx, p->ofmap); + } + int opd0_sign = tensor_is_signed(p->ifmap); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B; + reg.tsk_eu_typ = 3; + reg.opt_relu_typ = 0; /* Hardware relu function not validated. */ + reg.opt_res_shift = 0; + reg.opt_shift_typ = opd0_sign; + reg.tsk_opd_num = 1; + reg.opd_typ = bf16_enable ? 
1: 0; + + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = opd0_sign; + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + + if (!bf16_enable) { + if (opd0_sign) + reg.opd0_ins_val = (uint16_t)127; + else + reg.opd0_ins_val = (uint16_t)255; + } else + reg.opd0_ins_val = p->ins_fp; + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + + reg.opt_opd1_seg = 1; + reg.opd1_h = p->kh; + reg.opd1_w = p->kw; + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv181x tiu min pool: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv181x/tiu_mul.c b/cvikernel/src/cv181x/tiu_mul.c new file mode 100644 index 000000000..485095840 --- /dev/null +++ b/cvikernel/src/cv181x/tiu_mul.c @@ -0,0 +1,72 @@ +#include "cvkcv181x.h" + +void cvkcv181x_tiu_mul( + cvk_context_t *ctx, + const cvk_tiu_mul_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->res_low->fmt == CVK_FMT_BF16) ? 1 : 0; + + status |= check_tiu_tensor_2(p->res_low, p->a); + status |= check_same_shape(p->res_low, p->a); + if (!p->b_is_const) { + status |= check_tiu_tensor(p->b); + status |= check_same_shape(p->res_low, p->b); + } + if (p->res_high) + status |= check_16bit_tiu_tensor(p->res_low, p->res_high); + CHECK(status, p->relu_enable == 0 || p->relu_enable == 1); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_MUL_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 1: 0; + int arith_shift = tensor_is_signed(p->res_low); + reg.opt_shift_typ = arith_shift; + reg.opt_res_shift = p->rshift_bits; + reg.opt_relu_typ = p->relu_enable; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = bf16_enable ? 
p->b_const.val : (p->b_const.val & 0xFF); + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->res_low->start_address; + reg.opt_res0_sign = tensor_is_signed(p->res_low); + reg.opt_res0_seg = (p->res_high == NULL); + fill_res0_stride(®, &p->res_low->stride); + if (p->res_high) + reg.res0_b_str = (p->res_high->start_address - p->res_low->start_address); + if (p->relu_enable) + CHECK(status, reg.opt_res0_seg); + + CHECK(status, ( + p->b_is_const || (!reg.opt_opd1_sign && !reg.opt_opd0_sign && !reg.opt_shift_typ) || + ((reg.opt_opd1_sign || reg.opt_opd0_sign) && reg.opt_shift_typ) + )); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv181x tiu mul: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv181x/tiu_mul_qm.c b/cvikernel/src/cv181x/tiu_mul_qm.c new file mode 100644 index 000000000..d2905611f --- /dev/null +++ b/cvikernel/src/cv181x/tiu_mul_qm.c @@ -0,0 +1,71 @@ +#include "cvkcv181x.h" + +void cvkcv181x_tiu_mul_qm( + cvk_context_t *ctx, + const cvk_tiu_mul_qm_param_t *p) +{ + int8_t status = 0; + status |= check_tiu_tensor_2(p->res_low, p->a); + status |= check_same_shape(p->res_low, p->a); + if (!p->b_is_const) { + status |= check_tiu_tensor(p->b); + status |= check_same_shape(p->res_low, p->b); + } + if (p->res_high) + status |= check_16bit_tiu_tensor(p->res_low, p->res_high); + CHECK(status, p->relu_enable == 0 || p->relu_enable == 1); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_MUL_FIX8B; + reg.tsk_opd_num = 2; + int arith_shift = tensor_is_signed(p->res_low); + reg.opt_shift_typ = arith_shift; + reg.opt_res_shift = p->rshift_bits; + reg.opt_relu_typ = p->relu_enable; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a); + fill_opd0_stride(®, &p->a->stride); + + if (p->b_is_const) { + reg.opt_opd1_const = 1; + reg.opd1_addr = p->b_const.val; + reg.opt_opd1_sign = !!p->b_const.is_signed; + } else { + reg.opt_opd1_const = 0; + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b); + fill_opd1_stride(®, &p->b->stride); + } + + reg.res0_addr = p->res_low->start_address; + reg.opt_res0_sign = tensor_is_signed(p->res_low); + reg.opt_res0_seg = (p->res_high == NULL); + fill_res0_stride(®, &p->res_low->stride); + if (p->res_high) + reg.res0_b_str = p->res_high->start_address - p->res_low->start_address; + if (p->relu_enable) + CHECK(status, reg.opt_res0_seg); + + CHECK(status, ( + (!reg.opt_opd1_sign && !reg.opt_opd0_sign && !reg.opt_shift_typ) || + ((reg.opt_opd1_sign || reg.opt_opd0_sign) && reg.opt_shift_typ) + )); + + reg.opt_chl_quan = 1; + reg.quan_m = p->multiplier; + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv181x tiu mul qm: wrong parameter\n"); + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv181x/tiu_or.c b/cvikernel/src/cv181x/tiu_or.c new file mode 100644 index 000000000..3cb3b6c6a --- /dev/null +++ b/cvikernel/src/cv181x/tiu_or.c @@ -0,0 +1,112 @@ +#include "cvkcv181x.h" + +void cvkcv181x_tiu_or_int8( + cvk_context_t *ctx, + const cvk_tiu_or_int8_param_t *p) +{ + int8_t status = 0; + + status |= check_tiu_tensor_3(p->res, p->a, p->b); + status |= 
check_same_shape_3(p->res, p->a, p->b); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_OR_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 1; + fill_opd0_stride(®, &p->a->stride); + + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 1; + fill_opd1_stride(®, &p->b->stride); + + reg.res0_addr = p->res->start_address; + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 1; + fill_res0_stride(®, &p->res->stride); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv181x tiu or: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} + +void cvkcv181x_tiu_or_int16( + cvk_context_t *ctx, + const cvk_tiu_or_int16_param_t *p) +{ + int8_t status = 0; + status |= check_16bit_tiu_tensor(p->a_low, p->a_high); + status |= check_16bit_tiu_tensor(p->b_low, p->b_high); + status |= check_16bit_tiu_tensor(p->res_low, p->res_high); + status |= check_same_shape_3(p->res_low, p->a_low, p->b_low); + + int res_high_addr = p->res_high->start_address; + int res_low_addr = p->res_low->start_address; + CHECK(status, res_high_addr > res_low_addr); + int res_b_stride = res_high_addr - res_low_addr; + + int a_high_addr = p->a_high->start_address; + int a_low_addr = p->a_low->start_address; + CHECK(status, a_high_addr > a_low_addr); + int a_b_stride = a_high_addr - a_low_addr; + + int b_high_addr = p->b_high->start_address; + int b_low_addr = p->b_low->start_address; + CHECK(status, b_high_addr > b_low_addr); + int b_b_stride = b_high_addr - b_low_addr; + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_OR_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = a_low_addr; + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 0; + reg.opd0_b_str = a_b_stride; + fill_opd0_stride(®, &p->a_low->stride); + + reg.opd1_addr = b_low_addr; + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 0; + reg.opd1_b_str = b_b_stride; + fill_opd1_stride(®, &p->b_low->stride); + + reg.res0_addr = res_low_addr; + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 0; + reg.res0_b_str = res_b_stride; + fill_res0_stride(®, &p->res_low->stride); + + if (status) { + printf("cvkcv181x tiu or: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv181x/tiu_pt_convolution.c b/cvikernel/src/cv181x/tiu_pt_convolution.c new file mode 100644 index 000000000..40f11f87b --- /dev/null +++ b/cvikernel/src/cv181x/tiu_pt_convolution.c @@ -0,0 +1,183 @@ +#include "cvkcv181x.h" + +static int can_do_double_conv(cvk_context_t *ctx, const cvk_tiu_pt_convolution_param_t *p) +{ + uint8_t bf16_enable = (p->ifmap->fmt == CVK_FMT_BF16) ? 
1 : 0; + + if (((p->ifmap->start_address % ctx->info.lmem_size) % 2 == 0 && + p->ifmap->shape.c % 2 == 0 && + p->ifmap->shape.c >= 4 && + p->weight->start_address % 2 == 0) && !bf16_enable) + return 1; + + return 0; +} + +static int8_t check_conv_param(cvk_context_t *ctx, const cvk_tiu_pt_convolution_param_t *p) +{ + int8_t status = 0; + uint32_t eu_num = ctx->info.eu_num; + uint8_t bf16_enable = (p->ifmap->fmt == CVK_FMT_BF16) ? 1 : 0; + + status |= check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight); + if (bf16_enable) { + status |= check_bf16_stride_type_0(ctx, p->ifmap); + } else { + status |= check_stride_type_0(ctx, p->ifmap); + } + //assert_stride_type_1(ctx, p->weight); + if (p->bias) { + status |= check_tiu_tensor(p->bias); + if (bf16_enable) + status |= check_bf16_stride_type_2(ctx, p->bias); + else + status |= check_stride_type_2(ctx, p->bias); + } + + // n stride must align 16B + CHECK(status, (p->ofmap->stride.n % 16) == 0); + + CHECK(status, p->ifmap->start_address % eu_num == 0); + CHECK(status, p->ofmap->start_address % eu_num == 0); + CHECK(status, p->ifmap->shape.n == p->ofmap->shape.n); + CHECK(status, !(p->ifmap->shape.h == 1 && p->ins_h > 0)); + CHECK(status, p->weight->shape.n == p->ifmap->shape.c); + CHECK(status, p->weight->shape.c == p->ofmap->shape.c); + if (can_do_double_conv(ctx, p)) { + uint32_t lmem_i = p->ifmap->start_address % ctx->info.lmem_size; + CHECK(status, lmem_i % 2 == 0); + CHECK(status, p->ifmap->shape.c % 2 == 0); + CHECK(status, p->ifmap->shape.c >= 4); /* Otherwise performance will suffer */ + CHECK(status, p->weight->start_address % 2 == 0); + } + if(p->ps32_mode & 0x2) + { + CHECK(status, !p->relu_enable); + CHECK(status, !p->bias); + CHECK(status, !p->rshift_bits); + + CHECK(status, p->cmd_pre_exe <= 1); + } + CHECK(status, p->stride_h < 32 && p->stride_h > 0); + CHECK(status, p->stride_w < 32 && p->stride_w > 0); + CHECK(status, p->pad_top < 16); + CHECK(status, p->pad_bottom < 16); + CHECK(status, p->pad_left < 16); + CHECK(status, p->pad_right < 16); + CHECK(status, p->ins_h < 15); + CHECK(status, p->ins_last_h < 15); + CHECK(status, p->ins_w < 15); + CHECK(status, p->ins_last_w < 15); + CHECK(status, p->dilation_h >= 1); + CHECK(status, p->dilation_w >= 1); + CHECK(status, p->relu_enable == 0 || p->relu_enable == 1); + + return status; +} + +void cvkcv181x_tiu_pt_convolution(cvk_context_t *ctx, const cvk_tiu_pt_convolution_param_t *p) +{ + int8_t status = 0; + + status |= check_conv_param(ctx, p); + + uint32_t npu_num = ctx->info.npu_num; + int opd0_sign = tensor_is_signed(p->ifmap); + int opd1_sign = tensor_is_signed(p->weight); + int opd2_sign = p->bias? tensor_is_signed(p->bias): 1; + int arith_shift = opd0_sign || opd1_sign || opd2_sign; + int bf16_enable = (p->ifmap->fmt == CVK_FMT_BF16) ? 
1 : 0; + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_CONV_FIX8B; + reg.opt_shift_typ = arith_shift; + reg.opt_res_shift = p->rshift_bits; + reg.opt_relu_typ = !!(p->relu_enable); + reg.tsk_opd_num = 2; + + reg.opd_typ = bf16_enable; + + /*always automatically enabel double conv at those situations*/ + if (can_do_double_conv(ctx, p)) + reg.double_conv = 1; + + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = tensor_is_signed(p->ofmap); + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + reg.res0_n_str = p->ofmap->stride.n; + reg.res0_c_str = p->ofmap->stride.c; + reg.res0_h_str = p->ofmap->stride.h; + reg.res0_w_str = p->ofmap->stride.w; + reg.short_res0_str = 3; // Manual instead of h/w + reg.ps32_md = p->ps32_mode; + if (p->ps32_mode > 0) + reg.res0_b_str = p->ofmap->shape.n * p->ofmap->stride.n; + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.short_opd0_str = 0; + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + reg.conv_opd0_x_ins0 = p->ins_w; + reg.conv_opd0_y_ins0 = p->ins_h; + reg.conv_opd0_x_ins0_last = p->ins_last_w; + reg.conv_opd0_y_ins0_last = p->ins_last_h; + + reg.opd1_addr = p->weight->start_address; + reg.opt_opd1_sign = opd1_sign; + reg.opt_opd1_seg = 1; + reg.opt_opd1_const = p->w_is_const; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + reg.short_opd1_str = 1; + reg.conv_opd1_x_ins0 = p->dilation_w - 1; + reg.conv_opd1_y_ins0 = p->dilation_h - 1; + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + reg.opd0_ins_val = bf16_enable ? + (uint32_t)p->ins_fp : (uint32_t)p->ins_val; + + if (p->bias) { + CHECK(status, p->bias->shape.n == 2); + CHECK(status, p->bias->shape.c == p->ofmap->shape.c); + CHECK(status, p->bias->shape.h == 1); + CHECK(status, p->bias->shape.w == 1); + + reg.tsk_opd_num = 3; + reg.opt_opd2_sign = opd2_sign; + reg.opt_opd2_seg = 0; + reg.opd2_addr = p->bias->start_address; + reg.opd2_n = 1; + reg.opd2_c = p->bias->shape.c; + reg.opd2_h = 1; + reg.opd2_w = 1; + reg.short_opd2_str = 2; + reg.opd2_b_str = ceiling_func(p->bias->shape.c, npu_num) * (bf16_enable ? 2 : 1); + } + + reg.cmd_pre_exe_typ = p->cmd_pre_exe_typ; + reg.cmd_pre_exe = p->cmd_pre_exe; + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv181x tiu_pt_conv: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv181x/tiu_pt_depthwise_convolution.c b/cvikernel/src/cv181x/tiu_pt_depthwise_convolution.c new file mode 100644 index 000000000..6237f83dd --- /dev/null +++ b/cvikernel/src/cv181x/tiu_pt_depthwise_convolution.c @@ -0,0 +1,158 @@ +#include "cvkcv181x.h" + +void cvkcv181x_tiu_pt_depthwise_convolution( + cvk_context_t *ctx, + const cvk_tiu_depthwise_pt_convolution_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->ifmap->fmt == CVK_FMT_BF16) ? 1 : 0; + + int8_t isMulConst = (p->weight_is_const == 1) ? 
1 : 0; + + if(isMulConst) { + status |= check_tiu_tensor_2(p->ifmap, p->ofmap); + } else { + status |= check_tiu_tensor_3(p->ifmap, p->ofmap, p->weight); + } + if (bf16_enable) { + status |= check_bf16_stride_type_0(ctx, p->ifmap); + if(!isMulConst) + status |= check_bf16_stride_type_0(ctx, p->weight); + if (p->bias) { + status |= check_tiu_tensor(p->bias); + status |= check_bf16_stride_type_2(ctx, p->bias); + } + } else { + status |= check_stride_type_0(ctx, p->ifmap); + if(!isMulConst) + status |= check_stride_type_0(ctx, p->weight); + if (p->bias) { + status |= check_tiu_tensor(p->bias); + status |= check_stride_type_2(ctx, p->bias); + } + } + + // n stride must align 16B + CHECK(status, (p->ofmap->stride.n % 16) == 0); + + CHECK(status, p->ifmap->shape.n == p->ofmap->shape.n); + CHECK(status, p->ifmap->shape.c == p->ofmap->shape.c); + if (!isMulConst){ + CHECK(status, p->ifmap->shape.c == p->weight->shape.c); + CHECK(status, p->weight->shape.n == 1); + } + CHECK(status, p->relu_enable == 0 || p->relu_enable == 1); + CHECK(status, p->stride_h < 32 && p->stride_h > 0); + CHECK(status, p->stride_w < 32 && p->stride_w > 0); + CHECK(status, p->pad_top < 16); + CHECK(status, p->pad_bottom < 16); + CHECK(status, p->pad_left < 16); + CHECK(status, p->pad_right < 16); + CHECK(status, p->ins_h < 15); + CHECK(status, p->ins_last_h < 15); + CHECK(status, p->ins_w < 15); + CHECK(status, p->ins_last_w < 15); + CHECK(status, p->dilation_h >= 1); + CHECK(status, p->dilation_w >= 1); + + int opd0_sign = tensor_is_signed(p->ifmap); + + tiu_reg_t reg; + reset_tiu_reg(®); + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_DEPTHWISE_POOL_FIX8B; + reg.tsk_eu_typ = 2; + reg.opt_relu_typ = p->relu_enable; + reg.opt_shift_typ = 1; + reg.opt_res_shift = p->rshift_bits; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 
1: 0; + + int res0_sign = tensor_is_signed(p->ofmap); + reg.res0_addr = p->ofmap->start_address; + reg.opt_res0_sign = res0_sign; + reg.opt_res0_seg = 1; + reg.res0_n = p->ofmap->shape.n; + reg.res0_c = p->ofmap->shape.c; + reg.res0_h = p->ofmap->shape.h; + reg.res0_w = p->ofmap->shape.w; + reg.res0_n_str = p->ofmap->stride.n; + reg.res0_c_str = p->ofmap->stride.c; + reg.res0_h_str = p->ofmap->stride.h; + reg.res0_w_str = p->ofmap->stride.w; + reg.short_res0_str = 3; // Manual instead of h/w + + reg.opd0_addr = p->ifmap->start_address; + reg.opt_opd0_sign = opd0_sign; + reg.opt_opd0_seg = 1; + reg.opd0_n = p->ifmap->shape.n; + reg.opd0_c = p->ifmap->shape.c; + reg.opd0_h = p->ifmap->shape.h; + reg.opd0_w = p->ifmap->shape.w; + reg.opd0_n_str = p->ifmap->stride.n; + reg.opd0_c_str = p->ifmap->stride.c; + reg.opd0_h_str = p->ifmap->stride.h; + reg.opd0_w_str = p->ifmap->stride.w; + reg.short_opd0_str = 3; // Manual instead of h/w + reg.conv_opd0_up_pad = p->pad_top; + reg.conv_opd0_dn_pad = p->pad_bottom; + reg.conv_opd0_lf_pad = p->pad_left; + reg.conv_opd0_rt_pad = p->pad_right; + reg.conv_opd0_x_ins0 = p->ins_w; + reg.conv_opd0_y_ins0 = p->ins_h; + reg.conv_opd0_x_ins0_last = p->ins_last_w; + reg.conv_opd0_y_ins0_last = p->ins_last_h; + + reg.opt_opd1_sign = 1; + reg.opt_opd1_seg = 1; + reg.conv_opd1_x_ins0 = p->dilation_w - 1; + reg.conv_opd1_y_ins0 = p->dilation_h - 1; + if (isMulConst) { + reg.opt_opd1_const = 1; + reg.opt_opd1_sign = p->weight_const.is_signed; + reg.opd1_addr = p->weight_const.val; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + } else { + reg.opd1_addr = p->weight->start_address; + reg.opd1_n = p->weight->shape.n; + reg.opd1_c = p->weight->shape.c; + reg.opd1_h = p->weight->shape.h; + reg.opd1_w = p->weight->shape.w; + } + reg.conv_op_x_str = p->stride_w; + reg.conv_op_y_str = p->stride_h; + reg.opd0_ins_val = bf16_enable ? 
+ (uint32_t)p->ins_fp : (uint32_t)p->ins_val; + + if (p->bias) { + CHECK(status, p->bias->shape.n == 2); + CHECK(status, p->bias->shape.c == p->ofmap->shape.c); + CHECK(status, p->bias->shape.h == 1); + CHECK(status, p->bias->shape.w == 1); + + reg.tsk_opd_num = 3; + reg.opd2_addr = p->bias->start_address; + reg.opt_opd2_seg = 0; + reg.opd2_n = 1; + reg.opd2_c = p->bias->shape.c; + reg.opd2_h = 1; + reg.opd2_w = 1; + reg.short_opd2_str = 2; + reg.opd2_b_str = p->bias->stride.n; + } + + reg.layer_info = p->layer_id; + + reg.cmd_pre_exe_typ = p->cmd_pre_exe_typ; + reg.cmd_pre_exe = p->cmd_pre_exe; + + if (status) { + printf("cvkcv181x pt dw-conv: invalid param\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv181x/tiu_shift.c b/cvikernel/src/cv181x/tiu_shift.c new file mode 100644 index 000000000..4f84a1109 --- /dev/null +++ b/cvikernel/src/cv181x/tiu_shift.c @@ -0,0 +1,63 @@ +#include "cvkcv181x.h" + +void cvkcv181x_tiu_arith_shift( + cvk_context_t *ctx, + const cvk_tiu_arith_shift_param_t *p) +{ + int8_t status = 0; + status |= check_16bit_tiu_tensor(p->a_low, p->a_high); + status |= check_16bit_tiu_tensor(p->res_low, p->res_high); + status |= check_tiu_tensor(p->bits); + status |= check_same_shape_3(p->res_low, p->a_low, p->bits); + CHECK(status, tensor_is_signed(p->a_low)); + CHECK(status, tensor_is_signed(p->bits)); + + int res_high_addr = p->res_high->start_address; + int res_low_addr = p->res_low->start_address; + CHECK(status, res_high_addr > res_low_addr); + int res_b_stride = res_high_addr - res_low_addr; + + int a_high_addr = p->a_high->start_address; + int a_low_addr = p->a_low->start_address; + CHECK(status, a_high_addr > a_low_addr); + int a_b_stride = a_high_addr - a_low_addr; + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_SHIFT_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_rshift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = a_low_addr; + reg.opt_opd0_sign = 1; + reg.opt_opd0_seg = 0; + reg.opd0_b_str = a_b_stride; + fill_opd0_stride(®, &p->a_low->stride); + + reg.opd1_addr = p->bits->start_address; + reg.opt_opd1_sign = 1; + reg.opt_opd1_seg = 1; + fill_opd1_stride(®, &p->bits->stride); + + reg.res0_addr = res_low_addr; + reg.opt_res0_sign = 1; + reg.opt_res0_seg = 0; + reg.res0_b_str = res_b_stride; + fill_res0_stride(®, &p->res_low->stride); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv181x tiu shift: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv181x/tiu_sub.c b/cvikernel/src/cv181x/tiu_sub.c new file mode 100644 index 000000000..3a55842e0 --- /dev/null +++ b/cvikernel/src/cv181x/tiu_sub.c @@ -0,0 +1,73 @@ +#include "cvkcv181x.h" + +void cvkcv181x_tiu_sub( + cvk_context_t *ctx, + const cvk_tiu_sub_param_t *p) +{ + int8_t status = 0; + int bf16_enable = (p->a_low->fmt == CVK_FMT_BF16) ? 
1 : 0; + + if (bf16_enable) { + /*bf16 only support 16 bit*/ + CHECK(status, !p->a_high); + CHECK(status, !p->b_high); + CHECK(status, !p->res_high); + status |= check_tiu_tensor(p->a_low); + status |= check_tiu_tensor(p->b_low); + status |= check_tiu_tensor(p->res_low); + status |= check_same_shape_3(p->res_low, p->a_low, p->b_low); + } else { + status |= check_16bit_tiu_tensor(p->a_low, p->a_high); + status |= check_16bit_tiu_tensor(p->b_low, p->b_high); + status |= check_tiu_tensor(p->res_low); + status |= check_same_shape_3(p->res_low, p->a_low, p->b_low); + CHECK(status, tensor_is_signed(p->res_low)); + } + if (p->res_high) + status |= check_16bit_tiu_tensor(p->res_low, p->res_high); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_SUB_FIX8B; + reg.tsk_opd_num = 2; + reg.opd_typ = bf16_enable ? 1: 0; + reg.opt_res_shift = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + int arith_shift = tensor_is_signed(p->res_low); + reg.opt_shift_typ = arith_shift; + reg.opt_res_shift = p->rshift_bits; + + reg.opd0_addr = p->a_low->start_address; + reg.opt_opd0_sign = tensor_is_signed(p->a_low); + reg.opt_opd0_seg = (p->a_high == NULL); + reg.opd0_b_str = bf16_enable ? 0 : (p->a_high->start_address - p->a_low->start_address); + fill_opd0_stride(®, &p->a_low->stride); + + reg.opd1_addr = p->b_low->start_address; + reg.opt_opd1_sign = tensor_is_signed(p->b_low);; + reg.opt_opd1_seg = (p->b_high == NULL); + reg.opd1_b_str = bf16_enable ? 0 : (p->b_high->start_address - p->b_low->start_address); + fill_opd1_stride(®, &p->b_low->stride); + + reg.res0_addr = p->res_low->start_address; + reg.opt_res0_sign = 1; + reg.opt_res0_seg = (p->res_high == NULL); + fill_res0_stride(®, &p->res_low->stride); + if (p->res_high) + reg.res0_b_str = bf16_enable ? 
0 : (p->res_high->start_address - p->res_low->start_address); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv181x tiu sub: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv181x/tiu_xor.c b/cvikernel/src/cv181x/tiu_xor.c new file mode 100644 index 000000000..e0b34ec78 --- /dev/null +++ b/cvikernel/src/cv181x/tiu_xor.c @@ -0,0 +1,111 @@ +#include "cvkcv181x.h" + +void cvkcv181x_tiu_xor_int8( + cvk_context_t *ctx, + const cvk_tiu_xor_int8_param_t *p) +{ + int8_t status = 0; + status |= check_tiu_tensor_3(p->res, p->a, p->b); + status |= check_same_shape_3(p->res, p->a, p->b); + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_XOR_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = p->a->start_address; + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 1; + fill_opd0_stride(®, &p->a->stride); + + reg.opd1_addr = p->b->start_address; + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 1; + fill_opd1_stride(®, &p->b->stride); + + reg.res0_addr = p->res->start_address; + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 1; + fill_res0_stride(®, &p->res->stride); + + reg.layer_info = p->layer_id; + + if (status) { + printf("cvkcv181x tiu xor: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} + +void cvkcv181x_tiu_xor_int16( + cvk_context_t *ctx, + const cvk_tiu_xor_int16_param_t *p) +{ + int8_t status = 0; + status |= check_16bit_tiu_tensor(p->a_low, p->a_high); + status |= check_16bit_tiu_tensor(p->b_low, p->b_high); + status |= check_16bit_tiu_tensor(p->res_low, p->res_high); + status |= check_same_shape_3(p->res_low, p->a_low, p->b_low); + + int res_high_addr = p->res_high->start_address; + int res_low_addr = p->res_low->start_address; + CHECK(status, res_high_addr > res_low_addr); + int res_b_stride = res_high_addr - res_low_addr; + + int a_high_addr = p->a_high->start_address; + int a_low_addr = p->a_low->start_address; + CHECK(status, a_high_addr > a_low_addr); + int a_b_stride = a_high_addr - a_low_addr; + + int b_high_addr = p->b_high->start_address; + int b_low_addr = p->b_low->start_address; + CHECK(status, b_high_addr > b_low_addr); + int b_b_stride = b_high_addr - b_low_addr; + + tiu_reg_t reg; + reset_tiu_reg(®); + + reg.cmd_en = 1; + reg.tsk_typ = DCR_TYPE_TENSOR_ARITH_FIX8B; + reg.tsk_eu_typ = TENSOR_XOR_FIX8B; + reg.tsk_opd_num = 2; + reg.opt_res_shift = 0; + reg.opt_shift_typ = 0; + reg.opt_relu_typ = 0; + fill_same_tensor_shape(®, p->a_low->shape); + fill_same_tensor_stride_type(®, 0b11); + + reg.opd0_addr = a_low_addr; + reg.opt_opd0_sign = 0; + reg.opt_opd0_seg = 0; + reg.opd0_b_str = a_b_stride; + fill_opd0_stride(®, &p->a_low->stride); + + reg.opd1_addr = b_low_addr; + reg.opt_opd1_sign = 0; + reg.opt_opd1_seg = 0; + reg.opd1_b_str = b_b_stride; + fill_opd1_stride(®, &p->b_low->stride); + + reg.res0_addr = res_low_addr; + reg.opt_res0_sign = 0; + reg.opt_res0_seg = 0; + reg.res0_b_str = res_b_stride; + fill_res0_stride(®, &p->res_low->stride); + + if (status) { + printf("cvkcv181x tiu xor: wrong parameter\n"); + return; + } + + (void *)emit_tiu_cmdbuf(ctx, ®); +} diff --git a/cvikernel/src/cv1822/cvikernel_1822.c b/cvikernel/src/cv1822/cvikernel_1822.c new file mode 100644 index 000000000..a8be0b300 --- /dev/null +++ b/cvikernel/src/cv1822/cvikernel_1822.c @@ -0,0 
+1,2507 @@ +#include "kernel_internal.h" +#include +#include "cvikernel/cvikernel.h" +#include "../bm1822/kernel_1822.h" +#include "bmkernel/bm1822/1822_fp_convert.h" + +#define SET_TG_SHAPE(_dst, _src) \ +do { \ + (_dst).n = (_src).n; \ + (_dst).c = (_src).c; \ + (_dst).h = (_src).h; \ + (_dst).w = (_src).w; \ +} while(0) + +#define SET_TG_STRIDE(_dst, _src) \ +do { \ + (_dst).n = (_src).n; \ + (_dst).c = (_src).c; \ + (_dst).h = (_src).h; \ +} while(0) + +#define SET_TL_SHAPE(_dst, _src) \ +do { \ + (_dst).n = (_src).n; \ + (_dst).c = (_src).c; \ + (_dst).h = (_src).h; \ + (_dst).w = (_src).w; \ +} while(0) + +#define SET_TL_STRIDE(_dst, _src) \ +do { \ + (_dst).n = (_src).n; \ + (_dst).c = (_src).c; \ + (_dst).h = (_src).h; \ + (_dst).w = (_src).w; \ +} while(0) + +#define SET_ML_SHAPE(_dst, _src) \ +do { \ + (_dst).n = (_src).n; \ + (_dst).c = (_src).c; \ + (_dst).w = (_src).w; \ + (_dst).col = (_src).col; \ +} while(0) + +#define SET_ML_STRIDE(_dst, _src) \ +do { \ + (_dst).n = (_src).n; \ + (_dst).c = (_src).c; \ + (_dst).h = (_src).h; \ +} while(0) + +typedef struct cvk_prv_data { + bmk1822_context_t *bmk_ctx; + uint32_t cmdbuf_size; + uint8_t *cmdbuf; +} cvk_prv_data_t; + +extern uint32_t bmk1822_estimate_nr_desc(bmk_context_t *k); + +static void convert_lmem_tensor( + bmk1822_tensor_lmem_t *dst, + const cvk_tl_t *src) +{ + dst->start_address = src->start_address; + dst->fmt = src->fmt; + dst->cmprs_fmt = src->cmprs_fmt; + SET_TL_SHAPE(dst->shape, src->shape); + SET_TL_STRIDE(dst->stride, src->stride); + dst->int8_rnd_mode = src->int8_rnd_mode; +} + +static void convert_gmem_tensor( + bmk1822_tensor_tgmem_t *dst, + const cvk_tg_t *src) +{ + dst->base_reg_index = src->base_reg_index; + dst->start_address = src->start_address; + dst->fmt = src->fmt; + SET_TG_SHAPE(dst->shape, src->shape); + SET_TG_STRIDE(dst->stride, src->stride); + dst->int8_rnd_mode = src->int8_rnd_mode; +} + +static void convert_gmem_compressed_tensor( + bmk1822_compressed_tensor_tgmem_t *dst, + const cvk_cmpr_tg_t *src) +{ + dst->t.base_reg_index = src->t.base_reg_index; + dst->t.start_address = src->t.start_address; + dst->t.fmt = src->t.fmt; + SET_TG_SHAPE(dst->t.shape, src->t.shape); + SET_TG_STRIDE(dst->t.stride, src->t.stride); + dst->t.int8_rnd_mode = src->t.int8_rnd_mode; + dst->reserved_size = src->reserved_size; + dst->bit_length = src->bit_length; + dst->bias0 = src->bias0; + dst->bias1 = src->bias1; + dst->zero_guard_en = src->zero_guard_en; +} + +static void convert_lmem_matrix( + bmk1822_matrix_lmem_t *dst, + const cvk_ml_t *src) +{ + dst->start_address = src->start_address; + dst->fmt = src->fmt; + SET_ML_SHAPE(dst->shape, src->shape); + SET_ML_STRIDE(dst->stride, src->stride); + dst->int8_rnd_mode = src->int8_rnd_mode; +} + +static void convert_gmem_matrix( + bmk1822_matrix_tgmem_t *dst, + const cvk_mg_t *src) +{ + dst->base_reg_index = src->base_reg_index; + dst->start_address = src->start_address; + dst->fmt = src->fmt; + dst->shape.row = src->shape.row; + dst->shape.col = src->shape.col; + dst->stride.row = src->stride.row; + dst->int8_rnd_mode = src->int8_rnd_mode; +} + +static void convert_gmem_compressed_matrix( + bmk1822_compressed_matrix_tgmem_t *dst, + const cvk_cmpr_mg_t *src) +{ + dst->m.base_reg_index = src->m.base_reg_index; + dst->m.start_address = src->m.start_address; + dst->m.fmt = src->m.fmt; + dst->m.shape.row = src->m.shape.row; + dst->m.shape.col = src->m.shape.col; + dst->m.stride.row = src->m.stride.row; + dst->m.int8_rnd_mode = src->m.int8_rnd_mode; + dst->bias0 = 
src->bias0; + dst->bias1 = src->bias1; + dst->zero_guard_en = src->zero_guard_en; +} + +void cvk1822_cleanup(struct cvikernel_context *ctx) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_cleanup(bmk_ctx); + + // command buffer is freed in cviruntime_cvikernel_destroy(). + //free(((cvk_prv_data_t *)ctx->priv_data)->cmdbuf); + + free(ctx->priv_data); + ctx->priv_data = NULL; +} + +void cvk1822_reset(struct cvikernel_context *ctx) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_reset(bmk_ctx); +} + +uint8_t *cvk1822_acquire_cmdbuf( + struct cvikernel_context *ctx, + uint32_t *size) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + return bmk1822_acquire_cmdbuf(bmk_ctx, size); +} + +void cvk1822_set_layer_id( + struct cvikernel_context *ctx, + uint16_t layer_id) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_set_layer_id(bmk_ctx, layer_id); +} + +void cvk1822_parallel_enable(struct cvikernel_context *ctx) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_parallel_enable(bmk_ctx); +} + +void cvk1822_parallel_disable(struct cvikernel_context *ctx) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_parallel_disable(bmk_ctx); +} + +cvk_tl_t *cvk1822_lmem_alloc_tensor( + cvk_context_t *ctx, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tensor_lmem_shape_t bmk_shape; + SET_TL_SHAPE(bmk_shape, shape); + + fmt_t bmk_fmt = fmt; + + bmk1822_tensor_lmem_t *bmk_tl = + bmk1822_lmem_alloc_tensor(bmk_ctx, bmk_shape, bmk_fmt, eu_align); + + return (cvk_tl_t *)bmk_tl; +} + +cvk_ml_t *cvk1822_lmem_alloc_matrix( + cvk_context_t *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_matrix_lmem_shape_t bmk_shape; + SET_ML_SHAPE(bmk_shape, shape); + + bmk1822_matrix_lmem_t *bmk_ml = + bmk1822_lmem_alloc_matrix(bmk_ctx, bmk_shape, fmt, eu_align); + + return (cvk_ml_t *)bmk_ml; +} + +cvk_ml_t *cvk1822_lmem_alloc_ps32_matrix( + cvk_context_t *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_matrix_lmem_shape_t bmk_shape; + SET_ML_SHAPE(bmk_shape, shape); + + bmk1822_matrix_lmem_t *bmk_ml = + bmk1822_lmem_alloc_ps32_matrix(bmk_ctx, bmk_shape, fmt, eu_align); + + return (cvk_ml_t *)bmk_ml; +} + +void cvk1822_lmem_free_tensor( + struct cvikernel_context *ctx, + const cvk_tl_t *tl) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_lmem_free_tensor(bmk_ctx, (bmk1822_tensor_lmem_t *)tl); +} + +void cvk1822_lmem_free_matrix( + struct cvikernel_context *ctx, + const cvk_ml_t *ml) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_lmem_free_matrix(bmk_ctx, (bmk1822_matrix_lmem_t *)ml); +} + +void cvk1822_lmem_init_tensor( + struct cvikernel_context *ctx, + cvk_tl_t *tl, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + // memset(tl, 0, sizeof(*tl)); + // tl->shape = shape; + // tl->eu_align = eu_align; + // tl->stride = cvk1822_tl_default_stride(ctx, shape, fmt, eu_align); + 
bmk1822_tensor_lmem_shape_t bmk_shape; + SET_TL_SHAPE(bmk_shape, shape); + + fmt_t bmk_fmt = fmt; + + bmk1822_lmem_init_tensor(bmk_ctx, (bmk1822_tensor_lmem_t *)tl, bmk_shape, + bmk_fmt, eu_align); +} + +void cvk1822_lmem_init_matrix( + struct cvikernel_context *ctx, + cvk_ml_t *ml, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_matrix_lmem_shape_t bmk_shape; + SET_ML_SHAPE(bmk_shape, shape); + + bmk1822_lmem_init_matrix(bmk_ctx, (bmk1822_matrix_lmem_t *)ml, bmk_shape, fmt, + eu_align); +} + +cvk_tl_stride_t cvk1822_tl_default_stride( + struct cvikernel_context *ctx, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tensor_lmem_shape_t bmk_shape; + SET_TL_SHAPE(bmk_shape, shape); + + bmk1822_tensor_lmem_stride_t bmk_stride = + bmk1822_tensor_lmem_default_stride(bmk_ctx, bmk_shape, fmt, + eu_align); + + cvk_tl_stride_t stride; + SET_TL_STRIDE(stride, bmk_stride); + + return stride; +} + +cvk_tg_stride_t cvk1822_tg_default_stride( + struct cvikernel_context *ctx, + cvk_tg_shape_t shape, + cvk_fmt_t fmt) +{ + (void)ctx; + + bmk1822_tensor_tgmem_shape_t bmk_shape; + SET_TG_SHAPE(bmk_shape, shape); + + bmk1822_tensor_tgmem_stride_t bmk_stride = + bmk1822_tensor_tgmem_default_stride(bmk_shape, fmt); + + cvk_tg_stride_t stride; + SET_TG_STRIDE(stride, bmk_stride); + stride.w = (fmt == CVK_FMT_BF16) ? 2 : 1; + + return stride; +} + +cvk_ml_shape_t cvk1822_ml_default_shape( + struct cvikernel_context *ctx, + uint32_t row, + uint32_t col, + cvk_fmt_t fmt_type) { + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_matrix_lmem_shape_t bm_ml_shape = + bmk1822_matrix_lmem_default_shape(bmk_ctx, row, col, fmt_type); + + cvk_ml_shape_t ml_shape; + SET_ML_SHAPE(ml_shape, bm_ml_shape); + + return ml_shape; +} + +cvk_ml_stride_t cvk1822_ml_default_stride( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align) { + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_matrix_lmem_shape_t bmk_shape; + SET_ML_SHAPE(bmk_shape, shape); + + bmk1822_matrix_lmem_stride_t bmk_stride = + bmk1822_matrix_lmem_default_stride(bmk_ctx, bmk_shape, fmt, eu_align); + + cvk_ml_stride_t stride; + SET_ML_STRIDE(stride, bmk_stride); + + return stride; +} + +cvk_ml_shape_t cvk1822_ml_shape_t1( + struct cvikernel_context *ctx, + uint32_t len, + cvk_fmt_t fmt_type) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_matrix_lmem_shape_t bmk_shape = + bmk1822_matrix_lmem_shape_t1(bmk_ctx, len, fmt_type); + + cvk_ml_shape_t shape; + SET_ML_SHAPE(shape, bmk_shape); + + return shape; +} + +uint32_t cvk1822_lmem_tensor_to_size( + struct cvikernel_context *ctx, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tensor_lmem_shape_t bmk_shape; + SET_TL_SHAPE(bmk_shape, shape); + + return bmk1822_lmem_tensor_to_size(bmk_ctx, bmk_shape, fmt, eu_align); +} + +uint32_t cvk1822_lmem_matrix_to_size( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_matrix_lmem_shape_t bmk_shape; + SET_ML_SHAPE(bmk_shape, shape); + + return bmk1822_lmem_matrix_to_size(bmk_ctx, bmk_shape, fmt, + 
eu_align); +} + +uint32_t cvk1822_lmem_ps32_matrix_to_size( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_matrix_lmem_shape_t bmk_shape; + SET_ML_SHAPE(bmk_shape, shape); + + return bmk1822_lmem_ps32_matrix_to_size(bmk_ctx, bmk_shape, fmt, + eu_align); +} + +void cvk1822_gmem_init_tensor( + struct cvikernel_context *ctx, + cvk_tg_t *tg, + cvk_tg_shape_t shape, + cvk_fmt_t fmt) { + memset(tg, 0, sizeof(*tg)); + tg->fmt = fmt; + tg->shape = shape; + tg->stride = ctx->ops->tg_default_stride(ctx, tg->shape, tg->fmt); +} + +void cvk1822_tdma_l2l_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_l2l_tensor_copy_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_l2l_tensor_copy_param_t bmk_param = {0}; + bmk_param.mv_lut_idx = param->mv_lut_idx; + bmk_param.mv_lut_base = param->mv_lut_base; + bmk_param.outstanding = param->outstanding; + + bmk1822_tensor_lmem_t tl_src; + convert_lmem_tensor(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1822_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk1822_tdma_l2l_tensor_copy(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_l2l_bf16_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_l2l_tensor_copy_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_l2l_tensor_copy_param_t bmk_param = {0}; + bmk_param.mv_lut_idx = param->mv_lut_idx; + bmk_param.mv_lut_base = param->mv_lut_base; + bmk_param.outstanding = param->outstanding; + + bmk1822_tensor_lmem_t tl_src; + convert_lmem_tensor(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1822_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk1822_tdma_l2l_bf16_tensor_copy(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_l2l_tensor_lrn_shift( + cvk_context_t *ctx, + const cvk_tdma_l2l_tensor_lrn_shift_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_l2l_tensor_lrn_shift_param_t bmk_param; + bmk_param.right_shift = param->right_shift; + bmk_param.lrn_step = param->lrn_step; + + bmk1822_tensor_lmem_t tl_src; + convert_lmem_tensor(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1822_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk1822_tdma_l2l_tensor_lrn_shift(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_l2g_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_l2tg_tensor_copy_param_t bmk_param; + bmk1822_tensor_lmem_t tl_src; + convert_lmem_tensor(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1822_tensor_tgmem_t tg_dst; + convert_gmem_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk_param.intra_cmd_paral = param->intra_cmd_paral; + + bmk1822_tdma_l2g_tensor_copy(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_l2g_bf16_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_l2tg_tensor_copy_param_t bmk_param; + bmk1822_tensor_lmem_t tl_src; + convert_lmem_tensor(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1822_tensor_tgmem_t tg_dst; + 
convert_gmem_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk_param.intra_cmd_paral = param->intra_cmd_paral; + + bmk1822_tdma_l2g_bf16_tensor_copy(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_l2g_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_nc_transposed_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_l2tg_tensor_copy_nc_transposed_param_t bmk_param; + bmk1822_tensor_lmem_t tl_src; + convert_lmem_tensor(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1822_tensor_tgmem_t tg_dst; + convert_gmem_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1822_tdma_l2g_tensor_copy_nc_transposed(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_l2g_bf16_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_nc_transposed_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_l2tg_tensor_copy_nc_transposed_param_t bmk_param; + bmk1822_tensor_lmem_t tl_src; + convert_lmem_tensor(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1822_tensor_tgmem_t tg_dst; + convert_gmem_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1822_tdma_l2g_bf16_tensor_copy_nc_transposed(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_l2g_tensor_copy_compressed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_compressed_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_l2tg_tensor_copy_compressed_param_t bmk_param; + bmk1822_tensor_lmem_t tl_src; + convert_lmem_tensor(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1822_compressed_tensor_tgmem_t tg_dst; + convert_gmem_compressed_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk_param.intra_cmd_paral = param->intra_cmd_paral; + + bmk1822_tdma_l2g_tensor_copy_compressed(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_l2g_tensor_fill_constant( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_fill_constant_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_l2tg_tensor_fill_constant_param_t bmk_param; + bmk_param.constant = param->constant; + + bmk1822_tensor_tgmem_t tg_dst; + convert_gmem_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1822_tdma_l2g_tensor_fill_constant(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_l2g_tensor_copy_cw_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_cw_transposed_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_l2tg_tensor_copy_cw_transposed_param_t bmk_param; + bmk1822_tensor_lmem_t tl_src; + convert_lmem_tensor(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1822_tensor_tgmem_t tg_dst; + convert_gmem_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1822_tdma_l2g_tensor_copy_cw_transposed(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_l2g_bf16_tensor_copy_cw_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_cw_transposed_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_l2tg_tensor_copy_cw_transposed_param_t bmk_param; + bmk1822_tensor_lmem_t tl_src; + convert_lmem_tensor(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1822_tensor_tgmem_t tg_dst; + convert_gmem_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + 
bmk1822_tdma_l2g_bf16_tensor_copy_cw_transposed(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_l2g_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_matrix_copy_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_l2tg_matrix_copy_param_t bmk_param; + bmk1822_matrix_lmem_t tl_src; + convert_lmem_matrix(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1822_matrix_tgmem_t tg_dst; + convert_gmem_matrix(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1822_tdma_l2g_matrix_copy(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_l2g_bf16_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_matrix_copy_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_l2tg_matrix_copy_param_t bmk_param; + bmk1822_matrix_lmem_t tl_src; + convert_lmem_matrix(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1822_matrix_tgmem_t tg_dst; + convert_gmem_matrix(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1822_tdma_l2g_bf16_matrix_copy(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_l2g_matrix_copy_compressed( + cvk_context_t *ctx, + const cvk_tdma_l2g_matrix_copy_compressed_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_l2tg_matrix_copy_compressed_param_t bmk_param; + bmk1822_matrix_lmem_t tl_src; + convert_lmem_matrix(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1822_compressed_matrix_tgmem_t mg_dst; + convert_gmem_compressed_matrix(&mg_dst, param->dst); + bmk_param.dst = &mg_dst; + + bmk1822_tdma_l2g_matrix_copy_compressed(bmk_ctx, &bmk_param); +} + + +void cvk1822_tdma_l2g_general_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_general_copy_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_l2tg_general_copy_param_t bmk_param; + bmk_param.src_address = param->src_address; + bmk_param.dst_base_reg_index = param->dst_base_reg_index; + bmk_param.dst_address = param->dst_address; + bmk_param.bytes = param->bytes; + + bmk1822_tdma_l2g_general_copy(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_l2g_bf16_general_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_bf16_general_copy_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_l2tg_bf16_general_copy_param_t bmk_param; + bmk_param.src_address = param->src_address; + bmk_param.dst_base_reg_index = param->dst_base_reg_index; + bmk_param.dst_address = param->dst_address; + bmk_param.src_bytes = param->src_bytes; + + bmk1822_tdma_l2g_bf16_general_copy(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_g2l_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_tg2l_tensor_copy_param_t bmk_param; + bmk1822_tensor_tgmem_t tg_src; + convert_gmem_tensor(&tg_src, param->src); + bmk_param.src = &tg_src; + + bmk1822_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk_param.intra_cmd_paral = param->intra_cmd_paral; + + bmk1822_tdma_g2l_tensor_copy(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_g2l_bf16_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_tg2l_tensor_copy_param_t bmk_param; + bmk1822_tensor_tgmem_t tg_src; + 
convert_gmem_tensor(&tg_src, param->src); + bmk_param.src = &tg_src; + + bmk1822_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk_param.intra_cmd_paral = param->intra_cmd_paral; + + bmk1822_tdma_g2l_bf16_tensor_copy(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_g2l_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_nc_transposed_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_tg2l_tensor_copy_nc_transposed_param_t bmk_param; + bmk1822_tensor_tgmem_t tg_src; + convert_gmem_tensor(&tg_src, param->src); + bmk_param.src = &tg_src; + + bmk1822_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk1822_tdma_g2l_tensor_copy_nc_transposed(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_g2l_bf16_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_nc_transposed_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_tg2l_tensor_copy_nc_transposed_param_t bmk_param; + bmk1822_tensor_tgmem_t tg_src; + convert_gmem_tensor(&tg_src, param->src); + bmk_param.src = &tg_src; + + bmk1822_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk1822_tdma_g2l_bf16_tensor_copy_nc_transposed(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_g2l_tensor_copy_chw_rotated( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_chw_rotated_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_tg2l_tensor_copy_chw_rotated_param_t bmk_param; + bmk1822_tensor_tgmem_t tg_src; + convert_gmem_tensor(&tg_src, param->src); + bmk_param.src = &tg_src; + + bmk1822_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk1822_tdma_g2l_tensor_copy_chw_rotated(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_g2l_tensor_copy_decompressed( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_decompressed_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_tg2l_tensor_copy_decompressed_param_t bmk_param; + bmk1822_compressed_tensor_tgmem_t tg_src; + convert_gmem_compressed_tensor(&tg_src, param->src); + bmk_param.src = &tg_src; + + bmk1822_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk_param.intra_cmd_paral = param->intra_cmd_paral; + + bmk1822_tdma_g2l_tensor_copy_decompressed(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_g2l_tensor_fill_constant( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_fill_constant_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_tg2l_tensor_fill_constant_param_t bmk_param; + bmk_param.constant = param->constant; + + bmk1822_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk1822_tdma_tg2l_tensor_fill_constant(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_g2l_bf16_tensor_fill_constant( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_fill_constant_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_tg2l_tensor_fill_constant_param_t bmk_param; + bmk_param.constant = param->constant; + + bmk1822_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + 
bmk1822_tdma_tg2l_bf16_tensor_fill_constant(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_g2l_matrix_copy_decompressed( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_decompressed_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_tg2l_matrix_copy_decompressed_param_t bmk_param; + bmk1822_compressed_matrix_tgmem_t mg_src; + convert_gmem_compressed_matrix(&mg_src, param->src); + bmk_param.src = &mg_src; + + bmk1822_matrix_lmem_t ml_dst; + convert_lmem_matrix(&ml_dst, param->dst); + bmk_param.dst = &ml_dst; + + bmk1822_tdma_g2l_matrix_copy_decompressed(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_g2l_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_tg2l_matrix_copy_param_t bmk_param; + bmk1822_matrix_tgmem_t mg_src; + convert_gmem_matrix(&mg_src, param->src); + bmk_param.src = &mg_src; + + bmk1822_matrix_lmem_t ml_dst; + convert_lmem_matrix(&ml_dst, param->dst); + bmk_param.dst = &ml_dst; + + bmk1822_tdma_g2l_matrix_copy(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_g2l_bf16_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_tg2l_matrix_copy_param_t bmk_param; + bmk1822_matrix_tgmem_t mg_src; + convert_gmem_matrix(&mg_src, param->src); + bmk_param.src = &mg_src; + + bmk1822_matrix_lmem_t ml_dst; + convert_lmem_matrix(&ml_dst, param->dst); + bmk_param.dst = &ml_dst; + + bmk1822_tdma_g2l_bf16_matrix_copy(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_g2l_matrix_copy_row_col_transposed( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_row_col_transposed_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_tg2l_matrix_copy_row_col_transposed_param_t bmk_param; + bmk1822_matrix_tgmem_t mg_src; + convert_gmem_matrix(&mg_src, param->src); + bmk_param.src = &mg_src; + + bmk1822_matrix_lmem_t ml_dst; + convert_lmem_matrix(&ml_dst, param->dst); + bmk_param.dst = &ml_dst; + + bmk1822_tdma_g2l_matrix_copy_row_col_transposed(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_g2l_general_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_general_copy_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_tg2l_general_copy_param_t bmk_param; + bmk_param.src_base_reg_index = param->src_base_reg_index; + bmk_param.src_address = param->src_address; + bmk_param.dst_address = param->dst_address; + bmk_param.bytes = param->bytes; + + bmk1822_tdma_g2l_general_copy(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_g2l_bf16_general_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_bf16_general_copy_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_tg2l_bf16_general_copy_param_t bmk_param; + bmk_param.src_base_reg_index = param->src_base_reg_index; + bmk_param.src_address = param->src_address; + bmk_param.dst_address = param->dst_address; + bmk_param.src_bytes = param->src_bytes; + bmk_param.src_fmt = param->src_fmt; + bmk_param.dst_fmt = param->dst_fmt; + + bmk1822_tdma_g2l_bf16_general_copy(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_g2g_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t
*)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_tg2tg_tensor_copy_param_t bmk_param; + bmk1822_tensor_tgmem_t tg_src; + convert_gmem_tensor(&tg_src, param->src); + bmk_param.src = &tg_src; + + bmk1822_tensor_tgmem_t tg_dst; + convert_gmem_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1822_tdma_tg2tg_tensor_copy(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_g2g_general_copy( + cvk_context_t *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_tg2tg_tensor_copy_param_t bmk_param; + bmk1822_tensor_tgmem_t tg_src; + convert_gmem_tensor(&tg_src, param->src); + bmk_param.src = &tg_src; + + bmk1822_tensor_tgmem_t tg_dst; + convert_gmem_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1822_tdma_tg2tg_general_copy(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_g2g_bf16_general_copy( + cvk_context_t *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_tg2tg_tensor_copy_param_t bmk_param; + bmk1822_tensor_tgmem_t tg_src; + convert_gmem_tensor(&tg_src, param->src); + bmk_param.src = &tg_src; + + bmk1822_tensor_tgmem_t tg_dst; + convert_gmem_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1822_tdma_tg2tg_bf16_general_copy(bmk_ctx, &bmk_param); +} + +void cvk1822_tdma_g2g_bf16_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tdma_tg2tg_tensor_copy_param_t bmk_param; + bmk1822_tensor_tgmem_t tg_src; + convert_gmem_tensor(&tg_src, param->src); + bmk_param.src = &tg_src; + + bmk1822_tensor_tgmem_t tg_dst; + convert_gmem_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1822_tdma_tg2tg_bf16_tensor_copy(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_mul( + cvk_context_t *ctx, + const cvk_tiu_mul_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_element_wise_mul_param_t bmk_param; + bmk1822_tensor_lmem_t tl_res_high; + if (param->res_high) { + convert_lmem_tensor(&tl_res_high, param->res_high); + bmk_param.res_high = &tl_res_high; + } else { + bmk_param.res_high = NULL; + } + + bmk1822_tensor_lmem_t tl_res_low; + convert_lmem_tensor(&tl_res_low, param->res_low); + bmk_param.res_low = &tl_res_low; + + bmk1822_tensor_lmem_t tl_a; + convert_lmem_tensor(&tl_a, param->a); + bmk_param.a = &tl_a; + + bmk_param.b_is_const = param->b_is_const; + + bmk1822_tensor_lmem_t tl_b; + if (!param->b_is_const) { + convert_lmem_tensor(&tl_b, param->b); + bmk_param.b = &tl_b; + } else { + bmk_param.b_const.val = param->b_const.val; + bmk_param.b_const.is_signed = param->b_const.is_signed; + } + + bmk_param.rshift_bits = param->rshift_bits; + bmk_param.relu_enable = param->relu_enable; + bmk_param.layer_id = param->layer_id; + + bmk1822_tiu_element_wise_mul(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_mul_qm( + cvk_context_t *ctx, + const cvk_tiu_mul_qm_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_element_wise_mul_qdm_param_t bmk_param; + bmk1822_tensor_lmem_t tl_res_high; + if (param->res_high) { + convert_lmem_tensor(&tl_res_high, param->res_high); + bmk_param.res_high = &tl_res_high; + } else { + bmk_param.res_high = NULL; + } + + bmk1822_tensor_lmem_t tl_res_low; + convert_lmem_tensor(&tl_res_low, param->res_low); + 
bmk_param.res_low = &tl_res_low; + + bmk1822_tensor_lmem_t tl_a; + convert_lmem_tensor(&tl_a, param->a); + bmk_param.a = &tl_a; + + bmk_param.b_is_const = param->b_is_const; + + bmk1822_tensor_lmem_t tl_b; + if (!param->b_is_const) { + convert_lmem_tensor(&tl_b, param->b); + bmk_param.b = &tl_b; + } else { + bmk_param.b_const.val = param->b_const.val; + bmk_param.b_const.is_signed = param->b_const.is_signed; + } + + bmk_param.rshift_bits = param->rshift_bits; + bmk_param.relu_enable = param->relu_enable; + bmk_param.multiplier = param->multiplier; + bmk_param.layer_id = param->layer_id; + + bmk1822_tiu_element_wise_mul_qdm(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_mac( + cvk_context_t *ctx, + const cvk_tiu_mac_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_element_wise_mac_param_t bmk_param; + bmk1822_tensor_lmem_t tl_res_high; + if (param->res_high) { + convert_lmem_tensor(&tl_res_high, param->res_high); + bmk_param.res_high = &tl_res_high; + } else { + bmk_param.res_high = NULL; + } + + bmk1822_tensor_lmem_t tl_res_low; + convert_lmem_tensor(&tl_res_low, param->res_low); + bmk_param.res_low = &tl_res_low; + + bmk1822_tensor_lmem_t tl_a; + convert_lmem_tensor(&tl_a, param->a); + bmk_param.a = &tl_a; + + bmk_param.b_is_const = param->b_is_const; + + bmk1822_tensor_lmem_t tl_b; + if (!param->b_is_const) { + convert_lmem_tensor(&tl_b, param->b); + bmk_param.b = &tl_b; + } else { + bmk_param.b_const.val = param->b_const.val; + bmk_param.b_const.is_signed = param->b_const.is_signed; + } + + bmk_param.res_is_int8 = param->res_is_int8; + bmk_param.relu_enable = param->relu_enable; + bmk_param.lshift_bits = param->lshift_bits; + bmk_param.rshift_bits = param->rshift_bits; + bmk_param.layer_id = param->layer_id; + + bmk1822_tiu_element_wise_mac(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_add( + cvk_context_t *ctx, + const cvk_tiu_add_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_element_wise_add_param_t bmk_param; + bmk1822_tensor_lmem_t tl_res_high; + if (param->res_high) { + convert_lmem_tensor(&tl_res_high, param->res_high); + bmk_param.res_high = &tl_res_high; + } else { + bmk_param.res_high = NULL; + } + + bmk1822_tensor_lmem_t tl_res_low; + convert_lmem_tensor(&tl_res_low, param->res_low); + bmk_param.res_low = &tl_res_low; + + bmk1822_tensor_lmem_t tl_a_high; + if (param->a_high) { + convert_lmem_tensor(&tl_a_high, param->a_high); + bmk_param.a_high = &tl_a_high; + } else { + bmk_param.a_high = NULL; + } + + bmk1822_tensor_lmem_t tl_a_low; + convert_lmem_tensor(&tl_a_low, param->a_low); + bmk_param.a_low = &tl_a_low; + + bmk_param.b_is_const = param->b_is_const; + + bmk1822_tensor_lmem_t tl_b_high; + bmk1822_tensor_lmem_t tl_b_low; + if (!param->b_is_const) { + if (param->b.high) { + convert_lmem_tensor(&tl_b_high, param->b.high); + bmk_param.b_high = &tl_b_high; + } else { + bmk_param.b_high = NULL; + } + + convert_lmem_tensor(&tl_b_low, param->b.low); + bmk_param.b_low = &tl_b_low; + } else { + bmk_param.b_const.val = param->b_const.val; + bmk_param.b_const.is_signed = param->b_const.is_signed; + } + + bmk_param.rshift_bits = param->rshift_bits; + bmk_param.relu_enable = param->relu_enable; + bmk_param.layer_id = param->layer_id; + + bmk1822_tiu_element_wise_add(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_sub( + cvk_context_t *ctx, + const cvk_tiu_sub_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + 
bmk1822_tiu_element_wise_sub_param_t bmk_param; + bmk1822_tensor_lmem_t tl_res_high; + if (param->res_high) { + convert_lmem_tensor(&tl_res_high, param->res_high); + bmk_param.res_high = &tl_res_high; + } else { + bmk_param.res_high = NULL; + } + + bmk1822_tensor_lmem_t tl_res_low; + convert_lmem_tensor(&tl_res_low, param->res_low); + bmk_param.res_low = &tl_res_low; + + bmk1822_tensor_lmem_t tl_a_high; + if (param->a_high) { + convert_lmem_tensor(&tl_a_high, param->a_high); + bmk_param.a_high = &tl_a_high; + } else { + bmk_param.a_high = NULL; + } + + bmk1822_tensor_lmem_t tl_a_low; + convert_lmem_tensor(&tl_a_low, param->a_low); + bmk_param.a_low = &tl_a_low; + + bmk1822_tensor_lmem_t tl_b_high; + if (param->b_high) { + convert_lmem_tensor(&tl_b_high, param->b_high); + bmk_param.b_high = &tl_b_high; + } else { + bmk_param.b_high = NULL; + } + + bmk1822_tensor_lmem_t tl_b_low; + convert_lmem_tensor(&tl_b_low, param->b_low); + bmk_param.b_low = &tl_b_low; + + bmk_param.rshift_bits = param->rshift_bits; + bmk_param.layer_id = param->layer_id; + + bmk1822_tiu_element_wise_sub(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_max( + cvk_context_t *ctx, + const cvk_tiu_max_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_element_wise_max_param_t bmk_param; + bmk1822_tensor_lmem_t tl_max; + convert_lmem_tensor(&tl_max, param->max); + bmk_param.max = &tl_max; + + bmk1822_tensor_lmem_t tl_a; + convert_lmem_tensor(&tl_a, param->a); + bmk_param.a = &tl_a; + + bmk_param.b_is_const = param->b_is_const; + bmk1822_tensor_lmem_t tl_b; + if (!param->b_is_const) { + convert_lmem_tensor(&tl_b, param->b); + bmk_param.b = &tl_b; + } else { + bmk_param.b_const.val = param->b_const.val; + bmk_param.b_const.is_signed = param->b_const.is_signed; + } + + bmk_param.layer_id = param->layer_id; + bmk1822_tiu_element_wise_max(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_min( + cvk_context_t *ctx, + const cvk_tiu_min_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_element_wise_min_param_t bmk_param; + bmk1822_tensor_lmem_t tl_min; + convert_lmem_tensor(&tl_min, param->min); + bmk_param.min = &tl_min; + + bmk1822_tensor_lmem_t tl_a; + convert_lmem_tensor(&tl_a, param->a); + bmk_param.a = &tl_a; + + bmk_param.b_is_const = param->b_is_const; + bmk1822_tensor_lmem_t tl_b; + if (!param->b_is_const) { + convert_lmem_tensor(&tl_b, param->b); + bmk_param.b = &tl_b; + } else { + bmk_param.b_const.val = param->b_const.val; + bmk_param.b_const.is_signed = param->b_const.is_signed; + } + + bmk_param.layer_id = param->layer_id; + + bmk1822_tiu_element_wise_min(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_and_int8( + cvk_context_t *ctx, + const cvk_tiu_and_int8_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_element_wise_and_int8_param_t bmk_param; + bmk1822_tensor_lmem_t tl_res; + convert_lmem_tensor(&tl_res, param->res); + bmk_param.res = &tl_res; + + bmk1822_tensor_lmem_t tl_a; + convert_lmem_tensor(&tl_a, param->a); + bmk_param.a = &tl_a; + + bmk1822_tensor_lmem_t tl_b; + convert_lmem_tensor(&tl_b, param->b); + bmk_param.b = &tl_b; + + bmk_param.layer_id = param->layer_id; + + bmk1822_tiu_element_wise_and_int8(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_arith_shift( + cvk_context_t *ctx, + const cvk_tiu_arith_shift_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + 
bmk1822_tiu_element_wise_arith_shift_param_t bmk_param; + bmk1822_tensor_lmem_t tl_res_high; + if (param->res_high) { + convert_lmem_tensor(&tl_res_high, param->res_high); + bmk_param.res_high = &tl_res_high; + } else { + bmk_param.res_high = NULL; + } + + bmk1822_tensor_lmem_t tl_res_low; + convert_lmem_tensor(&tl_res_low, param->res_low); + bmk_param.res_low = &tl_res_low; + + bmk1822_tensor_lmem_t tl_a_high; + if (param->a_high) { + convert_lmem_tensor(&tl_a_high, param->a_high); + bmk_param.a_high = &tl_a_high; + } else { + bmk_param.a_high = NULL; + } + + bmk1822_tensor_lmem_t tl_a_low; + convert_lmem_tensor(&tl_a_low, param->a_low); + bmk_param.a_low = &tl_a_low; + + bmk1822_tensor_lmem_t tl_bits; + convert_lmem_tensor(&tl_bits, param->bits); + bmk_param.bits = &tl_bits; + + bmk1822_tiu_element_wise_arith_shift(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_and_int16( + cvk_context_t *ctx, + const cvk_tiu_and_int16_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_element_wise_and_int16_param_t bmk_param; + bmk1822_tensor_lmem_t tl_res_high; + if (param->res_high) { + convert_lmem_tensor(&tl_res_high, param->res_high); + bmk_param.res_high = &tl_res_high; + } else { + bmk_param.res_high = NULL; + } + + bmk1822_tensor_lmem_t tl_res_low; + convert_lmem_tensor(&tl_res_low, param->res_low); + bmk_param.res_low = &tl_res_low; + + bmk1822_tensor_lmem_t tl_a_high; + if (param->a_high) { + convert_lmem_tensor(&tl_a_high, param->a_high); + bmk_param.a_high = &tl_a_high; + } else { + bmk_param.a_high = NULL; + } + + bmk1822_tensor_lmem_t tl_a_low; + convert_lmem_tensor(&tl_a_low, param->a_low); + bmk_param.a_low = &tl_a_low; + + bmk1822_tensor_lmem_t tl_b_high; + if (param->b_high) { + convert_lmem_tensor(&tl_b_high, param->b_high); + bmk_param.b_high = &tl_b_high; + } else { + bmk_param.b_high = NULL; + } + + bmk1822_tensor_lmem_t tl_b_low; + convert_lmem_tensor(&tl_b_low, param->b_low); + bmk_param.b_low = &tl_b_low; + + bmk1822_tiu_element_wise_and_int16(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_or_int8( + cvk_context_t *ctx, + const cvk_tiu_or_int8_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_element_wise_or_int8_param_t bmk_param; + bmk1822_tensor_lmem_t tl_res; + convert_lmem_tensor(&tl_res, param->res); + bmk_param.res = &tl_res; + + bmk1822_tensor_lmem_t tl_a; + convert_lmem_tensor(&tl_a, param->a); + bmk_param.a = &tl_a; + + bmk1822_tensor_lmem_t tl_b; + convert_lmem_tensor(&tl_b, param->b); + bmk_param.b = &tl_b; + + bmk_param.layer_id = param->layer_id; + + bmk1822_tiu_element_wise_or_int8(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_or_int16( + cvk_context_t *ctx, + const cvk_tiu_or_int16_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_element_wise_or_int16_param_t bmk_param; + bmk1822_tensor_lmem_t tl_res_high; + if (param->res_high) { + convert_lmem_tensor(&tl_res_high, param->res_high); + bmk_param.res_high = &tl_res_high; + } else { + bmk_param.res_high = NULL; + } + + bmk1822_tensor_lmem_t tl_res_low; + convert_lmem_tensor(&tl_res_low, param->res_low); + bmk_param.res_low = &tl_res_low; + + bmk1822_tensor_lmem_t tl_a_high; + if (param->a_high) { + convert_lmem_tensor(&tl_a_high, param->a_high); + bmk_param.a_high = &tl_a_high; + } else { + bmk_param.a_high = NULL; + } + + bmk1822_tensor_lmem_t tl_a_low; + convert_lmem_tensor(&tl_a_low, param->a_low); + bmk_param.a_low = &tl_a_low; + 
+ bmk1822_tensor_lmem_t tl_b_high; + if (param->b_high) { + convert_lmem_tensor(&tl_b_high, param->b_high); + bmk_param.b_high = &tl_b_high; + } else { + bmk_param.b_high = NULL; + } + + bmk1822_tensor_lmem_t tl_b_low; + convert_lmem_tensor(&tl_b_low, param->b_low); + bmk_param.b_low = &tl_b_low; + + bmk1822_tiu_element_wise_or_int16(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_xor_int8( + cvk_context_t *ctx, + const cvk_tiu_xor_int8_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_element_wise_xor_int8_param_t bmk_param; + bmk1822_tensor_lmem_t tl_res; + convert_lmem_tensor(&tl_res, param->res); + bmk_param.res = &tl_res; + + bmk1822_tensor_lmem_t tl_a; + convert_lmem_tensor(&tl_a, param->a); + bmk_param.a = &tl_a; + + bmk1822_tensor_lmem_t tl_b; + convert_lmem_tensor(&tl_b, param->b); + bmk_param.b = &tl_b; + + bmk_param.layer_id = param->layer_id; + + bmk1822_tiu_element_wise_xor_int8(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_xor_int16( + cvk_context_t *ctx, + const cvk_tiu_xor_int16_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_element_wise_xor_int16_param_t bmk_param; + bmk1822_tensor_lmem_t tl_res_high; + if (param->res_high) { + convert_lmem_tensor(&tl_res_high, param->res_high); + bmk_param.res_high = &tl_res_high; + } else { + bmk_param.res_high = NULL; + } + + bmk1822_tensor_lmem_t tl_res_low; + convert_lmem_tensor(&tl_res_low, param->res_low); + bmk_param.res_low = &tl_res_low; + + bmk1822_tensor_lmem_t tl_a_high; + if (param->a_high) { + convert_lmem_tensor(&tl_a_high, param->a_high); + bmk_param.a_high = &tl_a_high; + } else { + bmk_param.a_high = NULL; + } + + bmk1822_tensor_lmem_t tl_a_low; + convert_lmem_tensor(&tl_a_low, param->a_low); + bmk_param.a_low = &tl_a_low; + + bmk1822_tensor_lmem_t tl_b_high; + if (param->b_high) { + convert_lmem_tensor(&tl_b_high, param->b_high); + bmk_param.b_high = &tl_b_high; + } else { + bmk_param.b_high = NULL; + } + + bmk1822_tensor_lmem_t tl_b_low; + convert_lmem_tensor(&tl_b_low, param->b_low); + bmk_param.b_low = &tl_b_low; + + bmk1822_tiu_element_wise_xor_int16(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_copy( + cvk_context_t *ctx, + const cvk_tiu_copy_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_element_wise_copy_param_t bmk_param; + bmk1822_tensor_lmem_t tl_src; + convert_lmem_tensor(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1822_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk_param.layer_id = param->layer_id; + + bmk1822_tiu_element_wise_copy(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_lookup_table( + cvk_context_t *ctx, + const cvk_tiu_lookup_table_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_lookup_table_param_t bmk_param; + bmk1822_tensor_lmem_t tl_ofmap; + convert_lmem_tensor(&tl_ofmap, param->ofmap); + bmk_param.ofmap = &tl_ofmap; + + bmk1822_tensor_lmem_t tl_ifmap; + convert_lmem_tensor(&tl_ifmap, param->ifmap); + bmk_param.ifmap = &tl_ifmap; + + bmk1822_tensor_lmem_t tl_table; + convert_lmem_tensor(&tl_table, param->table); + bmk_param.table = &tl_table; + + bmk_param.layer_id = param->layer_id; + + bmk1822_tiu_lookup_table(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_bf16_lookup_interp_table( + cvk_context_t *ctx, + const cvk_tiu_bf16_lookup_interp_table_param_t *param) +{ + bmk1822_context_t 
*bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tensor_lmem_t ifmap; + convert_lmem_tensor(&ifmap, param->ifmap); + + bmk1822_tensor_lmem_t buf; + convert_lmem_tensor(&buf, param->buf); + + bmk1822_tensor_lmem_t tbl_answer; + convert_lmem_tensor(&tbl_answer, param->tbl_answer); + + bmk1822_tensor_lmem_t tbl_answer_mantissa; + convert_lmem_tensor(&tbl_answer_mantissa, param->tbl_answer_mantissa); + + bmk1822_tensor_lmem_t ofmap; + convert_lmem_tensor(&ofmap, param->ofmap); + + if (param->is_scientific) { + // issue lut cmd + bmk1822_tdma_l2l_tensor_copy_param_t p10; + // remove low 8 bits by int8 copy with stride + // get index(pow) + memset(&p10, 0x00, sizeof(bmk1822_tdma_l2l_tensor_copy_param_t)); + p10.dst = &ofmap; + p10.src = &ifmap; + p10.mv_lut_base = false; // MUST init by ifself in soc + p10.mv_lut_idx = true; + bmk1822_tdma_l2l_bf16_tensor_copy(bmk_ctx, &p10); + p10.mv_lut_idx = false; + + // get f(x0) = 2^(x0*-0.5) + bmk1822_tiu_lookup_table_param_t p12; + p12.ofmap = &ofmap; + p12.ifmap = &ofmap; + p12.table = &tbl_answer; + p12.layer_id = param->layer_id; + bmk1822_tiu_lookup_table(bmk_ctx, &p12); + + // get mantissa value + p12.ofmap = &buf; + p12.ifmap = &ifmap; + p12.table = &tbl_answer_mantissa; + bmk1822_tiu_lookup_table(bmk_ctx, &p12); + + // (2^exp) * mantissa + bmk1822_tiu_element_wise_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = &ofmap; + p1.a = &ofmap; + p1.b_is_const = 0; + p1.b = &buf; + p1.rshift_bits = 0; + p1.relu_enable = 0; + p1.layer_id = param->layer_id; + bmk1822_tiu_element_wise_mul(bmk_ctx, &p1); + } + else { + // duplicate from cvikernel_1880v2.c + const cvk_tl_t *tl_ifmap = param->ifmap; + const cvk_tl_t *tl_ofmap_slope = param->buf; + const cvk_tl_t *tl_table_answer = param->tbl_answer; + const cvk_tl_t *tl_table_answer_slope = param->tbl_answer_mantissa; + const cvk_tl_t *tl_ofmap_y0 = param->ofmap; + float min = param->min; + float max = param->max; + float scale = 256 / (max - min); // 256 means hw support lut index size + uint8_t eu_align = param->eu_align; + cvk_fmt_t fmt = CVK_FMT_BF16; + + cvk_tl_shape_t tl_ofmap_x0_int8_shape = { + 1, tl_ifmap->shape.c, tl_ifmap->shape.h * tl_ifmap->shape.w, 1}; + + // filter y = max(range_min, x) + cvk_tiu_max_param_t p1 = {0}; + p1.max = tl_ifmap; + p1.a = tl_ifmap; + p1.b_is_const = 1; + p1.b_const.is_signed = 1; + p1.b_const.val = ctx->misc_ops->float_to_bfloat16(ctx, min); + p1.layer_id = param->layer_id; + ctx->ops->tiu_max(ctx, &p1); + + // filter y = min(8, x) + cvk_tiu_min_param_t p2 = {0}; + p2.min = tl_ifmap; + p2.a = tl_ifmap; + p2.b_is_const = 1; + p2.b_const.val = ctx->misc_ops->float_to_bfloat16(ctx, max - 1 / scale); // corner + p2.b_const.is_signed = 1; + p2.layer_id = param->layer_id; + ctx->ops->tiu_min(ctx, &p2); + + cvk_tdma_l2l_tensor_copy_param_t p3 = {0}; + // scale input for remap its idx(-x~x) to (-127~127), dirty tl_ifmap + cvk_tiu_mul_param_t p4 = {0}; + p4.res_high = NULL; + p4.res_low = tl_ifmap; + p4.a = tl_ifmap; + p4.b_is_const = 1; + p4.b_const.val = ctx->misc_ops->float_to_bfloat16(ctx, scale); + p4.rshift_bits = 0; + p4.relu_enable = 0; + p4.layer_id = param->layer_id; + ctx->ops->tiu_mul(ctx, &p4); + + // int8 + memset(&p3, 0x00, sizeof(cvk_tdma_l2l_tensor_copy_param_t)); + cvk_tl_t dst; + memcpy(&dst, tl_ofmap_y0, sizeof(cvk_tl_t)); + + dst.shape = tl_ofmap_x0_int8_shape; + dst.fmt = CVK_FMT_I8; + dst.stride = + ctx->ops->tl_default_stride(ctx, tl_ofmap_x0_int8_shape, CVK_FMT_I8, eu_align); + dst.stride.h = dst.stride.h * 2; + dst.int8_rnd_mode 
= 1; + p3.dst = &dst; + p3.src = tl_ifmap; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p3); + dst.int8_rnd_mode = 0; // reset + + // ops->tdma_l2l_bf16_tensor_copy(ctx, &p3); + + // ops->tiu_sub(ctx, &p5); + + // get f(x0) and slope(x) + // reshape, 16->16 + dst.fmt = fmt; + dst.shape = tl_ofmap_slope->shape; + dst.stride = tl_ofmap_slope->stride; + + // layer_id; + ctx->ops->tiu_lookup_table(ctx, &p6); + + // base f(x0) + memset(&p6, 0x0, sizeof(cvk_tiu_lookup_table_param_t)); + p6.ofmap = tl_ofmap_y0; + p6.ifmap = &dst; + p6.table = tl_table_answer; + p6.layer_id = param->layer_id; + ctx->ops->tiu_lookup_table(ctx, &p6); + + // layer_id; + ctx->ops->tiu_mac(ctx, &p7); + + } +} + +void cvk1822_tiu_pt_convolution( + cvk_context_t *ctx, + const cvk_tiu_pt_convolution_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_convolution_param_t bmk_param; + bmk1822_tensor_lmem_t tl_ofmap; + convert_lmem_tensor(&tl_ofmap, param->ofmap); + bmk_param.ofmap = &tl_ofmap; + + bmk1822_tensor_lmem_t tl_ifmap; + convert_lmem_tensor(&tl_ifmap, param->ifmap); + bmk_param.ifmap = &tl_ifmap; + + bmk1822_tensor_lmem_t tl_weight; + convert_lmem_tensor(&tl_weight, param->weight); + bmk_param.weight = &tl_weight; + + bmk1822_tensor_lmem_t tl_bias; + if (param->bias) { + convert_lmem_tensor(&tl_bias, param->bias); + bmk_param.bias = &tl_bias; + } else { + bmk_param.bias = NULL; + } + + bmk_param.ins_h = param->ins_h; + bmk_param.ins_last_h = param->ins_last_h; + bmk_param.ins_w = param->ins_w; + bmk_param.ins_last_w = param->ins_last_w; + bmk_param.pad_top = param->pad_top; + bmk_param.pad_bottom = param->pad_bottom; + bmk_param.pad_left = param->pad_left; + bmk_param.pad_right = param->pad_right; + bmk_param.stride_h = param->stride_h; + bmk_param.stride_w = param->stride_w; + bmk_param.dilation_h = param->dilation_h; + bmk_param.dilation_w = param->dilation_w; + bmk_param.relu_enable = param->relu_enable; + bmk_param.rshift_bits = param->rshift_bits; + bmk_param.ps32_mode = param->ps32_mode; + bmk_param.w_is_const = param->w_is_const; + bmk_param.layer_id = param->layer_id; + bmk_param.fp_round_typ = param->fp_round_typ; + bmk_param.cmd_pre_exe_typ = param->cmd_pre_exe_typ; + bmk_param.cmd_pre_exe = param->cmd_pre_exe; + bmk_param.ins_val = param->ins_val; + bmk_param.ins_fp = param->ins_fp; + + bmk1822_tiu_convolution(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_convolution( + cvk_context_t *ctx, + const cvk_tiu_convolution_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_convolution_qdm_param_t bmk_param; + bmk1822_tensor_lmem_t tl_ofmap; + convert_lmem_tensor(&tl_ofmap, param->ofmap); + bmk_param.ofmap = &tl_ofmap; + + bmk1822_tensor_lmem_t tl_ifmap; + convert_lmem_tensor(&tl_ifmap, param->ifmap); + bmk_param.ifmap = &tl_ifmap; + + bmk1822_tensor_lmem_t tl_weight; + convert_lmem_tensor(&tl_weight, param->weight); + bmk_param.weight = &tl_weight; + + bmk1822_tensor_lmem_t tl_chl_quan_param; + if (param->chl_quan_param) { + convert_lmem_tensor(&tl_chl_quan_param, param->chl_quan_param); + bmk_param.chl_quan_param = &tl_chl_quan_param; + } else { + bmk_param.chl_quan_param = NULL; + } + + bmk_param.ins_h = param->ins_h; + bmk_param.ins_last_h = param->ins_last_h; + bmk_param.ins_w = param->ins_w; + bmk_param.ins_last_w = param->ins_last_w; + bmk_param.pad_top = param->pad_top; + bmk_param.pad_bottom = param->pad_bottom; + bmk_param.pad_left = param->pad_left; + bmk_param.pad_right = 
param->pad_right; + bmk_param.stride_h = param->stride_h; + bmk_param.stride_w = param->stride_w; + bmk_param.dilation_h = param->dilation_h; + bmk_param.dilation_w = param->dilation_w; + bmk_param.has_bias = param->has_bias; + bmk_param.relu_enable = param->relu_enable; + bmk_param.ps32_mode = param->ps32_mode; + bmk_param.w_is_const = param->w_is_const; + bmk_param.layer_id = param->layer_id; + bmk_param.cmd_pre_exe_typ = param->cmd_pre_exe_typ; + bmk_param.cmd_pre_exe = param->cmd_pre_exe; + bmk_param.ins_val = param->ins_val; + bmk_param.ins_fp = param->ins_fp; + + bmk1822_tiu_convolution_qdm(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_max_pooling( + cvk_context_t *ctx, + const cvk_tiu_max_pooling_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_max_pooling_param_t bmk_param; + bmk1822_tensor_lmem_t tl_ofmap; + convert_lmem_tensor(&tl_ofmap, param->ofmap); + bmk_param.ofmap = &tl_ofmap; + + bmk1822_tensor_lmem_t tl_ifmap; + convert_lmem_tensor(&tl_ifmap, param->ifmap); + bmk_param.ifmap = &tl_ifmap; + + bmk_param.kh = param->kh; + bmk_param.kw = param->kw; + bmk_param.pad_top = param->pad_top; + bmk_param.pad_bottom = param->pad_bottom; + bmk_param.pad_left = param->pad_left; + bmk_param.pad_right = param->pad_right; + bmk_param.stride_h = param->stride_h; + bmk_param.stride_w = param->stride_w; + bmk_param.ins_val = param->ins_val; + bmk_param.ins_fp = param->ins_fp; + bmk_param.layer_id = param->layer_id; + + bmk1822_tiu_max_pooling(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_average_pooling( + cvk_context_t *ctx, + const cvk_tiu_average_pooling_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_average_pooling_param_t bmk_param; + bmk1822_tensor_lmem_t tl_ofmap; + convert_lmem_tensor(&tl_ofmap, param->ofmap); + bmk_param.ofmap = &tl_ofmap; + + bmk1822_tensor_lmem_t tl_ifmap; + convert_lmem_tensor(&tl_ifmap, param->ifmap); + bmk_param.ifmap = &tl_ifmap; + + bmk_param.kh = param->kh; + bmk_param.kw = param->kw; + bmk_param.ins_h = param->ins_h; + bmk_param.ins_last_h = param->ins_last_h; + bmk_param.ins_w = param->ins_w; + bmk_param.ins_last_w = param->ins_last_w; + bmk_param.pad_top = param->pad_top; + bmk_param.pad_bottom = param->pad_bottom; + bmk_param.pad_left = param->pad_left; + bmk_param.pad_right = param->pad_right; + bmk_param.stride_h = param->stride_h; + bmk_param.stride_w = param->stride_w; + bmk_param.avg_pooling_const = param->avg_pooling_const; + bmk_param.rshift_bits = param->rshift_bits; + bmk_param.layer_id = param->layer_id; + bmk_param.ins_val = param->ins_val; + bmk_param.ins_fp = param->ins_fp; + + bmk1822_tiu_average_pooling(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_pt_depthwise_convolution( + cvk_context_t *ctx, + const cvk_tiu_depthwise_pt_convolution_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_depthwise_convolution_param_t bmk_param; + bmk1822_tensor_lmem_t tl_ofmap; + convert_lmem_tensor(&tl_ofmap, param->ofmap); + bmk_param.ofmap = &tl_ofmap; + + bmk1822_tensor_lmem_t tl_ifmap; + convert_lmem_tensor(&tl_ifmap, param->ifmap); + bmk_param.ifmap = &tl_ifmap; + + bmk1822_tensor_lmem_t tl_weight; + convert_lmem_tensor(&tl_weight, param->weight); + bmk_param.weight = &tl_weight; + + bmk1822_tensor_lmem_t tl_bias; + if (param->bias) { + convert_lmem_tensor(&tl_bias, param->bias); + bmk_param.bias = &tl_bias; + } else { + bmk_param.bias = NULL; + } + + bmk_param.ins_h 
= param->ins_h; + bmk_param.ins_last_h = param->ins_last_h; + bmk_param.ins_w = param->ins_w; + bmk_param.ins_last_w = param->ins_last_w; + bmk_param.pad_top = param->pad_top; + bmk_param.pad_bottom = param->pad_bottom; + bmk_param.pad_left = param->pad_left; + bmk_param.pad_right = param->pad_right; + bmk_param.stride_h = param->stride_h; + bmk_param.stride_w = param->stride_w; + bmk_param.dilation_h = param->dilation_h; + bmk_param.dilation_w = param->dilation_w; + bmk_param.relu_enable = param->relu_enable; + bmk_param.rshift_bits = param->rshift_bits; + bmk_param.layer_id = param->layer_id; + bmk_param.cmd_pre_exe_typ = param->cmd_pre_exe_typ; + bmk_param.cmd_pre_exe = param->cmd_pre_exe; + bmk_param.ins_val = param->ins_val; + bmk_param.ins_fp = param->ins_fp; + bmk_param.weight_is_const = param->weight_is_const; + bmk_param.weight_const.is_signed = param->weight_const.is_signed; + bmk_param.weight_const.val = param->weight_const.val; + + bmk1822_tiu_depthwise_convolution(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_depthwise_convolution( + cvk_context_t *ctx, + const cvk_tiu_depthwise_convolution_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_depthwise_convolution_qdm_param_t bmk_param; + bmk1822_tensor_lmem_t tl_ofmap; + convert_lmem_tensor(&tl_ofmap, param->ofmap); + bmk_param.ofmap = &tl_ofmap; + + bmk1822_tensor_lmem_t tl_ifmap; + convert_lmem_tensor(&tl_ifmap, param->ifmap); + bmk_param.ifmap = &tl_ifmap; + + bmk1822_tensor_lmem_t tl_weight; + convert_lmem_tensor(&tl_weight, param->weight); + bmk_param.weight = &tl_weight; + + bmk1822_tensor_lmem_t tl_chl_quan_param; + convert_lmem_tensor(&tl_chl_quan_param, param->chl_quan_param); + bmk_param.chl_quan_param = &tl_chl_quan_param; + + bmk_param.ins_h = param->ins_h; + bmk_param.ins_last_h = param->ins_last_h; + bmk_param.ins_w = param->ins_w; + bmk_param.ins_last_w = param->ins_last_w; + bmk_param.pad_top = param->pad_top; + bmk_param.pad_bottom = param->pad_bottom; + bmk_param.pad_left = param->pad_left; + bmk_param.pad_right = param->pad_right; + bmk_param.stride_h = param->stride_h; + bmk_param.stride_w = param->stride_w; + bmk_param.dilation_h = param->dilation_h; + bmk_param.dilation_w = param->dilation_w; + bmk_param.has_bias = param->has_bias; + bmk_param.relu_enable = param->relu_enable; + bmk_param.layer_id = param->layer_id; + bmk_param.cmd_pre_exe_typ = param->cmd_pre_exe_typ; + bmk_param.cmd_pre_exe = param->cmd_pre_exe; + bmk_param.ins_val = param->ins_val; + bmk_param.ins_fp = param->ins_fp; + bmk_param.weight_is_const = param->weight_is_const; + bmk_param.weight_const.is_signed = param->weight_const.is_signed; + bmk_param.weight_const.val = param->weight_const.val; + + bmk1822_tiu_depthwise_convolution_qdm(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_matrix_multiplication( + cvk_context_t *ctx, + const cvk_tiu_matrix_multiplication_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_matrix_multiplication_param_t bmk_param; + bmk1822_matrix_lmem_t ml_res; + convert_lmem_matrix(&ml_res, param->res); + bmk_param.res = &ml_res; + + bmk1822_matrix_lmem_t ml_left; + convert_lmem_matrix(&ml_left, param->left); + bmk_param.left = &ml_left; + + bmk1822_matrix_lmem_t ml_right; + convert_lmem_matrix(&ml_right, param->right); + bmk_param.right = &ml_right; + + bmk1822_matrix_lmem_t ml_bias; + if (param->bias) { + convert_lmem_matrix(&ml_bias, param->bias); + bmk_param.bias = &ml_bias; + } else { + 
bmk_param.bias = NULL; + } + + bmk_param.lshift_bits = param->lshift_bits; + bmk_param.rshift_bits = param->rshift_bits; + bmk_param.res_is_int8 = param->res_is_int8; + bmk_param.relu_enable = param->relu_enable; + bmk_param.add_result = param->add_result; + bmk_param.ps32_mode = param->ps32_mode; + bmk_param.layer_id = param->layer_id; + + bmk1822_tiu_matrix_multiplication(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_matrix_multiplication_qm( + cvk_context_t *ctx, + const cvk_tiu_matrix_multiplication_qm_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_matrix_multiplication_qdm_param_t bmk_param; + bmk1822_matrix_lmem_t ml_res; + convert_lmem_matrix(&ml_res, param->res); + bmk_param.res = &ml_res; + + bmk1822_matrix_lmem_t ml_left; + convert_lmem_matrix(&ml_left, param->left); + bmk_param.left = &ml_left; + + bmk1822_matrix_lmem_t ml_right; + convert_lmem_matrix(&ml_right, param->right); + bmk_param.right = &ml_right; + + bmk1822_matrix_lmem_t ml_bias; + if (param->bias) { + convert_lmem_matrix(&ml_bias, param->bias); + bmk_param.bias = &ml_bias; + } else { + bmk_param.bias = NULL; + } + + bmk_param.lshift_bits = param->lshift_bits; + bmk_param.rshift_bits = param->rshift_bits; + bmk_param.res_is_int8 = param->res_is_int8; + bmk_param.relu_enable = param->relu_enable; + bmk_param.add_result = param->add_result; + bmk_param.ps32_mode = param->ps32_mode; + bmk_param.quan_m = param->quan_m; + bmk_param.layer_id = param->layer_id; + bmk1822_tiu_matrix_multiplication_qdm(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_ge( + cvk_context_t *ctx, + const cvk_tiu_ge_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_element_wise_ge_param_t bmk_param; + bmk1822_tensor_lmem_t tl_ge; + convert_lmem_tensor(&tl_ge, param->ge); + bmk_param.ge = &tl_ge; + + bmk1822_tensor_lmem_t tl_a; + convert_lmem_tensor(&tl_a, param->a); + bmk_param.a = &tl_a; + + bmk_param.b_is_const = param->b_is_const; + bmk1822_tensor_lmem_t tl_b; + if (!bmk_param.b_is_const) { + convert_lmem_tensor(&tl_b, param->b); + bmk_param.b = &tl_b; + } else { + bmk_param.b_const.val = param->b_const.val; + bmk_param.b_const.is_signed = param->b_const.is_signed; + } + + bmk_param.layer_id = param->layer_id; + + bmk1822_tiu_element_wise_ge(bmk_ctx, &bmk_param); +} + +void cvk1822_tiu_min_pooling( + struct cvikernel_context *ctx, + const cvk_tiu_min_pooling_param_t *param) +{ + bmk1822_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1822_tiu_min_pooling_param_t bmk_param; + bmk1822_tensor_lmem_t tl_ofmap; + convert_lmem_tensor(&tl_ofmap, param->ofmap); + bmk_param.ofmap = &tl_ofmap; + + bmk1822_tensor_lmem_t tl_ifmap; + convert_lmem_tensor(&tl_ifmap, param->ifmap); + bmk_param.ifmap = &tl_ifmap; + + bmk_param.kh = param->kh; + bmk_param.kw = param->kw; + bmk_param.pad_top = param->pad_top; + bmk_param.pad_bottom = param->pad_bottom; + bmk_param.pad_left = param->pad_left; + bmk_param.pad_right = param->pad_right; + bmk_param.stride_h = param->stride_h; + bmk_param.stride_w = param->stride_w; + bmk_param.ins_fp = param->ins_fp; + bmk_param.layer_id = param->layer_id; + + bmk1822_tiu_min_pooling(bmk_ctx, &bmk_param); +} + +uint16_t cvk1822_float_to_bfloat16( + cvk_context_t *ctx, + float data) +{ + (void)ctx; + + return convert_fp32_bf16(data); +} + +void cvk1822_bf16_table_shape( + cvk_context_t *ctx, + cvk_tl_shape_t *shape) +{ + if (!ctx || !shape) + return; + + shape->n = 1; + shape->c = 
ctx->info.npu_num; + shape->h = 32; // hard-coded in 1880v2 + shape->w = 8; // hard-coded in 1822 +} + +static cvk_operations_t cvikernel_1822_ops = { + .cleanup = cvk1822_cleanup, + .reset = cvk1822_reset, + .acquire_cmdbuf = cvk1822_acquire_cmdbuf, + .dmabuf_size = bmk1822_dmabuf_size, + .dmabuf_convert = bmk1822_dmabuf_convert, + .set_layer_id = cvk1822_set_layer_id, + .parallel_enable = cvk1822_parallel_enable, + .parallel_disable = cvk1822_parallel_disable, + .lmem_alloc_tensor = cvk1822_lmem_alloc_tensor, + .lmem_alloc_matrix = cvk1822_lmem_alloc_matrix, + .lmem_alloc_ps32_matrix = cvk1822_lmem_alloc_ps32_matrix, + .lmem_free_tensor = cvk1822_lmem_free_tensor, + .lmem_free_matrix = cvk1822_lmem_free_matrix, + .lmem_init_tensor = cvk1822_lmem_init_tensor, + .lmem_init_matrix = cvk1822_lmem_init_matrix, + .tl_default_stride = cvk1822_tl_default_stride, + .tg_default_stride = cvk1822_tg_default_stride, + .ml_default_shape = cvk1822_ml_default_shape, + .ml_default_stride = cvk1822_ml_default_stride, + .ml_shape_t1 = cvk1822_ml_shape_t1, + .lmem_tensor_to_size = cvk1822_lmem_tensor_to_size, + .lmem_matrix_to_size = cvk1822_lmem_matrix_to_size, + .lmem_ps32_matrix_to_size = cvk1822_lmem_ps32_matrix_to_size, + .gmem_init_tensor = cvk1822_gmem_init_tensor, + .tdma_l2l_tensor_copy = cvk1822_tdma_l2l_tensor_copy, + .tdma_l2l_bf16_tensor_copy = cvk1822_tdma_l2l_bf16_tensor_copy, + .tdma_l2l_tensor_lrn_shift = cvk1822_tdma_l2l_tensor_lrn_shift, + .tdma_l2g_tensor_copy = cvk1822_tdma_l2g_tensor_copy, + .tdma_l2g_bf16_tensor_copy = cvk1822_tdma_l2g_bf16_tensor_copy, + .tdma_l2g_tensor_copy_nc_transposed = cvk1822_tdma_l2g_tensor_copy_nc_transposed, + .tdma_l2g_bf16_tensor_copy_nc_transposed = cvk1822_tdma_l2g_bf16_tensor_copy_nc_transposed, + .tdma_l2g_tensor_copy_compressed = cvk1822_tdma_l2g_tensor_copy_compressed, + .tdma_l2g_tensor_fill_constant = cvk1822_tdma_l2g_tensor_fill_constant, + .tdma_l2g_tensor_copy_cw_transposed = cvk1822_tdma_l2g_tensor_copy_cw_transposed, + .tdma_l2g_bf16_tensor_copy_cw_transposed = cvk1822_tdma_l2g_bf16_tensor_copy_cw_transposed, + .tdma_l2g_matrix_copy = cvk1822_tdma_l2g_matrix_copy, + .tdma_l2g_bf16_matrix_copy = cvk1822_tdma_l2g_bf16_matrix_copy, + .tdma_l2g_matrix_copy_compressed = cvk1822_tdma_l2g_matrix_copy_compressed, + .tdma_l2g_general_copy = cvk1822_tdma_l2g_general_copy, + .tdma_l2g_bf16_general_copy = cvk1822_tdma_l2g_bf16_general_copy, + .tdma_g2l_tensor_copy = cvk1822_tdma_g2l_tensor_copy, + .tdma_g2l_bf16_tensor_copy = cvk1822_tdma_g2l_bf16_tensor_copy, + .tdma_g2l_tensor_copy_nc_transposed = cvk1822_tdma_g2l_tensor_copy_nc_transposed, + .tdma_g2l_bf16_tensor_copy_nc_transposed = cvk1822_tdma_g2l_bf16_tensor_copy_nc_transposed, + .tdma_g2l_tensor_copy_chw_rotated = cvk1822_tdma_g2l_tensor_copy_chw_rotated, + .tdma_g2l_tensor_copy_decompressed = cvk1822_tdma_g2l_tensor_copy_decompressed, + .tdma_g2l_tensor_fill_constant = cvk1822_tdma_g2l_tensor_fill_constant, + .tdma_g2l_bf16_tensor_fill_constant = cvk1822_tdma_g2l_bf16_tensor_fill_constant, + .tdma_g2l_matrix_copy_decompressed = cvk1822_tdma_g2l_matrix_copy_decompressed, + .tdma_g2l_matrix_copy = cvk1822_tdma_g2l_matrix_copy, + .tdma_g2l_bf16_matrix_copy = cvk1822_tdma_g2l_bf16_matrix_copy, + .tdma_g2l_matrix_copy_row_col_transposed = cvk1822_tdma_g2l_matrix_copy_row_col_transposed, + .tdma_g2l_general_copy = cvk1822_tdma_g2l_general_copy, + .tdma_g2l_bf16_general_copy = cvk1822_tdma_g2l_bf16_general_copy, + .tdma_g2g_tensor_copy = cvk1822_tdma_g2g_tensor_copy, + .tdma_g2g_general_copy = 
cvk1822_tdma_g2g_general_copy, + .tdma_g2g_bf16_general_copy = cvk1822_tdma_g2g_bf16_general_copy, + .tdma_g2g_bf16_tensor_copy = cvk1822_tdma_g2g_bf16_tensor_copy, + .tiu_mul = cvk1822_tiu_mul, + .tiu_mul_qm = cvk1822_tiu_mul_qm, + .tiu_mac = cvk1822_tiu_mac, + .tiu_add = cvk1822_tiu_add, + .tiu_sub = cvk1822_tiu_sub, + .tiu_max = cvk1822_tiu_max, + .tiu_min = cvk1822_tiu_min, + .tiu_and_int8 = cvk1822_tiu_and_int8, + .tiu_arith_shift = cvk1822_tiu_arith_shift, + .tiu_and_int16 = cvk1822_tiu_and_int16, + .tiu_or_int8 = cvk1822_tiu_or_int8, + .tiu_or_int16 = cvk1822_tiu_or_int16, + .tiu_xor_int8 = cvk1822_tiu_xor_int8, + .tiu_xor_int16 = cvk1822_tiu_xor_int16, + .tiu_copy = cvk1822_tiu_copy, + .tiu_lookup_table = cvk1822_tiu_lookup_table, + .tiu_bf16_lookup_interp_table = cvk1822_tiu_bf16_lookup_interp_table, + .tiu_pt_convolution = cvk1822_tiu_pt_convolution, + .tiu_convolution = cvk1822_tiu_convolution, + .tiu_max_pooling = cvk1822_tiu_max_pooling, + .tiu_average_pooling = cvk1822_tiu_average_pooling, + .tiu_pt_depthwise_convolution = cvk1822_tiu_pt_depthwise_convolution, + .tiu_depthwise_convolution = cvk1822_tiu_depthwise_convolution, + .tiu_matrix_multiplication = cvk1822_tiu_matrix_multiplication, + .tiu_matrix_multiplication_qm = cvk1822_tiu_matrix_multiplication_qm, + .tiu_ge = cvk1822_tiu_ge, + .tiu_min_pooling = cvk1822_tiu_min_pooling, +}; + +static cvk_misc_operations_t cvikernel_1822_misc_ops = { + .float_to_bfloat16 = cvk1822_float_to_bfloat16, + .bf16_table_shape = cvk1822_bf16_table_shape, +}; + +char *cvikernel_get_chip_info_1822(void) +{ + return CVI_TPU_VERSION_182X; +} + +void cvikernel_init_1822( + cvk_reg_info_t *req_info, + cvk_context_t *ctx) +{ + ctx->info.version = BM1822_VER; + ctx->info.node_num = BM1822_HW_NODE_CHIP_NUM; + ctx->info.node_shift = BM1822_HW_NODE_CHIP_SHIFT; + ctx->info.npu_num = BM1822_HW_NPU_NUM; + ctx->info.npu_shift = BM1822_HW_NPU_SHIFT; + ctx->info.eu_num = BM1822_HW_EU_NUM; + ctx->info.eu_shift = BM1822_HW_EU_SHIFT; + ctx->info.lmem_size = BM1822_HW_LMEM_SIZE; + ctx->info.lmem_shift = BM1822_HW_LMEM_SHIFT; + ctx->info.lmem_banks = BM1822_HW_LMEM_BANKS; + ctx->info.lmem_bank_size = BM1822_HW_LMEM_BANK_SIZE; + ctx->info.gmem_start = BM1822_GLOBAL_MEM_START_ADDR; + ctx->info.features = CVK_HWF_FC_OP1_CONST | CVK_HWF_8B_ADD_SUB | + CVK_HWF_MIN_POOL | CVK_HWF_M_BRADCAST | + CVK_HWF_QM_LSHIFT | CVK_HWF_GE | CVK_HWF_CMD_PRE_EXE; + ctx->info.gmem_size = BM1822_GLOBAL_MEM_SIZE; + + ctx->ops = &cvikernel_1822_ops; + ctx->misc_ops = &cvikernel_1822_misc_ops; + + // kernel_init() in bmkernel.c + bmk1822_context_t *bmk_ctx = xmalloc(sizeof(bmk1822_context_t)); + bmk_ctx->info.chip_version = BM1822_VER; + bmk_ctx->info.cmdbuf_size = req_info->cmdbuf_size; + bmk_ctx->info.cmdbuf = req_info->cmdbuf; + + bmk_ctx->chip_info.version = BM1822_VER; + bmk_ctx->chip_info.node_num = BM1822_HW_NODE_CHIP_NUM; + bmk_ctx->chip_info.node_shift = BM1822_HW_NODE_CHIP_SHIFT; + bmk_ctx->chip_info.npu_num = BM1822_HW_NPU_NUM; + bmk_ctx->chip_info.npu_shift = BM1822_HW_NPU_SHIFT; + bmk_ctx->chip_info.eu_num = BM1822_HW_EU_NUM; + bmk_ctx->chip_info.eu_shift = BM1822_HW_EU_SHIFT; + bmk_ctx->chip_info.lmem_size = BM1822_HW_LMEM_SIZE; + bmk_ctx->chip_info.lmem_shift = BM1822_HW_LMEM_SHIFT; + bmk_ctx->chip_info.lmem_banks = BM1822_HW_LMEM_BANKS; + bmk_ctx->chip_info.lmem_bank_size = BM1822_HW_LMEM_BANK_SIZE; + bmk_ctx->chip_info.gmem_start = BM1822_GLOBAL_MEM_START_ADDR; + bmk_ctx->chip_info.gmem_size = BM1822_GLOBAL_MEM_SIZE; + + uint32_t max_nr_desc = 
bmk1822_estimate_nr_desc(bmk_ctx); + + bmk_ctx->cmdbuf_ptr = 0; + bmk_ctx->max_nr_desc = max_nr_desc; + bmk_ctx->cur_nr_desc = 0; + bmk_ctx->desc_pairs = xmalloc(max_nr_desc * sizeof(bmk_ctx->desc_pairs[0])); + bmk_ctx->lmem_ptr = 0; + + ec_init(&bmk_ctx->ec, BMK1822_ENGINE_NUM, max_nr_desc); + mode_manager_init(&bmk_ctx->mode_manager, &bmk_ctx->ec, BMK1822_ENGINE_NUM); + + cvk_prv_data_t *prv_data = malloc(sizeof(cvk_prv_data_t)); + prv_data->bmk_ctx = bmk_ctx; + prv_data->cmdbuf = req_info->cmdbuf; + prv_data->cmdbuf_size = req_info->cmdbuf_size; + + ctx->priv_data = prv_data; +} diff --git a/cvikernel/src/cv1880v2/cvikernel_1880v2.c b/cvikernel/src/cv1880v2/cvikernel_1880v2.c new file mode 100755 index 000000000..f06c58a3e --- /dev/null +++ b/cvikernel/src/cv1880v2/cvikernel_1880v2.c @@ -0,0 +1,2410 @@ +#include "kernel_internal.h" +#include "cvikernel/cvikernel.h" +#include "../bm1880v2/kernel_1880v2.h" +#include "../bm1880v2/non_atomic/gen_lut.h" +#include "bmkernel/bm1880v2/1880v2_fp_convert.h" + +#define SET_TG_SHAPE(_dst, _src) \ +do { \ + (_dst).n = (_src).n; \ + (_dst).c = (_src).c; \ + (_dst).h = (_src).h; \ + (_dst).w = (_src).w; \ +} while(0) + +#define SET_TG_STRIDE(_dst, _src) \ +do { \ + (_dst).n = (_src).n; \ + (_dst).c = (_src).c; \ + (_dst).h = (_src).h; \ +} while(0) + +#define SET_TL_SHAPE(_dst, _src) \ +do { \ + (_dst).n = (_src).n; \ + (_dst).c = (_src).c; \ + (_dst).h = (_src).h; \ + (_dst).w = (_src).w; \ +} while(0) + +#define SET_TL_STRIDE(_dst, _src) \ +do { \ + (_dst).n = (_src).n; \ + (_dst).c = (_src).c; \ + (_dst).h = (_src).h; \ + (_dst).w = (_src).w; \ +} while(0) + +#define SET_ML_SHAPE(_dst, _src) \ +do { \ + (_dst).n = (_src).n; \ + (_dst).c = (_src).c; \ + (_dst).w = (_src).w; \ + (_dst).col = (_src).col; \ +} while(0) + +#define SET_ML_STRIDE(_dst, _src) \ +do { \ + (_dst).n = (_src).n; \ + (_dst).c = (_src).c; \ + (_dst).h = (_src).h; \ +} while(0) + +typedef struct cvk_prv_data { + bmk1880v2_context_t *bmk_ctx; + uint32_t cmdbuf_size; + uint8_t *cmdbuf; +} cvk_prv_data_t; + +extern uint32_t bmk1880v2_estimate_nr_desc(bmk_context_t *k); + +static void convert_lmem_tensor(bmk1880v2_tensor_lmem_t *dst, + const cvk_tl_t *src) +{ + dst->start_address = src->start_address; + dst->fmt = src->fmt; + dst->cmprs_fmt = src->cmprs_fmt; + SET_TL_SHAPE(dst->shape, src->shape); + SET_TL_STRIDE(dst->stride, src->stride); + dst->int8_rnd_mode = src->int8_rnd_mode; +} + +static void convert_gmem_tensor(bmk1880v2_tensor_tgmem_t *dst, + const cvk_tg_t *src) +{ + dst->base_reg_index = src->base_reg_index; + dst->start_address = src->start_address; + dst->fmt = src->fmt; + SET_TG_SHAPE(dst->shape, src->shape); + SET_TG_STRIDE(dst->stride, src->stride); + dst->int8_rnd_mode = src->int8_rnd_mode; +} + +static void convert_gmem_compressed_tensor( + bmk1880v2_compressed_tensor_tgmem_t *dst, + const cvk_cmpr_tg_t *src) +{ + dst->t.base_reg_index = src->t.base_reg_index; + dst->t.start_address = src->t.start_address; + dst->t.fmt = src->t.fmt; + SET_TG_SHAPE(dst->t.shape, src->t.shape); + SET_TG_STRIDE(dst->t.stride, src->t.stride); + dst->t.int8_rnd_mode = src->t.int8_rnd_mode; + dst->reserved_size = src->reserved_size; + dst->bit_length = src->bit_length; + dst->bias0 = src->bias0; + dst->bias1 = src->bias1; + dst->zero_guard_en = src->zero_guard_en; +} + +static void convert_lmem_matrix( + bmk1880v2_matrix_lmem_t *dst, + const cvk_ml_t *src) +{ + dst->start_address = src->start_address; + dst->fmt = src->fmt; + SET_ML_SHAPE(dst->shape, src->shape); + 
SET_ML_STRIDE(dst->stride, src->stride); + dst->int8_rnd_mode = src->int8_rnd_mode; +} + +static void convert_gmem_matrix( + bmk1880v2_matrix_tgmem_t *dst, + const cvk_mg_t *src) +{ + dst->base_reg_index = src->base_reg_index; + dst->start_address = src->start_address; + dst->fmt = src->fmt; + dst->shape.row = src->shape.row; + dst->shape.col = src->shape.col; + dst->stride.row = src->stride.row; + dst->int8_rnd_mode = src->int8_rnd_mode; +} + +static void convert_gmem_compressed_matrix( + bmk1880v2_compressed_matrix_tgmem_t *dst, + const cvk_cmpr_mg_t *src) +{ + dst->m.base_reg_index = src->m.base_reg_index; + dst->m.start_address = src->m.start_address; + dst->m.fmt = src->m.fmt; + dst->m.shape.row = src->m.shape.row; + dst->m.shape.col = src->m.shape.col; + dst->m.stride.row = src->m.stride.row; + dst->m.int8_rnd_mode = src->m.int8_rnd_mode; + dst->bias0 = src->bias0; + dst->bias1 = src->bias1; + dst->zero_guard_en = src->zero_guard_en; +} + +void cvk1880v2_cleanup(struct cvikernel_context *ctx) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_cleanup(bmk_ctx); + + // command buffer is freed in cviruntime_cvikernel_destroy(). + //free(((cvk_prv_data_t *)ctx->priv_data)->cmdbuf); + free(ctx->priv_data); + ctx->priv_data = NULL; +} + +void cvk1880v2_reset(struct cvikernel_context *ctx) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_reset(bmk_ctx); +} + +uint8_t *cvk1880v2_acquire_cmdbuf( + struct cvikernel_context *ctx, + uint32_t *size) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + return bmk1880v2_acquire_cmdbuf(bmk_ctx, size); +} + +void cvk1880v2_set_layer_id( + struct cvikernel_context *ctx, + uint16_t layer_id) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_set_layer_id(bmk_ctx, layer_id); +} + +void cvk1880v2_parallel_enable(struct cvikernel_context *ctx) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_parallel_enable(bmk_ctx); +} + +void cvk1880v2_parallel_disable(struct cvikernel_context *ctx) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_parallel_disable(bmk_ctx); +} + +cvk_tl_t *cvk1880v2_lmem_alloc_tensor( + cvk_context_t *ctx, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tensor_lmem_shape_t bmk_shape; + SET_TL_SHAPE(bmk_shape, shape); + + fmt_t bmk_fmt = fmt; + + bmk1880v2_tensor_lmem_t *bmk_tl = + bmk1880v2_lmem_alloc_tensor(bmk_ctx, bmk_shape, bmk_fmt, eu_align); + + return (cvk_tl_t *)bmk_tl; +} + +cvk_ml_t *cvk1880v2_lmem_alloc_matrix( + cvk_context_t *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_matrix_lmem_shape_t bmk_shape; + SET_ML_SHAPE(bmk_shape, shape); + + bmk1880v2_matrix_lmem_t *bmk_ml = + bmk1880v2_lmem_alloc_matrix(bmk_ctx, bmk_shape, fmt, eu_align); + + return (cvk_ml_t *)bmk_ml; +} + +cvk_ml_t *cvk1880v2_lmem_alloc_ps32_matrix( + cvk_context_t *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_matrix_lmem_shape_t bmk_shape; + SET_ML_SHAPE(bmk_shape, shape); + + bmk1880v2_matrix_lmem_t *bmk_ml = + bmk1880v2_lmem_alloc_ps32_matrix(bmk_ctx, bmk_shape, 
fmt, eu_align); + + return (cvk_ml_t *)bmk_ml; +} + +void cvk1880v2_lmem_free_tensor( + struct cvikernel_context *ctx, + const cvk_tl_t *tl) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_lmem_free_tensor(bmk_ctx, (bmk1880v2_tensor_lmem_t *)tl); +} + +void cvk1880v2_lmem_free_matrix( + struct cvikernel_context *ctx, + const cvk_ml_t *ml) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_lmem_free_matrix(bmk_ctx, (bmk1880v2_matrix_lmem_t *)ml); +} + +void cvk1880v2_lmem_init_tensor( + struct cvikernel_context *ctx, + cvk_tl_t *tl, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + // memset(tl, 0, sizeof(*tl)); + // tl->shape = shape; + // tl->eu_align = eu_align; + // tl->stride = cvk1880v2_tl_default_stride(ctx, shape, fmt, eu_align); + bmk1880v2_tensor_lmem_shape_t bmk_shape; + SET_TL_SHAPE(bmk_shape, shape); + + fmt_t bmk_fmt = fmt; + + bmk1880v2_lmem_init_tensor(bmk_ctx, (bmk1880v2_tensor_lmem_t *)tl, bmk_shape, + bmk_fmt, eu_align); +} + +void cvk1880v2_lmem_init_matrix( + struct cvikernel_context *ctx, + cvk_ml_t *ml, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_matrix_lmem_shape_t bmk_shape; + SET_ML_SHAPE(bmk_shape, shape); + + bmk1880v2_lmem_init_matrix(bmk_ctx, (bmk1880v2_matrix_lmem_t *)ml, bmk_shape, + fmt, eu_align); +} + +cvk_tl_stride_t cvk1880v2_tl_default_stride( + struct cvikernel_context *ctx, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tensor_lmem_shape_t bmk_shape; + SET_TL_SHAPE(bmk_shape, shape); + + bmk1880v2_tensor_lmem_stride_t bmk_stride = + bmk1880v2_tensor_lmem_default_stride(bmk_ctx, bmk_shape, fmt, + eu_align); + + cvk_tl_stride_t stride; + SET_TL_STRIDE(stride, bmk_stride); + + return stride; +} + +cvk_tg_stride_t cvk1880v2_tg_default_stride( + struct cvikernel_context *ctx, + cvk_tg_shape_t shape, + cvk_fmt_t fmt) +{ + (void)ctx; + + bmk1880v2_tensor_tgmem_shape_t bmk_shape; + SET_TG_SHAPE(bmk_shape, shape); + + bmk1880v2_tensor_tgmem_stride_t bmk_stride = + bmk1880v2_tensor_tgmem_default_stride(bmk_shape, fmt); + + cvk_tg_stride_t stride; + SET_TG_STRIDE(stride, bmk_stride); + stride.w = (fmt == CVK_FMT_BF16) ? 
2 : 1; + + return stride; +} + +cvk_ml_shape_t cvk1880v2_ml_default_shape( + struct cvikernel_context *ctx, + uint32_t row, + uint32_t col, + cvk_fmt_t fmt_type) { + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_matrix_lmem_shape_t bm_ml_shape = + bmk1880v2_matrix_lmem_default_shape(bmk_ctx, row, col, fmt_type); + + cvk_ml_shape_t ml_shape; + SET_ML_SHAPE(ml_shape, bm_ml_shape); + + return ml_shape; +} + +cvk_ml_stride_t cvk1880v2_ml_default_stride( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align) { + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_matrix_lmem_shape_t bmk_shape; + SET_ML_SHAPE(bmk_shape, shape); + + bmk1880v2_matrix_lmem_stride_t bmk_stride = + bmk1880v2_matrix_lmem_default_stride(bmk_ctx, bmk_shape, fmt, eu_align); + + cvk_ml_stride_t stride; + SET_ML_STRIDE(stride, bmk_stride); + + return stride; +} + +cvk_ml_shape_t cvk1880v2_ml_shape_t1( + struct cvikernel_context *ctx, + uint32_t len, + cvk_fmt_t fmt) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_matrix_lmem_shape_t bmk_shape = + bmk1880v2_matrix_lmem_shape_t1(bmk_ctx, len, fmt); + + cvk_ml_shape_t shape; + SET_ML_SHAPE(shape, bmk_shape); + + return shape; +} + +uint32_t cvk1880v2_lmem_tensor_to_size( + struct cvikernel_context *ctx, + cvk_tl_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tensor_lmem_shape_t bmk_shape; + SET_TL_SHAPE(bmk_shape, shape); + + return bmk1880v2_lmem_tensor_to_size(bmk_ctx, bmk_shape, fmt, eu_align); +} + +uint32_t cvk1880v2_lmem_ps32_matrix_to_size( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_matrix_lmem_shape_t bmk_shape; + SET_ML_SHAPE(bmk_shape, shape); + + return bmk1880v2_lmem_ps32_matrix_to_size(bmk_ctx, bmk_shape, fmt, + eu_align); +} + +void cvk1880v2_gmem_init_tensor( + struct cvikernel_context *ctx, + cvk_tg_t *tg, + cvk_tg_shape_t shape, + cvk_fmt_t fmt) { + memset(tg, 0, sizeof(*tg)); + tg->fmt = fmt; + tg->shape = shape; + tg->stride = ctx->ops->tg_default_stride(ctx, tg->shape, tg->fmt); +} + +uint32_t cvk1880v2_lmem_matrix_to_size( + struct cvikernel_context *ctx, + cvk_ml_shape_t shape, + cvk_fmt_t fmt, + int eu_align) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_matrix_lmem_shape_t bmk_shape; + SET_ML_SHAPE(bmk_shape, shape); + + return bmk1880v2_lmem_matrix_to_size(bmk_ctx, bmk_shape, fmt, + eu_align); +} + +void cvk1880v2_tdma_l2l_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_l2l_tensor_copy_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_l2l_tensor_copy_param_t bmk_param = {0}; + bmk_param.mv_lut_idx = param->mv_lut_idx; + bmk_param.mv_lut_base = param->mv_lut_base; + bmk_param.outstanding = param->outstanding; + + bmk1880v2_tensor_lmem_t tl_src; + convert_lmem_tensor(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1880v2_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk1880v2_tdma_l2l_tensor_copy(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_l2l_bf16_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_l2l_tensor_copy_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = 
+ ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_l2l_tensor_copy_param_t bmk_param = {0}; + bmk_param.mv_lut_idx = param->mv_lut_idx; + bmk_param.mv_lut_base = param->mv_lut_base; + bmk_param.outstanding = param->outstanding; + + bmk1880v2_tensor_lmem_t tl_src; + convert_lmem_tensor(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1880v2_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk1880v2_tdma_l2l_bf16_tensor_copy(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_l2l_tensor_lrn_shift( + cvk_context_t *ctx, + const cvk_tdma_l2l_tensor_lrn_shift_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_l2l_tensor_lrn_shift_param_t bmk_param; + bmk_param.right_shift = param->right_shift; + bmk_param.lrn_step = param->lrn_step; + + bmk1880v2_tensor_lmem_t tl_src; + convert_lmem_tensor(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1880v2_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk1880v2_tdma_l2l_tensor_lrn_shift(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_l2g_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_l2tg_tensor_copy_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_src; + convert_lmem_tensor(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1880v2_tensor_tgmem_t tg_dst; + convert_gmem_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1880v2_tdma_l2g_tensor_copy(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_l2g_bf16_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_l2tg_tensor_copy_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_src; + convert_lmem_tensor(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1880v2_tensor_tgmem_t tg_dst; + convert_gmem_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1880v2_tdma_l2g_bf16_tensor_copy(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_l2g_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_nc_transposed_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_l2tg_tensor_copy_nc_transposed_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_src; + convert_lmem_tensor(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1880v2_tensor_tgmem_t tg_dst; + convert_gmem_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1880v2_tdma_l2g_tensor_copy_nc_transposed(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_l2g_bf16_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_nc_transposed_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_l2tg_tensor_copy_nc_transposed_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_src; + convert_lmem_tensor(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1880v2_tensor_tgmem_t tg_dst; + convert_gmem_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1880v2_tdma_l2g_bf16_tensor_copy_nc_transposed(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_l2g_tensor_copy_compressed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_compressed_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + 
((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_l2tg_tensor_copy_compressed_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_src; + convert_lmem_tensor(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1880v2_compressed_tensor_tgmem_t tg_dst; + convert_gmem_compressed_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1880v2_tdma_l2g_tensor_copy_compressed(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_l2g_tensor_fill_constant( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_fill_constant_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_l2tg_tensor_fill_constant_param_t bmk_param; + bmk_param.constant = param->constant; + + bmk1880v2_tensor_tgmem_t tg_dst; + convert_gmem_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1880v2_tdma_l2g_tensor_fill_constant(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_l2g_tensor_copy_cw_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_cw_transposed_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_l2tg_tensor_copy_cw_transposed_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_src; + convert_lmem_tensor(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1880v2_tensor_tgmem_t tg_dst; + convert_gmem_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1880v2_tdma_l2g_tensor_copy_cw_transposed(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_l2g_bf16_tensor_copy_cw_transposed( + cvk_context_t *ctx, + const cvk_tdma_l2g_tensor_copy_cw_transposed_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_l2tg_tensor_copy_cw_transposed_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_src; + convert_lmem_tensor(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1880v2_tensor_tgmem_t tg_dst; + convert_gmem_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1880v2_tdma_l2g_bf16_tensor_copy_cw_transposed(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_l2g_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_matrix_copy_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_l2tg_matrix_copy_param_t bmk_param; + bmk1880v2_matrix_lmem_t tl_src; + convert_lmem_matrix(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1880v2_matrix_tgmem_t tg_dst; + convert_gmem_matrix(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1880v2_tdma_l2g_matrix_copy(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_l2g_bf16_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_matrix_copy_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_l2tg_matrix_copy_param_t bmk_param; + bmk1880v2_matrix_lmem_t tl_src; + convert_lmem_matrix(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1880v2_matrix_tgmem_t tg_dst; + convert_gmem_matrix(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1880v2_tdma_l2g_bf16_matrix_copy(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_l2g_matrix_copy_compressed( + cvk_context_t *ctx, + const cvk_tdma_l2g_matrix_copy_compressed_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_l2tg_matrix_copy_compressed_param_t bmk_param; + bmk1880v2_matrix_lmem_t tl_src; + convert_lmem_matrix(&tl_src, param->src); + bmk_param.src = &tl_src; + + 
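+  /* The destination is a VLC-compressed matrix in global memory; the convert_gmem_compressed_matrix() call below also carries over the bias0/bias1 and zero_guard_en compression settings. */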
bmk1880v2_compressed_matrix_tgmem_t mg_dst; + convert_gmem_compressed_matrix(&mg_dst, param->dst); + bmk_param.dst = &mg_dst; + + bmk1880v2_tdma_l2g_matrix_copy_compressed(bmk_ctx, &bmk_param); +} + + +void cvk1880v2_tdma_l2g_general_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_general_copy_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_l2tg_general_copy_param_t bmk_param; + bmk_param.src_address = param->src_address; + bmk_param.dst_base_reg_index = param->dst_base_reg_index; + bmk_param.dst_address = param->dst_address; + bmk_param.bytes = param->bytes; + + bmk1880v2_tdma_l2g_general_copy(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_l2g_bf16_general_copy( + cvk_context_t *ctx, + const cvk_tdma_l2g_bf16_general_copy_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_l2tg_bf16_general_copy_param_t bmk_param; + bmk_param.src_address = param->src_address; + bmk_param.dst_base_reg_index = param->dst_base_reg_index; + bmk_param.dst_address = param->dst_address; + bmk_param.src_bytes = param->src_bytes; + + bmk1880v2_tdma_l2g_bf16_general_copy(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_g2l_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_tg2l_tensor_copy_param_t bmk_param; + bmk1880v2_tensor_tgmem_t tg_src; + convert_gmem_tensor(&tg_src, param->src); + bmk_param.src = &tg_src; + + bmk1880v2_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk1880v2_tdma_g2l_tensor_copy(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_g2l_bf16_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_tg2l_tensor_copy_param_t bmk_param; + bmk1880v2_tensor_tgmem_t tg_src; + convert_gmem_tensor(&tg_src, param->src); + bmk_param.src = &tg_src; + + bmk1880v2_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk1880v2_tdma_g2l_bf16_tensor_copy(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_g2l_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_nc_transposed_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_tg2l_tensor_copy_nc_transposed_param_t bmk_param; + bmk1880v2_tensor_tgmem_t tg_src; + convert_gmem_tensor(&tg_src, param->src); + bmk_param.src = &tg_src; + + bmk1880v2_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk1880v2_tdma_g2l_tensor_copy_nc_transposed(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_g2l_bf16_tensor_copy_nc_transposed( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_nc_transposed_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_tg2l_tensor_copy_nc_transposed_param_t bmk_param; + bmk1880v2_tensor_tgmem_t tg_src; + convert_gmem_tensor(&tg_src, param->src); + bmk_param.src = &tg_src; + + bmk1880v2_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk1880v2_tdma_g2l_bf16_tensor_copy_nc_transposed(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_g2l_tensor_copy_chw_rotated( + cvk_context_t *ctx, + const 
cvk_tdma_g2l_tensor_copy_chw_rotated_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_tg2l_tensor_copy_chw_rotated_param_t bmk_param; + bmk1880v2_tensor_tgmem_t tg_src; + convert_gmem_tensor(&tg_src, param->src); + bmk_param.src = &tg_src; + + bmk1880v2_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk1880v2_tdma_g2l_tensor_copy_chw_rotated(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_g2l_tensor_copy_decompressed( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_copy_decompressed_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_tg2l_tensor_copy_decompressed_param_t bmk_param; + bmk1880v2_compressed_tensor_tgmem_t tg_src; + convert_gmem_compressed_tensor(&tg_src, param->src); + bmk_param.src = &tg_src; + + bmk1880v2_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk1880v2_tdma_g2l_tensor_copy_decompressed(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_g2l_tensor_fill_constant( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_fill_constant_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_tg2l_tensor_fill_constant_param_t bmk_param; + bmk_param.constant = param->constant; + + bmk1880v2_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk1880v2_tdma_tg2l_tensor_fill_constant(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_g2l_bf16_tensor_fill_constant( + cvk_context_t *ctx, + const cvk_tdma_g2l_tensor_fill_constant_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_tg2l_tensor_fill_constant_param_t bmk_param; + bmk_param.constant = param->constant; + + bmk1880v2_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk1880v2_tdma_tg2l_bf16_tensor_fill_constant(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_g2l_matrix_copy_decompressed( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_decompressed_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_tg2l_matrix_copy_decompressed_param_t bmk_param; + bmk1880v2_compressed_matrix_tgmem_t mg_src; + convert_gmem_compressed_matrix(&mg_src, param->src); + bmk_param.src = &mg_src; + + bmk1880v2_matrix_lmem_t ml_dst; + convert_lmem_matrix(&ml_dst, param->dst); + bmk_param.dst = &ml_dst; + + bmk1880v2_tdma_g2l_matrix_copy_decompressed(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_g2l_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_tg2l_matrix_copy_param_t bmk_param; + bmk1880v2_matrix_tgmem_t mg_src; + convert_gmem_matrix(&mg_src, param->src); + bmk_param.src = &mg_src; + + bmk1880v2_matrix_lmem_t ml_dst; + convert_lmem_matrix(&ml_dst, param->dst); + bmk_param.dst = &ml_dst; + + bmk1880v2_tdma_g2l_matrix_copy(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_g2l_bf16_matrix_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_tg2l_matrix_copy_param_t bmk_param; + bmk1880v2_matrix_tgmem_t mg_src; + convert_gmem_matrix(&mg_src, param->src); + bmk_param.src = 
&mg_src; + + bmk1880v2_matrix_lmem_t ml_dst; + convert_lmem_matrix(&ml_dst, param->dst); + bmk_param.dst = &ml_dst; + + bmk1880v2_tdma_g2l_bf16_matrix_copy(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_g2l_matrix_copy_row_col_transposed( + cvk_context_t *ctx, + const cvk_tdma_g2l_matrix_copy_row_col_transposed_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_tg2l_matrix_copy_row_col_transposed_param_t bmk_param; + bmk1880v2_matrix_tgmem_t mg_src; + convert_gmem_matrix(&mg_src, param->src); + bmk_param.src = &mg_src; + + bmk1880v2_matrix_lmem_t ml_dst; + convert_lmem_matrix(&ml_dst, param->dst); + bmk_param.dst = &ml_dst; + + bmk1880v2_tdma_g2l_matrix_copy_row_col_transposed(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_g2l_general_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_general_copy_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_tg2l_general_copy_param_t bmk_param; + bmk_param.src_base_reg_index = param->src_base_reg_index; + bmk_param.src_address = param->src_address; + bmk_param.dst_address = param->dst_address; + bmk_param.bytes = param->bytes; + + bmk1880v2_tdma_g2l_general_copy(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_g2l_bf16_general_copy( + cvk_context_t *ctx, + const cvk_tdma_g2l_bf16_general_copy_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_tg2l_bf16_general_copy_param_t bmk_param; + bmk_param.src_base_reg_index = param->src_base_reg_index; + bmk_param.src_address = param->src_address; + bmk_param.dst_address = param->dst_address; + bmk_param.src_bytes = param->src_bytes; + bmk_param.src_fmt = param->src_fmt; + bmk_param.dst_fmt = param->dst_fmt; + + bmk1880v2_tdma_g2l_bf16_general_copy(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_g2g_tensor_copy( + cvk_context_t *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_tg2tg_tensor_copy_param_t bmk_param; + bmk1880v2_tensor_tgmem_t tg_src; + convert_gmem_tensor(&tg_src, param->src); + bmk_param.src = &tg_src; + + bmk1880v2_tensor_tgmem_t tg_dst; + convert_gmem_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1880v2_tdma_tg2tg_tensor_copy(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_g2g_general_copy( + cvk_context_t *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_tg2tg_tensor_copy_param_t bmk_param; + bmk1880v2_tensor_tgmem_t tg_src; + convert_gmem_tensor(&tg_src, param->src); + bmk_param.src = &tg_src; + + bmk1880v2_tensor_tgmem_t tg_dst; + convert_gmem_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1880v2_tdma_tg2tg_general_copy(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_g2g_bf16_general_copy( + cvk_context_t *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_tg2tg_tensor_copy_param_t bmk_param; + bmk1880v2_tensor_tgmem_t tg_src; + convert_gmem_tensor(&tg_src, param->src); + bmk_param.src = &tg_src; + + bmk1880v2_tensor_tgmem_t tg_dst; + convert_gmem_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1880v2_tdma_tg2tg_bf16_general_copy(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tdma_g2g_bf16_tensor_copy( + 
cvk_context_t *ctx, + const cvk_tdma_g2g_tensor_copy_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tdma_tg2tg_tensor_copy_param_t bmk_param; + bmk1880v2_tensor_tgmem_t tg_src; + convert_gmem_tensor(&tg_src, param->src); + bmk_param.src = &tg_src; + + bmk1880v2_tensor_tgmem_t tg_dst; + convert_gmem_tensor(&tg_dst, param->dst); + bmk_param.dst = &tg_dst; + + bmk1880v2_tdma_tg2tg_bf16_tensor_copy(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_mul( + cvk_context_t *ctx, + const cvk_tiu_mul_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_element_wise_mul_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_res_high; + if (param->res_high) { + convert_lmem_tensor(&tl_res_high, param->res_high); + bmk_param.res_high = &tl_res_high; + } else { + bmk_param.res_high = NULL; + } + + bmk1880v2_tensor_lmem_t tl_res_low; + convert_lmem_tensor(&tl_res_low, param->res_low); + bmk_param.res_low = &tl_res_low; + + bmk1880v2_tensor_lmem_t tl_a; + convert_lmem_tensor(&tl_a, param->a); + bmk_param.a = &tl_a; + + bmk_param.b_is_const = param->b_is_const; + + bmk1880v2_tensor_lmem_t tl_b; + if (!param->b_is_const) { + convert_lmem_tensor(&tl_b, param->b); + bmk_param.b = &tl_b; + } else { + bmk_param.b_const.val = param->b_const.val; + bmk_param.b_const.is_signed = param->b_const.is_signed; + } + + bmk_param.rshift_bits = param->rshift_bits; + bmk_param.relu_enable = param->relu_enable; + bmk_param.layer_id = param->layer_id; + + bmk1880v2_tiu_element_wise_mul(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_mul_qm( + cvk_context_t *ctx, + const cvk_tiu_mul_qm_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_element_wise_mul_qdm_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_res_high; + if (param->res_high) { + convert_lmem_tensor(&tl_res_high, param->res_high); + bmk_param.res_high = &tl_res_high; + } else { + bmk_param.res_high = NULL; + } + + bmk1880v2_tensor_lmem_t tl_res_low; + convert_lmem_tensor(&tl_res_low, param->res_low); + bmk_param.res_low = &tl_res_low; + + bmk1880v2_tensor_lmem_t tl_a; + convert_lmem_tensor(&tl_a, param->a); + bmk_param.a = &tl_a; + + bmk_param.b_is_const = param->b_is_const; + + bmk1880v2_tensor_lmem_t tl_b; + if (!param->b_is_const) { + convert_lmem_tensor(&tl_b, param->b); + bmk_param.b = &tl_b; + } else { + bmk_param.b_const.val = param->b_const.val; + bmk_param.b_const.is_signed = param->b_const.is_signed; + } + + bmk_param.rshift_bits = param->rshift_bits; + bmk_param.relu_enable = param->relu_enable; + bmk_param.multiplier = param->multiplier; + bmk_param.layer_id = param->layer_id; + + bmk1880v2_tiu_element_wise_mul_qdm(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_mac( + cvk_context_t *ctx, + const cvk_tiu_mac_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_element_wise_mac_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_res_high; + if (param->res_high) { + convert_lmem_tensor(&tl_res_high, param->res_high); + bmk_param.res_high = &tl_res_high; + } else { + bmk_param.res_high = NULL; + } + + bmk1880v2_tensor_lmem_t tl_res_low; + convert_lmem_tensor(&tl_res_low, param->res_low); + bmk_param.res_low = &tl_res_low; + + bmk1880v2_tensor_lmem_t tl_a; + convert_lmem_tensor(&tl_a, param->a); + bmk_param.a = &tl_a; + + bmk_param.b_is_const = param->b_is_const; + + bmk1880v2_tensor_lmem_t tl_b; + if 
(!param->b_is_const) { + convert_lmem_tensor(&tl_b, param->b); + bmk_param.b = &tl_b; + } else { + bmk_param.b_const.val = param->b_const.val; + bmk_param.b_const.is_signed = param->b_const.is_signed; + } + + bmk_param.res_is_int8 = param->res_is_int8; + bmk_param.relu_enable = param->relu_enable; + bmk_param.lshift_bits = param->lshift_bits; + bmk_param.rshift_bits = param->rshift_bits; + bmk_param.layer_id = param->layer_id; + + bmk1880v2_tiu_element_wise_mac(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_add( + cvk_context_t *ctx, + const cvk_tiu_add_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_element_wise_add_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_res_high; + if (param->res_high) { + convert_lmem_tensor(&tl_res_high, param->res_high); + bmk_param.res_high = &tl_res_high; + } else { + bmk_param.res_high = NULL; + } + + bmk1880v2_tensor_lmem_t tl_res_low; + convert_lmem_tensor(&tl_res_low, param->res_low); + bmk_param.res_low = &tl_res_low; + + bmk1880v2_tensor_lmem_t tl_a_high; + if (param->a_high) { + convert_lmem_tensor(&tl_a_high, param->a_high); + bmk_param.a_high = &tl_a_high; + } else { + bmk_param.a_high = NULL; + } + + bmk1880v2_tensor_lmem_t tl_a_low; + convert_lmem_tensor(&tl_a_low, param->a_low); + bmk_param.a_low = &tl_a_low; + + bmk_param.b_is_const = param->b_is_const; + + bmk1880v2_tensor_lmem_t tl_b_high; + bmk1880v2_tensor_lmem_t tl_b_low; + if (!param->b_is_const) { + if (param->b.high) { + convert_lmem_tensor(&tl_b_high, param->b.high); + bmk_param.b_high = &tl_b_high; + } else { + bmk_param.b_high = NULL; + } + + convert_lmem_tensor(&tl_b_low, param->b.low); + bmk_param.b_low = &tl_b_low; + } else { + bmk_param.b_const.val = param->b_const.val; + bmk_param.b_const.is_signed = param->b_const.is_signed; + } + + bmk_param.rshift_bits = param->rshift_bits; + bmk_param.relu_enable = param->relu_enable; + bmk_param.layer_id = param->layer_id; + + bmk1880v2_tiu_element_wise_add(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_sub( + cvk_context_t *ctx, + const cvk_tiu_sub_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_element_wise_sub_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_res_high; + if (param->res_high) { + convert_lmem_tensor(&tl_res_high, param->res_high); + bmk_param.res_high = &tl_res_high; + } else { + bmk_param.res_high = NULL; + } + + bmk1880v2_tensor_lmem_t tl_res_low; + convert_lmem_tensor(&tl_res_low, param->res_low); + bmk_param.res_low = &tl_res_low; + + bmk1880v2_tensor_lmem_t tl_a_high; + if (param->a_high) { + convert_lmem_tensor(&tl_a_high, param->a_high); + bmk_param.a_high = &tl_a_high; + } else { + bmk_param.a_high = NULL; + } + + bmk1880v2_tensor_lmem_t tl_a_low; + convert_lmem_tensor(&tl_a_low, param->a_low); + bmk_param.a_low = &tl_a_low; + + bmk1880v2_tensor_lmem_t tl_b_high; + if (param->b_high) { + convert_lmem_tensor(&tl_b_high, param->b_high); + bmk_param.b_high = &tl_b_high; + } else { + bmk_param.b_high = NULL; + } + + bmk1880v2_tensor_lmem_t tl_b_low; + convert_lmem_tensor(&tl_b_low, param->b_low); + bmk_param.b_low = &tl_b_low; + + bmk_param.rshift_bits = param->rshift_bits; + bmk_param.layer_id = param->layer_id; + + bmk1880v2_tiu_element_wise_sub(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_max( + cvk_context_t *ctx, + const cvk_tiu_max_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_element_wise_max_param_t 
bmk_param; + bmk1880v2_tensor_lmem_t tl_max; + convert_lmem_tensor(&tl_max, param->max); + bmk_param.max = &tl_max; + + bmk1880v2_tensor_lmem_t tl_a; + convert_lmem_tensor(&tl_a, param->a); + bmk_param.a = &tl_a; + + bmk_param.b_is_const = param->b_is_const; + bmk1880v2_tensor_lmem_t tl_b; + if (!param->b_is_const) { + convert_lmem_tensor(&tl_b, param->b); + bmk_param.b = &tl_b; + } else { + bmk_param.b_const.val = param->b_const.val; + bmk_param.b_const.is_signed = param->b_const.is_signed; + } + + bmk_param.layer_id = param->layer_id; + bmk1880v2_tiu_element_wise_max(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_min( + cvk_context_t *ctx, + const cvk_tiu_min_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_element_wise_min_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_min; + convert_lmem_tensor(&tl_min, param->min); + bmk_param.min = &tl_min; + + bmk1880v2_tensor_lmem_t tl_a; + convert_lmem_tensor(&tl_a, param->a); + bmk_param.a = &tl_a; + + bmk_param.b_is_const = param->b_is_const; + bmk1880v2_tensor_lmem_t tl_b; + if (!param->b_is_const) { + convert_lmem_tensor(&tl_b, param->b); + bmk_param.b = &tl_b; + } else { + bmk_param.b_const.val = param->b_const.val; + bmk_param.b_const.is_signed = param->b_const.is_signed; + } + + bmk_param.layer_id = param->layer_id; + + bmk1880v2_tiu_element_wise_min(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_and_int8( + cvk_context_t *ctx, + const cvk_tiu_and_int8_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_element_wise_and_int8_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_res; + convert_lmem_tensor(&tl_res, param->res); + bmk_param.res = &tl_res; + + bmk1880v2_tensor_lmem_t tl_a; + convert_lmem_tensor(&tl_a, param->a); + bmk_param.a = &tl_a; + + bmk1880v2_tensor_lmem_t tl_b; + convert_lmem_tensor(&tl_b, param->b); + bmk_param.b = &tl_b; + + bmk_param.layer_id = param->layer_id; + + bmk1880v2_tiu_element_wise_and_int8(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_arith_shift( + cvk_context_t *ctx, + const cvk_tiu_arith_shift_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_element_wise_arith_shift_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_res_high; + if (param->res_high) { + convert_lmem_tensor(&tl_res_high, param->res_high); + bmk_param.res_high = &tl_res_high; + } else { + bmk_param.res_high = NULL; + } + + bmk1880v2_tensor_lmem_t tl_res_low; + convert_lmem_tensor(&tl_res_low, param->res_low); + bmk_param.res_low = &tl_res_low; + + bmk1880v2_tensor_lmem_t tl_a_high; + if (param->a_high) { + convert_lmem_tensor(&tl_a_high, param->a_high); + bmk_param.a_high = &tl_a_high; + } else { + bmk_param.a_high = NULL; + } + + bmk1880v2_tensor_lmem_t tl_a_low; + convert_lmem_tensor(&tl_a_low, param->a_low); + bmk_param.a_low = &tl_a_low; + + bmk1880v2_tensor_lmem_t tl_bits; + convert_lmem_tensor(&tl_bits, param->bits); + bmk_param.bits = &tl_bits; + + bmk1880v2_tiu_element_wise_arith_shift(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_and_int16( + cvk_context_t *ctx, + const cvk_tiu_and_int16_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_element_wise_and_int16_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_res_high; + if (param->res_high) { + convert_lmem_tensor(&tl_res_high, param->res_high); + bmk_param.res_high = &tl_res_high; + } else { + bmk_param.res_high = NULL; + } + 
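+  /* int16 operands are split into high/low 8-bit tensor halves; a missing high half is forwarded as NULL */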
+ bmk1880v2_tensor_lmem_t tl_res_low; + convert_lmem_tensor(&tl_res_low, param->res_low); + bmk_param.res_low = &tl_res_low; + + bmk1880v2_tensor_lmem_t tl_a_high; + if (param->a_high) { + convert_lmem_tensor(&tl_a_high, param->a_high); + bmk_param.a_high = &tl_a_high; + } else { + bmk_param.a_high = NULL; + } + + bmk1880v2_tensor_lmem_t tl_a_low; + convert_lmem_tensor(&tl_a_low, param->a_low); + bmk_param.a_low = &tl_a_low; + + bmk1880v2_tensor_lmem_t tl_b_high; + if (param->b_high) { + convert_lmem_tensor(&tl_b_high, param->b_high); + bmk_param.b_high = &tl_b_high; + } else { + bmk_param.b_high = NULL; + } + + bmk1880v2_tensor_lmem_t tl_b_low; + convert_lmem_tensor(&tl_b_low, param->b_low); + bmk_param.b_low = &tl_b_low; + + bmk1880v2_tiu_element_wise_and_int16(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_or_int8( + cvk_context_t *ctx, + const cvk_tiu_or_int8_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_element_wise_or_int8_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_res; + convert_lmem_tensor(&tl_res, param->res); + bmk_param.res = &tl_res; + + bmk1880v2_tensor_lmem_t tl_a; + convert_lmem_tensor(&tl_a, param->a); + bmk_param.a = &tl_a; + + bmk1880v2_tensor_lmem_t tl_b; + convert_lmem_tensor(&tl_b, param->b); + bmk_param.b = &tl_b; + + bmk_param.layer_id = param->layer_id; + + bmk1880v2_tiu_element_wise_or_int8(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_or_int16( + cvk_context_t *ctx, + const cvk_tiu_or_int16_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_element_wise_or_int16_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_res_high; + if (param->res_high) { + convert_lmem_tensor(&tl_res_high, param->res_high); + bmk_param.res_high = &tl_res_high; + } else { + bmk_param.res_high = NULL; + } + + bmk1880v2_tensor_lmem_t tl_res_low; + convert_lmem_tensor(&tl_res_low, param->res_low); + bmk_param.res_low = &tl_res_low; + + bmk1880v2_tensor_lmem_t tl_a_high; + if (param->a_high) { + convert_lmem_tensor(&tl_a_high, param->a_high); + bmk_param.a_high = &tl_a_high; + } else { + bmk_param.a_high = NULL; + } + + bmk1880v2_tensor_lmem_t tl_a_low; + convert_lmem_tensor(&tl_a_low, param->a_low); + bmk_param.a_low = &tl_a_low; + + bmk1880v2_tensor_lmem_t tl_b_high; + if (param->b_high) { + convert_lmem_tensor(&tl_b_high, param->b_high); + bmk_param.b_high = &tl_b_high; + } else { + bmk_param.b_high = NULL; + } + + bmk1880v2_tensor_lmem_t tl_b_low; + convert_lmem_tensor(&tl_b_low, param->b_low); + bmk_param.b_low = &tl_b_low; + + bmk1880v2_tiu_element_wise_or_int16(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_xor_int8( + cvk_context_t *ctx, + const cvk_tiu_xor_int8_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_element_wise_xor_int8_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_res; + convert_lmem_tensor(&tl_res, param->res); + bmk_param.res = &tl_res; + + bmk1880v2_tensor_lmem_t tl_a; + convert_lmem_tensor(&tl_a, param->a); + bmk_param.a = &tl_a; + + bmk1880v2_tensor_lmem_t tl_b; + convert_lmem_tensor(&tl_b, param->b); + bmk_param.b = &tl_b; + + bmk_param.layer_id = param->layer_id; + + bmk1880v2_tiu_element_wise_xor_int8(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_xor_int16( + cvk_context_t *ctx, + const cvk_tiu_xor_int16_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_element_wise_xor_int16_param_t 
bmk_param; + bmk1880v2_tensor_lmem_t tl_res_high; + if (param->res_high) { + convert_lmem_tensor(&tl_res_high, param->res_high); + bmk_param.res_high = &tl_res_high; + } else { + bmk_param.res_high = NULL; + } + + bmk1880v2_tensor_lmem_t tl_res_low; + convert_lmem_tensor(&tl_res_low, param->res_low); + bmk_param.res_low = &tl_res_low; + + bmk1880v2_tensor_lmem_t tl_a_high; + if (param->a_high) { + convert_lmem_tensor(&tl_a_high, param->a_high); + bmk_param.a_high = &tl_a_high; + } else { + bmk_param.a_high = NULL; + } + + bmk1880v2_tensor_lmem_t tl_a_low; + convert_lmem_tensor(&tl_a_low, param->a_low); + bmk_param.a_low = &tl_a_low; + + bmk1880v2_tensor_lmem_t tl_b_high; + if (param->b_high) { + convert_lmem_tensor(&tl_b_high, param->b_high); + bmk_param.b_high = &tl_b_high; + } else { + bmk_param.b_high = NULL; + } + + bmk1880v2_tensor_lmem_t tl_b_low; + convert_lmem_tensor(&tl_b_low, param->b_low); + bmk_param.b_low = &tl_b_low; + + bmk1880v2_tiu_element_wise_xor_int16(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_copy( + cvk_context_t *ctx, + const cvk_tiu_copy_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_element_wise_copy_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_src; + convert_lmem_tensor(&tl_src, param->src); + bmk_param.src = &tl_src; + + bmk1880v2_tensor_lmem_t tl_dst; + convert_lmem_tensor(&tl_dst, param->dst); + bmk_param.dst = &tl_dst; + + bmk_param.layer_id = param->layer_id; + + bmk1880v2_tiu_element_wise_copy(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_lookup_table( + cvk_context_t *ctx, + const cvk_tiu_lookup_table_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_lookup_table_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_ofmap; + convert_lmem_tensor(&tl_ofmap, param->ofmap); + bmk_param.ofmap = &tl_ofmap; + + bmk1880v2_tensor_lmem_t tl_ifmap; + convert_lmem_tensor(&tl_ifmap, param->ifmap); + bmk_param.ifmap = &tl_ifmap; + + bmk1880v2_tensor_lmem_t tl_table; + convert_lmem_tensor(&tl_table, param->table); + bmk_param.table = &tl_table; + + bmk_param.layer_id = param->layer_id; + + bmk1880v2_tiu_lookup_table(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_bf16_lookup_interp_table( + cvk_context_t *ctx, + const cvk_tiu_bf16_lookup_interp_table_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tensor_lmem_t ifmap; + convert_lmem_tensor(&ifmap, param->ifmap); + + bmk1880v2_tensor_lmem_t buf; + convert_lmem_tensor(&buf, param->buf); + + bmk1880v2_tensor_lmem_t tbl_answer; + convert_lmem_tensor(&tbl_answer, param->tbl_answer); + + bmk1880v2_tensor_lmem_t tbl_answer_mantissa; + convert_lmem_tensor(&tbl_answer_mantissa, param->tbl_answer_mantissa); + + bmk1880v2_tensor_lmem_t ofmap; + convert_lmem_tensor(&ofmap, param->ofmap); + + if (param->is_scientific) { + bf16_lut_exp_mantissa(bmk_ctx, &ifmap, &buf, &tbl_answer, + &tbl_answer_mantissa, &ofmap); + } + else { + const cvk_tl_t *tl_ifmap = param->ifmap; + const cvk_tl_t *tl_ofmap_slope = param->buf; + const cvk_tl_t *tl_table_answer = param->tbl_answer; + const cvk_tl_t *tl_table_answer_slope = param->tbl_answer_mantissa; + const cvk_tl_t *tl_ofmap_y0 = param->ofmap; + float min = param->min; + float max = param->max; + float scale = 256 / (max - min); // 256 means hw support lut index size + uint8_t eu_align = param->eu_align; + cvk_fmt_t fmt = CVK_FMT_BF16; + + cvk_tl_shape_t tl_ofmap_x0_int8_shape = { + 1, 
tl_ifmap->shape.c, tl_ifmap->shape.h * tl_ifmap->shape.w, 1}; + + // filter y = max(range_min, x) + cvk_tiu_max_param_t p1 = {0}; + p1.max = tl_ifmap; + p1.a = tl_ifmap; + p1.b_is_const = 1; + p1.b_const.is_signed = 1; + p1.b_const.val = ctx->misc_ops->float_to_bfloat16(ctx, min); + p1.layer_id = param->layer_id; + ctx->ops->tiu_max(ctx, &p1); + + // filter y = min(8, x) + cvk_tiu_min_param_t p2 = {0}; + p2.min = tl_ifmap; + p2.a = tl_ifmap; + p2.b_is_const = 1; + p2.b_const.val = ctx->misc_ops->float_to_bfloat16(ctx, max - 1 / scale); // corner + p2.b_const.is_signed = 1; + p2.layer_id = param->layer_id; + ctx->ops->tiu_min(ctx, &p2); + + cvk_tdma_l2l_tensor_copy_param_t p3 = {0}; + // scale input for remap its idx(-x~x) to (-127~127), dirty tl_ifmap + cvk_tiu_mul_param_t p4 = {0}; + p4.res_high = NULL; + p4.res_low = tl_ifmap; + p4.a = tl_ifmap; + p4.b_is_const = 1; + p4.b_const.val = ctx->misc_ops->float_to_bfloat16(ctx, scale); + p4.rshift_bits = 0; + p4.relu_enable = 0; + p4.layer_id = param->layer_id; + ctx->ops->tiu_mul(ctx, &p4); + + // int8 + memset(&p3, 0x00, sizeof(cvk_tdma_l2l_tensor_copy_param_t)); + cvk_tl_t dst; + memcpy(&dst, tl_ofmap_y0, sizeof(cvk_tl_t)); + + dst.shape = tl_ofmap_x0_int8_shape; + dst.fmt = CVK_FMT_I8; + dst.stride = + ctx->ops->tl_default_stride(ctx, tl_ofmap_x0_int8_shape, CVK_FMT_I8, eu_align); + dst.stride.h = dst.stride.h * 2; + dst.int8_rnd_mode = 1; + p3.dst = &dst; + p3.src = tl_ifmap; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p3); + dst.int8_rnd_mode = 0; // reset + + // ops->tdma_l2l_bf16_tensor_copy(ctx, &p3); + + // ops->tiu_sub(ctx, &p5); + + // get f(x0) and slope(x) + // reshape, 16->16 + dst.fmt = fmt; + dst.shape = tl_ofmap_slope->shape; + dst.stride = tl_ofmap_slope->stride; + + // layer_id; + ctx->ops->tiu_lookup_table(ctx, &p6); + + // base f(x0) + memset(&p6, 0x0, sizeof(cvk_tiu_lookup_table_param_t)); + p6.ofmap = tl_ofmap_y0; + p6.ifmap = &dst; + p6.table = tl_table_answer; + p6.layer_id = param->layer_id; + ctx->ops->tiu_lookup_table(ctx, &p6); + + // layer_id; + ctx->ops->tiu_mac(ctx, &p7); + + } +} + +void cvk1880v2_tiu_pt_convolution( + cvk_context_t *ctx, + const cvk_tiu_pt_convolution_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_convolution_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_ofmap; + convert_lmem_tensor(&tl_ofmap, param->ofmap); + bmk_param.ofmap = &tl_ofmap; + + bmk1880v2_tensor_lmem_t tl_ifmap; + convert_lmem_tensor(&tl_ifmap, param->ifmap); + bmk_param.ifmap = &tl_ifmap; + + bmk1880v2_tensor_lmem_t tl_weight; + convert_lmem_tensor(&tl_weight, param->weight); + bmk_param.weight = &tl_weight; + + bmk1880v2_tensor_lmem_t tl_bias; + if (param->bias) { + convert_lmem_tensor(&tl_bias, param->bias); + bmk_param.bias = &tl_bias; + } else { + bmk_param.bias = NULL; + } + + bmk_param.ins_h = param->ins_h; + bmk_param.ins_last_h = param->ins_last_h; + bmk_param.ins_w = param->ins_w; + bmk_param.ins_last_w = param->ins_last_w; + bmk_param.pad_top = param->pad_top; + bmk_param.pad_bottom = param->pad_bottom; + bmk_param.pad_left = param->pad_left; + bmk_param.pad_right = param->pad_right; + bmk_param.stride_h = param->stride_h; + bmk_param.stride_w = param->stride_w; + bmk_param.dilation_h = param->dilation_h; + bmk_param.dilation_w = param->dilation_w; + bmk_param.relu_enable = param->relu_enable; + bmk_param.rshift_bits = param->rshift_bits; + bmk_param.ps32_mode = param->ps32_mode; + bmk_param.w_is_const = param->w_is_const; + bmk_param.layer_id = 
param->layer_id; + bmk_param.fp_round_typ = param->fp_round_typ; + bmk_param.ins_val = param->ins_val; + bmk_param.ins_fp = param->ins_fp; + + bmk1880v2_tiu_convolution(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_convolution( + cvk_context_t *ctx, + const cvk_tiu_convolution_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_convolution_qdm_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_ofmap; + convert_lmem_tensor(&tl_ofmap, param->ofmap); + bmk_param.ofmap = &tl_ofmap; + + bmk1880v2_tensor_lmem_t tl_ifmap; + convert_lmem_tensor(&tl_ifmap, param->ifmap); + bmk_param.ifmap = &tl_ifmap; + + bmk1880v2_tensor_lmem_t tl_weight; + convert_lmem_tensor(&tl_weight, param->weight); + bmk_param.weight = &tl_weight; + + bmk1880v2_tensor_lmem_t tl_chl_quan_param; + if (param->chl_quan_param) { + convert_lmem_tensor(&tl_chl_quan_param, param->chl_quan_param); + bmk_param.chl_quan_param = &tl_chl_quan_param; + } else { + bmk_param.chl_quan_param = NULL; + } + + bmk_param.ins_h = param->ins_h; + bmk_param.ins_last_h = param->ins_last_h; + bmk_param.ins_w = param->ins_w; + bmk_param.ins_last_w = param->ins_last_w; + bmk_param.pad_top = param->pad_top; + bmk_param.pad_bottom = param->pad_bottom; + bmk_param.pad_left = param->pad_left; + bmk_param.pad_right = param->pad_right; + bmk_param.stride_h = param->stride_h; + bmk_param.stride_w = param->stride_w; + bmk_param.dilation_h = param->dilation_h; + bmk_param.dilation_w = param->dilation_w; + bmk_param.has_bias = param->has_bias; + bmk_param.relu_enable = param->relu_enable; + bmk_param.ps32_mode = param->ps32_mode; + bmk_param.w_is_const = param->w_is_const; + bmk_param.layer_id = param->layer_id; + bmk_param.ins_val = param->ins_val; + bmk_param.ins_fp = param->ins_fp; + + bmk1880v2_tiu_convolution_qdm(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_max_pooling( + cvk_context_t *ctx, + const cvk_tiu_max_pooling_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_max_pooling_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_ofmap; + convert_lmem_tensor(&tl_ofmap, param->ofmap); + bmk_param.ofmap = &tl_ofmap; + + bmk1880v2_tensor_lmem_t tl_ifmap; + convert_lmem_tensor(&tl_ifmap, param->ifmap); + bmk_param.ifmap = &tl_ifmap; + + bmk_param.kh = param->kh; + bmk_param.kw = param->kw; + bmk_param.pad_top = param->pad_top; + bmk_param.pad_bottom = param->pad_bottom; + bmk_param.pad_left = param->pad_left; + bmk_param.pad_right = param->pad_right; + bmk_param.stride_h = param->stride_h; + bmk_param.stride_w = param->stride_w; + bmk_param.ins_val = param->ins_val; + bmk_param.ins_fp = param->ins_fp; + bmk_param.layer_id = param->layer_id; + + bmk1880v2_tiu_max_pooling(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_average_pooling( + cvk_context_t *ctx, + const cvk_tiu_average_pooling_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_average_pooling_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_ofmap; + convert_lmem_tensor(&tl_ofmap, param->ofmap); + bmk_param.ofmap = &tl_ofmap; + + bmk1880v2_tensor_lmem_t tl_ifmap; + convert_lmem_tensor(&tl_ifmap, param->ifmap); + bmk_param.ifmap = &tl_ifmap; + + bmk_param.kh = param->kh; + bmk_param.kw = param->kw; + bmk_param.ins_h = param->ins_h; + bmk_param.ins_last_h = param->ins_last_h; + bmk_param.ins_w = param->ins_w; + bmk_param.ins_last_w = param->ins_last_w; + bmk_param.pad_top = param->pad_top; + bmk_param.pad_bottom = 
param->pad_bottom; + bmk_param.pad_left = param->pad_left; + bmk_param.pad_right = param->pad_right; + bmk_param.stride_h = param->stride_h; + bmk_param.stride_w = param->stride_w; + bmk_param.avg_pooling_const = param->avg_pooling_const; + bmk_param.rshift_bits = param->rshift_bits; + bmk_param.layer_id = param->layer_id; + bmk_param.ins_val = param->ins_val; + bmk_param.ins_fp = param->ins_fp; + + bmk1880v2_tiu_average_pooling(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_pt_depthwise_convolution( + cvk_context_t *ctx, + const cvk_tiu_depthwise_pt_convolution_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_depthwise_convolution_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_ofmap; + convert_lmem_tensor(&tl_ofmap, param->ofmap); + bmk_param.ofmap = &tl_ofmap; + + bmk1880v2_tensor_lmem_t tl_ifmap; + convert_lmem_tensor(&tl_ifmap, param->ifmap); + bmk_param.ifmap = &tl_ifmap; + + bmk1880v2_tensor_lmem_t tl_weight; + convert_lmem_tensor(&tl_weight, param->weight); + bmk_param.weight = &tl_weight; + + bmk1880v2_tensor_lmem_t tl_bias; + if (param->bias) { + convert_lmem_tensor(&tl_bias, param->bias); + bmk_param.bias = &tl_bias; + } else { + bmk_param.bias = NULL; + } + + bmk_param.ins_h = param->ins_h; + bmk_param.ins_last_h = param->ins_last_h; + bmk_param.ins_w = param->ins_w; + bmk_param.ins_last_w = param->ins_last_w; + bmk_param.pad_top = param->pad_top; + bmk_param.pad_bottom = param->pad_bottom; + bmk_param.pad_left = param->pad_left; + bmk_param.pad_right = param->pad_right; + bmk_param.stride_h = param->stride_h; + bmk_param.stride_w = param->stride_w; + bmk_param.dilation_h = param->dilation_h; + bmk_param.dilation_w = param->dilation_w; + bmk_param.relu_enable = param->relu_enable; + bmk_param.rshift_bits = param->rshift_bits; + bmk_param.ps32_mode = param->ps32_mode; + bmk_param.layer_id = param->layer_id; + bmk_param.ins_val = param->ins_val; + bmk_param.ins_fp = param->ins_fp; + bmk_param.weight_is_const = param->weight_is_const; + bmk_param.weight_const.is_signed = param->weight_const.is_signed; + bmk_param.weight_const.val = param->weight_const.val; + + bmk1880v2_tiu_depthwise_convolution(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_depthwise_convolution( + cvk_context_t *ctx, + const cvk_tiu_depthwise_convolution_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_depthwise_convolution_qdm_param_t bmk_param; + bmk1880v2_tensor_lmem_t tl_ofmap; + convert_lmem_tensor(&tl_ofmap, param->ofmap); + bmk_param.ofmap = &tl_ofmap; + + bmk1880v2_tensor_lmem_t tl_ifmap; + convert_lmem_tensor(&tl_ifmap, param->ifmap); + bmk_param.ifmap = &tl_ifmap; + + bmk1880v2_tensor_lmem_t tl_weight; + convert_lmem_tensor(&tl_weight, param->weight); + bmk_param.weight = &tl_weight; + + bmk1880v2_tensor_lmem_t tl_chl_quan_param; + convert_lmem_tensor(&tl_chl_quan_param, param->chl_quan_param); + bmk_param.chl_quan_param = &tl_chl_quan_param; + + bmk_param.ins_h = param->ins_h; + bmk_param.ins_last_h = param->ins_last_h; + bmk_param.ins_w = param->ins_w; + bmk_param.ins_last_w = param->ins_last_w; + bmk_param.pad_top = param->pad_top; + bmk_param.pad_bottom = param->pad_bottom; + bmk_param.pad_left = param->pad_left; + bmk_param.pad_right = param->pad_right; + bmk_param.stride_h = param->stride_h; + bmk_param.stride_w = param->stride_w; + bmk_param.dilation_h = param->dilation_h; + bmk_param.dilation_w = param->dilation_w; + bmk_param.has_bias = param->has_bias; + 
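  /*
   * NOTE (assumption, not stated in this patch): for the qdm ("quantize
   * down multiplier") variants the per-channel multiplier and right-shift,
   * and the bias when has_bias is set, are presumably packed into the
   * chl_quan_param local-memory tensor, which is why this wrapper forwards
   * no separate rshift_bits or bias field.
   */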
bmk_param.relu_enable = param->relu_enable; + bmk_param.layer_id = param->layer_id; + bmk_param.ins_val = param->ins_val; + bmk_param.ins_fp = param->ins_fp; + bmk_param.weight_is_const = param->weight_is_const; + bmk_param.weight_const.is_signed = param->weight_const.is_signed; + bmk_param.weight_const.val = param->weight_const.val; + + bmk1880v2_tiu_depthwise_convolution_qdm(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_matrix_multiplication( + cvk_context_t *ctx, + const cvk_tiu_matrix_multiplication_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_matrix_multiplication_param_t bmk_param; + bmk1880v2_matrix_lmem_t ml_res; + convert_lmem_matrix(&ml_res, param->res); + bmk_param.res = &ml_res; + + bmk1880v2_matrix_lmem_t ml_left; + convert_lmem_matrix(&ml_left, param->left); + bmk_param.left = &ml_left; + + bmk1880v2_matrix_lmem_t ml_right; + convert_lmem_matrix(&ml_right, param->right); + bmk_param.right = &ml_right; + + bmk1880v2_matrix_lmem_t ml_bias; + if (param->bias) { + convert_lmem_matrix(&ml_bias, param->bias); + bmk_param.bias = &ml_bias; + } else { + bmk_param.bias = NULL; + } + + bmk_param.lshift_bits = param->lshift_bits; + bmk_param.rshift_bits = param->rshift_bits; + bmk_param.res_is_int8 = param->res_is_int8; + bmk_param.relu_enable = param->relu_enable; + bmk_param.add_result = param->add_result; + bmk_param.ps32_mode = param->ps32_mode; + bmk_param.layer_id = param->layer_id; + + bmk1880v2_tiu_matrix_multiplication(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_matrix_multiplication_qm( + cvk_context_t *ctx, + const cvk_tiu_matrix_multiplication_qm_param_t *param) +{ + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tiu_matrix_multiplication_qdm_param_t bmk_param; + bmk1880v2_matrix_lmem_t ml_res; + convert_lmem_matrix(&ml_res, param->res); + bmk_param.res = &ml_res; + + bmk1880v2_matrix_lmem_t ml_left; + convert_lmem_matrix(&ml_left, param->left); + bmk_param.left = &ml_left; + + bmk1880v2_matrix_lmem_t ml_right; + convert_lmem_matrix(&ml_right, param->right); + bmk_param.right = &ml_right; + + bmk1880v2_matrix_lmem_t ml_bias; + if (param->bias) { + convert_lmem_matrix(&ml_bias, param->bias); + bmk_param.bias = &ml_bias; + } else { + bmk_param.bias = NULL; + } + + bmk_param.lshift_bits = param->lshift_bits; + bmk_param.rshift_bits = param->rshift_bits; + bmk_param.res_is_int8 = param->res_is_int8; + bmk_param.relu_enable = param->relu_enable; + bmk_param.add_result = param->add_result; + bmk_param.ps32_mode = param->ps32_mode; + bmk_param.quan_m = param->quan_m; + bmk_param.layer_id = param->layer_id; + bmk1880v2_tiu_matrix_multiplication_qdm(bmk_ctx, &bmk_param); +} + +void cvk1880v2_tiu_ge( + cvk_context_t *ctx, + const cvk_tiu_ge_param_t *param) +{ + // H/W does not support + if (!ctx || !param) + return; +} + +void cvk1880v2_tiu_min_pooling( + struct cvikernel_context *ctx, + const cvk_tiu_min_pooling_param_t *param) +{ + // H/W does not support + if (!ctx || !param) + return; +} + +uint16_t cvk1880v2_float_to_bfloat16( + cvk_context_t *ctx, + float data) +{ + (void)ctx; + + return convert_fp32_bf16(data); +} + +void cvk1880v2_bf16_table_shape( + cvk_context_t *ctx, + cvk_tl_shape_t *shape) +{ + if (!shape) + return; + + bmk1880v2_context_t *bmk_ctx = + ((cvk_prv_data_t *)ctx->priv_data)->bmk_ctx; + + bmk1880v2_tensor_lmem_shape_t bmk_shape; + bf16_table_shape(bmk_ctx, &bmk_shape); + + shape->n = bmk_shape.n; + shape->c = bmk_shape.c; + shape->h = 
bmk_shape.h; + shape->w = bmk_shape.w; +} + +static cvk_operations_t cvikernel_1880v2_ops = { + .cleanup = cvk1880v2_cleanup, + .reset = cvk1880v2_reset, + .acquire_cmdbuf = cvk1880v2_acquire_cmdbuf, + .dmabuf_size = bmk1880v2_dmabuf_size, + .dmabuf_convert = bmk1880v2_dmabuf_convert, + .set_layer_id = cvk1880v2_set_layer_id, + .parallel_enable = cvk1880v2_parallel_enable, + .parallel_disable = cvk1880v2_parallel_disable, + .lmem_alloc_tensor = cvk1880v2_lmem_alloc_tensor, + .lmem_alloc_matrix = cvk1880v2_lmem_alloc_matrix, + .lmem_alloc_ps32_matrix = cvk1880v2_lmem_alloc_ps32_matrix, + .lmem_free_tensor = cvk1880v2_lmem_free_tensor, + .lmem_free_matrix = cvk1880v2_lmem_free_matrix, + .lmem_init_tensor = cvk1880v2_lmem_init_tensor, + .lmem_init_matrix = cvk1880v2_lmem_init_matrix, + .tl_default_stride = cvk1880v2_tl_default_stride, + .tg_default_stride = cvk1880v2_tg_default_stride, + .ml_default_shape = cvk1880v2_ml_default_shape, + .ml_default_stride = cvk1880v2_ml_default_stride, + .ml_shape_t1 = cvk1880v2_ml_shape_t1, + .lmem_tensor_to_size = cvk1880v2_lmem_tensor_to_size, + .lmem_matrix_to_size = cvk1880v2_lmem_matrix_to_size, + .lmem_ps32_matrix_to_size = cvk1880v2_lmem_ps32_matrix_to_size, + .gmem_init_tensor = cvk1880v2_gmem_init_tensor, + .tdma_l2l_tensor_copy = cvk1880v2_tdma_l2l_tensor_copy, + .tdma_l2l_bf16_tensor_copy = cvk1880v2_tdma_l2l_bf16_tensor_copy, + .tdma_l2l_tensor_lrn_shift = cvk1880v2_tdma_l2l_tensor_lrn_shift, + .tdma_l2g_tensor_copy = cvk1880v2_tdma_l2g_tensor_copy, + .tdma_l2g_bf16_tensor_copy = cvk1880v2_tdma_l2g_bf16_tensor_copy, + .tdma_l2g_tensor_copy_nc_transposed = cvk1880v2_tdma_l2g_tensor_copy_nc_transposed, + .tdma_l2g_bf16_tensor_copy_nc_transposed = cvk1880v2_tdma_l2g_bf16_tensor_copy_nc_transposed, + .tdma_l2g_tensor_copy_compressed = cvk1880v2_tdma_l2g_tensor_copy_compressed, + .tdma_l2g_tensor_fill_constant = cvk1880v2_tdma_l2g_tensor_fill_constant, + .tdma_l2g_tensor_copy_cw_transposed = cvk1880v2_tdma_l2g_tensor_copy_cw_transposed, + .tdma_l2g_bf16_tensor_copy_cw_transposed = cvk1880v2_tdma_l2g_bf16_tensor_copy_cw_transposed, + .tdma_l2g_matrix_copy = cvk1880v2_tdma_l2g_matrix_copy, + .tdma_l2g_bf16_matrix_copy = cvk1880v2_tdma_l2g_bf16_matrix_copy, + .tdma_l2g_matrix_copy_compressed = cvk1880v2_tdma_l2g_matrix_copy_compressed, + .tdma_l2g_general_copy = cvk1880v2_tdma_l2g_general_copy, + .tdma_l2g_bf16_general_copy = cvk1880v2_tdma_l2g_bf16_general_copy, + .tdma_g2l_tensor_copy = cvk1880v2_tdma_g2l_tensor_copy, + .tdma_g2l_bf16_tensor_copy = cvk1880v2_tdma_g2l_bf16_tensor_copy, + .tdma_g2l_tensor_copy_nc_transposed = cvk1880v2_tdma_g2l_tensor_copy_nc_transposed, + .tdma_g2l_bf16_tensor_copy_nc_transposed = cvk1880v2_tdma_g2l_bf16_tensor_copy_nc_transposed, + .tdma_g2l_tensor_copy_chw_rotated = cvk1880v2_tdma_g2l_tensor_copy_chw_rotated, + .tdma_g2l_tensor_copy_decompressed = cvk1880v2_tdma_g2l_tensor_copy_decompressed, + .tdma_g2l_tensor_fill_constant = cvk1880v2_tdma_g2l_tensor_fill_constant, + .tdma_g2l_bf16_tensor_fill_constant = cvk1880v2_tdma_g2l_bf16_tensor_fill_constant, + .tdma_g2l_matrix_copy_decompressed = cvk1880v2_tdma_g2l_matrix_copy_decompressed, + .tdma_g2l_matrix_copy = cvk1880v2_tdma_g2l_matrix_copy, + .tdma_g2l_bf16_matrix_copy = cvk1880v2_tdma_g2l_bf16_matrix_copy, + .tdma_g2l_matrix_copy_row_col_transposed = cvk1880v2_tdma_g2l_matrix_copy_row_col_transposed, + .tdma_g2l_general_copy = cvk1880v2_tdma_g2l_general_copy, + .tdma_g2l_bf16_general_copy = cvk1880v2_tdma_g2l_bf16_general_copy, + .tdma_g2g_tensor_copy = 
cvk1880v2_tdma_g2g_tensor_copy, + .tdma_g2g_general_copy = cvk1880v2_tdma_g2g_general_copy, + .tdma_g2g_bf16_general_copy = cvk1880v2_tdma_g2g_bf16_general_copy, + .tdma_g2g_bf16_tensor_copy = cvk1880v2_tdma_g2g_bf16_tensor_copy, + .tiu_mul = cvk1880v2_tiu_mul, + .tiu_mul_qm = cvk1880v2_tiu_mul_qm, + .tiu_mac = cvk1880v2_tiu_mac, + .tiu_add = cvk1880v2_tiu_add, + .tiu_sub = cvk1880v2_tiu_sub, + .tiu_max = cvk1880v2_tiu_max, + .tiu_min = cvk1880v2_tiu_min, + .tiu_and_int8 = cvk1880v2_tiu_and_int8, + .tiu_arith_shift = cvk1880v2_tiu_arith_shift, + .tiu_and_int16 = cvk1880v2_tiu_and_int16, + .tiu_or_int8 = cvk1880v2_tiu_or_int8, + .tiu_or_int16 = cvk1880v2_tiu_or_int16, + .tiu_xor_int8 = cvk1880v2_tiu_xor_int8, + .tiu_xor_int16 = cvk1880v2_tiu_xor_int16, + .tiu_copy = cvk1880v2_tiu_copy, + .tiu_lookup_table = cvk1880v2_tiu_lookup_table, + .tiu_bf16_lookup_interp_table = cvk1880v2_tiu_bf16_lookup_interp_table, + .tiu_pt_convolution = cvk1880v2_tiu_pt_convolution, + .tiu_convolution = cvk1880v2_tiu_convolution, + .tiu_max_pooling = cvk1880v2_tiu_max_pooling, + .tiu_average_pooling = cvk1880v2_tiu_average_pooling, + .tiu_pt_depthwise_convolution = cvk1880v2_tiu_pt_depthwise_convolution, + .tiu_depthwise_convolution = cvk1880v2_tiu_depthwise_convolution, + .tiu_matrix_multiplication = cvk1880v2_tiu_matrix_multiplication, + .tiu_matrix_multiplication_qm = cvk1880v2_tiu_matrix_multiplication_qm, + .tiu_ge = cvk1880v2_tiu_ge, + .tiu_min_pooling = cvk1880v2_tiu_min_pooling, +}; + +static cvk_misc_operations_t cvikernel_1880v2_misc_ops = { + .float_to_bfloat16 = cvk1880v2_float_to_bfloat16, + .bf16_table_shape = cvk1880v2_bf16_table_shape, +}; + +char *cvikernel_get_chip_info_1880v2(void) +{ + return CVI_TPU_VERSION_183X; +} + +void cvikernel_init_1880v2( + cvk_reg_info_t *req_info, + cvk_context_t *ctx) +{ + ctx->info.version = BM1880V2_VER; + ctx->info.node_num = BM1880V2_HW_NODE_CHIP_NUM; + ctx->info.node_shift = BM1880V2_HW_NODE_CHIP_SHIFT; + ctx->info.npu_num = BM1880V2_HW_NPU_NUM; + ctx->info.npu_shift = BM1880V2_HW_NPU_SHIFT; + ctx->info.eu_num = BM1880V2_HW_EU_NUM; + ctx->info.eu_shift = BM1880V2_HW_EU_SHIFT; + ctx->info.lmem_size = BM1880V2_HW_LMEM_SIZE; + ctx->info.lmem_shift = BM1880V2_HW_LMEM_SHIFT; + ctx->info.lmem_banks = BM1880V2_HW_LMEM_BANKS; + ctx->info.lmem_bank_size = BM1880V2_HW_LMEM_BANK_SIZE; + ctx->info.gmem_start = BM1880V2_GLOBAL_MEM_START_ADDR; + ctx->info.gmem_size = BM1880V2_GLOBAL_MEM_SIZE; + ctx->info.features = 0; + + ctx->ops = &cvikernel_1880v2_ops; + ctx->misc_ops = &cvikernel_1880v2_misc_ops; + + // kernel_init() in bmkernel.c + bmk1880v2_context_t *bmk_ctx = xmalloc(sizeof(bmk1880v2_context_t)); + bmk_ctx->info.chip_version = BM1880V2_VER; + bmk_ctx->info.cmdbuf_size = req_info->cmdbuf_size; + bmk_ctx->info.cmdbuf = req_info->cmdbuf; + + bmk_ctx->chip_info.version = BM1880V2_VER; + bmk_ctx->chip_info.node_num = BM1880V2_HW_NODE_CHIP_NUM; + bmk_ctx->chip_info.node_shift = BM1880V2_HW_NODE_CHIP_SHIFT; + bmk_ctx->chip_info.npu_num = BM1880V2_HW_NPU_NUM; + bmk_ctx->chip_info.npu_shift = BM1880V2_HW_NPU_SHIFT; + bmk_ctx->chip_info.eu_num = BM1880V2_HW_EU_NUM; + bmk_ctx->chip_info.eu_shift = BM1880V2_HW_EU_SHIFT; + bmk_ctx->chip_info.lmem_size = BM1880V2_HW_LMEM_SIZE; + bmk_ctx->chip_info.lmem_shift = BM1880V2_HW_LMEM_SHIFT; + bmk_ctx->chip_info.lmem_banks = BM1880V2_HW_LMEM_BANKS; + bmk_ctx->chip_info.lmem_bank_size = BM1880V2_HW_LMEM_BANK_SIZE; + bmk_ctx->chip_info.gmem_start = BM1880V2_GLOBAL_MEM_START_ADDR; + bmk_ctx->chip_info.gmem_size = BM1880V2_GLOBAL_MEM_SIZE; + 
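  /*
   * Illustrative caller flow (a sketch only: the buffer size is a
   * placeholder, chip_ver_str is treated as a fixed-size array and the
   * cleanup signature is assumed; registration itself lives in
   * cvikernel.c below):
   *
   *   cvk_reg_info_t req = {0};
   *   strncpy(req.chip_ver_str, CVI_TPU_VERSION_183X,
   *           sizeof(req.chip_ver_str) - 1);
   *   req.cmdbuf_size = 0x100000;
   *   req.cmdbuf = malloc(req.cmdbuf_size);
   *   cvk_context_t *cvk = cvikernel_register(&req);
   *   // ... emit TIU/TDMA commands through cvk->ops ...
   *   cvk->ops->cleanup(cvk);
   *
   * The code below sizes the descriptor-pair array from
   * bmk1880v2_estimate_nr_desc() and brings up the engine conductor and
   * mode manager that order those descriptors.
   */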
+ uint32_t max_nr_desc = bmk1880v2_estimate_nr_desc(bmk_ctx); + + bmk_ctx->cmdbuf_ptr = 0; + bmk_ctx->max_nr_desc = max_nr_desc; + bmk_ctx->cur_nr_desc = 0; + bmk_ctx->desc_pairs = xmalloc(max_nr_desc * sizeof(bmk_ctx->desc_pairs[0])); + bmk_ctx->lmem_ptr = 0; + + ec_init(&bmk_ctx->ec, BMK1880v2_ENGINE_NUM, max_nr_desc); + mode_manager_init(&bmk_ctx->mode_manager, &bmk_ctx->ec, BMK1880v2_ENGINE_NUM); + + cvk_prv_data_t *prv_data = malloc(sizeof(cvk_prv_data_t)); + prv_data->bmk_ctx = bmk_ctx; + prv_data->cmdbuf = req_info->cmdbuf; + prv_data->cmdbuf_size = req_info->cmdbuf_size; + + ctx->priv_data = prv_data; +} diff --git a/cvikernel/src/cvikernel.c b/cvikernel/src/cvikernel.c new file mode 100644 index 000000000..d3434b65e --- /dev/null +++ b/cvikernel/src/cvikernel.c @@ -0,0 +1,100 @@ +#include "kernel_internal.h" +#include "cvikernel/cvikernel.h" + +typedef struct internal_data { + ec_t ec; + mode_manager_t mode_manager; + uint32_t cmdbuf_ptr; + uint32_t max_nr_desc; + uint32_t cur_nr_desc; + desc_pair_t *desc_pairs; + uint32_t lmem_ptr; +} internal_data_t; + +/* Avoid to export interface */ +extern char *cvikernel_get_chip_info_1822(void); +extern void cvikernel_init_1822( + cvk_reg_info_t *req_info, + cvk_context_t *context); +#if CHIPID == 0x3 +extern char *cvikernel_get_chip_info_cv181x(void); +extern void cvikernel_init_cv181x( + cvk_reg_info_t *req_info, + cvk_context_t *context); +#elif CHIPID == 0x4 +extern char *cvikernel_get_chip_info_cv180x(void); +extern void cvikernel_init_cv180x( + cvk_reg_info_t *req_info, + cvk_context_t *context); +#elif CHIPID == 0x1 +extern char *cvikernel_get_chip_info_1880v2(void); +extern void cvikernel_init_1880v2( + cvk_reg_info_t *req_info, + cvk_context_t *context); +#elif CHIPID == 0x2 +#else +extern char *cvikernel_get_chip_info_cv181x(void); +extern void cvikernel_init_cv181x( + cvk_reg_info_t *req_info, + cvk_context_t *context); +extern char *cvikernel_get_chip_info_cv180x(void); +extern void cvikernel_init_cv180x( + cvk_reg_info_t *req_info, + cvk_context_t *context); +extern char *cvikernel_get_chip_info_1880v2(void); +extern void cvikernel_init_1880v2( + cvk_reg_info_t *req_info, + cvk_context_t *context); +#endif + +typedef struct chip_query_info { + char *(*get_chip_version)(void); + void (*chip_init)(cvk_reg_info_t *req_info, cvk_context_t *context); +} chip_query_info_t; + +// Supported chips +static chip_query_info_t cvikernel_chip_list[] = { +#if CHIPID == 0x3 + {cvikernel_get_chip_info_cv181x, cvikernel_init_cv181x}, +#elif CHIPID == 0x4 + {cvikernel_get_chip_info_cv180x, cvikernel_init_cv180x}, +#elif CHIPID == 0x1 + {cvikernel_get_chip_info_1880v2, cvikernel_init_1880v2}, +#elif CHIPID == 0x2 +#else + {cvikernel_get_chip_info_cv181x, cvikernel_init_cv181x}, + {cvikernel_get_chip_info_cv180x, cvikernel_init_cv180x}, + {cvikernel_get_chip_info_1880v2, cvikernel_init_1880v2}, +#endif + {cvikernel_get_chip_info_1822, cvikernel_init_1822} +}; + +#define NUM_DEVICES (sizeof(cvikernel_chip_list)/sizeof(chip_query_info_t)) + +cvk_context_t *cvikernel_register(cvk_reg_info_t *req_info) +{ + if (!req_info) + return NULL; + if (!req_info->cmdbuf) + return NULL; + + size_t req_chip_size = sizeof(req_info->chip_ver_str); + size_t req_chip_len = strlen(req_info->chip_ver_str); + + for (size_t i = 0; i < NUM_DEVICES; i++) { + char *version = (*cvikernel_chip_list[i].get_chip_version)(); + + // Compare chip string + if (!strncmp(version, req_info->chip_ver_str, req_chip_size) && + strlen(version) == req_chip_len) { + cvk_context_t *context = 
malloc(sizeof(cvk_context_t)); + if (!context) + return NULL; + + (*cvikernel_chip_list[i].chip_init)(req_info, context); + return context; + } + } + + return NULL; +} diff --git a/cvikernel/src/engine_conductor.c b/cvikernel/src/engine_conductor.c new file mode 100644 index 000000000..6e9039252 --- /dev/null +++ b/cvikernel/src/engine_conductor.c @@ -0,0 +1,255 @@ +#include "kernel_internal.h" + +#define ENABLE_UPDATE_TDMA_WAIT_ID + +#ifdef CVK_EC_DEBUG +char *get_engine_id_str(uint32_t engine_id) +{ + switch (engine_id) { + case 0: + return "TPU"; + case 1: + return "CPU"; + case 2: + return "TDMA"; + default: + break; + } + + return "UNK"; +} + +void dump_desc(ec_desc_t *desc) +{ + printf(" engine_id %d(%s)\n", desc->engine_id, get_engine_id_str(desc->engine_id)); + printf(" desc_offset %d\n", desc->desc_offset); + printf(" followers_offset %d\n", desc->followers_offset); + printf(" sync_ids_offset %d\n", desc->sync_ids_offset); + + if (desc->sync_ids) + printf(" sync_ids [TPU] %d, [TDMA] %d\n", desc->sync_ids[0], desc->sync_ids[2]); +} +#endif /* CVK_EC_DEBUG */ + +static void ec_desc_init(ec_desc_t *d, uint32_t engine_id, uint32_t nr_engines) +{ + d->engine_id = engine_id; + + uint32_t nr_followers = nr_engines - 1; + for (uint32_t i = 0; i < nr_followers; i++) + d->followers[i] = NULL; + + for (uint32_t i = 0; i < nr_engines; i++) + d->sync_ids[i] = 0; +} + +static void add_follower(ec_desc_t *d, ec_desc_t *follower, uint32_t nr_engines) +{ + if (d->engine_id == follower->engine_id) + return; + + uint32_t nr_followers = nr_engines - 1; + for (uint32_t fi = 0; fi < nr_followers; fi++) { + ec_desc_t **f = &d->followers[fi]; + if ((*f) == NULL) { + *f = follower; + return; + } else if ((*f)->engine_id == follower->engine_id) { + if ((*f) > follower) + (*f) = follower; + return; + } + } + ASSERT(0 && "desc->followers[] overflowed"); +} + +static uint32_t assign_sync_ids(ec_desc_t desc[], uint32_t nr_desc, uint32_t nr_engines) +{ + uint32_t ids[nr_engines]; + for (uint32_t i = 0; i < nr_engines; i++) + ids[i] = 0; + + for (uint32_t di = 0; di < nr_desc; di++) { + ec_desc_t *d = &desc[di]; + uint32_t ei = d->engine_id; // self engine id + + /* + * NOTE: + * Make sync_id equal to the number of descriptors + * to coincide with runtime code. + */ + d->sync_ids[ei] = ++ids[ei]; // Assign self sequence number + + if (ids[ei] == 0xffff) { + return di + 1; + } + } + + return nr_desc; +} + +static void update_followers(ec_desc_t desc[], uint32_t nr_desc, uint32_t nr_engines) +{ + for (uint32_t i = 0; i < nr_desc; i++) { + ec_desc_t *d = &desc[i]; + + uint32_t nr_followers = nr_engines - 1; + for (uint32_t fi = 0; fi < nr_followers; fi++) { + ec_desc_t *f = d->followers[fi]; + if (f == NULL) + break; + + // Follower must after current descriptor and before last descriptor. 
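      /*
       * Illustrative trace (engine 0 = TPU, engine 2 = TDMA, as in
       * get_engine_id_str() above): if assign_sync_ids() gave a TIU
       * descriptor sync_ids[0] = 5 and a TDMA descriptor was registered
       * as its follower, the assignment inside the range check below sets
       * the follower's sync_ids[0] to 5, i.e. "do not start before TIU
       * command 5 has been issued".
       */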
+ if (f >= desc && f < &desc[nr_desc]) { + // Assign self id to follower's wait id + uint32_t ei = d->engine_id; + f->sync_ids[ei] = d->sync_ids[ei]; + } + } + } +} + +#ifdef ENABLE_UPDATE_TDMA_WAIT_ID +// +// Case 1: +// Zero wait_tiu_id in TDMA command: +// TIU : [wait_tdma_id=65|tiu_id=32] +// TDMA: [tdma_id=66|wait_tiu_id=32] +// TDMA: [tdma_id=67|wait_tiu_id=0] => zero wait_tiu_id +// TDMA: [tdma_id=68|wait_tiu_id=0] => zero wait_tiu_id +// TDMA: [tdma_id=69|wait_tiu_id=0] => zero wait_tiu_id +// +// Reuse previous wait_tiu_id: +// TIU : [wait_tdma_id=65|tiu_id=32] +// TDMA: [tdma_id=66|wait_tiu_id=32] +// TDMA: [tdma_id=67|wait_tiu_id=32] => Reuse previous wait_tiu_id +// TDMA: [tdma_id=68|wait_tiu_id=32] => Reuse previous wait_tiu_id +// TDMA: [tdma_id=69|wait_tiu_id=32] => Reuse previous wait_tiu_id +// +// Case 2: +// Zero wait_tiu_id in TDMA command: +// TDMA: [tdma_id=3|wait_tiu_id=0] +// TIU : [wait_tdma_id=3|tiu_id=1] +// TDMA: [tdma_id=4|wait_tiu_id=0] +// TIU : [wait_tdma_id=4|tiu_id=2] +// TDMA: [tdma_id=5|wait_tiu_id=1] +// TDMA: [tdma_id=6|wait_tiu_id=0] => zero wait_tiu_id +// +// Reuse previous wait_tiu_id: +// TDMA: [tdma_id=3|wait_tiu_id=0] +// TIU : [wait_tdma_id=3|tiu_id=1] +// TDMA: [tdma_id=4|wait_tiu_id=0] => Still zero, not wait previous TIU +// TIU : [wait_tdma_id=4|tiu_id=2] +// TDMA: [tdma_id=5|wait_tiu_id=1] +// TDMA: [tdma_id=6|wait_tiu_id=1] => Reuse previous wait_tiu_id +// +static void update_tdma_wait_id(ec_desc_t desc[], uint32_t nr_desc) +{ + uint32_t prev_wait_tiu_id = 0; + + for (uint32_t i = 0; i < nr_desc; i++) { + ec_desc_t *d = &desc[i]; + uint32_t ei = d->engine_id; + + // Only handle TDMA + if (ei != 2) + continue; + + // Reuse TIU wait id of previous TDMA command. + if (!d->sync_ids[0] && prev_wait_tiu_id) + d->sync_ids[0] = prev_wait_tiu_id; + + // Record last wait tpu id in TDMA command. + // Not tpu id of last TIU command, it forces TDMA to wait TIU. 
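    /*
     * Walking Case 1 above through this loop body (illustrative):
     * TDMA 66 carries sync_ids[0] = 32, so prev_wait_tiu_id becomes 32;
     * TDMA 67, 68 and 69 arrive with sync_ids[0] = 0 and inherit 32
     * through the reuse branch above.
     */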
+ prev_wait_tiu_id = d->sync_ids[0]; + } +} +#endif + +static void compute_sync_ids(ec_desc_t desc[], uint32_t nr_desc, uint32_t nr_engines) +{ + uint32_t nr_done = 0; + for (uint32_t i = 0; i < nr_desc; i += nr_done) { + // Assign command id of each engine (TPU, TDMA) + nr_done = assign_sync_ids(&desc[i], nr_desc - i, nr_engines); + + // Update wait id (wait_tdma_id in TIU, wait_tpu_id in TDMA) + update_followers(&desc[i], nr_done, nr_engines); + +#ifdef ENABLE_UPDATE_TDMA_WAIT_ID + // Update wait id (wait_tpu_id in TDMA) + update_tdma_wait_id(&desc[i], nr_done); +#endif + } +} + +void ec_init(ec_t *ec, uint32_t nr_engines, uint32_t max_nr_desc) +{ + ec->nr_engines = nr_engines; + + ec->max_nr_desc = max_nr_desc; + ec->cur_nr_desc = 0; + ec->desc = xmalloc(max_nr_desc * sizeof(ec->desc[0])); + + uint32_t nr_followers = nr_engines - 1; + uint32_t total_followers = max_nr_desc * nr_followers; + uint32_t follower_buf_size = total_followers * sizeof(ec->follower_buf[0]); + ec->follower_buf = xmalloc(follower_buf_size); + + uint32_t total_sync_ids = max_nr_desc * nr_engines; + uint32_t sync_id_buf_size = total_sync_ids * sizeof(ec->sync_id_buf[0]); + ec->sync_id_buf = xmalloc(sync_id_buf_size); +} + +void ec_reset(ec_t *ec) +{ + ec->cur_nr_desc = 0; +} + +void ec_destroy(ec_t *ec) +{ + free(ec->desc); + free(ec->follower_buf); + free(ec->sync_id_buf); +} + +ec_desc_t * ec_alloc_desc(ec_t *ec, uint32_t engine_id) +{ + ASSERT(engine_id < ec->nr_engines); + ASSERT(ec->cur_nr_desc < ec->max_nr_desc); + + uint32_t nr_followers = ec->nr_engines - 1; + uint32_t i = ec->cur_nr_desc++; + + ec_desc_t *d = &ec->desc[i]; + d->followers = &ec->follower_buf[i * nr_followers]; + d->sync_ids = &ec->sync_id_buf[i * ec->nr_engines]; + ec_desc_init(d, engine_id, ec->nr_engines); + +#ifdef CVK_EC_DEBUG + d->desc_offset = i; + d->followers_offset = i * nr_followers; + d->sync_ids_offset = i * ec->nr_engines; + + // dump_desc(d); +#endif + + + return d; +} + +void ec_add_dependency(ec_t *ec, ec_desc_t *before, ec_desc_t *after) +{ + ec_desc_t *start = ec->desc; + ec_desc_t *end = &ec->desc[ec->cur_nr_desc]; + + ASSERT(before >= start && before < end); + ASSERT(after >= start && after < end); + + add_follower(before, after, ec->nr_engines); +} + +void ec_compute_sync_ids(ec_t *ec) +{ + compute_sync_ids(ec->desc, ec->cur_nr_desc, ec->nr_engines); +} diff --git a/cvikernel/src/engine_conductor.h b/cvikernel/src/engine_conductor.h new file mode 100644 index 000000000..15f37b8dc --- /dev/null +++ b/cvikernel/src/engine_conductor.h @@ -0,0 +1,45 @@ +#ifndef ENGINE_CONDUCTOR_H +#define ENGINE_CONDUCTOR_H + +#include + +// #include + +// #define CVK_EC_DEBUG + +typedef struct ec_desc { + uint32_t engine_id; + struct ec_desc **followers; + uint16_t *sync_ids; + +#ifdef CVK_EC_DEBUG + // desc, follower and sync_ids are pointers. + // It is easier to debug from address offset instead of pointers. 
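  // For example, with three engines (TPU/CPU/TDMA, so two followers per
  // descriptor), descriptor i gets followers_offset = 2 * i and
  // sync_ids_offset = 3 * i, matching the follower_buf and sync_id_buf
  // slices handed out by ec_alloc_desc().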
+ uint32_t desc_offset; + uint32_t followers_offset; + uint32_t sync_ids_offset; +#endif + +} ec_desc_t; + +typedef struct { + uint32_t nr_engines; + + uint32_t max_nr_desc; + uint32_t cur_nr_desc; + ec_desc_t *desc; + + ec_desc_t **follower_buf; + uint16_t *sync_id_buf; +} ec_t; + +void ec_init(ec_t *ec, uint32_t nr_engines, uint32_t max_nr_desc); +void ec_reset(ec_t *ec); +void ec_destroy(ec_t *ec); + +ec_desc_t * ec_alloc_desc(ec_t *ec, uint32_t engine_id); + +void ec_add_dependency(ec_t *ec, ec_desc_t *before, ec_desc_t *after); +void ec_compute_sync_ids(ec_t *ec); + +#endif /* ENGINE_CONDUCTOR_H */ diff --git a/cvikernel/src/engine_state.c b/cvikernel/src/engine_state.c new file mode 100644 index 000000000..3df33dd7d --- /dev/null +++ b/cvikernel/src/engine_state.c @@ -0,0 +1,33 @@ +#include "kernel_internal.h" + +void engine_state_init(engine_state_t *es, uint32_t nr_engines) +{ + es->nr_engines = nr_engines; + es->last_desc = xmalloc(nr_engines * sizeof(es->last_desc[0])); + engine_state_reset(es); +} + +void engine_state_reset(engine_state_t *es) +{ + for (uint32_t ei = 0; ei < es->nr_engines; ei++) + es->last_desc[ei] = NULL; +} + +void engine_state_copy(engine_state_t *dst, engine_state_t *src) +{ + engine_state_init(dst, src->nr_engines); + for (uint32_t ei = 0; ei < src->nr_engines; ei++) + dst->last_desc[ei] = src->last_desc[ei]; +} + +void engine_state_update(engine_state_t *es, ec_desc_t *d) +{ + es->last_desc[d->engine_id] = d; +} + +void engine_state_destroy(engine_state_t *es) +{ + es->nr_engines = 0; + free(es->last_desc); + es->last_desc = NULL; +} diff --git a/cvikernel/src/engine_state.h b/cvikernel/src/engine_state.h new file mode 100644 index 000000000..66dfef2e9 --- /dev/null +++ b/cvikernel/src/engine_state.h @@ -0,0 +1,17 @@ +#ifndef CVIKERNEL_ENGINE_STATE_H +#define CVIKERNEL_ENGINE_STATE_H + +//#include "kernel_internal.h" + +typedef struct { + uint32_t nr_engines; + ec_desc_t **last_desc; +} engine_state_t; + +void engine_state_init(engine_state_t *es, uint32_t nr_engines); +void engine_state_update(engine_state_t *es, ec_desc_t *d); +void engine_state_copy(engine_state_t *dst, engine_state_t *src); +void engine_state_reset(engine_state_t *es); +void engine_state_destroy(engine_state_t *es); + +#endif /* CVIKERNEL_ENGINE_STATE_H */ diff --git a/cvikernel/src/kernel_internal.h b/cvikernel/src/kernel_internal.h new file mode 100644 index 000000000..dff239160 --- /dev/null +++ b/cvikernel/src/kernel_internal.h @@ -0,0 +1,94 @@ +#ifndef __INST_INTERNAL_H__ +#define __INST_INTERNAL_H__ + +#define ASSERT(cond) \ + do { \ + if (cond) { \ + /* To catch warnings in `cond' */; \ + } else { \ + fprintf(stderr, \ + "error: %s: line %d: function %s: " \ + "assertion `%s' failed\n", \ + __FILE__, __LINE__, __func__, #cond); \ + abort(); \ + } \ + } while (0) + +#include +#include +#include +#include +#include +#include +#include "engine_conductor.h" +#include "engine_state.h" +#include "mode_manager.h" + +#ifdef assert +#error "Please don't use assert. Use ASSERT instead." 
+#endif + +#define KiB (1 << 10) +#define MiB (1 << 20) +#define GiB (1 << 30) + +typedef struct { + cmd_hdr_t *cmd_hdr; + ec_desc_t *ec_desc; +} desc_pair_t; + +static inline void * xmalloc(size_t size) +{ + void *p = malloc(size); + ASSERT(p); + return p; +} + +static inline int bitsize_of_fmt(fmt_t fmt) +{ + switch (fmt) { + case FMT_F32: + case FMT_I32: + return 32; + case FMT_F16: + case FMT_I16: + case FMT_U16: + case FMT_BF16: + return 16; + case FMT_I8: + case FMT_U8: + return 8; + case FMT_I4: + return 4; + case FMT_I2: + return 2; + case FMT_I1: + return 1; + default: + ASSERT(0); + return -1; + } +} + +static inline int ceiling_bytesize_of(int bitsize) +{ + return ceiling_func_shift(bitsize, 3); +} + +static inline int ceiling_bytesize_of_array(int data_count, fmt_t fmt) +{ + int bitsize = bitsize_of_fmt(fmt); + return ceiling_bytesize_of(data_count * bitsize); +} + +static inline void replace_u32(uint32_t *data, uint32_t start, uint32_t width, uint32_t value) +{ + ASSERT(start < 32); + ASSERT(width > 0 && width <= 32); + ASSERT((start + width) <= 32); + + uint32_t mask = ~(((1 << width) - 1) << start); + *data = (*data & mask) | (value << start); +} + +#endif /* __INST_INTERNAL_H__ */ diff --git a/cvikernel/src/lmem.c b/cvikernel/src/lmem.c new file mode 100644 index 000000000..998c316cb --- /dev/null +++ b/cvikernel/src/lmem.c @@ -0,0 +1,115 @@ +#include "lmem.h" + +static uint32_t align_la_bm1880(uint32_t la, fmt_t fmt, uint32_t eu_num) +{ + int data_size = bitsize_of_fmt(fmt) / 8; + ASSERT(data_size == 1 || data_size == 2); + la = ceiling_func(la, data_size); + la = ALIGN(la, eu_num) * data_size; + return la; +} + +static void bank_init(bank_t *b, uint32_t size, uint32_t eu_num) +{ + b->size = size; + b->ptr = 0; + b->eu_num = eu_num; +} + +static uint32_t bank_alloc_bm1880( + bank_t *b, int size, fmt_t fmt, uint8_t eu_align, uint8_t la_align) +{ + uint32_t la = b->ptr; + if (eu_align || la_align) + la = align_la_bm1880(la, fmt, b->eu_num); + + if (la + size > b->size) + return -1; + + b->ptr = la + size; + return la; +} + +static void bank_free(bank_t *b, uint32_t la, uint32_t size) +{ + ASSERT(size <= b->ptr); + ASSERT(la <= b->ptr); + ASSERT(la + size <= b->ptr); + + b->ptr = la; +} + +static void lmem_mark_bank_used(lmem_t *lmem, int bank_id) +{ + uint32_t bank_base_addr = bank_id * lmem->chip_info->lmem_bank_size; + bank_t *b = &lmem->banks[BANK_ALL_ID]; + ASSERT(b->ptr <= bank_base_addr); + if (b->size > bank_base_addr) + b->size = bank_base_addr; +} + +static void lmem_mark_bank_free(lmem_t *lmem, uint32_t bank_id) +{ + uint32_t bank_size = lmem->chip_info->lmem_bank_size; + bank_t *all_bank = &lmem->banks[BANK_ALL_ID]; + + if (bank_id != all_bank->size / bank_size) + return; + + for (int i = bank_id; i < LMEM_MAX_BANKS; i++) { + if (lmem->banks[i].ptr == 0) + all_bank->size += bank_size; + else + break; + } +} + +void lmem_init(lmem_t *lmem, bmk_chip_info_t *chip_info) +{ + uint32_t eu_num = chip_info->eu_num; + uint32_t lmem_size = chip_info->lmem_size; + uint32_t bank_size = chip_info->lmem_bank_size; + uint32_t lmem_banks = chip_info->lmem_banks; + ASSERT(lmem_banks <= LMEM_MAX_BANKS); + + lmem->chip_info = chip_info; + bank_init(&lmem->banks[BANK_ALL_ID], lmem_size, eu_num); + for (uint32_t i = 0; i < lmem_banks; i++) + bank_init(&lmem->banks[i], bank_size, eu_num); +} + +uint32_t lmem_alloc( + lmem_t *lmem, int bank_id, int size, fmt_t fmt, + uint8_t eu_align, uint8_t la_align) +{ + uint32_t chip_version = lmem->chip_info->version; + bank_t *b = 
&lmem->banks[bank_id]; + + uint32_t la = -1; + if (chip_version == 1880) + la = bank_alloc_bm1880(b, size, fmt, eu_align, la_align); + + if (la == (uint32_t)-1) + return -1; + + if (bank_id != BANK_ALL_ID) + lmem_mark_bank_used(lmem, bank_id); + + return la; +} + +void lmem_free(lmem_t *lmem, int bank_id, uint32_t la, int size) +{ + uint32_t bank_size = lmem->chip_info->lmem_bank_size; + + uint32_t bank_base_addr = 0; + if (bank_id != BANK_ALL_ID) + bank_base_addr = bank_id * bank_size; + + bank_t *b = &lmem->banks[bank_id]; + bank_free(b, la - bank_base_addr, size); + + if (bank_id != BANK_ALL_ID) + if (b->ptr == 0) + lmem_mark_bank_free(lmem, bank_id); +} diff --git a/cvikernel/src/lmem.h b/cvikernel/src/lmem.h new file mode 100644 index 000000000..13ee5ff8f --- /dev/null +++ b/cvikernel/src/lmem.h @@ -0,0 +1,21 @@ +#include "kernel_internal.h" + +#define LMEM_MAX_BANKS 8 +#define BANK_ALL_ID LMEM_MAX_BANKS + +typedef struct { + uint32_t size; + uint32_t ptr; + uint32_t eu_num; +} bank_t; + +typedef struct { + bmk_chip_info_t *chip_info; + bank_t banks[LMEM_MAX_BANKS + 1]; +} lmem_t; + +void lmem_init(lmem_t *lmem, bmk_chip_info_t *chip_info); +uint32_t lmem_alloc( + lmem_t *lmem, int bank_id, int size, fmt_t fmt, + uint8_t eu_align, uint8_t la_align); +void lmem_free(lmem_t *lmem, int bank_id, uint32_t la, int size); diff --git a/cvikernel/src/mode_manager.c b/cvikernel/src/mode_manager.c new file mode 100644 index 000000000..ca6102356 --- /dev/null +++ b/cvikernel/src/mode_manager.c @@ -0,0 +1,142 @@ +#include "kernel_internal.h" + +static void enable_serial(mode_manager_t *mm) +{ + serial_mode_init(&mm->serial_mode, &mm->engine_state, mm->ec); + mm->mode = BMK_SERIAL_MODE; +} + +static void enable_parallel(mode_manager_t *mm) +{ + parallel_mode_init(&mm->parallel_mode, &mm->engine_state, mm->ec); + mm->mode = BMK_PARALLEL_MODE; +} + +static void enable_stream(mode_manager_t *mm, uint32_t nr_streams) +{ + stream_mode_init(&mm->stream_mode, &mm->engine_state, mm->ec, nr_streams); + mm->mode = BMK_STREAM_MODE; +} + +static void destroy_current_mode(mode_manager_t *mm) +{ + switch (mm->mode) { + case BMK_SERIAL_MODE: + serial_mode_destroy(&mm->serial_mode); + break; + case BMK_PARALLEL_MODE: + parallel_mode_destroy(&mm->parallel_mode); + break; + case BMK_STREAM_MODE: + stream_mode_destroy(&mm->stream_mode); + break; + default: + ASSERT(0); + } +} + +void mode_manager_init(mode_manager_t *mm, ec_t *ec, uint32_t nr_engines) +{ + engine_state_init(&mm->engine_state, nr_engines); + mm->ec = ec; + + enable_serial(mm); +} + +void mode_manager_destroy(mode_manager_t *mm) +{ + engine_state_destroy(&mm->engine_state); + destroy_current_mode(mm); +} + +void mode_manager_reset(mode_manager_t *mm) +{ + ec_reset(mm->ec); + engine_state_reset(&mm->engine_state); + + destroy_current_mode(mm); + enable_serial(mm); +} + +void mode_manager_enable_parallel(mode_manager_t *mm) +{ + if (mm->mode == BMK_PARALLEL_MODE) + return; + + destroy_current_mode(mm); + enable_parallel(mm); +} + +void mode_manager_disable_parallel(mode_manager_t *mm) +{ + if (mm->mode != BMK_PARALLEL_MODE) { + ASSERT(mm->mode == BMK_SERIAL_MODE); + return; + } + + destroy_current_mode(mm); + enable_serial(mm); +} + +void mode_manager_create_streams(mode_manager_t *mm, uint32_t nr_streams) +{ + ASSERT(mm->mode == BMK_SERIAL_MODE); + + destroy_current_mode(mm); + enable_stream(mm, nr_streams); +} + +void mode_manager_destroy_streams(mode_manager_t *mm) +{ + ASSERT(mm->mode == BMK_STREAM_MODE); + + destroy_current_mode(mm); + 
enable_serial(mm); +} + +void mode_manager_set_stream(mode_manager_t *mm, uint32_t i) +{ + ASSERT(mm->mode == BMK_STREAM_MODE); + + stream_mode_set_stream(&mm->stream_mode, i); +} + +void mode_manager_restart_sync_id(mode_manager_t *mm) +{ + ec_reset(mm->ec); + engine_state_reset(&mm->engine_state); + destroy_current_mode(mm); + + switch (mm->mode) { + case BMK_SERIAL_MODE: + serial_mode_init(&mm->serial_mode, &mm->engine_state, mm->ec); + break; + case BMK_PARALLEL_MODE: + parallel_mode_init(&mm->parallel_mode, &mm->engine_state, mm->ec); + break; + case BMK_STREAM_MODE: + stream_mode_init(&mm->stream_mode, &mm->engine_state, mm->ec, + mm->stream_mode.nr_streams); + break; + default: + ASSERT(0); + } +} + +void mode_manager_record_ec_desc(mode_manager_t *mm, ec_desc_t *d) +{ + engine_state_update(&mm->engine_state, d); + switch (mm->mode) { + case BMK_SERIAL_MODE: + serial_mode_record_desc(&mm->serial_mode, d); + break; + case BMK_PARALLEL_MODE: + parallel_mode_record_desc(&mm->parallel_mode, d); + break; + case BMK_STREAM_MODE: + stream_mode_record_desc(&mm->stream_mode, d); + break; + default: + ASSERT(0); + } +} diff --git a/cvikernel/src/mode_manager.h b/cvikernel/src/mode_manager.h new file mode 100644 index 000000000..07b235334 --- /dev/null +++ b/cvikernel/src/mode_manager.h @@ -0,0 +1,94 @@ +#ifndef CVIKERNEL_MODE_MANAGER_H +#define CVIKERNEL_MODE_MANAGER_H + +//#include "kernel_internal.h" +#include "engine_state.h" + +// Basic concept of TIU/TDMA command sequence management: +// 1. Each command buffer allocation +// Engine conductor allocates one descriptor. +// For previous command buffer, assign current one as follower. +// +// 2. Runtime retrieve all command buffers +// Assign unique non-zero command id to itself. +// Assign its command id to the follower as wait id. +// +// Difference between serial and parallel mode: +// Serial mode keeps track of each command sequence. +// serial_mode_record_desc() { +// engine_state_update() +// } +// +// Serial mode always has previous command buffer so it always can assign +// followers. +// +// Concurrent TDMA and TIU command execution: +// TDMA command runs without waiting previous TIU command: +// 1. parallel_disable +// 2. parallel_enable +// 3. tiu command +// 4. tdma command (not wait TIU command) +// 5. tdma command (not wait TIU command) +// +// Since parallel mode does not update last command buffer, the tiu command +// is not updated and the following tdma command will not wait TIU command. 
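// Sketch in terms of this module's API (illustrative; tiu_d and tdma_d
// stand for descriptors obtained from ec_alloc_desc()):
//
//   mode_manager_enable_parallel(mm);        // snapshot last_desc per engine
//   mode_manager_record_ec_desc(mm, tiu_d);  // depends only on pre-enable descs
//   mode_manager_record_ec_desc(mm, tdma_d); // likewise: no wait on tiu_d
//   mode_manager_disable_parallel(mm);       // back to serial tracking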
+// +typedef struct { + engine_state_t engine_state; + ec_t *ec; +} serial_mode_t; + +void serial_mode_init(serial_mode_t *m, engine_state_t *es, ec_t *ec); +void serial_mode_record_desc(serial_mode_t *m, ec_desc_t *d); +void serial_mode_destroy(serial_mode_t *m); + +typedef struct { + engine_state_t engine_state; + ec_t *ec; +} parallel_mode_t; + +void parallel_mode_init(parallel_mode_t *m, engine_state_t *es, ec_t *ec); +void parallel_mode_record_desc(parallel_mode_t *m, ec_desc_t *d); +void parallel_mode_destroy(parallel_mode_t *m); + +typedef struct { + uint32_t nr_streams; + serial_mode_t *streams; + serial_mode_t *cur_stream; +} stream_mode_t; + +void stream_mode_init( + stream_mode_t *m, + engine_state_t *es, + ec_t *ec, + uint32_t nr_streams); +void stream_mode_record_desc(stream_mode_t *m, ec_desc_t *d); +void stream_mode_set_stream(stream_mode_t *m, uint32_t i); +void stream_mode_destroy(stream_mode_t *m); + +typedef struct { + engine_state_t engine_state; + ec_t *ec; + +#define BMK_SERIAL_MODE 0 +#define BMK_PARALLEL_MODE 1 +#define BMK_STREAM_MODE 2 + uint32_t mode; + + serial_mode_t serial_mode; + parallel_mode_t parallel_mode; + stream_mode_t stream_mode; +} mode_manager_t; + +void mode_manager_init(mode_manager_t *mm, ec_t *ec, uint32_t nr_engines); +void mode_manager_destroy(mode_manager_t *mm); +void mode_manager_reset(mode_manager_t *mm); +void mode_manager_enable_parallel(mode_manager_t *mm); +void mode_manager_disable_parallel(mode_manager_t *mm); +void mode_manager_create_streams(mode_manager_t *mm, uint32_t nr_streams); +void mode_manager_destroy_streams(mode_manager_t *mm); +void mode_manager_set_stream(mode_manager_t *mm, uint32_t i); +void mode_manager_restart_sync_id(mode_manager_t *mm); +void mode_manager_record_ec_desc(mode_manager_t *mm, ec_desc_t *d); + +#endif /* CVIKERNEL_MODE_MANAGER_H */ diff --git a/cvikernel/src/parallel_mode.c b/cvikernel/src/parallel_mode.c new file mode 100644 index 000000000..2ddf5849a --- /dev/null +++ b/cvikernel/src/parallel_mode.c @@ -0,0 +1,23 @@ +#include "kernel_internal.h" + +void parallel_mode_init(parallel_mode_t *m, engine_state_t *es, ec_t *ec) +{ + engine_state_copy(&m->engine_state, es); + m->ec = ec; +} + +void parallel_mode_record_desc(parallel_mode_t *m, ec_desc_t *d) +{ + uint32_t nr_engines = m->engine_state.nr_engines; + + for (uint32_t i = 0; i < nr_engines; i++) { + ec_desc_t *before = m->engine_state.last_desc[i]; + if (before) + ec_add_dependency(m->ec, before, d); + } +} + +void parallel_mode_destroy(parallel_mode_t *m) +{ + engine_state_destroy(&m->engine_state); +} diff --git a/cvikernel/src/serial_mode.c b/cvikernel/src/serial_mode.c new file mode 100644 index 000000000..6beed5a5e --- /dev/null +++ b/cvikernel/src/serial_mode.c @@ -0,0 +1,28 @@ +#include "kernel_internal.h" + +void serial_mode_init(serial_mode_t *m, engine_state_t *es, ec_t *ec) +{ + engine_state_copy(&m->engine_state, es); + m->ec = ec; +} + +void serial_mode_record_desc(serial_mode_t *m, ec_desc_t *d) +{ + uint32_t nr_engines = m->engine_state.nr_engines; + + for (uint32_t i = 0; i < nr_engines; i++) { + ec_desc_t *before = m->engine_state.last_desc[i]; + if (before) + ec_add_dependency(m->ec, before, d); + } + + // 1st in mode_manager_record_ec_desc() updates last_desc of mode manager. + // This one updates last_desc of serial model + // The only one difference compared to parallel mode. 
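  // Because last_desc advances here, the next descriptor recorded in
  // serial mode picks this one up as a dependency in the loop above;
  // parallel_mode_record_desc() skips this update, so descriptors recorded
  // within one parallel region never depend on one another.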
+ engine_state_update(&m->engine_state, d); +} + +void serial_mode_destroy(serial_mode_t *m) +{ + engine_state_destroy(&m->engine_state); +} diff --git a/cvikernel/src/stream_mode.c b/cvikernel/src/stream_mode.c new file mode 100644 index 000000000..dc1ba5794 --- /dev/null +++ b/cvikernel/src/stream_mode.c @@ -0,0 +1,33 @@ +#include "kernel_internal.h" + +void stream_mode_init( + stream_mode_t *m, engine_state_t *es, ec_t *ec, uint32_t nr_streams) +{ + m->nr_streams = nr_streams; + m->streams = xmalloc(nr_streams * sizeof(*m->streams)); + for (uint32_t i = 0; i < nr_streams; i++) + serial_mode_init(&m->streams[i], es, ec); + m->cur_stream = &m->streams[0]; +} + +void stream_mode_record_desc(stream_mode_t *m, ec_desc_t *d) +{ + serial_mode_record_desc(m->cur_stream, d); +} + +void stream_mode_set_stream(stream_mode_t *m, uint32_t i) +{ + ASSERT(i < m->nr_streams); + m->cur_stream = &m->streams[i]; +} + +void stream_mode_destroy(stream_mode_t *m) +{ + for (uint32_t i = 0; i < m->nr_streams; i++) + serial_mode_destroy(&m->streams[i]); + + free(m->streams); + m->nr_streams = 0; + m->streams = NULL; + m->cur_stream = NULL; +} diff --git a/cvikernel/tools/readcmdbuf.cpp b/cvikernel/tools/readcmdbuf.cpp new file mode 100644 index 000000000..98f2a71cc --- /dev/null +++ b/cvikernel/tools/readcmdbuf.cpp @@ -0,0 +1,361 @@ +#include +#include +#include +#include +#include +#include +#ifdef BM1880v2 +#include +#include +#include +#include "../src/bm1880v2/kernel_1880v2.h" +//#include "../../include/builder/Builder.hpp" +#include +#include // NOLINT(readability/streams) +#include +#include +#include +using namespace std; + +using google::protobuf::io::FileInputStream; + +#else /* ! if defined(CHIP) && CHIP == BM1880v2*/ + +#include +#endif /* if defined(CHIP) && CHIP == BM1880v2*/ + +int get_file_length(std::fstream& f) +{ + f.seekg (0, f.end); + int length = f.tellg(); + f.seekg (0, f.beg); + return length; +} + +#if defined(CHIP) && CHIP == BM1880v2 +static int bm1880v2_get_engine_desc_length(uint32_t engine_id) +{ + switch (engine_id) { + case BMK1880v2_TIU: + return TIU_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t); + case BMK1880v2_TDMA: + return TDMA_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t); + case BMK1880v2_CPU: + return CPU_ENGINE_DESCRIPTOR_NUM * sizeof(uint32_t); + default: + ASSERT(0); + } +} +static void parseCmdBuf(char* cmdbuf_path, map & map_layer_id_name, + char* output_file_path) { + + std::fstream f_cmdbuf(cmdbuf_path, std::ios::in | std::ios::binary); + + size_t i = 0; + size_t cmdbuf_size = get_file_length(f_cmdbuf); + //printf("cmdbuf size %zu\n", cmdbuf_size); + uint8_t *cmdbuf = new uint8_t[cmdbuf_size]; + uint32_t *buf = (uint32_t *)cmdbuf; + + std::ofstream output_file_path_fp; + output_file_path_fp.open(output_file_path); + f_cmdbuf.read((char *)cmdbuf, cmdbuf_size); + + while(i < cmdbuf_size / 4) { + cmd_hdr_t *hdr; + hdr = (cmd_hdr_t *)(&buf[i]); + i += sizeof(cmd_hdr_t) / 4; + const uint32_t *cmd = (uint32_t*)&buf[i]; + ASSERT(hdr->magic == BM_CMB_HDR_MAGIC); + uint32_t eng_id = hdr->engine_id; + //uint32_t cmd_len = cmd_hdr_len(hdr) / 4; + uint32_t cmd_len = bm1880v2_get_engine_desc_length(eng_id) / 4; + i += cmd_len; + + switch (eng_id) { + case BMK1880v2_TIU: + { + tiu_reg_t tiuTempReg; + int layer_id; + string mappined = ""; + map::iterator iter; + + parse_tiu_reg(&tiuTempReg, cmd); + layer_id = tiuTempReg.rsvd3; + + if (output_file_path) { + iter = map_layer_id_name.find(layer_id); + + if (iter != map_layer_id_name.end()) { + mappined = iter->second; + output_file_path_fp << 
tiuTempReg.cmd_id_tpu << "," << mappined << "\n"; + } + } + LOG(INFO) << "CMD [BD ], <" << + tiuTempReg.cmd_id_tpu + << ">, len " << cmd_len; + } + break; + case BMK1880v2_TDMA: + { + tdma_reg_t tdma_reg; + parse_tdma_reg(&tdma_reg, cmd); + LOG(INFO) << "[GDMA ], <" << tdma_reg.cmd_id + << ">, len " << cmd_len; + } + break; + case BMK1880v2_CPU: + LOG(INFO) << "[CPU], "; + break; + default: + CHECK(0) << "impossible eng_id " << eng_id << " not found"; + } + } + + if (output_file_path) { + output_file_path_fp.close(); + } +} + +static void read_bm1880v2_cmd(char* cmdbuf_path, char* layer_id_name_mapping_file_path, + char* output_file_path) { + string line; + map map_layer_id_name; + map::iterator iter; + + if (layer_id_name_mapping_file_path) { + std::ifstream file(layer_id_name_mapping_file_path); + + if (file.fail()) { + CHECK(0) << "File not found: " << cmdbuf_path; + } + + while (getline(file, line)) { + istringstream templine(line); + string data; + int group_id; + while (getline(templine, data,',')) { + group_id = atof(data.c_str()); + break; + } + map_layer_id_name[group_id] = line; + } + + //for(iter = map_layer_id_name.begin(); iter != map_layer_id_name.end(); iter++) { + // cout<first<<" "<second<> 8) & 0xff; +} + +static const char* desc_get_md_scale_op(void *desc) +{ + uint32_t *r = (uint32_t *)desc; + int op = (r[BD_CMD_REGI2] >> 19) & 0x3; + switch(op) { + case 0: return "ADD"; + case 1: return "SUB"; + case 2: return "MUL"; + case 3: return "DIV"; + default: return "UNKOWN"; + } +} + +static inline void desc_get_cmd_id( + void *desc, int engine_id, uint32_t cmd_len, + uint32_t& bd_cmd_id, uint32_t& gdma_cmd_id) +{ + uint32_t *r = (uint32_t *)desc; + if(ENGINE_BD == engine_id) { + if (cmd_len == 28) { // bd len = 28 in bm1880 + uint64_t data = ((uint64_t*)desc)[0]; + bd_cmd_id = ( data >> 3) & 0x0ffff; + gdma_cmd_id = (data >> 19) & 0x0ffff; + } else { + bd_cmd_id = (r[BD_CMD_REGI1] >> 16) & 0x0ffff; + gdma_cmd_id = (r[BD_CMD_REGI23] >> 16) & 0x0ffff; + } + } else if(ENGINE_GDMA == engine_id) { + bd_cmd_id = (r[GDMA_CMD_ACCPI1] >> 16) & 0x0ffff; + gdma_cmd_id = (r[GDMA_CMD_ACCPI0] >> 16) & 0x0ffff; + } +} + +static void parse_gdma_operand_addr(void *desc, uint64_t& src, uint64_t& dst) { + uint32_t *r = (uint32_t *)desc; + src = 0; + dst = 0; + int direction = (int)((r[GDMA_CMD_ACCPI0] >> GDMA_ACCPI0_DIRECTION_BIT) & 0x3); + switch (direction){ + case GDMA_DIR_S2L: + src = r[GDMA_CMD_ACCPI12] + (((uint64_t)(r[GDMA_CMD_ACCPI13] & 0xff000000)) << 8); + dst = r[GDMA_CMD_ACCPI11]; + break; + case GDMA_DIR_L2S: + src = r[GDMA_CMD_ACCPI12]; + dst = r[GDMA_CMD_ACCPI11] + (((uint64_t)(r[GDMA_CMD_ACCPI13] & 0x00ff0000)) << 16); + break; + case GDMA_DIR_S2S: + src = r[GDMA_CMD_ACCPI12] + (((uint64_t)(r[GDMA_CMD_ACCPI13] & 0xff000000)) << 8); + dst = r[GDMA_CMD_ACCPI11] + (((uint64_t)(r[GDMA_CMD_ACCPI13] & 0x00ff0000)) << 16); + break; + case GDMA_DIR_L2L: + src = r[GDMA_CMD_ACCPI12]; + dst = r[GDMA_CMD_ACCPI11]; + break; + default: + break; + } +} + +static std::string bd_type(void *desc) +{ + int dcr_type = desc_get_bd_dcr_type(desc); + CHECK_LT(dcr_type, NR_DCR_TYPES); + + switch (dcr_type) { + case DCR_TYPE_CONV: + return std::string("CONV"); + case DCR_TYPE_MD_SUM: + return std::string("MD_SUM"); + case DCR_TYPE_MD_LINEAR: + return std::string("MD_LINEAR"); + case DCR_TYPE_MD_SCALAR: + return std::string("MD_SCALAR ") + std::string(desc_get_md_scale_op(desc)); + case DCR_TYPE_MD_SFU: + return std::string("MD_SFU"); + case DCR_TYPE_MD_CMP: + return std::string("MD_CMP"); + case 
DCR_TYPE_POOLING_FWD: + return std::string("MD_POOLING_FWD"); + case DCR_TYPE_POOLING_BWD: + return std::string("MD_POOLING_BWD"); + case DCR_TYPE_MATRIX_MULTIPLY: + return std::string("MATMUL"); + case DCR_TYPE_IMAGE_SUM: + return std::string("IMG_SUM"); + case DCR_TYPE_CONV_COEFF: + return std::string("CONV_COEFF"); + case DCR_TYPE_CONV_COEFF_MAC: + return std::string("CONV_COEFF_MAC"); + case DCR_TYPE_LMEM_ARRANGE: + return std::string("LMEM_ARRANGE"); + case DCR_TYPE_TENSOR_ARITHMETIC: + return std::string("LMEM_ARITHMETIC"); + + default: + LOG(FATAL) << "Unknown BD dcr_type " << dcr_type; + } +} + +static void parse_cmdbuf(uint8_t *cmdbuf, size_t cmdbuf_size) +{ + uint32_t *buf = (uint32_t *)cmdbuf; + size_t i = 0; + + while(i < cmdbuf_size / 4) { + cmd_hdr_t *hdr; + void *cmd; + hdr = (cmd_hdr_t *)(&buf[i]); +#ifdef DETAILED_CMD + printf("magic:%d len: %d engine_id: %d node_id: %d" + "flags: %d mask: %d cmd[0]:%d\n", (int)hdr->magic, (int)cmd_hdr_len(hdr), + (int)hdr->engine_id, (int)hdr->__deprecated, (int)hdr->flags, (int)hdr->mask, + (int)hdr->cmd[0]); +#endif + i += sizeof(cmd_hdr_t) / 4; + cmd = (void *)&buf[i]; + uint32_t cmd_len = cmd_hdr_len(hdr) / 4; + uint64_t src, dst; + uint32_t bd_cmd_id = 0, gdma_cmd_id = 0; + desc_get_cmd_id(cmd, hdr->engine_id, cmd_len, bd_cmd_id, gdma_cmd_id); + +#ifdef DETAILED_CMD + uint32_t * c = (uint32_t *)cmd; +#endif + + switch (hdr->engine_id) { + case ENGINE_BD: + LOG(INFO) << "CMD [BD ], <" + << bd_cmd_id << "," << gdma_cmd_id + << ">, len " << cmd_len << ", " + << bd_type(cmd); +#ifdef DETAILED_CMD + for(size_t k = 0; k < cmd_len; k++) + LOG(INFO) << "[BD " << k << "]:" << std::hex << c[k]; +#endif + break; + case ENGINE_GDMA: + parse_gdma_operand_addr(cmd, src, dst); + LOG(INFO) << "CMD [GDMA], <" + << bd_cmd_id << "," << gdma_cmd_id + << ">, 0x" << std::hex << src + << " => 0x" << std::hex << dst + << ", len " << cmd_len; +#ifdef DETAILED_CMD + for(size_t k = 0; k < cmd_len; k++) + LOG(INFO) << "[GDMA " << k << "]:" << std::hex << c[k]; +#endif + break; + case ENGINE_CPU: + LOG(INFO) << "CMD [ARM ], len " << cmd_len; +#ifdef DETAILED_CMD + for(size_t k = 0; k < cmd_len; k++) + LOG(INFO) << "[CPU " << k << "]:" << std::hex << c[k]; +#endif + break; + default: + LOG(FATAL) << "UNKNOWN CMD, len " << cmd_len + << " ,engine_id " << (int)hdr->engine_id; + } + i += cmd_len; + } +} +#endif /* if defined(CHIP) && CHIP == BM1880v2*/ + +int main (int argc, char *argv[]) +{ +#if defined(CHIP) && CHIP == BM1880v2 + char* layer_id_name_mapping_file_path = NULL; + char* output_file_path = NULL; + if (argc != 4 && argc != 2) { + printf("Usage: %s cmdbuf.bin \n", argv[0]); + exit(1); + } + + if (argc == 4) { + layer_id_name_mapping_file_path = argv[2]; + output_file_path = argv[3]; + } + + read_bm1880v2_cmd(argv[1], layer_id_name_mapping_file_path, output_file_path); +#else + if (argc != 2) { + printf("Usage: %s cmdbuf.bin\n", argv[0]); + exit(1); + } + + std::fstream f_cmdbuf(argv[1], std::ios::in | std::ios::binary); + + size_t cmdbuf_size = get_file_length(f_cmdbuf); + printf("cmdbuf size %zu\n", cmdbuf_size); + uint8_t *cmdbuf = new uint8_t[cmdbuf_size]; + f_cmdbuf.read((char *)cmdbuf, cmdbuf_size); + //dump_hex("cmdbuf", cmdbuf, 16); + + parse_cmdbuf(cmdbuf, cmdbuf_size); +#endif /* if CHIP == "BM1880v2"*/ + + return 0; +}
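/*
 * Usage, reconstructed from the argument handling above (file names are
 * placeholders):
 *
 *   readcmdbuf cmdbuf.bin
 *   readcmdbuf cmdbuf.bin layer_id_to_name.csv tiu_ids.csv   (BM1880v2 build)
 *
 * In the BM1880v2 build the optional mapping file is read line by line,
 * the text before the first comma is parsed as the layer id, and every TIU
 * descriptor whose layer id matches is written to the output file as
 * "<cmd_id_tpu>,<mapping line>".
 */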