From: Georgi Gerganov Date: Thu, 10 Apr 2025 21:02:24 +0000 (+0300) Subject: sync : fix (skip) (#0) X-Git-Tag: upstream/0.0.1982~41 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=2abf606f098844faebee578996cae9c6d63a40e2;p=pkg%2Fggml%2Fsources%2Fggml sync : fix (skip) (#0) ggml-ci --- diff --git a/src/ggml-cann/kernels/CMakeLists.txt b/src/ggml-cann/kernels/CMakeLists.txt deleted file mode 100644 index d687220c..00000000 --- a/src/ggml-cann/kernels/CMakeLists.txt +++ /dev/null @@ -1,30 +0,0 @@ -file(GLOB SRC_FILES - get_row_f32.cpp - get_row_f16.cpp - get_row_q4_0.cpp - get_row_q8_0.cpp - quantize_f32_q8_0.cpp - quantize_f16_q8_0.cpp - quantize_float_to_q4_0.cpp - dup.cpp -) - -set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR}) -set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim") - -if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) - set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) -elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) - set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) -else() - message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.") -endif() -include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) - -ascendc_library(ascendc_kernels STATIC - ${SRC_FILES} -) - -message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.") -ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") -# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP) diff --git a/src/ggml-cann/kernels/ascendc_kernels.h b/src/ggml-cann/kernels/ascendc_kernels.h deleted file mode 100644 index 7e153208..00000000 --- a/src/ggml-cann/kernels/ascendc_kernels.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef ASCENDC_KERNELS_H -#define ASCENDC_KERNELS_H - -#include "aclrtlaunch_ascendc_get_row_f32.h" -#include "aclrtlaunch_ascendc_get_row_f16.h" -#include "aclrtlaunch_ascendc_get_row_q8_0.h" -#include "aclrtlaunch_ascendc_get_row_q4_0.h" - -#include "aclrtlaunch_ascendc_quantize_f32_q8_0.h" -#include "aclrtlaunch_ascendc_quantize_f16_q8_0.h" -#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h" -#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h" - -#include "aclrtlaunch_ascendc_dup_by_rows_fp16.h" -#include "aclrtlaunch_ascendc_dup_by_rows_fp32.h" -#include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h" -#include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h" - -#endif // ASCENDC_KERNELS_H diff --git a/src/ggml-cann/kernels/dup.cpp b/src/ggml-cann/kernels/dup.cpp deleted file mode 100644 index d9b95744..00000000 --- a/src/ggml-cann/kernels/dup.cpp +++ /dev/null @@ -1,234 +0,0 @@ -#include "kernel_operator.h" - -using namespace AscendC; - -#define BUFFER_NUM 2 -const int64_t SUPPORTED_MAX_DIM = 65535; // currently the limit of max block dim supportted by dup kernel is 65535template - -template -class DupByRows { - public: - __aicore__ inline DupByRows() {} - __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub, - size_t *input_nb_ub) { - /* Dup by rows when src is contigous on first dimension and dst is - contiguous, each kernel process one row. - */ - - // Input has four dims. 
- int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - // param - num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3]; - num_elem = input_ne_ub[0]; - - // index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3) - idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]); - idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])) - / (input_ne_ub[1]); - idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]) - - idx_ne2 * input_ne_ub[1]; - - // src may not contiguous in dim [1,2,3], so stride decited by ne&nb - src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2 - + input_nb_ub[1] * idx_ne1; - - // dst is contiguous - dst_stride = op_block_idx * (input_ne_ub[0] * sizeof(DST_T)); - - src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src + - src_stride)); - dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst + - dst_stride)); - - pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem + - 32 - 1) / 32 * 32); - pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem + - 32 - 1) / 32 * 32); - } - - __aicore__ inline void copy_in() { - LocalTensor src_local = src_queue.AllocTensor(); - const size_t elem_per_block = 32 / sizeof(SRC_T); - size_t tail = num_elem % elem_per_block; - size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem; - DataCopy(src_local, src_gm, cpy_elements_len); - src_queue.EnQue(src_local); - } - - __aicore__ inline void copy_out() { - LocalTensor dst_local = dst_queue.DeQue(); -#ifdef ASCEND_310P - const size_t elem_per_block = 32 / sizeof(DST_T); - size_t tail = num_elem % elem_per_block; - size_t len = num_elem & ~(elem_per_block - 1); - if (len > 0) { - DataCopy(dst_gm, dst_local, len); - } - if(tail != 0) { - for (size_t i = tail; i < elem_per_block; i++) { - dst_local[len + i].SetValue(0, 0); - } - SetAtomicAdd(); - DataCopy(dst_gm[len], dst_local[len], elem_per_block); - SetAtomicNone(); - } -#else - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = num_elem * sizeof(DST_T); - DataCopyPad(dst_gm, dst_local, dataCopyParams); -#endif - dst_queue.FreeTensor(dst_local); - } - - __aicore__ inline void dup() { - // main process, copy one row data from src to dst. - copy_in(); - - LocalTensor src_local = src_queue.DeQue(); - LocalTensor dst_local = dst_queue.AllocTensor(); - - int32_t BLOCK_NUM = 32 / sizeof(DST_T); - DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1) - / BLOCK_NUM * BLOCK_NUM); - dst_queue.EnQue(dst_local); - - src_queue.FreeTensor(src_local); - copy_out(); - } - - __aicore__ inline void dup_with_cast() { - // main process, copy one row data from src to dst. - // cast dtype from src to dst. 
- copy_in(); - - LocalTensor src_local = src_queue.DeQue(); - LocalTensor dst_local = dst_queue.AllocTensor(); - - Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem); - dst_queue.EnQue(dst_local); - - src_queue.FreeTensor(src_local); - copy_out(); - } - - private: - - TPipe pipe; - GlobalTensor src_gm; - GlobalTensor dst_gm; - - int64_t num_rows; - int64_t num_elem; - int64_t idx_ne3; - int64_t idx_ne2; - int64_t idx_ne1; - int64_t src_stride; - int64_t dst_stride; - - TQue src_queue; - TQue dst_queue; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16( - GM_ADDR src_gm, - GM_ADDR dst_gm, - GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, - GM_ADDR output_ne_gm, - GM_ADDR output_nb_gm) { - - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - DupByRows op; - op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); - op.dup(); -} - -extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32( - GM_ADDR src_gm, - GM_ADDR dst_gm, - GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, - GM_ADDR output_ne_gm, - GM_ADDR output_nb_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - DupByRows op; - op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); - op.dup(); -} - -extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16( - GM_ADDR src_gm, - GM_ADDR dst_gm, - GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, - GM_ADDR output_ne_gm, - GM_ADDR output_nb_gm) { - - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - DupByRows op; - op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); - op.dup_with_cast(); -} - -extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32( - GM_ADDR src_gm, - GM_ADDR dst_gm, - GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, - GM_ADDR output_ne_gm, - GM_ADDR output_nb_gm) { - - // copy params from gm to ub. - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - DupByRows op; - op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); - op.dup_with_cast(); -} diff --git a/src/ggml-cann/kernels/get_row_f16.cpp b/src/ggml-cann/kernels/get_row_f16.cpp deleted file mode 100644 index 416b4510..00000000 --- a/src/ggml-cann/kernels/get_row_f16.cpp +++ /dev/null @@ -1,197 +0,0 @@ -#include "kernel_operator.h" - -// optimize me. Use template to avoid copy code. 
-using namespace AscendC; - -#define BUFFER_NUM 2 - -class GET_ROW_F16 { - public: - __aicore__ inline GET_ROW_F16() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, - int64_t *input_ne_ub, size_t *input_nb_ub, - int64_t *indices_ne_ub, size_t *indices_nb_ub, - int64_t *output_ne_ub, size_t *output_nb_ub) { - // TODO, use template for F16/f32 - int64_t op_block_num = GetBlockNum(); - op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; - - indices_ne[i] = indices_ne_ub[i]; - indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; - - output_ne[i] = output_ne_ub[i]; - output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; - } - - // Indices has two dims. n_elements = all rows should get. - // dr, all rows should this thread get. - uint64_t n_elements = - indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; - dr = n_elements / op_block_num; - - uint64_t tails = n_elements % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - input_gm.SetGlobalBuffer((__gm__ half *)input); - indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); - output_gm.SetGlobalBuffer((__gm__ float *)output); - - uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31) - & ~31); - uint64_t output_local_buffer_size = ((input_ne[0] * sizeof(float) + 31) - & ~31); - - local_buffer_elems = input_local_buffer_size / sizeof(half); - - // TODO, consider long row that can't put in UB. - // All data should asign to 32. It's ok because all data is align to 32. - pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size); - pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size); - } - - __aicore__ inline void copy_in(uint32_t offset, size_t len) { - size_t origin_len = len; - LocalTensor input_local = input_queue.AllocTensor(); - const size_t elem_per_block = 32 / sizeof(half); - size_t tail = len % elem_per_block; - len = len & ~(elem_per_block - 1); - if(tail != 0) { - len += elem_per_block; - } - DataCopy(input_local, input_gm[offset], len); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset, size_t len) { - LocalTensor output_local = output_queue.DeQue(); - const size_t elem_per_block = 32 / sizeof(float); - size_t tail = len % elem_per_block; - len = len & ~(elem_per_block - 1); - if (len > 0) { - DataCopy(output_gm[offset], output_local, len); - } - - if(tail != 0) { -#ifdef ASCEND_310P - for (size_t i = tail; i < elem_per_block; i++) { - output_local[len + i].SetValue(0, 0); - } - SetAtomicAdd(); - DataCopy(output_gm[offset + len], output_local[len], elem_per_block); - SetAtomicNone(); -#else - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = tail * sizeof(float); - DataCopyPad(output_gm[offset + len], output_local[len], - dataCopyParams); -#endif - } - output_queue.FreeTensor(output_local); - } - - __aicore__ inline void calculate_row(int64_t idx) { - const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); - const int64_t indices_ne1_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / - indices_ne[0]; - const int64_t indices_ne0_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - - indices_ne1_idx * indices_ne[0]); - - const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + - indices_ne1_idx * indices_stride[1] + - indices_ne2_idx * 
indices_stride[2]; - const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); - - const int64_t input_offset = selected_row_idx * input_stride[1] + - indices_ne1_idx * input_stride[2] + - indices_ne2_idx * input_stride[3]; - - const int64_t output_offset = indices_ne0_idx * output_stride[1] + - indices_ne1_idx * output_stride[2] + - indices_ne2_idx * output_stride[3]; - - copy_in(input_offset, input_ne[0]); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor output_local = output_queue.AllocTensor(); - - Cast(output_local, input_local, RoundMode::CAST_NONE, - local_buffer_elems); - output_queue.EnQue(output_local); - copy_out(output_offset, input_ne[0]); - - input_queue.FreeTensor(input_local); - } - - __aicore__ inline void calculate() { - for (int64_t i = ir; i < ir + dr; i++) { - calculate_row(i); - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t indices_ne[4]; - size_t indices_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - size_t local_buffer_elems; - - int64_t ir; - int64_t dr; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor indices_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - int64_t op_block_idx; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_get_row_f16( - GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, - GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm, - GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t indices_ne_ub[4]; - size_t indices_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(indices_ne_gm, indices_ne_ub, 32); - copy_to_ub(indices_nb_gm, indices_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - GET_ROW_F16 op; - op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub, - indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub); - op.calculate(); -} diff --git a/src/ggml-cann/kernels/get_row_f32.cpp b/src/ggml-cann/kernels/get_row_f32.cpp deleted file mode 100644 index 02116905..00000000 --- a/src/ggml-cann/kernels/get_row_f32.cpp +++ /dev/null @@ -1,190 +0,0 @@ -#include "kernel_operator.h" - -// optimize me. Use template to avoid copy code. -using namespace AscendC; - -#define BUFFER_NUM 2 - -class GET_ROW_F32 { - public: - __aicore__ inline GET_ROW_F32() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, - int64_t *input_ne_ub, size_t *input_nb_ub, - int64_t *indices_ne_ub, size_t *indices_nb_ub, - int64_t *output_ne_ub, size_t *output_nb_ub) { - int64_t op_block_num = GetBlockNum(); - op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; - - indices_ne[i] = indices_ne_ub[i]; - indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; - - output_ne[i] = output_ne_ub[i]; - output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; - } - - // Indices has two dims. n_elements = all rows should get. - // dr, all rows should this thread get. 
- uint64_t n_elements = - indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; - dr = n_elements / op_block_num; - - uint64_t tails = n_elements % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - input_gm.SetGlobalBuffer((__gm__ float *)input); - indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); - output_gm.SetGlobalBuffer((__gm__ float *)output); - - uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31); - local_buffer_elems = local_buffer_size / sizeof(float); - - // TODO, consider long row that can't put in UB. - // All data should asign to 32. It's ok because all data is align to 32. - pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size); - pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size); - } - - __aicore__ inline void copy_in(uint32_t offset, size_t len) { - LocalTensor input_local = input_queue.AllocTensor(); - const size_t elem_per_block = 32 / sizeof(float); - size_t tail = len % elem_per_block; - len = len & ~(elem_per_block - 1); - if(tail != 0) { - len += elem_per_block; - } - DataCopy(input_local, input_gm[offset], len); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset, size_t len) { - LocalTensor output_local = output_queue.DeQue(); - const size_t elem_per_block = 32 / sizeof(float); - size_t tail = len % elem_per_block; - len = len & ~(elem_per_block - 1); - if (len > 0) { - DataCopy(output_gm[offset], output_local, len); - } - - if(tail != 0) { -#ifdef ASCEND_310P - for (size_t i = tail; i < elem_per_block; i++) { - output_local[len + i].SetValue(0, 0); - } - SetAtomicAdd(); - DataCopy(output_gm[offset + len], output_local[len], elem_per_block); - SetAtomicNone(); -#else - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = tail * sizeof(float); - DataCopyPad(output_gm[offset + len], output_local[len], - dataCopyParams); -#endif - } - output_queue.FreeTensor(output_local); - } - - __aicore__ inline void calculate_row(int64_t idx) { - const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); - const int64_t indices_ne1_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / - indices_ne[0]; - const int64_t indices_ne0_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - - indices_ne1_idx * indices_ne[0]); - - const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + - indices_ne1_idx * indices_stride[1] + - indices_ne2_idx * indices_stride[2]; - const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); - - const int64_t input_offset = selected_row_idx * input_stride[1] + - indices_ne1_idx * input_stride[2] + - indices_ne2_idx * input_stride[3]; - - const int64_t output_offset = indices_ne0_idx * output_stride[1] + - indices_ne1_idx * output_stride[2] + - indices_ne2_idx * output_stride[3]; - - copy_in(input_offset, input_ne[0]); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor output_local = output_queue.AllocTensor(); - - DataCopy(output_local, input_local, local_buffer_elems); - output_queue.EnQue(output_local); - copy_out(output_offset, input_ne[0]); - - input_queue.FreeTensor(input_local); - } - - __aicore__ inline void calculate() { - for (int64_t i = ir; i < ir + dr; i++) { - calculate_row(i); - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t indices_ne[4]; - size_t indices_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - size_t 
local_buffer_elems; - - int64_t ir; - int64_t dr; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor indices_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - int64_t op_block_idx; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_get_row_f32( - GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, - GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm, - GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t indices_ne_ub[4]; - size_t indices_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(indices_ne_gm, indices_ne_ub, 32); - copy_to_ub(indices_nb_gm, indices_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - GET_ROW_F32 op; - op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub, - indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub); - op.calculate(); -} diff --git a/src/ggml-cann/kernels/get_row_q4_0.cpp b/src/ggml-cann/kernels/get_row_q4_0.cpp deleted file mode 100644 index 4fbe7220..00000000 --- a/src/ggml-cann/kernels/get_row_q4_0.cpp +++ /dev/null @@ -1,204 +0,0 @@ -#include "kernel_operator.h" - -// optimize me. Use template to avoid copy code. -using namespace AscendC; -#ifdef ASCEND_310P // 310P not support 4bit get row - extern "C" __global__ __aicore__ void ascendc_get_row_q4_0( - GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, - GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm, - GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { - // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. - printf("Ascend310P not support 4bit get row.\n"); - } -#else - -#define BUFFER_NUM 2 - -#define QK4_0 32 - -class GET_ROW_Q4_0 { - public: - __aicore__ inline GET_ROW_Q4_0() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, - int64_t *input_ne_ub, int64_t *indices_ne_ub, - size_t *indices_nb_ub, int64_t *output_ne_ub, - size_t *output_nb_ub) { - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - indices_ne[i] = indices_ne_ub[i]; - indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; - scale_ne[i] = input_ne_ub[i]; - output_ne[i] = output_ne_ub[i]; - output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; - } - - // one scale for a group. - scale_ne[0] /= QK4_0; - - input_stride[0] = 1; - scale_stride[0] = 1; - output_stride[0] = 1; - for (int i = 1; i < 4; i++) { - input_stride[i] = input_stride[i - 1] * input_ne[i - 1]; - scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; - } - - group_size_in_row = input_ne[0] / QK4_0; - int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] * - input_ne[3] / 2; - - // Indices has two dims. n_elements = all rows should get. - // dr, all rows should this thread get. 
- uint64_t n_elements = - indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; - dr = n_elements / op_block_num; - - uint64_t tails = n_elements % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - input_gm.SetGlobalBuffer((__gm__ int4b_t *)input); - scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset)); - indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); - output_gm.SetGlobalBuffer((__gm__ float *)output); - - pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t)); - pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half)); - pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float)); - } - - __aicore__ inline void copy_in(uint32_t offset) { - LocalTensor input_local = input_queue.AllocTensor(); - // 32 * sizeof(int4b_t) = 16, which is not aligned to 32, why no error? - DataCopy(input_local, input_gm[offset], QK4_0); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset) { - LocalTensor output_local = output_queue.DeQue(); - DataCopy(output_gm[offset], output_local, QK4_0); - output_queue.FreeTensor(output_local); - } - - __aicore__ inline void calculate_group(int64_t idx, int64_t group) { - const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); - const int64_t indices_ne1_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / - indices_ne[0]; - const int64_t indices_ne0_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - - indices_ne1_idx * indices_ne[0]); - - const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + - indices_ne1_idx * indices_stride[1] + - indices_ne2_idx * indices_stride[2]; - const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); - - const int64_t input_offset = selected_row_idx * input_stride[1] + - indices_ne1_idx * input_stride[2] + - indices_ne2_idx * input_stride[3] + - group * QK4_0; - const int64_t scale_offset = selected_row_idx * scale_stride[1] + - indices_ne1_idx * scale_stride[2] + - indices_ne2_idx * scale_stride[3] + group; - const int64_t output_offset = indices_ne0_idx * output_stride[1] + - indices_ne1_idx * output_stride[2] + - indices_ne2_idx * output_stride[3] + - group * QK4_0; - - copy_in(input_offset); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor cast_local = cast_queue.AllocTensor(); - LocalTensor output_local = output_queue.AllocTensor(); - - // TODO: cast more data to speed up. - Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0); - Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0); - - // Only mul need compile by group. 
- half scale = scale_gm.GetValue(scale_offset); - - Muls(output_local, output_local, (float)scale, QK4_0); - - input_queue.FreeTensor(input_local); - cast_queue.FreeTensor(cast_local); - output_queue.EnQue(output_local); - - copy_out(output_offset); - } - - __aicore__ inline void calculate() { - for (int64_t i = ir; i < ir + dr; i++) { - for (int64_t j = 0; j < group_size_in_row; j++) { - calculate_group(i, j); - } - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t scale_ne[4]; - size_t scale_stride[4]; - - int64_t indices_ne[4]; - size_t indices_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - int64_t ir; - int64_t dr; - - int64_t group_size_in_row; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor scale_gm; - GlobalTensor indices_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - TQue cast_queue; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_get_row_q4_0( - GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, - GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm, - GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { - int64_t input_ne_ub[4]; - int64_t indices_ne_ub[4]; - size_t indices_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(indices_ne_gm, indices_ne_ub, 32); - copy_to_ub(indices_nb_gm, indices_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - GET_ROW_Q4_0 op; - op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub, - indices_nb_ub, output_ne_ub, output_nb_ub); - op.calculate(); -} - -#endif // #ifdef ASCEND_310P diff --git a/src/ggml-cann/kernels/get_row_q8_0.cpp b/src/ggml-cann/kernels/get_row_q8_0.cpp deleted file mode 100644 index ba9ab3c0..00000000 --- a/src/ggml-cann/kernels/get_row_q8_0.cpp +++ /dev/null @@ -1,191 +0,0 @@ -#include "kernel_operator.h" - -// optimize me. Use template to avoid copy code. -using namespace AscendC; - -#define BUFFER_NUM 2 - -#define QK8_0 32 - -class GET_ROW_Q8_0 { - public: - __aicore__ inline GET_ROW_Q8_0() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, - int64_t *input_ne_ub, int64_t *indices_ne_ub, - size_t *indices_nb_ub, int64_t *output_ne_ub, - size_t *output_nb_ub) { - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - indices_ne[i] = indices_ne_ub[i]; - indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; - scale_ne[i] = input_ne_ub[i]; - output_ne[i] = output_ne_ub[i]; - output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; - } - - // one scale for a group. - scale_ne[0] /= QK8_0; - - input_stride[0] = 1; - scale_stride[0] = 1; - output_stride[0] = 1; - for (int i = 1; i < 4; i++) { - input_stride[i] = input_stride[i - 1] * input_ne[i - 1]; - scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; - } - - group_size_in_row = input_ne[0] / QK8_0; - int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] * - input_ne[3] * sizeof(int8_t); - - // Indices has two dims. n_elements = all rows should get. - // dr, all rows should this thread get. 
- uint64_t n_elements = - indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; - dr = n_elements / op_block_num; - - uint64_t tails = n_elements % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - input_gm.SetGlobalBuffer((__gm__ int8_t *)input); - scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset)); - indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); - output_gm.SetGlobalBuffer((__gm__ float *)output); - - pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t)); - pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half)); - pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float)); - } - - __aicore__ inline void copy_in(uint32_t offset) { - LocalTensor input_local = input_queue.AllocTensor(); - DataCopy(input_local, input_gm[offset], QK8_0); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset) { - LocalTensor output_local = output_queue.DeQue(); - DataCopy(output_gm[offset], output_local, QK8_0); - output_queue.FreeTensor(output_local); - } - - __aicore__ inline void calculate_group(int64_t idx, int64_t group) { - const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); - const int64_t indices_ne1_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / - indices_ne[0]; - const int64_t indices_ne0_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - - indices_ne1_idx * indices_ne[0]); - - const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + - indices_ne1_idx * indices_stride[1] + - indices_ne2_idx * indices_stride[2]; - const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); - - const int64_t input_offset = selected_row_idx * input_stride[1] + - indices_ne1_idx * input_stride[2] + - indices_ne2_idx * input_stride[3] + - group * QK8_0; - const int64_t scale_offset = selected_row_idx * scale_stride[1] + - indices_ne1_idx * scale_stride[2] + - indices_ne2_idx * scale_stride[3] + group; - const int64_t output_offset = indices_ne0_idx * output_stride[1] + - indices_ne1_idx * output_stride[2] + - indices_ne2_idx * output_stride[3] + - group * QK8_0; - - copy_in(input_offset); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor cast_local = cast_queue.AllocTensor(); - LocalTensor output_local = output_queue.AllocTensor(); - - // TODO: cast more data to speed up. - Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0); - Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0); - - // Only mul need compile by group. 
- half scale = scale_gm.GetValue(scale_offset); - Muls(output_local, output_local, (float)scale, QK8_0); - - input_queue.FreeTensor(input_local); - cast_queue.FreeTensor(cast_local); - output_queue.EnQue(output_local); - - copy_out(output_offset); - } - - __aicore__ inline void calculate() { - for (int64_t i = ir; i < ir + dr; i++) { - for (int64_t j = 0; j < group_size_in_row; j++) { - calculate_group(i, j); - } - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t scale_ne[4]; - size_t scale_stride[4]; - - int64_t indices_ne[4]; - size_t indices_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - int64_t ir; - int64_t dr; - - int64_t group_size_in_row; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor scale_gm; - GlobalTensor indices_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - TQue cast_queue; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_get_row_q8_0( - GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, - GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm, - GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { - int64_t input_ne_ub[4]; - int64_t indices_ne_ub[4]; - size_t indices_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(indices_ne_gm, indices_ne_ub, 32); - copy_to_ub(indices_nb_gm, indices_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - GET_ROW_Q8_0 op; - op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub, - indices_nb_ub, output_ne_ub, output_nb_ub); - op.calculate(); -} diff --git a/src/ggml-cann/kernels/quantize_f16_q8_0.cpp b/src/ggml-cann/kernels/quantize_f16_q8_0.cpp deleted file mode 100644 index 504b43af..00000000 --- a/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +++ /dev/null @@ -1,218 +0,0 @@ -#include "kernel_operator.h" - -using namespace AscendC; -#ifdef ASCEND_310P - extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. - printf("Ascend310P not support f16->8bit quantization.\n"); - } -#else - -#define BUFFER_NUM 2 -#define QK8_0 32 - -class QUANTIZE_F16_Q8_0 { - public: - __aicore__ inline QUANTIZE_F16_Q8_0() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR output, - int64_t *input_ne_ub, size_t *input_nb_ub, - int64_t *output_ne_ub) { - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; - - output_ne[i] = output_ne_ub[i]; - } - - output_stride[0] = 1; - for (int i = 1; i < 4; i++) { - output_stride[i] = output_stride[i - 1] * output_ne[i - 1]; - } - - scale_ne = input_ne; - scale_stride[0] = 1; - scale_stride[1] = input_ne[0] / QK8_0; - for (int i = 2; i < 4; i++) { - scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; - } - - // split input tensor by rows. 
- uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3]; - dr = nr / op_block_num; - - uint64_t tails = nr % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - group_size_in_row = scale_stride[1]; - int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] * - output_ne[3] * sizeof(uint8_t); - - input_gm.SetGlobalBuffer((__gm__ half *)input); - output_gm.SetGlobalBuffer((__gm__ int8_t *)output); - scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + ir * - group_size_in_row * - sizeof(half))); - - pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(half)); - pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t)); - pipe.InitBuffer(work_queue, 1, 32); - pipe.InitBuffer(max_queue, 1, 32); - pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float)); - pipe.InitBuffer(scale_queue, 1, 32); - pipe.InitBuffer(cast_queue ,1 ,QK8_0 * sizeof(float)); - } - - __aicore__ inline void copy_in(uint32_t offset) { - LocalTensor input_local = input_queue.AllocTensor(); - DataCopy(input_local, input_gm[offset], QK8_0); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset) { - LocalTensor output_local = output_queue.DeQue(); - DataCopy(output_gm[offset], output_local, QK8_0); - output_queue.FreeTensor(output_local); - } - - __aicore__ inline half calculate_group(int64_t row, int64_t group) { - const int64_t i3 = row / (input_ne[1] * input_ne[2]); - const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1]; - const int64_t i1 = - row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1]; - - const int64_t input_offset = i1 * input_stride[1] + - i2 * input_stride[2] + - i3 * input_stride[3] + QK8_0 * group; - - const int64_t output_offset = i1 * output_stride[1] + - i2 * output_stride[2] + - i3 * output_stride[3] + QK8_0 * group; - - copy_in(input_offset); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor output_local = output_queue.AllocTensor(); - LocalTensor work_local = work_queue.AllocTensor(); - LocalTensor abs_local = abs_queue.AllocTensor(); - LocalTensor max_local = max_queue.AllocTensor(); - LocalTensor cast_local = cast_queue.AllocTensor(); - - Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0); - Abs(abs_local, cast_local, QK8_0); - ReduceMax(max_local, abs_local, work_local, QK8_0); - - pipe_barrier(PIPE_ALL); - float d = max_local.GetValue(0); - d = d / ((1 << 7) - 1); - if (d != 0) { - Muls(cast_local, cast_local, 1.0f / d, QK8_0); - } - - Cast(cast_local, cast_local, RoundMode::CAST_ROUND, QK8_0); - Cast(input_local, cast_local, RoundMode::CAST_ROUND, QK8_0); - Cast(output_local, input_local, RoundMode::CAST_ROUND, QK8_0); - output_queue.EnQue(output_local); - copy_out(output_offset); - - input_queue.FreeTensor(input_local); - work_queue.FreeTensor(work_local); - abs_queue.FreeTensor(abs_local); - max_queue.FreeTensor(max_local); - cast_queue.FreeTensor(cast_local); - return (half)d; - } - - __aicore__ inline void calculate() { - LocalTensor scale_local = scale_queue.AllocTensor(); - uint32_t scale_local_offset = 0; - uint32_t scale_global_offset = 0; - for (int64_t i = ir; i < ir + dr; i++) { - for (int64_t j = 0; j < group_size_in_row; j++) { - half scale = calculate_group(i, j); - scale_local.SetValue(scale_local_offset++, scale); - if (scale_local_offset == 16) { - scale_local_offset = 0; - // TODO: OPTIMIZE ME - pipe_barrier(PIPE_ALL); - DataCopy(scale_gm[scale_global_offset], scale_local, 16); - 
pipe_barrier(PIPE_ALL); - scale_global_offset += 16; - } - } - } - - if (scale_local_offset != 0) { - pipe_barrier(PIPE_ALL); - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = scale_local_offset * sizeof(half); - DataCopyPad(scale_gm[scale_global_offset], scale_local, - dataCopyParams); - pipe_barrier(PIPE_ALL); - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t *scale_ne; - size_t scale_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - int64_t group_size_in_row; - - int64_t ir; - int64_t dr; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor scale_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - TQue work_queue; - TQue max_queue; - TQue abs_queue; - TQue scale_queue; - TQue cast_queue; - -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - - QUANTIZE_F16_Q8_0 op; - op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); - op.calculate(); -} - -#endif // #ifdef ASCEND_310P diff --git a/src/ggml-cann/kernels/quantize_f32_q8_0.cpp b/src/ggml-cann/kernels/quantize_f32_q8_0.cpp deleted file mode 100644 index 05b0bc1d..00000000 --- a/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +++ /dev/null @@ -1,216 +0,0 @@ -#include "kernel_operator.h" - -using namespace AscendC; -#ifdef ASCEND_310P // 310P not support f32->8bit quantization - extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. - printf("Ascend310P not support f32->8bit quantization.\n"); - } -#else - -#define BUFFER_NUM 2 -#define QK8_0 32 - -class QUANTIZE_F32_Q8_0 { - public: - __aicore__ inline QUANTIZE_F32_Q8_0() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR output, - int64_t *input_ne_ub, size_t *input_nb_ub, - int64_t *output_ne_ub) { - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; - - output_ne[i] = output_ne_ub[i]; - } - - output_stride[0] = 1; - for (int i = 1; i < 4; i++) { - output_stride[i] = output_stride[i - 1] * output_ne[i - 1]; - } - - scale_ne = input_ne; - scale_stride[0] = 1; - scale_stride[1] = input_ne[0] / QK8_0; - for (int i = 2; i < 4; i++) { - scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; - } - - // split input tensor by rows. 
- uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3]; - dr = nr / op_block_num; - - uint64_t tails = nr % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - group_size_in_row = scale_stride[1]; - int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] * - output_ne[3] * sizeof(uint8_t); - - input_gm.SetGlobalBuffer((__gm__ float *)input); - output_gm.SetGlobalBuffer((__gm__ int8_t *)output); - scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + - ir * group_size_in_row * - sizeof(half))); - - pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(float)); - pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t)); - pipe.InitBuffer(work_queue, 1, 32); - pipe.InitBuffer(max_queue, 1, 32); - pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float)); - pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(half)); - pipe.InitBuffer(scale_queue, 1, 32); - } - - __aicore__ inline void copy_in(uint32_t offset) { - LocalTensor input_local = input_queue.AllocTensor(); - DataCopy(input_local, input_gm[offset], QK8_0); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset) { - LocalTensor output_local = output_queue.DeQue(); - DataCopy(output_gm[offset], output_local, QK8_0); - output_queue.FreeTensor(output_local); - } - - __aicore__ inline half calculate_group(int64_t row, int64_t group) { - const int64_t i3 = row / (input_ne[1] * input_ne[2]); - const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1]; - const int64_t i1 = - row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1]; - - const int64_t input_offset = i1 * input_stride[1] + - i2 * input_stride[2] + - i3 * input_stride[3] + QK8_0 * group; - - const int64_t output_offset = i1 * output_stride[1] + - i2 * output_stride[2] + - i3 * output_stride[3] + QK8_0 * group; - - copy_in(input_offset); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor output_local = output_queue.AllocTensor(); - LocalTensor work_local = work_queue.AllocTensor(); - LocalTensor abs_local = abs_queue.AllocTensor(); - LocalTensor max_local = max_queue.AllocTensor(); - LocalTensor cast_local = cast_queue.AllocTensor(); - - Abs(abs_local, input_local, QK8_0); - ReduceMax(max_local, abs_local, work_local, QK8_0); - pipe_barrier(PIPE_ALL); - float d = max_local.GetValue(0); - d = d / ((1 << 7) - 1); - if (d != 0) { - Muls(input_local, input_local, 1.0f / d, QK8_0); - } - - Cast(input_local, input_local, RoundMode::CAST_ROUND, QK8_0); - Cast(cast_local, input_local, RoundMode::CAST_ROUND, QK8_0); - Cast(output_local, cast_local, RoundMode::CAST_ROUND, QK8_0); - output_queue.EnQue(output_local); - copy_out(output_offset); - - input_queue.FreeTensor(input_local); - work_queue.FreeTensor(work_local); - abs_queue.FreeTensor(abs_local); - max_queue.FreeTensor(max_local); - cast_queue.FreeTensor(cast_local); - - return (half)d; - } - - __aicore__ inline void calculate() { - LocalTensor scale_local = scale_queue.AllocTensor(); - uint32_t scale_local_offset = 0; - uint32_t scale_global_offset = 0; - for (int64_t i = ir; i < ir + dr; i++) { - for (int64_t j = 0; j < group_size_in_row; j++) { - half scale = calculate_group(i, j); - scale_local.SetValue(scale_local_offset++, scale); - if (scale_local_offset == 16) { - scale_local_offset = 0; - // TODO: OPTIMIZE ME - pipe_barrier(PIPE_ALL); - DataCopy(scale_gm[scale_global_offset], scale_local, 16); - pipe_barrier(PIPE_ALL); - scale_global_offset += 16; - } - } - } - - if 
(scale_local_offset != 0) { - pipe_barrier(PIPE_ALL); - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = scale_local_offset * sizeof(half); - DataCopyPad(scale_gm[scale_global_offset], scale_local, - dataCopyParams); - pipe_barrier(PIPE_ALL); - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t *scale_ne; - size_t scale_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - int64_t group_size_in_row; - - int64_t ir; - int64_t dr; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor scale_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - TQue work_queue; - TQue max_queue; - TQue abs_queue; - TQue cast_queue; - TQue scale_queue; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - - QUANTIZE_F32_Q8_0 op; - op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); - op.calculate(); -} - -#endif // #ifdef ASCEND_310P diff --git a/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp b/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp deleted file mode 100644 index 1188937b..00000000 --- a/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +++ /dev/null @@ -1,295 +0,0 @@ -#include "kernel_operator.h" - -using namespace AscendC; -#ifdef ASCEND_310P // 310P not support float->4bit quantization - extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. - printf("Ascend310P not support f32->4bit quantization.\n"); - } - - extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. 
- printf("Ascend310P not support f16->4bit quantization.\n"); - } -#else - -#define BUFFER_NUM 2 -#define Group_Size 32 - -template -class QUANTIZE_FLOAT_TO_Q4_0 { - public: - __aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR output, - int64_t *input_ne_ub, size_t *input_nb_ub, - int64_t *output_ne_ub) { - // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4], - // permute=[0,0,0,0]): - // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - // input stride of data elements - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; - output_ne[i] = output_ne_ub[i]; - } - - // output stride of data elements - output_stride[0] = 1; - for (int i = 1; i < 4; i++) { - output_stride[i] = output_stride[i - 1] * output_ne[i - 1]; - } - - // scale saved one by one after data:. [group1_scale, group2_scale, ...] - scale_ne = input_ne; - scale_stride[0] = 1; - scale_stride[1] = input_ne[0] / Group_Size; - for (int i = 2; i < 4; i++) { - scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; - } - - // split input tensor by rows. - uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3]; - dr = nr / op_block_num; - - uint64_t tails = nr % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - group_size_in_row = scale_stride[1]; - int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] * - output_ne[3] * sizeof(uint8_t) / 2; - - input_gm.SetGlobalBuffer((__gm__ SRC_T *)input); - output_gm.SetGlobalBuffer((__gm__ int8_t *)output); - scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir * - group_size_in_row * - sizeof(half))); - - pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T)); - pipe.InitBuffer(output_queue, BUFFER_NUM, - Group_Size * sizeof(int8_t) / 2); - pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float)); - pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float)); - pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float)); - pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float)); - pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half)); - pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t)); - pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half)); - } - - __aicore__ inline void copy_in(uint32_t offset) { - LocalTensor input_local = input_queue.AllocTensor(); - DataCopy(input_local, input_gm[offset], Group_Size); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset) { - // reinterpretcast Group_Size(32) * int4b_t to Group_Size / 2 * int8_t, - // and using DataCopyPad to avoid 32 bits align. 
- LocalTensor output_local = output_queue.DeQue(); - LocalTensor output_int8_local = - output_local.ReinterpretCast(); - - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = Group_Size / 2 * sizeof(int8_t); - DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams); - - output_queue.FreeTensor(output_local); - } - - __aicore__ inline void input_to_cast(LocalTensor cast_local, - LocalTensor input_local) { - DataCopy(cast_local, input_local, Group_Size); - } - - __aicore__ inline void input_to_cast(LocalTensor cast_local, - LocalTensor input_local) { - Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size); - } - - __aicore__ inline half calculate_group(int64_t row, int64_t group) { - const int64_t i3 = row / (input_ne[1] * input_ne[2]); - const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1]; - const int64_t i1 = - row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1]; - - const int64_t input_offset = i1 * input_stride[1] + - i2 * input_stride[2] + - i3 * input_stride[3] + Group_Size * group; - - // output_offset is stride for output_gm which datatype is int8_t and - // divided by 2 is needed for int4b_t. - const int64_t output_offset = (i1 * output_stride[1] + - i2 * output_stride[2] + - i3 * output_stride[3] + - Group_Size * group) / 2; - copy_in(input_offset); - - LocalTensor input_local = input_queue.DeQue(); - LocalTensor output_local = output_queue.AllocTensor(); - LocalTensor cast_local = cast_queue.AllocTensor(); - LocalTensor work_local = work_queue.AllocTensor(); - LocalTensor max_local = max_queue.AllocTensor(); - LocalTensor min_local = min_queue.AllocTensor(); - LocalTensor int8_local = int8_queue.AllocTensor(); - LocalTensor half_local = half_queue.AllocTensor(); - - input_to_cast(cast_local, input_local); - - ReduceMax(max_local, cast_local, work_local, Group_Size); - ReduceMin(min_local, cast_local, work_local, Group_Size); - const float max_value = max_local.GetValue(0); - const float min_value = min_local.GetValue(0); - float d = max_value; - if (min_value < 0 && (-1 * min_value) > max_value) { - d = min_value; - } - - d = d / (-8); - if (d != 0) { - Muls(cast_local, cast_local, 1.0f / d, Group_Size); - } - - // range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7] - float scalar = 8.5f; - Adds(cast_local, cast_local, scalar, Group_Size); - Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size); - scalar = 15.0f; - Mins(cast_local, cast_local, scalar, Group_Size); - scalar = -8.0f; - Adds(cast_local, cast_local, scalar, Group_Size); - - // float->half->int4b - Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size); - Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size); - - output_queue.EnQue(output_local); - copy_out(output_offset); - - input_queue.FreeTensor(input_local); - work_queue.FreeTensor(work_local); - max_queue.FreeTensor(max_local); - min_queue.FreeTensor(min_local); - int8_queue.FreeTensor(int8_local); - half_queue.FreeTensor(half_local); - cast_queue.FreeTensor(cast_local); - return (half)d; - } - - __aicore__ inline void calculate() { - LocalTensor scale_local = scale_queue.AllocTensor(); - uint32_t scale_local_offset = 0; - uint32_t scale_global_offset = 0; - for (int64_t i = ir; i < ir + dr; i++) { - for (int64_t j = 0; j < group_size_in_row; j++) { - half scale = calculate_group(i, j); - scale_local.SetValue(scale_local_offset++, scale); - // Copy Group_Size/2 length data each time. 
- if (scale_local_offset == Group_Size / 2) { - scale_local_offset = 0; - // TODO: OPTIMIZE ME - pipe_barrier(PIPE_ALL); - DataCopy(scale_gm[scale_global_offset], scale_local, - Group_Size / 2); - pipe_barrier(PIPE_ALL); - scale_global_offset += Group_Size / 2; - } - } - } - - if (scale_local_offset != 0) { - pipe_barrier(PIPE_ALL); - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = scale_local_offset * sizeof(half); - DataCopyPad(scale_gm[scale_global_offset], scale_local, - dataCopyParams); - pipe_barrier(PIPE_ALL); - } - scale_queue.FreeTensor(scale_local); - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t *scale_ne; - size_t scale_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - int64_t group_size_in_row; - - int64_t ir; - int64_t dr; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor scale_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - TQue work_queue; - TQue max_queue; - TQue min_queue; - TQue scale_queue; - TQue cast_queue; - TQue int8_queue; - TQue half_queue; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - - QUANTIZE_FLOAT_TO_Q4_0 op; - op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); - op.calculate(); -} - -extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - - QUANTIZE_FLOAT_TO_Q4_0 op; - op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); - op.calculate(); -} - -#endif // #ifdef ASCEND_310P
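
For reference, the removed quantize kernels share one per-group scheme: each group of 32 values gets a scale d = max|x| / 127 (q8_0), the values are stored as round(x / d) in int8, and the half-precision scales are packed after the quantized data; the q4_0 variant instead maps each group into [-8, 7], deriving d from the largest-magnitude value divided by -8. A minimal host-side C++ sketch of the q8_0 case follows (illustrative only, not the AscendC kernel; the QK8_0 = 32 group size mirrors the kernels above and the function name quantize_group_q8_0 is hypothetical):

#include <algorithm>  // std::max
#include <cmath>      // std::fabs, std::lround
#include <cstdint>
#include <cstdio>
#include <vector>

constexpr int QK8_0 = 32;

// Quantize one group of QK8_0 floats: scale d = max|x| / 127, values = round(x / d).
static float quantize_group_q8_0(const float * x, int8_t * q) {
    float amax = 0.0f;
    for (int i = 0; i < QK8_0; ++i) {
        amax = std::max(amax, std::fabs(x[i]));
    }
    const float d  = amax / 127.0f;
    const float id = (d != 0.0f) ? 1.0f / d : 0.0f;  // same zero guard as the removed kernel
    for (int i = 0; i < QK8_0; ++i) {
        q[i] = (int8_t) std::lround(x[i] * id);
    }
    return d;  // the kernel stores this as half, appended after all quantized groups
}

int main() {
    std::vector<float> x(QK8_0);
    for (int i = 0; i < QK8_0; ++i) {
        x[i] = 0.1f * (i - 16);  // sample data in roughly [-1.6, 1.5]
    }
    std::vector<int8_t> q(QK8_0);
    const float d = quantize_group_q8_0(x.data(), q.data());
    std::printf("d = %f  q[0] = %d  q[31] = %d\n", d, (int) q[0], (int) q[31]);
    return 0;
}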