## News
+- 2024.11
+ - Support F16 and F32 data type models for Ascend 310P NPU.
- 2024.8
- Support `Q4_0` and `Q8_0` data type for Ascend NPU.
- 2024.7
| Ascend NPU | Status |
|:-----------------------------:|:-------:|
| Atlas 300T A2 | Support |
+| Atlas 300I Duo | Support |
*Notes:*
detect_ascend_soc_type(SOC_VERSION)
set(SOC_TYPE "${SOC_VERSION}")
message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}")
-else()
- string(TOLOWER ${SOC_TYPE} SOC_VERSION)
endif()
-# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND310P.
+# SOC_VERSION must be lowercase. The expansion is quoted so that an empty or
+# list-valued SOC_TYPE is still passed to string() as exactly one argument.
+string(TOLOWER "${SOC_TYPE}" SOC_VERSION)
+
+# Construct the SoC-specific compile option: ASCEND_<SoC major SN>, e.g. ASCEND_910B, ASCEND_310P.
string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
+# SOC_VERSION is lowercase at this point, so upper-case the option to get the macro name.
+string(TOUPPER "${SOC_TYPE_COMPILE_OPTION}" SOC_TYPE_COMPILE_OPTION)
if (CANN_INSTALL_DIR)
# Only Support Linux.
${SRC_FILES}
)
-message(STATUS "CANN: compile ascend kernels witch SOC_VERSION:${SOC_VERSION}.")
+# Report the SoC type, the lowercase SoC version and the compile macro used for the kernels.
+message(STATUS "CANN: compile ascend kernels with SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
// Input has four dims.
int64_t op_block_num = GetBlockNum();
int64_t op_block_idx = GetBlockIdx();
- assert(op_block_idx < SUPPORTED_MAX_DIM && op_block_idx >= 0, "Invalid block index:%d, max is:%d\n", op_block_idx, SUPPORTED_MAX_DIM);
// param
num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
// optimize me. Use template to avoid copy code.
using namespace AscendC;
+#ifdef ASCEND_310P // 310P not support 4bit get row
+ // Placeholder kernel used when ASCEND_310P is defined: it ignores every
+ // argument and only prints an error message.
+ extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
+ GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
+ GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
+ GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
+ // Only print an error so that the following test cases can keep running.
+ // Of course, the test case that calls this operator still fails.
+ printf("Ascend310P not support 4bit get row.\n");
+ }
+#else
#define BUFFER_NUM 2
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
// TODO: cast more data to speed up.
-#ifdef ASCEND_310P
- // TODO: 310P support quantification
-#else
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
-#endif
+
// Only mul need compile by group.
half scale = scale_gm.GetValue(scale_offset);
indices_nb_ub, output_ne_ub, output_nb_ub);
op.calculate();
}
+
+#endif // #ifdef ASCEND_310P
#include "kernel_operator.h"
using namespace AscendC;
+#ifdef ASCEND_310P
+ // Placeholder kernel used when ASCEND_310P is defined: it ignores every
+ // argument and only prints an error message.
+ extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
+ GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+ GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+ // Only print an error so that the following test cases can keep running.
+ // Of course, the test case that calls this operator still fails.
+ printf("Ascend310P not support f16->8bit quantization.\n");
+ }
+#else
#define BUFFER_NUM 2
#define QK8_0 32
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
op.calculate();
}
+
+#endif // #ifdef ASCEND_310P
#include "kernel_operator.h"
using namespace AscendC;
+#ifdef ASCEND_310P // 310P not support f32->8bit quantization
+ // Placeholder kernel used when ASCEND_310P is defined: it ignores every
+ // argument and only prints an error message.
+ extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
+ GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+ GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+ // Only print an error so that the following test cases can keep running.
+ // Of course, the test case that calls this operator still fails.
+ printf("Ascend310P not support f32->8bit quantization.\n");
+ }
+#else
#define BUFFER_NUM 2
#define QK8_0 32
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
op.calculate();
}
+
+#endif // #ifdef ASCEND_310P
#include "kernel_operator.h"
using namespace AscendC;
+#ifdef ASCEND_310P // 310P not support float->4bit quantization
+ // Placeholder kernel used when ASCEND_310P is defined: it ignores every
+ // argument and only prints an error message.
+ extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
+ GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+ GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+ // Only print an error so that the following test cases can keep running.
+ // Of course, the test case that calls this operator still fails.
+ printf("Ascend310P not support f32->4bit quantization.\n");
+ }
+
+ // Placeholder kernel used when ASCEND_310P is defined: it ignores every
+ // argument and only prints an error message.
+ extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
+ GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+ GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+ // Only print an error so that the following test cases can keep running.
+ // Of course, the test case that calls this operator still fails.
+ printf("Ascend310P not support f16->4bit quantization.\n");
+ }
+#else
#define BUFFER_NUM 2
#define Group_Size 32
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
op.calculate();
}
+
+#endif // #ifdef ASCEND_310P