+++ /dev/null
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#include <stdbool.h>
-#include <stddef.h>
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define GGML_KOMPUTE_MAX_DEVICES 16
-
-struct ggml_vk_device {
- int index;
- int type; // same as VkPhysicalDeviceType
- size_t heapSize;
- const char * name;
- const char * vendor;
- int subgroupSize;
- uint64_t bufferAlignment;
- uint64_t maxAlloc;
-};
-
-struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
-bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
-bool ggml_vk_has_vulkan(void);
-bool ggml_vk_has_device(void);
-struct ggml_vk_device ggml_vk_current_device(void);
-
-//
-// backend API
-//
-
-// forward declaration
-typedef struct ggml_backend * ggml_backend_t;
-
-GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device);
-
-GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend);
-
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
-
-GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
-
-#ifdef __cplusplus
-}
-#endif
+++ /dev/null
-file(GLOB SRC_FILES
- get_row_f32.cpp
- get_row_f16.cpp
- get_row_q4_0.cpp
- get_row_q8_0.cpp
- quantize_f32_q8_0.cpp
- quantize_f16_q8_0.cpp
- quantize_float_to_q4_0.cpp
- dup.cpp
-)
-
-set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
-set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
-
-if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
- set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
-elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
- set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
-else()
- message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
-endif()
-include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
-
-ascendc_library(ascendc_kernels STATIC
- ${SRC_FILES}
-)
-
-message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
-ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
-# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
+++ /dev/null
-#ifndef ASCENDC_KERNELS_H
-#define ASCENDC_KERNELS_H
-
-#include "aclrtlaunch_ascendc_get_row_f32.h"
-#include "aclrtlaunch_ascendc_get_row_f16.h"
-#include "aclrtlaunch_ascendc_get_row_q8_0.h"
-#include "aclrtlaunch_ascendc_get_row_q4_0.h"
-
-#include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
-#include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
-#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
-#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
-
-#include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
-#include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"
-#include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h"
-#include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h"
-
-#endif // ASCENDC_KERNELS_H
+++ /dev/null
-#include "kernel_operator.h"
-
-using namespace AscendC;
-
-#define BUFFER_NUM 2
-const int64_t SUPPORTED_MAX_DIM = 65535; // currently the maximum block dim supported by the dup kernel is 65535
-
-template <typename SRC_T, typename DST_T>
-class DupByRows {
- public:
- __aicore__ inline DupByRows() {}
- __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub,
- size_t *input_nb_ub) {
-        /* Dup by rows when src is contiguous on the first dimension and dst is
-        contiguous; each kernel processes one row.
- */
-
- // Input has four dims.
- int64_t op_block_num = GetBlockNum();
- int64_t op_block_idx = GetBlockIdx();
-
- // param
- num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
- num_elem = input_ne_ub[0];
-
- // index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3)
- idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]);
- idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]))
- / (input_ne_ub[1]);
- idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])
- - idx_ne2 * input_ne_ub[1];
-
-        // src may not be contiguous in dims [1,2,3], so the stride is decided by ne & nb
- src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2
- + input_nb_ub[1] * idx_ne1;
-
- // dst is contiguous
- dst_stride = op_block_idx * (input_ne_ub[0] * sizeof(DST_T));
-
- src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src +
- src_stride));
- dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst +
- dst_stride));
-
- pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem +
- 32 - 1) / 32 * 32);
- pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem +
- 32 - 1) / 32 * 32);
- }
-
- __aicore__ inline void copy_in() {
- LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
- const size_t elem_per_block = 32 / sizeof(SRC_T);
- size_t tail = num_elem % elem_per_block;
- size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
- DataCopy(src_local, src_gm, cpy_elements_len);
- src_queue.EnQue(src_local);
- }
-
- __aicore__ inline void copy_out() {
- LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
-#ifdef ASCEND_310P
- const size_t elem_per_block = 32 / sizeof(DST_T);
- size_t tail = num_elem % elem_per_block;
- size_t len = num_elem & ~(elem_per_block - 1);
- if (len > 0) {
- DataCopy(dst_gm, dst_local, len);
- }
- if(tail != 0) {
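-            // zero the lanes past the tail, then copy the last full 32-byte block
-            // with atomic add so data already in GM beyond the tail is preserved (x + 0 == x)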
- for (size_t i = tail; i < elem_per_block; i++) {
- dst_local[len + i].SetValue(0, 0);
- }
- SetAtomicAdd<float>();
- DataCopy(dst_gm[len], dst_local[len], elem_per_block);
- SetAtomicNone();
- }
-#else
- DataCopyExtParams dataCopyParams;
- dataCopyParams.blockCount = 1;
- dataCopyParams.blockLen = num_elem * sizeof(DST_T);
- DataCopyPad(dst_gm, dst_local, dataCopyParams);
-#endif
- dst_queue.FreeTensor(dst_local);
- }
-
- __aicore__ inline void dup() {
-        // main process: copy one row of data from src to dst.
- copy_in();
-
- LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
- LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
-
- int32_t BLOCK_NUM = 32 / sizeof(DST_T);
- DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1)
- / BLOCK_NUM * BLOCK_NUM);
- dst_queue.EnQue<DST_T>(dst_local);
-
- src_queue.FreeTensor(src_local);
- copy_out();
- }
-
- __aicore__ inline void dup_with_cast() {
-        // main process: copy one row of data from src to dst,
-        // casting the dtype from src to dst.
- copy_in();
-
- LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
- LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
-
- Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem);
- dst_queue.EnQue<DST_T>(dst_local);
-
- src_queue.FreeTensor(src_local);
- copy_out();
- }
-
- private:
-
- TPipe pipe;
- GlobalTensor<SRC_T> src_gm;
- GlobalTensor<DST_T> dst_gm;
-
- int64_t num_rows;
- int64_t num_elem;
- int64_t idx_ne3;
- int64_t idx_ne2;
- int64_t idx_ne1;
- int64_t src_stride;
- int64_t dst_stride;
-
- TQue<QuePosition::VECIN, BUFFER_NUM> src_queue;
- TQue<QuePosition::VECOUT, BUFFER_NUM> dst_queue;
-};
-
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
- auto gm_ptr = (__gm__ uint8_t *)gm;
- auto ub_ptr = (uint8_t *)(ub);
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
- *ub_ptr = *gm_ptr;
- }
-}
-
-extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(
- GM_ADDR src_gm,
- GM_ADDR dst_gm,
- GM_ADDR input_ne_gm,
- GM_ADDR input_nb_gm,
- GM_ADDR output_ne_gm,
- GM_ADDR output_nb_gm) {
-
- int64_t input_ne_ub[4];
- size_t input_nb_ub[4];
- int64_t output_ne_ub[4];
- size_t output_nb_ub[4];
-
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
- DupByRows<half, half> op;
- op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
- op.dup();
-}
-
-extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(
- GM_ADDR src_gm,
- GM_ADDR dst_gm,
- GM_ADDR input_ne_gm,
- GM_ADDR input_nb_gm,
- GM_ADDR output_ne_gm,
- GM_ADDR output_nb_gm) {
- int64_t input_ne_ub[4];
- size_t input_nb_ub[4];
- int64_t output_ne_ub[4];
- size_t output_nb_ub[4];
-
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
- DupByRows<float, float> op;
- op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
- op.dup();
-}
-
-extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16(
- GM_ADDR src_gm,
- GM_ADDR dst_gm,
- GM_ADDR input_ne_gm,
- GM_ADDR input_nb_gm,
- GM_ADDR output_ne_gm,
- GM_ADDR output_nb_gm) {
-
- int64_t input_ne_ub[4];
- size_t input_nb_ub[4];
- int64_t output_ne_ub[4];
- size_t output_nb_ub[4];
-
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
- DupByRows<float, half> op;
- op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
- op.dup_with_cast();
-}
-
-extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32(
- GM_ADDR src_gm,
- GM_ADDR dst_gm,
- GM_ADDR input_ne_gm,
- GM_ADDR input_nb_gm,
- GM_ADDR output_ne_gm,
- GM_ADDR output_nb_gm) {
-
- // copy params from gm to ub.
- int64_t input_ne_ub[4];
- size_t input_nb_ub[4];
- int64_t output_ne_ub[4];
- size_t output_nb_ub[4];
-
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
- DupByRows<half, float> op;
- op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
- op.dup_with_cast();
-}
+++ /dev/null
-#include "kernel_operator.h"
-
-// optimize me. Use a template to avoid duplicated code.
-using namespace AscendC;
-
-#define BUFFER_NUM 2
-
-class GET_ROW_F16 {
- public:
- __aicore__ inline GET_ROW_F16() {}
- __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
- int64_t *input_ne_ub, size_t *input_nb_ub,
- int64_t *indices_ne_ub, size_t *indices_nb_ub,
- int64_t *output_ne_ub, size_t *output_nb_ub) {
-        // TODO: use a template for F16/F32
- int64_t op_block_num = GetBlockNum();
- op_block_idx = GetBlockIdx();
-
- for (int i = 0; i < 4; i++) {
- input_ne[i] = input_ne_ub[i];
- input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
-
- indices_ne[i] = indices_ne_ub[i];
- indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
-
- output_ne[i] = output_ne_ub[i];
- output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
- }
-
-        // Indices has two dims. n_elements = total number of rows to get.
-        // dr = number of rows this thread should get.
- uint64_t n_elements =
- indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
- dr = n_elements / op_block_num;
-
- uint64_t tails = n_elements % op_block_num;
- if (op_block_idx < tails) {
- dr += 1;
- ir = dr * op_block_idx;
- } else {
- ir = dr * op_block_idx + tails;
- }
-
- input_gm.SetGlobalBuffer((__gm__ half *)input);
- indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
- output_gm.SetGlobalBuffer((__gm__ float *)output);
-
- uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31)
- & ~31);
- uint64_t output_local_buffer_size = ((input_ne[0] * sizeof(float) + 31)
- & ~31);
-
- local_buffer_elems = input_local_buffer_size / sizeof(half);
-
-        // TODO: consider long rows that can't fit in UB.
-        // Buffer sizes are rounded up to 32 bytes; that's OK because all data is aligned to 32.
- pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size);
- pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size);
- }
-
- __aicore__ inline void copy_in(uint32_t offset, size_t len) {
- size_t origin_len = len;
- LocalTensor<half> input_local = input_queue.AllocTensor<half>();
- const size_t elem_per_block = 32 / sizeof(half);
- size_t tail = len % elem_per_block;
- len = len & ~(elem_per_block - 1);
- if(tail != 0) {
- len += elem_per_block;
- }
- DataCopy(input_local, input_gm[offset], len);
- input_queue.EnQue(input_local);
- }
-
- __aicore__ inline void copy_out(uint32_t offset, size_t len) {
- LocalTensor<float> output_local = output_queue.DeQue<float>();
- const size_t elem_per_block = 32 / sizeof(float);
- size_t tail = len % elem_per_block;
- len = len & ~(elem_per_block - 1);
- if (len > 0) {
- DataCopy(output_gm[offset], output_local, len);
- }
-
- if(tail != 0) {
-#ifdef ASCEND_310P
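-        // zero the lanes past the tail, then copy the last full 32-byte block
-        // with atomic add so data already in GM beyond the tail is preserved (x + 0 == x)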
- for (size_t i = tail; i < elem_per_block; i++) {
- output_local[len + i].SetValue(0, 0);
- }
- SetAtomicAdd<float>();
- DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
- SetAtomicNone();
-#else
- DataCopyExtParams dataCopyParams;
- dataCopyParams.blockCount = 1;
- dataCopyParams.blockLen = tail * sizeof(float);
- DataCopyPad(output_gm[offset + len], output_local[len],
- dataCopyParams);
-#endif
- }
- output_queue.FreeTensor(output_local);
- }
-
- __aicore__ inline void calculate_row(int64_t idx) {
- const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
- const int64_t indices_ne1_idx =
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
- indices_ne[0];
- const int64_t indices_ne0_idx =
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
- indices_ne1_idx * indices_ne[0]);
-
- const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
- indices_ne1_idx * indices_stride[1] +
- indices_ne2_idx * indices_stride[2];
- const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
-
- const int64_t input_offset = selected_row_idx * input_stride[1] +
- indices_ne1_idx * input_stride[2] +
- indices_ne2_idx * input_stride[3];
-
- const int64_t output_offset = indices_ne0_idx * output_stride[1] +
- indices_ne1_idx * output_stride[2] +
- indices_ne2_idx * output_stride[3];
-
- copy_in(input_offset, input_ne[0]);
- LocalTensor<half> input_local = input_queue.DeQue<half>();
- LocalTensor<float> output_local = output_queue.AllocTensor<float>();
-
- Cast(output_local, input_local, RoundMode::CAST_NONE,
- local_buffer_elems);
- output_queue.EnQue(output_local);
- copy_out(output_offset, input_ne[0]);
-
- input_queue.FreeTensor(input_local);
- }
-
- __aicore__ inline void calculate() {
- for (int64_t i = ir; i < ir + dr; i++) {
- calculate_row(i);
- }
- }
-
- private:
- int64_t input_ne[4];
- size_t input_stride[4];
-
- int64_t indices_ne[4];
- size_t indices_stride[4];
-
- int64_t output_ne[4];
- size_t output_stride[4];
-
- size_t local_buffer_elems;
-
- int64_t ir;
- int64_t dr;
-
- TPipe pipe;
- GlobalTensor<half> input_gm;
- GlobalTensor<int32_t> indices_gm;
- GlobalTensor<float> output_gm;
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
- int64_t op_block_idx;
-};
-
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
- auto gm_ptr = (__gm__ uint8_t *)gm;
- auto ub_ptr = (uint8_t *)(ub);
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
- *ub_ptr = *gm_ptr;
- }
-}
-
-extern "C" __global__ __aicore__ void ascendc_get_row_f16(
- GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
- GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
- GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
- int64_t input_ne_ub[4];
- size_t input_nb_ub[4];
- int64_t indices_ne_ub[4];
- size_t indices_nb_ub[4];
- int64_t output_ne_ub[4];
- size_t output_nb_ub[4];
-
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
- copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
- copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
- GET_ROW_F16 op;
- op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
- indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
- op.calculate();
-}
+++ /dev/null
-#include "kernel_operator.h"
-
-// optimize me. Use a template to avoid duplicated code.
-using namespace AscendC;
-
-#define BUFFER_NUM 2
-
-class GET_ROW_F32 {
- public:
- __aicore__ inline GET_ROW_F32() {}
- __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
- int64_t *input_ne_ub, size_t *input_nb_ub,
- int64_t *indices_ne_ub, size_t *indices_nb_ub,
- int64_t *output_ne_ub, size_t *output_nb_ub) {
- int64_t op_block_num = GetBlockNum();
- op_block_idx = GetBlockIdx();
-
- for (int i = 0; i < 4; i++) {
- input_ne[i] = input_ne_ub[i];
- input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
-
- indices_ne[i] = indices_ne_ub[i];
- indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
-
- output_ne[i] = output_ne_ub[i];
- output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
- }
-
-        // Indices has two dims. n_elements = total number of rows to get.
-        // dr = number of rows this thread should get.
- uint64_t n_elements =
- indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
- dr = n_elements / op_block_num;
-
- uint64_t tails = n_elements % op_block_num;
- if (op_block_idx < tails) {
- dr += 1;
- ir = dr * op_block_idx;
- } else {
- ir = dr * op_block_idx + tails;
- }
-
- input_gm.SetGlobalBuffer((__gm__ float *)input);
- indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
- output_gm.SetGlobalBuffer((__gm__ float *)output);
-
- uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31);
- local_buffer_elems = local_buffer_size / sizeof(float);
-
-        // TODO: consider long rows that can't fit in UB.
-        // Buffer sizes are rounded up to 32 bytes; that's OK because all data is aligned to 32.
- pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size);
- pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size);
- }
-
- __aicore__ inline void copy_in(uint32_t offset, size_t len) {
- LocalTensor<float> input_local = input_queue.AllocTensor<float>();
- const size_t elem_per_block = 32 / sizeof(float);
- size_t tail = len % elem_per_block;
- len = len & ~(elem_per_block - 1);
- if(tail != 0) {
- len += elem_per_block;
- }
- DataCopy(input_local, input_gm[offset], len);
- input_queue.EnQue(input_local);
- }
-
- __aicore__ inline void copy_out(uint32_t offset, size_t len) {
- LocalTensor<float> output_local = output_queue.DeQue<float>();
- const size_t elem_per_block = 32 / sizeof(float);
- size_t tail = len % elem_per_block;
- len = len & ~(elem_per_block - 1);
- if (len > 0) {
- DataCopy(output_gm[offset], output_local, len);
- }
-
- if(tail != 0) {
-#ifdef ASCEND_310P
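-        // zero the lanes past the tail, then copy the last full 32-byte block
-        // with atomic add so data already in GM beyond the tail is preserved (x + 0 == x)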
- for (size_t i = tail; i < elem_per_block; i++) {
- output_local[len + i].SetValue(0, 0);
- }
- SetAtomicAdd<float>();
- DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
- SetAtomicNone();
-#else
- DataCopyExtParams dataCopyParams;
- dataCopyParams.blockCount = 1;
- dataCopyParams.blockLen = tail * sizeof(float);
- DataCopyPad(output_gm[offset + len], output_local[len],
- dataCopyParams);
-#endif
- }
- output_queue.FreeTensor(output_local);
- }
-
- __aicore__ inline void calculate_row(int64_t idx) {
- const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
- const int64_t indices_ne1_idx =
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
- indices_ne[0];
- const int64_t indices_ne0_idx =
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
- indices_ne1_idx * indices_ne[0]);
-
- const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
- indices_ne1_idx * indices_stride[1] +
- indices_ne2_idx * indices_stride[2];
- const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
-
- const int64_t input_offset = selected_row_idx * input_stride[1] +
- indices_ne1_idx * input_stride[2] +
- indices_ne2_idx * input_stride[3];
-
- const int64_t output_offset = indices_ne0_idx * output_stride[1] +
- indices_ne1_idx * output_stride[2] +
- indices_ne2_idx * output_stride[3];
-
- copy_in(input_offset, input_ne[0]);
- LocalTensor<float> input_local = input_queue.DeQue<float>();
- LocalTensor<float> output_local = output_queue.AllocTensor<float>();
-
- DataCopy(output_local, input_local, local_buffer_elems);
- output_queue.EnQue(output_local);
- copy_out(output_offset, input_ne[0]);
-
- input_queue.FreeTensor(input_local);
- }
-
- __aicore__ inline void calculate() {
- for (int64_t i = ir; i < ir + dr; i++) {
- calculate_row(i);
- }
- }
-
- private:
- int64_t input_ne[4];
- size_t input_stride[4];
-
- int64_t indices_ne[4];
- size_t indices_stride[4];
-
- int64_t output_ne[4];
- size_t output_stride[4];
-
- size_t local_buffer_elems;
-
- int64_t ir;
- int64_t dr;
-
- TPipe pipe;
- GlobalTensor<float> input_gm;
- GlobalTensor<int32_t> indices_gm;
- GlobalTensor<float> output_gm;
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
- int64_t op_block_idx;
-};
-
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
- auto gm_ptr = (__gm__ uint8_t *)gm;
- auto ub_ptr = (uint8_t *)(ub);
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
- *ub_ptr = *gm_ptr;
- }
-}
-
-extern "C" __global__ __aicore__ void ascendc_get_row_f32(
- GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
- GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
- GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
- int64_t input_ne_ub[4];
- size_t input_nb_ub[4];
- int64_t indices_ne_ub[4];
- size_t indices_nb_ub[4];
- int64_t output_ne_ub[4];
- size_t output_nb_ub[4];
-
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
- copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
- copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
- GET_ROW_F32 op;
- op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
- indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
- op.calculate();
-}
+++ /dev/null
-#include "kernel_operator.h"
-
-// optimize me. Use a template to avoid duplicated code.
-using namespace AscendC;
-#ifdef ASCEND_310P // 310P does not support 4-bit get row
- extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
- GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
- GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
- GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
-        // let the following test cases continue to run; here we just print an error message. Of course the test case that calls this operator will fail.
-        printf("Ascend310P does not support 4-bit get row.\n");
- }
-#else
-
-#define BUFFER_NUM 2
-
-#define QK4_0 32
-
-class GET_ROW_Q4_0 {
- public:
- __aicore__ inline GET_ROW_Q4_0() {}
- __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
- int64_t *input_ne_ub, int64_t *indices_ne_ub,
- size_t *indices_nb_ub, int64_t *output_ne_ub,
- size_t *output_nb_ub) {
- int64_t op_block_num = GetBlockNum();
- int64_t op_block_idx = GetBlockIdx();
-
- for (int i = 0; i < 4; i++) {
- input_ne[i] = input_ne_ub[i];
- indices_ne[i] = indices_ne_ub[i];
- indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
- scale_ne[i] = input_ne_ub[i];
- output_ne[i] = output_ne_ub[i];
- output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
- }
-
-        // one scale per group.
- scale_ne[0] /= QK4_0;
-
- input_stride[0] = 1;
- scale_stride[0] = 1;
- output_stride[0] = 1;
- for (int i = 1; i < 4; i++) {
- input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
- scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
- }
-
- group_size_in_row = input_ne[0] / QK4_0;
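-        // this kernel expects q4_0 data laid out as all packed 4-bit weights first
-        // (half a byte per element), followed by one half-precision scale per group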
- int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
- input_ne[3] / 2;
-
-        // Indices has two dims. n_elements = total number of rows to get.
-        // dr = number of rows this thread should get.
- uint64_t n_elements =
- indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
- dr = n_elements / op_block_num;
-
- uint64_t tails = n_elements % op_block_num;
- if (op_block_idx < tails) {
- dr += 1;
- ir = dr * op_block_idx;
- } else {
- ir = dr * op_block_idx + tails;
- }
-
- input_gm.SetGlobalBuffer((__gm__ int4b_t *)input);
- scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
- indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
- output_gm.SetGlobalBuffer((__gm__ float *)output);
-
- pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t));
- pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half));
- pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float));
- }
-
- __aicore__ inline void copy_in(uint32_t offset) {
- LocalTensor<int4b_t> input_local = input_queue.AllocTensor<int4b_t>();
-        // 32 * sizeof(int4b_t) = 16, which is not aligned to 32; why is there no error?
- DataCopy(input_local, input_gm[offset], QK4_0);
- input_queue.EnQue(input_local);
- }
-
- __aicore__ inline void copy_out(uint32_t offset) {
- LocalTensor<float> output_local = output_queue.DeQue<float>();
- DataCopy(output_gm[offset], output_local, QK4_0);
- output_queue.FreeTensor(output_local);
- }
-
- __aicore__ inline void calculate_group(int64_t idx, int64_t group) {
- const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
- const int64_t indices_ne1_idx =
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
- indices_ne[0];
- const int64_t indices_ne0_idx =
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
- indices_ne1_idx * indices_ne[0]);
-
- const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
- indices_ne1_idx * indices_stride[1] +
- indices_ne2_idx * indices_stride[2];
- const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
-
- const int64_t input_offset = selected_row_idx * input_stride[1] +
- indices_ne1_idx * input_stride[2] +
- indices_ne2_idx * input_stride[3] +
- group * QK4_0;
- const int64_t scale_offset = selected_row_idx * scale_stride[1] +
- indices_ne1_idx * scale_stride[2] +
- indices_ne2_idx * scale_stride[3] + group;
- const int64_t output_offset = indices_ne0_idx * output_stride[1] +
- indices_ne1_idx * output_stride[2] +
- indices_ne2_idx * output_stride[3] +
- group * QK4_0;
-
- copy_in(input_offset);
- LocalTensor<int4b_t> input_local = input_queue.DeQue<int4b_t>();
- LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
- LocalTensor<float> output_local = output_queue.AllocTensor<float>();
-
- // TODO: cast more data to speed up.
- Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
- Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
-
-        // Only the multiply needs to be computed per group.
- half scale = scale_gm.GetValue(scale_offset);
-
- Muls(output_local, output_local, (float)scale, QK4_0);
-
- input_queue.FreeTensor(input_local);
- cast_queue.FreeTensor(cast_local);
- output_queue.EnQue(output_local);
-
- copy_out(output_offset);
- }
-
- __aicore__ inline void calculate() {
- for (int64_t i = ir; i < ir + dr; i++) {
- for (int64_t j = 0; j < group_size_in_row; j++) {
- calculate_group(i, j);
- }
- }
- }
-
- private:
- int64_t input_ne[4];
- size_t input_stride[4];
-
- int64_t scale_ne[4];
- size_t scale_stride[4];
-
- int64_t indices_ne[4];
- size_t indices_stride[4];
-
- int64_t output_ne[4];
- size_t output_stride[4];
-
- int64_t ir;
- int64_t dr;
-
- int64_t group_size_in_row;
-
- TPipe pipe;
- GlobalTensor<int4b_t> input_gm;
- GlobalTensor<half> scale_gm;
- GlobalTensor<int32_t> indices_gm;
- GlobalTensor<float> output_gm;
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
- TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
-};
-
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
- auto gm_ptr = (__gm__ uint8_t *)gm;
- auto ub_ptr = (uint8_t *)(ub);
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
- *ub_ptr = *gm_ptr;
- }
-}
-
-extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
- GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
- GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
- GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
- int64_t input_ne_ub[4];
- int64_t indices_ne_ub[4];
- size_t indices_nb_ub[4];
- int64_t output_ne_ub[4];
- size_t output_nb_ub[4];
-
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
- copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
- copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
- GET_ROW_Q4_0 op;
- op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
- indices_nb_ub, output_ne_ub, output_nb_ub);
- op.calculate();
-}
-
-#endif // #ifdef ASCEND_310P
+++ /dev/null
-#include "kernel_operator.h"
-
-// optimize me. Use a template to avoid duplicated code.
-using namespace AscendC;
-
-#define BUFFER_NUM 2
-
-#define QK8_0 32
-
-class GET_ROW_Q8_0 {
- public:
- __aicore__ inline GET_ROW_Q8_0() {}
- __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
- int64_t *input_ne_ub, int64_t *indices_ne_ub,
- size_t *indices_nb_ub, int64_t *output_ne_ub,
- size_t *output_nb_ub) {
- int64_t op_block_num = GetBlockNum();
- int64_t op_block_idx = GetBlockIdx();
-
- for (int i = 0; i < 4; i++) {
- input_ne[i] = input_ne_ub[i];
- indices_ne[i] = indices_ne_ub[i];
- indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
- scale_ne[i] = input_ne_ub[i];
- output_ne[i] = output_ne_ub[i];
- output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
- }
-
-        // one scale per group.
- scale_ne[0] /= QK8_0;
-
- input_stride[0] = 1;
- scale_stride[0] = 1;
- output_stride[0] = 1;
- for (int i = 1; i < 4; i++) {
- input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
- scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
- }
-
- group_size_in_row = input_ne[0] / QK8_0;
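-        // this kernel expects q8_0 data laid out as all int8 weights first
-        // (one byte per element), followed by one half-precision scale per group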
- int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
- input_ne[3] * sizeof(int8_t);
-
-        // Indices has two dims. n_elements = total number of rows to get.
-        // dr = number of rows this thread should get.
- uint64_t n_elements =
- indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
- dr = n_elements / op_block_num;
-
- uint64_t tails = n_elements % op_block_num;
- if (op_block_idx < tails) {
- dr += 1;
- ir = dr * op_block_idx;
- } else {
- ir = dr * op_block_idx + tails;
- }
-
- input_gm.SetGlobalBuffer((__gm__ int8_t *)input);
- scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
- indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
- output_gm.SetGlobalBuffer((__gm__ float *)output);
-
- pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
- pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half));
- pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float));
- }
-
- __aicore__ inline void copy_in(uint32_t offset) {
- LocalTensor<int8_t> input_local = input_queue.AllocTensor<int8_t>();
- DataCopy(input_local, input_gm[offset], QK8_0);
- input_queue.EnQue(input_local);
- }
-
- __aicore__ inline void copy_out(uint32_t offset) {
- LocalTensor<float> output_local = output_queue.DeQue<float>();
- DataCopy(output_gm[offset], output_local, QK8_0);
- output_queue.FreeTensor(output_local);
- }
-
- __aicore__ inline void calculate_group(int64_t idx, int64_t group) {
- const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
- const int64_t indices_ne1_idx =
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
- indices_ne[0];
- const int64_t indices_ne0_idx =
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
- indices_ne1_idx * indices_ne[0]);
-
- const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
- indices_ne1_idx * indices_stride[1] +
- indices_ne2_idx * indices_stride[2];
- const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
-
- const int64_t input_offset = selected_row_idx * input_stride[1] +
- indices_ne1_idx * input_stride[2] +
- indices_ne2_idx * input_stride[3] +
- group * QK8_0;
- const int64_t scale_offset = selected_row_idx * scale_stride[1] +
- indices_ne1_idx * scale_stride[2] +
- indices_ne2_idx * scale_stride[3] + group;
- const int64_t output_offset = indices_ne0_idx * output_stride[1] +
- indices_ne1_idx * output_stride[2] +
- indices_ne2_idx * output_stride[3] +
- group * QK8_0;
-
- copy_in(input_offset);
- LocalTensor<int8_t> input_local = input_queue.DeQue<int8_t>();
- LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
- LocalTensor<float> output_local = output_queue.AllocTensor<float>();
-
- // TODO: cast more data to speed up.
- Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
- Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0);
-
-        // Only the multiply needs to be computed per group.
- half scale = scale_gm.GetValue(scale_offset);
- Muls(output_local, output_local, (float)scale, QK8_0);
-
- input_queue.FreeTensor(input_local);
- cast_queue.FreeTensor(cast_local);
- output_queue.EnQue(output_local);
-
- copy_out(output_offset);
- }
-
- __aicore__ inline void calculate() {
- for (int64_t i = ir; i < ir + dr; i++) {
- for (int64_t j = 0; j < group_size_in_row; j++) {
- calculate_group(i, j);
- }
- }
- }
-
- private:
- int64_t input_ne[4];
- size_t input_stride[4];
-
- int64_t scale_ne[4];
- size_t scale_stride[4];
-
- int64_t indices_ne[4];
- size_t indices_stride[4];
-
- int64_t output_ne[4];
- size_t output_stride[4];
-
- int64_t ir;
- int64_t dr;
-
- int64_t group_size_in_row;
-
- TPipe pipe;
- GlobalTensor<int8_t> input_gm;
- GlobalTensor<half> scale_gm;
- GlobalTensor<int32_t> indices_gm;
- GlobalTensor<float> output_gm;
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
- TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
-};
-
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
- auto gm_ptr = (__gm__ uint8_t *)gm;
- auto ub_ptr = (uint8_t *)(ub);
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
- *ub_ptr = *gm_ptr;
- }
-}
-
-extern "C" __global__ __aicore__ void ascendc_get_row_q8_0(
- GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
- GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
- GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
- int64_t input_ne_ub[4];
- int64_t indices_ne_ub[4];
- size_t indices_nb_ub[4];
- int64_t output_ne_ub[4];
- size_t output_nb_ub[4];
-
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
- copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
- copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
- GET_ROW_Q8_0 op;
- op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
- indices_nb_ub, output_ne_ub, output_nb_ub);
- op.calculate();
-}
+++ /dev/null
-#include "kernel_operator.h"
-
-using namespace AscendC;
-#ifdef ASCEND_310P
- extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
-        // let the following test cases continue to run; here we just print an error message. Of course the test case that calls this operator will fail.
-        printf("Ascend310P does not support f16->8-bit quantization.\n");
- }
-#else
-
-#define BUFFER_NUM 2
-#define QK8_0 32
-
-class QUANTIZE_F16_Q8_0 {
- public:
- __aicore__ inline QUANTIZE_F16_Q8_0() {}
- __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
- int64_t *input_ne_ub, size_t *input_nb_ub,
- int64_t *output_ne_ub) {
- int64_t op_block_num = GetBlockNum();
- int64_t op_block_idx = GetBlockIdx();
-
- for (int i = 0; i < 4; i++) {
- input_ne[i] = input_ne_ub[i];
- input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
-
- output_ne[i] = output_ne_ub[i];
- }
-
- output_stride[0] = 1;
- for (int i = 1; i < 4; i++) {
- output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
- }
-
- scale_ne = input_ne;
- scale_stride[0] = 1;
- scale_stride[1] = input_ne[0] / QK8_0;
- for (int i = 2; i < 4; i++) {
- scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
- }
-
- // split input tensor by rows.
- uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
- dr = nr / op_block_num;
-
- uint64_t tails = nr % op_block_num;
- if (op_block_idx < tails) {
- dr += 1;
- ir = dr * op_block_idx;
- } else {
- ir = dr * op_block_idx + tails;
- }
-
- group_size_in_row = scale_stride[1];
- int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
- output_ne[3] * sizeof(uint8_t);
-
- input_gm.SetGlobalBuffer((__gm__ half *)input);
- output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
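-        // scales are stored right after the quantized int8 data; each block starts
-        // writing at its own slice (ir * group_size_in_row scales in)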
- scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + ir *
- group_size_in_row *
- sizeof(half)));
-
- pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(half));
- pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
- pipe.InitBuffer(work_queue, 1, 32);
- pipe.InitBuffer(max_queue, 1, 32);
- pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
- pipe.InitBuffer(scale_queue, 1, 32);
-        pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(float));
- }
-
- __aicore__ inline void copy_in(uint32_t offset) {
- LocalTensor<half> input_local = input_queue.AllocTensor<half>();
- DataCopy(input_local, input_gm[offset], QK8_0);
- input_queue.EnQue(input_local);
- }
-
- __aicore__ inline void copy_out(uint32_t offset) {
- LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
- DataCopy(output_gm[offset], output_local, QK8_0);
- output_queue.FreeTensor(output_local);
- }
-
- __aicore__ inline half calculate_group(int64_t row, int64_t group) {
- const int64_t i3 = row / (input_ne[1] * input_ne[2]);
- const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
- const int64_t i1 =
- row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
-
- const int64_t input_offset = i1 * input_stride[1] +
- i2 * input_stride[2] +
- i3 * input_stride[3] + QK8_0 * group;
-
- const int64_t output_offset = i1 * output_stride[1] +
- i2 * output_stride[2] +
- i3 * output_stride[3] + QK8_0 * group;
-
- copy_in(input_offset);
- LocalTensor<half> input_local = input_queue.DeQue<half>();
- LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
- LocalTensor<float> work_local = work_queue.AllocTensor<float>();
- LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
- LocalTensor<float> max_local = max_queue.AllocTensor<float>();
- LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
-
- Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
- Abs(abs_local, cast_local, QK8_0);
- ReduceMax(max_local, abs_local, work_local, QK8_0);
-
- pipe_barrier(PIPE_ALL);
- float d = max_local.GetValue(0);
- d = d / ((1 << 7) - 1);
- if (d != 0) {
- Muls(cast_local, cast_local, 1.0f / d, QK8_0);
- }
-
- Cast(cast_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
- Cast(input_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
- Cast(output_local, input_local, RoundMode::CAST_ROUND, QK8_0);
- output_queue.EnQue(output_local);
- copy_out(output_offset);
-
- input_queue.FreeTensor(input_local);
- work_queue.FreeTensor(work_local);
- abs_queue.FreeTensor(abs_local);
- max_queue.FreeTensor(max_local);
- cast_queue.FreeTensor(cast_local);
- return (half)d;
- }
-
- __aicore__ inline void calculate() {
- LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
- uint32_t scale_local_offset = 0;
- uint32_t scale_global_offset = 0;
- for (int64_t i = ir; i < ir + dr; i++) {
- for (int64_t j = 0; j < group_size_in_row; j++) {
- half scale = calculate_group(i, j);
- scale_local.SetValue(scale_local_offset++, scale);
- if (scale_local_offset == 16) {
- scale_local_offset = 0;
- // TODO: OPTIMIZE ME
- pipe_barrier(PIPE_ALL);
- DataCopy(scale_gm[scale_global_offset], scale_local, 16);
- pipe_barrier(PIPE_ALL);
- scale_global_offset += 16;
- }
- }
- }
-
- if (scale_local_offset != 0) {
- pipe_barrier(PIPE_ALL);
- DataCopyExtParams dataCopyParams;
- dataCopyParams.blockCount = 1;
- dataCopyParams.blockLen = scale_local_offset * sizeof(half);
- DataCopyPad(scale_gm[scale_global_offset], scale_local,
- dataCopyParams);
- pipe_barrier(PIPE_ALL);
- }
- }
-
- private:
- int64_t input_ne[4];
- size_t input_stride[4];
-
- int64_t *scale_ne;
- size_t scale_stride[4];
-
- int64_t output_ne[4];
- size_t output_stride[4];
-
- int64_t group_size_in_row;
-
- int64_t ir;
- int64_t dr;
-
- TPipe pipe;
- GlobalTensor<half> input_gm;
- GlobalTensor<half> scale_gm;
- GlobalTensor<int8_t> output_gm;
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
- TQue<QuePosition::VECIN, 1> work_queue;
- TQue<QuePosition::VECOUT, 1> max_queue;
- TQue<QuePosition::VECIN, 1> abs_queue;
- TQue<QuePosition::VECOUT, 1> scale_queue;
- TQue<QuePosition::VECOUT, 1> cast_queue;
-
-};
-
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
- auto gm_ptr = (__gm__ uint8_t *)gm;
- auto ub_ptr = (uint8_t *)(ub);
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
- *ub_ptr = *gm_ptr;
- }
-}
-
-extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
- int64_t input_ne_ub[4];
- size_t input_nb_ub[4];
- int64_t output_ne_ub[4];
-
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
-
- QUANTIZE_F16_Q8_0 op;
- op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
- op.calculate();
-}
-
-#endif // #ifdef ASCEND_310P
+++ /dev/null
-#include "kernel_operator.h"
-
-using namespace AscendC;
-#ifdef ASCEND_310P // 310P does not support f32->8-bit quantization
- extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
-        // let the following test cases continue to run; here we just print an error message. Of course the test case that calls this operator will fail.
-        printf("Ascend310P does not support f32->8-bit quantization.\n");
- }
-#else
-
-#define BUFFER_NUM 2
-#define QK8_0 32
-
-class QUANTIZE_F32_Q8_0 {
- public:
- __aicore__ inline QUANTIZE_F32_Q8_0() {}
- __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
- int64_t *input_ne_ub, size_t *input_nb_ub,
- int64_t *output_ne_ub) {
- int64_t op_block_num = GetBlockNum();
- int64_t op_block_idx = GetBlockIdx();
-
- for (int i = 0; i < 4; i++) {
- input_ne[i] = input_ne_ub[i];
- input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
-
- output_ne[i] = output_ne_ub[i];
- }
-
- output_stride[0] = 1;
- for (int i = 1; i < 4; i++) {
- output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
- }
-
- scale_ne = input_ne;
- scale_stride[0] = 1;
- scale_stride[1] = input_ne[0] / QK8_0;
- for (int i = 2; i < 4; i++) {
- scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
- }
-
- // split input tensor by rows.
- uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
- dr = nr / op_block_num;
-
- uint64_t tails = nr % op_block_num;
- if (op_block_idx < tails) {
- dr += 1;
- ir = dr * op_block_idx;
- } else {
- ir = dr * op_block_idx + tails;
- }
-
- group_size_in_row = scale_stride[1];
- int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
- output_ne[3] * sizeof(uint8_t);
-
- input_gm.SetGlobalBuffer((__gm__ float *)input);
- output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
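-        // scales are stored right after the quantized int8 data; each block starts
-        // writing at its own slice (ir * group_size_in_row scales in)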
- scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size +
- ir * group_size_in_row *
- sizeof(half)));
-
- pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(float));
- pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
- pipe.InitBuffer(work_queue, 1, 32);
- pipe.InitBuffer(max_queue, 1, 32);
- pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
- pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(half));
- pipe.InitBuffer(scale_queue, 1, 32);
- }
-
- __aicore__ inline void copy_in(uint32_t offset) {
- LocalTensor<float> input_local = input_queue.AllocTensor<float>();
- DataCopy(input_local, input_gm[offset], QK8_0);
- input_queue.EnQue(input_local);
- }
-
- __aicore__ inline void copy_out(uint32_t offset) {
- LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
- DataCopy(output_gm[offset], output_local, QK8_0);
- output_queue.FreeTensor(output_local);
- }
-
- __aicore__ inline half calculate_group(int64_t row, int64_t group) {
- const int64_t i3 = row / (input_ne[1] * input_ne[2]);
- const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
- const int64_t i1 =
- row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
-
- const int64_t input_offset = i1 * input_stride[1] +
- i2 * input_stride[2] +
- i3 * input_stride[3] + QK8_0 * group;
-
- const int64_t output_offset = i1 * output_stride[1] +
- i2 * output_stride[2] +
- i3 * output_stride[3] + QK8_0 * group;
-
- copy_in(input_offset);
- LocalTensor<float> input_local = input_queue.DeQue<float>();
- LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
- LocalTensor<float> work_local = work_queue.AllocTensor<float>();
- LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
- LocalTensor<float> max_local = max_queue.AllocTensor<float>();
- LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
-
- Abs(abs_local, input_local, QK8_0);
- ReduceMax(max_local, abs_local, work_local, QK8_0);
- pipe_barrier(PIPE_ALL);
- float d = max_local.GetValue(0);
- d = d / ((1 << 7) - 1);
- if (d != 0) {
- Muls(input_local, input_local, 1.0f / d, QK8_0);
- }
-
- Cast(input_local, input_local, RoundMode::CAST_ROUND, QK8_0);
- Cast(cast_local, input_local, RoundMode::CAST_ROUND, QK8_0);
- Cast(output_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
- output_queue.EnQue(output_local);
- copy_out(output_offset);
-
- input_queue.FreeTensor(input_local);
- work_queue.FreeTensor(work_local);
- abs_queue.FreeTensor(abs_local);
- max_queue.FreeTensor(max_local);
- cast_queue.FreeTensor(cast_local);
-
- return (half)d;
- }
-
- __aicore__ inline void calculate() {
- LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
- uint32_t scale_local_offset = 0;
- uint32_t scale_global_offset = 0;
- for (int64_t i = ir; i < ir + dr; i++) {
- for (int64_t j = 0; j < group_size_in_row; j++) {
- half scale = calculate_group(i, j);
- scale_local.SetValue(scale_local_offset++, scale);
- if (scale_local_offset == 16) {
- scale_local_offset = 0;
- // TODO: OPTIMIZE ME
- pipe_barrier(PIPE_ALL);
- DataCopy(scale_gm[scale_global_offset], scale_local, 16);
- pipe_barrier(PIPE_ALL);
- scale_global_offset += 16;
- }
- }
- }
-
- if (scale_local_offset != 0) {
- pipe_barrier(PIPE_ALL);
- DataCopyExtParams dataCopyParams;
- dataCopyParams.blockCount = 1;
- dataCopyParams.blockLen = scale_local_offset * sizeof(half);
- DataCopyPad(scale_gm[scale_global_offset], scale_local,
- dataCopyParams);
- pipe_barrier(PIPE_ALL);
- }
- }
-
- private:
- int64_t input_ne[4];
- size_t input_stride[4];
-
- int64_t *scale_ne;
- size_t scale_stride[4];
-
- int64_t output_ne[4];
- size_t output_stride[4];
-
- int64_t group_size_in_row;
-
- int64_t ir;
- int64_t dr;
-
- TPipe pipe;
- GlobalTensor<float> input_gm;
- GlobalTensor<half> scale_gm;
- GlobalTensor<int8_t> output_gm;
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
- TQue<QuePosition::VECIN, 1> work_queue;
- TQue<QuePosition::VECOUT, 1> max_queue;
- TQue<QuePosition::VECIN, 1> abs_queue;
- TQue<QuePosition::VECIN, 1> cast_queue;
- TQue<QuePosition::VECOUT, 1> scale_queue;
-};
-
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
- auto gm_ptr = (__gm__ uint8_t *)gm;
- auto ub_ptr = (uint8_t *)(ub);
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
- *ub_ptr = *gm_ptr;
- }
-}
-
-extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
- int64_t input_ne_ub[4];
- size_t input_nb_ub[4];
- int64_t output_ne_ub[4];
-
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
-
- QUANTIZE_F32_Q8_0 op;
- op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
- op.calculate();
-}
-
-#endif // #ifdef ASCEND_310P
+++ /dev/null
-#include "kernel_operator.h"
-
-using namespace AscendC;
-#ifdef ASCEND_310P // 310P does not support float->4-bit quantization
- extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
-        // let the following test cases continue to run; here we just print an error message. Of course the test case that calls this operator will fail.
-        printf("Ascend310P does not support f32->4-bit quantization.\n");
- }
-
- extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
-        // let the following test cases continue to run; here we just print an error message. Of course the test case that calls this operator will fail.
-        printf("Ascend310P does not support f16->4-bit quantization.\n");
- }
-#else
-
-#define BUFFER_NUM 2
-#define Group_Size 32
-
-template <typename SRC_T>
-class QUANTIZE_FLOAT_TO_Q4_0 {
- public:
- __aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {}
- __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
- int64_t *input_ne_ub, size_t *input_nb_ub,
- int64_t *output_ne_ub) {
- // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
- // permute=[0,0,0,0]):
- // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
- int64_t op_block_num = GetBlockNum();
- int64_t op_block_idx = GetBlockIdx();
-
- // input stride of data elements
- for (int i = 0; i < 4; i++) {
- input_ne[i] = input_ne_ub[i];
- input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
- output_ne[i] = output_ne_ub[i];
- }
-
- // output stride of data elements
- output_stride[0] = 1;
- for (int i = 1; i < 4; i++) {
- output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
- }
-
-        // scales are saved one by one after the data: [group1_scale, group2_scale, ...]
- scale_ne = input_ne;
- scale_stride[0] = 1;
- scale_stride[1] = input_ne[0] / Group_Size;
- for (int i = 2; i < 4; i++) {
- scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
- }
-
- // split input tensor by rows.
- uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
- dr = nr / op_block_num;
-
- uint64_t tails = nr % op_block_num;
- if (op_block_idx < tails) {
- dr += 1;
- ir = dr * op_block_idx;
- } else {
- ir = dr * op_block_idx + tails;
- }
-
- group_size_in_row = scale_stride[1];
- int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] *
- output_ne[3] * sizeof(uint8_t) / 2;
-
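-        // q4_0 output packs two 4-bit values per byte, so the scales start at half
-        // the output element count; each block writes its slice at ir * group_size_in_row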
- input_gm.SetGlobalBuffer((__gm__ SRC_T *)input);
- output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
- scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir *
- group_size_in_row *
- sizeof(half)));
-
- pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T));
- pipe.InitBuffer(output_queue, BUFFER_NUM,
- Group_Size * sizeof(int8_t) / 2);
-        pipe.InitBuffer(cast_queue, 1, Group_Size * sizeof(float));
- pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float));
- pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float));
- pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float));
- pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half));
- pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t));
- pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half));
- }
-
- __aicore__ inline void copy_in(uint32_t offset) {
- LocalTensor<SRC_T> input_local = input_queue.AllocTensor<SRC_T>();
- DataCopy(input_local, input_gm[offset], Group_Size);
- input_queue.EnQue(input_local);
- }
-
- __aicore__ inline void copy_out(uint32_t offset) {
-        // reinterpret-cast Group_Size (32) int4b_t values to Group_Size / 2 int8_t values,
-        // and use DataCopyPad to avoid the 32-byte alignment requirement.
- LocalTensor<int4b_t> output_local = output_queue.DeQue<int4b_t>();
- LocalTensor<int8_t> output_int8_local =
- output_local.ReinterpretCast<int8_t>();
-
- DataCopyExtParams dataCopyParams;
- dataCopyParams.blockCount = 1;
- dataCopyParams.blockLen = Group_Size / 2 * sizeof(int8_t);
- DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams);
-
- output_queue.FreeTensor(output_local);
- }
-
- __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
- LocalTensor<float> input_local) {
- DataCopy(cast_local, input_local, Group_Size);
- }
-
- __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
- LocalTensor<half> input_local) {
- Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size);
- }
-
- __aicore__ inline half calculate_group(int64_t row, int64_t group) {
- const int64_t i3 = row / (input_ne[1] * input_ne[2]);
- const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
- const int64_t i1 =
- row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
-
- const int64_t input_offset = i1 * input_stride[1] +
- i2 * input_stride[2] +
- i3 * input_stride[3] + Group_Size * group;
-
-        // output_offset is the offset into output_gm, whose datatype is int8_t;
-        // dividing by 2 is needed because int4b_t packs two values per byte.
- const int64_t output_offset = (i1 * output_stride[1] +
- i2 * output_stride[2] +
- i3 * output_stride[3] +
- Group_Size * group) / 2;
- copy_in(input_offset);
-
- LocalTensor<SRC_T> input_local = input_queue.DeQue<SRC_T>();
- LocalTensor<int4b_t> output_local = output_queue.AllocTensor<int4b_t>();
- LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
- LocalTensor<float> work_local = work_queue.AllocTensor<float>();
- LocalTensor<float> max_local = max_queue.AllocTensor<float>();
- LocalTensor<float> min_local = min_queue.AllocTensor<float>();
- LocalTensor<int8_t> int8_local = int8_queue.AllocTensor<int8_t>();
- LocalTensor<half> half_local = half_queue.AllocTensor<half>();
-
- input_to_cast(cast_local, input_local);
-
- ReduceMax(max_local, cast_local, work_local, Group_Size);
- ReduceMin(min_local, cast_local, work_local, Group_Size);
- const float max_value = max_local.GetValue(0);
- const float min_value = min_local.GetValue(0);
- float d = max_value;
- if (min_value < 0 && (-1 * min_value) > max_value) {
- d = min_value;
- }
-
- d = d / (-8);
- if (d != 0) {
- Muls(cast_local, cast_local, 1.0f / d, Group_Size);
- }
-
- // range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]
- float scalar = 8.5f;
- Adds(cast_local, cast_local, scalar, Group_Size);
- Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size);
- scalar = 15.0f;
- Mins(cast_local, cast_local, scalar, Group_Size);
- scalar = -8.0f;
- Adds(cast_local, cast_local, scalar, Group_Size);
-
- // float->half->int4b
- Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size);
- Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size);
-
- output_queue.EnQue(output_local);
- copy_out(output_offset);
-
- input_queue.FreeTensor(input_local);
- work_queue.FreeTensor(work_local);
- max_queue.FreeTensor(max_local);
- min_queue.FreeTensor(min_local);
- int8_queue.FreeTensor(int8_local);
- half_queue.FreeTensor(half_local);
- cast_queue.FreeTensor(cast_local);
- return (half)d;
- }
-
- __aicore__ inline void calculate() {
- LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
- uint32_t scale_local_offset = 0;
- uint32_t scale_global_offset = 0;
- for (int64_t i = ir; i < ir + dr; i++) {
- for (int64_t j = 0; j < group_size_in_row; j++) {
- half scale = calculate_group(i, j);
- scale_local.SetValue(scale_local_offset++, scale);
-                // Copy Group_Size/2 scales each time.
- if (scale_local_offset == Group_Size / 2) {
- scale_local_offset = 0;
- // TODO: OPTIMIZE ME
- pipe_barrier(PIPE_ALL);
- DataCopy(scale_gm[scale_global_offset], scale_local,
- Group_Size / 2);
- pipe_barrier(PIPE_ALL);
- scale_global_offset += Group_Size / 2;
- }
- }
- }
-
- if (scale_local_offset != 0) {
- pipe_barrier(PIPE_ALL);
- DataCopyExtParams dataCopyParams;
- dataCopyParams.blockCount = 1;
- dataCopyParams.blockLen = scale_local_offset * sizeof(half);
- DataCopyPad(scale_gm[scale_global_offset], scale_local,
- dataCopyParams);
- pipe_barrier(PIPE_ALL);
- }
- scale_queue.FreeTensor(scale_local);
- }
-
- private:
- int64_t input_ne[4];
- size_t input_stride[4];
-
- int64_t *scale_ne;
- size_t scale_stride[4];
-
- int64_t output_ne[4];
- size_t output_stride[4];
-
- int64_t group_size_in_row;
-
- int64_t ir;
- int64_t dr;
-
- TPipe pipe;
- GlobalTensor<SRC_T> input_gm;
- GlobalTensor<half> scale_gm;
- GlobalTensor<int8_t> output_gm;
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
- TQue<QuePosition::VECIN, BUFFER_NUM> work_queue;
- TQue<QuePosition::VECOUT, BUFFER_NUM> max_queue;
- TQue<QuePosition::VECOUT, BUFFER_NUM> min_queue;
- TQue<QuePosition::VECOUT, BUFFER_NUM> scale_queue;
- TQue<QuePosition::VECOUT, BUFFER_NUM> cast_queue;
- TQue<QuePosition::VECOUT, BUFFER_NUM> int8_queue;
- TQue<QuePosition::VECOUT, BUFFER_NUM> half_queue;
-};
-
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
- auto gm_ptr = (__gm__ uint8_t *)gm;
- auto ub_ptr = (uint8_t *)(ub);
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
- *ub_ptr = *gm_ptr;
- }
-}
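-// The host passes tensor ne/nb metadata through global memory; each kernel entry point copies it
-// byte-by-byte into stack arrays (4 values of 8 bytes each, i.e. 32 bytes per array) before
-// constructing the operator.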
-
-extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
- int64_t input_ne_ub[4];
- size_t input_nb_ub[4];
- int64_t output_ne_ub[4];
-
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
-
- QUANTIZE_FLOAT_TO_Q4_0<half> op;
- op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
- op.calculate();
-}
-
-extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
- int64_t input_ne_ub[4];
- size_t input_nb_ub[4];
- int64_t output_ne_ub[4];
-
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
-
- QUANTIZE_FLOAT_TO_Q4_0<float> op;
- op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
- op.calculate();
-}
-
-#endif // #ifdef ASCEND_310P
+++ /dev/null
-
-find_package(Vulkan COMPONENTS glslc REQUIRED)
-find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
-
-if (NOT glslc_executable)
- message(FATAL_ERROR "glslc not found")
-endif()
-
-ggml_add_backend_library(ggml-kompute
- ggml-kompute.cpp
- ../../include/ggml-kompute.h
- )
-
-target_link_libraries(ggml-kompute PRIVATE ggml-base kompute)
-target_include_directories(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
-
-add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
-
-function(compile_shader)
- set(options)
- set(oneValueArgs)
- set(multiValueArgs SOURCES)
- cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
- foreach(source ${compile_shader_SOURCES})
- get_filename_component(filename ${source} NAME)
- set(spv_file ${filename}.spv)
- add_custom_command(
- OUTPUT ${spv_file}
- DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
- ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
- ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
- ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
- ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
- COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
- COMMENT "Compiling ${source} to ${spv_file}"
- )
-
- get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
- set(FILE_NAME "shader${RAW_FILE_NAME}")
- string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
- string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
- string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
- set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
-        message(STATUS "Generating ${HEADER_FILE} with include guard ${HEADER_FILE_DEFINE}")
- if(CMAKE_GENERATOR MATCHES "Visual Studio")
- add_custom_command(
- OUTPUT ${OUTPUT_HEADER_FILE}
- COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
- COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
- COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
- COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
- COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
- COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
- COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
- COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
- DEPENDS ${spv_file} xxd
- COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd"
- )
- else()
- add_custom_command(
- OUTPUT ${OUTPUT_HEADER_FILE}
- COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
- COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
- COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
- COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
- COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
- COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
- COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
- COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
- DEPENDS ${spv_file} xxd
- COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
- )
- endif()
- endforeach()
-endfunction()
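-# For each <op>.comp source this emits <op>.comp.spv plus a generated header shader<op>.h
-# (e.g. shaderop_scale.h) that wraps the xxd -i output in the kp::shader_data namespace for
-# inclusion by ggml-kompute.cpp.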
-
-if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
- message(STATUS "Kompute found")
- set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
- add_subdirectory(kompute)
-
- # Compile our shaders
- compile_shader(SOURCES
- kompute-shaders/op_scale.comp
- kompute-shaders/op_scale_8.comp
- kompute-shaders/op_add.comp
- kompute-shaders/op_addrow.comp
- kompute-shaders/op_mul.comp
- kompute-shaders/op_silu.comp
- kompute-shaders/op_relu.comp
- kompute-shaders/op_gelu.comp
- kompute-shaders/op_softmax.comp
- kompute-shaders/op_norm.comp
- kompute-shaders/op_rmsnorm.comp
- kompute-shaders/op_diagmask.comp
- kompute-shaders/op_mul_mat_mat_f32.comp
- kompute-shaders/op_mul_mat_f16.comp
- kompute-shaders/op_mul_mat_q8_0.comp
- kompute-shaders/op_mul_mat_q4_0.comp
- kompute-shaders/op_mul_mat_q4_1.comp
- kompute-shaders/op_mul_mat_q4_k.comp
- kompute-shaders/op_mul_mat_q6_k.comp
- kompute-shaders/op_getrows_f32.comp
- kompute-shaders/op_getrows_f16.comp
- kompute-shaders/op_getrows_q4_0.comp
- kompute-shaders/op_getrows_q4_1.comp
- kompute-shaders/op_getrows_q6_k.comp
- kompute-shaders/op_rope_norm_f16.comp
- kompute-shaders/op_rope_norm_f32.comp
- kompute-shaders/op_rope_neox_f16.comp
- kompute-shaders/op_rope_neox_f32.comp
- kompute-shaders/op_cpy_f16_f16.comp
- kompute-shaders/op_cpy_f16_f32.comp
- kompute-shaders/op_cpy_f32_f16.comp
- kompute-shaders/op_cpy_f32_f32.comp
- )
-
- # Create a custom target for our generated shaders
- add_custom_target(generated_shaders DEPENDS
- shaderop_scale.h
- shaderop_scale_8.h
- shaderop_add.h
- shaderop_addrow.h
- shaderop_mul.h
- shaderop_silu.h
- shaderop_relu.h
- shaderop_gelu.h
- shaderop_softmax.h
- shaderop_norm.h
- shaderop_rmsnorm.h
- shaderop_diagmask.h
- shaderop_mul_mat_mat_f32.h
- shaderop_mul_mat_f16.h
- shaderop_mul_mat_q8_0.h
- shaderop_mul_mat_q4_0.h
- shaderop_mul_mat_q4_1.h
- shaderop_mul_mat_q4_k.h
- shaderop_mul_mat_q6_k.h
- shaderop_getrows_f32.h
- shaderop_getrows_f16.h
- shaderop_getrows_q4_0.h
- shaderop_getrows_q4_1.h
- shaderop_getrows_q6_k.h
- shaderop_rope_norm_f16.h
- shaderop_rope_norm_f32.h
- shaderop_rope_neox_f16.h
- shaderop_rope_neox_f32.h
- shaderop_cpy_f16_f16.h
- shaderop_cpy_f16_f32.h
- shaderop_cpy_f32_f16.h
- shaderop_cpy_f32_f32.h
- )
-
- # Create a custom command that depends on the generated_shaders
- add_custom_command(
- OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
- COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
- DEPENDS generated_shaders
- COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
- )
-
- # Add the stamp to the main sources to ensure dependency tracking
- target_sources(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
-else()
- message(WARNING "Kompute not found")
-endif()
+++ /dev/null
-#include "ggml-impl.h"
-#include "ggml-backend.h"
-#include "ggml-backend-impl.h"
-#include "ggml-kompute.h"
-
-// These headers are generated at build time by a CMake custom command
-#include "shaderop_scale.h"
-#include "shaderop_scale_8.h"
-#include "shaderop_add.h"
-#include "shaderop_addrow.h"
-#include "shaderop_mul.h"
-#include "shaderop_silu.h"
-#include "shaderop_relu.h"
-#include "shaderop_gelu.h"
-#include "shaderop_softmax.h"
-#include "shaderop_norm.h"
-#include "shaderop_rmsnorm.h"
-#include "shaderop_diagmask.h"
-#include "shaderop_mul_mat_f16.h"
-#include "shaderop_mul_mat_q8_0.h"
-#include "shaderop_mul_mat_q4_0.h"
-#include "shaderop_mul_mat_q4_1.h"
-#include "shaderop_mul_mat_q4_k.h"
-#include "shaderop_mul_mat_q6_k.h"
-#include "shaderop_mul_mat_mat_f32.h"
-#include "shaderop_getrows_f32.h"
-#include "shaderop_getrows_f16.h"
-#include "shaderop_getrows_q4_0.h"
-#include "shaderop_getrows_q4_1.h"
-#include "shaderop_getrows_q6_k.h"
-#include "shaderop_rope_norm_f16.h"
-#include "shaderop_rope_norm_f32.h"
-#include "shaderop_rope_neox_f16.h"
-#include "shaderop_rope_neox_f32.h"
-#include "shaderop_cpy_f16_f16.h"
-#include "shaderop_cpy_f16_f32.h"
-#include "shaderop_cpy_f32_f16.h"
-#include "shaderop_cpy_f32_f32.h"
-
-#include <algorithm>
-#include <array>
-#include <cassert>
-#include <cstdint>
-#include <cstdio>
-#include <cstring>
-#include <iostream>
-#include <memory>
-#include <mutex>
-#include <stdexcept>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include <kompute/Kompute.hpp>
-#include <vulkan/vulkan.hpp>
-
-#ifdef __linux__
-#include <cstdlib> // for setenv
-#endif
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-#define QK_NL 16
-
-typedef ggml_fp16_t half;
-
-static std::string ggml_kompute_format_name(int device) {
- return "Kompute" + std::to_string(device);
-}
-
-struct ggml_kompute_context {
- int device;
- std::string name;
- std::shared_ptr<vk::DescriptorPool> pool;
-
- ggml_kompute_context(int device)
- : device(device), name(ggml_kompute_format_name(device)) {}
-};
-
-// FIXME: It would be good to consolidate the kompute manager and the kompute context into one object,
-// merge their init functions, and simplify object lifetime management. As it stands, the kompute
-// manager is always required for device discovery, whereas the kompute context is only created once a
-// device is selected and Vulkan is explicitly enabled.
-static ggml_kompute_context *s_kompute_context = nullptr;
-
-class kompute_manager {
- kp::Manager *s_mgr = nullptr;
-
-public:
- kp::Manager *operator()() {
- if (s_mgr && !s_mgr->hasInstance()) {
- destroy();
- }
- if (!s_mgr) {
- s_mgr = new kp::Manager;
- }
- return s_mgr;
- }
-
- void destroy() {
- delete s_mgr;
- s_mgr = nullptr;
- }
-};
-
-static kompute_manager komputeManager;
-
-struct ggml_vk_memory {
- void *data = nullptr;
- size_t size = 0;
- vk::DeviceMemory *primaryMemory = nullptr;
- vk::Buffer *primaryBuffer = nullptr;
- vk::DeviceMemory *stagingMemory = nullptr;
- vk::Buffer *stagingBuffer = nullptr;
-};
-
-#ifdef __linux__
-__attribute__((constructor))
-static void enable_sam() {
- setenv("RADV_PERFTEST", "sam", false);
-}
-#endif
-
-static bool ggml_vk_checkPhysicalDeviceFeatures(vk::PhysicalDevice physical_device) {
- vk::PhysicalDeviceFeatures availableFeatures;
- physical_device.getFeatures(&availableFeatures);
-
- if (!availableFeatures.shaderInt16)
- return false;
-
- vk::PhysicalDeviceVulkan11Features availableFeatures11;
- vk::PhysicalDeviceVulkan12Features availableFeatures12;
-
- availableFeatures11.pNext = &availableFeatures12;
- availableFeatures12.pNext = nullptr;
-
- vk::PhysicalDeviceFeatures2 features2;
- features2.pNext = &availableFeatures11;
-
- physical_device.getFeatures2(&features2);
-
- if (!availableFeatures11.uniformAndStorageBuffer16BitAccess ||
- !availableFeatures11.storageBuffer16BitAccess) {
- return false;
- }
-
- if (!availableFeatures12.storageBuffer8BitAccess ||
- !availableFeatures12.uniformAndStorageBuffer8BitAccess ||
- !availableFeatures12.shaderFloat16 ||
- !availableFeatures12.shaderInt8) {
- return false;
- }
-
- return true;
-}
-
-static const char * ggml_vk_getVendorName(uint32_t vendorID) {
- switch (vendorID) {
- case 0x10DE:
- return "nvidia";
- case 0x1002:
- return "amd";
- case 0x8086:
- return "intel";
- default:
- return "unknown";
- }
-}
-
-static std::vector<ggml_vk_device> ggml_vk_available_devices_internal(size_t memoryRequired) {
- std::vector<ggml_vk_device> results;
- if (!komputeManager()->hasVulkan() || !komputeManager()->hasInstance())
- return results;
-
- std::vector<vk::PhysicalDevice> physical_devices;
- try {
- physical_devices = komputeManager()->listDevices();
- } catch (vk::SystemError & err) {
- std::cerr << __func__ << ": ignoring Vulkan exception: " << err.what() << "\n";
- return results;
- }
-
- uint32_t deviceCount = physical_devices.size();
- if (deviceCount == 0)
- return results;
-
- std::unordered_map<std::string, size_t> count_by_name;
-
- for (uint32_t i = 0; i < deviceCount; i++) {
- const auto & physical_device = physical_devices[i];
-
- VkPhysicalDeviceProperties dev_props = physical_device.getProperties();
- VkPhysicalDeviceMemoryProperties memoryProperties = physical_device.getMemoryProperties();
- const uint32_t major = VK_VERSION_MAJOR(dev_props.apiVersion);
- const uint32_t minor = VK_VERSION_MINOR(dev_props.apiVersion);
- if (major < 1 || minor < 2)
- continue;
-
- if (!ggml_vk_checkPhysicalDeviceFeatures(physical_device))
- continue;
-
- size_t heapSize = 0;
- for (uint32_t j = 0; j < memoryProperties.memoryHeapCount; ++j) {
- VkMemoryHeap heap = memoryProperties.memoryHeaps[j];
- if (heap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) {
- heapSize = heap.size;
- break;
- }
- }
-
- if (heapSize < memoryRequired)
- continue;
-
- auto ext_props = physical_device.enumerateDeviceExtensionProperties();
- bool has_maintenance4 = false;
-
- // Check if maintenance4 is supported
- for (const auto & properties : ext_props) {
- if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
- has_maintenance4 = true;
- }
- }
-
- vk::PhysicalDeviceSubgroupProperties subgroup_props;
- vk::PhysicalDeviceProperties2 dev_props2;
- vk::PhysicalDeviceMaintenance3Properties dev_props3;
- vk::PhysicalDeviceMaintenance4Properties dev_props4;
- dev_props2.pNext = &dev_props3;
- dev_props3.pNext = &subgroup_props;
- if (has_maintenance4) {
- subgroup_props.pNext = &dev_props4;
- }
- physical_device.getProperties2(&dev_props2);
-
- if (subgroup_props.subgroupSize < 32)
- continue;
-
- ggml_vk_device d;
- d.index = i;
- d.type = dev_props.deviceType;
- d.heapSize = heapSize;
- d.vendor = strdup(ggml_vk_getVendorName(dev_props.vendorID));
- d.subgroupSize = subgroup_props.subgroupSize;
- d.bufferAlignment = dev_props.limits.minStorageBufferOffsetAlignment;
-
- if (has_maintenance4) {
- d.maxAlloc = std::min(dev_props3.maxMemoryAllocationSize, dev_props4.maxBufferSize);
- } else {
- d.maxAlloc = dev_props3.maxMemoryAllocationSize;
- }
-
- std::string name(dev_props.deviceName);
- size_t n_idx = ++count_by_name[name];
- if (n_idx > 1) {
- name += " (" + std::to_string(n_idx) + ")";
- }
- d.name = strdup(name.c_str());
-
- results.push_back(d);
- }
-
- std::stable_sort(results.begin(), results.end(),
- [](const ggml_vk_device& lhs, const ggml_vk_device& rhs) -> bool {
- if (lhs.type != rhs.type) {
- if (lhs.type == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) return true;
- if (rhs.type == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) return false;
-
- if (lhs.type == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) return true;
- if (rhs.type == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) return false;
- }
- return lhs.heapSize < rhs.heapSize;
- }
- );
-
- return results;
-}
-
-static std::vector<ggml_vk_device>& ggml_vk_available_devices() {
- static std::vector<ggml_vk_device> devices = ggml_vk_available_devices_internal(0);
- return devices;
-}
-
-static void ggml_vk_filterByVendor(std::vector<ggml_vk_device>& devices, const std::string& targetVendor) {
- devices.erase(
- std::remove_if(devices.begin(), devices.end(),
- [&targetVendor](const ggml_vk_device& device) {
- return device.vendor != targetVendor;
- }),
- devices.end()
- );
-}
-
-static void ggml_vk_filterByName(std::vector<ggml_vk_device>& devices, const std::string& targetName) {
- devices.erase(
- std::remove_if(devices.begin(), devices.end(),
- [&targetName](const ggml_vk_device& device) {
- return device.name != targetName;
- }),
- devices.end()
- );
-}
-
-static bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const std::string & name) {
- if (name.empty())
- return false;
-
- auto devices = ggml_vk_available_devices_internal(memoryRequired);
- if (name == "amd" || name == "nvidia" || name == "intel") {
- ggml_vk_filterByVendor(devices, name);
- } else if (name != "gpu") {
- ggml_vk_filterByName(devices, name);
- }
-
- if (devices.empty())
- return false;
-
- *device = devices.front();
- return true;
-}
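-// A name of "amd", "nvidia" or "intel" filters by vendor, any other string except "gpu" is matched
-// against the full device name, and "gpu" keeps the whole list; the first remaining device is used.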
-
-bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const char * name) {
- return ggml_vk_get_device(device, memoryRequired, std::string(name));
-}
-
-bool ggml_vk_has_vulkan() {
- return komputeManager()->hasVulkan();
-}
-
-bool ggml_vk_has_device() {
- return komputeManager()->hasDevice();
-}
-
-ggml_vk_device ggml_vk_current_device() {
- if (!komputeManager()->hasDevice())
- return ggml_vk_device();
-
- auto devices = ggml_vk_available_devices();
- ggml_vk_filterByName(devices, komputeManager()->physicalDevice()->getProperties().deviceName.data());
- GGML_ASSERT(!devices.empty());
- return devices.front();
-}
-
-static
-void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t size) {
- std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = {
- vk::DescriptorPoolSize(
- vk::DescriptorType::eStorageBuffer,
- 4 * size // Descriptor count is number of possible tensors to pass into an algorithm
- )
- };
-
- vk::DescriptorPoolCreateInfo descriptorPoolInfo(
- vk::DescriptorPoolCreateFlags(),
- size, // Max sets
- static_cast<uint32_t>(descriptorPoolSizes.size()),
- descriptorPoolSizes.data());
-
- ctx->pool = std::make_shared<vk::DescriptorPool>();
- vk::Result r = komputeManager()->device()->createDescriptorPool(
- &descriptorPoolInfo, nullptr, ctx->pool.get());
- if (r != vk::Result::eSuccess)
-        std::cerr << "Error allocating descriptor pool: " << vk::to_string(r) << std::endl;
-}
-
-static
-void ggml_vk_free_descriptor_pool(struct ggml_kompute_context * ctx) {
- if (ctx->pool) {
- komputeManager()->device()->destroy(
- *ctx->pool,
- (vk::Optional<const vk::AllocationCallbacks>)nullptr);
- ctx->pool = nullptr;
- }
-}
-
-static
-vk::Buffer *ggml_vk_allocate_buffer(size_t size) {
- vk::BufferCreateInfo bufferCreateInfo;
- bufferCreateInfo.size = size;
- bufferCreateInfo.usage = vk::BufferUsageFlagBits::eStorageBuffer |
- vk::BufferUsageFlagBits::eTransferSrc |
- vk::BufferUsageFlagBits::eTransferDst;
- bufferCreateInfo.sharingMode = vk::SharingMode::eExclusive;
-
- vk::Buffer *vkBuffer = new vk::Buffer;
- vk::Result r = komputeManager()->device()->createBuffer(&bufferCreateInfo, nullptr, vkBuffer);
- if (r != vk::Result::eSuccess)
- std::cerr << "Error allocating buffer " << vk::to_string(r) << std::endl;
- return vkBuffer;
-}
-
-static
-vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, vk::MemoryRequirements requirements, bool *isHostVisible) {
-
- uint32_t memoryTypeIndex = -1;
- bool memoryTypeIndexFound = false;
- vk::PhysicalDeviceMemoryProperties memoryProperties = komputeManager()->physicalDevice()->getMemoryProperties();
- for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; i++) {
- const vk::MemoryType &memoryType = memoryProperties.memoryTypes[i];
- const vk::MemoryHeap &memoryHeap = memoryProperties.memoryHeaps[memoryType.heapIndex];
- if (memoryHeap.size < size) {
- continue;
- }
-
- if (requirements.memoryTypeBits & (1 << i)) {
- if (((memoryProperties.memoryTypes[i]).propertyFlags &
- flags) == flags) {
- memoryTypeIndex = i;
- memoryTypeIndexFound = true;
- if (isHostVisible && (memoryProperties.memoryTypes[i].propertyFlags & vk::MemoryPropertyFlagBits::eHostVisible)) {
- *isHostVisible = true;
- }
- break;
- }
- }
- }
- if (!memoryTypeIndexFound) {
- throw std::runtime_error(
- "Memory type index for buffer creation not found");
- }
-
- vk::MemoryAllocateInfo allocInfo;
- allocInfo.allocationSize = size;
- allocInfo.memoryTypeIndex = memoryTypeIndex;
- vk::DeviceMemory *vkDeviceMemory = new vk::DeviceMemory;
- vk::Result r = komputeManager()->device()->allocateMemory(&allocInfo, nullptr, vkDeviceMemory);
- if (r != vk::Result::eSuccess) {
- std::cerr << "Error allocating memory " << vk::to_string(r) << std::endl;
- throw std::runtime_error("Error allocating vulkan memory.");
- }
- return vkDeviceMemory;
-}
-
-static size_t ggml_vk_aligned_offset(ggml_backend_buffer_t buffer, size_t offset) {
- size_t minStorageBufferOffsetAlignment = ggml_backend_buffer_get_alignment(buffer);
-
- // If offset is already aligned, return it directly
- if (offset % minStorageBufferOffsetAlignment == 0) {
- return offset;
- }
-
- // Otherwise, return the largest multiple of minStorageBufferOffsetAlignment less than offset
- return (offset / minStorageBufferOffsetAlignment) * minStorageBufferOffsetAlignment;
-}
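-// For example, with a 256-byte alignment an offset of 300 is rounded down to 256; the caller
-// (ggml_vk_get_tensor) keeps the remaining 44 bytes as an additional in-tensor offset.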
-
-static ggml_vk_memory ggml_vk_allocate(size_t size) {
- ggml_vk_memory memory;
- bool isHostVisible = false;
- {
- memory.primaryBuffer = ggml_vk_allocate_buffer(size);
- vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.primaryBuffer);
- vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eDeviceLocal;
- memory.primaryMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible);
- komputeManager()->device()->bindBufferMemory(*memory.primaryBuffer, *memory.primaryMemory, 0);
- if (isHostVisible) {
- vk::Result r = komputeManager()->device()->mapMemory(*memory.primaryMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
- if (r != vk::Result::eSuccess)
-                std::cerr << "Error mapping memory: " << vk::to_string(r) << std::endl;
- }
- }
-
- if (!isHostVisible) {
- memory.stagingBuffer = ggml_vk_allocate_buffer(size);
- vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.stagingBuffer);
- vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eHostVisible |
- vk::MemoryPropertyFlagBits::eHostCoherent |
- vk::MemoryPropertyFlagBits::eHostCached;
- memory.stagingMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible);
- komputeManager()->device()->bindBufferMemory(*memory.stagingBuffer, *memory.stagingMemory, 0);
- vk::Result r = komputeManager()->device()->mapMemory(*memory.stagingMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
- if (r != vk::Result::eSuccess)
-            std::cerr << "Error mapping memory: " << vk::to_string(r) << std::endl;
- }
-
- memory.size = size;
- return memory;
-}
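-// If the device-local allocation is not host visible, a host-visible (coherent, cached) staging
-// buffer of the same size is created and mapped; memory.data then points at that mapping.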
-
-static void ggml_vk_free_memory(ggml_vk_memory &memory)
-{
- komputeManager()->device()->destroy(
- *memory.primaryBuffer,
- (vk::Optional<const vk::AllocationCallbacks>)nullptr);
- if (memory.stagingBuffer) {
- komputeManager()->device()->destroy(
- *memory.stagingBuffer,
- (vk::Optional<const vk::AllocationCallbacks>)nullptr);
- }
- komputeManager()->device()->freeMemory(
- *memory.primaryMemory,
- (vk::Optional<const vk::AllocationCallbacks>)nullptr);
- if (memory.stagingMemory) {
- komputeManager()->device()->freeMemory(
- *memory.stagingMemory,
- (vk::Optional<const vk::AllocationCallbacks>)nullptr);
- }
-}
-
-static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft);
-
-static
-ggml_vk_memory * ggml_vk_find_tensor(const struct ggml_tensor * t, uint64_t & offset) {
- ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
-
- // compatibility with ggml-backend
- GGML_ASSERT(buffer && buffer->buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name);
-
- ggml_vk_memory * buf_ctx = static_cast<ggml_vk_memory *>(buffer->context);
-
- const intptr_t ioffs = intptr_t(t->data) - intptr_t(buf_ctx->data);
-
- GGML_ASSERT(ioffs >= 0 && ioffs + int64_t(ggml_nbytes(t)) <= int64_t(buffer->size));
-
- offset = uint64_t(ioffs);
- return buf_ctx;
-}
-
-static
-const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor(const struct ggml_tensor * t, uint32_t * alignedOffset = nullptr) {
- uint64_t originalOffset = 0;
- auto * res = ggml_vk_find_tensor(t, originalOffset);
- if (!res) {
- static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
- return nullTensor;
- }
-
- // Create a tensor whose memory will be composed of our buffers at the correct offset
- const size_t nelements = ggml_nelements(t);
- size_t nbytes = ggml_nbytes(t);
-
- size_t vulkanOffset = ggml_vk_aligned_offset(t->buffer, originalOffset);
- if (alignedOffset) {
- *alignedOffset = originalOffset - vulkanOffset;
- nbytes += *alignedOffset;
- }
-
- return komputeManager()->tensor(
- t->data,
- nelements,
- nbytes, kp::Tensor::TensorDataTypes::eFloat,
- res->primaryMemory, res->primaryBuffer,
- res->stagingMemory, res->stagingBuffer,
- vulkanOffset);
-}
-
-static std::vector<uint32_t> getSpirvShader(const unsigned char* rawData, size_t size) {
- if (size % sizeof(uint32_t) != 0) {
- throw std::runtime_error("Invalid size: must be divisible by sizeof(uint32_t)");
- }
-
- const uint32_t* data_ptr = reinterpret_cast<const uint32_t*>(rawData);
- size_t count = size / sizeof(uint32_t);
- return std::vector<uint32_t>(data_ptr, data_ptr + count);
-}
-
-inline static
-uint32_t safe_divide(uint32_t a, uint32_t b) {
- if (b <= 1) {
- return a;
- }
- if ((a % b) != 0) {
- fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b);
- GGML_ABORT("safe_divide result would've had remainder");
- }
- return a / b;
-}
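-// Shader offsets are expressed in elements rather than bytes, so byte offsets are passed through
-// safe_divide with the element size (e.g. 4 for f32, 2 for f16) and must divide evenly.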
-
-static void ggml_vk_add(
- kp::Sequence& seq,
- const std::shared_ptr<kp::Tensor>& inA,
- const std::shared_ptr<kp::Tensor>& inB,
- const std::shared_ptr<kp::Tensor>& out,
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
- int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03,
- int32_t nb00, int32_t nb01, int32_t nb02, int32_t nb03,
- int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
- int32_t nb10, int32_t nb11, int32_t nb12, int32_t nb13,
- int32_t ne0,
- int32_t nb0, int32_t nb1, int32_t nb2, int32_t nb3
-) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_add_comp_spv,
- kp::shader_data::op_add_comp_spv_len);
-
- struct PushConstants {
- uint32_t inAOff, inBOff, outOff;
- int32_t ne00;
- int32_t nb00, nb01, nb02, nb03;
- int32_t ne10, ne11, ne12, ne13;
- int32_t nb10, nb11, nb12, nb13;
- int32_t ne0;
- int32_t nb0, nb1, nb2, nb3;
- } const pushConsts {
- safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
- ne00,
- nb00, nb01, nb02, nb03,
- ne10, ne11, ne12, ne13,
- nb10, nb11, nb12, nb13,
- ne0,
- nb0, nb1, nb2, nb3
- };
-
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
- if (!komputeManager()->hasAlgorithm(__func__)) {
- s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
- } else {
- s_algo = komputeManager()->getAlgorithm(__func__);
- s_algo->setTensors({inA, inB, out});
- s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
- s_algo->setPushConstants<PushConstants>({pushConsts});
- s_algo->updateDescriptors(s_kompute_context->pool.get());
- }
- seq.record<kp::OpAlgoDispatch>(s_algo);
-}
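-// Each op builds its kp::Algorithm once, keyed by function name (plus a suffix where several
-// shader variants exist); later calls only rebind tensors, workgroup size and push constants.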
-
-static void ggml_vk_addrow(kp::Sequence& seq,
- const std::shared_ptr<kp::Tensor>& inA,
- const std::shared_ptr<kp::Tensor>& inB,
- const std::shared_ptr<kp::Tensor>& out,
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
- uint32_t size, uint32_t row = 0) {
-
- const static auto spirv = getSpirvShader(kp::shader_data::op_addrow_comp_spv,
- kp::shader_data::op_addrow_comp_spv_len);
-
- struct PushConstants {
- uint32_t inAOff, inBOff, outOff;
- uint32_t row;
- } const pushConsts {
- safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
- row
- };
-
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
- if (!komputeManager()->hasAlgorithm(__func__))
- s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
- else {
- s_algo = komputeManager()->getAlgorithm(__func__);
- s_algo->setTensors({inA, inB, out});
- s_algo->setWorkgroup({size});
- s_algo->setPushConstants<PushConstants>({pushConsts});
- s_algo->updateDescriptors(s_kompute_context->pool.get());
- }
- seq.record<kp::OpAlgoDispatch>(s_algo);
-}
-
-static void ggml_vk_mul(
- kp::Sequence& seq,
- const std::shared_ptr<kp::Tensor>& inA,
- const std::shared_ptr<kp::Tensor>& inB,
- const std::shared_ptr<kp::Tensor>& out,
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
- int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03,
- int32_t nb00, int32_t nb01, int32_t nb02, int32_t nb03,
- int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
- int32_t nb10, int32_t nb11, int32_t nb12, int32_t nb13,
- int32_t ne0,
- int32_t nb0, int32_t nb1, int32_t nb2, int32_t nb3
-) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_comp_spv,
- kp::shader_data::op_mul_comp_spv_len);
-
- struct PushConstants {
- uint32_t inAOff, inBOff, outOff;
- int32_t ne00;
- int32_t nb00, nb01, nb02, nb03;
- int32_t ne10, ne11, ne12, ne13;
- int32_t nb10, nb11, nb12, nb13;
- int32_t ne0;
- int32_t nb0, nb1, nb2, nb3;
- } const pushConsts {
- safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
- ne00,
- nb00, nb01, nb02, nb03,
- ne10, ne11, ne12, ne13,
- nb10, nb11, nb12, nb13,
- ne0,
- nb0, nb1, nb2, nb3
- };
-
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
- if (!komputeManager()->hasAlgorithm(__func__)) {
- s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
- } else {
- s_algo = komputeManager()->getAlgorithm(__func__);
- s_algo->setTensors({inA, inB, out});
- s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
- s_algo->setPushConstants<PushConstants>({pushConsts});
- s_algo->updateDescriptors(s_kompute_context->pool.get());
- }
- seq.record<kp::OpAlgoDispatch>(s_algo);
-}
-
-static void ggml_vk_scale(kp::Sequence& seq,
- const std::shared_ptr<kp::Tensor>& in,
- const std::shared_ptr<kp::Tensor>& out,
- uint32_t inOff, uint32_t outOff,
- uint32_t size, float scale) {
- const static auto spirv_1 = getSpirvShader(
- kp::shader_data::op_scale_comp_spv, kp::shader_data::op_scale_comp_spv_len
- );
- const static auto spirv_8 = getSpirvShader(
- kp::shader_data::op_scale_8_comp_spv, kp::shader_data::op_scale_8_comp_spv_len
- );
-
- struct PushConstants {
- uint32_t inOff, outOff;
- float scale;
- } const pushConsts {
- safe_divide(inOff, 4), safe_divide(outOff, 4),
- scale
- };
-
- const auto * spirv = &spirv_1;
- std::string name(__func__);
- if (size % 8 == 0) {
- size /= 8;
- name += "_8";
- spirv = &spirv_8;
- }
-
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
- if (!komputeManager()->hasAlgorithm(name)) {
- s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, *spirv, {size}, {}, {pushConsts});
- } else {
- s_algo = komputeManager()->getAlgorithm(name);
- s_algo->setTensors({in, out});
- s_algo->setWorkgroup({size});
- s_algo->setPushConstants<PushConstants>({pushConsts});
- s_algo->updateDescriptors(s_kompute_context->pool.get());
- }
- seq.record<kp::OpAlgoDispatch>(s_algo);
-}
-
-static void ggml_vk_xxlu(
- const std::vector<uint32_t>& spirv, const char * suffix, kp::Sequence& seq,
- const std::shared_ptr<kp::Tensor>& in,
- const std::shared_ptr<kp::Tensor>& out,
- uint32_t inOff, uint32_t outOff,
- uint32_t size
-) {
- struct PushConstants {
- uint32_t inOff, outOff;
- } const pushConsts {
- safe_divide(inOff, 4), safe_divide(outOff, 4),
- };
-
- auto name = std::string(__func__) + "_" + suffix;
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
- if (!komputeManager()->hasAlgorithm(name)) {
- s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts});
- } else {
- s_algo = komputeManager()->getAlgorithm(name);
- s_algo->setTensors({in, out});
- s_algo->setWorkgroup({size});
- s_algo->setPushConstants<PushConstants>({pushConsts});
- s_algo->updateDescriptors(s_kompute_context->pool.get());
- }
- seq.record<kp::OpAlgoDispatch>(s_algo);
-}
-
-template <typename... Args>
-static void ggml_vk_silu(Args&&... args) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_silu_comp_spv,
- kp::shader_data::op_silu_comp_spv_len);
-
- ggml_vk_xxlu(spirv, "silu", std::forward<Args>(args)...);
-}
-
-template <typename... Args>
-static void ggml_vk_relu(Args&&... args) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_relu_comp_spv,
- kp::shader_data::op_relu_comp_spv_len);
-
- ggml_vk_xxlu(spirv, "relu", std::forward<Args>(args)...);
-}
-
-template <typename... Args>
-static void ggml_vk_gelu(Args&&... args) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_gelu_comp_spv,
- kp::shader_data::op_gelu_comp_spv_len);
-
- ggml_vk_xxlu(spirv, "gelu", std::forward<Args>(args)...);
-}
-
-static void ggml_vk_soft_max(
- kp::Sequence& seq,
- const std::shared_ptr<kp::Tensor>& inA,
- const std::shared_ptr<kp::Tensor>& inB,
- const std::shared_ptr<kp::Tensor>& out,
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
- int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03,
- float scale, float max_bias, float m0, float m1,
- uint32_t n_head_log2
-) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_softmax_comp_spv,
- kp::shader_data::op_softmax_comp_spv_len);
-
- struct PushConstants {
- uint32_t inAOff, inBOff, outOff;
- int32_t ne00, ne01, ne02;
- float scale, max_bias, m0, m1;
- uint32_t n_head_log2;
- int32_t mask;
- } pushConsts {
- safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
- ne00, ne01, ne02,
- scale, max_bias, m0, m1,
- n_head_log2,
- bool(inB)
- };
-
- auto & inB_ = inB ? inB : inA;
-
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
- if (!komputeManager()->hasAlgorithm(__func__)) {
-        // FIXME: The softmax kernel needs to be fixed to use the subgroup size, which can vary by device
- const uint32_t local_x = 32;
- s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB_, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts});
- } else {
- s_algo = komputeManager()->getAlgorithm(__func__);
- s_algo->setTensors({inA, inB_, out});
- s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
- s_algo->setPushConstants<PushConstants>({pushConsts});
- s_algo->updateDescriptors(s_kompute_context->pool.get());
- }
- seq.record<kp::OpAlgoDispatch>(s_algo);
-}
-
-static void ggml_vk_norm_(
- const std::vector<uint32_t>& spirv, const char * suffix, kp::Sequence& seq,
- const std::shared_ptr<kp::Tensor>& in,
- const std::shared_ptr<kp::Tensor>& out,
- uint32_t inOff, uint32_t outOff,
- int32_t ne00, int32_t nb01,
- int32_t nrows, float epsilon
-) {
- GGML_ASSERT(nb01%sizeof(float) == 0);
- GGML_ASSERT(ne00%sizeof(float) == 0);
-
- struct PushConstants {
- uint32_t inOff, outOff;
- uint32_t ne00, nb01;
- float eps;
- } pushConsts {
- safe_divide(inOff, 4), safe_divide(outOff, 4),
- (uint32_t)ne00, (uint32_t)nb01, epsilon
- };
-
- auto name = std::string(__func__) + "_" + suffix;
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
- if (!komputeManager()->hasAlgorithm(name)) {
- s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
- } else {
- s_algo = komputeManager()->getAlgorithm(name);
- s_algo->setTensors({in, out});
- s_algo->setWorkgroup({(uint32_t)nrows});
- s_algo->setPushConstants<PushConstants>({pushConsts});
- s_algo->updateDescriptors(s_kompute_context->pool.get());
- }
- seq.record<kp::OpAlgoDispatch>(s_algo);
-}
-
-template <typename... Args>
-static void ggml_vk_norm(Args&&... args) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_norm_comp_spv,
- kp::shader_data::op_norm_comp_spv_len);
-
- ggml_vk_norm_(spirv, "norm", std::forward<Args>(args)...);
-}
-
-template <typename... Args>
-static void ggml_vk_rms_norm(Args&&... args) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_rmsnorm_comp_spv,
- kp::shader_data::op_rmsnorm_comp_spv_len);
-
- ggml_vk_norm_(spirv, "rms", std::forward<Args>(args)...);
-}
-
-static void ggml_vk_diag_mask_inf(kp::Sequence& seq,
- const std::shared_ptr<kp::Tensor>& in,
- const std::shared_ptr<kp::Tensor>& out,
- uint32_t inOff, uint32_t outOff,
- uint32_t n_past,
- int32_t ne00, int32_t ne01, int32_t ne02) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_diagmask_comp_spv,
- kp::shader_data::op_diagmask_comp_spv_len);
-
- struct PushConstants {
- uint32_t inOff, outOff;
- uint32_t n_past;
- int32_t ne00, ne01;
- } pushConsts {
- safe_divide(inOff, 4), safe_divide(outOff, 4),
- n_past,
- ne00, ne01
- };
-
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
- if (!komputeManager()->hasAlgorithm(__func__))
- s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pushConsts});
- else {
- s_algo = komputeManager()->getAlgorithm(__func__);
- s_algo->setTensors({in, out});
- s_algo->setWorkgroup({unsigned(ne00), unsigned(ne01), unsigned(ne02)});
- s_algo->setPushConstants<PushConstants>({pushConsts});
- s_algo->updateDescriptors(s_kompute_context->pool.get());
- }
- seq.record<kp::OpAlgoDispatch>(s_algo);
-}
-
-static void ggml_vk_mul_mat_f16(
- kp::Sequence& seq,
- const std::shared_ptr<kp::Tensor>& inA,
- const std::shared_ptr<kp::Tensor>& inB,
- const std::shared_ptr<kp::Tensor>& out,
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
- int32_t ne00, int32_t ne01, int32_t ne02,
- uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
- int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
- uint32_t nb10, uint32_t nb11, uint32_t nb12, uint32_t nb13,
- int32_t ne0, int32_t ne1,
- uint32_t r2, uint32_t r3
-) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_f16_comp_spv,
- kp::shader_data::op_mul_mat_f16_comp_spv_len);
-
- struct PushConstants {
- uint32_t inAOff, inBOff, outOff;
- int32_t ne00, ne01, ne02;
- uint32_t nb00, nb01, nb02, nb03;
- int32_t ne10, ne11, ne12;
- uint32_t nb10, nb11, nb12, nb13;
- int32_t ne0, ne1;
- uint32_t r2, r3;
- } pushConsts {
- safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4),
- ne00, ne01, ne02,
- nb00, nb01, nb02, nb03,
- ne10, ne11, ne12,
- nb10, nb11, nb12, nb13,
- ne0, ne1,
- r2, r3
- };
-
- const unsigned ny = unsigned((ne11 + 4 - 1)/4);
-
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
- if (!komputeManager()->hasAlgorithm(__func__)) {
- const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
- s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), ny, unsigned(ne12*ne13)}, {local_x}, {pushConsts});
- } else {
- s_algo = komputeManager()->getAlgorithm(__func__);
- s_algo->setTensors({inA, inB, out});
- s_algo->setWorkgroup({unsigned(ne01), ny, unsigned(ne12*ne13)});
- s_algo->setPushConstants<PushConstants>({pushConsts});
- s_algo->updateDescriptors(s_kompute_context->pool.get());
- }
- seq.record<kp::OpAlgoDispatch>(s_algo);
-}
-
-static void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq,
- const std::shared_ptr<kp::Tensor>& inA,
- const std::shared_ptr<kp::Tensor>& inB,
- const std::shared_ptr<kp::Tensor>& out,
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
- int32_t ne00, int32_t ne01, int32_t ne02,
- uint32_t nb01, uint32_t nb02,
- int32_t ne11, int32_t ne12,
- uint32_t nb11, uint32_t nb12,
- uint32_t nb1, uint32_t nb2) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_f32_comp_spv,
- kp::shader_data::op_mul_mat_mat_f32_comp_spv_len);
-
- struct PushConstants {
- uint32_t inAOff, inBOff, outOff;
- int32_t ne00, ne01, ne02, ne11, ne12;
- uint32_t nb01, nb02;
- uint32_t nb11, nb12;
- uint32_t nb1, nb2;
- } pushConsts {
- safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
- ne00, ne01, ne02, ne11, ne12,
- nb01, nb02, nb11, nb12,
- nb1, nb2
- };
-
- const uint32_t local_x = ggml_vk_current_device().subgroupSize;
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
- if (!komputeManager()->hasAlgorithm(__func__)) {
- s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(),
- {inA, inB, out}, spirv,
- {unsigned(ne01),
- unsigned(ne11),
- unsigned(std::max(ne12, ne02))
- },
- {local_x},
- {pushConsts});
- } else {
- s_algo = komputeManager()->getAlgorithm(__func__);
- s_algo->setTensors({inA, inB, out});
- s_algo->setWorkgroup({unsigned(ne01),
- unsigned(ne11),
- unsigned(std::max(ne12, ne02)),
- });
- s_algo->setPushConstants<PushConstants>({pushConsts});
- s_algo->updateDescriptors(s_kompute_context->pool.get());
- }
- seq.record<kp::OpAlgoDispatch>(s_algo);
-}
-
-static void ggml_vk_mul_mat_impl(
- const std::vector<uint32_t>& spirv, const char * suffix, uint32_t block_size, kp::Sequence& seq,
- const std::shared_ptr<kp::Tensor>& inA,
- const std::shared_ptr<kp::Tensor>& inB,
- const std::shared_ptr<kp::Tensor>& out,
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
- int32_t ne00, int32_t ne01, int32_t ne02,
- int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
- int32_t ne0, int32_t ne1,
- uint32_t nb01, uint32_t nb02, uint32_t nb03,
- uint32_t nb11, uint32_t nb12, uint32_t nb13,
- uint32_t r2, uint32_t r3
-) {
- struct PushConstants {
- uint32_t inAOff, inBOff, outOff;
- int32_t ne00, ne01, ne02;
- int32_t ne10, ne12;
- int32_t ne0, ne1;
- uint32_t nb01, nb02, nb03;
- uint32_t nb11, nb12, nb13;
- uint32_t r2, r3;
- } pushConsts {
- safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
- ne00, ne01, ne02,
- ne10, ne12,
- ne0, ne1,
- nb01, nb02, nb03,
- nb11, nb12, nb13,
- r2, r3
- };
-
- auto name = std::string(__func__) + "_" + suffix;
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
- if (!komputeManager()->hasAlgorithm(name)) {
- const uint32_t local_x = (ggml_vk_current_device().subgroupSize * 2) / 8;
- s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}, {local_x}, {pushConsts});
- } else {
- s_algo = komputeManager()->getAlgorithm(name);
- s_algo->setTensors({inA, inB, out});
- s_algo->setWorkgroup({unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)});
- s_algo->setPushConstants<PushConstants>({pushConsts});
- s_algo->updateDescriptors(s_kompute_context->pool.get());
- }
- seq.record<kp::OpAlgoDispatch>(s_algo);
-}
-
-template <typename... Args>
-static void ggml_vk_mul_mat_q4_0(Args&&... args) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_0_comp_spv,
- kp::shader_data::op_mul_mat_q4_0_comp_spv_len);
-
- ggml_vk_mul_mat_impl(spirv, "q4_0", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
-}
-
-template <typename... Args>
-static void ggml_vk_mul_mat_q4_1(Args&&... args) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_1_comp_spv,
- kp::shader_data::op_mul_mat_q4_1_comp_spv_len);
-
- ggml_vk_mul_mat_impl(spirv, "q4_1", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
-}
-
-template <typename... Args>
-static void ggml_vk_mul_mat_q8_0(Args&&... args) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q8_0_comp_spv,
- kp::shader_data::op_mul_mat_q8_0_comp_spv_len);
-
- ggml_vk_mul_mat_impl(spirv, "q8_0", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
-}
-
-static void ggml_vk_mul_mat_q4_k(
- kp::Sequence& seq,
- const std::shared_ptr<kp::Tensor>& inA,
- const std::shared_ptr<kp::Tensor>& inB,
- const std::shared_ptr<kp::Tensor>& out,
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
- int32_t ne00, int32_t ne01, int32_t ne02,
- int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
- int32_t ne0, int32_t ne1,
- uint32_t nb01, uint32_t nb02, uint32_t nb03,
- uint32_t nb11, uint32_t nb12, uint32_t nb13,
- uint32_t r2, uint32_t r3
-) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_k_comp_spv,
- kp::shader_data::op_mul_mat_q4_k_comp_spv_len);
-
- struct PushConstants {
- uint32_t inAOff, inBOff, outOff;
- int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12;
- uint32_t nb01, nb02, nb03, nb11, nb12, nb13;
- uint32_t r2, r3;
- } pushConsts {
- inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
- ne00, ne10, ne0, ne1, ne01, ne02, ne12,
- nb01, nb02, nb03, nb11, nb12, nb13,
- r2, r3
- };
-
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
- if (!komputeManager()->hasAlgorithm(__func__)) {
- s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 3)/4), unsigned(ne11), unsigned(ne12) * unsigned(ne13)}, {}, {pushConsts});
- } else {
- s_algo = komputeManager()->getAlgorithm(__func__);
- s_algo->setTensors({inA, inB, out});
- s_algo->setWorkgroup({unsigned((ne01 + 3)/4), unsigned(ne11), unsigned(ne12) * unsigned(ne13)});
- s_algo->setPushConstants<PushConstants>({pushConsts});
- s_algo->updateDescriptors(s_kompute_context->pool.get());
- }
- seq.record<kp::OpAlgoDispatch>(s_algo);
-}
-
-static void ggml_vk_mul_mat_q6_k(
- kp::Sequence& seq,
- const std::shared_ptr<kp::Tensor>& inA,
- const std::shared_ptr<kp::Tensor>& inB,
- const std::shared_ptr<kp::Tensor>& out,
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
- int32_t ne00, int32_t ne01, int32_t ne02,
- int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
- int32_t ne0, int32_t ne1,
- uint32_t nb01, uint32_t nb02, uint32_t nb03,
- uint32_t nb11, uint32_t nb12, uint32_t nb13,
- uint32_t r2, uint32_t r3
-) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv,
- kp::shader_data::op_mul_mat_q6_k_comp_spv_len);
-
- struct PushConstants {
- uint32_t inAOff, inBOff, outOff;
- int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12;
- uint32_t nb01, nb02, nb03, nb11, nb12, nb13;
- uint32_t r2, r3;
- } pushConsts {
- inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
- ne00, ne10, ne0, ne1, ne01, ne02, ne12,
- nb01, nb02, nb03, nb11, nb12, nb13,
- r2, r3
- };
-
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
- if (!komputeManager()->hasAlgorithm(__func__)) {
- const uint32_t local_x = 2;
- const uint32_t local_y = ggml_vk_current_device().subgroupSize;
- s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)}, {local_x, local_y}, {pushConsts});
- } else {
- s_algo = komputeManager()->getAlgorithm(__func__);
- s_algo->setTensors({inA, inB, out});
- s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)});
- s_algo->setPushConstants<PushConstants>({pushConsts});
- s_algo->updateDescriptors(s_kompute_context->pool.get());
- }
- seq.record<kp::OpAlgoDispatch>(s_algo);
-}
-
-static void ggml_vk_get_rows(
- const std::vector<uint32_t>& spirv,
- const char * suffix,
- unsigned element_size, unsigned qk,
- kp::Sequence& seq,
- const std::shared_ptr<kp::Tensor>& inA,
- const std::shared_ptr<kp::Tensor>& inB,
- const std::shared_ptr<kp::Tensor>& out,
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
- int32_t ne00, int32_t nb01, int32_t nb1,
- uint32_t size
-) {
- GGML_ASSERT(nb01%element_size == 0);
- GGML_ASSERT(nb1%sizeof(float) == 0);
- if (qk) GGML_ASSERT(ne00%qk == 0);
-
- struct PushConstants {
- uint32_t inAOff, inBOff, outOff;
- int32_t ne00, nb01, nb1;
- } pushConsts {
- safe_divide(inAOff, element_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
- ne00, nb01, nb1
- };
-
- auto name = std::string(__func__) + "_" + suffix;
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
- if (!komputeManager()->hasAlgorithm(name)) {
- s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
- } else {
- s_algo = komputeManager()->getAlgorithm(name);
- s_algo->setTensors({inA, inB, out});
- s_algo->setWorkgroup({size});
- s_algo->setPushConstants<PushConstants>({pushConsts});
- s_algo->updateDescriptors(s_kompute_context->pool.get());
- }
- seq.record<kp::OpAlgoDispatch>(s_algo);
-}
-
-template <typename... Args>
-static void ggml_vk_get_rows_f32(Args&&... args) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f32_comp_spv,
- kp::shader_data::op_getrows_f32_comp_spv_len);
-
- ggml_vk_get_rows(spirv, "f32", sizeof(float), 0, std::forward<Args>(args)...);
-}
-
-template <typename... Args>
-static void ggml_vk_get_rows_f16(Args&&... args) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f16_comp_spv,
- kp::shader_data::op_getrows_f16_comp_spv_len);
-
- ggml_vk_get_rows(spirv, "f16", sizeof(half), 0, std::forward<Args>(args)...);
-}
-
-template <typename... Args>
-static void ggml_vk_get_rows_q4_0(Args&&... args) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_0_comp_spv,
- kp::shader_data::op_getrows_q4_0_comp_spv_len);
-
- ggml_vk_get_rows(spirv, "q4_0", 1/*We access blocks unaligned*/, QK4_0, std::forward<Args>(args)...);
-}
-
-template <typename... Args>
-static void ggml_vk_get_rows_q4_1(Args&&... args) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_1_comp_spv,
- kp::shader_data::op_getrows_q4_1_comp_spv_len);
-
- ggml_vk_get_rows(spirv, "q4_1", 1/*We access blocks unaligned*/, QK4_1, std::forward<Args>(args)...);
-}
-
-template <typename... Args>
-static void ggml_vk_get_rows_q6_k(Args&&... args) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q6_k_comp_spv,
- kp::shader_data::op_getrows_q6_k_comp_spv_len);
- ggml_vk_get_rows(spirv, "q6_k", 1/*We access blocks unaligned*/, QK_NL, std::forward<Args>(args)...);
-}
-
-static void ggml_vk_rope(
- kp::Sequence& seq,
- const std::shared_ptr<kp::Tensor>& inA,
- const std::shared_ptr<kp::Tensor>& inB,
- const std::shared_ptr<kp::Tensor>& inC,
- const std::shared_ptr<kp::Tensor>& out,
- uint32_t inAOff, uint32_t inBOff, uint32_t inCOff, uint32_t outOff,
- ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_ctx_orig,
- float freq_base, float freq_scale, bool has_freq_factors, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
- int32_t ne01, int32_t ne02, int32_t ne03,
- uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
- int32_t ne0,
- uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3
-) {
- GGML_ASSERT(src0t == GGML_TYPE_F16 || src0t == GGML_TYPE_F32);
-
- static const auto spirv_norm_f16 = getSpirvShader(
- kp::shader_data::op_rope_norm_f16_comp_spv, kp::shader_data::op_rope_norm_f16_comp_spv_len
- );
- static const auto spirv_norm_f32 = getSpirvShader(
- kp::shader_data::op_rope_norm_f32_comp_spv, kp::shader_data::op_rope_norm_f32_comp_spv_len
- );
- static const auto spirv_neox_f16 = getSpirvShader(
- kp::shader_data::op_rope_neox_f16_comp_spv, kp::shader_data::op_rope_neox_f16_comp_spv_len
- );
- static const auto spirv_neox_f32 = getSpirvShader(
- kp::shader_data::op_rope_neox_f32_comp_spv, kp::shader_data::op_rope_neox_f32_comp_spv_len
- );
-
- int type_size = src0t == GGML_TYPE_F16 ? 2 : 4;
-
- GGML_ASSERT(nb03 % type_size == 0);
- GGML_ASSERT(nb02 % type_size == 0);
- GGML_ASSERT(nb01 % type_size == 0);
- GGML_ASSERT(nb00 % type_size == 0);
- GGML_ASSERT(nb3 % type_size == 0);
- GGML_ASSERT(nb2 % type_size == 0);
- GGML_ASSERT(nb1 % type_size == 0);
- GGML_ASSERT(nb0 % type_size == 0);
-
- struct PushConstants {
- uint32_t inAOff, inBOff, inCOff, outOff;
- int32_t n_dims, mode, n_ctx_orig;
- float freq_base, freq_scale;
- bool has_freq_factors;
- float ext_factor, attn_factor, beta_fast, beta_slow;
- uint32_t nb00, nb01, nb02, nb03;
- int32_t ne0;
- uint32_t nb0, nb1, nb2, nb3;
- } pushConsts {
- safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(inCOff, type_size), safe_divide(outOff, type_size),
- n_dims, mode, n_ctx_orig,
- freq_base, freq_scale,
- has_freq_factors,
- ext_factor, attn_factor, beta_fast, beta_slow,
- nb00, nb01, nb02, nb03,
- ne0,
- nb0, nb1, nb2, nb3
- };
-
- auto & inC_ = inC ? inC : inA;
- const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
- const bool is_f16 = src0t == GGML_TYPE_F16;
-
- auto name = std::string(__func__) + (is_neox ? "_neox" : "_norm") + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32");
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
- if (!komputeManager()->hasAlgorithm(name)) {
- auto & spirv = is_neox ? is_f16 ? spirv_neox_f16 : spirv_neox_f32 : is_f16 ? spirv_norm_f16 : spirv_norm_f32;
- s_algo = komputeManager()->algorithm<float, PushConstants>(
- name, s_kompute_context->pool.get(), {inA, inB, inC_, out}, spirv,
- {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}
- );
- } else {
- s_algo = komputeManager()->getAlgorithm(name);
- s_algo->setTensors({inA, inB, inC_, out});
- s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
- s_algo->setPushConstants<PushConstants>({pushConsts});
- s_algo->updateDescriptors(s_kompute_context->pool.get());
- }
- seq.record<kp::OpAlgoDispatch>(s_algo);
-}
-
-static void ggml_vk_cpy(
- const std::vector<uint32_t>& spirv,
- uint32_t in_element_size, uint32_t out_element_size,
- kp::Sequence& seq,
- const std::shared_ptr<kp::Tensor>& in,
- const std::shared_ptr<kp::Tensor>& out,
- uint32_t inOff, uint32_t outOff,
- int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03,
- uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
- int32_t ne0, int32_t ne1, int32_t ne2,
- uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3
-) {
- struct PushConstants {
- uint32_t inOff, outOff;
- int32_t ne00, ne01, ne02;
- uint32_t nb00, nb01, nb02, nb03;
- int32_t ne0, ne1, ne2;
- uint32_t nb0, nb1, nb2, nb3;
- } pushConsts {
- safe_divide(inOff, in_element_size), safe_divide(outOff, out_element_size),
- ne00, ne01, ne02,
- nb00, nb01, nb02, nb03,
- ne0, ne1, ne2,
- nb0, nb1, nb2, nb3
- };
-
- std::string name = std::string(__func__)
- + "_i_" + std::to_string(in_element_size)
- + "_o_" + std::to_string(out_element_size);
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
- if (!komputeManager()->hasAlgorithm(name))
- s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
- else {
- s_algo = komputeManager()->getAlgorithm(name);
- s_algo->setTensors({in, out});
- s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
- s_algo->setPushConstants<PushConstants>({pushConsts});
- s_algo->updateDescriptors(s_kompute_context->pool.get());
- }
- seq.record<kp::OpAlgoDispatch>(s_algo);
-}
-
-template <typename... Args>
-static void ggml_vk_cpy_f32_f16(Args&&... args) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f16_comp_spv,
- kp::shader_data::op_cpy_f32_f16_comp_spv_len);
- ggml_vk_cpy(spirv, 4, 2, std::forward<Args>(args)...);
-}
-
-template <typename... Args>
-static void ggml_vk_cpy_f32_f32(Args&&... args) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f32_comp_spv,
- kp::shader_data::op_cpy_f32_f32_comp_spv_len);
- ggml_vk_cpy(spirv, 4, 4, std::forward<Args>(args)...);
-}
-
-template <typename... Args>
-static void ggml_vk_cpy_f16_f16(Args&&... args) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f16_comp_spv,
- kp::shader_data::op_cpy_f16_f16_comp_spv_len);
- ggml_vk_cpy(spirv, 2, 2, std::forward<Args>(args)...);
-}
-
-template <typename... Args>
-static void ggml_vk_cpy_f16_f32(Args&&... args) {
- const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f32_comp_spv,
- kp::shader_data::op_cpy_f16_f32_comp_spv_len);
- ggml_vk_cpy(spirv, 2, 4, std::forward<Args>(args)...);
-}
-
-static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
- int64_t n = ggml_nelements(op);
- switch (op->op) {
- case GGML_OP_UNARY:
- if (n % 4 != 0) return false;
- switch (ggml_get_unary_op(op)) {
- case GGML_UNARY_OP_GELU:
- if (n % 8 != 0) return false;
- // fall through
- case GGML_UNARY_OP_RELU:
- case GGML_UNARY_OP_SILU:
- return ggml_is_contiguous(op->src[0]);
- default:
- ;
- }
- break;
- case GGML_OP_NONE:
- case GGML_OP_RESHAPE:
- case GGML_OP_VIEW:
- case GGML_OP_TRANSPOSE:
- case GGML_OP_PERMUTE:
- case GGML_OP_ADD:
- case GGML_OP_MUL:
- case GGML_OP_SCALE:
- case GGML_OP_SOFT_MAX:
- case GGML_OP_RMS_NORM:
- case GGML_OP_NORM:
- return true;
- case GGML_OP_ROPE:
- {
- const int mode = ((const int32_t *) op->op_params)[2];
- if (mode & GGML_ROPE_TYPE_MROPE) {
- return false;
- }
- if (mode & GGML_ROPE_TYPE_VISION) {
- return false;
- }
- return true;
- }
- case GGML_OP_DUP:
- case GGML_OP_CPY:
- case GGML_OP_CONT:
- switch (op->src[0]->type) {
- case GGML_TYPE_F32:
- case GGML_TYPE_F16:
- break;
- default:
- return false;
- }
- switch (op->type) {
- case GGML_TYPE_F32:
- case GGML_TYPE_F16:
- break;
- default:
- return false;
- }
- return true;
- case GGML_OP_DIAG_MASK_INF:
- return op->ne[3] == 1;
- case GGML_OP_GET_ROWS:
- switch (op->src[0]->type) {
- case GGML_TYPE_F32:
- case GGML_TYPE_F16:
- case GGML_TYPE_Q4_0:
- case GGML_TYPE_Q4_1:
- case GGML_TYPE_Q6_K:
- return op->ne[2] == 1 && op->ne[3] == 1;
- default:
- ;
- }
- return false;
- case GGML_OP_MUL_MAT:
- if (op->src[1]->type != GGML_TYPE_F32 || ggml_is_transposed(op->src[0]) || ggml_is_transposed(op->src[1]))
- return false;
-
- switch (op->src[0]->type) {
- case GGML_TYPE_F32:
- return op->ne[3] == 1;
- case GGML_TYPE_Q6_K:
- case GGML_TYPE_F16:
- case GGML_TYPE_Q8_0:
- case GGML_TYPE_Q4_0:
- case GGML_TYPE_Q4_1:
- case GGML_TYPE_Q4_K:
- return true;
- default:
- ;
- }
- default:
- ;
- }
- return false;
-
- GGML_UNUSED(dev);
-}
-
-static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) {
- const int n_seq = 8;
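-    // the graph nodes are split across n_seq command sequences; each sequence is
-    // recorded and submitted asynchronously, then all of them are awaited at the end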
-
-    // FIXME: the descriptor pool is currently sized to the number of graph nodes;
-    // figure out whether it can be made smaller
- ggml_vk_allocate_descriptor_pool(ctx, gf->n_nodes);
-
- std::vector<std::shared_ptr<kp::Sequence>> sequences(n_seq);
-
- for (auto& sequence : sequences) {
- sequence = komputeManager()->sequence();
- }
- for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) {
- const int n_nodes_per_seq = (gf->n_nodes + n_seq - 1) / n_seq;
-
- auto& seq = *sequences[seq_idx];
-
- const int node_start = (seq_idx + 0) * n_nodes_per_seq;
- const int node_end = std::min((seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq, gf->n_nodes);
-
- bool any_commands_recorded = false;
-
- for (int i = node_start; i < node_end; ++i) {
- struct ggml_tensor * src0 = gf->nodes[i]->src[0];
- struct ggml_tensor * src1 = gf->nodes[i]->src[1];
- struct ggml_tensor * src2 = gf->nodes[i]->src[2]; GGML_UNUSED(src2);
- struct ggml_tensor * dst = gf->nodes[i];
- GGML_ASSERT(dst->data != nullptr);
-
- if (ggml_is_empty(dst)) {
- continue;
- }
-
- switch (dst->op) {
- case GGML_OP_NONE:
- case GGML_OP_RESHAPE:
- case GGML_OP_VIEW:
- case GGML_OP_TRANSPOSE:
- case GGML_OP_PERMUTE:
- continue; // noop -> next node
- default:
- break;
- }
-
- any_commands_recorded = true;
-
- const int32_t ne00 = src0 ? src0->ne[0] : 0;
- const int32_t ne01 = src0 ? src0->ne[1] : 0;
- const int32_t ne02 = src0 ? src0->ne[2] : 0;
- const int32_t ne03 = src0 ? src0->ne[3] : 0;
-
- const uint32_t nb00 = src0 ? src0->nb[0] : 0;
- const uint32_t nb01 = src0 ? src0->nb[1] : 0;
- const uint32_t nb02 = src0 ? src0->nb[2] : 0;
- const uint32_t nb03 = src0 ? src0->nb[3] : 0;
-
- const int32_t ne10 = src1 ? src1->ne[0] : 0;
- const int32_t ne11 = src1 ? src1->ne[1] : 0;
- const int32_t ne12 = src1 ? src1->ne[2] : 0;
- const int32_t ne13 = src1 ? src1->ne[3] : 0;
-
- const uint32_t nb10 = src1 ? src1->nb[0] : 0;
- const uint32_t nb11 = src1 ? src1->nb[1] : 0;
- const uint32_t nb12 = src1 ? src1->nb[2] : 0;
- const uint32_t nb13 = src1 ? src1->nb[3] : 0;
-
- const int32_t ne0 = dst ? dst->ne[0] : 0;
- const int32_t ne1 = dst ? dst->ne[1] : 0;
- const int32_t ne2 = dst ? dst->ne[2] : 0;
-// const int32_t ne3 = dst ? dst->ne[3] : 0;
-
- const uint32_t nb0 = dst ? dst->nb[0] : 0;
- const uint32_t nb1 = dst ? dst->nb[1] : 0;
- const uint32_t nb2 = dst ? dst->nb[2] : 0;
- const uint32_t nb3 = dst ? dst->nb[3] : 0;
-
- const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
- const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
- const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
-
- const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
- uint32_t off_src0 = 0;
- uint32_t off_src1 = 0;
- uint32_t off_src2 = 0;
- uint32_t off_dst = 0;
- const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor;
- const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor;
- const std::shared_ptr<kp::Tensor>& id_src2 = src2 ? ggml_vk_get_tensor(src2, &off_src2) : nullTensor;
- const std::shared_ptr<kp::Tensor>& id_dst = dst ? ggml_vk_get_tensor(dst, &off_dst) : nullTensor;
-
- switch (dst->op) {
- case GGML_OP_ADD:
- {
- if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
- // src1 is a row
- ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00);
- } else {
- ggml_vk_add(
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
- ne00, ne01, ne02, ne03,
- nb00, nb01, nb02, nb03,
- ne10, ne11, ne12, ne13,
- nb10, nb11, nb12, nb13,
- ne0,
- nb0, nb1, nb2, nb3
- );
- }
- } break;
- case GGML_OP_MUL:
- {
- ggml_vk_mul(
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
- ne00, ne01, ne02, ne03,
- nb00, nb01, nb02, nb03,
- ne10, ne11, ne12, ne13,
- nb10, nb11, nb12, nb13,
- ne0,
- nb0, nb1, nb2, nb3
- );
- } break;
- case GGML_OP_SCALE:
- {
- float scale; memcpy(&scale, dst->op_params, sizeof(float));
-
- ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst), scale);
- } break;
- case GGML_OP_UNARY:
- {
- int64_t n = ggml_nelements(dst);
- GGML_ASSERT(n % 4 == 0);
- switch (ggml_get_unary_op(gf->nodes[i])) {
- case GGML_UNARY_OP_SILU:
- {
- ggml_vk_silu(seq, id_src0, id_dst, off_src0, off_dst, n/4);
- } break;
- case GGML_UNARY_OP_RELU:
- {
- ggml_vk_relu(seq, id_src0, id_dst, off_src0, off_dst, n/4);
- } break;
- case GGML_UNARY_OP_GELU:
- {
- GGML_ASSERT(n % 8 == 0);
- ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, n/8);
- } break;
- default:
- {
- fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
- GGML_ABORT("fatal error");
- }
- }
- } break;
- case GGML_OP_SOFT_MAX:
- {
- float scale;
- float max_bias;
-
- memcpy(&scale, (float *)dst->op_params + 0, sizeof(float));
- memcpy(&max_bias, (float *)dst->op_params + 1, sizeof(float));
-
-#pragma message("TODO: add ggml_vk_soft_max() F16 src1 support")
-#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
- GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
-
- const int64_t nrows_x = ggml_nrows(src0);
- const int64_t nrows_y = src0->ne[1];
-
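-                            // ALiBi slope parameters: m0 and m1 are the slope bases for heads
-                            // below and above n_head_log2, derived from max_bias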
- const uint32_t n_head = nrows_x/nrows_y;
- const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
-
- const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
- const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
-
- ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale, max_bias, m0, m1, n_head_log2);
- } break;
- case GGML_OP_DIAG_MASK_INF:
- {
- const int n_past = ((int32_t *)(dst->op_params))[0];
- ggml_vk_diag_mask_inf(seq, id_src0, id_dst, off_src0, off_dst, n_past, ne00, ne01, ne02);
- } break;
- case GGML_OP_NORM:
- {
- float eps;
- memcpy(&eps, dst->op_params, sizeof(float));
- ggml_vk_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps);
- } break;
- case GGML_OP_RMS_NORM:
- {
- GGML_ASSERT(ne00 % 4 == 0);
-
- float eps;
- memcpy(&eps, dst->op_params, sizeof(float));
- ggml_vk_rms_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps);
- } break;
- case GGML_OP_MUL_MAT:
- {
- GGML_ASSERT(ne00 == ne10);
-
- GGML_ASSERT(ne12 % ne02 == 0);
- GGML_ASSERT(ne13 % ne03 == 0);
-
- const uint32_t r2 = ne12/ne02;
- const uint32_t r3 = ne13/ne03;
-
- if (src1t != GGML_TYPE_F32) {
- fprintf(stderr, "%s: %s: Unsupported src1 type: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
- goto not_implemented;
- }
-
- if (ggml_is_transposed(src0) ||
- ggml_is_transposed(src1)) {
-                                fprintf(stderr, "%s: %s: matmul on transposed tensor not supported: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
- goto not_implemented;
- }
-
- switch (src0t) {
- case GGML_TYPE_F32:
- ggml_vk_mul_mat_mat_f32(
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
- ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, nb1, nb2
- );
- break;
- case GGML_TYPE_F16:
- ggml_vk_mul_mat_f16(
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
- ne00, ne01, ne02, nb00, nb01, nb02, nb03,
- ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13,
- ne0, ne1, r2, r3
- );
- break;
- case GGML_TYPE_Q8_0:
- ggml_vk_mul_mat_q8_0(
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
- ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
- nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
- );
- break;
- case GGML_TYPE_Q4_0:
- ggml_vk_mul_mat_q4_0(
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
- ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
- nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
- );
- break;
- case GGML_TYPE_Q4_1:
- ggml_vk_mul_mat_q4_1(
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
- ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
- nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
- );
- break;
- case GGML_TYPE_Q4_K:
- ggml_vk_mul_mat_q4_k(
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
- ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
- nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
- );
- break;
- case GGML_TYPE_Q6_K:
- ggml_vk_mul_mat_q6_k(
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
- ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
- nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
- );
- break;
- default: {
- fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
- goto not_implemented;
- }
- }
-
- } break;
- case GGML_OP_GET_ROWS:
- {
- if (src0t == GGML_TYPE_F32) {
- ggml_vk_get_rows_f32(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
- } else if (src0t == GGML_TYPE_F16) {
- ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
- } else if (src0t == GGML_TYPE_Q4_0) {
- ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
- } else if (src0t == GGML_TYPE_Q4_1) {
- ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
- } else if (src0t == GGML_TYPE_Q6_K) {
- ggml_vk_get_rows_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
- } else {
- fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t);
- goto not_implemented;
- }
- } break;
- case GGML_OP_ROPE:
- {
- GGML_ASSERT(ne10 == ne02);
- GGML_ASSERT(src0t == dstt);
- // const int n_past = ((int32_t *) dst->op_params)[0];
- const int n_dims = ((int32_t *) dst->op_params)[1];
- const int mode = ((int32_t *) dst->op_params)[2];
-                        // op_params[3] (n_ctx, used by GLM RoPE) is skipped; GLM RoPE is not implemented in this backend
- const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
-
- const bool has_freq_factors = dst->src[2] != nullptr;
-
- float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
- memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
- memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
- memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
- memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
- memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
- memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
- ggml_vk_rope(
- seq, id_src0, id_src1, id_src2, id_dst, off_src0, off_src1, off_src2, off_dst, src0t, n_dims, mode, n_ctx_orig,
- freq_base, freq_scale, has_freq_factors, ext_factor, attn_factor, beta_fast, beta_slow,
- ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3
- );
- } break;
- case GGML_OP_DUP:
- case GGML_OP_CPY:
- case GGML_OP_CONT:
- {
- switch (src0t) {
- case GGML_TYPE_F32:
- {
- switch (dstt) {
- case GGML_TYPE_F16: ggml_vk_cpy_f32_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
- case GGML_TYPE_F32: ggml_vk_cpy_f32_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
- default: goto not_implemented;
- }
- } break;
-                        case GGML_TYPE_F16:
-                            {
-                                switch (dstt) {
-                                    case GGML_TYPE_F16: ggml_vk_cpy_f16_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
-                                    case GGML_TYPE_F32: ggml_vk_cpy_f16_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
-                                    default: goto not_implemented;
-                                }
-                            } break;
-                        default: goto not_implemented;
-                    }
-                } break;
- default: goto not_implemented;
- }
- continue;
- not_implemented: {}
- fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
- //GGML_ABORT("fatal error");
- }
-
- // Evaluate sequence
- if (any_commands_recorded) {
- seq.evalAsync();
- }
- }
-
- // Wait for all sequences to finish
- for (auto& sequence : sequences) {
- if (sequence->isRunning())
- sequence->evalAwait();
- }
-
- ggml_vk_free_descriptor_pool(ctx);
-}
-
-template<>
-kp::Tensor::TensorDataTypes
-kp::TensorT<half>::dataType()
-{
- return TensorDataTypes::eFloat;
-}
-
-template<>
-kp::Tensor::TensorDataTypes
-kp::TensorT<uint8_t>::dataType()
-{
- return TensorDataTypes::eUnsignedInt;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-// backend interface
-
-struct ggml_backend_kompute_buffer_type_context {
- int device;
- int device_ref = 0;
- uint64_t buffer_alignment;
- uint64_t max_alloc;
- std::string name;
-
- ggml_backend_kompute_buffer_type_context(int device, uint64_t buffer_alignment, uint64_t max_alloc)
- : device(device), buffer_alignment(buffer_alignment), max_alloc(max_alloc), name(ggml_kompute_format_name(device)) {}
-};
-
-static void ggml_backend_kompute_device_ref(ggml_backend_buffer_type_t buft) {
- auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
-
- if (!ctx->device_ref) {
- komputeManager()->initializeDevice(
- ctx->device, {}, {
- "VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
- "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"
- }
- );
- }
-
- assert(ggml_vk_has_device());
- ctx->device_ref++;
-}
-
-static void ggml_backend_kompute_device_unref(ggml_backend_buffer_type_t buft) {
- auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
-
- assert(ctx->device_ref > 0);
-
- ctx->device_ref--;
-
- if (!ctx->device_ref) {
- komputeManager.destroy();
- }
-}
-
-static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
- auto * memory = (ggml_vk_memory *)buffer->context;
- if (ggml_vk_has_device()) {
- ggml_vk_free_memory(*memory);
- }
- delete memory;
-}
-
-static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
- return ((ggml_vk_memory *)buffer->context)->data;
-}
-
-static void ggml_backend_kompute_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
- GGML_UNUSED(buffer);
-
- const auto res = ggml_vk_get_tensor(tensor);
- GGML_ASSERT(res);
-
- memcpy((char *)tensor->data + offset, data, size);
-
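-    // the data was written to the host-visible copy above; sync it to the device buffer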
- komputeManager()->sequence()->eval<kp::OpTensorSyncDevice>({res});
-}
-
-static void ggml_backend_kompute_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
- GGML_UNUSED(buffer);
-
- const auto res = ggml_vk_get_tensor(tensor);
- GGML_ASSERT(res);
-
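-    // sync the device buffer into the host-visible copy before reading it back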
- komputeManager()->sequence()->eval<kp::OpTensorSyncLocal>({res});
-
- memcpy(data, (const char *)tensor->data + offset, size);
-}
-
-static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
- auto * memory = (ggml_vk_memory *)buffer->context;
- memset(memory->data, value, buffer->size);
-
- if (memory->stagingBuffer)
- komputeManager()->sequence()->eval<kp::OpBufferSyncDevice>(memory->primaryBuffer, memory->stagingBuffer, memory->size);
-}
-
-static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
- /* .free_buffer = */ ggml_backend_kompute_buffer_free_buffer,
- /* .get_base = */ ggml_backend_kompute_buffer_get_base,
- /* .init_tensor = */ NULL,
- /* .memset_tensor = */ NULL,
- /* .set_tensor = */ ggml_backend_kompute_buffer_set_tensor,
- /* .get_tensor = */ ggml_backend_kompute_buffer_get_tensor,
- /* .cpy_tensor = */ NULL,
- /* .clear = */ ggml_backend_kompute_buffer_clear,
- /* .reset = */ NULL,
-};
-
-// default buffer type
-
-static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
- auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
- return ctx->name.c_str();
-}
-
-static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
- ggml_backend_kompute_device_ref(buft);
- auto * ctx = new ggml_vk_memory(ggml_vk_allocate(size));
- return ggml_backend_buffer_init(buft, ggml_backend_kompute_buffer_i, ctx, size);
-}
-
-static size_t ggml_backend_kompute_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
- auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
- return ctx->buffer_alignment;
-}
-
-static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
- auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
- return ctx->max_alloc;
-}
-
-static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
- /* .get_name = */ ggml_backend_kompute_buffer_type_get_name,
- /* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer,
- /* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment,
- /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
- /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
- /* .is_host = */ NULL,
-};
-
-ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) {
- static std::mutex mutex;
- std::lock_guard<std::mutex> lock(mutex);
-
- auto devices = ggml_vk_available_devices();
- int32_t device_count = (int32_t) devices.size();
- GGML_ASSERT(device < device_count);
- GGML_ASSERT(devices.size() <= GGML_KOMPUTE_MAX_DEVICES);
-
- static ggml_backend_buffer_type
- ggml_backend_kompute_buffer_types[GGML_KOMPUTE_MAX_DEVICES];
-
- static bool ggml_backend_kompute_buffer_type_initialized = false;
-
- if (!ggml_backend_kompute_buffer_type_initialized) {
- for (int32_t i = 0; i < device_count; i++) {
- ggml_backend_kompute_buffer_types[i] = {
- /* .iface = */ ggml_backend_kompute_buffer_type_interface,
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_kompute_reg(), i),
- /* .context = */ new ggml_backend_kompute_buffer_type_context{ i, devices[i].bufferAlignment, devices[i].maxAlloc },
- };
- }
- ggml_backend_kompute_buffer_type_initialized = true;
- }
-
- return &ggml_backend_kompute_buffer_types[device];
-}
-
-// backend
-
-static const char * ggml_backend_kompute_name(ggml_backend_t backend) {
- auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
- return ctx->name.c_str();
-}
-
-static void ggml_backend_kompute_free(ggml_backend_t backend) {
- auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
-
- assert(ctx == s_kompute_context);
- s_kompute_context = nullptr;
- if (ctx != nullptr) {
- delete ctx;
- }
-
- delete backend;
-}
-
-static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
- auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
- ggml_vk_graph_compute(ctx, cgraph);
- return GGML_STATUS_SUCCESS;
-}
-
-static struct ggml_backend_i kompute_backend_i = {
- /* .get_name = */ ggml_backend_kompute_name,
- /* .free = */ ggml_backend_kompute_free,
- /* .set_tensor_async = */ NULL,
- /* .get_tensor_async = */ NULL,
- /* .cpy_tensor_async = */ NULL,
- /* .synchronize = */ NULL,
- /* .graph_plan_create = */ NULL,
- /* .graph_plan_free = */ NULL,
- /* .graph_plan_update = */ NULL,
- /* .graph_plan_compute = */ NULL,
- /* .graph_compute = */ ggml_backend_kompute_graph_compute,
- /* .event_record = */ NULL,
- /* .event_wait = */ NULL,
-};
-
-static ggml_guid_t ggml_backend_kompute_guid() {
- static ggml_guid guid = { 0x7b, 0x57, 0xdc, 0xaf, 0xde, 0x12, 0x1d, 0x49, 0xfb, 0x35, 0xfa, 0x9b, 0x18, 0x31, 0x1d, 0xca };
- return &guid;
-}
-
-ggml_backend_t ggml_backend_kompute_init(int device) {
- GGML_ASSERT(s_kompute_context == nullptr);
- s_kompute_context = new ggml_kompute_context(device);
-
- ggml_backend_t kompute_backend = new ggml_backend {
- /* .guid = */ ggml_backend_kompute_guid(),
- /* .interface = */ kompute_backend_i,
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_kompute_reg(), device),
- /* .context = */ s_kompute_context,
- };
-
- return kompute_backend;
-}
-
-bool ggml_backend_is_kompute(ggml_backend_t backend) {
- return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_kompute_guid());
-}
-
-static size_t ggml_backend_kompute_get_device_count() {
- auto devices = ggml_vk_available_devices();
- return devices.size();
-}
-
-static void ggml_backend_kompute_get_device_description(int device, char * description, size_t description_size) {
- auto devices = ggml_vk_available_devices();
- GGML_ASSERT((size_t) device < devices.size());
- snprintf(description, description_size, "%s", devices[device].name);
-}
-
-static void ggml_backend_kompute_get_device_memory(int device, size_t * free, size_t * total) {
- auto devices = ggml_vk_available_devices();
- GGML_ASSERT((size_t) device < devices.size());
- *total = devices[device].heapSize;
- *free = devices[device].heapSize;
-}
-
-//////////////////////////
-
-struct ggml_backend_kompute_device_context {
- int device;
- std::string name;
- std::string description;
-};
-
-static const char * ggml_backend_kompute_device_get_name(ggml_backend_dev_t dev) {
- ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
- return ctx->name.c_str();
-}
-
-static const char * ggml_backend_kompute_device_get_description(ggml_backend_dev_t dev) {
- ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
- return ctx->description.c_str();
-}
-
-static void ggml_backend_kompute_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
- ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
- ggml_backend_kompute_get_device_memory(ctx->device, free, total);
-}
-
-static ggml_backend_buffer_type_t ggml_backend_kompute_device_get_buffer_type(ggml_backend_dev_t dev) {
- ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
- return ggml_backend_kompute_buffer_type(ctx->device);
-}
-
-static bool ggml_backend_kompute_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
- if (buft->iface.get_name != ggml_backend_kompute_buffer_type_get_name) {
- return false;
- }
-
- ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
- ggml_backend_kompute_buffer_type_context * buft_ctx = (ggml_backend_kompute_buffer_type_context *)buft->context;
-
- return buft_ctx->device == ctx->device;
-}
-
-static enum ggml_backend_dev_type ggml_backend_kompute_device_get_type(ggml_backend_dev_t dev) {
- GGML_UNUSED(dev);
- return GGML_BACKEND_DEVICE_TYPE_GPU;
-}
-
-static void ggml_backend_kompute_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
- props->name = ggml_backend_kompute_device_get_name(dev);
- props->description = ggml_backend_kompute_device_get_description(dev);
- props->type = ggml_backend_kompute_device_get_type(dev);
- ggml_backend_kompute_device_get_memory(dev, &props->memory_free, &props->memory_total);
- props->caps = {
-        /* .async                 = */ false,
-        /* .host_buffer           = */ false,
-        /* .buffer_from_host_ptr  = */ false,
-        /* .events                = */ false,
- };
-}
-
-static ggml_backend_t ggml_backend_kompute_device_init(ggml_backend_dev_t dev, const char * params) {
- GGML_UNUSED(params);
- ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
- return ggml_backend_kompute_init(ctx->device);
-}
-
-static bool ggml_backend_kompute_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
- const int min_batch_size = 32;
-
- return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
- (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
-
- GGML_UNUSED(dev);
-}
-
-static const struct ggml_backend_device_i ggml_backend_kompute_device_i = {
- /* .get_name = */ ggml_backend_kompute_device_get_name,
- /* .get_description = */ ggml_backend_kompute_device_get_description,
- /* .get_memory = */ ggml_backend_kompute_device_get_memory,
- /* .get_type = */ ggml_backend_kompute_device_get_type,
- /* .get_props = */ ggml_backend_kompute_device_get_props,
- /* .init_backend = */ ggml_backend_kompute_device_init,
- /* .get_buffer_type = */ ggml_backend_kompute_device_get_buffer_type,
- /* .get_host_buffer_type = */ NULL,
- /* .buffer_from_host_ptr = */ NULL,
- /* .supports_op = */ ggml_backend_kompute_device_supports_op,
- /* .supports_buft = */ ggml_backend_kompute_device_supports_buft,
- /* .offload_op = */ ggml_backend_kompute_device_offload_op,
- /* .event_new = */ NULL,
- /* .event_free = */ NULL,
- /* .event_synchronize = */ NULL,
-};
-
-static const char * ggml_backend_kompute_reg_get_name(ggml_backend_reg_t reg) {
- GGML_UNUSED(reg);
- return "Kompute";
-}
-
-static size_t ggml_backend_kompute_reg_get_device_count(ggml_backend_reg_t reg) {
- GGML_UNUSED(reg);
- return ggml_backend_kompute_get_device_count();
-}
-
-static ggml_backend_dev_t ggml_backend_kompute_reg_get_device(ggml_backend_reg_t reg, size_t device) {
- static std::vector<ggml_backend_dev_t> devices;
-
- static bool initialized = false;
-
- {
- static std::mutex mutex;
- std::lock_guard<std::mutex> lock(mutex);
- if (!initialized) {
- for (size_t i = 0; i < ggml_backend_kompute_get_device_count(); i++) {
- ggml_backend_kompute_device_context * ctx = new ggml_backend_kompute_device_context;
- char desc[256];
- ggml_backend_kompute_get_device_description(i, desc, sizeof(desc));
- ctx->device = i;
- ctx->name = "Kompute" + std::to_string(i);
- ctx->description = desc;
- devices.push_back(new ggml_backend_device {
- /* .iface = */ ggml_backend_kompute_device_i,
- /* .reg = */ reg,
- /* .context = */ ctx,
- });
- }
- initialized = true;
- }
- }
-
- GGML_ASSERT(device < devices.size());
- return devices[device];
-}
-
-static const struct ggml_backend_reg_i ggml_backend_kompute_reg_i = {
- /* .get_name = */ ggml_backend_kompute_reg_get_name,
- /* .get_device_count = */ ggml_backend_kompute_reg_get_device_count,
- /* .get_device = */ ggml_backend_kompute_reg_get_device,
- /* .get_proc_address = */ NULL,
-};
-
-ggml_backend_reg_t ggml_backend_kompute_reg() {
- static ggml_backend_reg reg = {
- /* .api_version = */ GGML_BACKEND_API_VERSION,
- /* .iface = */ ggml_backend_kompute_reg_i,
- /* .context = */ nullptr,
- };
-
-    return &reg;
-}
-
-GGML_BACKEND_DL_IMPL(ggml_backend_kompute_reg)
+++ /dev/null
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int64: require
-#extension GL_EXT_control_flow_attributes: enable
-#extension GL_KHR_shader_subgroup_arithmetic : require
-#extension GL_EXT_debug_printf : enable
-
-#define QK4_0 32
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-#define TWOPI_F 6.283185307179586f
-
-#define QK_K 256
-#define K_SCALE_SIZE 12
-
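-// helpers to read little-endian 16/32-bit values from a raw byte buffer and to
-// reinterpret them as f16/f32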
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-struct block_q4_0 {
- float16_t d;
- uint8_t qs[QK4_0 / 2];
-};
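-// dequantize one q4_0 block into a 4x4 matrix of floats: each 4-bit quant is
-// scaled by d and shifted by -8*d (md)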
-mat4 dequantize_q4_0(const block_q4_0 xb, uint il) {
- const float d1 = il != 0 ? (xb.d / 16.f) : xb.d;
- const float d2 = d1 / 256.f;
- const float md = -8.f * xb.d;
- const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F);
- const uint16_t mask1 = mask0 << 8;
-
- mat4 reg;
- for (int i=0;i<8;i++) {
- uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]);
- reg[i/2][2*(i%2)+0] = d1 * (b & mask0) + md;
- reg[i/2][2*(i%2)+1] = d2 * (b & mask1) + md;
- }
- return reg;
-}
-
-#define sizeof_block_q4_1 0x14
-struct block_q4_1 {
- float16_t d;
- float16_t m;
- uint8_t qs[QK4_1 / 2];
-};
-mat4 dequantize_q4_1(const block_q4_1 xb, uint il) {
- const float d1 = il != 0 ? (xb.d / 16.f) : xb.d;
- const float d2 = d1 / 256.f;
- const float m = xb.m;
- const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F);
- const uint16_t mask1 = mask0 << 8;
-
- mat4 reg;
- for (int i=0;i<8;i++) {
- uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]);
- reg[i/2][2*(i%2)+0] = ((b & mask0) * d1) + m;
- reg[i/2][2*(i%2)+1] = ((b & mask1) * d2) + m;
- }
- return reg;
-}
-
-#define sizeof_block_q4_k 144
-struct block_q4_k {
- float16_t d;
- float16_t dmin;
- uint8_t scales[K_SCALE_SIZE];
- uint8_t qs[QK_K/2];
-};
-
-#define sizeof_block_q6_k 210
-struct block_q6_k {
- uint8_t ql[QK_K/2]; // quants, lower 4 bits
- uint8_t qh[QK_K/4]; // quants, upper 2 bits
- int8_t scales[QK_K/16]; // scales, quantized with 8 bits
- float16_t d; // super-block scale
-};
-mat4 dequantize_q6_k(const block_q6_k xb, uint il) {
- const float16_t d_all = xb.d;
-
- const uint qlIndex = 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
- const uint qhIndex = 32*(il/8) + 16*(il&1);
- float16_t sc = xb.scales[(il%2) + 2 * ((il/2))];
- il = (il/2) & 3;
-
- const uint16_t kmask1 = il>1 ? uint16_t(il>2 ? 192 : 48) : uint16_t(il>0 ? 12 : 3);
- const uint16_t kmask2 = il>1 ? uint8_t(0xF0) : uint8_t(0x0F);
- const float16_t coef = il>1 ? float16_t(1.f/16.f) : float16_t(1.f);
- const float16_t ml = float16_t(d_all * sc * 32.f);
- const float16_t dl = float16_t(d_all * sc * coef);
- mat4 reg;
- for (int i = 0; i < 16; ++i) {
- const float16_t q = (il&1) != 0 ? ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 2))
- : ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 4));
- reg[i/4][i%4] = dl * q - ml;
- }
- return reg;
-}
-
-
-#define QK8_0 32
-// struct block_q8_0 {
-// float16_t d; // delta
-// int8_t qs[QK8_0]; // quants
-// };
-#define sizeof_block_q8_0 34
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-layout(local_size_x = 1024) in;
-
-layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
-layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
-layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
-
-layout(push_constant) uniform PushConstants {
- uint inAOff;
- uint inBOff;
- uint outOff;
- int ne00;
- int nb00;
- int nb01;
- int nb02;
- int nb03;
- int ne10;
- int ne11;
- int ne12;
- int ne13;
- int nb10;
- int nb11;
- int nb12;
- int nb13;
- int ne0;
- int nb0;
- int nb1;
- int nb2;
- int nb3;
- //int offs; // TODO: needed for GGML_OP_ACC, see metal code
-} pcs;
-
-// general-purpose kernel for addition of two tensors
-// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
-// cons: not very efficient
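-// broadcasting across dims 1-3 is implemented by taking the src1 indices modulo ne11/ne12/ne13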
-void main() {
- const uint i03 = gl_WorkGroupID.z;
- const uint i02 = gl_WorkGroupID.y;
- const uint i01 = gl_WorkGroupID.x;
-
- const uint i13 = i03 % pcs.ne13;
- const uint i12 = i02 % pcs.ne12;
- const uint i11 = i01 % pcs.ne11;
-
- int offs = 0; // TMP (see above)
-
- uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + offs) / 4);
- uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11 ) / 4);
- uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1 + offs) / 4);
-
- for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
- const uint i10 = i0 % pcs.ne10;
- out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] + inB[pcs.inBOff + src1_off + i10];
- }
-}
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-layout(local_size_x = 1) in;
-
-layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
-layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
-layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
-
-layout(push_constant) uniform PushConstants {
- uint inAOff;
- uint inBOff;
- uint outOff;
- uint row;
-} pcs;
-
-void main() {
- const uint baseIndex = gl_WorkGroupID.x * 4;
-
- for (uint x = 0; x < 4; x++) {
- const uint i = baseIndex + x;
- out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff];
- }
-}
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-#define IN_TYPE float16_t
-#define IN_TYPE_SIZE 2
-#define OUT_TYPE float16_t
-#define OUT_TYPE_SIZE 2
-
-layout(local_size_x = 1024) in;
-
-layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
-layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
-
-layout (push_constant) uniform parameter {
- uint inOff;
- uint outOff;
- int ne00;
- int ne01;
- int ne02;
- uint nb00;
- uint nb01;
- uint nb02;
- uint nb03;
- int ne0;
- int ne1;
- int ne2;
- uint nb0;
- uint nb1;
- uint nb2;
- uint nb3;
-} pcs;
-
-void main() {
- const uint i03 = gl_WorkGroupID.z;
- const uint i02 = gl_WorkGroupID.y;
- const uint i01 = gl_WorkGroupID.x;
-
- const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
-
- const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
- const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
- const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
- const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
-
- const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
-
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
- const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
- out_[dst_data+i00] = OUT_TYPE(in_[src]);
- }
-}
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-#define IN_TYPE float16_t
-#define IN_TYPE_SIZE 2
-#define OUT_TYPE float
-#define OUT_TYPE_SIZE 4
-
-layout(local_size_x = 1024) in;
-
-layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
-layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
-
-layout (push_constant) uniform parameter {
- uint inOff;
- uint outOff;
- int ne00;
- int ne01;
- int ne02;
- uint nb00;
- uint nb01;
- uint nb02;
- uint nb03;
- int ne0;
- int ne1;
- int ne2;
- uint nb0;
- uint nb1;
- uint nb2;
- uint nb3;
-} pcs;
-
-void main() {
- const uint i03 = gl_WorkGroupID.z;
- const uint i02 = gl_WorkGroupID.y;
- const uint i01 = gl_WorkGroupID.x;
-
- const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
-
- const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
- const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
- const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
- const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
-
- const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
-
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
- const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
- out_[dst_data+i00] = OUT_TYPE(in_[src]);
- }
-}
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-#define IN_TYPE float
-#define IN_TYPE_SIZE 4
-#define OUT_TYPE float16_t
-#define OUT_TYPE_SIZE 2
-
-layout(local_size_x = 1024) in;
-
-layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
-layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
-
-layout (push_constant) uniform parameter {
- uint inOff;
- uint outOff;
- int ne00;
- int ne01;
- int ne02;
- uint nb00;
- uint nb01;
- uint nb02;
- uint nb03;
- int ne0;
- int ne1;
- int ne2;
- uint nb0;
- uint nb1;
- uint nb2;
- uint nb3;
-} pcs;
-
-void main() {
- const uint i03 = gl_WorkGroupID.z;
- const uint i02 = gl_WorkGroupID.y;
- const uint i01 = gl_WorkGroupID.x;
-
- const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
-
- const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
- const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
- const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
- const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
-
- const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
-
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
- const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
- out_[dst_data+i00] = OUT_TYPE(in_[src]);
- }
-}
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-#define IN_TYPE float
-#define IN_TYPE_SIZE 4
-#define OUT_TYPE float
-#define OUT_TYPE_SIZE 4
-
-layout(local_size_x = 1024) in;
-
-layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
-layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
-
-layout (push_constant) uniform parameter {
- uint inOff;
- uint outOff;
- int ne00;
- int ne01;
- int ne02;
- uint nb00;
- uint nb01;
- uint nb02;
- uint nb03;
- int ne0;
- int ne1;
- int ne2;
- uint nb0;
- uint nb1;
- uint nb2;
- uint nb3;
-} pcs;
-
-void main() {
- const uint i03 = gl_WorkGroupID.z;
- const uint i02 = gl_WorkGroupID.y;
- const uint i01 = gl_WorkGroupID.x;
-
- const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
-
- const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
- const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
- const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
- const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
-
- const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
-
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
- const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
- out_[dst_data+i00] = OUT_TYPE(in_[src]);
- }
-}
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-layout(local_size_x = 1) in;
-
-layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
-layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
-
-layout(push_constant) uniform PushConstants {
- uint inOff;
- uint outOff;
- uint n_past;
- int ne00;
- int ne01;
-} pcs;
-
-void main() {
- const uint i02 = gl_WorkGroupID.z;
- const uint i01 = gl_WorkGroupID.y;
- const uint i00 = gl_WorkGroupID.x;
-
- const uint index = i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00;
-
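-    // positions beyond n_past + current row are masked with -infinity (0xFF800000)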
- if (i00 > pcs.n_past + i01) {
- out_[index + pcs.outOff] = uintBitsToFloat(0xFF800000);
- } else {
- out_[index + pcs.outOff] = in_[index + pcs.inOff];
- }
-}
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-layout(local_size_x = 1) in;
-
-layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
-layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
-layout(push_constant) uniform PushConstants {
- uint inOff;
- uint outOff;
-} pcs;
-
-void main() {
- const uint baseIndex = gl_WorkGroupID.x * 8;
-
- for (uint x = 0; x < 8; x++) {
- const uint i = baseIndex + x;
- const float y = in_[i + pcs.inOff];
- out_[i + pcs.outOff] = 0.5*y*(1.0 + tanh(clamp(SQRT_2_OVER_PI*y*(1.0 + GELU_COEF_A*y*y), -15.0, 15.0)));
- }
-}
+++ /dev/null
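-// shared get_rows body, included at the end of the quantized get_rows shaders:
-// for each output row i, the source row index r is read from inB and that row of
-// inA is dequantized into out_, 16 values (one mat4) at a time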
-void main() {
- const uint i = gl_WorkGroupID.x;
- const int r = inB[i + pcs.inBOff];
-
- int z = 0;
- for (uint ind = gl_LocalInvocationID.x; ind < pcs.ne00/16; ind += gl_WorkGroupSize.x) {
- const uint inIndex = (r * pcs.nb01 + pcs.inAOff) + ind/NL * SIZE_OF_BLOCK;
- const mat4 result = dequantize_block(inIndex, ind%NL);
- for (uint j = 0; j < 4; ++j) {
- for (uint k = 0; k < 4; ++k) {
- const uint outIndex = i * pcs.nb1/BYTES_FOR_TYPE + pcs.outOff + z;
- out_[outIndex] = result[j][k];
- ++z;
- }
- }
- }
-}
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-layout(local_size_x = 1) in;
-
-layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
-layout (binding = 1) readonly buffer tensorInB { int inB[]; };
-layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
-
-layout (push_constant) uniform parameter {
- uint inAOff;
- uint inBOff;
- uint outOff;
- int ne00;
- int nb01;
- int nb1;
-} pcs;
-
-void dequantize_row_f16(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
- for (int j = 0; j < k; j++) {
- out_[y + j] = inA[x + j];
- }
-}
-
-void main() {
- const uint i = gl_WorkGroupID.x;
- const int r = inB[i + pcs.inBOff];
-
- dequantize_row_f16(r*pcs.nb01/2/*bytes for float16*/ + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00);
-}
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-layout(local_size_x = 1) in;
-
-layout (binding = 0) readonly buffer tensorInA { float inA[]; };
-layout (binding = 1) readonly buffer tensorInB { int inB[]; };
-layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
-
-layout (push_constant) uniform parameter {
- uint inAOff;
- uint inBOff;
- uint outOff;
- int ne00;
- int nb01;
- int nb1;
-} pcs;
-
-void dequantize_row_f32(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
- for (int j = 0; j < k; j++) {
- out_[y + j] = inA[x + j];
- }
-}
-
-void main() {
- const uint i = gl_WorkGroupID.x;
- const int r = inB[i + pcs.inBOff];
-
- dequantize_row_f32(r*pcs.nb01/4 + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00);
-}
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-#define NL 2
-#define BYTES_FOR_TYPE 4 /*bytes for float*/
-#define SIZE_OF_BLOCK sizeof_block_q4_0
-
-layout(local_size_x = 1) in;
-
-layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
-layout (binding = 1) readonly buffer tensorInB { int inB[]; };
-layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
-
-layout (push_constant) uniform parameter {
- uint inAOff;
- uint inBOff;
- uint outOff;
- int ne00;
- int nb01;
- int nb1;
-} pcs;
-
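-// assemble a q4_0 block from the raw byte buffer at an arbitrary (possibly
-// unaligned) byte offset: 2 bytes of f16 scale d followed by QK4_0/2 quant bytes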
-block_q4_0 get_unaligned_block_q4_0(uint index) {
- block_q4_0 fres;
- fres.d = u8BufToFloat16(inA, index);
- [[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) {
- fres.qs[it] = inA[index+2+it];
- }
- return fres;
-}
-
-mat4 dequantize_block(uint index, uint il) {
- const block_q4_0 block = get_unaligned_block_q4_0(index);
- return dequantize_q4_0(block, il);
-}
-
-#include "op_getrows.comp"
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-#define NL 2
-#define BYTES_FOR_TYPE 4 /*bytes for float*/
-#define SIZE_OF_BLOCK sizeof_block_q4_1
-
-layout(local_size_x = 1) in;
-
-layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
-layout (binding = 1) readonly buffer tensorInB { int inB[]; };
-layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
-
-layout (push_constant) uniform parameter {
- uint inAOff;
- uint inBOff;
- uint outOff;
- int ne00;
- int nb01;
- int nb1;
-} pcs;
-
-block_q4_1 get_unaligned_block_q4_1(uint index) {
- block_q4_1 fres;
- fres.d = u8BufToFloat16(inA, index);
- fres.m = u8BufToFloat16(inA, index+2);
- [[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
- fres.qs[it] = inA[index+4+it];
- }
- return fres;
-}
-
-mat4 dequantize_block(uint index, uint il) {
- const block_q4_1 block = get_unaligned_block_q4_1(index);
- return dequantize_q4_1(block, il);
-}
-
-#include "op_getrows.comp"
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-#define NL 16
-#define BYTES_FOR_TYPE 4 /*bytes for float*/
-#define SIZE_OF_BLOCK sizeof_block_q6_k
-
-layout(local_size_x = 1) in;
-
-layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
-layout (binding = 1) readonly buffer tensorInB { int inB[]; };
-layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
-
-layout (push_constant) uniform parameter {
- uint inAOff;
- uint inBOff;
- uint outOff;
- int ne00;
- int nb01;
- int nb1;
-} pcs;
-
-block_q6_k get_unaligned_block_q6_k(uint index) {
- block_q6_k fres;
- [[unroll]] for (uint it = 0; it != QK_K / 2; it++) {
- fres.ql[it] = inA[index + it];
- }
- [[unroll]] for (uint it = 0; it != QK_K / 4; it++) {
- fres.qh[it] = inA[index + QK_K/2 + it];
- }
- [[unroll]] for (uint it = 0; it != QK_K / 16; it++) {
- fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]);
- }
- fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16);
- return fres;
-}
-
-mat4 dequantize_block(uint index, uint il) {
- const block_q6_k block = get_unaligned_block_q6_k(index);
- return dequantize_q6_k(block, il);
-}
-
-#include "op_getrows.comp"
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-layout(local_size_x = 1024) in;
-
-layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
-layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
-layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
-
-layout(push_constant) uniform PushConstants {
- uint inAOff;
- uint inBOff;
- uint outOff;
- int ne00;
- int nb00;
- int nb01;
- int nb02;
- int nb03;
- int ne10;
- int ne11;
- int ne12;
- int ne13;
- int nb10;
- int nb11;
- int nb12;
- int nb13;
- int ne0;
- int nb0;
- int nb1;
- int nb2;
- int nb3;
-} pcs;
-
-void main() {
- const uint i03 = gl_WorkGroupID.z;
- const uint i02 = gl_WorkGroupID.y;
- const uint i01 = gl_WorkGroupID.x;
-
- const uint i13 = i03 % pcs.ne13;
- const uint i12 = i02 % pcs.ne12;
- const uint i11 = i01 % pcs.ne11;
-
- uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01) / 4);
- uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11) / 4);
- uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1) / 4);
-
- for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
- const uint i10 = i0 % pcs.ne10;
- out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] * inB[pcs.inBOff + src1_off + i10];
- }
-}
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-#extension GL_KHR_shader_subgroup_arithmetic : require
-
-layout(local_size_x_id = 0) in;
-
-layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
-layout (binding = 1) readonly buffer tensorInB { float inB[]; };
-layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
-
-layout (push_constant) uniform parameter {
- uint inAOff;
- uint inBOff;
- uint outOff;
- int ne00;
- int ne01;
- int ne02;
- uint nb00;
- uint nb01;
- uint nb02;
- uint nb03;
- int ne10;
- int ne11;
- int ne12;
- uint nb10;
- uint nb11;
- uint nb12;
- uint nb13;
- int ne0;
- int ne1;
- uint r2;
- uint r3;
-} pcs;
-
-#define N_F16_F32 4
-
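-// each workgroup processes one row of inA (r0) against N_F16_F32 rows of inB;
-// the ne00-long dot products are reduced across the subgroup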
-void main() {
- const uint r0 = gl_WorkGroupID.x;
- const uint rb = gl_WorkGroupID.y*N_F16_F32;
- const uint im = gl_WorkGroupID.z;
-
- const uint i12 = im%pcs.ne12;
- const uint i13 = im/pcs.ne12;
-
- const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb03;
-
- const uint x = offset0 / 2 + pcs.inAOff; // Based from inA
-
- for (uint row = 0; row < N_F16_F32; ++row) {
- uint r1 = rb + row;
- if (r1 >= pcs.ne11) {
- break;
- }
-
- const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
-
- float sumf = 0;
- for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
- sumf += float(inA[x+i]) * float(inB[y+i]);
- }
-
- const float all_sum = subgroupAdd(sumf);
- if (subgroupElect()) {
- out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum;
- }
- }
-}
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-#extension GL_KHR_shader_subgroup_arithmetic : require
-#extension GL_EXT_debug_printf : enable
-
-// device subgroup size
-layout (local_size_x_id = 0) in;
-
-layout(binding = 0) readonly buffer tensorInA { float inA[]; };
-layout(binding = 1) readonly buffer tensorInB { float inB[]; };
-layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
-
-layout(push_constant) uniform parameter {
- uint inAOff;
- uint inBOff;
- uint outOff;
- int ne00;
- int ne01;
- int ne02;
- int ne11;
- int ne12;
- uint nb01;
- uint nb02;
- uint nb11;
- uint nb12;
- uint nb1;
- uint nb2;
-} pcs;
-
-
-void main() {
- uvec3 gid = gl_WorkGroupID;
-
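-    // broadcast the batch dimension: map gid.z to the matching batch index of A and B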
- uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
- uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
-
- const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 4 + pcs.inAOff; // Based from inA
- const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
- float sum = 0.0f;
- for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
- sum += float(inA[x+i]) * float(inB[y+i]);
- }
-
- const float all_sum = subgroupAdd(sum);
- if (subgroupElect()) {
- out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum;
- }
-}
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-#define BLOCKS_IN_QUANT QK4_0
-#define SIZE_OF_BLOCK sizeof_block_q4_0
-#define N_ROWS 4
-
-#include "op_mul_mv_q_n_pre.comp"
-
-// The q4_0 version of this function
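-// Returns a partial dot product of one q4_0 block with the corresponding values of inB;
-// the constant -8 offset of q4_0 is factored out via sumy and applied in the final return.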
-float block_q_n_dot_y(uint block_index, uint yb, uint il) {
- vec2 acc = vec2(0.0, 0.0);
- const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
- float d = float(u8BufToFloat16(inA, index));
- float sumy = 0.0f;
- for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
- const uint16_t b = u8BufToU16(inA, index + 2 + il + i);
-
- const float yl0 = inB[yb + i];
- const float yl1 = inB[yb + i + 1];
- const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
- const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
-
- sumy += yl0 + yl1 + yl8 + yl9;
-
- acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
- acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
- }
- return d * (sumy * -8.f + acc[0] + acc[1]);
-}
-
-#include "op_mul_mv_q_n.comp"
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-#define BLOCKS_IN_QUANT QK4_1
-#define SIZE_OF_BLOCK sizeof_block_q4_1
-#define N_ROWS 4
-
-#include "op_mul_mv_q_n_pre.comp"
-
-// The q4_1 version of this function
-float block_q_n_dot_y(uint block_index, uint yb, uint il) {
- vec2 acc = vec2(0.0, 0.0);
- const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
- float d = float(u8BufToFloat16(inA, index));
- float m = float(u8BufToFloat16(inA, index+2));
-
- float sumy = 0.0f;
- for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
- const uint16_t b = u8BufToU16(inA, index + 4 + il + i);
-
- const float yl0 = inB[yb + i];
- const float yl1 = inB[yb + i + 1];
- const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
- const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
-
- sumy += yl0 + yl1 + yl8 + yl9;
-
- acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
- acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
- }
- return d * (acc[0] + acc[1]) + sumy * m;
-}
-
-#include "op_mul_mv_q_n.comp"
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-#define N_DST 4
-#define SIZE_OF_BLOCK sizeof_block_q4_k
-
-layout(local_size_x = 4) in;
-layout(local_size_y = 8) in;
-layout(local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer tensorInA { block_q4_k inA[]; };
-layout (binding = 1) readonly buffer tensorInB { float inB[]; };
-layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
-
-layout (push_constant) uniform parameter {
- uint inAOff;
- uint inBOff;
- uint outOff;
- int ne00;
- int ne10;
- int ne0;
- int ne1;
- int ne01;
- int ne02;
- int ne12;
- uint nb01;
- uint nb02;
- uint nb03;
- uint nb11;
- uint nb12;
- uint nb13;
- uint r2;
- uint r3;
-} pcs;
-
-void main() {
- const uint16_t kmask1 = uint16_t(0x3f3f);
- const uint16_t kmask2 = uint16_t(0x0f0f);
- const uint16_t kmask3 = uint16_t(0xc0c0);
-
- const uint ix = gl_SubgroupInvocationID/8; // 0...3
- const uint it = gl_SubgroupInvocationID%8; // 0...7
- const uint iq = it/4; // 0 or 1
- const uint ir = it%4; // 0...3
-
- const uint nb = pcs.ne00/QK_K;
-
- const uint r0 = gl_WorkGroupID.x;
- const uint r1 = gl_WorkGroupID.y;
- const uint im = gl_WorkGroupID.z;
-
- const uint first_row = r0 * N_DST;
- const uint ib_row = first_row * nb;
-
- const uint i12 = im%pcs.ne12;
- const uint i13 = im/pcs.ne12;
-
- const uint offset0 = first_row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
- const uint offset1 = r1*pcs.nb11 + (i12 )*pcs.nb12 + (i13 )*pcs.nb13;
-
- const uint xblk = offset0 + pcs.inAOff;
- const uint y = (offset1 / 4) + pcs.inBOff;
-
- float yl[16];
- float yh[16];
- float sumf[N_DST] = {0.f, 0.f, 0.f, 0.f};
- float all_sum = 0.f;
-
- uint y4 = y + ix * QK_K + 64 * iq + 8 * ir;
-
- for (uint ib = ix; ib < nb; ib += 4) {
- const uint blk_idx = ib + xblk;
-
- float sumy[4] = {0.f, 0.f, 0.f, 0.f};
- for (int i = 0; i < 8; ++i) {
- yl[i+0] = inB[y4+i+ 0]; sumy[0] += yl[i+0];
- yl[i+8] = inB[y4+i+ 32]; sumy[1] += yl[i+8];
- yh[i+0] = inB[y4+i+128]; sumy[2] += yh[i+0];
- yh[i+8] = inB[y4+i+160]; sumy[3] += yh[i+8];
- }
-
- for (int row = 0; row < N_DST; row++) {
- uint row_idx = row * (pcs.nb01 / SIZE_OF_BLOCK);
-
- uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0);
- uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2);
- uint16_t sc_2 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 4);
- uint16_t sc_3 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 6);
- uint16_t sc_4 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 8);
-
- uint16_t sc16[4];
- sc16[0] = sc_0 & kmask1;
- sc16[1] = sc_2 & kmask1;
- sc16[2] = ((sc_4 >> 0) & kmask2) | ((sc_0 & kmask3) >> 2);
- sc16[3] = ((sc_4 >> 4) & kmask2) | ((sc_2 & kmask3) >> 2);
-
- float acc1[4] = {0.f, 0.f, 0.f, 0.f};
- float acc2[4] = {0.f, 0.f, 0.f, 0.f};
- for (int i = 0; i < 8; i += 2) {
- uint16_t q1 = u8BufToU16(inA[blk_idx + row_idx].qs, 32 * iq + 8 * ir + i);
- uint16_t q2 = u8BufToU16(inA[blk_idx + row_idx].qs, 64 + 32 * iq + 8 * ir + i);
- acc1[0] += yl[i+0] * (q1 & 0x000F);
- acc1[1] += yl[i+1] * (q1 & 0x0F00);
- acc1[2] += yl[i+8] * (q1 & 0x00F0);
- acc1[3] += yl[i+9] * (q1 & 0xF000);
- acc2[0] += yh[i+0] * (q2 & 0x000F);
- acc2[1] += yh[i+1] * (q2 & 0x0F00);
- acc2[2] += yh[i+8] * (q2 & 0x00F0);
- acc2[3] += yh[i+9] * (q2 & 0xF000);
- }
-
- uint8_t sc8_0 = uint8_t(sc16[0] & 0xFF);
- uint8_t sc8_1 = uint8_t(sc16[0] >> 8 );
- uint8_t sc8_2 = uint8_t(sc16[1] & 0xFF);
- uint8_t sc8_3 = uint8_t(sc16[1] >> 8 );
- uint8_t sc8_4 = uint8_t(sc16[2] & 0xFF);
- uint8_t sc8_5 = uint8_t(sc16[2] >> 8 );
- uint8_t sc8_6 = uint8_t(sc16[3] & 0xFF);
- uint8_t sc8_7 = uint8_t(sc16[3] >> 8 );
-
- float dall = float(inA[blk_idx + row_idx].d);
- float dmin = float(inA[blk_idx + row_idx].dmin);
- sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8_0 +
- (acc1[2] + 1.f/256.f * acc1[3]) * sc8_1 * 1.f/16.f +
- (acc2[0] + 1.f/256.f * acc2[1]) * sc8_4 +
- (acc2[2] + 1.f/256.f * acc2[3]) * sc8_5 * 1.f/16.f) -
- dmin * (sumy[0] * sc8_2 + sumy[1] * sc8_3 + sumy[2] * sc8_6 + sumy[3] * sc8_7);
- }
-
- y4 += 4 * QK_K;
- }
-
- for (int row = 0; row < N_DST; ++row) {
- all_sum = subgroupAdd(sumf[row]);
- if (subgroupElect()) {
- out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = all_sum;
- }
- }
-}
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-#define SIZE_OF_BLOCK sizeof_block_q6_k
-
-layout(local_size_x_id = 0) in;
-layout(local_size_y_id = 1) in;
-layout(local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
-layout (binding = 1) readonly buffer tensorInB { float inB[]; };
-layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
-
-layout (push_constant) uniform parameter {
- uint inAOff;
- uint inBOff;
- uint outOff;
- int ne00;
- int ne10;
- int ne0;
- int ne1;
- int ne01;
- int ne02;
- int ne12;
- uint nb01;
- uint nb02;
- uint nb03;
- uint nb11;
- uint nb12;
- uint nb13;
- uint r2;
- uint r3;
-} pcs;
-
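-// Mat-vec product over Q6_K super-blocks: each subgroup produces one output row;
-// lanes are grouped 16 per super-block, with gl_SubgroupSize/16 blocks in flight.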
-void main() {
- const uint8_t kmask1 = uint8_t(0x03);
- const uint8_t kmask2 = uint8_t(0x0C);
- const uint8_t kmask3 = uint8_t(0x30);
- const uint8_t kmask4 = uint8_t(0xC0);
-
- const uint nb = pcs.ne00/QK_K;
-
- const uint r0 = gl_WorkGroupID.x;
- const uint r1 = gl_WorkGroupID.y;
- const uint im = gl_WorkGroupID.z;
-
- const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID);
-
- const uint i12 = im%pcs.ne12;
- const uint i13 = im/pcs.ne12;
-
- const uint x = row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
- const uint yy = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
-
- float sumf = 0;
-
- // Decomposition of gl_SubgroupInvocationID (5 bits when gl_SubgroupSize = 32):
- //   bit  0    : ix  (which of the block_stride blocks in flight)
- //   bits 1..4 : tid (position within the super-block)
- //   within tid: bits 1..3 give il, bit 4 gives ip (first/second half of the block)
-
- const uint block_stride = gl_SubgroupSize / 16; // number of blocks each subgroup processes
- const uint tid = gl_SubgroupInvocationID/block_stride; // first block_stride groups have tid=0
- const uint ix = gl_SubgroupInvocationID%block_stride; // first block is 0..block_stride-1
- const uint ip = tid/8; // first or second half of block (0 or 1)
- const uint il = tid%8; // each half has 8 parts, one per scale
- const uint n = 4; // 4 scales at a time (and 4 sums)
- const uint l0 = n*il; // offset into half-block, 0..28
- const uint is = 8*ip + l0/16; // 0, 1, 8, 9
-
- const uint y_offset = 128*ip + l0;
- const uint q_offset_l = 64*ip + l0;
- const uint q_offset_h = 32*ip + l0;
-
- for (uint i = ix; i < nb; i += block_stride) {
-
- const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff;
-
- const uint qlIndex = q_offset_l;
- const uint q2Index = qlIndex + QK_K/8;
- const uint qhIndex = q_offset_h;
- const uint y = yy + i * QK_K + y_offset;
-
- float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f};
- for (uint l = 0; l < n; ++l) {
- const uint8_t currentQ1 = inA[baseIndex + qlIndex + l];
- const uint8_t currentQ2 = inA[baseIndex + q2Index + l];
- const uint8_t currentQh = inA[baseIndex + QK_K/2 + qhIndex + l];
-
- sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32);
- sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32);
- sums[2] += inB[y+l+64] * (int8_t((currentQ1 >> 4) | ((currentQh & kmask3) << 0)) - 32);
- sums[3] += inB[y+l+96] * (int8_t((currentQ2 >> 4) | ((currentQh & kmask4) >> 2)) - 32);
- }
-
- float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16);
- sumf += d * (sums[0] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + is]) + sums[1] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 2 + is]) + sums[2] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 4 + is]) + sums[3] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 6 + is]));
- }
-
- const float tot = subgroupAdd(sumf);
- if (subgroupElect()) {
- out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
- }
-}
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-#include "op_mul_mv_q_n_pre.comp"
-
-#define SIZE_OF_D 2
-
-#define N_DST 4 // each SIMD group works on 4 rows
-#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
-#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
-
-#define NB_Q8_0 8
-
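-// Mat-vec product over Q8_0 blocks: two subgroups per workgroup, each computing
-// N_DST rows; every lane dots NB_Q8_0 int8 quants with the corresponding y values
-// and scales the partial sum by the per-block fp16 delta d.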
-void main() {
- // NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
- if (gl_SubgroupInvocationID > 31)
- return;
-
- const int nr = N_DST;
- const int nsg = N_SIMDGROUP;
- const int nw = N_SIMDWIDTH;
-
- const int nb = pcs.ne00/QK8_0;
- const uint r0 = gl_WorkGroupID.x;
- const uint r1 = gl_WorkGroupID.y;
- const uint im = gl_WorkGroupID.z;
-
- const uint first_row = (r0 * nsg + gl_SubgroupID) * nr;
-
- const uint i12 = im%pcs.ne12;
- const uint i13 = im/pcs.ne12;
-
- const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
-
- const uint x = offset0*sizeof_block_q8_0 + pcs.inAOff; // Based from inA
- const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff; // based from inB
-
- float yl[NB_Q8_0];
- float sumf[N_DST]={0.f, 0.f, 0.f, 0.f};
-
- const uint ix = gl_SubgroupInvocationID.x/4;
- const uint il = gl_SubgroupInvocationID.x%4;
-
- uint yb = y + ix * QK8_0 + NB_Q8_0*il;
-
- // each thread in a SIMD group deals with NB_Q8_0 quants at a time
- for (uint ib = ix; ib < nb; ib += nw/4) {
- for (int i = 0; i < NB_Q8_0; ++i) {
- yl[i] = inB[yb + i];
- }
-
- for (int row = 0; row < nr; row++) {
- const uint block_offset = (ib+row*nb) * sizeof_block_q8_0;
- float sumq = 0.f;
- for (int iq = 0; iq < NB_Q8_0; ++iq) {
- const int8_t qs_iq = int8_t(inA[x + block_offset + SIZE_OF_D + NB_Q8_0*il + iq]);
- sumq += qs_iq * yl[iq];
- }
- const float16_t d = u8BufToFloat16(inA, x + block_offset);
- sumf[row] += sumq*d;
- }
-
- yb += NB_Q8_0 * nw;
- }
-
- for (int row = 0; row < nr; ++row) {
- const float tot = subgroupAdd(sumf[row]);
- if (subgroupElect() && first_row + row < pcs.ne01) {
- out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
- }
- }
-}
+++ /dev/null
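-// Shared main() body for the block-quant mat-vec shaders: each subgroup accumulates
-// block_q_n_dot_y() over the blocks of its N_ROWS output rows and reduces the
-// partial sums with subgroupAdd().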
-void main() {
- // NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
- if (gl_SubgroupInvocationID > 31)
- return;
-
- const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT);
-
- const uint r0 = gl_WorkGroupID.x;
- const uint r1 = gl_WorkGroupID.y;
- const uint im = gl_WorkGroupID.z;
-
- const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS;
-
- const uint i12 = im%pcs.ne12;
- const uint i13 = im/pcs.ne12;
-
- // pointers to src0 rows
- uint ax[N_ROWS];
- for (int row = 0; row < N_ROWS; ++row) {
- const uint offset0 = (first_row + row)*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
-
- ax[row] = offset0 + pcs.inAOff;
- }
-
- const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
-
- float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};
-
- const uint ix = gl_SubgroupInvocationID/2;
- const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2);
-
- uint yb = y + ix * BLOCKS_IN_QUANT + il;
-
- //debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n",
- // gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize,
- // gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);
-
- for (uint ib = ix; ib < nb; ib += 16) {
- for (int row = 0; row < N_ROWS; row++) {
- sumf[row] += block_q_n_dot_y(ax[row] + ib, yb, il);
- }
-
- yb += BLOCKS_IN_QUANT * 16;
- }
-
- for (int row = 0; row < N_ROWS; ++row) {
- const float tot = subgroupAdd(sumf[row]);
- if (first_row + row < pcs.ne01 && subgroupElect()) {
- out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
- }
- }
-}
+++ /dev/null
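-// Shared bindings and push constants included by the block-quant mat-vec shaders:
-// quantized matrix in inA, float vector in inB, float output in out_.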
-layout(local_size_x_id = 0) in;
-layout(local_size_y = 8) in;
-layout(local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
-layout (binding = 1) readonly buffer tensorInB { float inB[]; };
-layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
-
-layout (push_constant) uniform parameter {
- uint inAOff;
- uint inBOff;
- uint outOff;
- int ne00;
- int ne01;
- int ne02;
- int ne10;
- int ne12;
- int ne0;
- int ne1;
- uint nb01;
- uint nb02;
- uint nb03;
- uint nb11;
- uint nb12;
- uint nb13;
- uint r2;
- uint r3;
-} pcs;
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-layout(local_size_x = 256) in;
-
-layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
-layout(binding = 1) buffer restrict tensorOut { float out_[]; };
-
-layout(push_constant) uniform PushConstants {
- uint inOff;
- uint outOff;
- uint ne00;
- uint nb01;
- float eps;
-} pcs;
-
-shared float sum[gl_WorkGroupSize.x];
-
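-// Layer norm (no affine weights): one workgroup per row computes the mean,
-// recenters the row, computes the variance of the recentered values, and scales
-// by 1/sqrt(variance + eps), reducing partial sums through shared memory.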
-void main() {
- const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
- // MEAN
- // parallel sum
- sum[gl_LocalInvocationID.x] = 0.0;
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
- sum[gl_LocalInvocationID.x] += in_[x+i00];
- }
-
- // reduce
- barrier();
- memoryBarrierShared();
- [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
- if (gl_LocalInvocationID.x < i) {
- sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
- }
- barrier();
- memoryBarrierShared();
- }
-
- // broadcast
- if (gl_LocalInvocationID.x == 0) {
- sum[0] /= float(pcs.ne00);
- }
- barrier();
- memoryBarrierShared();
- const float mean = sum[0];
-
- // recenter
- const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
- out_[y+i00] = in_[x+i00] - mean;
- }
-
- // VARIANCE
- // parallel sum
- sum[gl_LocalInvocationID.x] = 0.0;
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
- sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00];
- }
-
- // reduce
- barrier();
- memoryBarrierShared();
- [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
- if (gl_LocalInvocationID.x < i) {
- sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
- }
- barrier();
- memoryBarrierShared();
- }
-
- // broadcast
- if (gl_LocalInvocationID.x == 0) {
- sum[0] /= float(pcs.ne00);
- }
- barrier();
- memoryBarrierShared();
- const float variance = sum[0];
-
- const float scale = 1.0f/sqrt(variance + pcs.eps);
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
- out_[y+i00] *= scale;
- }
-}
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-layout(local_size_x = 1) in;
-
-layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
-layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
-layout(push_constant) uniform PushConstants {
- uint inOff;
- uint outOff;
-} pcs;
-
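-// ReLU: each workgroup clamps 4 consecutive elements to max(0.0, x).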
-void main() {
- const uint baseIndex = gl_WorkGroupID.x * 4;
-
- for (uint x = 0; x < 4; x++) {
- const uint i = baseIndex + x;
- out_[i + pcs.outOff] = max(0.0, in_[i + pcs.inOff]);
- }
-}
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-layout(local_size_x = 512) in;
-
-layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
-layout(binding = 1) buffer restrict tensorOut { float out_[]; };
-
-layout(push_constant) uniform PushConstants {
- uint inOff;
- uint outOff;
- uint ne00;
- uint nb01;
- float eps;
-} pcs;
-
-shared float sum[gl_WorkGroupSize.x];
-
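-// RMS norm: one workgroup per row computes the mean of squares via a shared-memory
-// reduction, then writes in_ * 1/sqrt(mean_sq + eps) to out_.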
-void main() {
- const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
-
- // parallel sum
- sum[gl_LocalInvocationID.x] = 0.0;
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
- sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00];
- }
-
- // reduce
- barrier();
- memoryBarrierShared();
- [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
- if (gl_LocalInvocationID.x < i) {
- sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
- }
- barrier();
- memoryBarrierShared();
- }
-
- // broadcast
- if (gl_LocalInvocationID.x == 0) {
- sum[0] /= float(pcs.ne00);
- }
- barrier();
- memoryBarrierShared();
-
- const float scale = 1.0f/sqrt(sum[0] + pcs.eps);
-
- const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
- out_[y+i00] = in_[x+i00] * scale;
- }
-}
+++ /dev/null
-#version 450
-
-#include "rope_common.comp"
-
-layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; };
-layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
-layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
-layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; };
-
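-// NeoX-style RoPE on f16 data: rotates element pairs separated by n_dims/2 through
-// the YaRN-corrected angle; dimensions past n_dims are copied through unchanged.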
-void main() {
- const uint i3 = gl_WorkGroupID.z;
- const uint i2 = gl_WorkGroupID.y;
- const uint i1 = gl_WorkGroupID.x;
-
- float corr_dims[2];
- rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
-
- const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
-
- float theta_base = float(inB[pcs.inBOff + i2]);
- float inv_ndims = -1.f/pcs.n_dims;
-
- float cos_theta;
- float sin_theta;
-
- for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
- if (i0 < pcs.n_dims) {
- uint ic = i0/2;
-
- float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
-
- const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
-
- rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
- const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 2) + pcs.inAOff; // Based from in
- const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + ic*pcs.nb0) / 2) + pcs.outOff; // Based from out_
-
- const float x0 = float(inA[src]);
- const float x1 = float(inA[src+pcs.n_dims/2]);
-
- out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
- out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
- } else {
- const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
- const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
-
- out_[dst_data] = inA[src];
- out_[dst_data+1] = inA[src+1];
- }
- }
-}
+++ /dev/null
-#version 450
-
-#include "rope_common.comp"
-
-layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
-layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
-layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
-layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; };
-
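-// Same as the f16 NeoX RoPE above, operating on f32 buffers (byte strides divided by 4).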
-void main() {
- const uint i3 = gl_WorkGroupID.z;
- const uint i2 = gl_WorkGroupID.y;
- const uint i1 = gl_WorkGroupID.x;
-
- float corr_dims[2];
- rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
-
- const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
-
- float theta_base = float(inB[pcs.inBOff + i2]);
- float inv_ndims = -1.f/pcs.n_dims;
-
- float cos_theta;
- float sin_theta;
-
- for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
- if (i0 < pcs.n_dims) {
- uint ic = i0/2;
-
- float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
-
- const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
-
- rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
- const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 4) + pcs.inAOff; // Based from in
- const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + ic*pcs.nb0) / 4) + pcs.outOff; // Based from out_
-
- const float x0 = inA[src];
- const float x1 = inA[src+pcs.n_dims/2];
-
- out_[dst_data] = x0*cos_theta - x1*sin_theta;
- out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
- } else {
- const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
- const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
-
- out_[dst_data] = inA[src];
- out_[dst_data+1] = inA[src+1];
- }
- }
-}
+++ /dev/null
-#version 450
-
-#include "rope_common.comp"
-
-layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; };
-layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
-layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
-layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; };
-
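-// Standard RoPE on f16 data: rotates adjacent element pairs (i0, i0+1) through the
-// YaRN-corrected angle; dimensions past n_dims are copied through unchanged.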
-void main() {
- const uint i3 = gl_WorkGroupID.z;
- const uint i2 = gl_WorkGroupID.y;
- const uint i1 = gl_WorkGroupID.x;
-
- float corr_dims[2];
- rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
-
- const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
-
- float theta_base = float(inB[pcs.inBOff + i2]);
- float inv_ndims = -1.f/pcs.n_dims;
-
- float cos_theta;
- float sin_theta;
-
- for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
- if (i0 < pcs.n_dims) {
- uint ic = i0/2;
-
- float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
-
- const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
-
- rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
- const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
- const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
-
- const float x0 = float(inA[src]);
- const float x1 = float(inA[src+1]);
-
- out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
- out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta);
- } else {
- const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
- const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
-
- out_[dst_data] = inA[src];
- out_[dst_data+1] = inA[src+1];
- }
- }
-}
+++ /dev/null
-#version 450
-
-#include "rope_common.comp"
-
-layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
-layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
-layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
-layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; };
-
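-// Same as the f16 standard RoPE above, operating on f32 buffers (byte strides divided by 4).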
-void main() {
- const uint i3 = gl_WorkGroupID.z;
- const uint i2 = gl_WorkGroupID.y;
- const uint i1 = gl_WorkGroupID.x;
-
- float corr_dims[2];
- rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
-
- const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
-
- float theta_base = float(inB[pcs.inBOff + i2]);
- float inv_ndims = -1.f/pcs.n_dims;
-
- float cos_theta;
- float sin_theta;
-
- for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
- if (i0 < pcs.n_dims) {
- uint ic = i0/2;
-
- float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
-
- const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
-
- rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
- const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
- const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
-
- const float x0 = inA[src];
- const float x1 = inA[src+1];
-
- out_[dst_data] = x0*cos_theta - x1*sin_theta;
- out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
- } else {
- const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
- const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
-
- out_[dst_data] = inA[src];
- out_[dst_data+1] = inA[src+1];
- }
- }
-}
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-layout(local_size_x = 1) in;
-
-layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
-layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
-
-layout(push_constant) uniform PushConstants {
- uint inOff;
- uint outOff;
- float scale;
-} pcs;
-
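-// Scale: one element per workgroup, out_ = in_ * scale.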
-void main() {
- const uint i = gl_WorkGroupID.x;
- out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
-}
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-layout(local_size_x = 1) in;
-
-layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
-layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
-
-layout(push_constant) uniform PushConstants {
- uint inOff;
- uint outOff;
- float scale;
-} pcs;
-
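-// Scale (unrolled): 8 consecutive elements per workgroup, out_ = in_ * scale.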
-void main() {
- const uint baseIndex = gl_WorkGroupID.x * 8;
-
- for (uint x = 0; x < 8; x++) {
- const uint i = baseIndex + x;
- out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
- }
-}
+++ /dev/null
-#version 450
-
-#include "common.comp"
-
-layout(local_size_x = 1) in;
-
-layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
-layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
-layout(push_constant) uniform PushConstants {
- uint inOff;
- uint outOff;
-} pcs;
-
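-// SiLU: out_ = x / (1 + exp(-x)) = x * sigmoid(x), 4 consecutive elements per workgroup.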
-void main() {
- const uint baseIndex = gl_WorkGroupID.x * 4;
-
- for (uint x = 0; x < 4; x++) {
- const uint i = baseIndex + x;
- const float y = in_[i + pcs.inOff];
- out_[i + pcs.outOff] = y / (1.0 + exp(-y));
- }
-}
+++ /dev/null
-// TODO: implement multi-simd softmax (llama.cpp commit e16b9fa4)
-
-#version 450
-
-#include "common.comp"
-
-layout(local_size_x_id = 0) in;
-
-layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
-layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
-layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
-
-layout(push_constant) uniform PushConstants {
- uint inAOff;
- uint inBOff;
- uint outOff;
- int ne00;
- int ne01;
- int ne02;
- float scale;
- float max_bias;
- float m0;
- float m1;
- uint n_head_log2;
- int mask;
-} pcs;
-
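-// Softmax over the last dimension, one row per workgroup using 32 subgroup lanes:
-// applies the scale and (optionally) the ALiBi-sloped mask, finds the row max,
-// exponentiates, and normalizes by the subgroup-reduced sum.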
-void main() {
- if (gl_SubgroupInvocationID > 31)
- return;
-
- const uint i03 = gl_WorkGroupID.z;
- const uint i02 = gl_WorkGroupID.y;
- const uint i01 = gl_WorkGroupID.x;
-
- const uint extra_off = i03*pcs.ne02*pcs.ne01*pcs.ne00 + i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00;
- const uint psrc0 = extra_off + pcs.inAOff; // Based from inA
- const uint pmask = i01*pcs.ne00 + pcs.inBOff; // Based from inB
- const uint pdst = extra_off + pcs.outOff; // Based from out_
-
- float slope = 1.0f;
-
- // ALiBi
- if (pcs.max_bias > 0.0f) {
- int64_t h = i02;
-
- float base = h < pcs.n_head_log2 ? pcs.m0 : pcs.m1;
- int64_t exp = h < pcs.n_head_log2 ? h + 1 : 2*(h - pcs.n_head_log2) + 1;
-
- slope = pow(base, float(exp));
- }
-
- // parallel max
- float localMax = uintBitsToFloat(0xFF800000);
- for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
- localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f));
- }
- float max_ = subgroupMax(localMax);
-
- // parallel sum
- float localSum = 0.0f;
- for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
- const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f) - max_);
- localSum += exp_psrc0;
- out_[pdst + i00] = exp_psrc0;
- }
-
- const float sum = subgroupAdd(localSum);
- for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
- out_[pdst + i00] /= sum;
- }
-}
+++ /dev/null
-#include "common.comp"
-
-#define GGML_ROPE_TYPE_NEOX 2
-
-// TODO: use a local size of 32 or more (Metal uses 1024)
-layout(local_size_x = 1) in;
-
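-// Push constants and YaRN helpers shared by the RoPE shaders above.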
-layout (push_constant) uniform parameter {
- uint inAOff;
- uint inBOff;
- uint inCOff;
- uint outOff;
- int n_dims;
- int mode;
- int n_ctx_orig;
- float freq_base;
- float freq_scale;
- bool has_freq_factors;
- float ext_factor;
- float attn_factor;
- float beta_fast;
- float beta_slow;
- uint nb00;
- uint nb01;
- uint nb02;
- uint nb03;
- int ne0;
- uint nb0;
- uint nb1;
- uint nb2;
- uint nb3;
-} pcs;
-
-float rope_yarn_ramp(const float low, const float high, const float i0) {
- const float y = (i0 / 2 - low) / max(0.001f, high - low);
- return 1.0f - min(1.0f, max(0.0f, y));
-}
-
-// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
-// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
-void rope_yarn(
- float theta_extrap, float freq_scale, float corr_dims[2], float i0, float ext_factor, float mscale,
- out float cos_theta, out float sin_theta
-) {
- // Get n-d rotational scaling corrected for extrapolation
- float theta_interp = freq_scale * theta_extrap;
- float theta = theta_interp;
- if (ext_factor != 0.0f) {
- float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
- theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
-
- // Get n-d magnitude scaling corrected for interpolation
- mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
- }
- cos_theta = cos(theta) * mscale;
- sin_theta = sin(theta) * mscale;
-}
-
-// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
-// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
-float rope_yarn_corr_factor(int n_dims, int n_ctx_orig, float n_rot, float base) {
- return n_dims * log(n_ctx_orig / (n_rot * TWOPI_F)) / (2 * log(base));
-}
-
-void rope_yarn_corr_dims(
- int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, out float dims[2]
-) {
- // start and end correction dims
- dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_fast, freq_base)));
- dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_slow, freq_base)));
-}