GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
+// tell the allocator to parse nodes following the order described in the list
+// you should call this if your graph is optimized to execute out-of-order
+GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
+
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
#define GGML_MAX_PARAMS 256
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_SRC 6
-#define GGML_MAX_NAME 48
+#define GGML_MAX_NAME 64
#define GGML_MAX_OP_PARAMS 32
#define GGML_DEFAULT_N_THREADS 4
#define GGML_EXIT_SUCCESS 0
#define GGML_EXIT_ABORTED 1
+#define GGUF_MAGIC 0x46554747 // "GGUF"
+#define GGUF_VERSION 1
+
+#define GGUF_DEFAULT_ALIGNMENT 32
+
#define GGML_UNUSED(x) (void)(x)
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
extern "C" {
#endif
-#ifdef __ARM_NEON
- // we use the built-in 16-bit float type
+#if defined(__ARM_NEON) && defined(__CUDACC__)
+ typedef half ggml_fp16_t;
+#elif defined(__ARM_NEON)
typedef __fp16 ggml_fp16_t;
#else
typedef uint16_t ggml_fp16_t;
GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
+ GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
GGML_API int ggml_blck_size (enum ggml_type type);
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+ //
+ // gguf
+ //
+
+ enum gguf_type {
+ GGUF_TYPE_UINT8 = 0,
+ GGUF_TYPE_INT8 = 1,
+ GGUF_TYPE_UINT16 = 2,
+ GGUF_TYPE_INT16 = 3,
+ GGUF_TYPE_UINT32 = 4,
+ GGUF_TYPE_INT32 = 5,
+ GGUF_TYPE_FLOAT32 = 6,
+ GGUF_TYPE_BOOL = 7,
+ GGUF_TYPE_STRING = 8,
+ GGUF_TYPE_ARRAY = 9,
+ GGUF_TYPE_COUNT, // marks the end of the enum
+ };
+
+ struct gguf_context;
+
+ struct gguf_init_params {
+ bool no_alloc;
+
+ // if not NULL, create a ggml_context and allocate the tensor data in it
+ struct ggml_context ** ctx;
+ };
+
+ GGML_API struct gguf_context * gguf_init_empty(void);
+ GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
+ //GGML_API struct gguf_context * gguf_init_from_buffer(..);
+
+ GGML_API void gguf_free(struct gguf_context * ctx);
+
+ GGML_API const char * gguf_type_name(enum gguf_type type);
+
+ GGML_API int gguf_get_version (struct gguf_context * ctx);
+ GGML_API size_t gguf_get_alignment (struct gguf_context * ctx);
+ GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
+ GGML_API void * gguf_get_data (struct gguf_context * ctx);
+
+ GGML_API int gguf_get_n_kv(struct gguf_context * ctx);
+ GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key);
+ GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
+
+ GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
+ GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
+
+ // results are undefined if the wrong type is used for the key
+ GGML_API uint8_t gguf_get_val_u8 (struct gguf_context * ctx, int i);
+ GGML_API int8_t gguf_get_val_i8 (struct gguf_context * ctx, int i);
+ GGML_API uint16_t gguf_get_val_u16 (struct gguf_context * ctx, int i);
+ GGML_API int16_t gguf_get_val_i16 (struct gguf_context * ctx, int i);
+ GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);
+ GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);
+ GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);
+ GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
+ GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
+ GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
+ GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
+ GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
+
+ GGML_API int gguf_get_n_tensors (struct gguf_context * ctx);
+ GGML_API int gguf_find_tensor (struct gguf_context * ctx, const char * name);
+ GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
+ GGML_API char * gguf_get_tensor_name (struct gguf_context * ctx, int i);
+
+ // overrides existing values or adds a new one
+ GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
+ GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
+ GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
+ GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
+ GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
+ GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
+ GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
+ GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
+ GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
+ GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
+ GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
+
+ // set or add KV pairs from another context
+ GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
+
+ // manage tensor info
+ GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+ GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
+ GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
+
+ // writing gguf files can be done in 2 ways:
+ //
+ // - write the entire gguf_context to a binary file in a single pass:
+ //
+ // gguf_write_to_file(ctx, fname, /*only_meta =*/ false);
+ //
+ // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
+ //
+ // FILE * f = fopen(fname, "wb");
+ // fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
+ // fwrite(..., f); // write the tensor data
+ // void * data = malloc(gguf_get_meta_size(ctx));
+ // gguf_get_meta_data(ctx, data);
+ // fseek(f, 0, SEEK_SET);
+ // fwrite(data, 1, gguf_get_meta_size(ctx), f);
+ // free(data);
+ // fclose(f);
+ //
+
+
+ // write the entire context to a binary file
+ GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
+
+ // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
+ GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
+ GGML_API void gguf_get_meta_data(struct gguf_context * ctx, void * data);
+
//
// system info
//
typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
typedef struct {
+ const char * type_name;
+ int blck_size;
+ size_t type_size;
+ bool is_quantized;
ggml_to_float_t to_float;
ggml_from_float_t from_float;
ggml_from_float_t from_float_reference;
enum ggml_type vec_dot_type;
} ggml_type_traits_t;
- ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
+ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
#ifdef __cplusplus
}
#!/bin/bash
cp -rpv ../llama.cpp/ggml.c src/ggml.c
+cp -rpv ../llama.cpp/ggml-alloc.c src/ggml-alloc.c
cp -rpv ../llama.cpp/ggml-cuda.h src/ggml-cuda.h
cp -rpv ../llama.cpp/ggml-cuda.cu src/ggml-cuda.cu
cp -rpv ../llama.cpp/ggml-opencl.h src/ggml-opencl.h
cp -rpv ../llama.cpp/ggml-metal.m src/ggml-metal.m
cp -rpv ../llama.cpp/ggml-metal.metal src/ggml-metal.metal
cp -rpv ../llama.cpp/ggml.h include/ggml/ggml.h
+cp -rpv ../llama.cpp/ggml-alloc.h include/ggml/ggml-alloc.h
cp -rpv ../llama.cpp/tests/test-opt.cpp tests/test-opt.cpp
cp -rpv ../llama.cpp/tests/test-grad0.cpp tests/test-grad0.cpp
struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
size_t max_size;
bool measure;
+ int parse_seq[GGML_MAX_NODES];
+ bool has_parse_seq;
#ifdef GGML_ALLOCATOR_DEBUG
struct ggml_tensor * allocated_tensors[1024];
size_t max_avail = 0;
- // find the best fitting free block
+ // find the best fitting free block besides the last block
int best_fit_block = -1;
size_t best_fit_size = SIZE_MAX;
- for (int i = 0; i < alloc->n_free_blocks; i++) {
+ for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
struct free_block * block = &alloc->free_blocks[i];
max_avail = MAX(max_avail, block->size);
if (block->size >= size && block->size <= best_fit_size) {
AT_PRINTF("block %d\n", best_fit_block);
if (best_fit_block == -1) {
- fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
- __func__, size, max_avail);
- GGML_ASSERT(!"not enough space in the buffer");
+ // the last block is our last resort
+ struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+ if (block->size >= size) {
+ best_fit_block = alloc->n_free_blocks - 1;
+ max_avail = MAX(max_avail, block->size);
+ } else {
+ fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
+ __func__, size, max_avail);
+ GGML_ASSERT(!"not enough space in the buffer");
return;
+ }
}
struct free_block * block = &alloc->free_blocks[best_fit_block];
void * addr = block->addr;
alloc->n_free_blocks++;
}
+// Store the node visiting order for this allocator.
+//
+//   list : node indices in the desired order; entries equal to -1 are skipped
+//   n    : number of entries in list, counting the -1 entries
+//
+// The remaining indices are written contiguously to alloc->parse_seq and
+// alloc->has_parse_seq is set to true.
+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
+    int pos = 0;
+    for (int i = 0; i < n; i++) {
+        if (list[i] == -1) {
+            continue;
+        }
+        // parse_seq has GGML_MAX_NODES slots; the bound is on the number of
+        // stored indices (pos), not on n, because -1 entries are dropped
+        GGML_ASSERT(pos < GGML_MAX_NODES);
+        alloc->parse_seq[pos] = list[i];
+        pos++;
+    }
+    alloc->has_parse_seq = true;
+}
+
void ggml_allocr_reset(struct ggml_allocr * alloc) {
alloc->n_free_blocks = 1;
size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
/*.hash_table = */ {{0}},
/*.max_size = */ 0,
/*.measure = */ false,
+ /*.parse_seq = */ {0},
+ /*.has_parse_seq = */ false,
#ifdef GGML_ALLOCATOR_DEBUG
- /* .allocated_tensors = */ {0},
+ /*.allocated_tensors = */ {0},
#endif
};
/*.hash_table = */ {{0}},
/*.max_size = */ 0,
/*.measure = */ true,
+ /*.parse_seq = */ {0},
+ /*.has_parse_seq = */ false,
#ifdef GGML_ALLOCATOR_DEBUG
- /*.allocated_tensors = */ {0},
+ /*.allocated_tensors = */ {0},
#endif
};
allocate_node(alloc, input);
}
}
- for (int i = 0; i < gf->n_nodes; i++) {
+ for (int ind = 0; ind < gf->n_nodes; ind++) {
+ int i;
+ if (alloc->has_parse_seq) {
+ i = alloc->parse_seq[ind];
+ } else {
+ i = ind;
+ }
struct ggml_tensor * node = gf->nodes[i];
// allocate parents (leafs)
struct ggml_tensor * view_src = get_view_source(parent);
struct hash_node * view_src_hn = hash_get(ht, view_src);
view_src_hn->n_views -= 1;
- AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
+ // print the counters kept in the hash node, the same ones updated above and tested below
+ AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
ggml_allocator_free_tensor(alloc, view_src);
}
#include "ggml.h"
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define CC_TURING 700
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#define CUDA_QUANTIZE_BLOCK_SIZE 256
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
-#ifndef GGML_CUDA_MMQ_Y
-#define GGML_CUDA_MMQ_Y 64
-#endif // GGML_CUDA_MMQ_Y
-
// dmmv = dequantize_mul_mat_vec
#ifndef GGML_CUDA_DMMV_X
#define GGML_CUDA_DMMV_X 32
cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
};
+static int g_device_count = -1;
+static int g_main_device = 0;
+static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
+static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
+static bool g_mul_mat_q = false;
+
+static void * g_scratch_buffer = nullptr;
+static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
+static size_t g_scratch_offset = 0;
+
+static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
+
static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
// second part effectively subtracts 8 from each quant value
return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
#else
+ assert(false);
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
// scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
#else
+ assert(false);
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
// second part effectively subtracts 16 from each quant value
return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
#else
+ assert(false);
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
#else
+ assert(false);
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return d8_0*d8_1 * sumi;
#else
+ assert(false);
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
#else
const float2 dm8f = __half22float2(dm8);
const float2 ds8f = __half22float2(ds8);
- const float d8d8 = dm8.x * ds8.x;
- const float m8s8 = dm8.y * ds8.y;
+ const float d8d8 = dm8f.x * ds8f.x;
+ const float m8s8 = dm8f.y * ds8f.y;
#endif // GGML_CUDA_F16
// scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
#else
+ assert(false);
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return dm2f.x*sumf_d - dm2f.y*sumf_m;
#else
+ assert(false);
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
#else
+ assert(false);
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return d3 * sumf;
#else
+ assert(false);
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return d3*d8 * sumi;
#else
+ assert(false);
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return dm4f.x*sumf_d - dm4f.y*sumf_m;
#else
+ assert(false);
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
// contiguous u/y values
-// also used for q5_K
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
float sumf_m = 0.0f;
#pragma unroll
- for (int i0 = 0; i0 < VDR_Q4_K_Q8_1_MMQ; i0 += (QI8_1/QR4_K)) {
+ for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
int sumi_d = 0;
#pragma unroll
- for (int i = i0; i < i0 + (QI8_1/QR4_K); ++i) {
- sumi_d = __dp4a(v[2*i+0], u[2*i+0], sumi_d); // SIMD dot product
- sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product
+ for (int j = 0; j < QI8_1; ++j) {
+ sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
}
- const float2 ds8f = __half22float2(ds8[i0 / 4]);
+ const float2 ds8f = __half22float2(ds8[i]);
- sumf_d += ds8f.x * (sc[i0/4] * sumi_d);
- sumf_m += ds8f.y * m[i0/4]; // sum of q8_1 block * q4_K min val
+ sumf_d += ds8f.x * (sc[i] * sumi_d);
+ sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
}
const float2 dm4f = __half22float2(dm4);
return dm4f.x*sumf_d - dm4f.y*sumf_m;
#else
+ assert(false);
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
#define VDR_Q5_K_Q8_1_MMQ 8
// contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
return dm5f.x*sumf_d - dm5f.y*sumf_m;
#else
+ assert(false);
+ return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values; for each group i: __dp4a-accumulate QI8_1 ints of v and u, scale by sc[i] and ds8[i].x, and collect the ds8[i].y * m[i] term that is subtracted at the end
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+ const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ float sumf_d = 0.0f;
+ float sumf_m = 0.0f;
+
+#pragma unroll
+ for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
+ int sumi_d = 0;
+
+#pragma unroll
+ for (int j = 0; j < QI8_1; ++j) {
+ sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
+ }
+
+ const float2 ds8f = __half22float2(ds8[i]);
+
+ sumf_d += ds8f.x * (sc[i] * sumi_d);
+ sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q5_K min val
+ }
+
+ const float2 dm4f = __half22float2(dm4); // named dm4 like the q4_K variant; .x multiplies the scaled sum, .y the min sum
+
+ return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+#else
+ assert(false);
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return d*sumf;
#else
+ assert(false);
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return d6 * sumf_d;
#else
+ assert(false);
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
}
-static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
- __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
- __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0];
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
*x_ql = tile_x_qs;
*x_dm = (half2 *) tile_x_d;
}
-template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
__builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < 8);
+ __builtin_assume(i_offset < nwarps);
__builtin_assume(k >= 0);
__builtin_assume(k < WARP_SIZE);
float * x_dmf = (float *) x_dm;
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
int i = i0 + i_offset;
if (need_check) {
const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
- x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
+ // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
}
-// const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
-// const int kbxd = k % blocks_per_tile_x_row;
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
+ const int kbxd = k % blocks_per_tile_x_row;
-// #pragma unroll
-// for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_0) {
-// FIXME out-of-bounds
-// const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
+ int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
-// if (i >= GGML_CUDA_MMQ_Y) {
-// return;
-// }
+ if (need_check) {
+ i = min(i, i_max);
+ }
-// const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
+ const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-// x_dm[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd].x = bxi->d;
-// }
+ x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
+ }
}
static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
- __builtin_assume(i >= 0);
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
- __builtin_assume(j >= 0);
- __builtin_assume(j < WARP_SIZE);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
- __builtin_assume(k % VDR_Q4_0_Q8_1_MMQ == 0);
-
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
const float * x_dmf = (float *) x_dm;
#pragma unroll
for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
- u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
- u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_0];
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
}
return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
(&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
- y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
+ y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
}
static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
}
-static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
- __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + + GGML_CUDA_MMQ_Y];
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_1) + GGML_CUDA_MMQ_Y/QI4_1];
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
*x_ql = tile_x_qs;
*x_dm = tile_x_dm;
}
-template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
__builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < 8);
+ __builtin_assume(i_offset < nwarps);
__builtin_assume(k >= 0);
__builtin_assume(k < WARP_SIZE);
const block_q4_1 * bx0 = (block_q4_1 *) vx;
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
int i = i0 + i_offset;
if (need_check) {
const int kbxd = k % blocks_per_tile_x_row;
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_1) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
if (need_check) {
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
- __builtin_assume(i >= 0);
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
- __builtin_assume(j >= 0);
- __builtin_assume(j < WARP_SIZE);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
- __builtin_assume(k % VDR_Q4_1_Q8_1_MMQ == 0);
-
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
int u[2*VDR_Q4_1_Q8_1_MMQ];
#pragma unroll
for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
- u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
- u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_1];
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
}
return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
(&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
- y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
+ y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
}
static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
}
-static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
- __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0];
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
*x_ql = tile_x_ql;
*x_dm = (half2 *) tile_x_d;
}
-template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
__builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < 8);
+ __builtin_assume(i_offset < nwarps);
__builtin_assume(k >= 0);
__builtin_assume(k < WARP_SIZE);
const block_q5_0 * bx0 = (block_q5_0 *) vx;
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
int i = i0 + i_offset;
if (need_check) {
float * x_dmf = (float *) x_dm;
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_0) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
if (need_check) {
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
- __builtin_assume(i >= 0);
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
- __builtin_assume(j >= 0);
- __builtin_assume(j < WARP_SIZE);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
- __builtin_assume(k % VDR_Q5_0_Q8_1_MMQ == 0);
-
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
const float * x_dmf = (const float *) x_dm;
#pragma unroll
for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
- u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
- u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_0];
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
}
return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
- (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
}
static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
}
-static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1];
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
*x_ql = tile_x_ql;
*x_dm = tile_x_dm;
}
-template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
__builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < 8);
+ __builtin_assume(i_offset < nwarps);
__builtin_assume(k >= 0);
__builtin_assume(k < WARP_SIZE);
const block_q5_1 * bx0 = (block_q5_1 *) vx;
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
int i = i0 + i_offset;
if (need_check) {
const int kbxd = k % blocks_per_tile_x_row;
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_1) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
if (need_check) {
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
- __builtin_assume(i >= 0);
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
- __builtin_assume(j >= 0);
- __builtin_assume(j < WARP_SIZE);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
- __builtin_assume(k % VDR_Q5_1_Q8_1_MMQ == 0);
-
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
#pragma unroll
for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
- u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
- u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_1];
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
}
return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
- (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
}
static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
}
-static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
- __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
- __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0];
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
*x_ql = tile_x_qs;
*x_dm = (half2 *) tile_x_d;
}
-template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
__builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < 8);
+ __builtin_assume(i_offset < nwarps);
__builtin_assume(k >= 0);
__builtin_assume(k < WARP_SIZE);
const block_q8_0 * bx0 = (block_q8_0 *) vx;
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
int i = i0 + i_offset;
if (need_check) {
const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
- x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d;
}
-// const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
-// const int kbxd = k % blocks_per_tile_x_row;
+ const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
+ const int kbxd = k % blocks_per_tile_x_row;
-// #pragma unroll
-// for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI8_0) {
-// FIXME out-of-bounds
-// const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
+ int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
-// #if GGML_CUDA_MMQ_Y < 64
-// if (i >= GGML_CUDA_MMQ_Y) {
-// return;
-// }
-// #endif // GGML_CUDA_MMQ_Y < 64
+ if (need_check) {
+ i = min(i, i_max);
+ }
-// const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
+ const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-// x_dm[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd].x = bxi->d;
-// }
+ x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
+ }
}
static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
- __builtin_assume(i >= 0);
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
- __builtin_assume(j >= 0);
- __builtin_assume(j < WARP_SIZE);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
- __builtin_assume(k % VDR_Q8_0_Q8_1_MMQ == 0);
-
const float * x_dmf = (const float *) x_dm;
const float * y_df = (const float *) y_ds;
return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
}
-static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { // the tile height mmq_y is now a template parameter replacing the GGML_CUDA_MMQ_Y macro
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI2_K) + GGML_CUDA_MMQ_Y/QI2_K];
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; // mmq_y * WARP_SIZE ints plus mmq_y extra; presumably one padding int per row - confirm against load_tiles_q2_K
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K]; // read as x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx] by vec_dot_q2_K_q8_1_mul_mat
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; // read as x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4] by vec_dot_q2_K_q8_1_mul_mat
*x_ql = tile_x_ql;
*x_dm = tile_x_dm;
*x_sc = tile_x_sc;
}
-template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
__builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < 8);
+ __builtin_assume(i_offset < nwarps);
__builtin_assume(k >= 0);
__builtin_assume(k < WARP_SIZE);
const block_q2_K * bx0 = (block_q2_K *) vx;
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
int i = i0 + i_offset;
if (need_check) {
const int kbxd = k % blocks_per_tile_x_row;
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI2_K) {
- int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
+ int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
if (need_check) {
i = min(i, i_max);
}
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
if (need_check) {
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
- __builtin_assume(i >= 0);
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
- __builtin_assume(j >= 0);
- __builtin_assume(j < WARP_SIZE);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
- __builtin_assume(k % VDR_Q2_K_Q8_1_MMQ == 0);
-
const int kbx = k / QI2_K;
const int ky = (k % QI2_K) * QR2_K;
const float * y_df = (const float *) y_ds;
const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
- const int index_y = j * (QR2_K*WARP_SIZE) + QR2_K*k;
+ const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
}
return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
}
-static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { // the tile height mmq_y is now a template parameter replacing the GGML_CUDA_MMQ_Y macro
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI3_K) + GGML_CUDA_MMQ_Y/QI3_K];
- __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2];
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; // mmq_y * WARP_SIZE ints plus mmq_y extra; presumably one padding int per row - confirm against load_tiles_q3_K
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K]; // read through a float pointer as x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx] by vec_dot_q3_K_q8_1_mul_mat
+ __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2]; // NOTE(review): no "*x_qh = tile_x_qh;" is visible in this excerpt - confirm it is present in the full function
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; // mmq_y * (WARP_SIZE/4) ints plus mmq_y/4 extra
*x_ql = tile_x_ql;
*x_dm = tile_x_dm;
*x_sc = tile_x_sc;
}
-template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
__builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < 8);
+ __builtin_assume(i_offset < nwarps);
__builtin_assume(k >= 0);
__builtin_assume(k < WARP_SIZE);
const block_q3_K * bx0 = (block_q3_K *) vx;
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
int i = i0 + i_offset;
if (need_check) {
float * x_dmf = (float *) x_dm;
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI3_K) {
- int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
+ int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
if (need_check) {
i = min(i, i_max);
}
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
if (need_check) {
}
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
if (need_check) {
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
- __builtin_assume(i >= 0);
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
- __builtin_assume(j >= 0);
- __builtin_assume(j < WARP_SIZE);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
- __builtin_assume(k % VDR_Q3_K_Q8_1_MMQ == 0);
-
const int kbx = k / QI3_K;
const int ky = (k % QI3_K) * QR3_K;
const float * x_dmf = (const float *) x_dm;
v[l] = __vsubss4(vll, vlh);
}
- const int index_y = j * (QR3_K*WARP_SIZE) + k*QR3_K;
+ const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
}
return dall * sumf_d - dmin * sumf_m;
#else
+ assert(false);
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif
}
-static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { // the tile height mmq_y is now a template parameter replacing the GGML_CUDA_MMQ_Y macro
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_K) + GGML_CUDA_MMQ_Y/QI4_K];
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; // one extra int per row, matching the (WARP_SIZE + 1) row stride used by vec_dot_q4_K_q8_1_mul_mat
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K]; // read as x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K]
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; // read as x_sc[i * (WARP_SIZE/8) + i/8 + k/16]
*x_ql = tile_x_ql;
*x_dm = tile_x_dm;
*x_sc = tile_x_sc;
}
-template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
__builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < 8);
+ __builtin_assume(i_offset < nwarps);
__builtin_assume(k >= 0);
__builtin_assume(k < WARP_SIZE);
const block_q4_K * bx0 = (block_q4_K *) vx;
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
int i = i0 + i_offset;
if (need_check) {
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) {
- int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
+ int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
if (need_check) {
i = min(i, i_max);
}
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
if (need_check) {
i = min(i, i_max);
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
- __builtin_assume(i >= 0);
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
- __builtin_assume(j >= 0);
- __builtin_assume(j < WARP_SIZE);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
- __builtin_assume(k % VDR_Q4_K_Q8_1_MMQ == 0);
-
- int v[QR4_K*VDR_Q4_K_Q8_1_MMQ];
-
-#pragma unroll
- for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) {
- v[l + 0] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F;
- v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F;
- }
-
const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
- const int index_y = j * (QR4_K*WARP_SIZE) + QR4_K*k;
- return vec_dot_q4_K_q8_1_impl_mmq(v, &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
+ const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
+ return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
+ x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
}
static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
u[2*i+1] = q8[4];
}
- return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8);
+ return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
#else
return d * sumf_d;
#else
+ assert(false);
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif
}
-static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { // the tile height mmq_y is now a template parameter replacing the GGML_CUDA_MMQ_Y macro
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_K) + GGML_CUDA_MMQ_Y/QI5_K];
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; // rows are 2*WARP_SIZE ints plus one extra; indexed with stride (QR5_K*WARP_SIZE + 1) by vec_dot_q5_K_q8_1_mul_mat (assumes QR5_K == 2 - TODO confirm)
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K]; // read as x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K]
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; // read as x_sc[i * (WARP_SIZE/8) + i/8 + k/16]
*x_ql = tile_x_ql;
*x_dm = tile_x_dm;
*x_sc = tile_x_sc;
}
-template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
__builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < 8);
+ __builtin_assume(i_offset < nwarps);
__builtin_assume(k >= 0);
__builtin_assume(k < WARP_SIZE);
const block_q5_K * bx0 = (block_q5_K *) vx;
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
int i = i0 + i_offset;
if (need_check) {
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_K) {
- int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
+ int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
if (need_check) {
i = min(i, i_max);
}
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
if (need_check) {
i = min(i, i_max);
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
- __builtin_assume(i >= 0);
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
- __builtin_assume(j >= 0);
- __builtin_assume(j < WARP_SIZE);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
- __builtin_assume(k % VDR_Q5_K_Q8_1_MMQ == 0);
-
const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
- const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
- const int index_y = j * (QR5_K*WARP_SIZE) + QR5_K*k;
- return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
+ const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
+ const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
+ return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
+ x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
}
static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
}
-static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { // the tile height mmq_y is now a template parameter replacing the GGML_CUDA_MMQ_Y macro
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI6_K) + GGML_CUDA_MMQ_Y/QI6_K];
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; // rows are 2*WARP_SIZE ints plus one extra; indexed with stride (QR6_K*WARP_SIZE + 1) by vec_dot_q6_K_q8_1_mul_mat (assumes QR6_K == 2 - TODO confirm)
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K]; // read through a float pointer as x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K]
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; // read as x_sc[i * (WARP_SIZE/8) + i/8 + k/8]
*x_ql = tile_x_ql;
*x_dm = tile_x_dm;
*x_sc = tile_x_sc;
}
-template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
__builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < 8);
+ __builtin_assume(i_offset < nwarps);
__builtin_assume(k >= 0);
__builtin_assume(k < WARP_SIZE);
const block_q6_K * bx0 = (block_q6_K *) vx;
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
int i = i0 + i_offset;
if (need_check) {
float * x_dmf = (float *) x_dm;
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) {
- int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
+ int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
if (need_check) {
i = min(i, i_max);
}
#pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
if (need_check) {
i = min(i, i_max);
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
- __builtin_assume(i >= 0);
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
- __builtin_assume(j >= 0);
- __builtin_assume(j < WARP_SIZE);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
- __builtin_assume(k % VDR_Q6_K_Q8_1_MMQ == 0);
-
const float * x_dmf = (const float *) x_dm;
const float * y_df = (const float *) y_ds;
const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
- const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
- const int index_y = j * (QR6_K*WARP_SIZE) + QR6_K*k;
+ const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
+ const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
}
-template <int qk, int qr, int qi, bool need_sum, typename block_q_t,
+template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps, // mmq_x / mmq_y: dst columns / rows handled per block; nwarps: stride of the threadIdx.y loops below
allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
-static __global__ void mul_mat_q(
+static __device__ __forceinline__ void mul_mat_q( // no longer a kernel itself: called from the per-type __global__ wrappers (e.g. mul_mat_q4_0)
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
const int & ncols_dst = ncols_y;
- const int tid_x = threadIdx.x;
- const int tid_y = threadIdx.y;
-
- const int row_dst_0 = blockIdx.x*GGML_CUDA_MMQ_Y;
+ const int row_dst_0 = blockIdx.x*mmq_y; // first dst row covered by this block
const int & row_x_0 = row_dst_0;
- const int row_dst = row_dst_0 + tid_x;
- const int col_dst_0 = blockIdx.y*WARP_SIZE;
+ const int col_dst_0 = blockIdx.y*mmq_x; // first dst column covered by this block
const int & col_y_0 = col_dst_0;
int * tile_x_ql = nullptr;
allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
- const int blocks_per_tile_y_col = qr*WARP_SIZE/QI8_1;
+ __shared__ int tile_y_qs[mmq_x * WARP_SIZE]; // WARP_SIZE ints per y column: holds one ir slice at a time instead of all qr slices
+ __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1]; // WARP_SIZE/QI8_1 ds entries per y column for the current ir slice
- __shared__ int tile_y_qs[(WARP_SIZE) * (qr*WARP_SIZE)];
- __shared__ half2 tile_y_ds[(WARP_SIZE) * blocks_per_tile_y_col];
-
- float sum[GGML_CUDA_MMQ_Y/WARP_SIZE][4] = {0.0f};
+ float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f}; // per-thread accumulators, indexed [i/WARP_SIZE][j/nwarps] in the loops below
for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
- tid_y, nrows_x-row_x_0-1, tid_x, blocks_per_row_x);
+ threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x); // threadIdx.y is the loader's i_offset, threadIdx.x its k
+#pragma unroll
for (int ir = 0; ir < qr; ++ir) {
- const int kqs = ir*WARP_SIZE + tid_x;
+ const int kqs = ir*WARP_SIZE + threadIdx.x;
const int kbxd = kqs / QI8_1;
- for (int i = 0; i < WARP_SIZE; i += 8) {
- const int col_y_eff = min(col_y_0 + tid_y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
+#pragma unroll
+ for (int i = 0; i < mmq_x; i += nwarps) {
+ const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
- tile_y_qs[(tid_y + i) * (qr*WARP_SIZE) + kqs] = get_int_from_int8_aligned(by0->qs, tid_x % QI8_1);
+ const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE; // kqs is wrapped into [0, WARP_SIZE) because the tile stores only the current ir slice
+ tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
}
- }
- for (int ids0 = 0; ids0 < WARP_SIZE; ids0 += 8 * (WARP_SIZE/blocks_per_tile_y_col)) {
- const int ids = (ids0 + tid_y * (WARP_SIZE/blocks_per_tile_y_col) + tid_x / blocks_per_tile_y_col) % WARP_SIZE;
- const int kby = tid_x % blocks_per_tile_y_col;
- const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
-
- // if the sum is not needed it's faster to transform the scale to f32 ahead of time
- const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kby].ds;
- half2 * dsi_dst = &tile_y_ds[ids * (qr*WARP_SIZE/QI8_1) + kby];
- if (need_sum) {
- *dsi_dst = *dsi_src;
- } else {
- float * dfi_dst = (float *) dsi_dst;
- *dfi_dst = (*dsi_src).x;
+#pragma unroll
+ for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
+ const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x; // y column within the tile whose ds value this thread copies
+ const int kby = threadIdx.x % (WARP_SIZE/QI8_1); // q8_1 block within that column's current slice
+ const int col_y_eff = min(col_y_0 + ids, ncols_y-1); // clamped like the qs load above
+
+ // if the sum is not needed it's faster to transform the scale to f32 ahead of time
+ const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds; // the ir*(WARP_SIZE/QI8_1) term selects the q8_1 blocks of the current slice
+ half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
+ if (need_sum) {
+ *dsi_dst = *dsi_src;
+ } else {
+ float * dfi_dst = (float *) dsi_dst;
+ *dfi_dst = (*dsi_src).x;
+ }
}
- }
- __syncthreads();
+ __syncthreads(); // block-wide barrier: the tile writes above must finish before any thread reads them in vec_dot
-#if __CUDA_ARCH__ >= 700 // Unrolling the loop is slower on Pascal
+// #pragma unroll // unrolling this loop causes too much register pressure
+ for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) { // only the k range of slice ir, since the y tile holds just that slice
#pragma unroll
-#endif // __CUDA_ARCH__ >= 700
- for (int k = 0; k < WARP_SIZE; k += vdr) {
+ for (int j = 0; j < mmq_x; j += nwarps) {
#pragma unroll
- for (int j = 0; j < WARP_SIZE; j += 8) {
-#pragma unroll
- for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
- sum[i/WARP_SIZE][j/8] += vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
- tid_x + i, tid_y + j, k);
+ for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+ sum[i/WARP_SIZE][j/nwarps] += vec_dot(
+ tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
+ threadIdx.x + i, threadIdx.y + j, k); // tile row = threadIdx.x + i, tile column = threadIdx.y + j
+ }
}
}
- }
-
- __syncthreads();
- }
-
- if (row_dst >= nrows_dst) {
- return;
+ __syncthreads(); // second barrier: all reads of the tiles must finish before the next iteration overwrites them
+ }
}
- for (int j = 0; j < WARP_SIZE; j += 8) {
- const int col_dst = col_dst_0 + j + tid_y;
+#pragma unroll
+ for (int j = 0; j < mmq_x; j += nwarps) {
+ const int col_dst = col_dst_0 + j + threadIdx.y;
if (col_dst >= ncols_dst) {
return;
}
- for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
- dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/8];
+#pragma unroll
+ for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+ const int row_dst = row_dst_0 + threadIdx.x + i; // row_dst is now computed per i step rather than once per thread
+
+ if (row_dst >= nrows_dst) { // replaces the removed early return: rows past nrows_dst are skipped one at a time
+ continue;
+ }
+
+ dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
}
}
}
+#define MMQ_X_Q4_0_AMPERE 64 // mmq_x passed to mul_mat_q when __CUDA_ARCH__ >= CC_TURING (despite the AMPERE suffix)
+#define MMQ_Y_Q4_0_AMPERE 128 // mmq_y for the same branch
+#define NWARPS_Q4_0_AMPERE 4 // nwarps for the same branch
+#define MMQ_X_Q4_0_PASCAL 64 // mmq_x when MIN_CC_DP4A <= __CUDA_ARCH__ < CC_TURING
+#define MMQ_Y_Q4_0_PASCAL 64 // mmq_y for the same branch
+#define NWARPS_Q4_0_PASCAL 8 // nwarps for the same branch
+
+template <bool need_check> static __global__ void mul_mat_q4_0( // kernel entry point for q4_0: selects tile parameters by architecture and forwards to mul_mat_q
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+ const int mmq_x = MMQ_X_Q4_0_AMPERE;
+ const int mmq_y = MMQ_Y_Q4_0_AMPERE;
+ const int nwarps = NWARPS_Q4_0_AMPERE;
+
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>, // the literal true is the need_sum template argument
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> // need_check is forwarded only to the tile loader
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+ const int mmq_x = MMQ_X_Q4_0_PASCAL;
+ const int mmq_y = MMQ_Y_Q4_0_PASCAL;
+ const int nwarps = NWARPS_Q4_0_PASCAL;
+
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>, // same instantiation as above with the PASCAL tile parameters
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+ (void) vec_dot_q4_0_q8_1_mul_mat; // references the otherwise unused function; presumably to silence an unused-function warning
+ assert(false); // no implementation is provided for architectures below MIN_CC_DP4A
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q4_1_AMPERE 64 // mmq_x passed to mul_mat_q when __CUDA_ARCH__ >= CC_TURING (despite the AMPERE suffix)
+#define MMQ_Y_Q4_1_AMPERE 128 // mmq_y for the same branch
+#define NWARPS_Q4_1_AMPERE 4 // nwarps for the same branch
+#define MMQ_X_Q4_1_PASCAL 64 // mmq_x when MIN_CC_DP4A <= __CUDA_ARCH__ < CC_TURING
+#define MMQ_Y_Q4_1_PASCAL 64 // mmq_y for the same branch
+#define NWARPS_Q4_1_PASCAL 8 // nwarps for the same branch; also used in the launch bounds below
+
+template <bool need_check> static __global__ void // kernel entry point for q4_1: selects tile parameters by architecture and forwards to mul_mat_q
+#if __CUDA_ARCH__ < CC_TURING
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2) // pre-Turing only: at most WARP_SIZE*NWARPS_Q4_1_PASCAL threads per block, and request at least 2 resident blocks per multiprocessor
+#endif // __CUDA_ARCH__ < CC_TURING
+ mul_mat_q4_1(
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+ const int mmq_x = MMQ_X_Q4_1_AMPERE;
+ const int mmq_y = MMQ_Y_Q4_1_AMPERE;
+ const int nwarps = NWARPS_Q4_1_AMPERE;
+
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>, // the literal true is the need_sum template argument
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> // need_check is forwarded only to the tile loader
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+ const int mmq_x = MMQ_X_Q4_1_PASCAL;
+ const int mmq_y = MMQ_Y_Q4_1_PASCAL;
+ const int nwarps = NWARPS_Q4_1_PASCAL;
+
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>, // same instantiation as above with the PASCAL tile parameters
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+ (void) vec_dot_q4_1_q8_1_mul_mat; // references the otherwise unused function; presumably to silence an unused-function warning
+ assert(false); // no implementation is provided for architectures below MIN_CC_DP4A
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q5_0_AMPERE 128 // mmq_x passed to mul_mat_q when __CUDA_ARCH__ >= CC_TURING (despite the AMPERE suffix)
+#define MMQ_Y_Q5_0_AMPERE 64 // mmq_y for the same branch
+#define NWARPS_Q5_0_AMPERE 4 // nwarps for the same branch
+#define MMQ_X_Q5_0_PASCAL 64 // mmq_x when MIN_CC_DP4A <= __CUDA_ARCH__ < CC_TURING
+#define MMQ_Y_Q5_0_PASCAL 64 // mmq_y for the same branch
+#define NWARPS_Q5_0_PASCAL 8 // nwarps for the same branch
+
+template <bool need_check> static __global__ void mul_mat_q5_0( // kernel entry point for q5_0: selects tile parameters by architecture and forwards to mul_mat_q
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+ const int mmq_x = MMQ_X_Q5_0_AMPERE;
+ const int mmq_y = MMQ_Y_Q5_0_AMPERE;
+ const int nwarps = NWARPS_Q5_0_AMPERE;
+
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>, // the literal false is the need_sum template argument
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> // need_check is forwarded only to the tile loader
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+ const int mmq_x = MMQ_X_Q5_0_PASCAL;
+ const int mmq_y = MMQ_Y_Q5_0_PASCAL;
+ const int nwarps = NWARPS_Q5_0_PASCAL;
+
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>, // same instantiation as above with the PASCAL tile parameters
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+ (void) vec_dot_q5_0_q8_1_mul_mat; // references the otherwise unused function; presumably to silence an unused-function warning
+ assert(false); // no implementation is provided for architectures below MIN_CC_DP4A
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q5_1_AMPERE 128 // mmq_x passed to mul_mat_q when __CUDA_ARCH__ >= CC_TURING (despite the AMPERE suffix)
+#define MMQ_Y_Q5_1_AMPERE 64 // mmq_y for the same branch
+#define NWARPS_Q5_1_AMPERE 4 // nwarps for the same branch
+#define MMQ_X_Q5_1_PASCAL 64 // mmq_x when MIN_CC_DP4A <= __CUDA_ARCH__ < CC_TURING
+#define MMQ_Y_Q5_1_PASCAL 64 // mmq_y for the same branch
+#define NWARPS_Q5_1_PASCAL 8 // nwarps for the same branch
+
+template <bool need_check> static __global__ void mul_mat_q5_1( // kernel entry point for q5_1: selects tile parameters by architecture and forwards to mul_mat_q
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+ const int mmq_x = MMQ_X_Q5_1_AMPERE;
+ const int mmq_y = MMQ_Y_Q5_1_AMPERE;
+ const int nwarps = NWARPS_Q5_1_AMPERE;
+
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>, // the literal true is the need_sum template argument
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> // need_check is forwarded only to the tile loader
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+ const int mmq_x = MMQ_X_Q5_1_PASCAL;
+ const int mmq_y = MMQ_Y_Q5_1_PASCAL;
+ const int nwarps = NWARPS_Q5_1_PASCAL;
+
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>, // same instantiation as above with the PASCAL tile parameters
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+ (void) vec_dot_q5_1_q8_1_mul_mat; // references the otherwise unused function; presumably to silence an unused-function warning
+ assert(false); // no implementation is provided for architectures below MIN_CC_DP4A
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q8_0_AMPERE 128 // mmq_x passed to mul_mat_q when __CUDA_ARCH__ >= CC_TURING (despite the AMPERE suffix)
+#define MMQ_Y_Q8_0_AMPERE 64 // mmq_y for the same branch
+#define NWARPS_Q8_0_AMPERE 4 // nwarps for the same branch
+#define MMQ_X_Q8_0_PASCAL 64 // mmq_x when MIN_CC_DP4A <= __CUDA_ARCH__ < CC_TURING
+#define MMQ_Y_Q8_0_PASCAL 64 // mmq_y for the same branch
+#define NWARPS_Q8_0_PASCAL 8 // nwarps for the same branch
+
+template <bool need_check> static __global__ void mul_mat_q8_0( // kernel entry point for q8_0: selects tile parameters by architecture and forwards to mul_mat_q
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+ const int mmq_x = MMQ_X_Q8_0_AMPERE;
+ const int mmq_y = MMQ_Y_Q8_0_AMPERE;
+ const int nwarps = NWARPS_Q8_0_AMPERE;
+
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>, // the literal false is the need_sum template argument
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> // need_check is forwarded only to the tile loader
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+ const int mmq_x = MMQ_X_Q8_0_PASCAL;
+ const int mmq_y = MMQ_Y_Q8_0_PASCAL;
+ const int nwarps = NWARPS_Q8_0_PASCAL;
+
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>, // same instantiation as above with the PASCAL tile parameters
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+ (void) vec_dot_q8_0_q8_1_mul_mat; // references the otherwise unused function; presumably to silence an unused-function warning
+ assert(false); // no implementation is provided for architectures below MIN_CC_DP4A
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q2_K_AMPERE 64 // mmq_x passed to mul_mat_q when __CUDA_ARCH__ >= CC_TURING (despite the AMPERE suffix)
+#define MMQ_Y_Q2_K_AMPERE 128 // mmq_y for the same branch
+#define NWARPS_Q2_K_AMPERE 4 // nwarps for the same branch
+#define MMQ_X_Q2_K_PASCAL 64 // mmq_x when MIN_CC_DP4A <= __CUDA_ARCH__ < CC_TURING
+#define MMQ_Y_Q2_K_PASCAL 64 // mmq_y for the same branch
+#define NWARPS_Q2_K_PASCAL 8 // nwarps for the same branch
+
+template <bool need_check> static __global__ void mul_mat_q2_K( // kernel entry point for q2_K: selects tile parameters by architecture and forwards to mul_mat_q
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+ const int mmq_x = MMQ_X_Q2_K_AMPERE;
+ const int mmq_y = MMQ_Y_Q2_K_AMPERE;
+ const int nwarps = NWARPS_Q2_K_AMPERE;
+
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>, // the literal false is the need_sum template argument
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> // need_check is forwarded only to the tile loader
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+ const int mmq_x = MMQ_X_Q2_K_PASCAL;
+ const int mmq_y = MMQ_Y_Q2_K_PASCAL;
+ const int nwarps = NWARPS_Q2_K_PASCAL;
+
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>, // same instantiation as above with the PASCAL tile parameters
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+ (void) vec_dot_q2_K_q8_1_mul_mat; // references the otherwise unused function; presumably to silence an unused-function warning
+ assert(false); // no implementation is provided for architectures below MIN_CC_DP4A
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q3_K_AMPERE 128 // ncols_y covered per thread block when compute capability >= CC_TURING (launcher grid y = ceil(ncols_y / mmq_x))
+#define MMQ_Y_Q3_K_AMPERE 128 // nrows_x covered per thread block when compute capability >= CC_TURING (launcher grid x = ceil(nrows_x / mmq_y))
+#define NWARPS_Q3_K_AMPERE 4 // block_dims.y used by the launcher when compute capability >= CC_TURING
+#define MMQ_X_Q3_K_PASCAL 64 // the same three parameters for MIN_CC_DP4A <= compute capability < CC_TURING
+#define MMQ_Y_Q3_K_PASCAL 64
+#define NWARPS_Q3_K_PASCAL 8
+// q3_K entry kernel: forwards its arguments to mul_mat_q, instantiated with the tile configuration of the architecture being compiled for
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+ __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2) // at most WARP_SIZE*NWARPS_Q3_K_PASCAL threads per block, at least 2 resident blocks per multiprocessor
+#endif // __CUDA_ARCH__ < CC_TURING
+ mul_mat_q3_K(
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+ // need_check is forwarded to load_tiles_q3_K; the launcher sets it to true when nrows_x is not a multiple of mmq_y
+#if __CUDA_ARCH__ >= CC_TURING
+ const int mmq_x = MMQ_X_Q3_K_AMPERE;
+ const int mmq_y = MMQ_Y_Q3_K_AMPERE;
+ const int nwarps = NWARPS_Q3_K_AMPERE;
+ // these constants must match the values the host-side launcher uses for the same compute capability
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+ const int mmq_x = MMQ_X_Q3_K_PASCAL;
+ const int mmq_y = MMQ_Y_Q3_K_PASCAL;
+ const int nwarps = NWARPS_Q3_K_PASCAL;
+
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+ (void) vec_dot_q3_K_q8_1_mul_mat; // keeps the symbol referenced in this branch; presumably silences an unused-function warning
+ assert(false); // this kernel has no implementation for architectures below MIN_CC_DP4A
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q4_K_AMPERE 64 // ncols_y covered per thread block when compute capability >= CC_TURING (launcher grid y = ceil(ncols_y / mmq_x))
+#define MMQ_Y_Q4_K_AMPERE 128 // nrows_x covered per thread block when compute capability >= CC_TURING (launcher grid x = ceil(nrows_x / mmq_y))
+#define NWARPS_Q4_K_AMPERE 4 // block_dims.y used by the launcher when compute capability >= CC_TURING
+#define MMQ_X_Q4_K_PASCAL 64 // the same three parameters for MIN_CC_DP4A <= compute capability < CC_TURING
+#define MMQ_Y_Q4_K_PASCAL 64
+#define NWARPS_Q4_K_PASCAL 8
+// q4_K entry kernel: forwards its arguments to mul_mat_q, instantiated with the tile configuration of the architecture being compiled for
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2) // at most WARP_SIZE*NWARPS_Q4_K_PASCAL threads per block, at least 2 resident blocks per multiprocessor
+#endif // __CUDA_ARCH__ < CC_TURING
+ mul_mat_q4_K(
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+ // need_check is forwarded to load_tiles_q4_K; the launcher sets it to true when nrows_x is not a multiple of mmq_y
+#if __CUDA_ARCH__ >= CC_TURING
+ const int mmq_x = MMQ_X_Q4_K_AMPERE;
+ const int mmq_y = MMQ_Y_Q4_K_AMPERE;
+ const int nwarps = NWARPS_Q4_K_AMPERE;
+ // these constants must match the values the host-side launcher uses for the same compute capability
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+ const int mmq_x = MMQ_X_Q4_K_PASCAL;
+ const int mmq_y = MMQ_Y_Q4_K_PASCAL;
+ const int nwarps = NWARPS_Q4_K_PASCAL;
+
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+ (void) vec_dot_q4_K_q8_1_mul_mat; // keeps the symbol referenced in this branch; presumably silences an unused-function warning
+ assert(false); // this kernel has no implementation for architectures below MIN_CC_DP4A
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q5_K_AMPERE 64 // ncols_y covered per thread block when compute capability >= CC_TURING (launcher grid y = ceil(ncols_y / mmq_x))
+#define MMQ_Y_Q5_K_AMPERE 128 // nrows_x covered per thread block when compute capability >= CC_TURING (launcher grid x = ceil(nrows_x / mmq_y))
+#define NWARPS_Q5_K_AMPERE 4 // block_dims.y used by the launcher when compute capability >= CC_TURING
+#define MMQ_X_Q5_K_PASCAL 64 // the same three parameters for MIN_CC_DP4A <= compute capability < CC_TURING
+#define MMQ_Y_Q5_K_PASCAL 64
+#define NWARPS_Q5_K_PASCAL 8
+// q5_K entry kernel: forwards its arguments to mul_mat_q, instantiated with the tile configuration of the architecture being compiled for
+template <bool need_check> static __global__ void mul_mat_q5_K(
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+ // need_check is forwarded to load_tiles_q5_K; the launcher sets it to true when nrows_x is not a multiple of mmq_y
+#if __CUDA_ARCH__ >= CC_TURING
+ const int mmq_x = MMQ_X_Q5_K_AMPERE;
+ const int mmq_y = MMQ_Y_Q5_K_AMPERE;
+ const int nwarps = NWARPS_Q5_K_AMPERE;
+ // these constants must match the values the host-side launcher uses for the same compute capability
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+ const int mmq_x = MMQ_X_Q5_K_PASCAL;
+ const int mmq_y = MMQ_Y_Q5_K_PASCAL;
+ const int nwarps = NWARPS_Q5_K_PASCAL;
+
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+ (void) vec_dot_q5_K_q8_1_mul_mat; // keeps the symbol referenced in this branch; presumably silences an unused-function warning
+ assert(false); // this kernel has no implementation for architectures below MIN_CC_DP4A
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q6_K_AMPERE 64 // ncols_y covered per thread block when compute capability >= CC_TURING (launcher grid y = ceil(ncols_y / mmq_x))
+#define MMQ_Y_Q6_K_AMPERE 64 // nrows_x covered per thread block when compute capability >= CC_TURING (launcher grid x = ceil(nrows_x / mmq_y))
+#define NWARPS_Q6_K_AMPERE 4 // block_dims.y used by the launcher when compute capability >= CC_TURING
+#define MMQ_X_Q6_K_PASCAL 64 // the same three parameters for MIN_CC_DP4A <= compute capability < CC_TURING
+#define MMQ_Y_Q6_K_PASCAL 64
+#define NWARPS_Q6_K_PASCAL 8
+// q6_K entry kernel: forwards its arguments to mul_mat_q, instantiated with the tile configuration of the architecture being compiled for
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+ __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2) // at most WARP_SIZE*NWARPS_Q6_K_PASCAL threads per block, at least 2 resident blocks per multiprocessor
+#endif // __CUDA_ARCH__ < CC_TURING
+ mul_mat_q6_K(
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+ // need_check is forwarded to load_tiles_q6_K; the launcher sets it to true when nrows_x is not a multiple of mmq_y
+#if __CUDA_ARCH__ >= CC_TURING
+ const int mmq_x = MMQ_X_Q6_K_AMPERE;
+ const int mmq_y = MMQ_Y_Q6_K_AMPERE;
+ const int nwarps = NWARPS_Q6_K_AMPERE;
+ // these constants must match the values the host-side launcher uses for the same compute capability
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+ const int mmq_x = MMQ_X_Q6_K_PASCAL;
+ const int mmq_y = MMQ_Y_Q6_K_PASCAL;
+ const int nwarps = NWARPS_Q6_K_PASCAL;
+
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+ (void) vec_dot_q6_K_q8_1_mul_mat; // keeps the symbol referenced in this branch; presumably silences an unused-function warning
+ assert(false); // this kernel has no implementation for architectures below MIN_CC_DP4A
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
const int row = blockIdx.y*blockDim.y + threadIdx.y;
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ int mmq_x, mmq_y, nwarps;
+ if (compute_capability >= CC_TURING) {
+ mmq_x = MMQ_X_Q4_0_AMPERE;
+ mmq_y = MMQ_Y_Q4_0_AMPERE;
+ nwarps = NWARPS_Q4_0_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q4_0_PASCAL;
+ mmq_y = MMQ_Y_Q4_0_PASCAL;
+ nwarps = NWARPS_Q4_0_PASCAL;
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<false>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
} else {
- mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<true>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const bool need_check = true;
+ mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}
}
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ int mmq_x, mmq_y, nwarps;
+ if (compute_capability >= CC_TURING) {
+ mmq_x = MMQ_X_Q4_1_AMPERE;
+ mmq_y = MMQ_Y_Q4_1_AMPERE;
+ nwarps = NWARPS_Q4_1_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q4_1_PASCAL;
+ mmq_y = MMQ_Y_Q4_1_PASCAL;
+ nwarps = NWARPS_Q4_1_PASCAL;
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<false>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
} else {
- mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<true>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const bool need_check = true;
+ mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}
}
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ int mmq_x, mmq_y, nwarps;
+ if (compute_capability >= CC_TURING) {
+ mmq_x = MMQ_X_Q5_0_AMPERE;
+ mmq_y = MMQ_Y_Q5_0_AMPERE;
+ nwarps = NWARPS_Q5_0_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q5_0_PASCAL;
+ mmq_y = MMQ_Y_Q5_0_PASCAL;
+ nwarps = NWARPS_Q5_0_PASCAL;
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<false>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
} else {
- mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<true>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const bool need_check = true;
+ mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}
}
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ int mmq_x, mmq_y, nwarps;
+ if (compute_capability >= CC_TURING) {
+ mmq_x = MMQ_X_Q5_1_AMPERE;
+ mmq_y = MMQ_Y_Q5_1_AMPERE;
+ nwarps = NWARPS_Q5_1_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q5_1_PASCAL;
+ mmq_y = MMQ_Y_Q5_1_PASCAL;
+ nwarps = NWARPS_Q5_1_PASCAL;
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<false>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
} else {
- mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<true>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const bool need_check = true;
+ mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}
}
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ int mmq_x, mmq_y, nwarps;
+ if (compute_capability >= CC_TURING) {
+ mmq_x = MMQ_X_Q8_0_AMPERE;
+ mmq_y = MMQ_Y_Q8_0_AMPERE;
+ nwarps = NWARPS_Q8_0_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q8_0_PASCAL;
+ mmq_y = MMQ_Y_Q8_0_PASCAL;
+ nwarps = NWARPS_Q8_0_PASCAL;
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<false>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
} else {
- mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<true>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const bool need_check = true;
+ mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}
}
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ int mmq_x, mmq_y, nwarps;
+ if (compute_capability >= CC_TURING) {
+ mmq_x = MMQ_X_Q2_K_AMPERE;
+ mmq_y = MMQ_Y_Q2_K_AMPERE;
+ nwarps = NWARPS_Q2_K_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q2_K_PASCAL;
+ mmq_y = MMQ_Y_Q2_K_PASCAL;
+ nwarps = NWARPS_Q2_K_PASCAL;
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<false>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
} else {
- mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<true>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const bool need_check = true;
+ mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}
}
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ int mmq_x, mmq_y, nwarps;
+ if (compute_capability >= CC_TURING) {
+ mmq_x = MMQ_X_Q3_K_AMPERE;
+ mmq_y = MMQ_Y_Q3_K_AMPERE;
+ nwarps = NWARPS_Q3_K_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q3_K_PASCAL;
+ mmq_y = MMQ_Y_Q3_K_PASCAL;
+ nwarps = NWARPS_Q3_K_PASCAL;
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<false>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
} else {
- mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<true>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const bool need_check = true;
+ mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}
}
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ int mmq_x, mmq_y, nwarps;
+ if (compute_capability >= CC_TURING) {
+ mmq_x = MMQ_X_Q4_K_AMPERE;
+ mmq_y = MMQ_Y_Q4_K_AMPERE;
+ nwarps = NWARPS_Q4_K_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q4_K_PASCAL;
+ mmq_y = MMQ_Y_Q4_K_PASCAL;
+ nwarps = NWARPS_Q4_K_PASCAL;
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<false>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
} else {
- mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<true>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const bool need_check = true;
+ mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}
}
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ int mmq_x, mmq_y, nwarps;
+ if (compute_capability >= CC_TURING) {
+ mmq_x = MMQ_X_Q5_K_AMPERE;
+ mmq_y = MMQ_Y_Q5_K_AMPERE;
+ nwarps = NWARPS_Q5_K_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q5_K_PASCAL;
+ mmq_y = MMQ_Y_Q5_K_PASCAL;
+ nwarps = NWARPS_Q5_K_PASCAL;
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<false>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
} else {
- mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<true>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const bool need_check = true;
+ mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}
}
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ int mmq_x, mmq_y, nwarps;
+ if (compute_capability >= CC_TURING) {
+ mmq_x = MMQ_X_Q6_K_AMPERE;
+ mmq_y = MMQ_Y_Q6_K_AMPERE;
+ nwarps = NWARPS_Q6_K_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q6_K_PASCAL;
+ mmq_y = MMQ_Y_Q6_K_PASCAL;
+ nwarps = NWARPS_Q6_K_PASCAL;
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<false>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
} else {
- mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<true>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const bool need_check = true;
+ mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}
}
}
-static void * g_scratch_buffer = nullptr;
-static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
-static size_t g_scratch_offset = 0;
-
-static int g_device_count = -1;
-static int g_main_device = 0;
-static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
-static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
-static bool g_mul_mat_q = false;
-
-static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
-
-static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
-
void ggml_init_cublas() {
static bool initialized = false;
(void) i1;
}
+// Returns the row granularity to which split boundaries of a tensor of the
+// given type are rounded down when the tensor is divided across devices.
+//
+// The value depends on the highest compute capability among the devices that
+// receive a non-empty share of the split. For the k-quant types the returned
+// values equal the MMQ_Y_* tile heights of the corresponding mul_mat_q kernels.
+// Aborts via GGML_ASSERT for any type not listed below.
+static int64_t get_row_rounding(ggml_type type) {
+    int best_cc = INT_MIN;
+    for (int dev = 0; dev < g_device_count; ++dev) {
+        if (g_compute_capabilities[dev] <= best_cc) {
+            continue; // cannot raise the maximum
+        }
+
+        // g_tensor_split[dev] is where this device's share starts; the share
+        // ends where the next device's starts, or at 1.0 for the last device
+        const bool  is_last   = dev + 1 >= g_device_count;
+        const float split_end = is_last ? 1.0f : g_tensor_split[dev + 1];
+        if (!(g_tensor_split[dev] < split_end)) {
+            continue; // empty share: this device gets no rows
+        }
+
+        best_cc = g_compute_capabilities[dev];
+    }
+
+    // granularity for the types whose tile height differs between architectures
+    const int64_t arch_dependent = best_cc >= CC_TURING ? 128 : 64;
+
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+            return arch_dependent;
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q6_K:
+            return 64;
+        case GGML_TYPE_F16:
+            return 1;
+        default:
+            GGML_ASSERT(false);
+    }
+}
+
+
inline void ggml_cuda_op_mul_mat_vec(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
int64_t row_low, row_high;
if (split) {
+ const int64_t rounding = get_row_rounding(src0->type);
+
row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
- row_low -= row_low % GGML_CUDA_MMQ_Y;
+ row_low -= row_low % rounding;
if (id == g_device_count - 1) {
row_high = nrows0;
} else {
row_high = nrows0*g_tensor_split[id + 1];
- row_high -= row_high % GGML_CUDA_MMQ_Y;
+ row_high -= row_high % rounding;
}
} else {
row_low = 0;
row_low = 0;
row_high = nrows;
} else if (backend == GGML_BACKEND_GPU_SPLIT) {
+ const int64_t rounding = get_row_rounding(tensor->type);
+
row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
- row_low -= row_low % GGML_CUDA_MMQ_Y;
+ row_low -= row_low % rounding;
if (id == g_device_count - 1) {
row_high = nrows;
} else {
row_high = nrows*g_tensor_split[id + 1];
- row_high -= row_high % GGML_CUDA_MMQ_Y;
+ row_high -= row_high % rounding;
}
} else {
GGML_ASSERT(false);
func(tensor->src[0], tensor->src[1], tensor);
return true;
}
+
+// Returns the number of devices reported by cudaGetDeviceCount().
+// A failing CUDA call is handled by CUDA_CHECK.
+int ggml_cuda_get_device_count() {
+    int n_devices;
+    CUDA_CHECK(cudaGetDeviceCount(&n_devices));
+    return n_devices;
+}
+
+// Writes the name of CUDA device `device` (cudaDeviceProp::name) into
+// `description`. At most `description_size` bytes are written, including the
+// terminating NUL, so longer names are truncated.
+void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
+    cudaDeviceProp device_props;
+    CUDA_CHECK(cudaGetDeviceProperties(&device_props, device));
+    snprintf(description, description_size, "%s", device_props.name);
+}
#define GGML_CUDA_MAX_DEVICES 16
-void ggml_init_cublas(void);
-void ggml_cuda_set_tensor_split(const float * tensor_split);
-
-void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
-
-// TODO: export these with GGML_API
-void * ggml_cuda_host_malloc(size_t size);
-void ggml_cuda_host_free(void * ptr);
-
-void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
-
-void ggml_cuda_free_data(struct ggml_tensor * tensor);
-void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
-void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
-void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
-void ggml_cuda_set_main_device(int main_device);
-void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
-void ggml_cuda_set_scratch_size(size_t scratch_size);
-void ggml_cuda_free_scratch(void);
-bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+GGML_API void ggml_init_cublas(void);
+GGML_API void * ggml_cuda_host_malloc(size_t size);
+GGML_API void ggml_cuda_host_free(void * ptr);
+
+GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split);
+GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
+GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor);
+GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
+GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
+GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
+GGML_API void ggml_cuda_set_main_device(int main_device);
+GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
+GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size);
+GGML_API void ggml_cuda_free_scratch(void);
+GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+
+GGML_API int ggml_cuda_get_device_count(void); // number of devices as reported by cudaGetDeviceCount()
+GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size); // copies the device name (cudaDeviceProp::name) into description, truncated to description_size bytes
#ifdef __cplusplus
}
struct ggml_metal_context * ggml_metal_init(int n_cb);
void ggml_metal_free(struct ggml_metal_context * ctx);
+void * ggml_metal_host_malloc(size_t n);
+void ggml_metal_host_free (void * data);
+
// set the number of command buffers to use
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
// try to find operations that can be run concurrently in the graph
// you should run it again if the topology of your graph changes
-void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
+
+// returns the length of the concur_list if the graph has been optimized for concurrent dispatch (0 otherwise)
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
-// if the graph has been optimized for concurrently dispatch
-bool ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+// returns the concur_list so that it can be passed to ggml_alloc
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
// same as ggml_graph_compute but uses Metal
// creates gf->n_threads command buffers in parallel
#import <Foundation/Foundation.h>
#import <Metal/Metal.h>
-#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
#undef MIN
#undef MAX
GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
+ GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
+ GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
+ GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
+ GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32);
+ GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32);
+ GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32);
+ GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32);
+ GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32);
GGML_METAL_DECL_KERNEL(rope);
GGML_METAL_DECL_KERNEL(alibi_f32);
GGML_METAL_DECL_KERNEL(cpy_f32_f16);
ctx->n_buffers = 0;
ctx->concur_list_len = 0;
- // determine if we can use MPS
- if (MPSSupportsMTLDevice(ctx->device)) {
- fprintf(stderr, "%s: using MPS\n", __func__);
- } else {
- fprintf(stderr, "%s: not using MPS\n", __func__);
- GGML_ASSERT(false && "MPS not supported");
- }
#if 0
// compile from source string and show compile log
ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
if (error) {
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
- exit(1);
+ return NULL;
}
}
#else
NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
if (error) {
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
- exit(1);
+ return NULL;
}
#ifdef GGML_QKK_64
#endif
if (error) {
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
- exit(1);
+ return NULL;
}
}
#endif
// load kernels
{
+ NSError * error = nil;
#define GGML_METAL_ADD_KERNEL(name) \
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
- ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:nil]; \
- fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name);
+ ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
+ fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); \
+ if (error) { \
+ fprintf(stderr, "%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
+ return NULL; \
+ }
GGML_METAL_ADD_KERNEL(add);
GGML_METAL_ADD_KERNEL(add_row);
GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
+ GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
+ GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
+ GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
+ GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
+ GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
+ GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
+ GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
+ GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
GGML_METAL_ADD_KERNEL(rope);
GGML_METAL_ADD_KERNEL(alibi_f32);
GGML_METAL_ADD_KERNEL(cpy_f32_f16);
free(ctx);
}
+void * ggml_metal_host_malloc(size_t n) { // allocates n bytes aligned to the system page size; returns NULL on failure
+ void * data = NULL;
+ const int result = posix_memalign((void **) &data, getpagesize(), n); // alignment is the page size reported by getpagesize()
+ if (result != 0) { // posix_memalign signals failure through a non-zero return code
+ fprintf(stderr, "%s: error: posix_memalign failed\n", __func__);
+ return NULL;
+ }
+
+ return data; // ggml_metal_host_free() releases this pointer with free()
+}
+
+void ggml_metal_host_free(void * data) { // releases data with free(); presumably the counterpart of ggml_metal_host_malloc()
+ free(data);
+}
+
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
ctx->n_cb = n_cb;
}
-bool ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
- if (ctx->concur_list_len) {
- return true;
- }
- return false;
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx) { // returns ctx->concur_list_len; the previous bool version treated any non-zero length as "optimized"
+ return ctx->concur_list_len;
+}
+
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) { // returns ctx->concur_list directly (no copy is made)
+ return ctx->concur_list;
}
// finds the Metal buffer that contains the tensor data on the GPU device
void ggml_metal_graph_find_concurrency(
struct ggml_metal_context * ctx,
- struct ggml_cgraph * gf) {
+ struct ggml_cgraph * gf, bool check_mem) {
int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
int nodes_unused[GGML_MAX_CONCUR];
}
}
}
- if (exe_flag) {
+ if (exe_flag && check_mem) {
// check if nodes[i]'s data will be overwritten by a node before nodes[i].
// if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
int64_t data_start = (int64_t) gf->nodes[i]->data;
id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
- id<MTLComputeCommandEncoder> encoder = nil;
+ id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
const int node_start = (cb_idx + 0) * n_nodes_per_cb;
const int node_end = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb;
const int i = has_concur ? ctx->concur_list[ind] : ind;
if (i == -1) {
- if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
- continue;
- }
[encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
continue;
}
} break;
case GGML_OP_ADD:
{
- if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
- }
-
if (ggml_nelements(src1) == ne10) {
// src1 is a row
[encoder setComputePipelineState:ctx->pipeline_add_row];
} break;
case GGML_OP_MUL:
{
- if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
- }
-
if (ggml_nelements(src1) == ne10) {
// src1 is a row
[encoder setComputePipelineState:ctx->pipeline_mul_row];
} break;
case GGML_OP_SCALE:
{
- if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
- }
-
const float scale = *(const float *) src1->data;
[encoder setComputePipelineState:ctx->pipeline_scale];
switch (ggml_get_unary_op(gf->nodes[i])) {
case GGML_UNARY_OP_SILU:
{
- if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
- }
-
[encoder setComputePipelineState:ctx->pipeline_silu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
} break;
case GGML_UNARY_OP_RELU:
{
- if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
- }
-
[encoder setComputePipelineState:ctx->pipeline_relu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
} break;
case GGML_UNARY_OP_GELU:
{
- if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
- }
-
[encoder setComputePipelineState:ctx->pipeline_gelu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
} break;
case GGML_OP_SOFT_MAX:
{
- if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
- }
-
const int nth = 32;
[encoder setComputePipelineState:ctx->pipeline_soft_max];
} break;
case GGML_OP_DIAG_MASK_INF:
{
- if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
- }
-
const int n_past = ((int32_t *)(dst->op_params))[0];
[encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
GGML_ASSERT(ne00 == ne10);
// GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
+ uint gqa = ne12/ne02;
GGML_ASSERT(ne03 == ne13);
+ // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs;
+ // AMD GPUs and older A-series chips fall back to the matrix-vector multiplication kernel
if (ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) &&
- (src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) && ne11 > 1) {
-
- if (encoder != nil) {
- [encoder endEncoding];
- encoder = nil;
- }
-
- MPSDataType src0dt = src0t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;
- MPSDataType src1dt = src1t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;
-
- // for F32 x F32 we use MPS
- MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor
- matrixDescriptorWithRows:ne01 columns:ne00 rowBytes:src0->nb[1] dataType:src0dt];
-
- MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor
- matrixDescriptorWithRows:ne11 columns:ne10 rowBytes:src1->nb[1] dataType:src1dt];
-
- MPSMatrixDescriptor * desc = [MPSMatrixDescriptor
- matrixDescriptorWithRows:ne1 columns:ne0 rowBytes:dst->nb[1] dataType:MPSDataTypeFloat32];
-
- MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc]
- initWithDevice:ctx->device transposeLeft:false transposeRight:true
- resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
-
- // we need to do ne12 multiplications
- // TODO: is there a way to do this in parallel - currently very slow ..
- // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
- for (int64_t i02 = 0; i02 < ne12; ++i02) {
- size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
- size_t offs_src1_cur = offs_src1 + i02*nb12;
- size_t offs_dst_cur = offs_dst + i02*nb2;
-
- MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0_cur descriptor:desc0];
- MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1_cur descriptor:desc1];
- MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst_cur descriptor:desc ];
-
- [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst];
- }
- } else {
- if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
+ src1t == GGML_TYPE_F32 &&
+ [ctx->device supportsFamily:MTLGPUFamilyApple7] &&
+ ne00%32 == 0 &&
+ ne11 > 1) {
+ switch (src0->type) {
+ case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
+ case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
+ case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
+ case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break;
+ case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break;
+ case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break;
+ case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break;
+ case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break;
+ default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
+ }
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+ [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
+ [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+ [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+ [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
+ [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
+ [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
+ [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:8];
+ [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:9];
+ [encoder setBytes:&gqa length:sizeof(gqa) atIndex:10];
+ [encoder setThreadgroupMemoryLength:8192 atIndex:0];
+ [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
}
-
+ else {
int nth0 = 32;
int nth1 = 1;
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];
+ [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
- [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+ [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_Q3_K) {
#ifdef GGML_QKK_64
- [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+ [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#else
- [encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+ [encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#endif
}
else if (src0t == GGML_TYPE_Q5_K) {
- [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+ [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_Q6_K) {
- [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+ [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} else {
[encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} break;
case GGML_OP_GET_ROWS:
{
- if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
- }
-
switch (src0->type) {
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
} break;
case GGML_OP_RMS_NORM:
{
- if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
- }
-
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
} break;
case GGML_OP_NORM:
{
- if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
- }
-
const float eps = 1e-5f;
const int nth = 256;
} break;
case GGML_OP_ALIBI:
{
- if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
- }
-
GGML_ASSERT((src0t == GGML_TYPE_F32));
const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
} break;
case GGML_OP_ROPE:
{
- if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
- }
-
const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2];
case GGML_OP_CPY:
case GGML_OP_CONT:
{
- if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
- }
-
const int nth = 32;
switch (src0t) {
uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
-static void dequantize_row_q4_0(device const block_q4_0 * x, device float * y, int k) {
- const int qk = QK4_0;
-
- assert(k % qk == 0);
-
- const int nb = k / qk;
-
- for (int i = 0; i < nb; i++) {
- const half d = x[i].d;
-
- for (int j = 0; j < qk/2; ++j) {
- const int x0 = (x[i].qs[j] & 0x0F) - 8;
- const int x1 = (x[i].qs[j] >> 4) - 8;
-
- y[i*qk + j + 0 ] = x0*d;
- y[i*qk + j + qk/2] = x1*d;
- }
- }
-}
-
-static void dequantize_row_q4_1(device const block_q4_1 * x, device float * y, int k) {
- const int qk = QK4_1;
-
- assert(k % qk == 0);
-
- const int nb = k / qk;
-
- for (int i = 0; i < nb; i++) {
- const half d = x[i].d;
- const half m = x[i].m;
-
- for (int j = 0; j < qk/2; ++j) {
- const int x0 = (x[i].qs[j] & 0x0F);
- const int x1 = (x[i].qs[j] >> 4);
-
- y[i*qk + j + 0 ] = x0*d + m;
- y[i*qk + j + qk/2] = x1*d + m;
- }
- }
-}
-
kernel void kernel_add(
device const float * src0,
device const float * src1,
}
}
-kernel void kernel_get_rows_f16(
- device const void * src0,
- device const int * src1,
- device float * dst,
- constant int64_t & ne00,
- constant uint64_t & nb01,
- constant uint64_t & nb1,
- uint tpig[[thread_position_in_grid]]) {
- const int i = tpig;
- const int r = ((device int32_t *) src1)[i];
-
- for (int j = 0; j < ne00; j++) {
- dst[i*nb1 + j] = ((device half *) ((device char *) src0 + r*nb01))[j];
- }
-}
-
-kernel void kernel_get_rows_q4_0(
- device const void * src0,
- device const int * src1,
- device float * dst,
- constant int64_t & ne00,
- constant uint64_t & nb01,
- constant uint64_t & nb1,
- uint tpig[[thread_position_in_grid]]) {
- const int i = tpig;
- const int r = ((device int32_t *) src1)[i];
-
- dequantize_row_q4_0(
- (device const block_q4_0 *) ((device char *) src0 + r*nb01),
- (device float *) ((device char *) dst + i*nb1), ne00);
-}
-
-kernel void kernel_get_rows_q4_1(
- device const void * src0,
- device const int * src1,
- device float * dst,
- constant int64_t & ne00,
- constant uint64_t & nb01,
- constant uint64_t & nb1,
- uint tpig[[thread_position_in_grid]]) {
- const int i = tpig;
- const int r = ((device int32_t *) src1)[i];
-
- dequantize_row_q4_1(
- (device const block_q4_1 *) ((device char *) src0 + r*nb01),
- (device float *) ((device char *) dst + i*nb1), ne00);
-}
-
kernel void kernel_norm(
device const void * src0,
device float * dst,
// N_DST, so this is another explicit assumption of the implementation.
template<typename block_q_type, int nr, int nsg, int nw>
void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst,
- int64_t ne00, int64_t ne10, int64_t ne0, int64_t ne01,
- uint2 tgpig, uint tiisg, uint sgitg) {
+ int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne10, int64_t ne12, int64_t ne0, int64_t ne1, uint gqa,
+ uint3 tgpig, uint tiisg, uint sgitg) {
const int nb = ne00/QK4_0;
const int r0 = tgpig.x;
const int r1 = tgpig.y;
+ const int im = tgpig.z;
const int first_row = (r0 * nsg + sgitg) * nr;
- device const block_q_type * x = (device const block_q_type *) src0 + first_row * nb;
- device const float * y = (device const float *) src1 + r1*ne10;
+ const uint offset0 = first_row * nb + im/gqa*(nb*ne0);
+ device const block_q_type * x = (device const block_q_type *) src0 + offset0;
+ device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
float yl[16]; // src1 vector cache
float sumf[nr]={0.f};
for (int row = 0; row < nr; ++row) {
const float tot = simd_sum(sumf[row]);
if (tiisg == 0 && first_row + row < ne01) {
- dst[r1*ne0 + first_row + row] = tot;
+ dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
}
}
}
device const float * src1,
device float * dst,
constant int64_t & ne00,
- constant int64_t & ne10,
- constant int64_t & ne0,
constant int64_t & ne01[[buffer(4)]],
- uint2 tgpig[[threadgroup_position_in_grid]],
+ constant int64_t & ne02[[buffer(5)]],
+ constant int64_t & ne10[[buffer(9)]],
+ constant int64_t & ne12[[buffer(11)]],
+ constant int64_t & ne0[[buffer(15)]],
+ constant int64_t & ne1[[buffer(16)]],
+ constant uint & gqa[[buffer(17)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
- mul_vec_q_n_f32<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
+ mul_vec_q_n_f32<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
}
kernel void kernel_mul_mat_q4_1_f32(
device const float * src1,
device float * dst,
constant int64_t & ne00,
- constant int64_t & ne10,
- constant int64_t & ne0,
constant int64_t & ne01[[buffer(4)]],
- uint2 tgpig[[threadgroup_position_in_grid]],
+ constant int64_t & ne02[[buffer(5)]],
+ constant int64_t & ne10[[buffer(9)]],
+ constant int64_t & ne12[[buffer(11)]],
+ constant int64_t & ne0[[buffer(15)]],
+ constant int64_t & ne1[[buffer(16)]],
+ constant uint & gqa[[buffer(17)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
- mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
+ mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
}
kernel void kernel_mul_mat_f16_f32(
return r;
}
-//========================================== dequantization =============================
-
-static void dequantize_row_q2_K(device const block_q2_K * x, device float * y, int k) {
- assert(k % QK_K == 0);
- const int nb = k / QK_K;
-
- for (int i = 0; i < nb; i++) {
-
- const float d = x[i].d;
- const float min = x[i].dmin;
-
- device const uint8_t * q = x[i].qs;
-
-#if QK_K == 256
- int is = 0;
- float dl, ml;
- for (int n = 0; n < QK_K; n += 128) {
- int shift = 0;
- for (int j = 0; j < 4; ++j) {
-
- uint8_t sc = x[i].scales[is++];
- dl = d * (sc & 0xF); ml = min * (sc >> 4);
- for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml;
-
- sc = x[i].scales[is++];
- dl = d * (sc & 0xF); ml = min * (sc >> 4);
- for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml;
-
- shift += 2;
- }
- q += 32;
- }
-#else
- float dl1 = d * (x[i].scales[0] & 0xF), ml1 = min * (x[i].scales[0] >> 4);
- float dl2 = d * (x[i].scales[1] & 0xF), ml2 = min * (x[i].scales[1] >> 4);
- float dl3 = d * (x[i].scales[2] & 0xF), ml3 = min * (x[i].scales[2] >> 4);
- float dl4 = d * (x[i].scales[3] & 0xF), ml4 = min * (x[i].scales[3] >> 4);
- for (int l = 0; l < 16; ++l) {
- y[l+ 0] = dl1 * ((q[l] >> 0) & 3) - ml1;
- y[l+16] = dl2 * ((q[l] >> 2) & 3) - ml2;
- y[l+32] = dl3 * ((q[l] >> 4) & 3) - ml3;
- y[l+48] = dl4 * ((q[l] >> 6) & 3) - ml4;
- }
- y += QK_K;
-#endif
-
- }
-}
-
-static void dequantize_row_q3_K(device const block_q3_K * x, device float * y, int k) {
- assert(k % QK_K == 0);
- const int nb = k / QK_K;
-
-#if QK_K == 256
-
- const uint16_t kmask1 = 0x0303;
- const uint16_t kmask2 = 0x0f0f;
-
- uint16_t aux[8];
- thread const int8_t * scales = (thread const int8_t*)aux;
-
- for (int i = 0; i < nb; i++) {
-
- const float d_all = (float)(x[i].d);
-
- device const uint8_t * q = x[i].qs;
- device const uint8_t * h = x[i].hmask;
- uint8_t m = 1;
-
- device const uint16_t * a = (device const uint16_t *)x[i].scales;
- aux[0] = (a[0] & kmask2) | (((a[4] >> 0) & kmask1) << 4);
- aux[1] = (a[1] & kmask2) | (((a[5] >> 0) & kmask1) << 4);
- aux[2] = (a[2] & kmask2) | (((a[4] >> 2) & kmask1) << 4);
- aux[3] = (a[3] & kmask2) | (((a[5] >> 2) & kmask1) << 4);
- aux[4] = ((a[0] >> 4) & kmask2) | (((a[4] >> 4) & kmask1) << 4);
- aux[5] = ((a[1] >> 4) & kmask2) | (((a[5] >> 4) & kmask1) << 4);
- aux[6] = ((a[2] >> 4) & kmask2) | (((a[4] >> 6) & kmask1) << 4);
- aux[7] = ((a[3] >> 4) & kmask2) | (((a[5] >> 6) & kmask1) << 4);
-
- int is = 0;
- float dl;
- for (int n = 0; n < QK_K; n += 128) {
- int shift = 0;
- for (int j = 0; j < 4; ++j) {
-
- dl = d_all * (scales[is++] - 32);
- for (int l = 0; l < 16; ++l) {
- *y++ = dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((h[l+ 0] & m) ? 0 : 4));
- }
-
- dl = d_all * (scales[is++] - 32);
- for (int l = 0; l < 16; ++l) {
- *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3) - ((h[l+16] & m) ? 0 : 4));
- }
-
- shift += 2;
- m <<= 1;
- }
- q += 32;
- }
- }
-#else
- for (int i = 0; i < nb; i++) {
-
- const float d_all = (float)(x[i].d);
-
- device const uint8_t * q = x[i].qs;
- device const uint8_t * hm = x[i].hmask;
-
- const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8);
- const float d2 = d_all * ((x[i].scales[0] >> 4) - 8);
- const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8);
- const float d4 = d_all * ((x[i].scales[1] >> 4) - 8);
-
- for (int l = 0; l < 8; ++l) {
- uint8_t h = hm[l];
- y[l+ 0] = d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((h & 0x01) ? 0 : 4));
- y[l+ 8] = d1 * ((int8_t)((q[l+8] >> 0) & 3) - ((h & 0x02) ? 0 : 4));
- y[l+16] = d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((h & 0x04) ? 0 : 4));
- y[l+24] = d2 * ((int8_t)((q[l+8] >> 2) & 3) - ((h & 0x08) ? 0 : 4));
- y[l+32] = d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((h & 0x10) ? 0 : 4));
- y[l+40] = d3 * ((int8_t)((q[l+8] >> 4) & 3) - ((h & 0x20) ? 0 : 4));
- y[l+48] = d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((h & 0x40) ? 0 : 4));
- y[l+56] = d4 * ((int8_t)((q[l+8] >> 6) & 3) - ((h & 0x80) ? 0 : 4));
- }
- y += QK_K;
- }
-#endif
-
-}
-
-static void dequantize_row_q4_K(device const block_q4_K * x, device float * y, int k) {
- assert(k % QK_K == 0);
- const int nb = k / QK_K;
-
- for (int i = 0; i < nb; i++) {
-
- device const uint8_t * q = x[i].qs;
-
-#if QK_K == 256
- const float d = x[i].d;
- const float min = x[i].dmin;
-
- device const uint8_t * scales = x[i].scales;
-
- int is = 0;
- for (int j = 0; j < QK_K; j += 64) {
- const uchar4 sc = get_scale_min_k4(is, scales);
- const float d1 = d * sc[0]; const float m1 = min * sc[1];
- const float d2 = d * sc[2]; const float m2 = min * sc[3];
- for (int l = 0; l < 32; ++l) *y++ = d1 * (q[l] & 0xF) - m1;
- for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2;
- q += 32; is += 2;
- }
-#else
- device const uint8_t * s = x[i].scales;
- device const half2 * dh = (device const half2 *)x[i].d;
- const float2 d = (float2)dh[0];
- const float d1 = d[0] * (s[0] & 0xF);
- const float d2 = d[0] * (s[1] & 0xF);
- const float m1 = d[1] * (s[0] >> 4);
- const float m2 = d[1] * (s[1] >> 4);
- for (int l = 0; l < 32; ++l) {
- y[l+ 0] = d1 * (q[l] & 0xF) - m1;
- y[l+32] = d2 * (q[l] >> 4) - m2;
- }
- y += QK_K;
-#endif
-
- }
-}
-
-static void dequantize_row_q5_K(device const block_q5_K * x, device float * y, int k) {
- assert(k % QK_K == 0);
- const int nb = k / QK_K;
-
-#if QK_K == 256
- for (int i = 0; i < nb; i++) {
-
- const float d = (float)(x[i].d);
- const float min = (float)(x[i].dmin);
-
- device const uint8_t * ql = x[i].qs;
- device const uint8_t * qh = x[i].qh;
-
- int is = 0;
- uint8_t u1 = 1, u2 = 2;
- for (int j = 0; j < QK_K; j += 64) {
- const uchar4 sc = get_scale_min_k4(is, x[i].scales);
- const float d1 = d * sc[0]; const float m1 = min * sc[1];
- const float d2 = d * sc[2]; const float m2 = min * sc[3];
- for (int l = 0; l < 32; ++l) *y++ = d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1;
- for (int l = 0; l < 32; ++l) *y++ = d2 * ((ql[l] >> 4) + (qh[l] & u2 ? 16 : 0)) - m2;
- ql += 32; is += 2;
- u1 <<= 2; u2 <<= 2;
- }
- }
-#else
- for (int i = 0; i < nb; i++) {
-
- const float d = (float)x[i].d;
-
- device const uint8_t * ql = x[i].qs;
- device const uint8_t * qh = x[i].qh;
- device const int8_t * sc = x[i].scales;
-
- for (int l = 0; l < 8; ++l) {
- y[l+ 0] = d * sc[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16));
- y[l+ 8] = d * sc[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16));
- y[l+16] = d * sc[1] * ((ql[l+16] & 0xF) - (qh[l] & 0x04 ? 0 : 16));
- y[l+24] = d * sc[1] * ((ql[l+24] & 0xF) - (qh[l] & 0x08 ? 0 : 16));
- y[l+32] = d * sc[2] * ((ql[l+ 0] >> 4) - (qh[l] & 0x10 ? 0 : 16));
- y[l+40] = d * sc[2] * ((ql[l+ 8] >> 4) - (qh[l] & 0x20 ? 0 : 16));
- y[l+48] = d * sc[3] * ((ql[l+16] >> 4) - (qh[l] & 0x40 ? 0 : 16));
- y[l+56] = d * sc[3] * ((ql[l+24] >> 4) - (qh[l] & 0x80 ? 0 : 16));
- }
- y += QK_K;
- }
-#endif
-
-}
-
-static void dequantize_row_q6_K(device const block_q6_K * x, device float * y, int k) {
- assert(k % QK_K == 0);
- const int nb = k / QK_K;
-
- for (int i = 0; i < nb; i++) {
-
- device const uint8_t * ql = x[i].ql;
- device const uint8_t * qh = x[i].qh;
- device const int8_t * sc = x[i].scales;
-
- const float d = x[i].d;
-
-#if QK_K == 256
- for (int n = 0; n < QK_K; n += 128) {
- for (int l = 0; l < 32; ++l) {
- int is = l/16;
- const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
- const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
- const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
- const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
- y[l + 0] = d * sc[is + 0] * q1;
- y[l + 32] = d * sc[is + 2] * q2;
- y[l + 64] = d * sc[is + 4] * q3;
- y[l + 96] = d * sc[is + 6] * q4;
- }
- y += 128;
- ql += 64;
- qh += 32;
- sc += 8;
- }
-#else
- for (int l = 0; l < 16; ++l) {
- const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
- const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
- const int8_t q3 = (int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
- const int8_t q4 = (int8_t)((ql[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
- y[l+ 0] = d * sc[0] * q1;
- y[l+16] = d * sc[1] * q2;
- y[l+32] = d * sc[2] * q3;
- y[l+48] = d * sc[3] * q4;
- }
- y += 64;
-#endif
- }
-}
-
-kernel void kernel_get_rows_q2_K(
- device const void * src0,
- device const int * src1,
- device float * dst,
- constant int64_t & ne00,
- constant uint64_t & nb01,
- constant uint64_t & nb1,
- uint tpig[[thread_position_in_grid]]) {
- const int i = tpig;
- const int r = ((device int32_t *) src1)[i];
-
- dequantize_row_q2_K(
- (device const block_q2_K *) ((device char *) src0 + r*nb01),
- (device float *) ((device char *) dst + i*nb1), ne00);
-}
-
-kernel void kernel_get_rows_q3_K(
- device const void * src0,
- device const int * src1,
- device float * dst,
- constant int64_t & ne00,
- constant uint64_t & nb01,
- constant uint64_t & nb1,
- uint tpig[[thread_position_in_grid]]) {
- const int i = tpig;
- const int r = ((device int32_t *) src1)[i];
-
- dequantize_row_q3_K(
- (device const block_q3_K *) ((device char *) src0 + r*nb01),
- (device float *) ((device char *) dst + i*nb1), ne00);
-}
-
-kernel void kernel_get_rows_q4_K(
- device const void * src0,
- device const int * src1,
- device float * dst,
- constant int64_t & ne00,
- constant uint64_t & nb01,
- constant uint64_t & nb1,
- uint tpig[[thread_position_in_grid]]) {
- const int i = tpig;
- const int r = ((device int32_t *) src1)[i];
-
- dequantize_row_q4_K(
- (device const block_q4_K *) ((device char *) src0 + r*nb01),
- (device float *) ((device char *) dst + i*nb1), ne00);
-}
-
-kernel void kernel_get_rows_q5_K(
- device const void * src0,
- device const int * src1,
- device float * dst,
- constant int64_t & ne00,
- constant uint64_t & nb01,
- constant uint64_t & nb1,
- uint tpig[[thread_position_in_grid]]) {
- const int i = tpig;
- const int r = ((device int32_t *) src1)[i];
-
- dequantize_row_q5_K(
- (device const block_q5_K *) ((device char *) src0 + r*nb01),
- (device float *) ((device char *) dst + i*nb1), ne00);
-}
-
-kernel void kernel_get_rows_q6_K(
- device const void * src0,
- device const int * src1,
- device float * dst,
- constant int64_t & ne00,
- constant uint64_t & nb01,
- constant uint64_t & nb1,
- uint tpig[[thread_position_in_grid]]) {
- const int i = tpig;
- const int r = ((device int32_t *) src1)[i];
-
- dequantize_row_q6_K(
- (device const block_q6_K *) ((device char *) src0 + r*nb01),
- (device float *) ((device char *) dst + i*nb1), ne00);
-}
-
//====================================== dot products =========================
kernel void kernel_mul_mat_q2_K_f32(
device const float * src1,
device float * dst,
constant int64_t & ne00,
- constant int64_t & ne10,
- constant int64_t & ne0,
constant int64_t & ne01[[buffer(4)]],
- uint2 tgpig[[threadgroup_position_in_grid]],
+ constant int64_t & ne02[[buffer(5)]],
+ constant int64_t & ne10[[buffer(9)]],
+ constant int64_t & ne12[[buffer(11)]],
+ constant int64_t & ne0[[buffer(15)]],
+ constant int64_t & ne1[[buffer(16)]],
+ constant uint & gqa[[buffer(17)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
const int nb = ne00/QK_K;
const int r0 = tgpig.x;
const int r1 = tgpig.y;
+ const int r2 = tgpig.z;
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
const int ib_row = first_row * nb;
- device const block_q2_K * x = (device const block_q2_K *) src0 + ib_row;
- device const float * y = (device const float *) src1 + r1*ne10;
+ const uint offset0 = r2/gqa*(nb*ne0);
+ device const block_q2_K * x = (device const block_q2_K *) src0 + ib_row + offset0;
+ device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1;
float yl[32];
float sumf[N_DST]={0.f}, all_sum;
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
- dst[r1*ne0 + first_row + row] = all_sum;
+ dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = all_sum;
}
}
}
device const float * src1,
device float * dst,
constant int64_t & ne00,
- constant int64_t & ne10,
- constant int64_t & ne0,
- constant int64_t & ne1,
- uint2 tgpig[[threadgroup_position_in_grid]],
+ constant int64_t & ne01[[buffer(4)]],
+ constant int64_t & ne02[[buffer(5)]],
+ constant int64_t & ne10[[buffer(9)]],
+ constant int64_t & ne12[[buffer(11)]],
+ constant int64_t & ne0[[buffer(15)]],
+ constant int64_t & ne1[[buffer(16)]],
+ constant uint & gqa[[buffer(17)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
+ const int64_t r2 = tgpig.z;
const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2;
-
- device const block_q3_K * x = (device const block_q3_K *) src0 + first_row*nb;
- device const float * yy = (device const float *) src1 + r1*ne10;
+ const uint offset0 = r2/gqa*(nb*ne0);
+ device const block_q3_K * x = (device const block_q3_K *) src0 + first_row*nb + offset0;
+ device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1;
float yl[16];
const float sumf = (sumf1[row] - 32.f*sumf2[row]) / (1 << shift);
const float tot = simd_sum(sumf);
if (tiisg == 0) {
- dst[r1*ne0 + first_row + row] = tot;
+ dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = tot;
}
}
}
device const float * src1,
device float * dst,
constant int64_t & ne00,
- constant int64_t & ne10,
- constant int64_t & ne0,
- constant int64_t & ne1,
- uint2 tgpig[[threadgroup_position_in_grid]],
+ constant int64_t & ne01[[buffer(4)]],
+ constant int64_t & ne02[[buffer(5)]],
+ constant int64_t & ne10[[buffer(9)]],
+ constant int64_t & ne12[[buffer(11)]],
+ constant int64_t & ne0[[buffer(15)]],
+ constant int64_t & ne1[[buffer(16)]],
+ constant uint & gqa[[buffer(17)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
+ const int64_t r2 = tgpig.z;
const int row = 2 * r0 + sgitg;
-
- device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb;
- device const float * yy = (device const float *) src1 + r1*ne10;
+ const uint offset0 = r2/gqa*(nb*ne0);
+ device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb + offset0;
+ device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1;
const int ix = tiisg/4;
const int il = 4 * (tiisg%4);// 0, 4, 8, 12
const int im = il/8; // 0, 0, 1, 1
const float tot = simd_sum(sumf);
if (tiisg == 0) {
- dst[r1*ne0 + row] = tot;
+ dst[r1*ne0 + r2*ne0*ne1 + row] = tot;
}
}
device const float * src1,
device float * dst,
constant int64_t & ne00,
- constant int64_t & ne10,
- constant int64_t & ne0,
constant int64_t & ne01[[buffer(4)]],
- uint2 tgpig[[threadgroup_position_in_grid]],
+ constant int64_t & ne02[[buffer(5)]],
+ constant int64_t & ne10[[buffer(9)]],
+ constant int64_t & ne12[[buffer(11)]],
+ constant int64_t & ne0[[buffer(15)]],
+ constant int64_t & ne1[[buffer(16)]],
+ constant uint & gqa[[buffer(17)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
const int nb = ne00/QK_K;
const int r0 = tgpig.x;
const int r1 = tgpig.y;
+ const int r2 = tgpig.z;
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
const int ib_row = first_row * nb;
- device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row;
- device const float * y = (device const float *) src1 + r1*ne10;
+ const uint offset0 = r2/gqa*(nb*ne0);
+ device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
+ device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1;
float yl[16];
float yh[16];
float sumf[N_DST]={0.f}, all_sum;
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
- dst[r1*ne0 + first_row + row] = all_sum;
+ dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = all_sum;
}
}
}
device const float * src1,
device float * dst,
constant int64_t & ne00,
- constant int64_t & ne10,
- constant int64_t & ne0,
constant int64_t & ne01[[buffer(4)]],
- uint2 tgpig[[threadgroup_position_in_grid]],
+ constant int64_t & ne02[[buffer(5)]],
+ constant int64_t & ne10[[buffer(9)]],
+ constant int64_t & ne12[[buffer(11)]],
+ constant int64_t & ne0[[buffer(15)]],
+ constant int64_t & ne1[[buffer(16)]],
+ constant uint & gqa[[buffer(17)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
const int nb = ne00/QK_K;
const int r0 = tgpig.x;
const int r1 = tgpig.y;
+ const int r2 = tgpig.z;
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
const int ib_row = first_row * nb;
- device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row;
- device const float * y = (device const float *) src1 + r1*ne10;
+ const uint offset0 = r2/gqa*(nb*ne0);
+ device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
+ device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1;
float yl[8];
float yh[8];
float sumf[N_DST]={0.f}, all_sum;
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
- dst[r1*ne0 + first_row + row] = all_sum;
+ dst[r1*ne0+ r2*ne0*ne1 + first_row + row] = all_sum;
}
}
}
device const float * src1,
device float * dst,
constant int64_t & ne00,
- constant int64_t & ne10,
- constant int64_t & ne0,
- uint2 tgpig[[threadgroup_position_in_grid]],
+ constant int64_t & ne01[[buffer(4)]],
+ constant int64_t & ne02[[buffer(5)]],
+ constant int64_t & ne10[[buffer(9)]],
+ constant int64_t & ne12[[buffer(11)]],
+ constant int64_t & ne0[[buffer(15)]],
+ constant int64_t & ne1[[buffer(16)]],
+ constant uint & gqa[[buffer(17)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
+ const int r2 = tgpig.z;
const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2;
-
- device const block_q5_K * x = (device const block_q5_K *) src0 + first_row*nb;
- device const float * yy = (device const float *) src1 + r1*ne10;
+ const uint offset0 = r2/gqa*(nb*ne0);
+ device const block_q5_K * x = (device const block_q5_K *) src0 + first_row*nb + offset0;
+ device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1;
float sumf[2]={0.f};
for (int row = 0; row < 2; ++row) {
const float tot = simd_sum(sumf[row]);
if (tiisg == 0) {
- dst[r1*ne0 + first_row + row] = tot;
+ dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = tot;
}
}
device const float * src1,
device float * dst,
constant int64_t & ne00,
- constant int64_t & ne10,
- constant int64_t & ne0,
- uint2 tgpig[[threadgroup_position_in_grid]],
+ constant int64_t & ne01[[buffer(4)]],
+ constant int64_t & ne02[[buffer(5)]],
+ constant int64_t & ne10[[buffer(9)]],
+ constant int64_t & ne12[[buffer(11)]],
+ constant int64_t & ne0[[buffer(15)]],
+ constant int64_t & ne1[[buffer(16)]],
+ constant uint & gqa[[buffer(17)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
+ const int r2 = tgpig.z;
const int row = 2 * r0 + sgitg;
-
- device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb; //r0*nb;
- device const float * yy = (device const float *) src1 + r1*ne10;
+ const uint offset0 = r2/gqa*(nb*ne0);
+ device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb + offset0;
+ device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1;
float sumf = 0;
const float tot = simd_sum(sumf);
if (tiisg == 0) {
- dst[r1*ne0 + row] = tot;
+ dst[r1*ne0 + r2*ne0*ne1 + row] = tot;
+ }
+}
+
+//============================= templates and their specializations =============================
+
+template <typename type4x4>
+void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) { // f16 counterpart of the dequantize_* helpers: copies one 4x4 half tile into reg; il is accepted but not read
+ half4x4 temp = *(((device half4x4 *)src)); // load the whole 16-element tile from device memory once
+ for (int i = 0; i < 16; i++){
+ reg[i/4][i%4] = temp[i/4][i%4]; // element-wise assignment converts to the element type of type4x4
}
}
+
+template <typename type4x4>
+void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) { // expands 16 of the 4-bit values in one q4_0 block into reg: il == 0 takes the low nibbles, il != 0 the high nibbles
+ device const uint16_t * qs = ((device const uint16_t *)xb + 1); // 16-bit view starting one uint16_t (2 bytes) into the block
+ const half d = il ? (xb->d / 16.h) : xb->d; // high nibbles are masked but never shifted down, so the scale absorbs the factor 16
+ const half m = il ? (-8.h * 16.h) : -8.h; // offset of -8, likewise pre-multiplied by 16 for the high-nibble case
+ const ushort mask0 = il ? 0x00F0 : 0x000F; // nibble in the low byte of each uint16_t
+ const ushort mask1 = il ? 0xF000 : 0x0F00; // nibble in the high byte of each uint16_t
+
+ for (int i=0;i<8;i++) {
+ reg[i/2][2*(i%2)] = (((qs[i] & mask0)) + m) * d;
+ reg[i/2][2*(i%2)+1] = (((qs[i] & mask1) >> 8) + m) * d; // >> 8 only moves the high byte down; the nibble position inside the byte is unchanged
+ }
+}
+
+template <typename type4x4>
+void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) { // expands 16 of the 4-bit values in one q4_1 block into reg: il == 0 takes the low nibbles, il != 0 the high nibbles
+ device const uint16_t * qs = ((device const uint16_t *)xb + 2); // 16-bit view starting two uint16_t (4 bytes) into the block
+ const half d = il ? (xb->d / 16.h) : xb->d; // high nibbles are masked but never shifted down, so the scale absorbs the factor 16
+ const half m = xb->m; // added after scaling, so unlike q4_0 it needs no factor-16 adjustment
+ const ushort mask0 = il ? 0x00F0 : 0x000F; // nibble in the low byte of each uint16_t
+ const ushort mask1 = il ? 0xF000 : 0x0F00; // nibble in the high byte of each uint16_t
+
+ for (int i=0;i<8;i++) {
+ reg[i/2][2*(i%2)] = (((qs[i] & mask0)) * d) + m;
+ reg[i/2][2*(i%2)+1] = (((qs[i] & mask1) >> 8) * d) + m; // >> 8 only moves the high byte down; the nibble position inside the byte is unchanged
+ }
+}
+
+template <typename type4x4>
+void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) { // expands 16 values of one q2_K block into reg; il selects which 16
+ const half d = xb->d;
+ const half min = xb->dmin;
+ device const uint8_t * q = (device const uint8_t *)xb->qs;
+ half dl, ml;
+ uint8_t sc = xb->scales[il]; // read before il is remapped below: low nibble multiplies d, high nibble multiplies dmin
+
+#if QK_K == 256
+ q = q + 32*(il/8) + 16*(il&1); // advance to the 16 bytes that hold this group of values
+ il = (il/2)%4; // from here on il is the index (0..3) of the 2-bit field inside each byte
+#endif
+ half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); // compensates for the masked field being left at bit position 2*il
+ uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); // 2-bit field mask, i.e. 0x03 << (2*il)
+ dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4);
+ for (int i = 0; i < 16; ++i) {
+ reg[i/4][i%4] = dl * (q[i] & mask) - ml; // value = scale * quant - min
+ }
+}
+
+template <typename type4x4>
+void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) { // expands 16 values of one q3_K block into reg; il selects which 16
+ const float d_all = (float)(xb->d);
+ device const uint8_t * q = (device const uint8_t *)xb->qs; // 2-bit fields, selected per byte with `mask` below
+ device const uint8_t * h = (device const uint8_t *)xb->hmask; // one extra bit per value, selected with the bit mask `m` below
+ device const int8_t * scales = (device const int8_t *)xb->scales;
+
+#if QK_K == 256
+ q = q + 32 * (il/8) + 16 * (il&1); // advance to the 16 bytes that hold this group of values
+ h = h + 16 * (il&1);
+ uint8_t m = 1 << (il/2); // hmask bit that belongs to this group
+ uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \
+ ((il/4)>0 ? 12 : 3);
+ uint16_t kmask2 = il/8 ? 0xF0 : 0x0F;
+ uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4]; // the group scale is assembled from bits of these two bytes
+ int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2) : \
+ (scale_2&kmask2) | ((scale_1&kmask1) << 4);
+ float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f); // for il >= 8 kmask2 keeps the high nibble unshifted, hence the /16
+
+ il = (il/2)%4; // from here on il is the index (0..3) of the 2-bit field inside each byte
+ float coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); // compensates for the masked field being left at bit position 2*il
+ uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
+
+ for (int i = 0; i < 16; ++i) {
+ reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i] & m) ? 0 : 4.f/coef)); // subtract 4 when the hmask bit is clear; 4.f/coef cancels the outer coef
+ }
+#else
+ float kcoef = il&1 ? 1.f/16.f : 1.f; // odd il reads the high nibble of the scale byte unshifted, so rescale by 1/16
+ uint16_t kmask = il&1 ? 0xF0 : 0x0F;
+ float dl = d_all * ((scales[il/2] & kmask) * kcoef - 8);
+ float coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
+ uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
+ uint8_t m = 1<<(il*2);
+ for (int i = 0; i < 16; ++i) {
+ reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i%8] & (m * (1 + i/8))) ? 0 : 4.f/coef)); // i >= 8 tests the next-higher hmask bit (m*2) of the same 8 bytes
+ }
+#endif
+}
+
+template <typename type4x4>
+void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) { // expands 16 values of one q4_K block into reg; il selects which 16
+ device const uint8_t * q = xb->qs;
+
+#if QK_K == 256
+ const float d = (float)(xb->d);
+ const float min = (float)(xb->dmin);
+ short is = (il/4) * 2; // scale index handed to get_scale_min_k4 (helper defined outside this view)
+ q = q + (il/4) * 32 + 16 * (il&1); // advance to the 16 bytes that hold this group of values
+ il = il%4; // afterwards: il < 2 -> low nibbles, il >= 2 -> high nibbles
+ const uchar4 sc = get_scale_min_k4(is, xb->scales); // components 0/1 are used for the low-nibble case, 2/3 for the high-nibble case
+ const float dl = il<2 ? d * sc[0] : d * sc[2]/16.h; // /16 because the 0xF0 mask leaves the high nibble unshifted
+ const float ml = il<2 ? min * sc[1] : min * sc[3];
+#else
+ q = q + 16 * (il&1);
+ device const uint8_t * s = xb->scales;
+ device const half2 * dh = (device const half2 *)xb->d; // reads two consecutive halves starting at xb->d: [0] is used as scale, [1] as min multiplier
+ const float2 d = (float2)dh[0];
+ const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h; // /16 because the 0xF0 mask leaves the high nibble unshifted
+ const float ml = il<2 ? d[1] * (s[0]>>4) : d[1 ]* (s[1]>>4);
+#endif
+ const ushort mask = il<2 ? 0x0F : 0xF0;
+ for (int i = 0; i < 16; ++i) {
+ reg[i/4][i%4] = dl * (q[i] & mask) - ml; // value = scale * quant - min
+ }
+}
+
+template <typename type4x4>
+void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg) { // expands 16 values of one q5_K block into reg; il selects which 16
+ device const uint8_t * q = xb->qs; // low 4 bits of each value
+ device const uint8_t * qh = xb->qh; // extra high bit of each value, one bit position per group
+
+#if QK_K == 256
+ const float d = (float)(xb->d);
+ const float min = (float)(xb->dmin);
+ short is = (il/4) * 2; // scale index handed to get_scale_min_k4 (helper defined outside this view)
+ q = q + 32 * (il/4) + 16 * (il&1); // advance to the 16 bytes that hold this group of values
+ qh = qh + 16 * (il&1);
+ uint8_t ul = 1 << (il/2); // qh bit that belongs to this group; computed before il is reduced
+ il = il%4; // afterwards: il < 2 -> low nibbles, il >= 2 -> high nibbles
+ const uchar4 sc = get_scale_min_k4(is, xb->scales);
+ const float dl = il<2 ? d * sc[0] : d * sc[2]/16.h; // /16 because the 0xF0 mask leaves the high nibble unshifted
+ const float ml = il<2 ? min * sc[1] : min * sc[3];
+
+ const ushort mask = il<2 ? 0x0F : 0xF0;
+ const float qh_val = il<2 ? 16.f : 256.f; // weight of the fifth bit: 16, or 16*16 when dl already carries the 1/16 factor
+ for (int i = 0; i < 16; ++i) {
+ reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
+ }
+#else
+ q = q + 16 * (il&1);
+ device const int8_t * s = xb->scales;
+ const float dl = xb->d * s[il];
+ uint8_t m = 1<<(il*2);
+ const float coef = il<2 ? 1.f : 1.f/16.f; // compensates for the unshifted high nibble when il >= 2
+ const ushort mask = il<2 ? 0x0F : 0xF0;
+ for (int i = 0; i < 16; ++i) {
+ reg[i/4][i%4] = coef * dl * ((q[i] & mask) - (qh[i%8] & (m*(1+i/8)) ? 0.f : 16.f/coef)); // subtract 16 when the qh bit is clear; i >= 8 tests the next-higher bit (m*2)
+ }
+#endif
+}
+
+template <typename type4x4>
+void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) { // expands 16 values of one q6_K block into reg; il selects which 16
+ const float d_all = (float)(xb->d);
+ device const uint8_t * ql = (device const uint8_t *)xb->ql; // low 4 bits of each value
+ device const uint8_t * qh = (device const uint8_t *)xb->qh; // upper 2 bits of each value
+ device const int8_t * scales = (device const int8_t *)xb->scales;
+
+#if QK_K == 256
+ ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1); // advance to the 16 bytes that hold this group of values
+ qh = qh + 32*(il/8) + 16*(il&1);
+ float sc = scales[(il%2) + 2 * ((il/2))]; // read before il is reduced on the next line
+ il = (il/2)%4; // from here on il is the index (0..3) of the 2-bit field inside each qh byte
+#else
+ ql = ql + 16 * (il&1);
+ float sc = scales[il];
+#endif
+ for (int i = 0; i < 16; ++i) {
+ uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); // 2-bit field of qh for this group
+ uint16_t kmask2 = il>1 ? 0xF0 : 0x0F; // low or high nibble of ql
+ const float coef = il>1 ? 1.f/16.f : 1.f; // il > 1 assembles the value 16x too large (nibble left unshifted); coef undoes that
+ float q = il&1 ? ((ql[i]&kmask2)|((qh[i]&kmask1)<<2)) - 32.f/coef : \
+ ((ql[i]&kmask2)|((qh[i]&kmask1)<<4)) - 32.f/coef;
+ reg[i/4][i%4] = d_all * sc * q * coef; // the -32 offset was pre-divided by coef so it survives this multiplication unchanged
+ }
+}
+
+template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
+kernel void kernel_get_rows( // generic row gather: dst row i receives the dequantized src0 row whose index is src1[i]
+ device const void * src0,
+ device const int * src1,
+ device float * dst,
+ constant int64_t & ne00,
+ constant uint64_t & nb01,
+ constant uint64_t & nb1,
+ uint tgpig[[threadgroup_position_in_grid]],
+ uint tiitg[[thread_index_in_threadgroup]],
+ uint tptg[[threads_per_threadgroup]]) {
+ const int i = tgpig; // one threadgroup per output row
+ const int r = ((device int32_t *) src1)[i]; // index of the source row to fetch
+
+ for (int ind = tiitg; ind < ne00/16; ind += tptg) { // the threads of the group stride over the row in chunks of 16 values
+ float4x4 temp;
+ dequantize_func(
+ ((device const block_q *) ((device char *) src0 + r*nb01)) + ind/nl, ind%nl, temp); // block ind/nl of row r, 16-value chunk ind%nl inside that block
+ *(((device float4x4 *) ((device char *) dst + i*nb1)) + ind) = temp; // rows are located via the byte strides nb01 (src0) and nb1 (dst)
+ }
+}
+
+#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
+#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
+#define BLOCK_SIZE_K 32
+#define THREAD_MAT_M 4 // each thread takes 4 simdgroup matrices from matrix A
+#define THREAD_MAT_N 2 // each thread takes 2 simdgroup matrices from matrix B
+#define THREAD_PER_BLOCK 128
+#define THREAD_PER_ROW 2 // 2 threads for each row in matrix A to load numbers
+#define THREAD_PER_COL 4 // 4 threads for each row in matrix B to load numbers
+#define SG_MAT_SIZE 64 // simdgroup matrix is of shape 8x8
+#define SG_MAT_ROW 8
+
+// each block_q contains 16*nl weights
+template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
+kernel void kernel_mul_mm(device const uchar * src0, // tiled matrix multiply: each threadgroup produces one BLOCK_SIZE_M x BLOCK_SIZE_N tile of dst
+ device const float * src1,
+ device float * dst,
+ constant int64_t & ne00,
+ constant int64_t & ne02,
+ constant int64_t & nb01,
+ constant int64_t & nb02,
+ constant int64_t & ne12,
+ constant int64_t & ne0,
+ constant int64_t & ne1,
+ constant uint & gqa,
+ threadgroup uchar * shared_memory [[threadgroup(0)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint tiitg[[thread_index_in_threadgroup]],
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
+
+ threadgroup half * sa = ((threadgroup half *)shared_memory); // src0 tile, dequantized to half
+ threadgroup float * sb = (threadgroup float *)(shared_memory + 4096); // src1 tile; 4096 bytes = BLOCK_SIZE_M * BLOCK_SIZE_K halves used by sa
+
+ const uint r0 = tgpig.y; // tile index along ne0 (rows of src0) -- note it comes from grid y
+ const uint r1 = tgpig.x; // tile index along ne1 (rows of src1) -- note it comes from grid x
+ const uint im = tgpig.z; // matrix index in the batch dimension
+ // if this block is of 64x32 shape or smaller
+ short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
+ short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;
+ // a thread shouldn't load data outside of the matrix
+ short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; // clamp: surplus threads re-read the last valid row
+ short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
+
+ simdgroup_half8x8 ma[4];
+ simdgroup_float8x8 mb[2];
+ simdgroup_float8x8 c_res[8]; // 4 x 2 accumulators of 8x8 each, i.e. a 32 x 16 sub-tile per simdgroup
+ for (int i = 0; i < 8; i++){
+ c_res[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
+ }
+
+ short il = (tiitg % THREAD_PER_ROW); // which 16-value chunk of the current block this thread dequantizes
+ uint offset0 = im/gqa*nb02; ushort offset1 = il/nl; // integer division: gqa consecutive values of im share the same src0 slice
+ device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1;
+ device const float * y = src1 + (r1 * BLOCK_SIZE_N + thread_col) * ne00 \
+ + BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL) + im * ne00 * ne1;
+
+ for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) { // walk the shared dimension BLOCK_SIZE_K values at a time
+ //load data and store to threadgroup memory
+ half4x4 temp_a;
+ dequantize_func(x, il, temp_a);
+ threadgroup_barrier(mem_flags::mem_threadgroup); // make sure the previous iteration's reads of sa/sb are finished before overwriting
+ #pragma unroll(16)
+ for (int i = 0; i < 16; i++) {
+ *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
+ + 16 * (tiitg % THREAD_PER_ROW) + 8 * (i / 8)) \
+ + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4];
+ }
+ *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) \
+ = *((device float2x4 *)y);
+ il = (il + 2 < nl) ? il + 2 : il % 2; // step two chunks ahead (the row's other loader thread takes the chunks in between); wrap at block end
+ x = (il < 2) ? x + (2+nl-1)/nl : x; // after a wrap move to the next block_q; evaluates to +2 blocks for nl == 1, +1 otherwise
+ y += BLOCK_SIZE_K;
+
+ threadgroup_barrier(mem_flags::mem_threadgroup); // tiles fully written before any simdgroup loads them
+ //load matrices from threadgroup memory and conduct outer products
+ threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
+ threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
+ #pragma unroll(4)
+ for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
+ #pragma unroll(4)
+ for (int i = 0; i < 4; i++) {
+ simdgroup_load(ma[i],lsma + SG_MAT_SIZE * i);
+ }
+ simdgroup_barrier(mem_flags::mem_none);
+ #pragma unroll(2)
+ for (int i = 0; i < 2; i++) {
+ simdgroup_load(mb[i],lsmb + SG_MAT_SIZE * i);
+ }
+
+ lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE; // advance both tiles to the next 8-wide slice of K
+ lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
+ #pragma unroll(8)
+ for (int i = 0; i < 8; i++){
+ simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]); // c_res[i] += mb[i/4] * ma[i%4]
+ }
+ }
+ }
+
+ if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) { // full tile: every simdgroup stores its accumulators straight to dst
+ device float *C = dst + BLOCK_SIZE_M * r0 + 32 * (sgitg&1) \
+ + (BLOCK_SIZE_N * r1 + 16 * (sgitg>>1)) * ne0 + im*ne1*ne0;
+ for (int i = 0; i < 8; i++) {
+ simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0);
+ }
+ } else {
+ // block is smaller than 64x32, we should avoid writing data outside of the matrix
+ threadgroup_barrier(mem_flags::mem_threadgroup); // sa/sb are about to be reused as staging space for the result tile
+ threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+ + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
+ for (int i = 0; i < 8; i++) {
+ simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
+ }
+
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
+ if (sgitg==0) { // for simdgroup 0 temp_str is the start of the staged tile, so it alone copies the valid n_rows x n_cols part
+ for (int i = 0; i < n_rows; i++) {
+ for (int j = tiitg; j< n_cols; j += BLOCK_SIZE_N) { // NOTE(review): covers all columns only if simdgroup 0 holds tiitg 0..31 -- confirm simdgroup width
+ *(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M);
+ }
+ }
+ }
+ }
+}
+
+#if QK_K == 256
+#define QK_NL 16 // passed as nl for the K-quant types: 16 chunks of 16 weights = 256 per block
+#else
+#define QK_NL 4 // 4 chunks of 16 weights = 64 per block (presumably QK_K == 64 here -- confirm)
+#endif
+
+typedef void (get_rows_t)(device const void *, device const int *, device float *, constant int64_t &, \
+ constant uint64_t &, constant uint64_t &, uint, uint, uint); // function type matching the parameter list of kernel_get_rows
+
+template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows<half4x4, 1, dequantize_f16>; // explicit instantiations; host_name gives each one its own kernel name. f16: a "block" is one half4x4, so nl = 1
+template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows<block_q4_0, 2, dequantize_q4_0>; // nl = 2: two 16-value chunks per block
+template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows<block_q4_1, 2, dequantize_q4_1>;
+template [[host_name("kernel_get_rows_q2_K")]] kernel get_rows_t kernel_get_rows<block_q2_K, QK_NL, dequantize_q2_K>; // K-quants: chunk count depends on QK_K via QK_NL
+template [[host_name("kernel_get_rows_q3_K")]] kernel get_rows_t kernel_get_rows<block_q3_K, QK_NL, dequantize_q3_K>;
+template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows<block_q4_K, QK_NL, dequantize_q4_K>;
+template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows<block_q5_K, QK_NL, dequantize_q5_K>;
+template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows<block_q6_K, QK_NL, dequantize_q6_K>;
+
+typedef void (mat_mm_t)(device const uchar *, device const float *, device float *, constant int64_t &,\
+ constant int64_t &, constant int64_t &, constant int64_t &, constant int64_t &, \
+ constant int64_t &, constant int64_t &, constant uint &, threadgroup uchar *, uint3, uint, uint); // function type matching the parameter list of kernel_mul_mm
+
+template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm<half4x4, 1, dequantize_f16>; // explicit instantiations, one kernel name per src0 type; template arguments mirror the kernel_get_rows set
+template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2, dequantize_q4_0>;
+template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2, dequantize_q4_1>;
+template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q2_K, QK_NL, dequantize_q2_K>; // K-quants: chunk count depends on QK_K via QK_NL
+template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q3_K, QK_NL, dequantize_q3_K>;
+template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_K, QK_NL, dequantize_q4_K>;
+template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q5_K, QK_NL, dequantize_q5_K>;
+template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q6_K, QK_NL, dequantize_q6_K>;
error_desc = "insufficient memory";
break;
}
- GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
- __func__, error_desc, size/(1024.0*1024.0));
+ GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
return NULL;
}
return aligned_memory;
static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
+ [GGML_TYPE_I8] = {
+ .type_name = "i8",
+ .blck_size = 1,
+ .type_size = sizeof(int8_t),
+ .is_quantized = false,
+ },
+ [GGML_TYPE_I16] = {
+ .type_name = "i16",
+ .blck_size = 1,
+ .type_size = sizeof(int16_t),
+ .is_quantized = false,
+ },
+ [GGML_TYPE_I32] = {
+ .type_name = "i32",
+ .blck_size = 1,
+ .type_size = sizeof(int32_t),
+ .is_quantized = false,
+ },
[GGML_TYPE_F32] = {
+ .type_name = "f32",
+ .blck_size = 1,
+ .type_size = sizeof(float),
+ .is_quantized = false,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
},
[GGML_TYPE_F16] = {
+ .type_name = "f16",
+ .blck_size = 1,
+ .type_size = sizeof(ggml_fp16_t),
+ .is_quantized = false,
.to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
.from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
.vec_dot_type = GGML_TYPE_F16,
},
[GGML_TYPE_Q4_0] = {
+ .type_name = "q4_0",
+ .blck_size = QK4_0,
+ .type_size = sizeof(block_q4_0),
+ .is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q4_0,
.from_float = quantize_row_q4_0,
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
.vec_dot_type = GGML_TYPE_Q8_0,
},
[GGML_TYPE_Q4_1] = {
+ .type_name = "q4_1",
+ .blck_size = QK4_1,
+ .type_size = sizeof(block_q4_1),
+ .is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q4_1,
.from_float = quantize_row_q4_1,
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
.vec_dot_type = GGML_TYPE_Q8_1,
},
[GGML_TYPE_Q5_0] = {
+ .type_name = "q5_0",
+ .blck_size = QK5_0,
+ .type_size = sizeof(block_q5_0),
+ .is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q5_0,
.from_float = quantize_row_q5_0,
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
.vec_dot_type = GGML_TYPE_Q8_0,
},
[GGML_TYPE_Q5_1] = {
+ .type_name = "q5_1",
+ .blck_size = QK5_1,
+ .type_size = sizeof(block_q5_1),
+ .is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q5_1,
.from_float = quantize_row_q5_1,
.from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
.vec_dot_type = GGML_TYPE_Q8_1,
},
[GGML_TYPE_Q8_0] = {
+ .type_name = "q8_0",
+ .blck_size = QK8_0,
+ .type_size = sizeof(block_q8_0),
+ .is_quantized = true,
.to_float = dequantize_row_q8_0,
.from_float = quantize_row_q8_0,
.from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
.vec_dot_type = GGML_TYPE_Q8_0,
},
[GGML_TYPE_Q8_1] = {
+ .type_name = "q8_1",
+ .blck_size = QK8_1,
+ .type_size = sizeof(block_q8_1),
+ .is_quantized = true,
.from_float = quantize_row_q8_1,
.from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
.vec_dot_type = GGML_TYPE_Q8_1,
},
#ifdef GGML_USE_K_QUANTS
[GGML_TYPE_Q2_K] = {
+ .type_name = "q2_K",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_q2_K),
+ .is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q2_K,
.from_float = quantize_row_q2_K,
.from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q3_K] = {
+ .type_name = "q3_K",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_q3_K),
+ .is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q3_K,
.from_float = quantize_row_q3_K,
.from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q4_K] = {
+ .type_name = "q4_K",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_q4_K),
+ .is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q4_K,
.from_float = quantize_row_q4_K,
.from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q5_K] = {
+ .type_name = "q5_K",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_q5_K),
+ .is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q5_K,
.from_float = quantize_row_q5_K,
.from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q6_K] = {
+ .type_name = "q6_K",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_q6_K),
+ .is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q6_K,
.from_float = quantize_row_q6_K,
.from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q8_K] = {
+ .type_name = "q8_K",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_q8_K),
+ .is_quantized = true,
.from_float = quantize_row_q8_K,
}
#endif
};
// For internal test use
-ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i) {
- GGML_ASSERT(i < GGML_TYPE_COUNT);
- return type_traits[i];
+ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { // parameter renamed from i to type; logic unchanged
+ GGML_ASSERT(type < GGML_TYPE_COUNT); // reject indices past the end of the type_traits table
+ return type_traits[type]; // returned by value, i.e. the caller gets a copy of the table entry
}
// data types
//
-static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
- [GGML_TYPE_F32] = 1,
- [GGML_TYPE_F16] = 1,
- [GGML_TYPE_Q4_0] = QK4_0,
- [GGML_TYPE_Q4_1] = QK4_1,
- [GGML_TYPE_Q5_0] = QK5_0,
- [GGML_TYPE_Q5_1] = QK5_1,
- [GGML_TYPE_Q8_0] = QK8_0,
- [GGML_TYPE_Q8_1] = QK8_1,
-#ifdef GGML_USE_K_QUANTS
- [GGML_TYPE_Q2_K] = QK_K,
- [GGML_TYPE_Q3_K] = QK_K,
- [GGML_TYPE_Q4_K] = QK_K,
- [GGML_TYPE_Q5_K] = QK_K,
- [GGML_TYPE_Q6_K] = QK_K,
- [GGML_TYPE_Q8_K] = QK_K,
-#endif
- [GGML_TYPE_I8] = 1,
- [GGML_TYPE_I16] = 1,
- [GGML_TYPE_I32] = 1,
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");
-
-static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
- [GGML_TYPE_F32] = sizeof(float),
- [GGML_TYPE_F16] = sizeof(ggml_fp16_t),
- [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
- [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
- [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
- [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
- [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
- [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
-#ifdef GGML_USE_K_QUANTS
- [GGML_TYPE_Q2_K] = sizeof(block_q2_K),
- [GGML_TYPE_Q3_K] = sizeof(block_q3_K),
- [GGML_TYPE_Q4_K] = sizeof(block_q4_K),
- [GGML_TYPE_Q5_K] = sizeof(block_q5_K),
- [GGML_TYPE_Q6_K] = sizeof(block_q6_K),
- [GGML_TYPE_Q8_K] = sizeof(block_q8_K),
-#endif
- [GGML_TYPE_I8] = sizeof(int8_t),
- [GGML_TYPE_I16] = sizeof(int16_t),
- [GGML_TYPE_I32] = sizeof(int32_t),
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");
-
-
-static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
- [GGML_TYPE_F32] = "f32",
- [GGML_TYPE_F16] = "f16",
- [GGML_TYPE_Q4_0] = "q4_0",
- [GGML_TYPE_Q4_1] = "q4_1",
- [GGML_TYPE_Q5_0] = "q5_0",
- [GGML_TYPE_Q5_1] = "q5_1",
- [GGML_TYPE_Q8_0] = "q8_0",
- [GGML_TYPE_Q8_1] = "q8_1",
- [GGML_TYPE_Q2_K] = "q2_K",
- [GGML_TYPE_Q3_K] = "q3_K",
- [GGML_TYPE_Q4_K] = "q4_K",
- [GGML_TYPE_Q5_K] = "q5_K",
- [GGML_TYPE_Q6_K] = "q6_K",
- [GGML_TYPE_Q8_K] = "q8_K",
- [GGML_TYPE_I8] = "i8",
- [GGML_TYPE_I16] = "i16",
- [GGML_TYPE_I32] = "i32",
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");
-
-static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
- [GGML_TYPE_F32] = false,
- [GGML_TYPE_F16] = false,
- [GGML_TYPE_Q4_0] = true,
- [GGML_TYPE_Q4_1] = true,
- [GGML_TYPE_Q5_0] = true,
- [GGML_TYPE_Q5_1] = true,
- [GGML_TYPE_Q8_0] = true,
- [GGML_TYPE_Q8_1] = true,
- [GGML_TYPE_Q2_K] = true,
- [GGML_TYPE_Q3_K] = true,
- [GGML_TYPE_Q4_K] = true,
- [GGML_TYPE_Q5_K] = true,
- [GGML_TYPE_Q6_K] = true,
- [GGML_TYPE_Q8_K] = true,
- [GGML_TYPE_I8] = false,
- [GGML_TYPE_I16] = false,
- [GGML_TYPE_I32] = false,
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");
-
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"NONE",
//
// is enough, but just in case, adding the second part
- return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN);
+ return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type));
+}
+
+size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
+ return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN); // ggml_nbytes() rounded up to a multiple of GGML_MEM_ALIGN; GGML_PAD's mask arithmetic needs a power-of-two alignment
}
size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
- return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
+ return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type); // same formula as before, now via the accessor functions: (rows * ne[0] * type size) / block size, multiplying before the integer division
}
int ggml_blck_size(enum ggml_type type) {
- return GGML_BLCK_SIZE[type];
+ return type_traits[type].blck_size;
}
size_t ggml_type_size(enum ggml_type type) {
- return GGML_TYPE_SIZE[type];
+ return type_traits[type].type_size;
}
float ggml_type_sizef(enum ggml_type type) {
- return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type];
+ return ((float)(type_traits[type].type_size))/type_traits[type].blck_size;
}
const char * ggml_type_name(enum ggml_type type) {
- return GGML_TYPE_NAME[type];
+ return type_traits[type].type_name;
+}
+
+bool ggml_is_quantized(enum ggml_type type) {
+ return type_traits[type].is_quantized;
}
const char * ggml_op_name(enum ggml_op op) {
}
size_t ggml_element_size(const struct ggml_tensor * tensor) {
- return GGML_TYPE_SIZE[tensor->type];
+ return ggml_type_size(tensor->type);
}
static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
(t0->ne[3] == t1->ne[3]);
}
-bool ggml_is_quantized(enum ggml_type type) {
- return GGML_IS_QUANTIZED[type];
-}
-
enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
enum ggml_type wtype = GGML_TYPE_COUNT;
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
- tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
- tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/GGML_BLCK_SIZE[tensor->type] &&
+ tensor->nb[0] == ggml_type_size(tensor->type) &&
+ tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
- tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
+ tensor->nb[0] == ggml_type_size(tensor->type) &&
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
- tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
+ tensor->nb[0] == ggml_type_size(tensor->type) &&
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
size_t data_size = 0;
if (data == NULL && !ctx->no_alloc) {
- data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
+ data_size += ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
for (int i = 1; i < n_dims; i++) {
data_size *= ne[i];
}
result->ne[i] = ne[i];
}
- result->nb[0] = GGML_TYPE_SIZE[type];
- result->nb[1] = result->nb[0]*(result->ne[0]/GGML_BLCK_SIZE[type]);
+ result->nb[0] = ggml_type_size(type);
+ result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type));
for (int i = 2; i < GGML_MAX_DIMS; i++) {
result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
}
memcpy(
((char *) dst->data + ie0*nb0),
((char *) src0->data + ie0*nb00),
- (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]);
+ (ie1 - ie0) * ggml_type_size(src0->type));
}
}
if (src0->type == dst->type &&
ne00 == ne0 &&
- nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
+ nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
// copy by rows
const size_t rs = ne00*nb00;
for (int64_t i03 = 0; i03 < ne03; i03++) {
float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
size_t id = 0;
- size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
+ size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
char * dst_ptr = (char *) dst->data;
for (int i03 = 0; i03 < ne03; i03++) {
if (src0->type == dst->type &&
ne00 == ne0 &&
- nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
+ nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
// copy by rows
const size_t rs = ne00*nb00;
for (int64_t i03 = 0; i03 < ne03; i03++) {
ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
size_t id = 0;
- size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
+ size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
char * dst_ptr = (char *) dst->data;
for (int i03 = 0; i03 < ne03; i03++) {
ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
// we don't support permuted src0 or src1
- GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+ GGML_ASSERT(nb00 == ggml_type_size(type));
GGML_ASSERT(nb10 == sizeof(float));
// dst cannot be transposed or permuted
ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
// we don't support permuted src0
- GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+ GGML_ASSERT(nb00 == ggml_type_size(type));
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 <= nb1);
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
+ GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
+
switch (src0->type) {
case GGML_TYPE_F32:
{
GGML_ASSERT(ne3 == ne13);
// we don't support permuted src0 or src1
- GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+ GGML_ASSERT(nb00 == ggml_type_size(type));
GGML_ASSERT(nb10 == sizeof(float));
// dst cannot be transposed or permuted
if (params->type == GGML_TASK_INIT) {
if (src1->type != vec_dot_type) {
char * wdata = params->wdata;
- const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+ const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
}
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
- const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+ const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = ne11*ne12*ne13; // src1 rows
assert( dst->ne[0] == nc);
assert( dst->ne[1] == nr);
- assert(src0->nb[0] == GGML_TYPE_SIZE[type]);
+ assert(src0->nb[0] == ggml_type_size(type));
for (int i = 0; i < nr; ++i) {
const int r = ((int32_t *) src1->data)[i];
size_t cur = 0;
if (ggml_is_quantized(node->type)) {
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks;
+ cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
}
work_size = MAX(work_size, cur);
size_t cur = 0;
if (ggml_is_quantized(node->src[0]->type)) {
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks;
+ cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
}
work_size = MAX(work_size, cur);
size_t cur = 0;
if (ggml_is_quantized(node->src[0]->type)) {
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[1]->ne[0] * n_tasks;
+ cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
}
work_size = MAX(work_size, cur);
// the threads are still spinning
if (node->src[0]->type != GGML_TYPE_F32) {
// here we need memory just for single 2D matrix from src0
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src[0]->ne[0]*node->src[0]->ne[1]);
+ cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
}
} else
#endif
if (node->src[1]->type != vec_dot_type) {
- cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src[1])/GGML_BLCK_SIZE[vec_dot_type];
+ cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
} else {
cur = 0;
}
// compute size of intermediate results
// TODO: does not take into account scratch buffers !!!!
for (int i = 0; i < cgraph->n_nodes; ++i) {
- size_eval += ggml_nbytes(cgraph->nodes[i]);
+ size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
}
// print
struct ggml_tensor * f) {
// build forward + backward compute graphs
- struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0));
- struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0));
+ struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
+ struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
////////////////////////////////////////////////////////////////////////////////
+struct gguf_str { // length-prefixed string as stored in a GGUF file
+ uint32_t n; // number of bytes of `data` that are read from / written to the file
+ char * data; // heap-allocated bytes; released with free() in gguf_free()
+};
+
+static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = { // in-memory size in bytes of one value of each gguf_type
+ [GGUF_TYPE_UINT8] = sizeof(uint8_t),
+ [GGUF_TYPE_INT8] = sizeof(int8_t),
+ [GGUF_TYPE_UINT16] = sizeof(uint16_t),
+ [GGUF_TYPE_INT16] = sizeof(int16_t),
+ [GGUF_TYPE_UINT32] = sizeof(uint32_t),
+ [GGUF_TYPE_INT32] = sizeof(int32_t),
+ [GGUF_TYPE_FLOAT32] = sizeof(float),
+ [GGUF_TYPE_BOOL] = sizeof(bool),
+ [GGUF_TYPE_STRING] = sizeof(struct gguf_str), // size of the descriptor, not of the string bytes
+ [GGUF_TYPE_ARRAY] = 0, // undefined
+};
+static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10"); // fails the build when the enum grows, as a reminder to extend the table
+
+static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = { // short printable name of each gguf_type, returned by gguf_type_name()
+ [GGUF_TYPE_UINT8] = "u8",
+ [GGUF_TYPE_INT8] = "i8",
+ [GGUF_TYPE_UINT16] = "u16",
+ [GGUF_TYPE_INT16] = "i16",
+ [GGUF_TYPE_UINT32] = "u32",
+ [GGUF_TYPE_INT32] = "i32",
+ [GGUF_TYPE_FLOAT32] = "f32",
+ [GGUF_TYPE_BOOL] = "bool",
+ [GGUF_TYPE_STRING] = "str",
+ [GGUF_TYPE_ARRAY] = "arr",
+};
+static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10"); // fails the build when the enum grows, as a reminder to extend the table
+
+union gguf_value { // value of one key-value pair; the member in use is selected by gguf_kv.type
+ uint8_t uint8;
+ int8_t int8;
+ uint16_t uint16;
+ int16_t int16;
+ uint32_t uint32;
+ int32_t int32;
+ float float32;
+ bool bool_;
+
+ struct gguf_str str; // used for GGUF_TYPE_STRING
+
+ struct { // used for GGUF_TYPE_ARRAY
+ enum gguf_type type; // type of the array elements
+
+ uint32_t n; // number of elements
+ void * data; // malloc'ed element storage; an array of struct gguf_str when type is GGUF_TYPE_STRING
+ } arr;
+};
+
+struct gguf_kv { // one metadata key-value pair
+ struct gguf_str key; // name of the entry; compared with strcmp() in gguf_find_key()
+
+ uint32_t n_bytes; // TODO: is this actually needed?
+
+ enum gguf_type type; // selects the member of `value` that is in use
+ union gguf_value value;
+};
+
+struct gguf_header { // fixed-size header at the very start of a GGUF file
+ uint32_t magic; // GGUF_MAGIC for a valid file
+ uint32_t version; // format version; gguf_init_empty() writes GGUF_VERSION
+ uint32_t n_tensors; // number of entries in gguf_context.infos
+ uint32_t n_kv; // number of entries in gguf_context.kv
+};
+
+struct gguf_tensor_info { // description of one tensor stored in the data section
+ struct gguf_str name; // tensor name; compared with strcmp() in gguf_find_tensor()
+
+ uint32_t n_dims; // number of dimensions actually stored in the file
+ uint32_t ne[GGML_MAX_DIMS]; // number of elements per dimension; dimensions beyond n_dims are set to 1
+
+ enum ggml_type type; // element type of the tensor
+
+ uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
+
+ // for writing API
+ const void * data; // tensor bytes to write; not owned (gguf_free() does not release it)
+ size_t size; // size of `data` in bytes
+};
+
+struct gguf_context { // in-memory representation of a GGUF file's metadata
+ struct gguf_header header;
+
+ struct gguf_kv * kv; // header.n_kv entries, or NULL
+ struct gguf_tensor_info * infos; // header.n_tensors entries, or NULL
+
+ size_t alignment; // alignment of the data section and of each tensor inside it
+ size_t offset; // offset of `data` from beginning of file
+ size_t size; // size of `data` in bytes
+
+ //uint8_t * padding;
+ void * data; // tensor data blob when it was loaded by gguf_init_from_file(), otherwise NULL; not released by gguf_free()
+};
+
+// Reads `size` bytes from `file` into `dst`.
+// `*offset` is advanced by the number of bytes that were actually read, even on a short read.
+// Returns true only when all `size` bytes were read.
+static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
+    const size_t n_read = fread(dst, sizeof(char), size, file);
+
+    *offset = *offset + n_read;
+
+    if (n_read != size) {
+        return false;
+    }
+
+    return true;
+}
+
+// Reads a length-prefixed string: a uint32 byte count followed by that many bytes.
+// On success `p->data` is a newly allocated buffer holding the bytes plus a terminating NUL;
+// the caller owns it and releases it with free().
+// On failure returns false; `p->data` is then either NULL or a buffer that may be passed to free().
+static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
+    p->n    = 0;
+    p->data = NULL;
+
+    if (!gguf_fread_el(file, &p->n, sizeof(p->n), offset)) {
+        // a short read may have written part of the length
+        p->n = 0;
+        return false;
+    }
+
+    // size the buffer in size_t: `p->n + 1` evaluated in 32 bits wraps to 0 for n == UINT32_MAX,
+    // which would make the read below overflow the allocation
+    const size_t n_alloc = (size_t) p->n + 1;
+    if (n_alloc <= p->n) {
+        return false;
+    }
+
+    // TODO: how to avoid mallocs for strings?
+    p->data = calloc(n_alloc, 1);
+    if (p->data == NULL) {
+        return false;
+    }
+
+    return gguf_fread_el(file, p->data, p->n, offset);
+}
+
+// Creates a context that holds no key-value pairs, no tensor infos and no data.
+// The header carries GGUF_MAGIC / GGUF_VERSION and the alignment is GGUF_DEFAULT_ALIGNMENT.
+// Release the result with gguf_free().
+struct gguf_context * gguf_init_empty(void) {
+    struct gguf_context * result = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+
+    struct gguf_header * hdr = &result->header;
+
+    hdr->magic     = GGUF_MAGIC;
+    hdr->version   = GGUF_VERSION;
+    hdr->n_tensors = 0;
+    hdr->n_kv      = 0;
+
+    // nothing has been added yet
+    result->kv    = NULL;
+    result->infos = NULL;
+    result->data  = NULL;
+
+    result->alignment = GGUF_DEFAULT_ALIGNMENT;
+    result->offset    = 0;
+    result->size      = 0;
+
+    return result;
+}
+
+// Loads a GGUF file: the header, the key-value pairs and the tensor infos are parsed into a new gguf_context.
+// When `params.ctx` is not NULL a ggml_context is created as well and stored in `*params.ctx`; it receives one
+// tensor per tensor info and, unless `params.no_alloc` is set, the tensor data read from the file.
+// Returns NULL on any failure: the file is closed and everything allocated here is released, so `*params.ctx`
+// must not be used in that case. The returned context is released with gguf_free().
+// All counts, types and offsets come from the file and are validated before they are used as sizes or indices.
+struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
+    FILE * file = fopen(fname, "rb");
+    if (!file) {
+        return NULL;
+    }
+
+    // offset from start of file
+    size_t offset = 0;
+
+    uint32_t magic = 0;
+
+    // check the magic before making allocations
+    {
+        gguf_fread_el(file, &magic, sizeof(magic), &offset);
+
+        if (magic != GGUF_MAGIC) {
+            fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
+            fclose(file);
+            return NULL;
+        }
+    }
+
+    bool ok = true;
+
+    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+    if (ctx == NULL) {
+        fclose(file);
+        return NULL;
+    }
+
+    // read the header
+    {
+        ctx->header.magic = magic;
+
+        ctx->kv    = NULL;
+        ctx->infos = NULL;
+        ctx->data  = NULL;
+
+        ok = ok && gguf_fread_el(file, &ctx->header.version,   sizeof(ctx->header.version),   &offset);
+        ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
+        ok = ok && gguf_fread_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv),      &offset);
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to read header\n", __func__);
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
+        }
+    }
+
+    // read the kv pairs
+    {
+        const size_t kv_bytes = ctx->header.n_kv * sizeof(struct gguf_kv);
+
+        ctx->kv = GGML_ALIGNED_MALLOC(kv_bytes);
+
+        if (ctx->kv != NULL) {
+            // zero the array so that gguf_free() never sees uninitialized pointers when parsing stops early
+            memset(ctx->kv, 0, kv_bytes);
+        } else if (ctx->header.n_kv > 0) {
+            ok = false;
+        }
+
+        for (uint32_t i = 0; ok && i < ctx->header.n_kv; ++i) {
+            struct gguf_kv * kv = &ctx->kv[i];
+
+            ok = ok && gguf_fread_str(file, &kv->key,                    &offset);
+            ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
+
+            switch (kv->type) {
+                case GGUF_TYPE_UINT8:   ok = ok && gguf_fread_el (file, &kv->value.uint8,   sizeof(kv->value.uint8),   &offset); break;
+                case GGUF_TYPE_INT8:    ok = ok && gguf_fread_el (file, &kv->value.int8,    sizeof(kv->value.int8),    &offset); break;
+                case GGUF_TYPE_UINT16:  ok = ok && gguf_fread_el (file, &kv->value.uint16,  sizeof(kv->value.uint16),  &offset); break;
+                case GGUF_TYPE_INT16:   ok = ok && gguf_fread_el (file, &kv->value.int16,   sizeof(kv->value.int16),   &offset); break;
+                case GGUF_TYPE_UINT32:  ok = ok && gguf_fread_el (file, &kv->value.uint32,  sizeof(kv->value.uint32),  &offset); break;
+                case GGUF_TYPE_INT32:   ok = ok && gguf_fread_el (file, &kv->value.int32,   sizeof(kv->value.int32),   &offset); break;
+                case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
+                case GGUF_TYPE_BOOL:    ok = ok && gguf_fread_el (file, &kv->value.bool_,   sizeof(kv->value.bool_),   &offset); break;
+                case GGUF_TYPE_STRING:  ok = ok && gguf_fread_str(file, &kv->value.str,                                &offset); break;
+                case GGUF_TYPE_ARRAY:
+                    {
+                        ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
+                        ok = ok && gguf_fread_el(file, &kv->value.arr.n,    sizeof(kv->value.arr.n),    &offset);
+
+                        if (!ok) {
+                            break;
+                        }
+
+                        switch (kv->value.arr.type) {
+                            case GGUF_TYPE_UINT8:
+                            case GGUF_TYPE_INT8:
+                            case GGUF_TYPE_UINT16:
+                            case GGUF_TYPE_INT16:
+                            case GGUF_TYPE_UINT32:
+                            case GGUF_TYPE_INT32:
+                            case GGUF_TYPE_FLOAT32:
+                            case GGUF_TYPE_BOOL:
+                                {
+                                    const size_t arr_bytes = kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type];
+
+                                    kv->value.arr.data = malloc(arr_bytes);
+
+                                    // malloc(0) may legitimately return NULL
+                                    ok = ok && (arr_bytes == 0 || kv->value.arr.data != NULL);
+                                    ok = ok && gguf_fread_el(file, kv->value.arr.data, arr_bytes, &offset);
+                                } break;
+                            case GGUF_TYPE_STRING:
+                                {
+                                    // calloc: entries that are never read stay {0, NULL}, which gguf_free() handles
+                                    kv->value.arr.data = calloc(kv->value.arr.n, sizeof(struct gguf_str));
+
+                                    ok = ok && (kv->value.arr.n == 0 || kv->value.arr.data != NULL);
+
+                                    for (uint32_t j = 0; ok && j < kv->value.arr.n; ++j) {
+                                        ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
+                                    }
+                                } break;
+                            case GGUF_TYPE_ARRAY:
+                            case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
+                            default:
+                                {
+                                    // value outside of the enum: the element size is unknown, the file cannot be parsed further
+                                    fprintf(stderr, "%s: unknown array type %d\n", __func__, (int) kv->value.arr.type);
+                                    ok = false;
+                                } break;
+                        }
+                    } break;
+                case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
+                default:
+                    {
+                        // value outside of the enum: the value size is unknown, the file cannot be parsed further
+                        fprintf(stderr, "%s: unknown value type %d\n", __func__, (int) kv->type);
+                        ok = false;
+                    } break;
+            }
+        }
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
+        }
+    }
+
+    // read the tensor infos
+    {
+        const size_t infos_bytes = ctx->header.n_tensors * sizeof(struct gguf_tensor_info);
+
+        ctx->infos = GGML_ALIGNED_MALLOC(infos_bytes);
+
+        if (ctx->infos != NULL) {
+            // zero the array so that gguf_free() never sees uninitialized name pointers when parsing stops early
+            memset(ctx->infos, 0, infos_bytes);
+        } else if (ctx->header.n_tensors > 0) {
+            ok = false;
+        }
+
+        for (uint32_t i = 0; ok && i < ctx->header.n_tensors; ++i) {
+            struct gguf_tensor_info * info = &ctx->infos[i];
+
+            for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                info->ne[j] = 1;
+            }
+
+            ok = ok && gguf_fread_str(file, &info->name,                          &offset);
+            ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims),  &offset);
+
+            // n_dims is used as a loop bound over info->ne, reject values that do not fit
+            ok = ok && info->n_dims <= GGML_MAX_DIMS;
+
+            for (uint32_t j = 0; ok && j < info->n_dims; ++j) {
+                ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
+            }
+
+            ok = ok && gguf_fread_el (file, &info->type,   sizeof(info->type),    &offset);
+            ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset),  &offset);
+
+            // the type is used to look up block and type sizes below, so it has to name a usable ggml type
+            ok = ok && (uint32_t) info->type < GGML_TYPE_COUNT && ggml_blck_size(info->type) > 0;
+        }
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to read tensor info\n", __func__);
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
+        }
+    }
+
+    ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
+
+    int alignment_idx = gguf_find_key(ctx, "general.alignment");
+    if (alignment_idx != -1) {
+        ctx->alignment = gguf_get_val_u32(ctx, alignment_idx);
+    }
+
+    // the alignment is used as a modulus below and by GGML_PAD, which needs a non-zero power of two
+    if (ctx->alignment == 0 || (ctx->alignment & (ctx->alignment - 1)) != 0) {
+        fprintf(stderr, "%s: invalid alignment %" PRIu64 "\n", __func__, (uint64_t) ctx->alignment);
+        fclose(file);
+        gguf_free(ctx);
+        return NULL;
+    }
+
+    // we require the data section to be aligned, so take into account any padding
+    {
+        const size_t offset_pad = offset % ctx->alignment;
+
+        if (offset_pad != 0) {
+            offset += ctx->alignment - offset_pad;
+            fseek(file, offset, SEEK_SET);
+        }
+    }
+
+    // store the current file offset - this is where the data section starts
+    ctx->offset = offset;
+
+    // compute the total size of the data section, taking into account the alignment
+    {
+        ctx->size = 0;
+        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+            struct gguf_tensor_info * info = &ctx->infos[i];
+
+            const int64_t ne =
+                (int64_t) info->ne[0] *
+                (int64_t) info->ne[1] *
+                (int64_t) info->ne[2] *
+                (int64_t) info->ne[3];
+
+            if (ne % ggml_blck_size(info->type) != 0) {
+                fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
+                        __func__, info->name.data, ne, ggml_blck_size(info->type));
+                fclose(file);
+                gguf_free(ctx);
+                return NULL;
+            }
+
+            const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
+
+            ctx->size += GGML_PAD(size_cur, ctx->alignment);
+        }
+    }
+
+    // load the tensor data only if requested
+    if (params.ctx != NULL) {
+        // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
+        // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
+        // the ggml_tensor structs to the appropriate locations in the binary blob
+
+        // compute the exact size needed for the new ggml_context
+        const size_t mem_size =
+            params.no_alloc ?
+            (ctx->header.n_tensors    )*ggml_tensor_overhead() :
+            (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
+
+        struct ggml_init_params pdata = {
+            .mem_size   = mem_size,
+            .mem_buffer = NULL,
+            .no_alloc   = params.no_alloc,
+        };
+
+        *params.ctx = ggml_init(pdata);
+
+        struct ggml_context * ctx_data = *params.ctx;
+
+        if (ctx_data == NULL) {
+            fprintf(stderr, "%s: failed to initialize the ggml context\n", __func__);
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
+        }
+
+        struct ggml_tensor * data = NULL;
+
+        if (params.no_alloc == false) {
+            data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
+
+            ok = ok && data != NULL;
+
+            // read the binary blob with the tensor data
+            ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);
+
+            if (!ok) {
+                fprintf(stderr, "%s: failed to read tensor data\n", __func__);
+                fclose(file);
+                ggml_free(ctx_data);
+                gguf_free(ctx);
+                return NULL;
+            }
+
+            ctx->data = data->data;
+        }
+
+        ggml_set_no_alloc(ctx_data, true);
+
+        // create the tensors
+        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+            const int64_t ne[GGML_MAX_DIMS] = {
+                ctx->infos[i].ne[0],
+                ctx->infos[i].ne[1],
+                ctx->infos[i].ne[2],
+                ctx->infos[i].ne[3],
+            };
+
+            struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
+
+            ok = ok && cur != NULL;
+
+            // bail out before `cur` is touched
+            if (!ok) {
+                break;
+            }
+
+            ggml_set_name(cur, ctx->infos[i].name.data);
+
+            // point the data member to the appropriate location in the binary blob using the tensor infos
+            if (params.no_alloc == false) {
+                const uint64_t tensor_offset = ctx->infos[i].offset; // offset from data
+
+                // the offset comes from the file: the tensor has to lie completely inside the blob
+                if (tensor_offset > ctx->size || ggml_nbytes(cur) > ctx->size - tensor_offset) {
+                    ok = false;
+                    break;
+                }
+
+                cur->data = (char *) data->data + tensor_offset;
+            }
+        }
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
+            fclose(file);
+            ggml_free(ctx_data);
+            gguf_free(ctx);
+            return NULL;
+        }
+
+        ggml_set_no_alloc(ctx_data, params.no_alloc);
+    }
+
+    fclose(file);
+
+    return ctx;
+}
+
+// Releases a context together with the strings and arrays owned by its key-value pairs and tensor infos.
+// Passing NULL is allowed. `ctx->data` and the per-tensor `data` pointers are not released here.
+void gguf_free(struct gguf_context * ctx) {
+    if (ctx == NULL) {
+        return;
+    }
+
+    if (ctx->kv) {
+        // free string memory - not great..
+        for (uint32_t ikv = 0; ikv < ctx->header.n_kv; ++ikv) {
+            struct gguf_kv * cur = &ctx->kv[ikv];
+
+            // free(NULL) is a no-op, so the pointers need no guard
+            free(cur->key.data);
+
+            switch (cur->type) {
+                case GGUF_TYPE_STRING:
+                    {
+                        free(cur->value.str.data);
+                    } break;
+                case GGUF_TYPE_ARRAY:
+                    {
+                        if (cur->value.arr.data == NULL) {
+                            break;
+                        }
+
+                        // string arrays own one allocation per element in addition to the array itself
+                        if (cur->value.arr.type == GGUF_TYPE_STRING) {
+                            struct gguf_str * strs = (struct gguf_str *) cur->value.arr.data;
+                            for (uint32_t is = 0; is < cur->value.arr.n; ++is) {
+                                free(strs[is].data);
+                            }
+                        }
+
+                        free(cur->value.arr.data);
+                    } break;
+                default:
+                    break;
+            }
+        }
+
+        GGML_ALIGNED_FREE(ctx->kv);
+    }
+
+    if (ctx->infos) {
+        for (uint32_t it = 0; it < ctx->header.n_tensors; ++it) {
+            free(ctx->infos[it].name.data);
+        }
+
+        GGML_ALIGNED_FREE(ctx->infos);
+    }
+
+    GGML_ALIGNED_FREE(ctx);
+}
+
+// Returns the short printable name of `type` from GGUF_TYPE_NAME; `type` is not range-checked.
+const char * gguf_type_name(enum gguf_type type) {
+    const char * name = GGUF_TYPE_NAME[type];
+
+    return name;
+}
+
+// Returns the format version stored in the header of `ctx`.
+int gguf_get_version(struct gguf_context * ctx) {
+    const struct gguf_header * hdr = &ctx->header;
+
+    return hdr->version;
+}
+
+// Returns the alignment recorded in `ctx`.
+size_t gguf_get_alignment(struct gguf_context * ctx) {
+    const size_t alignment = ctx->alignment;
+
+    return alignment;
+}
+
+// Returns the offset of the data section from the beginning of the file.
+size_t gguf_get_data_offset(struct gguf_context * ctx) {
+    const size_t data_offset = ctx->offset;
+
+    return data_offset;
+}
+
+// Returns the `data` pointer of `ctx`; it is NULL unless the tensor data was loaded.
+void * gguf_get_data(struct gguf_context * ctx) {
+    void * blob = ctx->data;
+
+    return blob;
+}
+
+// Returns the number of key-value pairs recorded in the header of `ctx`.
+int gguf_get_n_kv(struct gguf_context * ctx) {
+    const struct gguf_header * hdr = &ctx->header;
+
+    return hdr->n_kv;
+}
+
+// Returns the index of the first key-value pair whose key equals `key`, or -1 when there is none.
+int gguf_find_key(struct gguf_context * ctx, const char * key) {
+    const int count = gguf_get_n_kv(ctx);
+
+    for (int idx = 0; idx < count; ++idx) {
+        if (strcmp(key, gguf_get_key(ctx, idx)) == 0) {
+            return idx;
+        }
+    }
+
+    // key not found
+    return -1;
+}
+
+// Returns the key of key-value pair `i`; `i` is not range-checked.
+const char * gguf_get_key(struct gguf_context * ctx, int i) {
+    const struct gguf_kv * kv = &ctx->kv[i];
+
+    return kv->key.data;
+}
+
+// Returns the value type of key-value pair `i`; `i` is not range-checked.
+enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
+    const struct gguf_kv * kv = &ctx->kv[i];
+
+    return kv->type;
+}
+
+// Returns the element type of the array stored in key-value pair `i`.
+// The pair is read as an array without checking its type.
+enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
+    const struct gguf_kv * kv = &ctx->kv[i];
+
+    return kv->value.arr.type;
+}
+
+// Returns the element storage of the array stored in key-value pair `i`.
+// The pair is read as an array without checking its type.
+const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
+    const struct gguf_kv * kv = &ctx->kv[i];
+
+    return kv->value.arr.data;
+}
+
+// Returns element `i` of the string array stored in key-value pair `key_id`.
+// Neither index is range-checked and the pair is read as a string array without checking its type.
+const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
+    struct gguf_str * strs = (struct gguf_str *) ctx->kv[key_id].value.arr.data;
+
+    return strs[i].data;
+}
+
+// Returns the number of elements of the array stored in key-value pair `i`.
+// The pair is read as an array without checking its type.
+int gguf_get_arr_n(struct gguf_context * ctx, int i) {
+    const struct gguf_kv * kv = &ctx->kv[i];
+
+    return kv->value.arr.n;
+}
+
+// Returns the value of key-value pair `i` read as uint8; the stored type is not checked.
+uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
+    const union gguf_value * val = &ctx->kv[i].value;
+
+    return val->uint8;
+}
+
+// Returns the value of key-value pair `i` read as int8; the stored type is not checked.
+int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
+    const union gguf_value * val = &ctx->kv[i].value;
+
+    return val->int8;
+}
+
+// Returns the value of key-value pair `i` read as uint16; the stored type is not checked.
+uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
+    const union gguf_value * val = &ctx->kv[i].value;
+
+    return val->uint16;
+}
+
+// Returns the value of key-value pair `i` read as int16; the stored type is not checked.
+int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
+    const union gguf_value * val = &ctx->kv[i].value;
+
+    return val->int16;
+}
+
+// Returns the value of key-value pair `i` read as uint32; the stored type is not checked.
+uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
+    const union gguf_value * val = &ctx->kv[i].value;
+
+    return val->uint32;
+}
+
+// Returns the value of key-value pair `i` read as int32; the stored type is not checked.
+int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
+    const union gguf_value * val = &ctx->kv[i].value;
+
+    return val->int32;
+}
+
+// Returns the value of key-value pair `i` read as float; the stored type is not checked.
+float gguf_get_val_f32(struct gguf_context * ctx, int i) {
+    const union gguf_value * val = &ctx->kv[i].value;
+
+    return val->float32;
+}
+
+// Returns the value of key-value pair `i` read as bool; the stored type is not checked.
+bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
+    const union gguf_value * val = &ctx->kv[i].value;
+
+    return val->bool_;
+}
+
+// Returns the value of key-value pair `i` read as a string; the stored type is not checked.
+const char * gguf_get_val_str (struct gguf_context * ctx, int i) {
+    const union gguf_value * val = &ctx->kv[i].value;
+
+    return val->str.data;
+}
+
+// Returns the number of tensors recorded in the header of `ctx`.
+int gguf_get_n_tensors(struct gguf_context * ctx) {
+    const struct gguf_header * hdr = &ctx->header;
+
+    return hdr->n_tensors;
+}
+
+// Returns the index of the first tensor info whose name equals `name`, or -1 when there is none.
+int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
+    const int count = gguf_get_n_tensors(ctx);
+
+    for (int idx = 0; idx < count; ++idx) {
+        if (strcmp(name, gguf_get_tensor_name(ctx, idx)) == 0) {
+            return idx;
+        }
+    }
+
+    // tensor not found
+    return -1;
+}
+
+// Returns the offset of tensor `i` relative to the start of the data section; `i` is not range-checked.
+size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
+    const struct gguf_tensor_info * info = &ctx->infos[i];
+
+    return info->offset;
+}
+
+// Returns the name of tensor `i`; `i` is not range-checked. The string stays owned by `ctx`.
+char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
+    struct gguf_tensor_info * info = &ctx->infos[i];
+
+    return info->name.data;
+}
+
+// Returns the index of the key-value pair named `key`, appending a new pair when the key does not exist yet.
+// A new pair only has its key set; the caller must fill in `type` and `value` right away.
+// Aborts through GGML_ASSERT when memory runs out.
+// NOTE(review): the array is grown with realloc(), while gguf_init_from_file() allocates it with
+// GGML_ALIGNED_MALLOC and gguf_free() releases it with GGML_ALIGNED_FREE - confirm that these are compatible.
+static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
+    const int idx = gguf_find_key(ctx, key);
+    if (idx >= 0) {
+        return idx;
+    }
+
+    const int n_kv = gguf_get_n_kv(ctx);
+
+    // grow through a temporary so that the existing array is not lost when realloc fails
+    struct gguf_kv * kv_new = realloc(ctx->kv, ((size_t) n_kv + 1) * sizeof(struct gguf_kv));
+    GGML_ASSERT(kv_new != NULL && "failed to grow the key-value array");
+    ctx->kv = kv_new;
+
+    char * key_copy = strdup(key);
+    GGML_ASSERT(key_copy != NULL && "failed to copy the key");
+
+    // the stored length does not count the terminating NUL, so the terminator is not written to the file
+    ctx->kv[n_kv].key.n    = strlen(key);
+    ctx->kv[n_kv].key.data = key_copy;
+    ctx->header.n_kv++;
+
+    return n_kv;
+}
+
+// Stores `val` under `key` as a uint8 value, adding the key when it does not exist yet.
+void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
+    // look the slot up first: adding a key may move ctx->kv
+    const int slot = gguf_get_or_add_key(ctx, key);
+
+    struct gguf_kv * kv = &ctx->kv[slot];
+
+    kv->type        = GGUF_TYPE_UINT8;
+    kv->value.uint8 = val;
+}
+
+// Stores `val` under `key` as an int8 value, adding the key when it does not exist yet.
+void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
+    // look the slot up first: adding a key may move ctx->kv
+    const int slot = gguf_get_or_add_key(ctx, key);
+
+    struct gguf_kv * kv = &ctx->kv[slot];
+
+    kv->type       = GGUF_TYPE_INT8;
+    kv->value.int8 = val;
+}
+
+// Stores `val` under `key` as a uint16 value, adding the key when it does not exist yet.
+void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
+    // look the slot up first: adding a key may move ctx->kv
+    const int slot = gguf_get_or_add_key(ctx, key);
+
+    struct gguf_kv * kv = &ctx->kv[slot];
+
+    kv->type         = GGUF_TYPE_UINT16;
+    kv->value.uint16 = val;
+}
+
+// Stores `val` under `key` as an int16 value, adding the key when it does not exist yet.
+void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
+    // look the slot up first: adding a key may move ctx->kv
+    const int slot = gguf_get_or_add_key(ctx, key);
+
+    struct gguf_kv * kv = &ctx->kv[slot];
+
+    kv->type        = GGUF_TYPE_INT16;
+    kv->value.int16 = val;
+}
+
+// Stores `val` under `key` as a uint32 value, adding the key when it does not exist yet.
+void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
+    // look the slot up first: adding a key may move ctx->kv
+    const int slot = gguf_get_or_add_key(ctx, key);
+
+    struct gguf_kv * kv = &ctx->kv[slot];
+
+    kv->type         = GGUF_TYPE_UINT32;
+    kv->value.uint32 = val;
+}
+
+// Stores `val` under `key` as an int32 value, adding the key when it does not exist yet.
+void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
+    // look the slot up first: adding a key may move ctx->kv
+    const int slot = gguf_get_or_add_key(ctx, key);
+
+    struct gguf_kv * kv = &ctx->kv[slot];
+
+    kv->type        = GGUF_TYPE_INT32;
+    kv->value.int32 = val;
+}
+
+// Stores `val` under `key` as a float32 value, adding the key when it does not exist yet.
+void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
+    // look the slot up first: adding a key may move ctx->kv
+    const int slot = gguf_get_or_add_key(ctx, key);
+
+    struct gguf_kv * kv = &ctx->kv[slot];
+
+    kv->type          = GGUF_TYPE_FLOAT32;
+    kv->value.float32 = val;
+}
+
+// Stores `val` under `key` as a bool value, adding the key when it does not exist yet.
+void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
+    // look the slot up first: adding a key may move ctx->kv
+    const int slot = gguf_get_or_add_key(ctx, key);
+
+    struct gguf_kv * kv = &ctx->kv[slot];
+
+    kv->type        = GGUF_TYPE_BOOL;
+    kv->value.bool_ = val;
+}
+
+// Stores a copy of the NUL-terminated string `val` under `key`, adding the key when it does not exist yet.
+// Aborts through GGML_ASSERT when the copy cannot be allocated.
+void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    char * val_copy = strdup(val);
+    GGML_ASSERT(val_copy != NULL && "failed to copy the string value");
+
+    ctx->kv[idx].type = GGUF_TYPE_STRING;
+    // the stored length does not count the terminating NUL, so the terminator is not written to the file
+    ctx->kv[idx].value.str.n    = strlen(val);
+    ctx->kv[idx].value.str.data = val_copy;
+}
+
+// Stores a copy of `n` elements of `type` from `data` as an array under `key`, adding the key when it does
+// not exist yet. The element size is taken from GGUF_TYPE_SIZE, so this is meant for fixed-size element types.
+// Aborts through GGML_ASSERT when the copy cannot be allocated.
+void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    const size_t nbytes = n*GGUF_TYPE_SIZE[type];
+
+    void * data_copy = malloc(nbytes);
+    // malloc(0) may legitimately return NULL
+    GGML_ASSERT((nbytes == 0 || data_copy != NULL) && "failed to allocate the array");
+
+    // skip the copy for empty arrays: `data` may be NULL then
+    if (nbytes > 0) {
+        memcpy(data_copy, data, nbytes);
+    }
+
+    ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
+    ctx->kv[idx].value.arr.type = type;
+    ctx->kv[idx].value.arr.n    = n;
+    ctx->kv[idx].value.arr.data = data_copy;
+}
+
+// Stores copies of the `n` NUL-terminated strings in `data` as a string array under `key`, adding the key
+// when it does not exist yet. Aborts through GGML_ASSERT when memory runs out.
+void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    struct gguf_str * strs = malloc(n*sizeof(struct gguf_str));
+    // malloc(0) may legitimately return NULL
+    GGML_ASSERT((n <= 0 || strs != NULL) && "failed to allocate the string array");
+
+    for (int i = 0; i < n; i++) {
+        char * str_copy = strdup(data[i]);
+        GGML_ASSERT(str_copy != NULL && "failed to copy a string");
+
+        // the stored length does not count the terminating NUL, so the terminator is not written to the file
+        strs[i].n    = strlen(data[i]);
+        strs[i].data = str_copy;
+    }
+
+    ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
+    ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
+    ctx->kv[idx].value.arr.n    = n;
+    ctx->kv[idx].value.arr.data = strs;
+}
+
+// Copies every key-value pair of `src` into `ctx`: keys that already exist in `ctx` get their value replaced,
+// the others are appended. Strings and array contents are duplicated, not shared.
+// Nested arrays are not supported and trigger an assertion.
+void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
+    for (uint32_t i = 0; i < src->header.n_kv; i++) {
+        const struct gguf_kv * kv  = &src->kv[i];
+        const char           * key = kv->key.data;
+
+        switch (kv->type) {
+            case GGUF_TYPE_UINT8:   gguf_set_val_u8  (ctx, key, kv->value.uint8);    break;
+            case GGUF_TYPE_INT8:    gguf_set_val_i8  (ctx, key, kv->value.int8);     break;
+            case GGUF_TYPE_UINT16:  gguf_set_val_u16 (ctx, key, kv->value.uint16);   break;
+            case GGUF_TYPE_INT16:   gguf_set_val_i16 (ctx, key, kv->value.int16);    break;
+            case GGUF_TYPE_UINT32:  gguf_set_val_u32 (ctx, key, kv->value.uint32);   break;
+            case GGUF_TYPE_INT32:   gguf_set_val_i32 (ctx, key, kv->value.int32);    break;
+            case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, key, kv->value.float32);  break;
+            case GGUF_TYPE_BOOL:    gguf_set_val_bool(ctx, key, kv->value.bool_);    break;
+            case GGUF_TYPE_STRING:  gguf_set_val_str (ctx, key, kv->value.str.data); break;
+            case GGUF_TYPE_ARRAY:
+                {
+                    const enum gguf_type elem_type = kv->value.arr.type;
+                    const uint32_t       n_elem    = kv->value.arr.n;
+
+                    if (elem_type == GGUF_TYPE_ARRAY) {
+                        GGML_ASSERT(false && "nested arrays not supported");
+                    } else if (elem_type != GGUF_TYPE_STRING) {
+                        gguf_set_arr_data(ctx, key, elem_type, kv->value.arr.data, n_elem);
+                    } else {
+                        // gguf_set_arr_str() expects plain C strings, so collect the data pointers first
+                        const struct gguf_str * strs = (struct gguf_str *) kv->value.arr.data;
+                        const char ** data = malloc(n_elem*sizeof(char *));
+                        for (uint32_t j = 0; j < n_elem; j++) {
+                            data[j] = strs[j].data;
+                        }
+                        gguf_set_arr_str(ctx, key, data, n_elem);
+                        free(data);
+                    }
+                } break;
+            case GGUF_TYPE_COUNT:  GGML_ASSERT(false && "invalid type"); break;
+        }
+    }
+}
+
+// Appends a tensor info that describes `tensor` (name, dimensions, type, data pointer and size).
+// The tensor data is referenced, not copied. The offset of the new info follows the previous tensor,
+// padded to the alignment of `ctx`. Aborts through GGML_ASSERT when memory runs out.
+// NOTE(review): the array is grown with realloc(), while gguf_init_from_file() allocates it with
+// GGML_ALIGNED_MALLOC and gguf_free() releases it with GGML_ALIGNED_FREE - confirm that these are compatible.
+void gguf_add_tensor(
+        struct gguf_context * ctx,
+        const struct ggml_tensor * tensor) {
+    const int idx = ctx->header.n_tensors;
+
+    // grow through a temporary so that the existing array is not lost when realloc fails
+    struct gguf_tensor_info * infos_new = realloc(ctx->infos, ((size_t) idx + 1)*sizeof(struct gguf_tensor_info));
+    GGML_ASSERT(infos_new != NULL && "failed to grow the tensor info array");
+    ctx->infos = infos_new;
+
+    char * name_copy = strdup(tensor->name);
+    GGML_ASSERT(name_copy != NULL && "failed to copy the tensor name");
+
+    // the stored length does not count the terminating NUL, so the terminator is not written to the file
+    ctx->infos[idx].name.n    = strlen(tensor->name);
+    ctx->infos[idx].name.data = name_copy;
+
+    // unused dimensions are stored as 1
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        ctx->infos[idx].ne[i] = 1;
+    }
+
+    ctx->infos[idx].n_dims = tensor->n_dims;
+    for (int i = 0; i < tensor->n_dims; i++) {
+        ctx->infos[idx].ne[i] = tensor->ne[i];
+    }
+
+    ctx->infos[idx].type   = tensor->type;
+    ctx->infos[idx].offset = 0;
+    ctx->infos[idx].data   = tensor->data;
+    ctx->infos[idx].size   = ggml_nbytes(tensor);
+
+    if (ctx->header.n_tensors > 0) {
+        ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
+    }
+
+    ctx->header.n_tensors++;
+}
+
+// Changes the recorded type of the tensor called `name`. Asserts when no such tensor exists.
+// Offsets and sizes are left untouched.
+void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
+    const int pos = gguf_find_tensor(ctx, name);
+    if (pos < 0) {
+        GGML_ASSERT(false && "tensor not found");
+    }
+
+    struct gguf_tensor_info * info = &ctx->infos[pos];
+
+    info->type = type;
+}
+
+// Replaces the data pointer and size of the tensor called `name` and recomputes the offsets of all tensors
+// that follow it. Asserts when no such tensor exists. `data` is referenced, not copied.
+void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
+    const int pos = gguf_find_tensor(ctx, name);
+    if (pos < 0) {
+        GGML_ASSERT(false && "tensor not found");
+    }
+
+    struct gguf_tensor_info * target = &ctx->infos[pos];
+
+    target->data = data;
+    target->size = size;
+
+    // update offsets: each following tensor starts right after its padded predecessor
+    for (uint32_t i = pos + 1; i < ctx->header.n_tensors; ++i) {
+        const struct gguf_tensor_info * prev = &ctx->infos[i - 1];
+
+        ctx->infos[i].offset = prev->offset + GGML_PAD(prev->size, ctx->alignment);
+    }
+}
+
+//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
+// fwrite(&val->n, sizeof(val->n), 1, file);
+// fwrite(val->data, sizeof(char), val->n, file);
+//}
+//
+//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
+// fwrite(val, sizeof(char), size, file);
+//}
+
+struct gguf_buf { // growable byte buffer used to serialize a gguf_context
+ void * data; // malloc'ed storage, or NULL when the buffer was created with size 0 (writes then only advance `offset`)
+ size_t size; // capacity of `data` in bytes
+ size_t offset; // number of bytes written so far
+};
+
+// Creates a buffer with a capacity of `size` bytes and nothing written yet.
+// A size of 0 yields a buffer without storage (`data == NULL`).
+static struct gguf_buf gguf_buf_init(size_t size) {
+    struct gguf_buf buf;
+
+    buf.data = NULL;
+    if (size != 0) {
+        buf.data = malloc(size);
+    }
+
+    buf.size   = size;
+    buf.offset = 0;
+
+    return buf;
+}
+
+// Releases the storage of `buf`; a buffer without storage is fine.
+static void gguf_buf_free(struct gguf_buf buf) {
+    // free(NULL) is a no-op, so no guard is needed
+    free(buf.data);
+}
+
+// Makes sure that `size` more bytes fit into `buf` at `buf->offset`.
+// When more room is needed the capacity becomes 1.5 times the required size.
+// A buffer without storage (`data == NULL`, as created by gguf_buf_init(0)) only tracks the capacity and
+// never allocates. Aborts through GGML_ASSERT when the storage cannot be grown.
+static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
+    const size_t needed = buf->offset + size;
+    if (needed <= buf->size) {
+        return;
+    }
+
+    // same result as 1.5*needed, but computed in integers
+    const size_t size_new = needed + needed/2;
+
+    if (buf->data) {
+        // keep the old pointer until realloc succeeded: assigning the result directly would leak the block on
+        // failure and leave `data` NULL, which the write helpers treat as "no storage" and silently skip
+        void * data_new = realloc(buf->data, size_new);
+        GGML_ASSERT(data_new != NULL && "failed to grow the buffer");
+        buf->data = data_new;
+    }
+
+    buf->size = size_new;
+}
+
+// Appends a string to `buf`: first the length field `val->n`, then `val->n` bytes of `val->data`.
+// Without storage (`buf->data == NULL`) only `buf->offset` advances.
+static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
+    const size_t len_bytes = sizeof(val->n);
+
+    gguf_buf_grow(buf, len_bytes + val->n);
+
+    if (buf->data) {
+        char * dst = (char *) buf->data + buf->offset;
+
+        memcpy(dst,             &val->n,   len_bytes);
+        memcpy(dst + len_bytes, val->data, val->n);
+    }
+
+    buf->offset += len_bytes;
+    buf->offset += val->n;
+}
+
+// Append `el_size` raw bytes starting at `val` to `buf`.
+// With a storage-less buffer (data == NULL) nothing is copied and only the offset advances.
+static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
+    gguf_buf_grow(buf, el_size);
+
+    char * base = (char *) buf->data;
+    if (base != NULL) {
+        memcpy(base + buf->offset, val, el_size);
+    }
+
+    buf->offset += el_size;
+}
+
+// Append `n` zero bytes to `buf`, one byte per gguf_bwrite_el() call.
+static void gguf_bwrite_zeros(struct gguf_buf * buf, size_t n) {
+    const uint8_t zero = 0;
+
+    for (size_t k = 0; k < n; ++k) {
+        gguf_bwrite_el(buf, &zero, sizeof(zero));
+    }
+}
+
+// Serialize `ctx` into `buf` in this order:
+//   header, key-value pairs, tensor infos, zero padding up to ctx->alignment,
+//   and - unless `only_meta` is set - the data of every tensor, each padded to ctx->alignment.
+static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
+    // header: magic, version, tensor count, kv count
+    gguf_bwrite_el(buf, &ctx->header.magic,     sizeof(ctx->header.magic));
+    gguf_bwrite_el(buf, &ctx->header.version,   sizeof(ctx->header.version));
+    gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
+    gguf_bwrite_el(buf, &ctx->header.n_kv,      sizeof(ctx->header.n_kv));
+
+    // key-value pairs: key string, value type, then the value itself
+    for (uint32_t ikv = 0; ikv < ctx->header.n_kv; ++ikv) {
+        struct gguf_kv * kv = &ctx->kv[ikv];
+
+        gguf_bwrite_str(buf, &kv->key);
+        gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
+
+        switch (kv->type) {
+            case GGUF_TYPE_UINT8:   gguf_bwrite_el (buf, &kv->value.uint8,   sizeof(kv->value.uint8)  ); break;
+            case GGUF_TYPE_INT8:    gguf_bwrite_el (buf, &kv->value.int8,    sizeof(kv->value.int8)   ); break;
+            case GGUF_TYPE_UINT16:  gguf_bwrite_el (buf, &kv->value.uint16,  sizeof(kv->value.uint16) ); break;
+            case GGUF_TYPE_INT16:   gguf_bwrite_el (buf, &kv->value.int16,   sizeof(kv->value.int16)  ); break;
+            case GGUF_TYPE_UINT32:  gguf_bwrite_el (buf, &kv->value.uint32,  sizeof(kv->value.uint32) ); break;
+            case GGUF_TYPE_INT32:   gguf_bwrite_el (buf, &kv->value.int32,   sizeof(kv->value.int32)  ); break;
+            case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
+            case GGUF_TYPE_BOOL:    gguf_bwrite_el (buf, &kv->value.bool_,   sizeof(kv->value.bool_)  ); break;
+            case GGUF_TYPE_STRING:  gguf_bwrite_str(buf, &kv->value.str                               ); break;
+            case GGUF_TYPE_ARRAY:
+                {
+                    // arrays are stored as: element type, element count, elements
+                    gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
+                    gguf_bwrite_el(buf, &kv->value.arr.n,    sizeof(kv->value.arr.n)   );
+
+                    switch (kv->value.arr.type) {
+                        case GGUF_TYPE_UINT8:
+                        case GGUF_TYPE_INT8:
+                        case GGUF_TYPE_UINT16:
+                        case GGUF_TYPE_INT16:
+                        case GGUF_TYPE_UINT32:
+                        case GGUF_TYPE_INT32:
+                        case GGUF_TYPE_FLOAT32:
+                        case GGUF_TYPE_BOOL:
+                            {
+                                // fixed-size elements are written as one contiguous run
+                                gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
+                            } break;
+                        case GGUF_TYPE_STRING:
+                            {
+                                struct gguf_str * strs = (struct gguf_str *) kv->value.arr.data;
+
+                                for (uint32_t istr = 0; istr < kv->value.arr.n; ++istr) {
+                                    gguf_bwrite_str(buf, &strs[istr]);
+                                }
+                            } break;
+                        case GGUF_TYPE_ARRAY:
+                        case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
+                    }
+                } break;
+            case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
+        }
+    }
+
+    // tensor infos: name, number of dims, each dim, type, offset
+    for (uint32_t it = 0; it < ctx->header.n_tensors; ++it) {
+        struct gguf_tensor_info * ti = &ctx->infos[it];
+
+        gguf_bwrite_str(buf, &ti->name);
+        gguf_bwrite_el (buf, &ti->n_dims, sizeof(ti->n_dims));
+        for (uint32_t idim = 0; idim < ti->n_dims; ++idim) {
+            gguf_bwrite_el(buf, &ti->ne[idim], sizeof(ti->ne[idim]));
+        }
+        gguf_bwrite_el(buf, &ti->type,   sizeof(ti->type));
+        gguf_bwrite_el(buf, &ti->offset, sizeof(ti->offset));
+    }
+
+    // the data section has to start at a multiple of the alignment, so pad the meta data with zeros
+    {
+        const size_t meta_end   = buf->offset;
+        const size_t data_start = GGML_PAD(meta_end, ctx->alignment);
+
+        gguf_bwrite_zeros(buf, data_start - meta_end);
+    }
+
+    if (only_meta) {
+        return;
+    }
+
+    // tensor data: each tensor is followed by zero padding up to the alignment;
+    // `offset` tracks the position inside the data section
+    size_t offset = 0;
+
+    for (uint32_t it = 0; it < ctx->header.n_tensors; ++it) {
+        struct gguf_tensor_info * info = &ctx->infos[it];
+
+        const size_t n_data   = info->size;
+        const size_t n_padded = GGML_PAD(n_data, ctx->alignment);
+
+        gguf_bwrite_el(buf, info->data, n_data);
+        gguf_bwrite_zeros(buf, n_padded - n_data);
+
+        // the offset stored in the tensor info must match where the data actually landed
+        GGML_ASSERT(offset == info->offset);
+
+        offset += n_padded;
+    }
+}
+
+// Serialize `ctx` and write the result to the file `fname` (created or truncated).
+// With `only_meta` set, only the meta data is written and the tensor data is left out.
+//
+// Failing to open the file, a short write, or a failed close (which is where
+// buffered data gets flushed) all trip an assert, so that an incomplete file
+// never goes unnoticed.
+void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
+    FILE * file = fopen(fname, "wb");
+    if (!file) {
+        GGML_ASSERT(false && "failed to open file for writing");
+    }
+
+    struct gguf_buf buf = gguf_buf_init(16*1024);
+
+    gguf_write_to_buf(ctx, &buf, only_meta);
+
+    const size_t n_written = fwrite(buf.data, 1, buf.offset, file);
+    const bool   write_ok  = n_written == buf.offset;
+
+    // release resources before reporting, so both happen regardless of the outcome
+    gguf_buf_free(buf);
+
+    const bool close_ok = fclose(file) == 0;
+
+    if (!write_ok) {
+        GGML_ASSERT(false && "failed to write file");
+    }
+    if (!close_ok) {
+        GGML_ASSERT(false && "failed to close file");
+    }
+}
+
+// Return the number of bytes that the meta data of `ctx` occupies when serialized.
+size_t gguf_get_meta_size(struct gguf_context * ctx) {
+    // a buffer created with size 0 has no storage, so the write below
+    // copies nothing and only advances the offset
+    struct gguf_buf counter = gguf_buf_init(0);
+
+    gguf_write_to_buf(ctx, &counter, true);
+
+    const size_t meta_size = counter.offset;
+
+    return meta_size;
+}
+
+// Serialize the meta data of `ctx` and copy it to `data`.
+// `data` must have room for all serialized bytes; gguf_get_meta_size() runs the
+// same meta-only serialization and reports how many that is.
+void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
+    // serialize into a temporary buffer first, then hand the bytes to the caller
+    struct gguf_buf meta = gguf_buf_init(16*1024);
+
+    gguf_write_to_buf(ctx, &meta, true);
+
+    const size_t n_bytes = meta.offset;
+    memcpy(data, meta.data, n_bytes);
+
+    gguf_buf_free(meta);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
int ggml_cpu_has_avx(void) {
#if defined(__AVX__)
return 1;