GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
- GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type);
- GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
- GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+ GGML_API GGML_CALL int64_t ggml_blck_size(enum ggml_type type);
+ GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
+ GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
GGML_DEPRECATED(
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
#endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
- typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
- const void * GGML_RESTRICT y, size_t by, int nrc);
- typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr,
- int64_t k, int64_t bx);
- typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
- const void * GGML_RESTRICT y, int nr, int nc);
- typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
- const void * GGML_RESTRICT y, int nr, int nc);
+ typedef void (*ggml_from_float_to_mat_t)
+ (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
+ typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
+ const void * GGML_RESTRICT y, size_t by, int nrc);
+ typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
+ const void * GGML_RESTRICT y, int nr, int nc);
+ typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
+ const void * GGML_RESTRICT y, int nr, int nc);
typedef struct {
- const char * type_name;
- int blck_size;
- size_t type_size;
- bool is_quantized;
- ggml_to_float_t to_float;
- ggml_from_float_t from_float;
- ggml_from_float_t from_float_reference;
- ggml_vec_dot_t vec_dot;
- enum ggml_type vec_dot_type;
- int64_t nrows; // number of rows to process simultaneously;
- int64_t ncols; // number of columns to process simultaneously;
- int64_t interleave_blcksize; // interleave elements in blocks of interleave_blcksize;
+ const char * type_name;
+ int64_t blck_size;
+ int64_t blck_size_interleave; // interleave elements in blocks
+ size_t type_size;
+ bool is_quantized;
+ ggml_to_float_t to_float;
+ ggml_from_float_t from_float;
+ ggml_from_float_t from_float_ref;
ggml_from_float_to_mat_t from_float_to_mat;
- ggml_gemv_t gemv;
- ggml_gemm_t gemm;
+ ggml_vec_dot_t vec_dot;
+ enum ggml_type vec_dot_type;
+ int64_t nrows; // number of rows to process simultaneously
+ int64_t ncols; // number of columns to process simultaneously
+ ggml_gemv_t gemv;
+ ggml_gemm_t gemm;
} ggml_type_traits_t;
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
#endif //__loongarch_asx
// reference implementation for deterministic creation of model files
-void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
+void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
static const int qk = QK4_0;
assert(k % qk == 0);
}
void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
- quantize_row_q4_0_reference(x, y, k);
+ quantize_row_q4_0_ref(x, y, k);
}
-void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
+void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
const int qk = QK4_1;
assert(k % qk == 0);
}
void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
- quantize_row_q4_1_reference(x, y, k);
+ quantize_row_q4_1_ref(x, y, k);
}
-void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
+void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
static const int qk = QK5_0;
assert(k % qk == 0);
}
void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
- quantize_row_q5_0_reference(x, y, k);
+ quantize_row_q5_0_ref(x, y, k);
}
-void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
+void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
const int qk = QK5_1;
assert(k % qk == 0);
}
void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
- quantize_row_q5_1_reference(x, y, k);
+ quantize_row_q5_1_ref(x, y, k);
}
// reference implementation for deterministic creation of model files
-void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
+void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
assert(k % QK8_0 == 0);
const int nb = k / QK8_0;
#else
GGML_UNUSED(nb);
// scalar
- quantize_row_q8_0_reference(x, y, k);
+ quantize_row_q8_0_ref(x, y, k);
#endif
}
// reference implementation for deterministic creation of model files
-void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
+void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
assert(QK8_1 == 32);
assert(k % QK8_1 == 0);
const int nb = k / QK8_1;
#else
GGML_UNUSED(nb);
// scalar
- quantize_row_q8_1_reference(x, y, k);
+ quantize_row_q8_1_ref(x, y, k);
#endif
}
//========================- 2-bit (de)-quantization
-void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int64_t k) {
+void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, int64_t k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
}
void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
- quantize_row_q2_K_reference(x, vy, k);
+ quantize_row_q2_K_ref(x, vy, k);
}
static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
if (!quant_weights) {
- quantize_row_q2_K_reference(src, dst, (int64_t)nrow*n_per_row);
+ quantize_row_q2_K_ref(src, dst, (int64_t)nrow*n_per_row);
}
else {
char * qrow = (char *)dst;
//========================= 3-bit (de)-quantization
-void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int64_t k) {
+void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, int64_t k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
}
void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
- quantize_row_q3_K_reference(x, vy, k);
+ quantize_row_q3_K_ref(x, vy, k);
}
static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
if (!quant_weights) {
- quantize_row_q3_K_reference(src, dst, (int64_t)nrow*n_per_row);
+ quantize_row_q3_K_ref(src, dst, (int64_t)nrow*n_per_row);
}
else {
char * qrow = (char *)dst;
// ====================== 4-bit (de)-quantization
-void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int64_t k) {
+void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, int64_t k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
assert(k % QK_K == 0);
block_q4_K * restrict y = vy;
- quantize_row_q4_K_reference(x, y, k);
+ quantize_row_q4_K_ref(x, y, k);
}
static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
if (!quant_weights) {
- quantize_row_q4_K_reference(src, dst, (int64_t)nrow*n_per_row);
+ quantize_row_q4_K_ref(src, dst, (int64_t)nrow*n_per_row);
}
else {
char * qrow = (char *)dst;
// ====================== 5-bit (de)-quantization
-void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int64_t k) {
+void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, int64_t k) {
assert(k % QK_K == 0);
const int64_t nb = k / QK_K;
void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
assert(k % QK_K == 0);
block_q5_K * restrict y = vy;
- quantize_row_q5_K_reference(x, y, k);
+ quantize_row_q5_K_ref(x, y, k);
}
static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
if (!quant_weights) {
- quantize_row_q5_K_reference(src, dst, (int64_t)nrow*n_per_row);
+ quantize_row_q5_K_ref(src, dst, (int64_t)nrow*n_per_row);
}
else {
char * qrow = (char *)dst;
// ====================== 6-bit (de)-quantization
-void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int64_t k) {
+void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, int64_t k) {
assert(k % QK_K == 0);
const int64_t nb = k / QK_K;
void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
assert(k % QK_K == 0);
block_q6_K * restrict y = vy;
- quantize_row_q6_K_reference(x, y, k);
+ quantize_row_q6_K_ref(x, y, k);
}
static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
if (!quant_weights) {
- quantize_row_q6_K_reference(src, dst, (int64_t)nrow*n_per_row);
+ quantize_row_q6_K_ref(src, dst, (int64_t)nrow*n_per_row);
}
else {
char * qrow = (char *)dst;
static_assert(QK4_0 == 32, "QK4_0 must be 32");
if (!quant_weights) {
- quantize_row_q4_0_reference(x, y, n_per_row);
+ quantize_row_q4_0_ref(x, y, n_per_row);
return;
}
size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
if (!quant_weights) {
- quantize_row_q4_0_reference(src, dst, (int64_t)nrow*n_per_row);
+ quantize_row_q4_0_ref(src, dst, (int64_t)nrow*n_per_row);
return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
}
size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
static_assert(QK4_1 == 32, "QK4_1 must be 32");
if (!quant_weights) {
- quantize_row_q4_1_reference(x, y, n_per_row);
+ quantize_row_q4_1_ref(x, y, n_per_row);
return;
}
size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
if (!quant_weights) {
- quantize_row_q4_1_reference(src, dst, (int64_t)nrow*n_per_row);
+ quantize_row_q4_1_ref(src, dst, (int64_t)nrow*n_per_row);
return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
}
size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
static_assert(QK5_0 == 32, "QK5_0 must be 32");
if (!quant_weights) {
- quantize_row_q5_0_reference(x, y, n_per_row);
+ quantize_row_q5_0_ref(x, y, n_per_row);
return;
}
size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
if (!quant_weights) {
- quantize_row_q5_0_reference(src, dst, (int64_t)nrow*n_per_row);
+ quantize_row_q5_0_ref(src, dst, (int64_t)nrow*n_per_row);
return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
}
size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
static_assert(QK5_1 == 32, "QK5_1 must be 32");
if (!quant_weights) {
- quantize_row_q5_1_reference(x, y, n_per_row);
+ quantize_row_q5_1_ref(x, y, n_per_row);
return;
}
size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
if (!quant_weights) {
- quantize_row_q5_1_reference(src, dst, (int64_t)nrow*n_per_row);
+ quantize_row_q5_1_ref(src, dst, (int64_t)nrow*n_per_row);
return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
}
size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
(void)quant_weights; // not used
const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
- quantize_row_q8_0_reference(src, dst, (int64_t)nrow*n_per_row);
+ quantize_row_q8_0_ref(src, dst, (int64_t)nrow*n_per_row);
return nrow * row_size;
}
//===================================== Q8_K ==============================================
-void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int64_t k) {
+void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, int64_t k) {
assert(k % QK_K == 0);
const int64_t nb = k / QK_K;
}
void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
- quantize_row_q8_K_reference(x, y, k);
+ quantize_row_q8_K_ref(x, y, k);
}
//===================================== Dot ptoducts =================================
void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int64_t k) {
assert(k % QK_K == 0);
block_iq3_xxs * restrict y = vy;
- quantize_row_iq3_xxs_reference(x, y, k);
+ quantize_row_iq3_xxs_ref(x, y, k);
}
-void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
+void quantize_row_iq3_xxs_ref(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
assert(k % QK_K == 0);
quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
}
void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int64_t k) {
assert(k % QK_K == 0);
block_iq3_s * restrict y = vy;
- quantize_row_iq3_s_reference(x, y, k);
+ quantize_row_iq3_s_ref(x, y, k);
}
-void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
+void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
assert(k % QK_K == 0);
quantize_iq3_s(x, y, 1, k, NULL);
}
}
}
-void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
+void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
assert(k % QK4_NL == 0);
quantize_row_iq4_nl(x, y, k);
}
void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int64_t k) {
assert(k % QK_K == 0);
block_iq4_xs * restrict y = vy;
- quantize_row_iq4_xs_reference(x, y, k);
+ quantize_row_iq4_xs_ref(x, y, k);
}
-void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
+void quantize_row_iq4_xs_ref(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
assert(k % QK_K == 0);
quantize_iq4_xs(x, y, 1, k, NULL);
}
return nrow * nblock * sizeof(block_iq2_s);
}
-void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
+void quantize_row_iq2_s_ref(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
assert(k % QK_K == 0);
quantize_iq2_s(x, y, 1, k, NULL);
}
void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) {
assert(k % QK_K == 0);
block_iq2_s * restrict y = vy;
- quantize_row_iq2_s_reference(x, y, k);
+ quantize_row_iq2_s_ref(x, y, k);
}
static bool validate_float(float f, size_t i) {
#endif
// Quantization
-void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
-
-void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
-
-void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
+
+void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
+
+void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
.is_quantized = false,
.to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
.from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
- .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+ .from_float_ref = (ggml_from_float_t) ggml_fp32_to_fp16_row,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
.vec_dot_type = GGML_TYPE_F16,
.nrows = 1,
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q4_0,
.from_float = quantize_row_q4_0,
- .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
+ .from_float_ref = (ggml_from_float_t) quantize_row_q4_0_ref,
.vec_dot = ggml_vec_dot_q4_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
#if defined (__ARM_FEATURE_MATMUL_INT8)
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q4_1,
.from_float = quantize_row_q4_1,
- .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
+ .from_float_ref = (ggml_from_float_t) quantize_row_q4_1_ref,
.vec_dot = ggml_vec_dot_q4_1_q8_1,
.vec_dot_type = GGML_TYPE_Q8_1,
#if defined (__ARM_FEATURE_MATMUL_INT8)
.is_quantized = false,
.to_float = NULL,
.from_float = NULL,
- .from_float_reference = NULL,
+ .from_float_ref = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_COUNT,
.nrows = 1,
.is_quantized = false,
.to_float = NULL,
.from_float = NULL,
- .from_float_reference = NULL,
+ .from_float_ref = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_COUNT,
.nrows = 1,
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q5_0,
.from_float = quantize_row_q5_0,
- .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
+ .from_float_ref = (ggml_from_float_t) quantize_row_q5_0_ref,
.vec_dot = ggml_vec_dot_q5_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q5_1,
.from_float = quantize_row_q5_1,
- .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
+ .from_float_ref = (ggml_from_float_t) quantize_row_q5_1_ref,
.vec_dot = ggml_vec_dot_q5_1_q8_1,
.vec_dot_type = GGML_TYPE_Q8_1,
.nrows = 1,
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q8_0,
.from_float = quantize_row_q8_0,
- .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
+ .from_float_ref = (ggml_from_float_t) quantize_row_q8_0_ref,
+ .from_float_to_mat = quantize_mat_q8_0,
.vec_dot = ggml_vec_dot_q8_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
#if defined (__ARM_FEATURE_MATMUL_INT8)
#else
.nrows = 1,
#endif
- .from_float_to_mat = quantize_mat_q8_0,
},
[GGML_TYPE_Q8_1] = {
.type_name = "q8_1",
.type_size = sizeof(block_q8_1),
.is_quantized = true,
.from_float = quantize_row_q8_1,
- .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
+ .from_float_ref = (ggml_from_float_t) quantize_row_q8_1_ref,
.vec_dot_type = GGML_TYPE_Q8_1,
.nrows = 1,
},
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q2_K,
.from_float = quantize_row_q2_K,
- .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
+ .from_float_ref = (ggml_from_float_t) quantize_row_q2_K_ref,
.vec_dot = ggml_vec_dot_q2_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q3_K,
.from_float = quantize_row_q3_K,
- .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
+ .from_float_ref = (ggml_from_float_t) quantize_row_q3_K_ref,
.vec_dot = ggml_vec_dot_q3_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q4_K,
.from_float = quantize_row_q4_K,
- .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
+ .from_float_ref = (ggml_from_float_t) quantize_row_q4_K_ref,
.vec_dot = ggml_vec_dot_q4_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q5_K,
.from_float = quantize_row_q5_K,
- .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
+ .from_float_ref = (ggml_from_float_t) quantize_row_q5_K_ref,
.vec_dot = ggml_vec_dot_q5_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q6_K,
.from_float = quantize_row_q6_K,
- .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
+ .from_float_ref = (ggml_from_float_t) quantize_row_q6_K_ref,
.vec_dot = ggml_vec_dot_q6_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
.from_float = NULL,
- .from_float_reference = NULL,
+ .from_float_ref = NULL,
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
.from_float = NULL,
- .from_float_reference = NULL,
+ .from_float_ref = NULL,
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq3_xxs,
.from_float = quantize_row_iq3_xxs,
- .from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
+ .from_float_ref = (ggml_from_float_t)quantize_row_iq3_xxs_ref,
.vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq3_s,
.from_float = quantize_row_iq3_s,
- .from_float_reference = (ggml_from_float_t)quantize_row_iq3_s_reference,
+ .from_float_ref = (ggml_from_float_t)quantize_row_iq3_s_ref,
.vec_dot = ggml_vec_dot_iq3_s_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq2_s,
.from_float = quantize_row_iq2_s,
- .from_float_reference = (ggml_from_float_t)quantize_row_iq2_s_reference,
+ .from_float_ref = (ggml_from_float_t)quantize_row_iq2_s_ref,
.vec_dot = ggml_vec_dot_iq2_s_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq1_s,
.from_float = NULL,
- .from_float_reference = NULL,
+ .from_float_ref = NULL,
.vec_dot = ggml_vec_dot_iq1_s_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq1_m,
.from_float = NULL,
- .from_float_reference = NULL,
+ .from_float_ref = NULL,
.vec_dot = ggml_vec_dot_iq1_m_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq4_nl,
.from_float = quantize_row_iq4_nl,
- .from_float_reference = (ggml_from_float_t)quantize_row_iq4_nl_reference,
+ .from_float_ref = (ggml_from_float_t)quantize_row_iq4_nl_ref,
.vec_dot = ggml_vec_dot_iq4_nl_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
.from_float = quantize_row_iq4_xs,
- .from_float_reference = (ggml_from_float_t)quantize_row_iq4_xs_reference,
+ .from_float_ref = (ggml_from_float_t)quantize_row_iq4_xs_ref,
.vec_dot = ggml_vec_dot_iq4_xs_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
.is_quantized = false,
.to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
.from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
- .from_float_reference = (ggml_from_float_t) ggml_fp32_to_bf16_row,
+ .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
.vec_dot_type = GGML_TYPE_BF16,
.nrows = 1,
[GGML_TYPE_Q4_0_4_4] = {
.type_name = "q4_0_4x4",
.blck_size = QK4_0,
+ .blck_size_interleave = 4,
.type_size = sizeof(block_q4_0),
.is_quantized = true,
.to_float = NULL,
.from_float = NULL,
- .from_float_reference = NULL,
+ .from_float_ref = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 4,
- .interleave_blcksize = 4,
.gemv = ggml_gemv_q4_0_4x4_q8_0,
.gemm = ggml_gemm_q4_0_4x4_q8_0,
},
[GGML_TYPE_Q4_0_4_8] = {
.type_name = "q4_0_4x8",
.blck_size = QK4_0,
+ .blck_size_interleave = 8,
.type_size = sizeof(block_q4_0),
.is_quantized = true,
.to_float = NULL,
.from_float = NULL,
- .from_float_reference = NULL,
+ .from_float_ref = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 4,
- .interleave_blcksize = 8,
.gemv = ggml_gemv_q4_0_4x8_q8_0,
.gemm = ggml_gemm_q4_0_4x8_q8_0,
},
[GGML_TYPE_Q4_0_8_8] = {
.type_name = "q4_0_8x8",
.blck_size = QK4_0,
+ .blck_size_interleave = 8,
.type_size = sizeof(block_q4_0),
.is_quantized = true,
.to_float = NULL,
.from_float = NULL,
- .from_float_reference = NULL,
+ .from_float_ref = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 8,
- .interleave_blcksize = 8,
.gemv = ggml_gemv_q4_0_8x8_q8_0,
.gemm = ggml_gemm_q4_0_8x8_q8_0,
}
return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
}
-GGML_CALL int ggml_blck_size(enum ggml_type type) {
+GGML_CALL int64_t ggml_blck_size(enum ggml_type type) {
return type_traits[type].blck_size;
}
const enum ggml_type type = src0->type;
- enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
- ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
- int64_t const vec_dot_num_rows = type_traits[type].nrows;
- int64_t const matmul_num_cols = type_traits[type].ncols;
- int64_t const interleave_blcksize = type_traits[type].interleave_blcksize;
- ggml_from_float_to_mat_t const from_float_to_mat
- = type_traits[vec_dot_type].from_float_to_mat;
- ggml_gemv_t const gemv = type_traits[type].gemv;
- ggml_gemm_t const gemm = type_traits[type].gemm;
+ enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
+ ggml_from_float_t const from_float = type_traits[vec_dot_type].from_float;
+ ggml_from_float_to_mat_t const from_float_to_mat = type_traits[vec_dot_type].from_float_to_mat;
+ int64_t const vec_dot_num_rows = type_traits[type].nrows;
+ int64_t const matmul_num_cols = type_traits[type].ncols;
+ int64_t const blck_size_interleave = type_traits[type].blck_size_interleave;
+ ggml_gemv_t const gemv = type_traits[type].gemv;
+ ggml_gemm_t const gemm = type_traits[type].gemm;
GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11);
for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
- 4, ne10, interleave_blcksize);
+ 4, ne10, blck_size_interleave);
}
i11_processed = ne11 - ne11 % 4;
}
for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
- from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
- (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
- ne10);
+ from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
+ (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
+ ne10);
}
}
}
int64_t src0_start = (ith * ne01) / nth;
int64_t src0_end = ((ith + 1) * ne01) / nth;
src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start;
- src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end;
+ src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end;
if (src0_start >= src0_end) return;
// If there are more than three rows in src1, use gemm; otherwise, use gemv.
const bool src1_cont = ggml_is_contiguous(src1);
- ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
- enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
- ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
- int64_t const matmul_num_cols = type_traits[type].ncols;
- ggml_gemv_t const gemv = type_traits[type].gemv;
+ ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
+ enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
+ ggml_from_float_t const from_float = type_traits[vec_dot_type].from_float;
+ int64_t const matmul_num_cols = type_traits[type].ncols;
+ ggml_gemv_t const gemv = type_traits[type].gemv;
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(type));
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
- from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
- (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
- ne10);
+ from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
+ (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
+ ne10);
}
}
}
(int64_t) info->ne[3];
if (ne % ggml_blck_size(info->type) != 0) {
- fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
- __func__, info->name.data, (int)info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
+ fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
+ __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
fclose(file);
gguf_free(ctx);
return NULL;