GGML_TYPE_F64 = 28,
GGML_TYPE_IQ1_M = 29,
GGML_TYPE_BF16 = 30,
+ GGML_TYPE_Q4_0_4_4 = 31,
+ GGML_TYPE_Q4_0_4_8 = 32,
+ GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_COUNT,
};
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
};
// available tensor operations:
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
+ typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr,
+ int64_t k, int64_t bx);
+ typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
+ const void * GGML_RESTRICT y, int nr, int nc);
+ typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
+ const void * GGML_RESTRICT y, int nr, int nc);
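+ // Interpretation of the new hooks (per the mul_mat call sites later in this patch):
+ // from_float_to_mat quantizes nr rows of k floats from x into y, interleaving blocks
+ // of bx elements across the rows; for gemv/gemm, n is the dot-product length (ne00),
+ // s/bs the output base and row stride in floats, x the interleaved quantized weights,
+ // y the quantized activations, nr the number of y rows (1 for gemv) and nc the number
+ // of output columns produced per call.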
typedef struct {
const char * type_name;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously;
+ int64_t ncols; // number of columns to process simultaneously;
+ int64_t interleave_blcksize; // interleave elements in blocks of interleave_blcksize;
+ ggml_from_float_to_mat_t from_float_to_mat;
+ ggml_gemv_t gemv;
+ ggml_gemm_t gemm;
} ggml_type_traits_t;
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
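// Illustrative usage sketch (not part of the patch): probing a type for the interleaved
// kernels through the traits declared above. The getter and field names follow this
// header; the helper function, main() and the printed format are hypothetical.
#include "ggml.h"
#include <stdio.h>

static void print_interleaved_support(enum ggml_type type) {
    const ggml_type_traits_t tt = ggml_internal_get_type_traits(type);
    if (tt.gemm && tt.gemv) {
        // the type ships kernels that produce tt.ncols output columns per call, with
        // weight blocks interleaved in groups of tt.interleave_blcksize elements
        printf("%s: ncols=%d interleave_blcksize=%d\n",
               tt.type_name, (int) tt.ncols, (int) tt.interleave_blcksize);
    } else {
        printf("%s: generic vec_dot path\n", tt.type_name);
    }
}

int main(void) {
    print_interleaved_support(GGML_TYPE_Q4_0_4_4);
    print_interleaved_support(GGML_TYPE_Q4_0);
    return 0;
}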
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
+ ggml-aarch64.c ggml-aarch64.h
)
if (EMSCRIPTEN)
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
+typedef struct {
+ ggml_half d[4]; // deltas for 4 q4_0 blocks
+ uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks
+} block_q4_0x4;
+static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding");
+
+typedef struct {
+ ggml_half d[8]; // deltas for 8 q4_0 blocks
+ uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks
+} block_q4_0x8;
+static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding");
+
+typedef struct {
+ ggml_half d[4]; // deltas for 4 q8_0 blocks
+ int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks
+} block_q8_0x4;
+static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding");
+
+typedef struct {
+ ggml_half d[8]; // deltas for 8 q8_0 blocks
+ int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks
+} block_q8_0x8;
+static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
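+
+// The x4/x8 structs above bundle the scales and quants of 4 or 8 q4_0 / q8_0 blocks so
+// that the aarch64 gemv/gemm kernels can work on several rows at a time; the
+// interleaving is performed by the new quantize_q4_0_4x4/4x8/8x8 and quantize_mat_q8_0
+// routines (ggml-aarch64.c).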
+
//
// Super-block quantization structures
//
#endif // defined(__ARM_NEON) && !defined(_MSC_VER)
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif // __ARM_FEATURE_SVE
+
// precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init()
extern float ggml_table_f32_f16[1 << 16];
}
#endif
#if defined(__ARM_FEATURE_SVE)
- const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
- const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
+ if (svcntb() == QK8_0) {
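+ // these SVE kernels assume a 256-bit vector length (svcntb() == QK8_0 == 32 bytes);
+ // any other SVE width falls through to the NEON implementation below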
+ const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
+ const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
- svfloat32_t sumv0 = svdup_n_f32(0.0f);
- svfloat32_t sumv1 = svdup_n_f32(0.0f);
+ svfloat32_t sumv0 = svdup_n_f32(0.0f);
+ svfloat32_t sumv1 = svdup_n_f32(0.0f);
- assert(nb % 2 == 0); // TODO: handle odd nb
+ assert(nb % 2 == 0); // TODO: handle odd nb
- for (int i = 0; i < nb; i += 2) {
- const block_q4_0 * restrict x0 = &x[i + 0];
- const block_q4_0 * restrict x1 = &x[i + 1];
- const block_q8_0 * restrict y0 = &y[i + 0];
- const block_q8_0 * restrict y1 = &y[i + 1];
+ for (int i = 0; i < nb; i += 2) {
+ const block_q4_0 * restrict x0 = &x[i + 0];
+ const block_q4_0 * restrict x1 = &x[i + 1];
+ const block_q8_0 * restrict y0 = &y[i + 0];
+ const block_q8_0 * restrict y1 = &y[i + 1];
- // load x
- const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
- const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
+ // load x
+ const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
+ const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
- // 4-bit -> 8-bit
- const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04));
- const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04));
+ // 4-bit -> 8-bit
+ const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04));
+ const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04));
- // sub 8
- const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8);
- const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8);
+ // sub 8
+ const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8);
+ const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8);
- // load y
- const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
- const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
+ // load y
+ const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
+ const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
- // dot product
- sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
- sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
- }
+ // dot product
+ sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+ sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+ }
- *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
-#elif defined(__ARM_NEON)
+ *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
+ return;
+ }
+#endif
+#if defined(__ARM_NEON)
float32x4_t sumv0 = vdupq_n_f32(0.0f);
float32x4_t sumv1 = vdupq_n_f32(0.0f);
}
#endif
#if defined(__ARM_FEATURE_SVE)
- svfloat32_t sumv0 = svdup_n_f32(0.0f);
- svfloat32_t sumv1 = svdup_n_f32(0.0f);
+ if (svcntb() == QK8_0) {
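+ // as in the q4_0 case above, only 256-bit SVE (svcntb() == QK8_0) is handled here;
+ // other vector lengths fall through to the NEON code below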
+ svfloat32_t sumv0 = svdup_n_f32(0.0f);
+ svfloat32_t sumv1 = svdup_n_f32(0.0f);
- assert(nb % 2 == 0); // TODO: handle odd nb
+ assert(nb % 2 == 0); // TODO: handle odd nb
- for (int i = 0; i < nb; i += 2) {
- const block_q8_0 * restrict x0 = &x[i + 0];
- const block_q8_0 * restrict x1 = &x[i + 1];
- const block_q8_0 * restrict y0 = &y[i + 0];
- const block_q8_0 * restrict y1 = &y[i + 1];
+ for (int i = 0; i < nb; i += 2) {
+ const block_q8_0 * restrict x0 = &x[i + 0];
+ const block_q8_0 * restrict x1 = &x[i + 1];
+ const block_q8_0 * restrict y0 = &y[i + 0];
+ const block_q8_0 * restrict y1 = &y[i + 1];
- // load x
- const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
- const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs);
+ // load x
+ const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
+ const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs);
- // load y
- const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
- const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
+ // load y
+ const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
+ const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
- sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
- sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
- }
+ sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+ sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+ }
- *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
-#elif defined(__ARM_NEON)
+ *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
+ return;
+ }
+#endif
+#if defined(__ARM_NEON)
float32x4_t sumv0 = vdupq_n_f32(0.0f);
float32x4_t sumv1 = vdupq_n_f32(0.0f);
} \
}
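+// same as VALIDATE_ROW_DATA_D_F16_IMPL, but for the interleaved block types whose
+// delta is an array: every one of the nr per-block scales is checked in each block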
+#define VALIDATE_ROW_DATA_DVEC_F16_IMPL(type, data, nb, nr) \
+ const type * q = (const type *) (data); \
+ for (size_t i = 0; i < (nb); ++i) { \
+ for (size_t j = 0; j < (nr); ++j) { \
+ if (!validate_fp16(q[i].d[j], i)) { \
+ return false; \
+ } \
+ } \
+ }
+
bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) {
if (type < 0 || type >= GGML_TYPE_COUNT) {
fprintf(stderr, "%s: invalid type %d\n", __func__, type);
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
} break;
+ case GGML_TYPE_Q4_0_4_4:
+ case GGML_TYPE_Q4_0_4_8:
+ {
+ VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4);
+ } break;
+ case GGML_TYPE_Q4_0_8_8:
+ {
+ VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
+ } break;
+
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
#include "ggml-impl.h"
#include "ggml-quants.h"
#include "ggml.h"
+#include "ggml-aarch64.h"
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <unistd.h>
#endif
-#ifdef __ARM_FEATURE_MATMUL_INT8
+#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
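+// (presumably so that the native SVE / i8mm dot-product kernels are preferred over the
+// llamafile sgemm path)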
#undef GGML_USE_LLAMAFILE
#endif
#else
.nrows = 1,
#endif
+ .from_float_to_mat = quantize_mat_q8_0,
},
[GGML_TYPE_Q8_1] = {
.type_name = "q8_1",
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
.vec_dot_type = GGML_TYPE_BF16,
.nrows = 1,
+ },
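+ // interleaved q4_0 variants: block size and type_size stay those of q4_0, but the
+ // scalar to_float/from_float/vec_dot hooks are left NULL; these types are consumed
+ // only through the dedicated gemv/gemm kernels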
+ [GGML_TYPE_Q4_0_4_4] = {
+ .type_name = "q4_0_4x4",
+ .blck_size = QK4_0,
+ .type_size = sizeof(block_q4_0),
+ .is_quantized = true,
+ .to_float = NULL,
+ .from_float = NULL,
+ .from_float_reference = NULL,
+ .vec_dot = NULL,
+ .vec_dot_type = GGML_TYPE_Q8_0,
+ .nrows = 1,
+ .ncols = 4,
+ .interleave_blcksize = 4,
+ .gemv = ggml_gemv_q4_0_4x4_q8_0,
+ .gemm = ggml_gemm_q4_0_4x4_q8_0,
+ },
+ [GGML_TYPE_Q4_0_4_8] = {
+ .type_name = "q4_0_4x8",
+ .blck_size = QK4_0,
+ .type_size = sizeof(block_q4_0),
+ .is_quantized = true,
+ .to_float = NULL,
+ .from_float = NULL,
+ .from_float_reference = NULL,
+ .vec_dot = NULL,
+ .vec_dot_type = GGML_TYPE_Q8_0,
+ .nrows = 1,
+ .ncols = 4,
+ .interleave_blcksize = 8,
+ .gemv = ggml_gemv_q4_0_4x8_q8_0,
+ .gemm = ggml_gemm_q4_0_4x8_q8_0,
+ },
+ [GGML_TYPE_Q4_0_8_8] = {
+ .type_name = "q4_0_8x8",
+ .blck_size = QK4_0,
+ .type_size = sizeof(block_q4_0),
+ .is_quantized = true,
+ .to_float = NULL,
+ .from_float = NULL,
+ .from_float_reference = NULL,
+ .vec_dot = NULL,
+ .vec_dot_type = GGML_TYPE_Q8_0,
+ .nrows = 1,
+ .ncols = 8,
+ .interleave_blcksize = 8,
+ .gemv = ggml_gemv_q4_0_8x8_q8_0,
+ .gemm = ggml_gemm_q4_0_8x8_q8_0,
}
};
case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
+ case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break;
+ case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break;
+ case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break;
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
}
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
+ case GGML_TYPE_Q4_0_4_4:
+ case GGML_TYPE_Q4_0_4_8:
+ case GGML_TYPE_Q4_0_8_8:
{
ggml_compute_forward_add_q_f32(params, dst);
} break;
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
+ case GGML_TYPE_Q4_0_4_4:
+ case GGML_TYPE_Q4_0_4_8:
+ case GGML_TYPE_Q4_0_8_8:
{
ggml_compute_forward_add1_q_f32(params, dst);
} break;
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
+ case GGML_TYPE_Q4_0_4_4:
+ case GGML_TYPE_Q4_0_4_8:
+ case GGML_TYPE_Q4_0_8_8:
default:
{
GGML_ASSERT(false);
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
int64_t const vec_dot_num_rows = type_traits[type].nrows;
+ int64_t const matmul_num_cols = type_traits[type].ncols;
+ int64_t const interleave_blcksize = type_traits[type].interleave_blcksize;
+ ggml_from_float_to_mat_t const from_float_to_mat
+ = type_traits[vec_dot_type].from_float_to_mat;
+ ggml_gemv_t const gemv = type_traits[type].gemv;
+ ggml_gemm_t const gemm = type_traits[type].gemm;
GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11);
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
- for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
+ int64_t i11_processed = 0;
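+ // when src1 is 2-D and the weight type provides interleaved kernels, quantize src1
+ // four rows at a time into the interleaved layout expected by gemm; the remaining
+ // ne11 % 4 rows are handled by the regular per-row loop below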
+ if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) {
+ for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
+ from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
+ (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
+ 4, ne10, interleave_blcksize);
+ }
+ i11_processed = ne11 - ne11 % 4;
+ }
+ for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
ne10);
const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
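+ // fast path for types with dedicated kernels: each thread takes a slice of src0 rows
+ // rounded to a multiple of matmul_num_cols, runs gemm on src1 rows in groups of 4 and
+ // gemv on the remainder, then returns without entering the generic chunked loop below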
+ if ((ggml_n_dims(src0) == 2) && gemv) {
+ const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+ const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11;
+ int64_t src0_start = (ith * ne01) / nth;
+ int64_t src0_end = ((ith + 1) * ne01) / nth;
+ src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start;
+ src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end;
+ if (src0_start >= src0_end) return;
+
+ // If there are more than three rows in src1, use gemm; otherwise, use gemv.
+ if (gemm && (ne11 > 3)) {
+ gemm(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01,
+ (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
+ }
+ for (int iter = gemm ? ne11 - ne11 % 4 : 0; iter < ne11; iter++) {
+ gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01,
+ (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1,
+ src0_end - src0_start);
+ }
+ return;
+ }
+
// The first chunk comes from our thread_id, the rest will get auto-assigned.
int current_chunk = ith;
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
+ int64_t const matmul_num_cols = type_traits[type].ncols;
+ ggml_gemv_t const gemv = type_traits[type].gemv;
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(type));
const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = cne1; // src1 rows
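+ // fast path for indirect (expert-routed) matmul: only gemv is used, one src1 row per
+ // mapped expert row, with each thread owning a matmul_num_cols-aligned slice of the
+ // current expert's src0 rows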
+ if (((ggml_n_dims(src0) - 1) == 2) && gemv) {
+ int64_t src0_cur_start = (ith * ne01) / nth;
+ int64_t src0_cur_end = ((ith + 1) * ne01) / nth;
+ src0_cur_start = (src0_cur_start % matmul_num_cols) ? src0_cur_start + matmul_num_cols - (src0_cur_start % matmul_num_cols): src0_cur_start;
+ src0_cur_end = (src0_cur_end % matmul_num_cols) ? src0_cur_end + matmul_num_cols - (src0_cur_end % matmul_num_cols): src0_cur_end;
+ if (src0_cur_start >= src0_cur_end) return;
+
+ for (int ir1 = 0; ir1 < nr1; ir1++) {
+ struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
+ const int id = row_mapping.i1; // selected expert index
+
+ const int64_t i11 = id % ne11;
+ const int64_t i12 = row_mapping.i2; // row index in src1
+
+ const int64_t i1 = id; // selected expert index
+ const int64_t i2 = i12; // row
+
+ const char * src1_col = (const char *) wdata +
+ (src1_cont || src1->type != vec_dot_type
+ ? (i11 + i12 * ne11) * row_size
+ : (i11 * nb11 + i12 * nb12));
+
+ gemv(ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
+ (const char *) src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start);
+ }
+ continue;
+ }
+
// distribute the thread work across the inner or outer loop based on which one is larger
const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
+ case GGML_TYPE_Q4_0_4_4:
+ case GGML_TYPE_Q4_0_4_8:
+ case GGML_TYPE_Q4_0_8_8:
{
ggml_compute_forward_out_prod_q_f32(params, dst);
} break;
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
+ case GGML_TYPE_Q4_0_4_4:
+ case GGML_TYPE_Q4_0_4_8:
+ case GGML_TYPE_Q4_0_8_8:
default:
{
GGML_ASSERT(false);
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
+ case GGML_TYPE_Q4_0_4_4:
+ case GGML_TYPE_Q4_0_4_8:
+ case GGML_TYPE_Q4_0_8_8:
{
ggml_compute_forward_get_rows_q(params, dst);
} break;
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q8_K:
+ case GGML_TYPE_Q4_0_4_4:
+ case GGML_TYPE_Q4_0_4_8:
+ case GGML_TYPE_Q4_0_8_8:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_F16:
{
size_t elemsize = sizeof(ggml_fp16_t);
int ggml_cpu_has_sve(void) {
#if defined(__ARM_FEATURE_SVE)
- // TODO: Currently, SVE 256 bit is only supported.
- GGML_ASSERT(svcntb() == QK8_0);
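+ // the SVE dot-product kernels check svcntb() at run time and fall back to NEON for
+ // vector lengths other than 256 bits, so no assertion is needed here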
return 1;
#else
return 0;