// GGML_TYPE_IQ4_NL_4_8 = 37,
// GGML_TYPE_IQ4_NL_8_8 = 38,
GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
- GGML_TYPE_COUNT = 40,
+ GGML_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale)
+ GGML_TYPE_COUNT = 41,
};
// precision
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
+ GGML_FTYPE_MOSTLY_NVFP4 = 26, // except 1d tensors
};
// available tensor operations:
#define QI_MXFP4 (QK_MXFP4 / (4 * QR_MXFP4))
#define QR_MXFP4 2
+#define QI_NVFP4 (QK_NVFP4 / (4 * QR_NVFP4))
+#define QR_NVFP4 2
+
#define QI5_0 (QK5_0 / (4 * QR5_0))
#define QR5_0 2
} block_mxfp4;
static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + QK_MXFP4/2, "wrong mxfp4 block size/padding");
+#define QK_NVFP4 64
+#define QK_NVFP4_SUB 16 // sub-block size for per-group scales
+
+// NVFP4 super-block: 64 E2M1 (FP4) values split into 4 sub-blocks of 16,
+// each sub-block carrying its own UE4M3 (unsigned E4M3) scale.
+// Nibble layout (see dequantize_row_nvfp4): within a sub-block, qs byte j
+// packs element j in the low nibble and element j+8 in the high nibble.
+typedef struct {
+ uint8_t d[QK_NVFP4/QK_NVFP4_SUB]; // UE4M3 scales (4 bytes, one per 16-element sub-block)
+ uint8_t qs[QK_NVFP4/2]; // packed 4-bit E2M1 values (32 bytes)
+} block_nvfp4;
+static_assert(sizeof(block_nvfp4) == sizeof(uint8_t)*(QK_NVFP4/QK_NVFP4_SUB) + QK_NVFP4/2, "wrong nvfp4 block size/padding");
#define QK5_0 32
typedef struct {
ggml_half d; // delta
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
+#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
+// quants.c
+#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
// ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
+#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
+#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
+#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x1_generic ggml_quantize_mat_q8_0_4x1
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#elif defined(__s390x__)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
+#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
+#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
*s = sumf;
}
+// Dot product of one NVFP4 row segment against q8_0 activations.
+// Each NVFP4 super-block (64 elems, 4 sub-blocks of 16) is paired with
+// two consecutive q8_0 blocks (QK8_0 == 32) of y.
+void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+ assert(n % QK_NVFP4 == 0);
+
+ const block_nvfp4 * GGML_RESTRICT x = vx;
+ const block_q8_0 * GGML_RESTRICT y = vy;
+
+ // Each NVFP4 super-block (64 elements) spans 2 q8_0 blocks
+ const int nb = n / QK_NVFP4;
+
+ float sumf = 0;
+
+#if defined __ARM_NEON
+ // NVFP4 reuses the mxfp4 E2M1 LUT; kvalues are 2x the true E2M1 values,
+ // which is cancelled by the 0.5 factor baked into ggml_ue4m3_to_fp32().
+ const int8x16_t values = vld1q_s8(kvalues_mxfp4);
+ const uint8x16_t m4b = vdupq_n_u8(0x0f);
+ float32x4_t acc = vdupq_n_f32(0.0f);
+
+ for (int ib = 0; ib < nb; ++ib) {
+ const uint8x16_t q4bits_0 = vld1q_u8(x[ib].qs);
+ const uint8x16_t q4bits_1 = vld1q_u8(x[ib].qs + 16);
+
+ // Low nibbles hold elements 0..7 of each sub-block, high nibbles elements 8..15,
+ // so q4_lo_k has {sub 2k elems 0..7, sub 2k+1 elems 0..7} and q4_hi_k the 8..15 halves.
+ const int8x16_t q4_lo_0 = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits_0, m4b));
+ const int8x16_t q4_hi_0 = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits_0, 4));
+ const int8x16_t q4_lo_1 = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits_1, m4b));
+ const int8x16_t q4_hi_1 = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits_1, 4));
+
+ // Rearrange the linear q8 data so each vector pairs the matching halves
+ // of two consecutive 16-element sub-blocks (mirrors the q4 nibble layout).
+ const int8x16_t q8_0a = vld1q_s8(y[2*ib].qs);
+ const int8x16_t q8_0b = vld1q_s8(y[2*ib].qs + 16);
+ const int8x16_t q8_lo_0 = vcombine_s8(vget_low_s8(q8_0a), vget_low_s8(q8_0b));
+ const int8x16_t q8_hi_0 = vcombine_s8(vget_high_s8(q8_0a), vget_high_s8(q8_0b));
+
+ const int8x16_t q8_1a = vld1q_s8(y[2*ib+1].qs);
+ const int8x16_t q8_1b = vld1q_s8(y[2*ib+1].qs + 16);
+ const int8x16_t q8_lo_1 = vcombine_s8(vget_low_s8(q8_1a), vget_low_s8(q8_1b));
+ const int8x16_t q8_hi_1 = vcombine_s8(vget_high_s8(q8_1a), vget_high_s8(q8_1b));
+
+ // p0 lanes {0,1} sum to sub-block 0, lanes {2,3} to sub-block 1 (likewise p1 for subs 2,3).
+ const int32x4_t p0 = vaddq_s32(
+ ggml_vdotq_s32(vdupq_n_s32(0), q4_lo_0, q8_lo_0),
+ ggml_vdotq_s32(vdupq_n_s32(0), q4_hi_0, q8_hi_0));
+ const int32x4_t p1 = vaddq_s32(
+ ggml_vdotq_s32(vdupq_n_s32(0), q4_lo_1, q8_lo_1),
+ ggml_vdotq_s32(vdupq_n_s32(0), q4_hi_1, q8_hi_1));
+
+ // Pairwise add collapses to one integer sum per sub-block: {s0, s1, s2, s3}.
+ const int32x4_t sums = vpaddq_s32(p0, p1);
+
+ // Decode 4 UE4M3 scales to f32 and multiply with q8 scales
+ const float dy0 = GGML_CPU_FP16_TO_FP32(y[2*ib].d);
+ const float dy1 = GGML_CPU_FP16_TO_FP32(y[2*ib+1].d);
+ const float32x4_t nvsc = {
+ ggml_ue4m3_to_fp32(x[ib].d[0]),
+ ggml_ue4m3_to_fp32(x[ib].d[1]),
+ ggml_ue4m3_to_fp32(x[ib].d[2]),
+ ggml_ue4m3_to_fp32(x[ib].d[3])
+ };
+ // Sub-blocks 0,1 live in q8 block 2*ib (scale dy0); sub-blocks 2,3 in 2*ib+1 (dy1).
+ const float32x4_t scales = vmulq_f32(nvsc, (float32x4_t){dy0, dy0, dy1, dy1});
+
+ acc = vfmaq_f32(acc, vcvtq_f32_s32(sums), scales);
+ }
+ sumf = vaddvq_f32(acc);
+#else
+ // Scalar fallback: identical math to ggml_vec_dot_nvfp4_q8_0_generic.
+ for (int ib = 0; ib < nb; ++ib) {
+ for (int si = 0; si < 4; ++si) {
+ const float d = ggml_ue4m3_to_fp32(x[ib].d[si]);
+ const int q8b = si / 2;
+ const int q8o = (si % 2) * QK_NVFP4_SUB;
+ const float dy = GGML_CPU_FP16_TO_FP32(y[2*ib + q8b].d);
+
+ int sumi_lo = 0, sumi_hi = 0;
+ for (int j = 0; j < QK_NVFP4_SUB/2; ++j) {
+ const uint8_t qv = x[ib].qs[si*(QK_NVFP4_SUB/2) + j];
+ sumi_lo += y[2*ib + q8b].qs[q8o + j + 0] * kvalues_mxfp4[qv & 0xf];
+ sumi_hi += y[2*ib + q8b].qs[q8o + j + QK_NVFP4_SUB/2] * kvalues_mxfp4[qv >> 4];
+ }
+ sumf += dy * d * (sumi_lo + sumi_hi);
+ }
+ }
+#endif
+ *s = sumf;
+}
+
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK8_0;
const int nb = n / qk;
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
},
+ [GGML_TYPE_NVFP4] = {
+ .from_float = quantize_row_nvfp4,
+ .vec_dot = ggml_vec_dot_nvfp4_q8_0,
+ .vec_dot_type = GGML_TYPE_Q8_0,
+ .nrows = 1,
+ },
[GGML_TYPE_Q2_K] = {
.from_float = quantize_row_q2_K,
.vec_dot = ggml_vec_dot_q2_K_q8_K,
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_MXFP4:
+ case GGML_TYPE_NVFP4:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_MXFP4:
+ case GGML_TYPE_NVFP4:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_MXFP4:
+ case GGML_TYPE_NVFP4:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_MXFP4:
+ case GGML_TYPE_NVFP4:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_MXFP4:
+ case GGML_TYPE_NVFP4:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_MXFP4:
+ case GGML_TYPE_NVFP4:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_MXFP4:
+ case GGML_TYPE_NVFP4:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
quantize_row_mxfp4_ref(x, y, k);
}
+// Quantize k float values to NVFP4; thin void* wrapper over the typed
+// reference implementation (no SIMD-specialized path).
+void quantize_row_nvfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
+ quantize_row_nvfp4_ref(x, y, k);
+}
+
//
// 2-6 bit quantization in super-blocks
//
*s = sumf;
}
+// NVFP4: super-block of 64 elements = 4 sub-blocks of 16 = 2 q8_0 blocks
+void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+ assert(n % QK_NVFP4 == 0);
+
+ const block_nvfp4 * GGML_RESTRICT x = vx;
+ const block_q8_0 * GGML_RESTRICT y = vy;
+
+ const int nb = n / QK_NVFP4;
+
+ float sumf = 0;
+
+ for (int ib = 0; ib < nb; ++ib) {
+ for (int s_idx = 0; s_idx < 4; ++s_idx) {
+ const float d = ggml_ue4m3_to_fp32(x[ib].d[s_idx]);
+ const int q8_block = s_idx / 2;
+ const int q8_off = (s_idx % 2) * QK_NVFP4_SUB;
+ const float dy = GGML_CPU_FP16_TO_FP32(y[2*ib + q8_block].d);
+
+ int sumi_lo = 0, sumi_hi = 0;
+ for (int j = 0; j < QK_NVFP4_SUB/2; ++j) {
+ const uint8_t qv = x[ib].qs[s_idx*(QK_NVFP4_SUB/2) + j];
+ sumi_lo += y[2*ib + q8_block].qs[q8_off + j + 0] * kvalues_mxfp4[qv & 0xf];
+ sumi_hi += y[2*ib + q8_block].qs[q8_off + j + QK_NVFP4_SUB/2] * kvalues_mxfp4[qv >> 4];
+ }
+
+ sumf += dy * d * (sumi_lo + sumi_hi);
+ }
+ }
+ *s = sumf;
+}
+
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK8_0;
const int nb = n / qk;
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_mxfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_nvfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
#define GGML_E8M0_TO_FP32(x) ggml_e8m0_to_fp32(x)
#define GGML_E8M0_TO_FP32_HALF(x) ggml_e8m0_to_fp32_half(x)
// Decode a UE4M3 scale byte: unsigned, 4 exponent bits (bias 7), 3 mantissa bits.
// 0x00 decodes to zero and 0x7F (the E4M3FN NaN pattern) is mapped to 0.0f as well.
// The result is halved to match the kvalues_mxfp4 convention
// (each kvalue is 2x the true E2M1 value, so the scale carries the 1/2).
static inline float ggml_ue4m3_to_fp32(uint8_t x) {
    if (x == 0 || x == 0x7F) {
        return 0.0f;
    }
    const int e = (x >> 3) & 0xF;
    const int m =  x       & 0x7;
    // normal:    (1 + m/8) * 2^(e-7) == (8 + m) * 2^(e-10)
    // subnormal: (e == 0)  ->  m * 2^-9
    const float mag = (e == 0) ? ldexpf((float) m, -9)
                               : ldexpf((float) (8 + m), e - 10);
    return 0.5f * mag;
}
+
// Encode a positive float as a UE4M3 scale byte (unsigned, 4 exponent bits with
// bias 7, 3 mantissa bits). Non-positive (and NaN) inputs encode as 0; values
// above 448 (the largest finite E4M3FN magnitude) saturate to 0x7E.
//
// Fixes vs the previous version:
//  - exponent 15 is a VALID finite exponent in E4M3FN (mantissa 0..6 covers
//    256..448); the old `ue4m3_exp >= 15` early-out wrongly clamped every
//    input in [256, 448) to 448. Only 0x7F (exp 15, mantissa 7) is NaN and
//    must be avoided.
//  - when the subnormal mantissa rounds up to 8, the value is promoted to the
//    smallest normal 2^-6 (0x08) instead of being clamped down to 7 * 2^-9.
static inline uint8_t ggml_fp32_to_ue4m3(float x) {
    if (!(x > 0.0f)) {
        return 0;
    }
    if (x > 448.0f) {
        x = 448.0f;
    }
    uint32_t bits;
    memcpy(&bits, &x, 4);
    int fp32_exp = ((bits >> 23) & 0xFF) - 127;
    int fp32_man = (bits >> 20) & 0x7;     // top 3 fp32 mantissa bits
    int ue4m3_exp = fp32_exp + 7;
    if (ue4m3_exp <= 0) {
        // subnormal: value = man * 2^-9, man = round(x * 2^9)
        int man = (int) (x * 512.0f + 0.5f);
        if (man > 7) {
            return 0x08; // rounds up past the subnormal range: smallest normal 2^-6
        }
        if (man < 1) {
            return 0;
        }
        return (uint8_t) man;
    }
    int round_bit = (bits >> 19) & 1;      // round half up on the 4th mantissa bit
    int ue4m3_man = fp32_man + round_bit;
    if (ue4m3_man > 7) {
        // mantissa overflow: carry into the exponent
        ue4m3_man = 0;
        ue4m3_exp++;
    }
    if (ue4m3_exp > 15 || (ue4m3_exp == 15 && ue4m3_man > 6)) {
        return 0x7E; // saturate to 448; 0x7F (exp 15, man 7) is the NaN encoding
    }
    return (uint8_t) ((ue4m3_exp << 3) | ue4m3_man);
}
+
/**
* Converts brain16 to float32.
*
case GGML_OP_SOLVE_TRI:
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
- return has_simdgroup_reduction;
+ return has_simdgroup_reduction && op->src[0]->type != GGML_TYPE_NVFP4;
case GGML_OP_SET:
case GGML_OP_CPY:
case GGML_OP_DUP:
};
}
case GGML_OP_GET_ROWS:
- return true;
+ return op->src[0]->type != GGML_TYPE_NVFP4;
case GGML_OP_SET_ROWS:
{
if (op->src[0]->type != GGML_TYPE_F32) {
}
}
+void quantize_row_nvfp4_ref(const float * GGML_RESTRICT x, block_nvfp4 * GGML_RESTRICT y, int64_t k) {
+ static const int qk = QK_NVFP4;
+ static const int qk_sub = QK_NVFP4_SUB;
+ static const int n_sub = QK_NVFP4 / QK_NVFP4_SUB;
+
+ assert(k % qk == 0);
+
+ const int nb = k / qk;
+
+ for (int i = 0; i < nb; i++) {
+ for (int s = 0; s < n_sub; s++) {
+ const float * xb = x + i*qk + s*qk_sub;
+
+ float amax = 0.0f;
+ for (int j = 0; j < qk_sub; j++) {
+ if (amax < fabsf(xb[j])) {
+ amax = fabsf(xb[j]);
+ }
+ }
+
+ // UE4M3 scale: amax / 6.0 maps the max E2M1 value (6.0) to amax
+ const uint8_t ue = ggml_fp32_to_ue4m3(amax / 6.0f);
+ y[i].d[s] = ue;
+ const float d = ggml_ue4m3_to_fp32(ue);
+
+ for (int j = 0; j < qk_sub/2; ++j) {
+ const uint8_t x0 = best_index_mxfp4(xb[0 + j], d);
+ const uint8_t x1 = best_index_mxfp4(xb[qk_sub/2 + j], d);
+
+ y[i].qs[s*(qk_sub/2) + j] = x0 | (x1 << 4);
+ }
+ }
+ }
+}
+
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
static const int qk = QK4_0;
}
}
+void dequantize_row_nvfp4(const block_nvfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
+ static const int qk = QK_NVFP4;
+ static const int qk_sub = QK_NVFP4_SUB;
+ static const int n_sub = QK_NVFP4 / QK_NVFP4_SUB;
+
+ assert(k % qk == 0);
+
+ const int nb = k / qk;
+
+ for (int i = 0; i < nb; i++) {
+ for (int s = 0; s < n_sub; s++) {
+ const float d = ggml_ue4m3_to_fp32(x[i].d[s]);
+ float * yb = y + i*qk + s*qk_sub;
+
+ for (int j = 0; j < qk_sub/2; ++j) {
+ const int8_t v0 = kvalues_mxfp4[x[i].qs[s*(qk_sub/2) + j] & 0x0F];
+ const int8_t v1 = kvalues_mxfp4[x[i].qs[s*(qk_sub/2) + j] >> 4];
+
+ yb[j + 0 ] = v0*d;
+ yb[j + qk_sub/2] = v1*d;
+ }
+ }
+ }
+}
+
//
// 2-6 bit quantization in super-blocks
//
return nrow * ggml_row_size(GGML_TYPE_MXFP4, n_per_row);
}
+size_t quantize_nvfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+ GGML_UNUSED(quant_weights);
+ quantize_row_nvfp4_ref(src, dst, (int64_t)nrow*n_per_row);
+ return nrow * ggml_row_size(GGML_TYPE_NVFP4, n_per_row);
+}
+
// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k) {
{
VALIDATE_ROW_DATA_E_E8M0_IMPL(block_mxfp4, data, nb);
} break;
+ case GGML_TYPE_NVFP4:
+ {
+ // UE4M3 scales are uint8_t — all byte values are valid
+ GGML_UNUSED(data);
+ GGML_UNUSED(nb);
+ } break;
case GGML_TYPE_Q2_K:
{
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q2_K, data, nb, d, dmin);
GGML_API void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_nvfp4_ref(const float * GGML_RESTRICT x, block_nvfp4 * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
//GGML_API void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_nvfp4(const block_nvfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_nvfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API void iq2xs_init_impl(enum ggml_type type);
GGML_API void iq2xs_free_impl(enum ggml_type type);
.to_float = (ggml_to_float_t) dequantize_row_mxfp4,
.from_float_ref = (ggml_from_float_t)quantize_row_mxfp4_ref,
},
+ [GGML_TYPE_NVFP4] = {
+ .type_name = "nvfp4",
+ .blck_size = QK_NVFP4,
+ .type_size = sizeof(block_nvfp4),
+ .is_quantized = true,
+ .to_float = (ggml_to_float_t) dequantize_row_nvfp4,
+ .from_float_ref = (ggml_from_float_t)quantize_row_nvfp4_ref,
+ },
[GGML_TYPE_Q2_K] = {
.type_name = "q2_K",
.blck_size = QK_K,
case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
case GGML_FTYPE_MOSTLY_MXFP4: wtype = GGML_TYPE_MXFP4; break;
+ case GGML_FTYPE_MOSTLY_NVFP4: wtype = GGML_TYPE_NVFP4; break;
case GGML_FTYPE_MOSTLY_Q2_K: wtype = GGML_TYPE_Q2_K; break;
case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break;
case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_MXFP4: result = quantize_mxfp4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_NVFP4: result = quantize_nvfp4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;