GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S,
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S,
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S,
+ GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M,
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL,
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS,
GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,
+ GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,
+ GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32,
GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32,
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32,
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,
+ GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32,
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32,
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,
+ GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32,
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32,
GGML_METAL_KERNEL_TYPE_ROPE_F32,
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S, get_rows_iq3_s, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S, get_rows_iq2_s, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M, get_rows_iq1_m, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS, get_rows_iq4_xs, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, mul_mv_iq3_s_f32, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32, mul_mv_iq2_s_f32, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32, mul_mv_iq1_m_f32, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32, mul_mv_iq4_xs_f32, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, mul_mv_id_iq3_s_f32, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32, mul_mv_id_iq2_s_f32, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32, mul_mv_id_iq1_m_f32, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32, mul_mv_id_iq4_xs_f32, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, mul_mm_iq3_s_f32, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32, mul_mm_iq2_s_f32, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, mul_mm_iq1_m_f32, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, mul_mm_iq4_xs_f32, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, mul_mm_id_iq2_s_f32, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32, mul_mm_id_iq1_m_f32, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true);
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32 ].pipeline; break;
case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32 ].pipeline; break;
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break;
+ case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32 ].pipeline; break;
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break;
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
nth1 = 16;
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline;
} break;
+ case GGML_TYPE_IQ1_M:
+ {
+ nth0 = 4;
+ nth1 = 16;
+ pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32].pipeline;
+ } break;
case GGML_TYPE_IQ4_NL:
{
nth0 = 4;
[encoder setBytes:&r2 length:sizeof(r2) atIndex:17];
[encoder setBytes:&r3 length:sizeof(r3) atIndex:18];
- if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
- src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
- src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ2_S) {
+ if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 ||
+ src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K ||
+ src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32 ].pipeline; break;
case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32 ].pipeline; break;
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break;
+ case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32 ].pipeline; break;
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break;
default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
nth1 = 16;
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32].pipeline;
} break;
+ case GGML_TYPE_IQ1_M:
+ {
+ nth0 = 4;
+ nth1 = 16;
+ pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32].pipeline;
+ } break;
case GGML_TYPE_IQ4_NL:
{
nth0 = 4;
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
}
- if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 ||
- src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 ||
- src2t == GGML_TYPE_Q2_K || src2t == GGML_TYPE_IQ1_S || src2t == GGML_TYPE_IQ2_S) {
+ if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 || src2t == GGML_TYPE_Q5_0 ||
+ src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 || src2t == GGML_TYPE_Q2_K ||
+ src2t == GGML_TYPE_IQ1_S || src2t == GGML_TYPE_IQ1_M || src2t == GGML_TYPE_IQ2_S) {
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S ].pipeline; break;
case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S ].pipeline; break;
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break;
+ case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M ].pipeline; break;
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break;
case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS ].pipeline; break;
case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break;
}
}
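+// Quick recap of the IQ1_M block layout, inferred from the code below (the actual
+// struct is defined elsewhere; treat this as a sketch for QK_K = 256):
+//     uint8_t qs[QK_K/8];      // low 8 bits of the grid index for each group of 8 weights
+//     uint8_t qh[QK_K/16];     // per block of 16: 3+3 high index bits plus the delta-sign bits 0x08/0x80
+//     uint8_t scales[QK_K/32]; // viewed as 4 x uint16: four 3-bit sub-block scales each, with the
+//                              // fp16 super-block scale scattered over the four top nibbles (iq1m_scale_t)
+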
+void kernel_mul_mv_iq1_m_f32_impl(
+ device const void * src0,
+ device const float * src1,
+ device float * dst,
+ constant int64_t & ne00,
+ constant int64_t & ne01,
+ constant int64_t & ne02,
+ constant int64_t & ne10,
+ constant int64_t & ne12,
+ constant int64_t & ne0,
+ constant int64_t & ne1,
+ constant uint & r2,
+ constant uint & r3,
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint tiisg[[thread_index_in_simdgroup]],
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
+
+ const int nb = ne00/QK_K;
+ const int r0 = tgpig.x;
+ const int r1 = tgpig.y;
+ const int im = tgpig.z;
+
+ const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+ const int ib_row = first_row * nb;
+
+ const uint i12 = im%ne12;
+ const uint i13 = im/ne12;
+
+ const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+ device const block_iq1_m * x = (device const block_iq1_m *) src0 + ib_row + offset0;
+ device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
+
+ float yl[32];
+ float sumf[N_DST]={0.f}, all_sum;
+
+ const int nb32 = nb * (QK_K / 32);
+
+ const int ix = tiisg;
+
+ device const float * y4 = y + 32 * ix;
+
+ iq1m_scale_t scale;
+
+ for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
+
+ float4 sumy = {0.f};
+ for (int i = 0; i < 8; ++i) {
+ yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0];
+ yl[i+ 8] = y4[i+ 8]; sumy[1] += yl[i+ 8];
+ yl[i+16] = y4[i+16]; sumy[2] += yl[i+16];
+ yl[i+24] = y4[i+24]; sumy[3] += yl[i+24];
+ }
+
+ const int ibl = ib32 / (QK_K / 32);
+ const int ib = ib32 % (QK_K / 32);
+
+ device const block_iq1_m * xr = x + ibl;
+ device const uint8_t * qs = xr->qs + 4 * ib;
+ device const uint8_t * qh = xr->qh + 2 * ib;
+ device const uint16_t * sc = (device const uint16_t *)xr->scales;
+
+ for (int row = 0; row < N_DST; row++) {
+
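+ // reassemble the fp16 super-block scale: its 16 bits are stored four at a time
+ // in the top nibble of each of the four 16-bit scale words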
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+
+ constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
+ constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700)));
+ constant uint8_t * grid3 = (constant uint8_t *)(iq1s_grid_gpu + (qs[2] | ((qh[1] << 8) & 0x700)));
+ constant uint8_t * grid4 = (constant uint8_t *)(iq1s_grid_gpu + (qs[3] | ((qh[1] << 4) & 0x700)));
+
+ float2 sum = {0.f};
+ for (int j = 0; j < 4; ++j) {
+ sum[0] += yl[j+ 0] * (grid1[j] & 0xf) + yl[j+ 4] * (grid1[j] >> 4)
+ + yl[j+ 8] * (grid2[j] & 0xf) + yl[j+12] * (grid2[j] >> 4);
+ sum[1] += yl[j+16] * (grid3[j] & 0xf) + yl[j+20] * (grid3[j] >> 4)
+ + yl[j+24] * (grid4[j] & 0xf) + yl[j+28] * (grid4[j] >> 4);
+ }
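+ // the grid nibbles hold q+1, so the missing -1 and the ±IQ1M_DELTA shift are folded into a
+ // correction proportional to sumy; qh bits 0x08/0x80 give the delta sign of each group of 8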
+ const float delta1 = sumy[0] * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[1] * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
+ const float delta2 = sumy[2] * (qh[1] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[3] * (qh[1] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
+ sumf[row] += (float)scale.f16 * ((sum[0] + delta1) * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 7) + 1) +
+ (sum[1] + delta2) * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 7) + 1));
+
+ sc += nb*sizeof(block_iq1_m)/2;
+ qs += nb*sizeof(block_iq1_m);
+ qh += nb*sizeof(block_iq1_m);
+ }
+
+ y4 += 32 * 32;
+ }
+
+ for (int row = 0; row < N_DST; ++row) {
+ all_sum = simd_sum(sumf[row]);
+ if (tiisg == 0) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
+ }
+ }
+}
+
void kernel_mul_mv_iq4_nl_f32_impl(
device const void * src0,
device const float * src1,
kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
}
+[[host_name("kernel_mul_mv_iq1_m_f32")]]
+kernel void kernel_mul_mv_iq1_m_f32(
+ device const void * src0,
+ device const float * src1,
+ device float * dst,
+ constant int64_t & ne00,
+ constant int64_t & ne01,
+ constant int64_t & ne02,
+ constant uint64_t & nb00,
+ constant uint64_t & nb01,
+ constant uint64_t & nb02,
+ constant int64_t & ne10,
+ constant int64_t & ne11,
+ constant int64_t & ne12,
+ constant uint64_t & nb10,
+ constant uint64_t & nb11,
+ constant uint64_t & nb12,
+ constant int64_t & ne0,
+ constant int64_t & ne1,
+ constant uint & r2,
+ constant uint & r3,
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint tiisg[[thread_index_in_simdgroup]],
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
+
+ kernel_mul_mv_iq1_m_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
+}
+
[[host_name("kernel_mul_mv_iq4_nl_f32")]]
kernel void kernel_mul_mv_iq4_nl_f32(
device const void * src0,
}
}
+template <typename type4x4>
+void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 & reg) {
+ // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+ const int ib32 = il/2;
+ il = il%2;
+ iq1m_scale_t scale;
+ device const uint16_t * sc = (device const uint16_t *)xb->scales;
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+ const float d = scale.f16;
+ device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
+ device const uint8_t * qh = xb->qh + 2*ib32 + il;
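+ // 3-bit sub-block scale, mapped to the odd multipliers 1,3,...,15 via 2*s+1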
+ const float dl = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1);
+ const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
+ const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
+ constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
+ constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700)));
+ for (int i = 0; i < 4; ++i) {
+ reg[0][i] = dl * (grid1[i] & 0xf) + ml1;
+ reg[1][i] = dl * (grid1[i] >> 4) + ml1;
+ reg[2][i] = dl * (grid2[i] & 0xf) + ml2;
+ reg[3][i] = dl * (grid2[i] >> 4) + ml2;
+ }
+}
+
template <typename type4x4>
void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) {
device const uint16_t * q4 = (device const uint16_t *)xb->qs;
template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_t kernel_get_rows<block_iq3_s, QK_NL, dequantize_iq3_s>;
template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_t kernel_get_rows<block_iq2_s, QK_NL, dequantize_iq2_s>;
template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
+template [[host_name("kernel_get_rows_iq1_m")]] kernel get_rows_t kernel_get_rows<block_iq1_m, QK_NL, dequantize_iq1_m>;
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
#if QK_K == 64
template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows<block_iq4_xs, 2, dequantize_iq4_xs>;
template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_s, QK_NL, dequantize_iq3_s>;
template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_s, QK_NL, dequantize_iq2_s>;
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
+template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_m, QK_NL, dequantize_iq1_m>;
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
#if QK_K == 64
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_xs>;
template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_s, QK_NL, dequantize_iq3_s>;
template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_s, QK_NL, dequantize_iq2_s>;
template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
+template [[host_name("kernel_mul_mm_id_iq1_m_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_m, QK_NL, dequantize_iq1_m>;
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
#if QK_K == 64
template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs, 2, dequantize_iq4_xs>;
sgitg);
}
+[[host_name("kernel_mul_mv_id_iq1_m_f32")]]
+kernel void kernel_mul_mv_id_iq1_m_f32(
+ device const char * ids,
+ device const char * src1,
+ device float * dst,
+ constant uint64_t & nbi1,
+ constant int64_t & ne00,
+ constant int64_t & ne01,
+ constant int64_t & ne02,
+ constant uint64_t & nb00,
+ constant uint64_t & nb01,
+ constant uint64_t & nb02,
+ constant int64_t & ne10,
+ constant int64_t & ne11,
+ constant int64_t & ne12,
+ constant int64_t & ne13,
+ constant uint64_t & nb10,
+ constant uint64_t & nb11,
+ constant uint64_t & nb12,
+ constant int64_t & ne0,
+ constant int64_t & ne1,
+ constant uint64_t & nb1,
+ constant uint & r2,
+ constant uint & r3,
+ constant int & idx,
+ device const char * src00,
+ device const char * src01,
+ device const char * src02,
+ device const char * src03,
+ device const char * src04,
+ device const char * src05,
+ device const char * src06,
+ device const char * src07,
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint tiitg[[thread_index_in_threadgroup]],
+ uint tiisg[[thread_index_in_simdgroup]],
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
+ device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
+
+ const int64_t bid = tgpig.z/(ne12*ne13);
+
+ tgpig.z = tgpig.z%(ne12*ne13);
+
+ const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
+
+ kernel_mul_mv_iq1_m_f32_impl(
+ src0[id],
+ (device const float *) (src1 + bid*nb11),
+ dst + bid*ne0,
+ ne00,
+ ne01,
+ ne02,
+ ne10,
+ ne12,
+ ne0,
+ ne1,
+ r2,
+ r3,
+ tgpig,
+ tiisg,
+ sgitg);
+}
+
[[host_name("kernel_mul_mv_id_iq4_nl_f32")]]
kernel void kernel_mul_mv_id_iq4_nl_f32(
device const char * ids,
}
}
+void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int k) {
+ assert(k % QK_K == 0);
+ const int nb = k / QK_K;
+
+ float delta[4];
+ uint16_t idx[4];
+
+ iq1m_scale_t scale;
+
+ for (int i = 0; i < nb; i++) {
+
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+ const float d = GGML_FP16_TO_FP32(scale.f16);
+ const uint8_t * qs = x[i].qs;
+ const uint8_t * qh = x[i].qh;
+
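+ // 8 sub-blocks of 32 weights per super-block; dl1/dl2 are the 3-bit scales of the two
+ // halves of 16, mapped to 2*s+1, and the delta sign comes from qh bits 0x08/0x80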
+ for (int ib = 0; ib < QK_K/32; ++ib) {
+ const float dl1 = d * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1);
+ const float dl2 = d * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1);
+ idx[0] = qs[0] | ((qh[0] << 8) & 0x700);
+ idx[1] = qs[1] | ((qh[0] << 4) & 0x700);
+ idx[2] = qs[2] | ((qh[1] << 8) & 0x700);
+ idx[3] = qs[3] | ((qh[1] << 4) & 0x700);
+ delta[0] = qh[0] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA;
+ delta[1] = qh[0] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA;
+ delta[2] = qh[1] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA;
+ delta[3] = qh[1] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA;
+ for (int l = 0; l < 2; ++l) {
+ const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
+ for (int j = 0; j < 8; ++j) {
+ y[j] = dl1 * (grid[j] + delta[l]);
+ }
+ y += 8;
+ }
+ for (int l = 2; l < 4; ++l) {
+ const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
+ for (int j = 0; j < 8; ++j) {
+ y[j] = dl2 * (grid[j] + delta[l]);
+ }
+ y += 8;
+ }
+ qs += 4;
+ qh += 2;
+ }
+ }
+}
+
static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) {
#endif
}
+void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+ assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+
+ const block_iq1_m * restrict x = vx;
+ const block_q8_K * restrict y = vy;
+
+ const int nb = n / QK_K;
+
+ iq1m_scale_t scale;
+
+#if defined __ARM_NEON
+
+ const int32x4_t mask = vdupq_n_s32(0x7);
+ const int32x4_t mone = vdupq_n_s32(1);
+ const int32x4_t mzero = vdupq_n_s32(0);
+
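+ // deltas.val[k] holds the ±1 delta signs of the two groups of 8 in a block of 16 for the
+ // four possible (qh & 0x08, qh & 0x80) combinations; aux32 below packs these two bits into
+ // each byte so aux8[..] can index the table directly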
+ ggml_int8x16x4_t deltas;
+ deltas.val[0] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(+1));
+ deltas.val[1] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(+1));
+ deltas.val[2] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(-1));
+ deltas.val[3] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(-1));
+
+ ggml_int8x16x4_t q1b;
+ ggml_int8x16x4_t q8b;
+
+ uint32_t aux32;
+ const uint8_t * aux8 = (const uint8_t *)&aux32;
+
+ float sumf = 0;
+ for (int i = 0; i < nb; ++i) {
+
+ const int8_t * q8 = y[i].qs;
+ const uint8_t * qs = x[i].qs;
+ const uint8_t * qh = x[i].qh;
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+
+ int32x4_t sumi1 = mzero;
+ int32x4_t sumi2 = mzero;
+
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
+
+ q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[0] << 8) & 0x700)))),
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[0] << 4) & 0x700)))));
+ q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[1] << 8) & 0x700)))),
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[1] << 4) & 0x700)))));
+ q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[2] << 8) & 0x700)))),
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[2] << 4) & 0x700)))));
+ q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[3] << 8) & 0x700)))),
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[3] << 4) & 0x700)))));
+
+ q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+ const int32x4_t p1 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(mzero, q1b.val[1], q8b.val[1]));
+ const int32x4_t p2 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(mzero, q1b.val[3], q8b.val[3]));
+ const int32x4_t p12 = vpaddq_s32(p1, p2);
+
+ const uint32_t * qh32 = (const uint32_t *)qh; // we are 4-byte aligned, so we can do that
+ aux32 = ((qh32[0] >> 3) & 0x01010101) | ((qh32[0] >> 6) & 0x02020202);
+
+ const int32x4_t p3 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[0]], q8b.val[0]), ggml_vdotq_s32(mzero, deltas.val[aux8[1]], q8b.val[1]));
+ const int32x4_t p4 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[2]], q8b.val[2]), ggml_vdotq_s32(mzero, deltas.val[aux8[3]], q8b.val[3]));
+ const int32x4_t p34 = vpaddq_s32(p3, p4);
+
+ int32x4_t scales_4 = ggml_vld1q_u32(sc[ib/2] >> 0, sc[ib/2] >> 3, sc[ib/2] >> 6, sc[ib/2] >> 9);
+ scales_4 = vaddq_s32(vshlq_n_s32(vandq_s32(scales_4, mask), 1), mone);
+
+ sumi1 = vmlaq_s32(sumi1, scales_4, p12);
+ sumi2 = vmlaq_s32(sumi2, scales_4, p34);
+
+ qs += 8; qh += 4;
+
+ }
+
+ sumf += y[i].d * GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
+ }
+
+ *s = sumf;
+
+#elif defined __AVX2__
+
+ const __m256i mask = _mm256_set1_epi16(0x7);
+ const __m256i mone = _mm256_set1_epi16(1);
+
+ __m256 accum1 = _mm256_setzero_ps();
+ __m256 accum2 = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+
+ const int8_t * q8 = y[i].qs;
+ const uint8_t * qs = x[i].qs;
+ const uint8_t * qh = x[i].qh;
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+
+ __m256i sumi1 = _mm256_setzero_si256();
+ __m256i sumi2 = _mm256_setzero_si256();
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
+ const __m256i q1b_1 = _mm256_set_epi64x(
+ iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
+ iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
+ );
+ const __m256i q1b_2 = _mm256_set_epi64x(
+ iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
+ iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
+ );
+ const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+ const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+
+ const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
+ const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
+
+ const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+ const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+
+ const __m256i dot3 = mul_add_epi8(delta1, q8b_1);
+ const __m256i dot4 = mul_add_epi8(delta2, q8b_2);
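+ // each 16-bit scale word packs four 3-bit sub-block scales (the top 4 bits belong to the
+ // fp16 super-block scale); two of them are broadcast, one per 128-bit half, and mapped to 2*s+1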
+ __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0));
+ __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6));
+ scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone);
+ scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone);
+ const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
+ const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
+ const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
+ const __m256i p4 = _mm256_madd_epi16(dot4, scale2);
+
+ sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2));
+ sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4));
+
+ qs += 8; qh += 4;
+ }
+
+ const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
+ accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1);
+ accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2);
+
+ }
+
+ *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
+
+#else
+
+ int sum1[2], sum2[2], delta[4];
+
+ float sumf = 0;
+ for (int i = 0; i < nb; i++) {
+
+ const int8_t * q8 = y[i].qs;
+ const uint8_t * qs = x[i].qs;
+ const uint8_t * qh = x[i].qh;
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+
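+ // scalar fallback: sumi1 accumulates the grid dot products and sumi2 the delta-signed q8
+ // sums, both weighted by the 2*s+1 sub-block scales; combined below as sumi1 + IQ1M_DELTA*sumi2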
+ int sumi1 = 0, sumi2 = 0;
+ for (int ib = 0; ib < QK_K/32; ++ib) {
+ delta[0] = qh[0] & 0x08 ? -1 : 1;
+ delta[1] = qh[0] & 0x80 ? -1 : 1;
+ delta[2] = qh[1] & 0x08 ? -1 : 1;
+ delta[3] = qh[1] & 0x80 ? -1 : 1;
+ sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
+ for (int l = 0; l < 4; ++l) {
+ const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
+ int lsum1 = 0, lsum2 = 0;
+ for (int j = 0; j < 8; ++j) {
+ lsum1 += q8[j] * grid[j];
+ lsum2 += q8[j];
+ }
+ q8 += 8;
+ sum1[l/2] += lsum1;
+ sum2[l/2] += lsum2*delta[l];
+ }
+ const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
+ const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
+ sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
+ sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
+ qs += 4;
+ qh += 2;
+ }
+
+ sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
+ }
+
+ *s = sumf;
+
+#endif
+}
+
void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
assert(nrc == 1);
UNUSED(nrc);
};
static inline int iq2_data_index(enum ggml_type type) {
- GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
+ GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
return type == GGML_TYPE_IQ2_XXS ? 0 :
type == GGML_TYPE_IQ2_XS ? 1 :
- type == GGML_TYPE_IQ1_S ? 2 : 3;
+ type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 2 : 3;
}
static inline int iq2_grid_size(enum ggml_type type) {
- GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
+ GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
return type == GGML_TYPE_IQ2_XXS ? 256 :
type == GGML_TYPE_IQ2_XS ? 512 :
- type == GGML_TYPE_IQ1_S ? NGRID_IQ1S : 1024;
+ type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? NGRID_IQ1S : 1024;
}
static int iq2_compare_func(const void * left, const void * right) {
const int kmap_size = 43692;
//const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
- const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
+ const int nwant = type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 :
- type == GGML_TYPE_IQ1_S ? kgrid_1bit_2048 : kgrid_2bit_1024;
+ type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? kgrid_1bit_2048 : kgrid_2bit_1024;
uint64_t * kgrid_q2xs;
int * kmap_q2xs;
uint16_t * kneighbors_q2xs;
}
void iq2xs_free_impl(enum ggml_type type) {
- GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
+ GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
const int gindex = iq2_data_index(type);
if (iq2_data[gindex].grid) {
free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
}
#define IQ1S_BLOCK_SIZE 32
-static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
+#define IQ1M_BLOCK_SIZE 16
+static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
+ float * scales,
+ float * weight,
+ float * sumx,
+ float * sumw,
+ float * pairs,
+ int8_t * L,
+ uint16_t * index,
+ int8_t * shifts) {
const int gindex = iq2_data_index(GGML_TYPE_IQ1_S);
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
GGML_ASSERT(n%QK_K == 0);
+ block_iq1_s * y = vy;
+
const int nbl = n/QK_K;
- block_iq1_s * y = vy;
+ const int block_size = IQ1S_BLOCK_SIZE;
const float x_p[3] = {-1 + IQ1S_DELTA, IQ1S_DELTA, 1 + IQ1S_DELTA};
const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA};
- float scales[QK_K/IQ1S_BLOCK_SIZE];
- float weight[IQ1S_BLOCK_SIZE];
- int8_t L[IQ1S_BLOCK_SIZE];
- float sumx[IQ1S_BLOCK_SIZE+1];
- float sumw[IQ1S_BLOCK_SIZE+1];
- float pairs[2*IQ1S_BLOCK_SIZE];
+
int * idx = (int *)(pairs + 1);
- uint16_t index[IQ1S_BLOCK_SIZE/8];
- int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
for (int ibl = 0; ibl < nbl; ++ibl) {
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
float sigma2 = 2*sumx2/QK_K;
- for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) {
- const float * xb = xbl + IQ1S_BLOCK_SIZE*ib;
- const float * qw = quant_weights + QK_K*ibl + IQ1S_BLOCK_SIZE*ib;
- for (int i = 0; i < IQ1S_BLOCK_SIZE; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+ for (int ib = 0; ib < QK_K/block_size; ++ib) {
+ const float * xb = xbl + block_size*ib;
+ const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+ for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
float max = fabsf(xb[0]);
- for (int i = 1; i < IQ1S_BLOCK_SIZE; ++i) max = MAX(max, fabsf(xb[i]));
+ for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
if (!max) {
scales[ib] = 0;
- memset(L, 1, IQ1S_BLOCK_SIZE);
+ memset(L, 1, block_size);
continue;
}
// Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
// in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
// Wi = sum[weight[j], j = 0...i], and use these to quickly get the optimum scale
// and score for each possible split.
- for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) {
+ for (int j = 0; j < block_size; ++j) {
pairs[2*j] = xb[j];
idx[2*j] = j;
}
- qsort(pairs, IQ1S_BLOCK_SIZE, 2*sizeof(float), iq1_sort_helper);
+ qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
{
sumx[0] = sumw[0] = 0;
- for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) {
+ for (int j = 0; j < block_size; ++j) {
int i = idx[2*j];
sumx[j+1] = sumx[j] + weight[i]*xb[i];
sumw[j+1] = sumw[j] + weight[i];
}
float best_score = 0, scale = max;
int besti1 = -1, besti2 = -1, best_shift = 0;
- for (int i1 = 0; i1 <= IQ1S_BLOCK_SIZE; ++i1) {
- for (int i2 = i1; i2 <= IQ1S_BLOCK_SIZE; ++i2) {
- float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_p[2];
- float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_p[2]*x_p[2];
+ for (int i1 = 0; i1 <= block_size; ++i1) {
+ for (int i2 = i1; i2 <= block_size; ++i2) {
+ float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2];
+ float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2];
if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
scale = sumqx/sumq2; best_score = scale*sumqx;
besti1 = i1; besti2 = i2; best_shift = 1;
}
- sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_m[2];
- sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_m[2]*x_m[2];
+ sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2];
+ sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2];
if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
scale = sumqx/sumq2; best_score = scale*sumqx;
besti1 = i1; besti2 = i2; best_shift = -1;
GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0);
for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
- for (int j = besti2; j < IQ1S_BLOCK_SIZE; ++j) L[idx[2*j]] = 2;
+ for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
if (scale < 0) {
- for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) L[j] = 2 - L[j];
+ for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
scale = -scale; best_shift = -best_shift;
}
bool all_on_grid = true;
const float * xx = best_shift == 1 ? x_p : x_m;
- for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
+ for (int k = 0; k < block_size/8; ++k) {
uint16_t u = 0;
for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
int grid_index = kmap_q2xs[u];
}
if (!all_on_grid) {
float sumqx = 0, sumq2 = 0;
- for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
+ for (int k = 0; k < block_size/8; ++k) {
const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
for (int j = 0; j < 8; ++j) {
float w = weight[8*k + j];
if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2;
}
uint16_t h = 0;
- for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
- y[ibl].qs[(IQ1S_BLOCK_SIZE/8)*ib + k] = index[k] & 255;
+ for (int k = 0; k < block_size/8; ++k) {
+ y[ibl].qs[(block_size/8)*ib + k] = index[k] & 255;
h |= (index[k] >> 8) << 3*k;
}
y[ibl].qh[ib] = h;
}
if (!max_scale) {
- memset(y[ibl].qs, 0, QK_K/8);
continue;
}
float d = max_scale/15;
- y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.085f is another fudge factor. Don't ask me why it is needed.
+ y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed.
float id = 1/d;
- for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) {
+ for (int ib = 0; ib < QK_K/block_size; ++ib) {
int l = nearest_int(0.5f*(id*scales[ib]-1));
l = MAX(0, MIN(7, l));
if (shifts[ib] == -1) l |= 8;
size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
GGML_ASSERT(n_per_row%QK_K == 0);
+ float scales[QK_K/IQ1S_BLOCK_SIZE];
+ float weight[IQ1S_BLOCK_SIZE];
+ int8_t L[IQ1S_BLOCK_SIZE];
+ float sumx[IQ1S_BLOCK_SIZE+1];
+ float sumw[IQ1S_BLOCK_SIZE+1];
+ float pairs[2*IQ1S_BLOCK_SIZE];
+ uint16_t index[IQ1S_BLOCK_SIZE/8];
+ int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
int nblock = n_per_row/QK_K;
char * qrow = (char *)dst;
for (int row = 0; row < nrow; ++row) {
- quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights);
+ quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights, scales, weight, sumx, sumw, pairs, L, index, shifts);
src += n_per_row;
qrow += nblock*sizeof(block_iq1_s);
}
return nrow * nblock * sizeof(block_iq1_s);
}
+static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
+ float * scales,
+ float * weight,
+ float * pairs,
+ int8_t * L,
+ uint16_t * index,
+ int8_t * shifts) {
+
+ const int gindex = iq2_data_index(GGML_TYPE_IQ1_M);
+
+ const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
+ const int * kmap_q2xs = iq2_data[gindex].map;
+ const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
+
+ //GGML_ASSERT(quant_weights && "missing quantization weights");
+ GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
+ GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
+ GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
+ GGML_ASSERT(n%QK_K == 0);
+
+ block_iq1_m * y = vy;
+
+ const int nbl = n/QK_K;
+
+ const int block_size = IQ1M_BLOCK_SIZE;
+
+ const float x_p[3] = {-1 + IQ1M_DELTA, IQ1M_DELTA, 1 + IQ1M_DELTA};
+ const float x_m[3] = {-1 - IQ1M_DELTA, -IQ1M_DELTA, 1 - IQ1M_DELTA};
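+ // masks[k] records the chosen delta-sign combination in qh bits 0x08/0x80
+ // (see the k = 0..3 cases listed further down)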
+ const uint8_t masks[4] = {0x00, 0x80, 0x08, 0x88};
+
+ int * idx = (int *)(pairs + 1);
+
+ float sumqx[4], sumq2[4];
+
+ iq1m_scale_t s;
+ const float * xx;
+
+ for (int ibl = 0; ibl < nbl; ++ibl) {
+
+ //y[ibl].d = GGML_FP32_TO_FP16(0.f);
+ memset(y[ibl].qs, 0, QK_K/8);
+ memset(y[ibl].qh, 0, QK_K/16);
+ memset(y[ibl].scales, 0, QK_K/32);
+
+ float max_scale = 0;
+
+ const float * xbl = x + QK_K*ibl;
+ float sumx2 = 0;
+ for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
+ float sigma2 = 2*sumx2/QK_K;
+
+ for (int ib = 0; ib < QK_K/block_size; ++ib) {
+ const float * xb = xbl + block_size*ib;
+ if (quant_weights) {
+ const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+ for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+ } else {
+ for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
+ }
+ float max = fabsf(xb[0]);
+ for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
+ if (!max) {
+ scales[ib] = 0;
+ memset(L, 1, block_size);
+ continue;
+ }
+ // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
+ // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
+ // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
+ // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
+ // Wi = sum[weight[j], j = 0...i], and use these to quickly get the optimum scale
+ // and score for each possible split.
+ for (int j = 0; j < block_size; ++j) {
+ pairs[2*j] = xb[j];
+ idx[2*j] = j;
+ }
+ qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
+ float best_score = 0, scale = max;
+ int besti1 = -1, besti2 = -1, best_k = -1;
+ // 0: +, +
+ // 1: +, -
+ // 2: -, +
+ // 3: -, -
+ for (int i1 = 0; i1 <= block_size; ++i1) {
+ for (int i2 = i1; i2 <= block_size; ++i2) {
+ memset(sumqx, 0, 4*sizeof(float));
+ memset(sumq2, 0, 4*sizeof(float));
+ for (int j = 0; j < i1; ++j) {
+ int i = idx[2*j];
+ if (i < block_size/2) {
+ sumqx[0] += weight[i]*x_p[0]*xb[i];
+ sumqx[1] += weight[i]*x_p[0]*xb[i];
+ sumqx[2] += weight[i]*x_m[0]*xb[i];
+ sumqx[3] += weight[i]*x_m[0]*xb[i];
+ sumq2[0] += weight[i]*x_p[0]*x_p[0];
+ sumq2[1] += weight[i]*x_p[0]*x_p[0];
+ sumq2[2] += weight[i]*x_m[0]*x_m[0];
+ sumq2[3] += weight[i]*x_m[0]*x_m[0];
+ } else {
+ sumqx[0] += weight[i]*x_p[0]*xb[i];
+ sumqx[2] += weight[i]*x_p[0]*xb[i];
+ sumqx[1] += weight[i]*x_m[0]*xb[i];
+ sumqx[3] += weight[i]*x_m[0]*xb[i];
+ sumq2[0] += weight[i]*x_p[0]*x_p[0];
+ sumq2[2] += weight[i]*x_p[0]*x_p[0];
+ sumq2[1] += weight[i]*x_m[0]*x_m[0];
+ sumq2[3] += weight[i]*x_m[0]*x_m[0];
+ }
+ }
+ for (int j = i1; j < i2; ++j) {
+ int i = idx[2*j];
+ if (i < block_size/2) {
+ sumqx[0] += weight[i]*x_p[1]*xb[i];
+ sumqx[1] += weight[i]*x_p[1]*xb[i];
+ sumqx[2] += weight[i]*x_m[1]*xb[i];
+ sumqx[3] += weight[i]*x_m[1]*xb[i];
+ sumq2[0] += weight[i]*x_p[1]*x_p[1];
+ sumq2[1] += weight[i]*x_p[1]*x_p[1];
+ sumq2[2] += weight[i]*x_m[1]*x_m[1];
+ sumq2[3] += weight[i]*x_m[1]*x_m[1];
+ } else {
+ sumqx[0] += weight[i]*x_p[1]*xb[i];
+ sumqx[2] += weight[i]*x_p[1]*xb[i];
+ sumqx[1] += weight[i]*x_m[1]*xb[i];
+ sumqx[3] += weight[i]*x_m[1]*xb[i];
+ sumq2[0] += weight[i]*x_p[1]*x_p[1];
+ sumq2[2] += weight[i]*x_p[1]*x_p[1];
+ sumq2[1] += weight[i]*x_m[1]*x_m[1];
+ sumq2[3] += weight[i]*x_m[1]*x_m[1];
+ }
+ }
+ for (int j = i2; j < block_size; ++j) {
+ int i = idx[2*j];
+ if (i < block_size/2) {
+ sumqx[0] += weight[i]*x_p[2]*xb[i];
+ sumqx[1] += weight[i]*x_p[2]*xb[i];
+ sumqx[2] += weight[i]*x_m[2]*xb[i];
+ sumqx[3] += weight[i]*x_m[2]*xb[i];
+ sumq2[0] += weight[i]*x_p[2]*x_p[2];
+ sumq2[1] += weight[i]*x_p[2]*x_p[2];
+ sumq2[2] += weight[i]*x_m[2]*x_m[2];
+ sumq2[3] += weight[i]*x_m[2]*x_m[2];
+ } else {
+ sumqx[0] += weight[i]*x_p[2]*xb[i];
+ sumqx[2] += weight[i]*x_p[2]*xb[i];
+ sumqx[1] += weight[i]*x_m[2]*xb[i];
+ sumqx[3] += weight[i]*x_m[2]*xb[i];
+ sumq2[0] += weight[i]*x_p[2]*x_p[2];
+ sumq2[2] += weight[i]*x_p[2]*x_p[2];
+ sumq2[1] += weight[i]*x_m[2]*x_m[2];
+ sumq2[3] += weight[i]*x_m[2]*x_m[2];
+ }
+ }
+ for (int k = 0; k < 4; ++k) {
+ if (sumq2[k] > 0 && sumqx[k]*sumqx[k] > best_score*sumq2[k]) {
+ scale = sumqx[k]/sumq2[k]; best_score = scale*sumqx[k];
+ besti1 = i1; besti2 = i2; best_k = k;
+ }
+ }
+ }
+ }
+ GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_k >= 0);
+ for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
+ for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
+ for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
+ if (scale < 0) {
+ for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
+ scale = -scale;
+ best_k = best_k == 0 ? 3 : best_k == 1 ? 2 : best_k == 2 ? 1 : 0;
+ }
+ bool all_on_grid = true;
+ for (int k = 0; k < block_size/8; ++k) {
+ if (k == 0) xx = best_k < 2 ? x_p : x_m;
+ else xx = best_k%2 == 0 ? x_p : x_m;
+ uint16_t u = 0;
+ for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
+ int grid_index = kmap_q2xs[u];
+ if (grid_index < 0) {
+ all_on_grid = false;
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+ grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
+ GGML_ASSERT(grid_index >= 0);
+ }
+ index[k] = grid_index;
+ }
+ if (!all_on_grid) {
+ float sumqx_f = 0, sumq2_f = 0;
+ for (int k = 0; k < block_size/8; ++k) {
+ if (k == 0) xx = best_k < 2 ? x_p : x_m;
+ else xx = best_k%2 == 0 ? x_p : x_m;
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
+ for (int j = 0; j < 8; ++j) {
+ float w = weight[8*k + j];
+ float q = xx[(pg[j] - 1)/2];
+ sumqx_f += w*q*xb[8*k+j];
+ sumq2_f += w*q*q;
+ }
+ }
+ if (sumqx_f > 0 && sumq2_f > 0) scale = sumqx_f/sumq2_f;
+ }
+ y[ibl].qs[2*ib + 0] = index[0] & 255;
+ y[ibl].qs[2*ib + 1] = index[1] & 255;
+ y[ibl].qh[ib] = (index[0] >> 8) | ((index[1] >> 8) << 4);
+ GGML_ASSERT(scale >= 0);
+ scales[ib] = scale;
+ shifts[ib] = best_k;
+ max_scale = MAX(max_scale, scale);
+ }
+
+ if (!max_scale) {
+ continue;
+ }
+
+ uint16_t * sc = (uint16_t *)y[ibl].scales;
+ float d = max_scale/15;
+ float id = 1/d;
+ float sumqx_f = 0, sumq2_f = 0;
+ for (int ib = 0; ib < QK_K/block_size; ++ib) {
+ int l = nearest_int(0.5f*(id*scales[ib+0]-1));
+ l = MAX(0, MIN(7, l));
+ sc[ib/4] |= (l << 3*(ib%4));
+ y[ibl].qh[ib] |= masks[shifts[ib]];
+ const float * xb = xbl + block_size*ib;
+ if (quant_weights) {
+ const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+ for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+ } else {
+ for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
+ }
+ for (int k = 0; k < block_size/8; ++k) {
+ if (k == 0) xx = shifts[ib] < 2 ? x_p : x_m;
+ else xx = shifts[ib]%2 == 0 ? x_p : x_m;
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + y[ibl].qs[2*ib+k] + ((y[ibl].qh[ib] << (8 - 4*k)) & 0x700));
+ for (int j = 0; j < 8; ++j) {
+ float w = weight[8*k + j];
+ float q = xx[(pg[j] - 1)/2]*(2*l+1);
+ sumqx_f += w*q*xb[8*k+j];
+ sumq2_f += w*q*q;
+ }
+ }
+ }
+ if (sumq2_f > 0) d = sumqx_f/sumq2_f;
+ s.f16 = GGML_FP32_TO_FP16(d*1.1125f); // 1.1125f is another fudge factor. Don't ask me why it is needed.
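+ // scatter the 16 bits of the fp16 super-block scale over the top nibbles of the four
+ // 16-bit scale words; decoders reassemble it as (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ...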
+ sc[0] |= ((s.u16 & 0x000f) << 12);
+ sc[1] |= ((s.u16 & 0x00f0) << 8);
+ sc[2] |= ((s.u16 & 0x0f00) << 4);
+ sc[3] |= ((s.u16 & 0xf000) << 0);
+ }
+}
+
+size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+ GGML_ASSERT(n_per_row%QK_K == 0);
+ float scales[QK_K/IQ1M_BLOCK_SIZE];
+ float weight[IQ1M_BLOCK_SIZE];
+ int8_t L[IQ1M_BLOCK_SIZE];
+ float pairs[2*IQ1M_BLOCK_SIZE];
+ uint16_t index[IQ1M_BLOCK_SIZE/8];
+ int8_t shifts[QK_K/IQ1M_BLOCK_SIZE];
+ int nblock = n_per_row/QK_K;
+ char * qrow = (char *)dst;
+ for (int row = 0; row < nrow; ++row) {
+ quantize_row_iq1_m_impl(src, qrow, n_per_row, quant_weights, scales, weight, pairs, L, index, shifts);
+ src += n_per_row;
+ qrow += nblock*sizeof(block_iq1_m);
+ }
+ return nrow * nblock * sizeof(block_iq1_m);
+}
+
// ============================ 4-bit non-linear quants
static inline int best_index_int8(int n, const int8_t * val, float x) {