#if defined(__VXE__) || defined(__VXE2__)
for (int i = 0; i < nb; i++) {
- __vector float srcv [8];
- __vector float asrcv[8];
- __vector float amaxv[8];
+ float32x4_t srcv [8];
+ float32x4_t asrcv[8];
+ float32x4_t amaxv[8];
for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
y[i].d = GGML_CPU_FP32_TO_FP16(d);
for (int j = 0; j < 8; j++) {
- const __vector float v = vec_mul(srcv[j], vec_splats(id));
- const __vector int32_t vi = vec_signed(v);
+ const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
+ const int32x4_t vi = vec_signed(v);
y[i].qs[4*j + 0] = vec_extract(vi, 0);
y[i].qs[4*j + 1] = vec_extract(vi, 1);
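/* Hedged sketch: the float32x4_t / int32x4_t spellings introduced by the
 * (+) lines are NEON-style aliases over the native s390x __vector types.
 * The aliases are assumed to come from a shared CPU header; their exact
 * home is not shown in this diff. */
typedef __vector float    float32x4_t;
typedef __vector int32_t  int32x4_t;
typedef __vector int16_t  int16x8_t;
typedef __vector int8_t   int8x16_t;
typedef __vector uint8_t  uint8x16_t;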
#if defined(__VXE__) || defined(__VXE2__)
for (int i = 0; i < nb; i++) {
- __vector float srcv [8];
- __vector float asrcv[8];
- __vector float amaxv[8];
+ float32x4_t srcv [8];
+ float32x4_t asrcv[8];
+ float32x4_t amaxv[8];
for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
y[i].d = GGML_CPU_FP32_TO_FP16(d);
- __vector int32_t acc = vec_splats(0);
+ int32x4_t acc = vec_splats(0);
for (int j = 0; j < 8; j++) {
- const __vector float v = vec_mul(srcv[j], vec_splats(id));
- const __vector int32_t vi = vec_signed(v);
+ const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
+ const int32x4_t vi = vec_signed(v);
y[i].qs[4*j + 0] = vec_extract(vi, 0);
y[i].qs[4*j + 1] = vec_extract(vi, 1);
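/* Between the vec_abs loop and the y[i].d store, the elided context reduces
 * asrcv to a per-block absolute max and derives d/id from it. A hedged
 * sketch of that derivation, mirroring ggml's generic Q8 quantization
 * (the exact elided lines are an assumption):
 *
 *   for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
 *   for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
 *   amaxv[0] = vec_max(amaxv[0], amaxv[4]);
 *   const float amax = MAX(MAX(vec_extract(amaxv[0], 0), vec_extract(amaxv[0], 1)),
 *                          MAX(vec_extract(amaxv[0], 2), vec_extract(amaxv[0], 3)));
 *
 *   const float d  = amax / ((1 << 7) - 1);  // map block max to int8 range
 *   const float id = d ? 1.0f / d : 0.0f;    // inverse scale, guarded against 0
 */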
float sumf = 0;
#if defined(__VXE__) || defined(__VXE2__)
- __vector float acc = vec_splats(0.0f);
+ float32x4_t acc = vec_splats(0.0f);
- const __vector uint8_t v_m = vec_splats((const uint8_t)0x0F);
- const __vector int8_t v_s = vec_splats( (const int8_t)0x08);
+ const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
+ const int8x16_t v_s = vec_splats( (const int8_t)0x08);
for (; ib < nb; ++ib) {
- const __vector uint8_t v_x = vec_xl(0, x[ib].qs);
- const __vector int8_t v_xl = (const __vector int8_t)(v_x & v_m);
- const __vector int8_t v_xh = (const __vector int8_t)(v_x >> 4);
+ const uint8x16_t v_x = vec_xl(0, x[ib].qs);
+ const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
+ const int8x16_t v_xh = (const int8x16_t)(v_x >> 4);
- const __vector int8_t v_xls = vec_sub(v_xl, v_s);
- const __vector int8_t v_xhs = vec_sub(v_xh, v_s);
+ const int8x16_t v_xls = vec_sub(v_xl, v_s);
+ const int8x16_t v_xhs = vec_sub(v_xh, v_s);
- const __vector int8_t v_yl = vec_xl(0 , y[ib].qs);
- const __vector int8_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
+ const int8x16_t v_yl = vec_xl(0 , y[ib].qs);
+ const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
- const __vector int16_t v_xylso = vec_mulo(v_xls, v_yl);
- const __vector int16_t v_xylse = vec_mule(v_xls, v_yl);
- const __vector int16_t v_xyhso = vec_mulo(v_xhs, v_yh);
- const __vector int16_t v_xyhse = vec_mule(v_xhs, v_yh);
+ const int16x8_t v_xylso = vec_mulo(v_xls, v_yl);
+ const int16x8_t v_xylse = vec_mule(v_xls, v_yl);
+ const int16x8_t v_xyhso = vec_mulo(v_xhs, v_yh);
+ const int16x8_t v_xyhse = vec_mule(v_xhs, v_yh);
- __vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
+ int16x8_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; // sum the odd/even partial products
+ v_xy_ += vec_reve(v_xy_); // reversed add: lanes 0..3 now cover all eight sums for vec_unpackh
- const __vector float v_xy = vec_float(vec_unpackh(v_xy_));
- const __vector float v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
+ const float32x4_t v_xy = vec_float(vec_unpackh(v_xy_));
+ const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
acc = vec_madd(v_xy, v_d, acc);
}
- sumf = acc[0] + acc[1] + acc[2] + acc[3];
-
+ sumf = vec_hsum_f32x4(acc);
*s = sumf;
#else
UNUSED(nb);
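/* The (+) lines above replace the spelled-out acc[0] + acc[1] + acc[2] +
 * acc[3] reduction with a named helper. A minimal sketch that reproduces
 * the removed expression one-for-one (the real definition and the header
 * it lives in are assumptions): */
static inline float vec_hsum_f32x4(float32x4_t v) {
    return v[0] + v[1] + v[2] + v[3];  // horizontal sum of all four float lanes
}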
acc = vec_madd(v_xy, v_d, acc);
}
- sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs;
-
+ sumf = vec_hsum_f32x4(acc) + summs;
*s = sumf;
#else
UNUSED(nb);
v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
}
- sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1);
+ sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1);
#pragma GCC unroll 4
for (; ib < nb; ++ib) {
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));
- sumf += vec_hsum(v_acc);
+ sumf += vec_hsum_f32x4(v_acc);
}
*s = sumf;
v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
}
- sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1) + summs0 + summs1;
+ sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1) + summs0 + summs1;
#pragma GCC unroll 4
for (; ib < nb; ++ib) {
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));
- sumf += vec_hsum(v_acc) + summs;
+ sumf += vec_hsum_f32x4(v_acc) + summs;
}
*s = sumf;
float sumf = 0;
#if defined(__VXE__) || defined(__VXE2__)
- __vector float acc = vec_splats(0.0f);
+ float32x4_t acc = vec_splats(0.0f);
#pragma GCC unroll 8
for (; ib < nb; ++ib) {
acc = vec_madd(v_xy, v_d, acc);
}
- sumf = acc[0] + acc[1] + acc[2] + acc[3];
+ sumf = vec_hsum_f32x4(acc);
*s = sumf;
#else
isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);
- isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
- isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
- isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
- isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
+ isum += vec_hsum_i32x4(isum0) * scale[0];
+ isum += vec_hsum_i32x4(isum1) * scale[1];
+ isum += vec_hsum_i32x4(isum2) * scale[2];
+ isum += vec_hsum_i32x4(isum3) * scale[3];
scale += 4;
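/* Integer counterpart used by the k-quant hunks (Q3_K through Q6_K and the
 * IQ paths below); again a sketch matching the removed per-lane additions
 * one-for-one, with the actual definition assumed to sit alongside
 * vec_hsum_f32x4: */
static inline int32_t vec_hsum_i32x4(int32x4_t v) {
    return v[0] + v[1] + v[2] + v[3];  // horizontal sum of all four int32 lanes
}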
v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm);
const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
- sumi1 += (p1[0] + p1[1] + p1[2] + p1[3]) * scales[2*j+0];
+ sumi1 += vec_hsum_i32x4(p1) * scales[2*j+0];
v_y[0] = vec_xl(0 , y0);
v_y[1] = vec_xl(16, y0);
v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4);
const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
- sumi2 += (p2[0] + p2[1] + p2[2] + p2[3]) * scales[2*j+1];
+ sumi2 += vec_hsum_i32x4(p2) * scales[2*j+1];
}
sumf += d * (sumi1 + sumi2);
const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
- const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
+ const int32_t mins = vec_hsum_i32x4(v_mins);
const uint8_t * scales = (const uint8_t *)utmp;
const uint8_t * GGML_RESTRICT x0l = x[i].qs;
int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);
- sumi += (sumi0[0] + sumi0[1] + sumi0[2] + sumi0[3]) * *scales++;
- sumi += (sumi1[0] + sumi1[1] + sumi1[2] + sumi1[3]) * *scales++;
+ sumi += vec_hsum_i32x4(sumi0) * *scales++;
+ sumi += vec_hsum_i32x4(sumi1) * *scales++;
}
sumf += d * sumi - dmin * mins;
const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;
- const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
+ const int32_t mins = vec_hsum_i32x4(v_mins);
int32_t isum = 0;
for (int j = 0; j < QK_K/128; ++j) {
int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
- isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
- (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
- (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
- (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
+ isum += vec_hsum_i32x4(summs0) * scale[0] +
+ vec_hsum_i32x4(summs1) * scale[1] +
+ vec_hsum_i32x4(summs2) * scale[2] +
+ vec_hsum_i32x4(summs3) * scale[3];
scale += 4;
summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
- isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
- (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
- (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
- (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
+ isum += vec_hsum_i32x4(summs0) * scale[0] +
+ vec_hsum_i32x4(summs1) * scale[1] +
+ vec_hsum_i32x4(summs2) * scale[2] +
+ vec_hsum_i32x4(summs3) * scale[3];
scale += 4;
}
const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
- sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
+ sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * vec_hsum_i32x4(v_xy);
}
*s = sumf;
h >>= 4;
- sumi1 += (vsumi0[0] + vsumi0[1] + vsumi0[2] + vsumi0[3]) * ls1;
- sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2;
+ sumi1 += vec_hsum_i32x4(vsumi0) * ls1;
+ sumi2 += vec_hsum_i32x4(vsumi1) * ls2;
}
sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);