UNUSED(blocklen);
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
- if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
-
- for (int c = 0; c < nc; c += ncols_interleaved) {
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
- float32x4_t acc = vdupq_n_f32(0);
- for (int b = 0; b < nb; b++) {
- int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
- int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
- int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
- int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
- float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
-
- int8x16_t a0 = vld1q_s8(a_ptr->qs);
- int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2);
- float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
-
- int32x4_t ret = vdupq_n_s32(0);
-
- ret = vdotq_laneq_s32(ret, b0 << 4, a0, 0);
- ret = vdotq_laneq_s32(ret, b1 << 4, a0, 1);
- ret = vdotq_laneq_s32(ret, b2 << 4, a0, 2);
- ret = vdotq_laneq_s32(ret, b3 << 4, a0, 3);
-
- ret = vdotq_laneq_s32(ret, b0 & 0xf0U, a1, 0);
- ret = vdotq_laneq_s32(ret, b1 & 0xf0U, a1, 1);
- ret = vdotq_laneq_s32(ret, b2 & 0xf0U, a1, 2);
- ret = vdotq_laneq_s32(ret, b3 & 0xf0U, a1, 3);
-
- acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
- vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
- a_ptr++;
- b_ptr++;
- }
- vst1q_f32(s, acc);
- s += ncols_interleaved;
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
+
+ for (int c = 0; c < nc; c += ncols_interleaved) {
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+ float32x4_t acc = vdupq_n_f32(0);
+ for (int b = 0; b < nb; b++) {
+ int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
+ int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
+ int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
+ int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
+ float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
+
+ int8x16_t a0 = vld1q_s8(a_ptr->qs);
+ int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2);
+ float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
+
+ int32x4_t ret = vdupq_n_s32(0);
+
+ ret = vdotq_laneq_s32(ret, b0 << 4, a0, 0);
+ ret = vdotq_laneq_s32(ret, b1 << 4, a0, 1);
+ ret = vdotq_laneq_s32(ret, b2 << 4, a0, 2);
+ ret = vdotq_laneq_s32(ret, b3 << 4, a0, 3);
+
+ ret = vdotq_laneq_s32(ret, b0 & 0xf0U, a1, 0);
+ ret = vdotq_laneq_s32(ret, b1 & 0xf0U, a1, 1);
+ ret = vdotq_laneq_s32(ret, b2 & 0xf0U, a1, 2);
+ ret = vdotq_laneq_s32(ret, b3 & 0xf0U, a1, 3);
+
+ acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
+ vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
+ a_ptr++;
+ b_ptr++;
}
- return;
+ vst1q_f32(s, acc);
+ s += ncols_interleaved;
}
+ return;
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
float sumf[4];
int sumi;
UNUSED(blocklen);
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
- if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
-
- for (int c = 0; c < nc; c += ncols_interleaved) {
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
- float32x4_t acc = vdupq_n_f32(0);
- for (int b = 0; b < nb; b++) {
- int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
- int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
- int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
- int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
- float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
-
- int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs);
- int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1);
- int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2);
- int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3);
- float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
-
- int32x4_t ret0 = vdupq_n_s32(0);
- int32x4_t ret1 = vdupq_n_s32(0);
-
- ret0 = vdotq_s32(ret0, b0 << 4, a0);
- ret1 = vdotq_s32(ret1, b1 << 4, a0);
- ret0 = vdotq_s32(ret0, b2 << 4, a1);
- ret1 = vdotq_s32(ret1, b3 << 4, a1);
-
- ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2);
- ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2);
- ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3);
- ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3);
-
- int32x4_t ret = vpaddq_s32(ret0, ret1);
-
- acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
- vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
- a_ptr++;
- b_ptr++;
- }
- vst1q_f32(s, acc);
- s += ncols_interleaved;
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
+
+ for (int c = 0; c < nc; c += ncols_interleaved) {
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+ float32x4_t acc = vdupq_n_f32(0);
+ for (int b = 0; b < nb; b++) {
+ int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
+ int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
+ int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
+ int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
+ float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
+
+ int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs);
+ int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1);
+ int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2);
+ int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3);
+ float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
+
+ int32x4_t ret0 = vdupq_n_s32(0);
+ int32x4_t ret1 = vdupq_n_s32(0);
+
+ ret0 = vdotq_s32(ret0, b0 << 4, a0);
+ ret1 = vdotq_s32(ret1, b1 << 4, a0);
+ ret0 = vdotq_s32(ret0, b2 << 4, a1);
+ ret1 = vdotq_s32(ret1, b3 << 4, a1);
+
+ ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2);
+ ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2);
+ ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3);
+ ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3);
+
+ int32x4_t ret = vpaddq_s32(ret0, ret1);
+
+ acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
+ vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
+ a_ptr++;
+ b_ptr++;
}
- return;
+ vst1q_f32(s, acc);
+ s += ncols_interleaved;
}
+ return;
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
float sumf[4];
int sumi;
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
#if defined(__ARM_FEATURE_SVE)
- if (ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) {
+ if (ggml_cpu_get_sve_cnt() == QK8_0) {
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
UNUSED(blocklen);
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
- if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
- const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
- float * res_ptr = s;
-
- for (int x = 0; x < nc / ncols_interleaved; x++) {
- const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
-
- float32x4_t sumf = vdupq_n_f32(0);
- for (int l = 0; l < nb; l++) {
- uint8x16_t b_0 = vld1q_u8(b_ptr[l].qs + 0);
- uint8x16_t b_1 = vld1q_u8(b_ptr[l].qs + 16);
- uint8x16_t b_2 = vld1q_u8(b_ptr[l].qs + 32);
- uint8x16_t b_3 = vld1q_u8(b_ptr[l].qs + 48);
-
- int8x16_t b_0_hi = vqtbl1q_s8(kvalues, b_0 >> 4);
- int8x16_t b_0_lo = vqtbl1q_s8(kvalues, b_0 & 0x0F);
- int8x16_t b_1_hi = vqtbl1q_s8(kvalues, b_1 >> 4);
- int8x16_t b_1_lo = vqtbl1q_s8(kvalues, b_1 & 0x0F);
- int8x16_t b_2_hi = vqtbl1q_s8(kvalues, b_2 >> 4);
- int8x16_t b_2_lo = vqtbl1q_s8(kvalues, b_2 & 0x0F);
- int8x16_t b_3_hi = vqtbl1q_s8(kvalues, b_3 >> 4);
- int8x16_t b_3_lo = vqtbl1q_s8(kvalues, b_3 & 0x0F);
-
- int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 0);
- int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16);
-
- int32x4_t sumi = vdupq_n_s32(0);
- sumi = vdotq_laneq_s32(sumi, b_0_lo, a_0, 0);
- sumi = vdotq_laneq_s32(sumi, b_0_hi, a_1, 0);
- sumi = vdotq_laneq_s32(sumi, b_1_lo, a_0, 1);
- sumi = vdotq_laneq_s32(sumi, b_1_hi, a_1, 1);
- sumi = vdotq_laneq_s32(sumi, b_2_lo, a_0, 2);
- sumi = vdotq_laneq_s32(sumi, b_2_hi, a_1, 2);
- sumi = vdotq_laneq_s32(sumi, b_3_lo, a_0, 3);
- sumi = vdotq_laneq_s32(sumi, b_3_hi, a_1, 3);
-
- float32x4_t a_d = vcvt_f32_f16(vld1_dup_f16((const float16_t *)&a_ptr[l].d));
- float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
- float32x4_t d = a_d * b_d;
+ const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+ float * res_ptr = s;

- sumf = vmlaq_f32(sumf, d, vcvtq_f32_s32(sumi));
- }
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);

- vst1q_f32(res_ptr + x * 4, sumf);
+ float32x4_t sumf = vdupq_n_f32(0);
+ for (int l = 0; l < nb; l++) {
+ uint8x16_t b_0 = vld1q_u8(b_ptr[l].qs + 0);
+ uint8x16_t b_1 = vld1q_u8(b_ptr[l].qs + 16);
+ uint8x16_t b_2 = vld1q_u8(b_ptr[l].qs + 32);
+ uint8x16_t b_3 = vld1q_u8(b_ptr[l].qs + 48);
+
+ int8x16_t b_0_hi = vqtbl1q_s8(kvalues, b_0 >> 4);
+ int8x16_t b_0_lo = vqtbl1q_s8(kvalues, b_0 & 0x0F);
+ int8x16_t b_1_hi = vqtbl1q_s8(kvalues, b_1 >> 4);
+ int8x16_t b_1_lo = vqtbl1q_s8(kvalues, b_1 & 0x0F);
+ int8x16_t b_2_hi = vqtbl1q_s8(kvalues, b_2 >> 4);
+ int8x16_t b_2_lo = vqtbl1q_s8(kvalues, b_2 & 0x0F);
+ int8x16_t b_3_hi = vqtbl1q_s8(kvalues, b_3 >> 4);
+ int8x16_t b_3_lo = vqtbl1q_s8(kvalues, b_3 & 0x0F);
+
+ int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 0);
+ int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16);
+
+ int32x4_t sumi = vdupq_n_s32(0);
+ sumi = vdotq_laneq_s32(sumi, b_0_lo, a_0, 0);
+ sumi = vdotq_laneq_s32(sumi, b_0_hi, a_1, 0);
+ sumi = vdotq_laneq_s32(sumi, b_1_lo, a_0, 1);
+ sumi = vdotq_laneq_s32(sumi, b_1_hi, a_1, 1);
+ sumi = vdotq_laneq_s32(sumi, b_2_lo, a_0, 2);
+ sumi = vdotq_laneq_s32(sumi, b_2_hi, a_1, 2);
+ sumi = vdotq_laneq_s32(sumi, b_3_lo, a_0, 3);
+ sumi = vdotq_laneq_s32(sumi, b_3_hi, a_1, 3);
+
+ float32x4_t a_d = vcvt_f32_f16(vld1_dup_f16((const float16_t *)&a_ptr[l].d));
+ float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
+ float32x4_t d = a_d * b_d;
+
+ sumf = vmlaq_f32(sumf, d, vcvtq_f32_s32(sumi));
}
- return;
+
+ vst1q_f32(res_ptr + x * 4, sumf);
}
+ return;
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
{
float sumf[4];
UNUSED(ncols_interleaved);
UNUSED(blocklen);
-#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
- if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
- const void * b_ptr = vx;
- const void * a_ptr = vy;
- float * res_ptr = s;
- size_t res_stride = bs * sizeof(float);
-
- __asm__ __volatile__(
- "mov x10, %x[nr]\n"
- "mov x9, #0x88\n"
- "cmp x10, #0x10\n"
- "mul x9, %x[nb], x9\n"
- "blt 4f\n"
- "1:" // Row loop
- "add x28, %x[b_ptr], #0x8\n"
- "mov x27, %x[nc]\n"
- "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
- "2:" // Column loop
- "add x25, %x[a_ptr], #0x8\n"
- "movi v15.16b, #0x0\n"
- "movi v19.16b, #0x0\n"
- "mov x24, %x[nb]\n"
- "add x23, x25, x9\n"
- "movi v18.16b, #0x0\n"
- "movi v14.16b, #0x0\n"
- "add x22, x23, x9\n"
- "movi v11.16b, #0x0\n"
- "movi v13.16b, #0x0\n"
- "add x21, x22, x9\n"
- "movi v23.16b, #0x0\n"
- "movi v16.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "movi v7.16b, #0x0\n"
- "movi v0.16b, #0x0\n"
- "movi v4.16b, #0x0\n"
- "movi v5.16b, #0x0\n"
- "movi v21.16b, #0x0\n"
- "movi v8.16b, #0x0\n"
- "movi v1.16b, #0x0\n"
- "3:" // Block loop
- "ldr q3, [x28, #0x0]\n"
- "ldr q31, [x25, #0x0]\n"
- "movi v28.16b, #0x4\n"
- "movi v10.4s, #0x0\n"
- "ldr q22, [x28, #0x10]\n"
- "ldr q6, [x25, #0x10]\n"
- "movi v29.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "ldr q27, [x28, #0x20]\n"
- "ldr q30, [x28, #0x30]\n"
- "movi v20.4s, #0x0\n"
- "movi v24.16b, #0xf0\n"
- "ldr d2, [x25, #-0x8]\n"
- "ldr d26, [x23, #-0x8]\n"
- "sshl v12.16b, v3.16b, v28.16b\n"
- "sub x20, x28, #0x8\n"
- "ldr d17, [x20, #0x0]\n"
- "and v3.16b, v3.16b, v24.16b\n"
- "subs x24, x24, #0x1\n"
- "add x28, x28, #0x48\n"
- ".inst 0x4f9fe18a // sdot v10.4s, v12.16b, v31.4b[0]\n"
- ".inst 0x4fbfe19d // sdot v29.4s, v12.16b, v31.4b[1]\n"
- ".inst 0x4f9fe989 // sdot v9.4s, v12.16b, v31.4b[2]\n"
- ".inst 0x4fbfe994 // sdot v20.4s, v12.16b, v31.4b[3]\n"
- "sshl v31.16b, v22.16b, v28.16b\n"
- "and v22.16b, v22.16b, v24.16b\n"
- "fcvtl v17.4s, v17.4h\n"
- "fcvtl v2.4s, v2.4h\n"
- "fcvtl v26.4s, v26.4h\n"
- ".inst 0x4f86e3ea // sdot v10.4s, v31.16b, v6.4b[0]\n"
- ".inst 0x4fa6e3fd // sdot v29.4s, v31.16b, v6.4b[1]\n"
- ".inst 0x4f86ebe9 // sdot v9.4s, v31.16b, v6.4b[2]\n"
- ".inst 0x4fa6ebf4 // sdot v20.4s, v31.16b, v6.4b[3]\n"
- "sshl v6.16b, v27.16b, v28.16b\n"
- "sshl v28.16b, v30.16b, v28.16b\n"
- "and v27.16b, v27.16b, v24.16b\n"
- "and v30.16b, v30.16b, v24.16b\n"
- "ldr q24, [x25, #0x20]\n"
- ".inst 0x4f98e0ca // sdot v10.4s, v6.16b, v24.4b[0]\n"
- ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n"
- ".inst 0x4f98e8c9 // sdot v9.4s, v6.16b, v24.4b[2]\n"
- ".inst 0x4fb8e8d4 // sdot v20.4s, v6.16b, v24.4b[3]\n"
- "ldr q24, [x25, #0x30]\n"
- ".inst 0x4f98e38a // sdot v10.4s, v28.16b, v24.4b[0]\n"
- ".inst 0x4fb8e39d // sdot v29.4s, v28.16b, v24.4b[1]\n"
- ".inst 0x4f98eb89 // sdot v9.4s, v28.16b, v24.4b[2]\n"
- ".inst 0x4fb8eb94 // sdot v20.4s, v28.16b, v24.4b[3]\n"
- "ldr q24, [x25, #0x40]\n"
- ".inst 0x4f98e06a // sdot v10.4s, v3.16b, v24.4b[0]\n"
- ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n"
- ".inst 0x4f98e869 // sdot v9.4s, v3.16b, v24.4b[2]\n"
- ".inst 0x4fb8e874 // sdot v20.4s, v3.16b, v24.4b[3]\n"
- "ldr q24, [x25, #0x50]\n"
- ".inst 0x4f98e2ca // sdot v10.4s, v22.16b, v24.4b[0]\n"
- ".inst 0x4fb8e2dd // sdot v29.4s, v22.16b, v24.4b[1]\n"
- ".inst 0x4f98eac9 // sdot v9.4s, v22.16b, v24.4b[2]\n"
- ".inst 0x4fb8ead4 // sdot v20.4s, v22.16b, v24.4b[3]\n"
- "ldr q24, [x25, #0x60]\n"
- ".inst 0x4f98e36a // sdot v10.4s, v27.16b, v24.4b[0]\n"
- ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n"
- ".inst 0x4f98eb69 // sdot v9.4s, v27.16b, v24.4b[2]\n"
- ".inst 0x4fb8eb74 // sdot v20.4s, v27.16b, v24.4b[3]\n"
- "ldr q24, [x25, #0x70]\n"
- "add x25, x25, #0x88\n"
- ".inst 0x4f98e3ca // sdot v10.4s, v30.16b, v24.4b[0]\n"
- ".inst 0x4fb8e3dd // sdot v29.4s, v30.16b, v24.4b[1]\n"
- ".inst 0x4f98ebc9 // sdot v9.4s, v30.16b, v24.4b[2]\n"
- ".inst 0x4fb8ebd4 // sdot v20.4s, v30.16b, v24.4b[3]\n"
- "fmul v24.4s, v17.4s, v2.s[0]\n"
- "scvtf v10.4s, v10.4s, #0x4\n"
- "scvtf v29.4s, v29.4s, #0x4\n"
- "scvtf v9.4s, v9.4s, #0x4\n"
- "scvtf v20.4s, v20.4s, #0x4\n"
- "fmla v15.4s, v10.4s, v24.4s\n"
- "ldr q24, [x23, #0x0]\n"
- "fmul v10.4s, v17.4s, v2.s[1]\n"
- "fmla v19.4s, v29.4s, v10.4s\n"
- "ldr q10, [x23, #0x10]\n"
- "fmul v29.4s, v17.4s, v2.s[2]\n"
- "fmul v2.4s, v17.4s, v2.s[3]\n"
- "fmla v18.4s, v9.4s, v29.4s\n"
- "movi v9.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- ".inst 0x4f98e189 // sdot v9.4s, v12.16b, v24.4b[0]\n"
- ".inst 0x4fb8e19d // sdot v29.4s, v12.16b, v24.4b[1]\n"
- "fmla v14.4s, v20.4s, v2.4s\n"
- "movi v20.4s, #0x0\n"
- "movi v2.4s, #0x0\n"
- ".inst 0x4f98e994 // sdot v20.4s, v12.16b, v24.4b[2]\n"
- ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n"
- "ldr q24, [x23, #0x20]\n"
- ".inst 0x4f8ae3e9 // sdot v9.4s, v31.16b, v10.4b[0]\n"
- ".inst 0x4faae3fd // sdot v29.4s, v31.16b, v10.4b[1]\n"
- ".inst 0x4f8aebf4 // sdot v20.4s, v31.16b, v10.4b[2]\n"
- ".inst 0x4faaebe2 // sdot v2.4s, v31.16b, v10.4b[3]\n"
- "ldr q10, [x23, #0x30]\n"
- ".inst 0x4f98e0c9 // sdot v9.4s, v6.16b, v24.4b[0]\n"
- ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n"
- ".inst 0x4f98e8d4 // sdot v20.4s, v6.16b, v24.4b[2]\n"
- ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n"
- "ldr q24, [x23, #0x40]\n"
- ".inst 0x4f8ae389 // sdot v9.4s, v28.16b, v10.4b[0]\n"
- ".inst 0x4faae39d // sdot v29.4s, v28.16b, v10.4b[1]\n"
- ".inst 0x4f8aeb94 // sdot v20.4s, v28.16b, v10.4b[2]\n"
- ".inst 0x4faaeb82 // sdot v2.4s, v28.16b, v10.4b[3]\n"
- "ldr q10, [x23, #0x50]\n"
- ".inst 0x4f98e069 // sdot v9.4s, v3.16b, v24.4b[0]\n"
- ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n"
- ".inst 0x4f98e874 // sdot v20.4s, v3.16b, v24.4b[2]\n"
- ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n"
- "ldr q24, [x23, #0x60]\n"
- ".inst 0x4f8ae2c9 // sdot v9.4s, v22.16b, v10.4b[0]\n"
- ".inst 0x4faae2dd // sdot v29.4s, v22.16b, v10.4b[1]\n"
- ".inst 0x4f8aead4 // sdot v20.4s, v22.16b, v10.4b[2]\n"
- ".inst 0x4faaeac2 // sdot v2.4s, v22.16b, v10.4b[3]\n"
- "ldr q10, [x23, #0x70]\n"
- "add x23, x23, #0x88\n"
- ".inst 0x4f98e369 // sdot v9.4s, v27.16b, v24.4b[0]\n"
- ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n"
- ".inst 0x4f98eb74 // sdot v20.4s, v27.16b, v24.4b[2]\n"
- ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n"
- "ldr q24, [x22, #0x0]\n"
- ".inst 0x4f8ae3c9 // sdot v9.4s, v30.16b, v10.4b[0]\n"
- ".inst 0x4faae3dd // sdot v29.4s, v30.16b, v10.4b[1]\n"
- ".inst 0x4f8aebd4 // sdot v20.4s, v30.16b, v10.4b[2]\n"
- ".inst 0x4faaebc2 // sdot v2.4s, v30.16b, v10.4b[3]\n"
- "fmul v10.4s, v17.4s, v26.s[0]\n"
- "scvtf v9.4s, v9.4s, #0x4\n"
- "scvtf v29.4s, v29.4s, #0x4\n"
- "scvtf v20.4s, v20.4s, #0x4\n"
- "scvtf v2.4s, v2.4s, #0x4\n"
- "fmla v11.4s, v9.4s, v10.4s\n"
- "ldr q9, [x22, #0x10]\n"
- "fmul v10.4s, v17.4s, v26.s[1]\n"
- "fmla v13.4s, v29.4s, v10.4s\n"
- "ldr d29, [x22, #-0x8]\n"
- "fmul v10.4s, v17.4s, v26.s[2]\n"
- "fmul v26.4s, v17.4s, v26.s[3]\n"
- "fcvtl v29.4s, v29.4h\n"
- "fmla v23.4s, v20.4s, v10.4s\n"
- "movi v20.4s, #0x0\n"
- "movi v10.4s, #0x0\n"
- "fmla v16.4s, v2.4s, v26.4s\n"
- "movi v26.4s, #0x0\n"
- "movi v2.4s, #0x0\n"
- ".inst 0x4f98e194 // sdot v20.4s, v12.16b, v24.4b[0]\n"
- ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n"
- ".inst 0x4f98e99a // sdot v26.4s, v12.16b, v24.4b[2]\n"
- ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n"
- "ldr q24, [x22, #0x20]\n"
- ".inst 0x4f89e3f4 // sdot v20.4s, v31.16b, v9.4b[0]\n"
- ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n"
- ".inst 0x4f89ebfa // sdot v26.4s, v31.16b, v9.4b[2]\n"
- ".inst 0x4fa9ebe2 // sdot v2.4s, v31.16b, v9.4b[3]\n"
- "ldr q9, [x22, #0x30]\n"
- ".inst 0x4f98e0d4 // sdot v20.4s, v6.16b, v24.4b[0]\n"
- ".inst 0x4fb8e0ca // sdot v10.4s, v6.16b, v24.4b[1]\n"
- ".inst 0x4f98e8da // sdot v26.4s, v6.16b, v24.4b[2]\n"
- ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n"
- "ldr q24, [x22, #0x40]\n"
- ".inst 0x4f89e394 // sdot v20.4s, v28.16b, v9.4b[0]\n"
- ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n"
- ".inst 0x4f89eb9a // sdot v26.4s, v28.16b, v9.4b[2]\n"
- ".inst 0x4fa9eb82 // sdot v2.4s, v28.16b, v9.4b[3]\n"
- "ldr q9, [x22, #0x50]\n"
- ".inst 0x4f98e074 // sdot v20.4s, v3.16b, v24.4b[0]\n"
- ".inst 0x4fb8e06a // sdot v10.4s, v3.16b, v24.4b[1]\n"
- ".inst 0x4f98e87a // sdot v26.4s, v3.16b, v24.4b[2]\n"
- ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n"
- "ldr q24, [x22, #0x60]\n"
- ".inst 0x4f89e2d4 // sdot v20.4s, v22.16b, v9.4b[0]\n"
- ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n"
- ".inst 0x4f89eada // sdot v26.4s, v22.16b, v9.4b[2]\n"
- ".inst 0x4fa9eac2 // sdot v2.4s, v22.16b, v9.4b[3]\n"
- "ldr q9, [x22, #0x70]\n"
- "add x22, x22, #0x88\n"
- ".inst 0x4f98e374 // sdot v20.4s, v27.16b, v24.4b[0]\n"
- ".inst 0x4fb8e36a // sdot v10.4s, v27.16b, v24.4b[1]\n"
- ".inst 0x4f98eb7a // sdot v26.4s, v27.16b, v24.4b[2]\n"
- ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n"
- "ldr q24, [x21, #0x0]\n"
- ".inst 0x4f89e3d4 // sdot v20.4s, v30.16b, v9.4b[0]\n"
- ".inst 0x4fa9e3ca // sdot v10.4s, v30.16b, v9.4b[1]\n"
- ".inst 0x4f89ebda // sdot v26.4s, v30.16b, v9.4b[2]\n"
- ".inst 0x4fa9ebc2 // sdot v2.4s, v30.16b, v9.4b[3]\n"
- "fmul v9.4s, v17.4s, v29.s[0]\n"
- "scvtf v20.4s, v20.4s, #0x4\n"
- "scvtf v10.4s, v10.4s, #0x4\n"
- "scvtf v26.4s, v26.4s, #0x4\n"
- "scvtf v2.4s, v2.4s, #0x4\n"
- "fmla v25.4s, v20.4s, v9.4s\n"
- "ldr q9, [x21, #0x10]\n"
- "fmul v20.4s, v17.4s, v29.s[1]\n"
- "fmla v7.4s, v10.4s, v20.4s\n"
- "ldr d20, [x21, #-0x8]\n"
- "fmul v10.4s, v17.4s, v29.s[2]\n"
- "fmul v29.4s, v17.4s, v29.s[3]\n"
- "fcvtl v20.4s, v20.4h\n"
- "fmla v0.4s, v26.4s, v10.4s\n"
- "movi v26.4s, #0x0\n"
- "movi v10.4s, #0x0\n"
- "fmla v4.4s, v2.4s, v29.4s\n"
- "movi v2.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- ".inst 0x4f98e19a // sdot v26.4s, v12.16b, v24.4b[0]\n"
- ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n"
- ".inst 0x4f98e982 // sdot v2.4s, v12.16b, v24.4b[2]\n"
- ".inst 0x4fb8e99d // sdot v29.4s, v12.16b, v24.4b[3]\n"
- "ldr q12, [x21, #0x20]\n"
- "fmul v24.4s, v17.4s, v20.s[0]\n"
- ".inst 0x4f89e3fa // sdot v26.4s, v31.16b, v9.4b[0]\n"
- ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n"
- ".inst 0x4f89ebe2 // sdot v2.4s, v31.16b, v9.4b[2]\n"
- ".inst 0x4fa9ebfd // sdot v29.4s, v31.16b, v9.4b[3]\n"
- "ldr q9, [x21, #0x30]\n"
- "fmul v31.4s, v17.4s, v20.s[1]\n"
- ".inst 0x4f8ce0da // sdot v26.4s, v6.16b, v12.4b[0]\n"
- ".inst 0x4face0ca // sdot v10.4s, v6.16b, v12.4b[1]\n"
- ".inst 0x4f8ce8c2 // sdot v2.4s, v6.16b, v12.4b[2]\n"
- ".inst 0x4face8dd // sdot v29.4s, v6.16b, v12.4b[3]\n"
- "ldr q12, [x21, #0x40]\n"
- "fmul v6.4s, v17.4s, v20.s[2]\n"
- "fmul v20.4s, v17.4s, v20.s[3]\n"
- ".inst 0x4f89e39a // sdot v26.4s, v28.16b, v9.4b[0]\n"
- ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n"
- ".inst 0x4f89eb82 // sdot v2.4s, v28.16b, v9.4b[2]\n"
- ".inst 0x4fa9eb9d // sdot v29.4s, v28.16b, v9.4b[3]\n"
- "ldr q9, [x21, #0x50]\n"
- ".inst 0x4f8ce07a // sdot v26.4s, v3.16b, v12.4b[0]\n"
- ".inst 0x4face06a // sdot v10.4s, v3.16b, v12.4b[1]\n"
- ".inst 0x4f8ce862 // sdot v2.4s, v3.16b, v12.4b[2]\n"
- ".inst 0x4face87d // sdot v29.4s, v3.16b, v12.4b[3]\n"
- "ldr q12, [x21, #0x60]\n"
- ".inst 0x4f89e2da // sdot v26.4s, v22.16b, v9.4b[0]\n"
- ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n"
- ".inst 0x4f89eac2 // sdot v2.4s, v22.16b, v9.4b[2]\n"
- ".inst 0x4fa9eadd // sdot v29.4s, v22.16b, v9.4b[3]\n"
- "ldr q17, [x21, #0x70]\n"
- "add x21, x21, #0x88\n"
- ".inst 0x4f8ce37a // sdot v26.4s, v27.16b, v12.4b[0]\n"
- ".inst 0x4face36a // sdot v10.4s, v27.16b, v12.4b[1]\n"
- ".inst 0x4f8ceb62 // sdot v2.4s, v27.16b, v12.4b[2]\n"
- ".inst 0x4faceb7d // sdot v29.4s, v27.16b, v12.4b[3]\n"
- ".inst 0x4f91e3da // sdot v26.4s, v30.16b, v17.4b[0]\n"
- ".inst 0x4fb1e3ca // sdot v10.4s, v30.16b, v17.4b[1]\n"
- ".inst 0x4f91ebc2 // sdot v2.4s, v30.16b, v17.4b[2]\n"
- ".inst 0x4fb1ebdd // sdot v29.4s, v30.16b, v17.4b[3]\n"
- "scvtf v26.4s, v26.4s, #0x4\n"
- "scvtf v10.4s, v10.4s, #0x4\n"
- "fmla v5.4s, v26.4s, v24.4s\n"
- "scvtf v2.4s, v2.4s, #0x4\n"
- "scvtf v29.4s, v29.4s, #0x4\n"
- "fmla v21.4s, v10.4s, v31.4s\n"
- "fmla v8.4s, v2.4s, v6.4s\n"
- "fmla v1.4s, v29.4s, v20.4s\n"
- "bgt 3b\n"
- "mov x20, %x[res_ptr]\n"
- "subs x27, x27, #0x4\n"
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
- "str q15, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q19, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q18, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q14, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q11, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q13, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q23, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q16, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q25, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q7, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q0, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q4, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q5, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q21, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q8, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q1, [x20, #0x0]\n"
- "bne 2b\n"
- "mov x20, #0x4\n"
- "sub x10, x10, #0x10\n"
- "cmp x10, #0x10\n"
- "mov %x[res_ptr], x26\n"
- "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
- "bge 1b\n"
- "4:" // Row loop skip
- "cbz x10, 9f\n"
- "5:" // Row tail: Row loop
- "add x24, %x[b_ptr], #0x8\n"
- "mov x23, %x[nc]\n"
- "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
- "6:" // Row tail: Column loop
- "movi v15.16b, #0x0\n"
- "movi v19.16b, #0x0\n"
- "add x25, %x[a_ptr], #0x8\n"
- "mov x21, %x[nb]\n"
- "movi v18.16b, #0x0\n"
- "movi v14.16b, #0x0\n"
- "7:" // Row tail: Block loop
- "ldr q7, [x24, #0x0]\n"
- "ldr q5, [x25, #0x0]\n"
- "movi v9.16b, #0x4\n"
- "movi v4.4s, #0x0\n"
- "ldr q3, [x24, #0x10]\n"
- "ldr q2, [x25, #0x10]\n"
- "movi v1.4s, #0x0\n"
- "movi v0.4s, #0x0\n"
- "ldr q13, [x24, #0x20]\n"
- "ldr q31, [x25, #0x20]\n"
- "movi v30.4s, #0x0\n"
- "movi v29.16b, #0xf0\n"
- "ldr q28, [x24, #0x30]\n"
- "ldr q27, [x25, #0x30]\n"
- "sshl v20.16b, v7.16b, v9.16b\n"
- "sub x20, x24, #0x8\n"
- "ldr q26, [x25, #0x40]\n"
- "ldr q25, [x25, #0x50]\n"
- "sshl v17.16b, v3.16b, v9.16b\n"
- "and v7.16b, v7.16b, v29.16b\n"
- "ldr q24, [x25, #0x60]\n"
- "ldr q16, [x25, #0x70]\n"
- "sshl v22.16b, v13.16b, v9.16b\n"
- "and v3.16b, v3.16b, v29.16b\n"
- "ldr d21, [x20, #0x0]\n"
- "ldr d12, [x25, #-0x8]\n"
- ".inst 0x4f85e284 // sdot v4.4s, v20.16b, v5.4b[0]\n"
- ".inst 0x4fa5e281 // sdot v1.4s, v20.16b, v5.4b[1]\n"
- ".inst 0x4f85ea80 // sdot v0.4s, v20.16b, v5.4b[2]\n"
- ".inst 0x4fa5ea9e // sdot v30.4s, v20.16b, v5.4b[3]\n"
- "sshl v9.16b, v28.16b, v9.16b\n"
- "subs x21, x21, #0x1\n"
- "and v13.16b, v13.16b, v29.16b\n"
- "and v28.16b, v28.16b, v29.16b\n"
- "add x25, x25, #0x88\n"
- "add x24, x24, #0x48\n"
- "fcvtl v21.4s, v21.4h\n"
- "fcvtl v12.4s, v12.4h\n"
- ".inst 0x4f82e224 // sdot v4.4s, v17.16b, v2.4b[0]\n"
- ".inst 0x4fa2e221 // sdot v1.4s, v17.16b, v2.4b[1]\n"
- ".inst 0x4f82ea20 // sdot v0.4s, v17.16b, v2.4b[2]\n"
- ".inst 0x4fa2ea3e // sdot v30.4s, v17.16b, v2.4b[3]\n"
- "fmul v11.4s, v21.4s, v12.s[0]\n"
- "fmul v23.4s, v21.4s, v12.s[1]\n"
- "fmul v17.4s, v21.4s, v12.s[2]\n"
- ".inst 0x4f9fe2c4 // sdot v4.4s, v22.16b, v31.4b[0]\n"
- "fmul v6.4s, v21.4s, v12.s[3]\n"
- ".inst 0x4fbfe2c1 // sdot v1.4s, v22.16b, v31.4b[1]\n"
- ".inst 0x4f9feac0 // sdot v0.4s, v22.16b, v31.4b[2]\n"
- ".inst 0x4fbfeade // sdot v30.4s, v22.16b, v31.4b[3]\n"
- ".inst 0x4f9be124 // sdot v4.4s, v9.16b, v27.4b[0]\n"
- ".inst 0x4fbbe121 // sdot v1.4s, v9.16b, v27.4b[1]\n"
- ".inst 0x4f9be920 // sdot v0.4s, v9.16b, v27.4b[2]\n"
- ".inst 0x4fbbe93e // sdot v30.4s, v9.16b, v27.4b[3]\n"
- ".inst 0x4f9ae0e4 // sdot v4.4s, v7.16b, v26.4b[0]\n"
- ".inst 0x4fbae0e1 // sdot v1.4s, v7.16b, v26.4b[1]\n"
- ".inst 0x4f9ae8e0 // sdot v0.4s, v7.16b, v26.4b[2]\n"
- ".inst 0x4fbae8fe // sdot v30.4s, v7.16b, v26.4b[3]\n"
- ".inst 0x4f99e064 // sdot v4.4s, v3.16b, v25.4b[0]\n"
- ".inst 0x4fb9e061 // sdot v1.4s, v3.16b, v25.4b[1]\n"
- ".inst 0x4f99e860 // sdot v0.4s, v3.16b, v25.4b[2]\n"
- ".inst 0x4fb9e87e // sdot v30.4s, v3.16b, v25.4b[3]\n"
- ".inst 0x4f98e1a4 // sdot v4.4s, v13.16b, v24.4b[0]\n"
- ".inst 0x4fb8e1a1 // sdot v1.4s, v13.16b, v24.4b[1]\n"
- ".inst 0x4f98e9a0 // sdot v0.4s, v13.16b, v24.4b[2]\n"
- ".inst 0x4fb8e9be // sdot v30.4s, v13.16b, v24.4b[3]\n"
- ".inst 0x4f90e384 // sdot v4.4s, v28.16b, v16.4b[0]\n"
- ".inst 0x4fb0e381 // sdot v1.4s, v28.16b, v16.4b[1]\n"
- ".inst 0x4f90eb80 // sdot v0.4s, v28.16b, v16.4b[2]\n"
- ".inst 0x4fb0eb9e // sdot v30.4s, v28.16b, v16.4b[3]\n"
- "scvtf v4.4s, v4.4s, #0x4\n"
- "scvtf v1.4s, v1.4s, #0x4\n"
- "scvtf v0.4s, v0.4s, #0x4\n"
- "fmla v15.4s, v4.4s, v11.4s\n"
- "scvtf v30.4s, v30.4s, #0x4\n"
- "fmla v19.4s, v1.4s, v23.4s\n"
- "fmla v18.4s, v0.4s, v17.4s\n"
- "fmla v14.4s, v30.4s, v6.4s\n"
- "bgt 7b\n"
- "mov x20, %x[res_ptr]\n"
- "cmp x10, #0x1\n"
- "str q15, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "ble 8f\n"
- "cmp x10, #0x2\n"
- "str q19, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "ble 8f\n"
- "cmp x10, #0x3\n"
- "str q18, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "ble 8f\n"
- "str q14, [x20, #0x0]\n"
- "8:" // Row tail: Accumulator store skip
- "subs x23, x23, #0x4\n"
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
- "bne 6b\n"
- "subs x10, x10, #0x4\n"
- "add %x[a_ptr], %x[a_ptr], x9\n"
- "mov %x[res_ptr], x22\n"
- "bgt 5b\n"
- "9:" // Row tail: Row loop skip
- : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
- : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
- );
- return;
- }
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+ const void * b_ptr = vx;
+ const void * a_ptr = vy;
+ float * res_ptr = s;
+ size_t res_stride = bs * sizeof(float);
+
+ __asm__ __volatile__(
+ "mov x10, %x[nr]\n"
+ "mov x9, #0x88\n"
+ "cmp x10, #0x10\n"
+ "mul x9, %x[nb], x9\n"
+ "blt 4f\n"
+ "1:" // Row loop
+ "add x28, %x[b_ptr], #0x8\n"
+ "mov x27, %x[nc]\n"
+ "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
+ "2:" // Column loop
+ "add x25, %x[a_ptr], #0x8\n"
+ "movi v15.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "mov x24, %x[nb]\n"
+ "add x23, x25, x9\n"
+ "movi v18.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "add x22, x23, x9\n"
+ "movi v11.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "add x21, x22, x9\n"
+ "movi v23.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v7.16b, #0x0\n"
+ "movi v0.16b, #0x0\n"
+ "movi v4.16b, #0x0\n"
+ "movi v5.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v8.16b, #0x0\n"
+ "movi v1.16b, #0x0\n"
+ "3:" // Block loop
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q31, [x25, #0x0]\n"
+ "movi v28.16b, #0x4\n"
+ "movi v10.4s, #0x0\n"
+ "ldr q22, [x28, #0x10]\n"
+ "ldr q6, [x25, #0x10]\n"
+ "movi v29.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "ldr q27, [x28, #0x20]\n"
+ "ldr q30, [x28, #0x30]\n"
+ "movi v20.4s, #0x0\n"
+ "movi v24.16b, #0xf0\n"
+ "ldr d2, [x25, #-0x8]\n"
+ "ldr d26, [x23, #-0x8]\n"
+ "sshl v12.16b, v3.16b, v28.16b\n"
+ "sub x20, x28, #0x8\n"
+ "ldr d17, [x20, #0x0]\n"
+ "and v3.16b, v3.16b, v24.16b\n"
+ "subs x24, x24, #0x1\n"
+ "add x28, x28, #0x48\n"
+ ".inst 0x4f9fe18a // sdot v10.4s, v12.16b, v31.4b[0]\n"
+ ".inst 0x4fbfe19d // sdot v29.4s, v12.16b, v31.4b[1]\n"
+ ".inst 0x4f9fe989 // sdot v9.4s, v12.16b, v31.4b[2]\n"
+ ".inst 0x4fbfe994 // sdot v20.4s, v12.16b, v31.4b[3]\n"
+ "sshl v31.16b, v22.16b, v28.16b\n"
+ "and v22.16b, v22.16b, v24.16b\n"
+ "fcvtl v17.4s, v17.4h\n"
+ "fcvtl v2.4s, v2.4h\n"
+ "fcvtl v26.4s, v26.4h\n"
+ ".inst 0x4f86e3ea // sdot v10.4s, v31.16b, v6.4b[0]\n"
+ ".inst 0x4fa6e3fd // sdot v29.4s, v31.16b, v6.4b[1]\n"
+ ".inst 0x4f86ebe9 // sdot v9.4s, v31.16b, v6.4b[2]\n"
+ ".inst 0x4fa6ebf4 // sdot v20.4s, v31.16b, v6.4b[3]\n"
+ "sshl v6.16b, v27.16b, v28.16b\n"
+ "sshl v28.16b, v30.16b, v28.16b\n"
+ "and v27.16b, v27.16b, v24.16b\n"
+ "and v30.16b, v30.16b, v24.16b\n"
+ "ldr q24, [x25, #0x20]\n"
+ ".inst 0x4f98e0ca // sdot v10.4s, v6.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n"
+ ".inst 0x4f98e8c9 // sdot v9.4s, v6.16b, v24.4b[2]\n"
+ ".inst 0x4fb8e8d4 // sdot v20.4s, v6.16b, v24.4b[3]\n"
+ "ldr q24, [x25, #0x30]\n"
+ ".inst 0x4f98e38a // sdot v10.4s, v28.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e39d // sdot v29.4s, v28.16b, v24.4b[1]\n"
+ ".inst 0x4f98eb89 // sdot v9.4s, v28.16b, v24.4b[2]\n"
+ ".inst 0x4fb8eb94 // sdot v20.4s, v28.16b, v24.4b[3]\n"
+ "ldr q24, [x25, #0x40]\n"
+ ".inst 0x4f98e06a // sdot v10.4s, v3.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n"
+ ".inst 0x4f98e869 // sdot v9.4s, v3.16b, v24.4b[2]\n"
+ ".inst 0x4fb8e874 // sdot v20.4s, v3.16b, v24.4b[3]\n"
+ "ldr q24, [x25, #0x50]\n"
+ ".inst 0x4f98e2ca // sdot v10.4s, v22.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e2dd // sdot v29.4s, v22.16b, v24.4b[1]\n"
+ ".inst 0x4f98eac9 // sdot v9.4s, v22.16b, v24.4b[2]\n"
+ ".inst 0x4fb8ead4 // sdot v20.4s, v22.16b, v24.4b[3]\n"
+ "ldr q24, [x25, #0x60]\n"
+ ".inst 0x4f98e36a // sdot v10.4s, v27.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n"
+ ".inst 0x4f98eb69 // sdot v9.4s, v27.16b, v24.4b[2]\n"
+ ".inst 0x4fb8eb74 // sdot v20.4s, v27.16b, v24.4b[3]\n"
+ "ldr q24, [x25, #0x70]\n"
+ "add x25, x25, #0x88\n"
+ ".inst 0x4f98e3ca // sdot v10.4s, v30.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e3dd // sdot v29.4s, v30.16b, v24.4b[1]\n"
+ ".inst 0x4f98ebc9 // sdot v9.4s, v30.16b, v24.4b[2]\n"
+ ".inst 0x4fb8ebd4 // sdot v20.4s, v30.16b, v24.4b[3]\n"
+ "fmul v24.4s, v17.4s, v2.s[0]\n"
+ "scvtf v10.4s, v10.4s, #0x4\n"
+ "scvtf v29.4s, v29.4s, #0x4\n"
+ "scvtf v9.4s, v9.4s, #0x4\n"
+ "scvtf v20.4s, v20.4s, #0x4\n"
+ "fmla v15.4s, v10.4s, v24.4s\n"
+ "ldr q24, [x23, #0x0]\n"
+ "fmul v10.4s, v17.4s, v2.s[1]\n"
+ "fmla v19.4s, v29.4s, v10.4s\n"
+ "ldr q10, [x23, #0x10]\n"
+ "fmul v29.4s, v17.4s, v2.s[2]\n"
+ "fmul v2.4s, v17.4s, v2.s[3]\n"
+ "fmla v18.4s, v9.4s, v29.4s\n"
+ "movi v9.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ ".inst 0x4f98e189 // sdot v9.4s, v12.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e19d // sdot v29.4s, v12.16b, v24.4b[1]\n"
+ "fmla v14.4s, v20.4s, v2.4s\n"
+ "movi v20.4s, #0x0\n"
+ "movi v2.4s, #0x0\n"
+ ".inst 0x4f98e994 // sdot v20.4s, v12.16b, v24.4b[2]\n"
+ ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n"
+ "ldr q24, [x23, #0x20]\n"
+ ".inst 0x4f8ae3e9 // sdot v9.4s, v31.16b, v10.4b[0]\n"
+ ".inst 0x4faae3fd // sdot v29.4s, v31.16b, v10.4b[1]\n"
+ ".inst 0x4f8aebf4 // sdot v20.4s, v31.16b, v10.4b[2]\n"
+ ".inst 0x4faaebe2 // sdot v2.4s, v31.16b, v10.4b[3]\n"
+ "ldr q10, [x23, #0x30]\n"
+ ".inst 0x4f98e0c9 // sdot v9.4s, v6.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n"
+ ".inst 0x4f98e8d4 // sdot v20.4s, v6.16b, v24.4b[2]\n"
+ ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n"
+ "ldr q24, [x23, #0x40]\n"
+ ".inst 0x4f8ae389 // sdot v9.4s, v28.16b, v10.4b[0]\n"
+ ".inst 0x4faae39d // sdot v29.4s, v28.16b, v10.4b[1]\n"
+ ".inst 0x4f8aeb94 // sdot v20.4s, v28.16b, v10.4b[2]\n"
+ ".inst 0x4faaeb82 // sdot v2.4s, v28.16b, v10.4b[3]\n"
+ "ldr q10, [x23, #0x50]\n"
+ ".inst 0x4f98e069 // sdot v9.4s, v3.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n"
+ ".inst 0x4f98e874 // sdot v20.4s, v3.16b, v24.4b[2]\n"
+ ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n"
+ "ldr q24, [x23, #0x60]\n"
+ ".inst 0x4f8ae2c9 // sdot v9.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x4faae2dd // sdot v29.4s, v22.16b, v10.4b[1]\n"
+ ".inst 0x4f8aead4 // sdot v20.4s, v22.16b, v10.4b[2]\n"
+ ".inst 0x4faaeac2 // sdot v2.4s, v22.16b, v10.4b[3]\n"
+ "ldr q10, [x23, #0x70]\n"
+ "add x23, x23, #0x88\n"
+ ".inst 0x4f98e369 // sdot v9.4s, v27.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n"
+ ".inst 0x4f98eb74 // sdot v20.4s, v27.16b, v24.4b[2]\n"
+ ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n"
+ "ldr q24, [x22, #0x0]\n"
+ ".inst 0x4f8ae3c9 // sdot v9.4s, v30.16b, v10.4b[0]\n"
+ ".inst 0x4faae3dd // sdot v29.4s, v30.16b, v10.4b[1]\n"
+ ".inst 0x4f8aebd4 // sdot v20.4s, v30.16b, v10.4b[2]\n"
+ ".inst 0x4faaebc2 // sdot v2.4s, v30.16b, v10.4b[3]\n"
+ "fmul v10.4s, v17.4s, v26.s[0]\n"
+ "scvtf v9.4s, v9.4s, #0x4\n"
+ "scvtf v29.4s, v29.4s, #0x4\n"
+ "scvtf v20.4s, v20.4s, #0x4\n"
+ "scvtf v2.4s, v2.4s, #0x4\n"
+ "fmla v11.4s, v9.4s, v10.4s\n"
+ "ldr q9, [x22, #0x10]\n"
+ "fmul v10.4s, v17.4s, v26.s[1]\n"
+ "fmla v13.4s, v29.4s, v10.4s\n"
+ "ldr d29, [x22, #-0x8]\n"
+ "fmul v10.4s, v17.4s, v26.s[2]\n"
+ "fmul v26.4s, v17.4s, v26.s[3]\n"
+ "fcvtl v29.4s, v29.4h\n"
+ "fmla v23.4s, v20.4s, v10.4s\n"
+ "movi v20.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "fmla v16.4s, v2.4s, v26.4s\n"
+ "movi v26.4s, #0x0\n"
+ "movi v2.4s, #0x0\n"
+ ".inst 0x4f98e194 // sdot v20.4s, v12.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n"
+ ".inst 0x4f98e99a // sdot v26.4s, v12.16b, v24.4b[2]\n"
+ ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n"
+ "ldr q24, [x22, #0x20]\n"
+ ".inst 0x4f89e3f4 // sdot v20.4s, v31.16b, v9.4b[0]\n"
+ ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n"
+ ".inst 0x4f89ebfa // sdot v26.4s, v31.16b, v9.4b[2]\n"
+ ".inst 0x4fa9ebe2 // sdot v2.4s, v31.16b, v9.4b[3]\n"
+ "ldr q9, [x22, #0x30]\n"
+ ".inst 0x4f98e0d4 // sdot v20.4s, v6.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e0ca // sdot v10.4s, v6.16b, v24.4b[1]\n"
+ ".inst 0x4f98e8da // sdot v26.4s, v6.16b, v24.4b[2]\n"
+ ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n"
+ "ldr q24, [x22, #0x40]\n"
+ ".inst 0x4f89e394 // sdot v20.4s, v28.16b, v9.4b[0]\n"
+ ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n"
+ ".inst 0x4f89eb9a // sdot v26.4s, v28.16b, v9.4b[2]\n"
+ ".inst 0x4fa9eb82 // sdot v2.4s, v28.16b, v9.4b[3]\n"
+ "ldr q9, [x22, #0x50]\n"
+ ".inst 0x4f98e074 // sdot v20.4s, v3.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e06a // sdot v10.4s, v3.16b, v24.4b[1]\n"
+ ".inst 0x4f98e87a // sdot v26.4s, v3.16b, v24.4b[2]\n"
+ ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n"
+ "ldr q24, [x22, #0x60]\n"
+ ".inst 0x4f89e2d4 // sdot v20.4s, v22.16b, v9.4b[0]\n"
+ ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n"
+ ".inst 0x4f89eada // sdot v26.4s, v22.16b, v9.4b[2]\n"
+ ".inst 0x4fa9eac2 // sdot v2.4s, v22.16b, v9.4b[3]\n"
+ "ldr q9, [x22, #0x70]\n"
+ "add x22, x22, #0x88\n"
+ ".inst 0x4f98e374 // sdot v20.4s, v27.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e36a // sdot v10.4s, v27.16b, v24.4b[1]\n"
+ ".inst 0x4f98eb7a // sdot v26.4s, v27.16b, v24.4b[2]\n"
+ ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n"
+ "ldr q24, [x21, #0x0]\n"
+ ".inst 0x4f89e3d4 // sdot v20.4s, v30.16b, v9.4b[0]\n"
+ ".inst 0x4fa9e3ca // sdot v10.4s, v30.16b, v9.4b[1]\n"
+ ".inst 0x4f89ebda // sdot v26.4s, v30.16b, v9.4b[2]\n"
+ ".inst 0x4fa9ebc2 // sdot v2.4s, v30.16b, v9.4b[3]\n"
+ "fmul v9.4s, v17.4s, v29.s[0]\n"
+ "scvtf v20.4s, v20.4s, #0x4\n"
+ "scvtf v10.4s, v10.4s, #0x4\n"
+ "scvtf v26.4s, v26.4s, #0x4\n"
+ "scvtf v2.4s, v2.4s, #0x4\n"
+ "fmla v25.4s, v20.4s, v9.4s\n"
+ "ldr q9, [x21, #0x10]\n"
+ "fmul v20.4s, v17.4s, v29.s[1]\n"
+ "fmla v7.4s, v10.4s, v20.4s\n"
+ "ldr d20, [x21, #-0x8]\n"
+ "fmul v10.4s, v17.4s, v29.s[2]\n"
+ "fmul v29.4s, v17.4s, v29.s[3]\n"
+ "fcvtl v20.4s, v20.4h\n"
+ "fmla v0.4s, v26.4s, v10.4s\n"
+ "movi v26.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "fmla v4.4s, v2.4s, v29.4s\n"
+ "movi v2.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ ".inst 0x4f98e19a // sdot v26.4s, v12.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n"
+ ".inst 0x4f98e982 // sdot v2.4s, v12.16b, v24.4b[2]\n"
+ ".inst 0x4fb8e99d // sdot v29.4s, v12.16b, v24.4b[3]\n"
+ "ldr q12, [x21, #0x20]\n"
+ "fmul v24.4s, v17.4s, v20.s[0]\n"
+ ".inst 0x4f89e3fa // sdot v26.4s, v31.16b, v9.4b[0]\n"
+ ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n"
+ ".inst 0x4f89ebe2 // sdot v2.4s, v31.16b, v9.4b[2]\n"
+ ".inst 0x4fa9ebfd // sdot v29.4s, v31.16b, v9.4b[3]\n"
+ "ldr q9, [x21, #0x30]\n"
+ "fmul v31.4s, v17.4s, v20.s[1]\n"
+ ".inst 0x4f8ce0da // sdot v26.4s, v6.16b, v12.4b[0]\n"
+ ".inst 0x4face0ca // sdot v10.4s, v6.16b, v12.4b[1]\n"
+ ".inst 0x4f8ce8c2 // sdot v2.4s, v6.16b, v12.4b[2]\n"
+ ".inst 0x4face8dd // sdot v29.4s, v6.16b, v12.4b[3]\n"
+ "ldr q12, [x21, #0x40]\n"
+ "fmul v6.4s, v17.4s, v20.s[2]\n"
+ "fmul v20.4s, v17.4s, v20.s[3]\n"
+ ".inst 0x4f89e39a // sdot v26.4s, v28.16b, v9.4b[0]\n"
+ ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n"
+ ".inst 0x4f89eb82 // sdot v2.4s, v28.16b, v9.4b[2]\n"
+ ".inst 0x4fa9eb9d // sdot v29.4s, v28.16b, v9.4b[3]\n"
+ "ldr q9, [x21, #0x50]\n"
+ ".inst 0x4f8ce07a // sdot v26.4s, v3.16b, v12.4b[0]\n"
+ ".inst 0x4face06a // sdot v10.4s, v3.16b, v12.4b[1]\n"
+ ".inst 0x4f8ce862 // sdot v2.4s, v3.16b, v12.4b[2]\n"
+ ".inst 0x4face87d // sdot v29.4s, v3.16b, v12.4b[3]\n"
+ "ldr q12, [x21, #0x60]\n"
+ ".inst 0x4f89e2da // sdot v26.4s, v22.16b, v9.4b[0]\n"
+ ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n"
+ ".inst 0x4f89eac2 // sdot v2.4s, v22.16b, v9.4b[2]\n"
+ ".inst 0x4fa9eadd // sdot v29.4s, v22.16b, v9.4b[3]\n"
+ "ldr q17, [x21, #0x70]\n"
+ "add x21, x21, #0x88\n"
+ ".inst 0x4f8ce37a // sdot v26.4s, v27.16b, v12.4b[0]\n"
+ ".inst 0x4face36a // sdot v10.4s, v27.16b, v12.4b[1]\n"
+ ".inst 0x4f8ceb62 // sdot v2.4s, v27.16b, v12.4b[2]\n"
+ ".inst 0x4faceb7d // sdot v29.4s, v27.16b, v12.4b[3]\n"
+ ".inst 0x4f91e3da // sdot v26.4s, v30.16b, v17.4b[0]\n"
+ ".inst 0x4fb1e3ca // sdot v10.4s, v30.16b, v17.4b[1]\n"
+ ".inst 0x4f91ebc2 // sdot v2.4s, v30.16b, v17.4b[2]\n"
+ ".inst 0x4fb1ebdd // sdot v29.4s, v30.16b, v17.4b[3]\n"
+ "scvtf v26.4s, v26.4s, #0x4\n"
+ "scvtf v10.4s, v10.4s, #0x4\n"
+ "fmla v5.4s, v26.4s, v24.4s\n"
+ "scvtf v2.4s, v2.4s, #0x4\n"
+ "scvtf v29.4s, v29.4s, #0x4\n"
+ "fmla v21.4s, v10.4s, v31.4s\n"
+ "fmla v8.4s, v2.4s, v6.4s\n"
+ "fmla v1.4s, v29.4s, v20.4s\n"
+ "bgt 3b\n"
+ "mov x20, %x[res_ptr]\n"
+ "subs x27, x27, #0x4\n"
+ "add %x[res_ptr], %x[res_ptr], #0x10\n"
+ "str q15, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q19, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q18, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q14, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q11, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q13, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q23, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q16, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q25, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q7, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q0, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q4, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q5, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q21, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q8, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q1, [x20, #0x0]\n"
+ "bne 2b\n"
+ "mov x20, #0x4\n"
+ "sub x10, x10, #0x10\n"
+ "cmp x10, #0x10\n"
+ "mov %x[res_ptr], x26\n"
+ "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
+ "bge 1b\n"
+ "4:" // Row loop skip
+ "cbz x10, 9f\n"
+ "5:" // Row tail: Row loop
+ "add x24, %x[b_ptr], #0x8\n"
+ "mov x23, %x[nc]\n"
+ "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
+ "6:" // Row tail: Column loop
+ "movi v15.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "add x25, %x[a_ptr], #0x8\n"
+ "mov x21, %x[nb]\n"
+ "movi v18.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "7:" // Row tail: Block loop
+ "ldr q7, [x24, #0x0]\n"
+ "ldr q5, [x25, #0x0]\n"
+ "movi v9.16b, #0x4\n"
+ "movi v4.4s, #0x0\n"
+ "ldr q3, [x24, #0x10]\n"
+ "ldr q2, [x25, #0x10]\n"
+ "movi v1.4s, #0x0\n"
+ "movi v0.4s, #0x0\n"
+ "ldr q13, [x24, #0x20]\n"
+ "ldr q31, [x25, #0x20]\n"
+ "movi v30.4s, #0x0\n"
+ "movi v29.16b, #0xf0\n"
+ "ldr q28, [x24, #0x30]\n"
+ "ldr q27, [x25, #0x30]\n"
+ "sshl v20.16b, v7.16b, v9.16b\n"
+ "sub x20, x24, #0x8\n"
+ "ldr q26, [x25, #0x40]\n"
+ "ldr q25, [x25, #0x50]\n"
+ "sshl v17.16b, v3.16b, v9.16b\n"
+ "and v7.16b, v7.16b, v29.16b\n"
+ "ldr q24, [x25, #0x60]\n"
+ "ldr q16, [x25, #0x70]\n"
+ "sshl v22.16b, v13.16b, v9.16b\n"
+ "and v3.16b, v3.16b, v29.16b\n"
+ "ldr d21, [x20, #0x0]\n"
+ "ldr d12, [x25, #-0x8]\n"
+ ".inst 0x4f85e284 // sdot v4.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x4fa5e281 // sdot v1.4s, v20.16b, v5.4b[1]\n"
+ ".inst 0x4f85ea80 // sdot v0.4s, v20.16b, v5.4b[2]\n"
+ ".inst 0x4fa5ea9e // sdot v30.4s, v20.16b, v5.4b[3]\n"
+ "sshl v9.16b, v28.16b, v9.16b\n"
+ "subs x21, x21, #0x1\n"
+ "and v13.16b, v13.16b, v29.16b\n"
+ "and v28.16b, v28.16b, v29.16b\n"
+ "add x25, x25, #0x88\n"
+ "add x24, x24, #0x48\n"
+ "fcvtl v21.4s, v21.4h\n"
+ "fcvtl v12.4s, v12.4h\n"
+ ".inst 0x4f82e224 // sdot v4.4s, v17.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e221 // sdot v1.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4f82ea20 // sdot v0.4s, v17.16b, v2.4b[2]\n"
+ ".inst 0x4fa2ea3e // sdot v30.4s, v17.16b, v2.4b[3]\n"
+ "fmul v11.4s, v21.4s, v12.s[0]\n"
+ "fmul v23.4s, v21.4s, v12.s[1]\n"
+ "fmul v17.4s, v21.4s, v12.s[2]\n"
+ ".inst 0x4f9fe2c4 // sdot v4.4s, v22.16b, v31.4b[0]\n"
+ "fmul v6.4s, v21.4s, v12.s[3]\n"
+ ".inst 0x4fbfe2c1 // sdot v1.4s, v22.16b, v31.4b[1]\n"
+ ".inst 0x4f9feac0 // sdot v0.4s, v22.16b, v31.4b[2]\n"
+ ".inst 0x4fbfeade // sdot v30.4s, v22.16b, v31.4b[3]\n"
+ ".inst 0x4f9be124 // sdot v4.4s, v9.16b, v27.4b[0]\n"
+ ".inst 0x4fbbe121 // sdot v1.4s, v9.16b, v27.4b[1]\n"
+ ".inst 0x4f9be920 // sdot v0.4s, v9.16b, v27.4b[2]\n"
+ ".inst 0x4fbbe93e // sdot v30.4s, v9.16b, v27.4b[3]\n"
+ ".inst 0x4f9ae0e4 // sdot v4.4s, v7.16b, v26.4b[0]\n"
+ ".inst 0x4fbae0e1 // sdot v1.4s, v7.16b, v26.4b[1]\n"
+ ".inst 0x4f9ae8e0 // sdot v0.4s, v7.16b, v26.4b[2]\n"
+ ".inst 0x4fbae8fe // sdot v30.4s, v7.16b, v26.4b[3]\n"
+ ".inst 0x4f99e064 // sdot v4.4s, v3.16b, v25.4b[0]\n"
+ ".inst 0x4fb9e061 // sdot v1.4s, v3.16b, v25.4b[1]\n"
+ ".inst 0x4f99e860 // sdot v0.4s, v3.16b, v25.4b[2]\n"
+ ".inst 0x4fb9e87e // sdot v30.4s, v3.16b, v25.4b[3]\n"
+ ".inst 0x4f98e1a4 // sdot v4.4s, v13.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e1a1 // sdot v1.4s, v13.16b, v24.4b[1]\n"
+ ".inst 0x4f98e9a0 // sdot v0.4s, v13.16b, v24.4b[2]\n"
+ ".inst 0x4fb8e9be // sdot v30.4s, v13.16b, v24.4b[3]\n"
+ ".inst 0x4f90e384 // sdot v4.4s, v28.16b, v16.4b[0]\n"
+ ".inst 0x4fb0e381 // sdot v1.4s, v28.16b, v16.4b[1]\n"
+ ".inst 0x4f90eb80 // sdot v0.4s, v28.16b, v16.4b[2]\n"
+ ".inst 0x4fb0eb9e // sdot v30.4s, v28.16b, v16.4b[3]\n"
+ "scvtf v4.4s, v4.4s, #0x4\n"
+ "scvtf v1.4s, v1.4s, #0x4\n"
+ "scvtf v0.4s, v0.4s, #0x4\n"
+ "fmla v15.4s, v4.4s, v11.4s\n"
+ "scvtf v30.4s, v30.4s, #0x4\n"
+ "fmla v19.4s, v1.4s, v23.4s\n"
+ "fmla v18.4s, v0.4s, v17.4s\n"
+ "fmla v14.4s, v30.4s, v6.4s\n"
+ "bgt 7b\n"
+ "mov x20, %x[res_ptr]\n"
+ "cmp x10, #0x1\n"
+ "str q15, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "ble 8f\n"
+ "cmp x10, #0x2\n"
+ "str q19, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "ble 8f\n"
+ "cmp x10, #0x3\n"
+ "str q18, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "ble 8f\n"
+ "str q14, [x20, #0x0]\n"
+ "8:" // Row tail: Accumulator store skip
+ "subs x23, x23, #0x4\n"
+ "add %x[res_ptr], %x[res_ptr], #0x10\n"
+ "bne 6b\n"
+ "subs x10, x10, #0x4\n"
+ "add %x[a_ptr], %x[a_ptr], x9\n"
+ "mov %x[res_ptr], x22\n"
+ "bgt 5b\n"
+ "9:" // Row tail: Row loop skip
+ : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
+ : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+ return;
-#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
{
float sumf[4][4];
UNUSED(blocklen);
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
- if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
- const void * b_ptr = vx;
- const void * a_ptr = vy;
- float * res_ptr = s;
- size_t res_stride = bs * sizeof(float);
-
- __asm__ __volatile__(
- "mov x10, %x[nr]\n"
- "mov x9, #0x88\n"
- "cmp x10, #0x10\n"
- "mul x9, %x[nb], x9\n"
- "blt 4f\n"
- "1:" // Row loop
- "add x28, %x[b_ptr], #0x8\n"
- "mov x27, %x[nc]\n"
- "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
- "2:" // Column loop
- "add x25, %x[a_ptr], #0x8\n"
- "movi v2.16b, #0x0\n"
- "movi v10.16b, #0x0\n"
- "mov x24, %x[nb]\n"
- "add x23, x25, x9\n"
- "movi v12.16b, #0x0\n"
- "movi v28.16b, #0x0\n"
- "add x22, x23, x9\n"
- "movi v11.16b, #0x0\n"
- "movi v13.16b, #0x0\n"
- "add x21, x22, x9\n"
- "movi v22.16b, #0x0\n"
- "movi v23.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "movi v5.16b, #0x0\n"
- "movi v7.16b, #0x0\n"
- "movi v4.16b, #0x0\n"
- "movi v6.16b, #0x0\n"
- "movi v30.16b, #0x0\n"
- "movi v24.16b, #0x0\n"
- "movi v14.16b, #0x0\n"
- "3:" // Block loop
- "ldr q21, [x28, #0x0]\n"
- "ldr q16, [x28, #0x10]\n"
- "movi v1.16b, #0x4\n"
- "movi v19.4s, #0x0\n"
- "ldr q27, [x25, #0x0]\n"
- "ldr q15, [x25, #0x10]\n"
- "movi v26.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "ldr q29, [x28, #0x20]\n"
- "ldr q3, [x28, #0x30]\n"
- "movi v17.4s, #0x0\n"
- "movi v0.16b, #0xf0\n"
- "ldr d20, [x25, #-0x8]\n"
- "ldr d9, [x23, #-0x8]\n"
- "sshl v8.16b, v21.16b, v1.16b\n"
- "sshl v31.16b, v16.16b, v1.16b\n"
- "and v21.16b, v21.16b, v0.16b\n"
- "and v16.16b, v16.16b, v0.16b\n"
- "sub x20, x28, #0x8\n"
- "subs x24, x24, #0x1\n"
- "add x28, x28, #0x48\n"
- ".inst 0x4e88a773 // smmla v19.4s, v27.16b, v8.16b\n"
- ".inst 0x4e9fa77a // smmla v26.4s, v27.16b, v31.16b\n"
- "ldr q27, [x25, #0x20]\n"
- ".inst 0x4e88a5f2 // smmla v18.4s, v15.16b, v8.16b\n"
- ".inst 0x4e9fa5f1 // smmla v17.4s, v15.16b, v31.16b\n"
- "sshl v15.16b, v29.16b, v1.16b\n"
- "sshl v1.16b, v3.16b, v1.16b\n"
- "and v29.16b, v29.16b, v0.16b\n"
- "and v3.16b, v3.16b, v0.16b\n"
- "ldr q0, [x25, #0x30]\n"
- "fcvtl v20.4s, v20.4h\n"
- ".inst 0x4e8fa773 // smmla v19.4s, v27.16b, v15.16b\n"
- "fcvtl v9.4s, v9.4h\n"
- ".inst 0x4e81a77a // smmla v26.4s, v27.16b, v1.16b\n"
- "ldr q27, [x25, #0x40]\n"
- ".inst 0x4e8fa412 // smmla v18.4s, v0.16b, v15.16b\n"
- ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n"
- "ldr q0, [x25, #0x50]\n"
- ".inst 0x4e95a773 // smmla v19.4s, v27.16b, v21.16b\n"
- ".inst 0x4e90a77a // smmla v26.4s, v27.16b, v16.16b\n"
- "ldr q27, [x25, #0x60]\n"
- ".inst 0x4e95a412 // smmla v18.4s, v0.16b, v21.16b\n"
- ".inst 0x4e90a411 // smmla v17.4s, v0.16b, v16.16b\n"
- "ldr q0, [x25, #0x70]\n"
- "add x25, x25, #0x88\n"
- ".inst 0x4e9da773 // smmla v19.4s, v27.16b, v29.16b\n"
- ".inst 0x4e83a77a // smmla v26.4s, v27.16b, v3.16b\n"
- "ldr d27, [x20, #0x0]\n"
- ".inst 0x4e9da412 // smmla v18.4s, v0.16b, v29.16b\n"
- ".inst 0x4e83a411 // smmla v17.4s, v0.16b, v3.16b\n"
- "fcvtl v27.4s, v27.4h\n"
- "uzp1 v0.2d, v19.2d, v26.2d\n"
- "uzp2 v26.2d, v19.2d, v26.2d\n"
- "fmul v19.4s, v27.4s, v20.s[0]\n"
- "scvtf v0.4s, v0.4s, #0x4\n"
- "scvtf v26.4s, v26.4s, #0x4\n"
- "fmla v2.4s, v0.4s, v19.4s\n"
- "ldr q19, [x23, #0x0]\n"
- "uzp1 v0.2d, v18.2d, v17.2d\n"
- "uzp2 v18.2d, v18.2d, v17.2d\n"
- "fmul v17.4s, v27.4s, v20.s[1]\n"
- "scvtf v0.4s, v0.4s, #0x4\n"
- "scvtf v18.4s, v18.4s, #0x4\n"
- "fmla v10.4s, v26.4s, v17.4s\n"
- "ldr q17, [x23, #0x10]\n"
- "fmul v26.4s, v27.4s, v20.s[2]\n"
- "fmul v20.4s, v27.4s, v20.s[3]\n"
- "fmla v12.4s, v0.4s, v26.4s\n"
- "ldr d0, [x22, #-0x8]\n"
- "ldr d26, [x21, #-0x8]\n"
- "fcvtl v0.4s, v0.4h\n"
- "fmla v28.4s, v18.4s, v20.4s\n"
- "movi v20.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n"
- ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n"
- "ldr q19, [x23, #0x20]\n"
- "fcvtl v26.4s, v26.4h\n"
- ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n"
- ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n"
- "ldr q19, [x23, #0x40]\n"
- ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n"
- ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n"
- "ldr q19, [x23, #0x60]\n"
- ".inst 0x4e9da674 // smmla v20.4s, v19.16b, v29.16b\n"
- ".inst 0x4e83a672 // smmla v18.4s, v19.16b, v3.16b\n"
- "uzp1 v19.2d, v20.2d, v18.2d\n"
- "scvtf v19.4s, v19.4s, #0x4\n"
- "uzp2 v20.2d, v20.2d, v18.2d\n"
- "fmul v18.4s, v27.4s, v9.s[0]\n"
- "scvtf v20.4s, v20.4s, #0x4\n"
- "fmla v11.4s, v19.4s, v18.4s\n"
- "ldr q18, [x22, #0x0]\n"
- "fmul v19.4s, v27.4s, v9.s[1]\n"
- "fmla v13.4s, v20.4s, v19.4s\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- ".inst 0x4e88a633 // smmla v19.4s, v17.16b, v8.16b\n"
- ".inst 0x4e9fa634 // smmla v20.4s, v17.16b, v31.16b\n"
- "ldr q17, [x23, #0x30]\n"
- ".inst 0x4e8fa633 // smmla v19.4s, v17.16b, v15.16b\n"
- ".inst 0x4e81a634 // smmla v20.4s, v17.16b, v1.16b\n"
- "ldr q17, [x23, #0x50]\n"
- ".inst 0x4e95a633 // smmla v19.4s, v17.16b, v21.16b\n"
- ".inst 0x4e90a634 // smmla v20.4s, v17.16b, v16.16b\n"
- "ldr q17, [x23, #0x70]\n"
- "add x23, x23, #0x88\n"
- ".inst 0x4e9da633 // smmla v19.4s, v17.16b, v29.16b\n"
- ".inst 0x4e83a634 // smmla v20.4s, v17.16b, v3.16b\n"
- "uzp1 v17.2d, v19.2d, v20.2d\n"
- "scvtf v17.4s, v17.4s, #0x4\n"
- "uzp2 v20.2d, v19.2d, v20.2d\n"
- "fmul v19.4s, v27.4s, v9.s[2]\n"
- "fmul v9.4s, v27.4s, v9.s[3]\n"
- "scvtf v20.4s, v20.4s, #0x4\n"
- "fmla v22.4s, v17.4s, v19.4s\n"
- "ldr q17, [x22, #0x10]\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x4e88a653 // smmla v19.4s, v18.16b, v8.16b\n"
- "fmla v23.4s, v20.4s, v9.4s\n"
- "movi v20.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- ".inst 0x4e9fa654 // smmla v20.4s, v18.16b, v31.16b\n"
- "ldr q18, [x22, #0x20]\n"
- ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n"
- ".inst 0x4e8fa653 // smmla v19.4s, v18.16b, v15.16b\n"
- ".inst 0x4e81a654 // smmla v20.4s, v18.16b, v1.16b\n"
- "ldr q18, [x22, #0x40]\n"
- ".inst 0x4e95a653 // smmla v19.4s, v18.16b, v21.16b\n"
- ".inst 0x4e90a654 // smmla v20.4s, v18.16b, v16.16b\n"
- "ldr q18, [x22, #0x60]\n"
- ".inst 0x4e9da653 // smmla v19.4s, v18.16b, v29.16b\n"
- ".inst 0x4e83a654 // smmla v20.4s, v18.16b, v3.16b\n"
- "movi v18.4s, #0x0\n"
- ".inst 0x4e9fa632 // smmla v18.4s, v17.16b, v31.16b\n"
- "ldr q17, [x22, #0x30]\n"
- ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n"
- ".inst 0x4e81a632 // smmla v18.4s, v17.16b, v1.16b\n"
- "ldr q17, [x22, #0x50]\n"
- ".inst 0x4e95a629 // smmla v9.4s, v17.16b, v21.16b\n"
- ".inst 0x4e90a632 // smmla v18.4s, v17.16b, v16.16b\n"
- "ldr q17, [x22, #0x70]\n"
- "add x22, x22, #0x88\n"
- ".inst 0x4e9da629 // smmla v9.4s, v17.16b, v29.16b\n"
- ".inst 0x4e83a632 // smmla v18.4s, v17.16b, v3.16b\n"
- "uzp1 v17.2d, v19.2d, v20.2d\n"
- "uzp2 v20.2d, v19.2d, v20.2d\n"
- "fmul v19.4s, v27.4s, v0.s[0]\n"
- "scvtf v17.4s, v17.4s, #0x4\n"
- "scvtf v20.4s, v20.4s, #0x4\n"
- "fmla v25.4s, v17.4s, v19.4s\n"
- "ldr q19, [x21, #0x0]\n"
- "fmul v17.4s, v27.4s, v0.s[1]\n"
- "fmla v5.4s, v20.4s, v17.4s\n"
- "ldr q17, [x21, #0x10]\n"
- "uzp1 v20.2d, v9.2d, v18.2d\n"
- "uzp2 v9.2d, v9.2d, v18.2d\n"
- "fmul v18.4s, v27.4s, v0.s[2]\n"
- "fmul v0.4s, v27.4s, v0.s[3]\n"
- "scvtf v20.4s, v20.4s, #0x4\n"
- "scvtf v9.4s, v9.4s, #0x4\n"
- "fmla v7.4s, v20.4s, v18.4s\n"
- "movi v20.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n"
- ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n"
- "ldr q19, [x21, #0x20]\n"
- "fmla v4.4s, v9.4s, v0.4s\n"
- "movi v9.4s, #0x0\n"
- "movi v0.4s, #0x0\n"
- ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n"
- "fmul v8.4s, v27.4s, v26.s[0]\n"
- ".inst 0x4e9fa620 // smmla v0.4s, v17.16b, v31.16b\n"
- "ldr q17, [x21, #0x30]\n"
- ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n"
- "fmul v31.4s, v27.4s, v26.s[1]\n"
- ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n"
- "ldr q19, [x21, #0x40]\n"
- ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n"
- "fmul v15.4s, v27.4s, v26.s[2]\n"
- "fmul v27.4s, v27.4s, v26.s[3]\n"
- ".inst 0x4e81a620 // smmla v0.4s, v17.16b, v1.16b\n"
- "ldr q1, [x21, #0x50]\n"
- ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n"
- ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n"
- "ldr q26, [x21, #0x60]\n"
- ".inst 0x4e95a429 // smmla v9.4s, v1.16b, v21.16b\n"
- ".inst 0x4e90a420 // smmla v0.4s, v1.16b, v16.16b\n"
- "ldr q21, [x21, #0x70]\n"
- "add x21, x21, #0x88\n"
- ".inst 0x4e9da754 // smmla v20.4s, v26.16b, v29.16b\n"
- ".inst 0x4e83a752 // smmla v18.4s, v26.16b, v3.16b\n"
- ".inst 0x4e9da6a9 // smmla v9.4s, v21.16b, v29.16b\n"
- ".inst 0x4e83a6a0 // smmla v0.4s, v21.16b, v3.16b\n"
- "uzp1 v29.2d, v20.2d, v18.2d\n"
- "uzp2 v21.2d, v20.2d, v18.2d\n"
- "scvtf v29.4s, v29.4s, #0x4\n"
- "uzp1 v18.2d, v9.2d, v0.2d\n"
- "uzp2 v16.2d, v9.2d, v0.2d\n"
- "scvtf v21.4s, v21.4s, #0x4\n"
- "fmla v6.4s, v29.4s, v8.4s\n"
- "scvtf v18.4s, v18.4s, #0x4\n"
- "scvtf v16.4s, v16.4s, #0x4\n"
- "fmla v30.4s, v21.4s, v31.4s\n"
- "fmla v24.4s, v18.4s, v15.4s\n"
- "fmla v14.4s, v16.4s, v27.4s\n"
- "bgt 3b\n"
- "mov x20, %x[res_ptr]\n"
- "subs x27, x27, #0x4\n"
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
- "str q2, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q10, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q12, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q28, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q11, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q13, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q22, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q23, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q25, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q5, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q7, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q4, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q6, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q30, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q24, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q14, [x20, #0x0]\n"
- "bne 2b\n"
- "mov x20, #0x4\n"
- "sub x10, x10, #0x10\n"
- "cmp x10, #0x10\n"
- "mov %x[res_ptr], x26\n"
- "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
- "bge 1b\n"
- "4:" // Row loop skip
- "cbz x10, 9f\n"
- "5:" // Row tail: Row loop
- "add x24, %x[b_ptr], #0x8\n"
- "mov x23, %x[nc]\n"
- "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
- "6:" // Row tail: Column loop
- "movi v2.16b, #0x0\n"
- "movi v10.16b, #0x0\n"
- "add x25, %x[a_ptr], #0x8\n"
- "mov x21, %x[nb]\n"
- "movi v12.16b, #0x0\n"
- "movi v28.16b, #0x0\n"
- "7:" // Row tail: Block loop
- "ldr q6, [x24, #0x0]\n"
- "ldr q5, [x24, #0x10]\n"
- "movi v17.16b, #0x4\n"
- "movi v8.4s, #0x0\n"
- "ldr q4, [x25, #0x0]\n"
- "ldr q13, [x25, #0x10]\n"
- "movi v27.4s, #0x0\n"
- "movi v0.4s, #0x0\n"
- "ldr q31, [x24, #0x20]\n"
- "ldr q14, [x24, #0x30]\n"
- "movi v29.4s, #0x0\n"
- "movi v22.16b, #0xf0\n"
- "ldr q11, [x25, #0x20]\n"
- "ldr q23, [x25, #0x30]\n"
- "sshl v21.16b, v6.16b, v17.16b\n"
- "sshl v16.16b, v5.16b, v17.16b\n"
- "ldr q20, [x25, #0x40]\n"
- "ldr q26, [x25, #0x50]\n"
- "and v6.16b, v6.16b, v22.16b\n"
- "and v5.16b, v5.16b, v22.16b\n"
- "ldr q25, [x25, #0x60]\n"
- "ldr q3, [x25, #0x70]\n"
- "sshl v19.16b, v31.16b, v17.16b\n"
- "sshl v18.16b, v14.16b, v17.16b\n"
- "ldr d17, [x25, #-0x8]\n"
- ".inst 0x4e95a488 // smmla v8.4s, v4.16b, v21.16b\n"
- ".inst 0x4e90a49b // smmla v27.4s, v4.16b, v16.16b\n"
- "and v31.16b, v31.16b, v22.16b\n"
- ".inst 0x4e95a5a0 // smmla v0.4s, v13.16b, v21.16b\n"
- ".inst 0x4e90a5bd // smmla v29.4s, v13.16b, v16.16b\n"
- "and v14.16b, v14.16b, v22.16b\n"
- "sub x20, x24, #0x8\n"
- "ldr d16, [x20, #0x0]\n"
- "subs x21, x21, #0x1\n"
- "add x25, x25, #0x88\n"
- "fcvtl v17.4s, v17.4h\n"
- "add x24, x24, #0x48\n"
- ".inst 0x4e93a568 // smmla v8.4s, v11.16b, v19.16b\n"
- ".inst 0x4e92a57b // smmla v27.4s, v11.16b, v18.16b\n"
- ".inst 0x4e93a6e0 // smmla v0.4s, v23.16b, v19.16b\n"
- ".inst 0x4e92a6fd // smmla v29.4s, v23.16b, v18.16b\n"
- "fcvtl v16.4s, v16.4h\n"
- ".inst 0x4e86a688 // smmla v8.4s, v20.16b, v6.16b\n"
- ".inst 0x4e85a69b // smmla v27.4s, v20.16b, v5.16b\n"
- "fmul v23.4s, v16.4s, v17.s[0]\n"
- "fmul v21.4s, v16.4s, v17.s[1]\n"
- "fmul v1.4s, v16.4s, v17.s[2]\n"
- "fmul v20.4s, v16.4s, v17.s[3]\n"
- ".inst 0x4e86a740 // smmla v0.4s, v26.16b, v6.16b\n"
- ".inst 0x4e85a75d // smmla v29.4s, v26.16b, v5.16b\n"
- ".inst 0x4e9fa728 // smmla v8.4s, v25.16b, v31.16b\n"
- ".inst 0x4e8ea73b // smmla v27.4s, v25.16b, v14.16b\n"
- ".inst 0x4e9fa460 // smmla v0.4s, v3.16b, v31.16b\n"
- ".inst 0x4e8ea47d // smmla v29.4s, v3.16b, v14.16b\n"
- "uzp1 v19.2d, v8.2d, v27.2d\n"
- "uzp2 v18.2d, v8.2d, v27.2d\n"
- "scvtf v19.4s, v19.4s, #0x4\n"
- "uzp1 v17.2d, v0.2d, v29.2d\n"
- "uzp2 v16.2d, v0.2d, v29.2d\n"
- "scvtf v18.4s, v18.4s, #0x4\n"
- "fmla v2.4s, v19.4s, v23.4s\n"
- "scvtf v17.4s, v17.4s, #0x4\n"
- "scvtf v16.4s, v16.4s, #0x4\n"
- "fmla v10.4s, v18.4s, v21.4s\n"
- "fmla v12.4s, v17.4s, v1.4s\n"
- "fmla v28.4s, v16.4s, v20.4s\n"
- "bgt 7b\n"
- "mov x20, %x[res_ptr]\n"
- "cmp x10, #0x1\n"
- "str q2, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "ble 8f\n"
- "cmp x10, #0x2\n"
- "str q10, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "ble 8f\n"
- "cmp x10, #0x3\n"
- "str q12, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "ble 8f\n"
- "str q28, [x20, #0x0]\n"
- "8:" // Row tail: Accumulator store skip
- "subs x23, x23, #0x4\n"
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
- "bne 6b\n"
- "subs x10, x10, #0x4\n"
- "add %x[a_ptr], %x[a_ptr], x9\n"
- "mov %x[res_ptr], x22\n"
- "bgt 5b\n"
- "9:" // Row tail: Row loop skip
- : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
- : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
- );
- return;
- }
+ const void * b_ptr = vx;
+ const void * a_ptr = vy;
+ float * res_ptr = s;
+ size_t res_stride = bs * sizeof(float);
+
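+    // A rough map of the hand-written kernel below, inferred from its own
+    // labels and offsets: label 1 walks rows sixteen at a time, label 2 walks
+    // columns four at a time, label 3 is the per-block inner loop, and labels
+    // 4-9 handle the sub-sixteen-row tail. x9 = nb * 0x88 is the byte stride
+    // of one interleaved row group (0x88 matching sizeof(block_q8_0x4)).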
+ __asm__ __volatile__(
+ "mov x10, %x[nr]\n"
+ "mov x9, #0x88\n"
+ "cmp x10, #0x10\n"
+ "mul x9, %x[nb], x9\n"
+ "blt 4f\n"
+ "1:" // Row loop
+ "add x28, %x[b_ptr], #0x8\n"
+ "mov x27, %x[nc]\n"
+ "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
+ "2:" // Column loop
+ "add x25, %x[a_ptr], #0x8\n"
+ "movi v2.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "mov x24, %x[nb]\n"
+ "add x23, x25, x9\n"
+ "movi v12.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "add x22, x23, x9\n"
+ "movi v11.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "add x21, x22, x9\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v5.16b, #0x0\n"
+ "movi v7.16b, #0x0\n"
+ "movi v4.16b, #0x0\n"
+ "movi v6.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
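+        // v2..v14 above are the sixteen fp32 accumulators, one per row of the
+        // 16x4 output tile; they are stored back in this same order after label 3.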
+ "3:" // Block loop
+ "ldr q21, [x28, #0x0]\n"
+ "ldr q16, [x28, #0x10]\n"
+ "movi v1.16b, #0x4\n"
+ "movi v19.4s, #0x0\n"
+ "ldr q27, [x25, #0x0]\n"
+ "ldr q15, [x25, #0x10]\n"
+ "movi v26.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "ldr q29, [x28, #0x20]\n"
+ "ldr q3, [x28, #0x30]\n"
+ "movi v17.4s, #0x0\n"
+ "movi v0.16b, #0xf0\n"
+ "ldr d20, [x25, #-0x8]\n"
+ "ldr d9, [x23, #-0x8]\n"
+ "sshl v8.16b, v21.16b, v1.16b\n"
+ "sshl v31.16b, v16.16b, v1.16b\n"
+ "and v21.16b, v21.16b, v0.16b\n"
+ "and v16.16b, v16.16b, v0.16b\n"
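+        // The per-byte left shift by 4 (v1) and the 0xf0 mask (v0) split each
+        // packed q4 byte into two signed nibbles, each still scaled by 16; the
+        // "scvtf ..., #0x4" conversions below divide that 16 back out.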
+ "sub x20, x28, #0x8\n"
+ "subs x24, x24, #0x1\n"
+ "add x28, x28, #0x48\n"
+ ".inst 0x4e88a773 // smmla v19.4s, v27.16b, v8.16b\n"
+ ".inst 0x4e9fa77a // smmla v26.4s, v27.16b, v31.16b\n"
+ "ldr q27, [x25, #0x20]\n"
+ ".inst 0x4e88a5f2 // smmla v18.4s, v15.16b, v8.16b\n"
+ ".inst 0x4e9fa5f1 // smmla v17.4s, v15.16b, v31.16b\n"
+ "sshl v15.16b, v29.16b, v1.16b\n"
+ "sshl v1.16b, v3.16b, v1.16b\n"
+ "and v29.16b, v29.16b, v0.16b\n"
+ "and v3.16b, v3.16b, v0.16b\n"
+ "ldr q0, [x25, #0x30]\n"
+ "fcvtl v20.4s, v20.4h\n"
+ ".inst 0x4e8fa773 // smmla v19.4s, v27.16b, v15.16b\n"
+ "fcvtl v9.4s, v9.4h\n"
+ ".inst 0x4e81a77a // smmla v26.4s, v27.16b, v1.16b\n"
+ "ldr q27, [x25, #0x40]\n"
+ ".inst 0x4e8fa412 // smmla v18.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n"
+ "ldr q0, [x25, #0x50]\n"
+ ".inst 0x4e95a773 // smmla v19.4s, v27.16b, v21.16b\n"
+ ".inst 0x4e90a77a // smmla v26.4s, v27.16b, v16.16b\n"
+ "ldr q27, [x25, #0x60]\n"
+ ".inst 0x4e95a412 // smmla v18.4s, v0.16b, v21.16b\n"
+ ".inst 0x4e90a411 // smmla v17.4s, v0.16b, v16.16b\n"
+ "ldr q0, [x25, #0x70]\n"
+ "add x25, x25, #0x88\n"
+ ".inst 0x4e9da773 // smmla v19.4s, v27.16b, v29.16b\n"
+ ".inst 0x4e83a77a // smmla v26.4s, v27.16b, v3.16b\n"
+ "ldr d27, [x20, #0x0]\n"
+ ".inst 0x4e9da412 // smmla v18.4s, v0.16b, v29.16b\n"
+ ".inst 0x4e83a411 // smmla v17.4s, v0.16b, v3.16b\n"
+ "fcvtl v27.4s, v27.4h\n"
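+        // Each smmla accumulates an interleaved 2x2 int32 tile; the uzp1/uzp2
+        // pairs below regroup those tiles into per-row sums before scaling.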
+ "uzp1 v0.2d, v19.2d, v26.2d\n"
+ "uzp2 v26.2d, v19.2d, v26.2d\n"
+ "fmul v19.4s, v27.4s, v20.s[0]\n"
+ "scvtf v0.4s, v0.4s, #0x4\n"
+ "scvtf v26.4s, v26.4s, #0x4\n"
+ "fmla v2.4s, v0.4s, v19.4s\n"
+ "ldr q19, [x23, #0x0]\n"
+ "uzp1 v0.2d, v18.2d, v17.2d\n"
+ "uzp2 v18.2d, v18.2d, v17.2d\n"
+ "fmul v17.4s, v27.4s, v20.s[1]\n"
+ "scvtf v0.4s, v0.4s, #0x4\n"
+ "scvtf v18.4s, v18.4s, #0x4\n"
+ "fmla v10.4s, v26.4s, v17.4s\n"
+ "ldr q17, [x23, #0x10]\n"
+ "fmul v26.4s, v27.4s, v20.s[2]\n"
+ "fmul v20.4s, v27.4s, v20.s[3]\n"
+ "fmla v12.4s, v0.4s, v26.4s\n"
+ "ldr d0, [x22, #-0x8]\n"
+ "ldr d26, [x21, #-0x8]\n"
+ "fcvtl v0.4s, v0.4h\n"
+ "fmla v28.4s, v18.4s, v20.4s\n"
+ "movi v20.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n"
+ ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n"
+ "ldr q19, [x23, #0x20]\n"
+ "fcvtl v26.4s, v26.4h\n"
+ ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n"
+ ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n"
+ "ldr q19, [x23, #0x40]\n"
+ ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n"
+ ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n"
+ "ldr q19, [x23, #0x60]\n"
+ ".inst 0x4e9da674 // smmla v20.4s, v19.16b, v29.16b\n"
+ ".inst 0x4e83a672 // smmla v18.4s, v19.16b, v3.16b\n"
+ "uzp1 v19.2d, v20.2d, v18.2d\n"
+ "scvtf v19.4s, v19.4s, #0x4\n"
+ "uzp2 v20.2d, v20.2d, v18.2d\n"
+ "fmul v18.4s, v27.4s, v9.s[0]\n"
+ "scvtf v20.4s, v20.4s, #0x4\n"
+ "fmla v11.4s, v19.4s, v18.4s\n"
+ "ldr q18, [x22, #0x0]\n"
+ "fmul v19.4s, v27.4s, v9.s[1]\n"
+ "fmla v13.4s, v20.4s, v19.4s\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ ".inst 0x4e88a633 // smmla v19.4s, v17.16b, v8.16b\n"
+ ".inst 0x4e9fa634 // smmla v20.4s, v17.16b, v31.16b\n"
+ "ldr q17, [x23, #0x30]\n"
+ ".inst 0x4e8fa633 // smmla v19.4s, v17.16b, v15.16b\n"
+ ".inst 0x4e81a634 // smmla v20.4s, v17.16b, v1.16b\n"
+ "ldr q17, [x23, #0x50]\n"
+ ".inst 0x4e95a633 // smmla v19.4s, v17.16b, v21.16b\n"
+ ".inst 0x4e90a634 // smmla v20.4s, v17.16b, v16.16b\n"
+ "ldr q17, [x23, #0x70]\n"
+ "add x23, x23, #0x88\n"
+ ".inst 0x4e9da633 // smmla v19.4s, v17.16b, v29.16b\n"
+ ".inst 0x4e83a634 // smmla v20.4s, v17.16b, v3.16b\n"
+ "uzp1 v17.2d, v19.2d, v20.2d\n"
+ "scvtf v17.4s, v17.4s, #0x4\n"
+ "uzp2 v20.2d, v19.2d, v20.2d\n"
+ "fmul v19.4s, v27.4s, v9.s[2]\n"
+ "fmul v9.4s, v27.4s, v9.s[3]\n"
+ "scvtf v20.4s, v20.4s, #0x4\n"
+ "fmla v22.4s, v17.4s, v19.4s\n"
+ "ldr q17, [x22, #0x10]\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4e88a653 // smmla v19.4s, v18.16b, v8.16b\n"
+ "fmla v23.4s, v20.4s, v9.4s\n"
+ "movi v20.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ ".inst 0x4e9fa654 // smmla v20.4s, v18.16b, v31.16b\n"
+ "ldr q18, [x22, #0x20]\n"
+ ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n"
+ ".inst 0x4e8fa653 // smmla v19.4s, v18.16b, v15.16b\n"
+ ".inst 0x4e81a654 // smmla v20.4s, v18.16b, v1.16b\n"
+ "ldr q18, [x22, #0x40]\n"
+ ".inst 0x4e95a653 // smmla v19.4s, v18.16b, v21.16b\n"
+ ".inst 0x4e90a654 // smmla v20.4s, v18.16b, v16.16b\n"
+ "ldr q18, [x22, #0x60]\n"
+ ".inst 0x4e9da653 // smmla v19.4s, v18.16b, v29.16b\n"
+ ".inst 0x4e83a654 // smmla v20.4s, v18.16b, v3.16b\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4e9fa632 // smmla v18.4s, v17.16b, v31.16b\n"
+ "ldr q17, [x22, #0x30]\n"
+ ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n"
+ ".inst 0x4e81a632 // smmla v18.4s, v17.16b, v1.16b\n"
+ "ldr q17, [x22, #0x50]\n"
+ ".inst 0x4e95a629 // smmla v9.4s, v17.16b, v21.16b\n"
+ ".inst 0x4e90a632 // smmla v18.4s, v17.16b, v16.16b\n"
+ "ldr q17, [x22, #0x70]\n"
+ "add x22, x22, #0x88\n"
+ ".inst 0x4e9da629 // smmla v9.4s, v17.16b, v29.16b\n"
+ ".inst 0x4e83a632 // smmla v18.4s, v17.16b, v3.16b\n"
+ "uzp1 v17.2d, v19.2d, v20.2d\n"
+ "uzp2 v20.2d, v19.2d, v20.2d\n"
+ "fmul v19.4s, v27.4s, v0.s[0]\n"
+ "scvtf v17.4s, v17.4s, #0x4\n"
+ "scvtf v20.4s, v20.4s, #0x4\n"
+ "fmla v25.4s, v17.4s, v19.4s\n"
+ "ldr q19, [x21, #0x0]\n"
+ "fmul v17.4s, v27.4s, v0.s[1]\n"
+ "fmla v5.4s, v20.4s, v17.4s\n"
+ "ldr q17, [x21, #0x10]\n"
+ "uzp1 v20.2d, v9.2d, v18.2d\n"
+ "uzp2 v9.2d, v9.2d, v18.2d\n"
+ "fmul v18.4s, v27.4s, v0.s[2]\n"
+ "fmul v0.4s, v27.4s, v0.s[3]\n"
+ "scvtf v20.4s, v20.4s, #0x4\n"
+ "scvtf v9.4s, v9.4s, #0x4\n"
+ "fmla v7.4s, v20.4s, v18.4s\n"
+ "movi v20.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n"
+ ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n"
+ "ldr q19, [x21, #0x20]\n"
+ "fmla v4.4s, v9.4s, v0.4s\n"
+ "movi v9.4s, #0x0\n"
+ "movi v0.4s, #0x0\n"
+ ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n"
+ "fmul v8.4s, v27.4s, v26.s[0]\n"
+ ".inst 0x4e9fa620 // smmla v0.4s, v17.16b, v31.16b\n"
+ "ldr q17, [x21, #0x30]\n"
+ ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n"
+ "fmul v31.4s, v27.4s, v26.s[1]\n"
+ ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n"
+ "ldr q19, [x21, #0x40]\n"
+ ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n"
+ "fmul v15.4s, v27.4s, v26.s[2]\n"
+ "fmul v27.4s, v27.4s, v26.s[3]\n"
+ ".inst 0x4e81a620 // smmla v0.4s, v17.16b, v1.16b\n"
+ "ldr q1, [x21, #0x50]\n"
+ ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n"
+ ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n"
+ "ldr q26, [x21, #0x60]\n"
+ ".inst 0x4e95a429 // smmla v9.4s, v1.16b, v21.16b\n"
+ ".inst 0x4e90a420 // smmla v0.4s, v1.16b, v16.16b\n"
+ "ldr q21, [x21, #0x70]\n"
+ "add x21, x21, #0x88\n"
+ ".inst 0x4e9da754 // smmla v20.4s, v26.16b, v29.16b\n"
+ ".inst 0x4e83a752 // smmla v18.4s, v26.16b, v3.16b\n"
+ ".inst 0x4e9da6a9 // smmla v9.4s, v21.16b, v29.16b\n"
+ ".inst 0x4e83a6a0 // smmla v0.4s, v21.16b, v3.16b\n"
+ "uzp1 v29.2d, v20.2d, v18.2d\n"
+ "uzp2 v21.2d, v20.2d, v18.2d\n"
+ "scvtf v29.4s, v29.4s, #0x4\n"
+ "uzp1 v18.2d, v9.2d, v0.2d\n"
+ "uzp2 v16.2d, v9.2d, v0.2d\n"
+ "scvtf v21.4s, v21.4s, #0x4\n"
+ "fmla v6.4s, v29.4s, v8.4s\n"
+ "scvtf v18.4s, v18.4s, #0x4\n"
+ "scvtf v16.4s, v16.4s, #0x4\n"
+ "fmla v30.4s, v21.4s, v31.4s\n"
+ "fmla v24.4s, v18.4s, v15.4s\n"
+ "fmla v14.4s, v16.4s, v27.4s\n"
+ "bgt 3b\n"
+ "mov x20, %x[res_ptr]\n"
+ "subs x27, x27, #0x4\n"
+ "add %x[res_ptr], %x[res_ptr], #0x10\n"
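+        // Write the sixteen accumulator rows, stepping res_stride per row.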
+ "str q2, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q10, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q12, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q28, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q11, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q13, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q22, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q23, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q25, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q5, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q7, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q4, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q6, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q30, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q24, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q14, [x20, #0x0]\n"
+ "bne 2b\n"
+ "mov x20, #0x4\n"
+ "sub x10, x10, #0x10\n"
+ "cmp x10, #0x10\n"
+ "mov %x[res_ptr], x26\n"
+ "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
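+        // The madd above advances a_ptr by 4 * x9, past the four row groups
+        // consumed by this iteration.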
+ "bge 1b\n"
+ "4:" // Row loop skip
+ "cbz x10, 9f\n"
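+        // Row tail: fewer than sixteen rows remain in x10; process them four
+        // at a time, with the cmp/ble ladder before label 8 suppressing the
+        // stores for a final partial group.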
+ "5:" // Row tail: Row loop
+ "add x24, %x[b_ptr], #0x8\n"
+ "mov x23, %x[nc]\n"
+ "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
+ "6:" // Row tail: Column loop
+ "movi v2.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "add x25, %x[a_ptr], #0x8\n"
+ "mov x21, %x[nb]\n"
+ "movi v12.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "7:" // Row tail: Block loop
+ "ldr q6, [x24, #0x0]\n"
+ "ldr q5, [x24, #0x10]\n"
+ "movi v17.16b, #0x4\n"
+ "movi v8.4s, #0x0\n"
+ "ldr q4, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "movi v27.4s, #0x0\n"
+ "movi v0.4s, #0x0\n"
+ "ldr q31, [x24, #0x20]\n"
+ "ldr q14, [x24, #0x30]\n"
+ "movi v29.4s, #0x0\n"
+ "movi v22.16b, #0xf0\n"
+ "ldr q11, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "sshl v21.16b, v6.16b, v17.16b\n"
+ "sshl v16.16b, v5.16b, v17.16b\n"
+ "ldr q20, [x25, #0x40]\n"
+ "ldr q26, [x25, #0x50]\n"
+ "and v6.16b, v6.16b, v22.16b\n"
+ "and v5.16b, v5.16b, v22.16b\n"
+ "ldr q25, [x25, #0x60]\n"
+ "ldr q3, [x25, #0x70]\n"
+ "sshl v19.16b, v31.16b, v17.16b\n"
+ "sshl v18.16b, v14.16b, v17.16b\n"
+ "ldr d17, [x25, #-0x8]\n"
+ ".inst 0x4e95a488 // smmla v8.4s, v4.16b, v21.16b\n"
+ ".inst 0x4e90a49b // smmla v27.4s, v4.16b, v16.16b\n"
+ "and v31.16b, v31.16b, v22.16b\n"
+ ".inst 0x4e95a5a0 // smmla v0.4s, v13.16b, v21.16b\n"
+ ".inst 0x4e90a5bd // smmla v29.4s, v13.16b, v16.16b\n"
+ "and v14.16b, v14.16b, v22.16b\n"
+ "sub x20, x24, #0x8\n"
+ "ldr d16, [x20, #0x0]\n"
+ "subs x21, x21, #0x1\n"
+ "add x25, x25, #0x88\n"
+ "fcvtl v17.4s, v17.4h\n"
+ "add x24, x24, #0x48\n"
+ ".inst 0x4e93a568 // smmla v8.4s, v11.16b, v19.16b\n"
+ ".inst 0x4e92a57b // smmla v27.4s, v11.16b, v18.16b\n"
+ ".inst 0x4e93a6e0 // smmla v0.4s, v23.16b, v19.16b\n"
+ ".inst 0x4e92a6fd // smmla v29.4s, v23.16b, v18.16b\n"
+ "fcvtl v16.4s, v16.4h\n"
+ ".inst 0x4e86a688 // smmla v8.4s, v20.16b, v6.16b\n"
+ ".inst 0x4e85a69b // smmla v27.4s, v20.16b, v5.16b\n"
+ "fmul v23.4s, v16.4s, v17.s[0]\n"
+ "fmul v21.4s, v16.4s, v17.s[1]\n"
+ "fmul v1.4s, v16.4s, v17.s[2]\n"
+ "fmul v20.4s, v16.4s, v17.s[3]\n"
+ ".inst 0x4e86a740 // smmla v0.4s, v26.16b, v6.16b\n"
+ ".inst 0x4e85a75d // smmla v29.4s, v26.16b, v5.16b\n"
+ ".inst 0x4e9fa728 // smmla v8.4s, v25.16b, v31.16b\n"
+ ".inst 0x4e8ea73b // smmla v27.4s, v25.16b, v14.16b\n"
+ ".inst 0x4e9fa460 // smmla v0.4s, v3.16b, v31.16b\n"
+ ".inst 0x4e8ea47d // smmla v29.4s, v3.16b, v14.16b\n"
+ "uzp1 v19.2d, v8.2d, v27.2d\n"
+ "uzp2 v18.2d, v8.2d, v27.2d\n"
+ "scvtf v19.4s, v19.4s, #0x4\n"
+ "uzp1 v17.2d, v0.2d, v29.2d\n"
+ "uzp2 v16.2d, v0.2d, v29.2d\n"
+ "scvtf v18.4s, v18.4s, #0x4\n"
+ "fmla v2.4s, v19.4s, v23.4s\n"
+ "scvtf v17.4s, v17.4s, #0x4\n"
+ "scvtf v16.4s, v16.4s, #0x4\n"
+ "fmla v10.4s, v18.4s, v21.4s\n"
+ "fmla v12.4s, v17.4s, v1.4s\n"
+ "fmla v28.4s, v16.4s, v20.4s\n"
+ "bgt 7b\n"
+ "mov x20, %x[res_ptr]\n"
+ "cmp x10, #0x1\n"
+ "str q2, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "ble 8f\n"
+ "cmp x10, #0x2\n"
+ "str q10, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "ble 8f\n"
+ "cmp x10, #0x3\n"
+ "str q12, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "ble 8f\n"
+ "str q28, [x20, #0x0]\n"
+ "8:" // Row tail: Accumulator store skip
+ "subs x23, x23, #0x4\n"
+ "add %x[res_ptr], %x[res_ptr], #0x10\n"
+ "bne 6b\n"
+ "subs x10, x10, #0x4\n"
+ "add %x[a_ptr], %x[a_ptr], x9\n"
+ "mov %x[res_ptr], x22\n"
+ "bgt 5b\n"
+ "9:" // Row tail: Row loop skip
+ : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
+ : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+ return;
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
float sumf[4][4];
int sumi;
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
- if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
+ if (ggml_cpu_get_sve_cnt() == QK8_0) {
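+        // Unlike the feature macros above, the SVE vector length is a property
+        // of the running machine, so this check stays at run time.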
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
UNUSED(blocklen);
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
- if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
- const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
+ const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
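+    // kvalues_iq4nl is the 16-entry IQ4_NL codebook; each 4-bit quant selects
+    // one int8 value from it via the tbl lookups below.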
- for (int y = 0; y < nr / 4; y++) {
- const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
- for (int x = 0; x < nc / ncols_interleaved; x++) {
- const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+ for (int y = 0; y < nr / 4; y++) {
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
- float32x4_t sumf[4];
- for (int m = 0; m < 4; m++) {
- sumf[m] = vdupq_n_f32(0);
- }
+ float32x4_t sumf[4];
+ for (int m = 0; m < 4; m++) {
+ sumf[m] = vdupq_n_f32(0);
+ }
- for (int l = 0; l < nb; l++) {
- float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *)a_ptr[l].d));
- float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
-
- int32x4_t sumi_0 = vdupq_n_s32(0);
- int32x4_t sumi_1 = vdupq_n_s32(0);
- int32x4_t sumi_2 = vdupq_n_s32(0);
- int32x4_t sumi_3 = vdupq_n_s32(0);
-
- for (int k = 0; k < 4; k++) {
- int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 16 * k + 0);
- int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16 * k + 64);
-
- uint8x16_t b = vld1q_u8(b_ptr[l].qs + 16 * k);
- int8x16_t b_hi = vqtbl1q_s8(kvalues, b >> 4);
- int8x16_t b_lo = vqtbl1q_s8(kvalues, b & 0xF);
-
- sumi_0 = vdotq_laneq_s32(sumi_0, b_lo, a_0, 0);
- sumi_1 = vdotq_laneq_s32(sumi_1, b_lo, a_0, 1);
- sumi_2 = vdotq_laneq_s32(sumi_2, b_lo, a_0, 2);
- sumi_3 = vdotq_laneq_s32(sumi_3, b_lo, a_0, 3);
- sumi_0 = vdotq_laneq_s32(sumi_0, b_hi, a_1, 0);
- sumi_1 = vdotq_laneq_s32(sumi_1, b_hi, a_1, 1);
- sumi_2 = vdotq_laneq_s32(sumi_2, b_hi, a_1, 2);
- sumi_3 = vdotq_laneq_s32(sumi_3, b_hi, a_1, 3);
- }
+ for (int l = 0; l < nb; l++) {
+ float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *)a_ptr[l].d));
+ float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
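+                    // Per-block scales: each interleaved block carries four
+                    // fp16 d values, widened here to fp32.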
- sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0));
- sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1));
- sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2));
- sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3));
+ int32x4_t sumi_0 = vdupq_n_s32(0);
+ int32x4_t sumi_1 = vdupq_n_s32(0);
+ int32x4_t sumi_2 = vdupq_n_s32(0);
+ int32x4_t sumi_3 = vdupq_n_s32(0);
+
+ for (int k = 0; k < 4; k++) {
+ int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 16 * k + 0);
+ int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16 * k + 64);
+
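+                        // Decode 32 packed 4-bit indices through the codebook:
+                        // low nibbles -> b_lo, high nibbles -> b_hi.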
+ uint8x16_t b = vld1q_u8(b_ptr[l].qs + 16 * k);
+ int8x16_t b_hi = vqtbl1q_s8(kvalues, b >> 4);
+ int8x16_t b_lo = vqtbl1q_s8(kvalues, b & 0xF);
+
+ sumi_0 = vdotq_laneq_s32(sumi_0, b_lo, a_0, 0);
+ sumi_1 = vdotq_laneq_s32(sumi_1, b_lo, a_0, 1);
+ sumi_2 = vdotq_laneq_s32(sumi_2, b_lo, a_0, 2);
+ sumi_3 = vdotq_laneq_s32(sumi_3, b_lo, a_0, 3);
+ sumi_0 = vdotq_laneq_s32(sumi_0, b_hi, a_1, 0);
+ sumi_1 = vdotq_laneq_s32(sumi_1, b_hi, a_1, 1);
+ sumi_2 = vdotq_laneq_s32(sumi_2, b_hi, a_1, 2);
+ sumi_3 = vdotq_laneq_s32(sumi_3, b_hi, a_1, 3);
}
- for (int m = 0; m < 4; m++) {
- vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]);
- }
+ sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0));
+ sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1));
+ sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2));
+ sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3));
+ }
+
+ for (int m = 0; m < 4; m++) {
+ vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]);
}
}
- return;
}
+ return;
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
{
float sumf[4][4];