* logging: add proper checks for clang to avoid errors and warnings with __VA_ARGS__
* build: add CMake Presets and toolchain files for Windows ARM64
* matmul-int8: enable matmul-int8 with MSVC and fix Clang warnings
* ci: add support for optimized Windows ARM64 builds with MSVC and LLVM
* matmul-int8: fix typos in q8_0_q8_0 matmuls
Co-authored-by: Georgi Gerganov <redacted>
* matmul-int8: remove unnecessary casts in q8_0_q8_0
---------
Co-authored-by: Georgi Gerganov <redacted>
strategy:
matrix:
include:
- - build: 'rpc'
+ - build: 'rpc-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
- - build: 'noavx'
+ - build: 'noavx-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
- - build: 'avx2'
+ - build: 'avx2-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- - build: 'avx'
+ - build: 'avx-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
- - build: 'avx512'
+ - build: 'avx512-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- - build: 'clblast'
+ - build: 'clblast-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- - build: 'openblas'
+ - build: 'openblas-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- - build: 'kompute'
+ - build: 'kompute-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
- - build: 'vulkan'
+ - build: 'vulkan-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
- - build: 'arm64'
- defines: '-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+ - build: 'llvm-arm64'
+ defines: '-G Ninja -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+ - build: 'msvc-arm64'
+ defines: '-G Ninja -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
steps:
- name: Clone
- name: Clone Kompute submodule
id: clone_kompute
- if: ${{ matrix.build == 'kompute' }}
+ if: ${{ matrix.build == 'kompute-x64' }}
run: |
git submodule update --init kompute
- name: Download OpenCL SDK
id: get_opencl
- if: ${{ matrix.build == 'clblast' }}
+ if: ${{ matrix.build == 'clblast-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
mkdir $env:RUNNER_TEMP/opencl
- name: Download CLBlast
id: get_clblast
- if: ${{ matrix.build == 'clblast' }}
+ if: ${{ matrix.build == 'clblast-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
- name: Download OpenBLAS
id: get_openblas
- if: ${{ matrix.build == 'openblas' }}
+ if: ${{ matrix.build == 'openblas-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
- name: Install Vulkan SDK
id: get_vulkan
- if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
+ if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
+ - name: Install Ninja
+ id: install_ninja
+ run: |
+ choco install ninja
+
- name: Build
id: cmake_build
run: |
- mkdir build
- cd build
- cmake .. ${{ matrix.defines }}
- cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+ cmake -S . -B build ${{ matrix.defines }}
+ cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Add clblast.dll
id: add_clblast_dll
- if: ${{ matrix.build == 'clblast' }}
+ if: ${{ matrix.build == 'clblast-x64' }}
run: |
cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
- name: Add libopenblas.dll
id: add_libopenblas_dll
- if: ${{ matrix.build == 'openblas' }}
+ if: ${{ matrix.build == 'openblas-x64' }}
run: |
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
- name: Check AVX512F support
id: check_avx512f
- if: ${{ matrix.build == 'avx512' }}
+ if: ${{ matrix.build == 'avx512-x64' }}
continue-on-error: true
run: |
cd build
- name: Test
id: cmake_test
# not all machines have native AVX-512
- if: ${{ matrix.build != 'arm64' && matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
+ if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
run: |
cd build
ctest -L main -C Release --verbose --timeout 900
- name: Test (Intel SDE)
id: cmake_test_sde
- if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
+ if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
run: |
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
# the tar bundled with Windows fails to extract the SDE tar.xz archive
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
- 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
+ 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
- path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
- name: llama-bin-win-${{ matrix.build }}-x64.zip
+ path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
+ name: llama-bin-win-${{ matrix.build }}.zip
windows-latest-cmake-cuda:
runs-on: windows-latest
if (GGML_COMPILER_SUPPORT_DOTPROD)
add_compile_definitions(__ARM_FEATURE_DOTPROD)
endif ()
+ check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
+ if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
+ add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+ endif ()
+
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
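For reference, a minimal standalone program mirroring the new compile test (a sketch, not part of the patch): vmmlaq_s32 is the 8-bit integer matrix-multiply-accumulate intrinsic from the Arm i8mm extension, and if a toolchain compiles it, the mmla kernels guarded by __ARM_FEATURE_MATMUL_INT8 can be built.

    #include <arm_neon.h>

    int main(void) {
        int8x16_t a   = vdupq_n_s8(1);
        int8x16_t b   = vdupq_n_s8(2);
        int32x4_t acc = vdupq_n_s32(0);
        // vmmlaq_s32 computes a 2x2 int32 matrix product from 2x8 and 8x2
        // int8 operands and accumulates it into acc
        acc = vmmlaq_s32(acc, a, b);
        return vgetq_lane_s32(acc, 0) == 0; // consume the result
    }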
--- /dev/null
+{
+ "version": 4,
+ "configurePresets": [
+ {
+ "name": "base",
+ "hidden": true,
+ "generator": "Ninja",
+ "binaryDir": "${sourceDir}/build-${presetName}",
+ "cacheVariables": {
+ "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
+ "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
+ }
+ },
+
+ { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
+ { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+ { "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
+
+ {
+ "name": "arm64-windows-msvc", "hidden": true,
+ "architecture": { "value": "arm64", "strategy": "external" },
+ "toolset": { "value": "host=x86_64", "strategy": "external" },
+ "cacheVariables": {
+ "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
+ }
+ },
+
+ {
+ "name": "arm64-windows-llvm", "hidden": true,
+ "architecture": { "value": "arm64", "strategy": "external" },
+ "toolset": { "value": "host=x86_64", "strategy": "external" },
+ "cacheVariables": {
+ "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
+ }
+ },
+
+ { "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
+ { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
+ { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
+
+ { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
+ { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
+ { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }
+ ]
+}
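With CMake 3.23 or newer (required for version-4 preset files), configuring and building one of these presets from the repository root should reduce to, for example, `cmake --preset arm64-windows-llvm-release` followed by `cmake --build build-arm64-windows-llvm-release`, the build directory name coming from the `binaryDir` template above.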
--- /dev/null
+set( CMAKE_SYSTEM_NAME Windows )
+set( CMAKE_SYSTEM_PROCESSOR arm64 )
+
+set( target arm64-pc-windows-msvc )
+
+set( CMAKE_C_COMPILER clang )
+set( CMAKE_CXX_COMPILER clang++ )
+
+set( CMAKE_C_COMPILER_TARGET ${target} )
+set( CMAKE_CXX_COMPILER_TARGET ${target} )
+
+set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
+set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
+
+set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
+set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
--- /dev/null
+set( CMAKE_SYSTEM_NAME Windows )
+set( CMAKE_SYSTEM_PROCESSOR arm64 )
+
+set( target arm64-pc-windows-msvc )
+set( CMAKE_C_COMPILER_TARGET ${target} )
+set( CMAKE_CXX_COMPILER_TARGET ${target} )
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else
#define LOG_FLF_FMT "[%24s:%5ld][%24s] "
- #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
+ #define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
#endif
#else
#define LOG_FLF_FMT "%s"
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
- #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
+ #define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
#endif
#else
#define LOG_TEE_FLF_FMT "%s"
// Main LOG macro.
// behaves like printf, and supports arguments the exact same way.
//
-#ifndef _MSC_VER
+#if !defined(_MSC_VER) || defined(__clang__)
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
#else
#define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
// Secondary target can be changed just like LOG_TARGET
// by defining LOG_TEE_TARGET
//
-#ifndef _MSC_VER
+#if !defined(_MSC_VER) || defined(__clang__)
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
#else
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
#endif
// LOG macro variants with auto endline.
-#ifndef _MSC_VER
+#if !defined(_MSC_VER) || defined(__clang__)
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
#else
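The added `|| defined(__clang__)` matters for clang-cl, which defines _MSC_VER yet warns about the GNU `,##__VA_ARGS__` extension used in the MSVC branch (the same warning family the toolchain file silences with -Wno-gnu-zero-variadic-macro-arguments). A minimal sketch of the trailing-argument trick the non-MSVC branch relies on, with LOG_IMPL reduced to a stand-in for the real implementation in log.h:

    #include <stdio.h>

    // Stand-in only: the real LOG_IMPL in log.h also handles log targets.
    #define LOG_IMPL(fmt, ...) fprintf(stderr, fmt "%s", __VA_ARGS__)
    // Appending "" guarantees at least one variadic argument, so
    // LOG("plain\n") stays valid without the ',##__VA_ARGS__' extension.
    #define LOG(...) LOG_IMPL(__VA_ARGS__, "")

    int main(void) {
        LOG("value = %d\n", 42); // fprintf(stderr, "value = %d\n" "%s", 42, "")
        LOG("no args\n");        // fprintf(stderr, "no args\n" "%s", "")
        return 0;
    }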
#if defined(__ARM_FEATURE_MATMUL_INT8)
if (nrc == 2) {
const block_q4_0 * restrict vx0 = vx;
- const block_q4_0 * restrict vx1 = vx + bx;
-
+ const block_q4_0 * restrict vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
const block_q8_0 * restrict vy0 = vy;
- const block_q8_0 * restrict vy1 = vy + by;
+ const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
float32x4_t sumv0 = vdupq_n_f32(0.0f);
const int8x16_t y1_l = vld1q_s8(b_y1->qs);
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
- float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
- GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+ float32_t _scale[4] = { GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+ GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+
+ float32x4_t scale = vld1q_f32(_scale);
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
#if defined(__ARM_FEATURE_MATMUL_INT8)
if (nrc == 2) {
const block_q4_1 * restrict vx0 = vx;
- const block_q4_1 * restrict vx1 = vx + bx;
+ const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
const block_q8_1 * restrict vy0 = vy;
- const block_q8_1 * restrict vy1 = vy + by;
+ const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);
float32x4_t sumv0 = vdupq_n_f32(0.0f);
float32x4_t summs0 = vdupq_n_f32(0.0f);
const block_q8_1 * restrict b_y0 = &vy0[i];
const block_q8_1 * restrict b_y1 = &vy1[i];
- float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
- GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
- GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
- GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
- summs0 += summs_t;
+ float32_t summs_t[4] = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
+ GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
+ GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
+ GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
+ summs0 = vaddq_f32(summs0, vld1q_f32(summs_t));
const uint8x16_t m4b = vdupq_n_u8(0x0F);
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
// mmla into int32x4_t
- float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
- GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
- GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
- GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
+ float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
+ GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
+ GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
+ GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
+ float32x4_t scale = vld1q_f32(_scale);
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
- sumv2 = sumv2 + summs0;
+ sumv2 = vaddq_f32(sumv2, summs0);
vst1_f32(s, vget_low_f32(sumv2));
vst1_f32(s + bs, vget_high_f32(sumv2));
#if defined(__ARM_FEATURE_MATMUL_INT8)
if (nrc == 2) {
const block_q8_0 * restrict vx0 = vx;
- const block_q8_0 * restrict vx1 = vx + bx;
+ const block_q8_0 * restrict vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
const block_q8_0 * restrict vy0 = vy;
- const block_q8_0 * restrict vy1 = vy + by;
+ const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
float32x4_t sumv0 = vdupq_n_f32(0.0f);
const int8x16_t y1_l = vld1q_s8(b_y1->qs);
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
- float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
- GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+ float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+ GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+ float32x4_t scale = vld1q_f32(_scale);
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
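The pattern behind these q8_0 changes (and the q4 ones above): GCC and Clang accept brace initialization of NEON vector types and infix operators like `+` on them as GNU extensions, but MSVC's ARM64 headers do not, so values go through a scalar array and vld1q_f32, and sums through vaddq_f32. A hypothetical helper showing the same pattern in isolation:

    #include <arm_neon.h>

    // Hypothetical helper, not from the patch: build a float32x4_t from
    // four scalars portably. 'float32x4_t v = {a, b, c, d};' compiles
    // only as a GCC/Clang extension, so load from an array instead.
    static inline float32x4_t make_f32x4(float a, float b, float c, float d) {
        float32_t tmp[4] = { a, b, c, d };
        return vld1q_f32(tmp);
    }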