metal : simplify f16 and f32 dequant kernels (llama/0)

author Georgi Gerganov <redacted>

Mon, 4 Nov 2024 11:49:34 +0000 (13:49 +0200)

committer Georgi Gerganov <redacted>

Fri, 15 Nov 2024 13:21:04 +0000 (15:21 +0200)
author Georgi Gerganov <redacted>
Mon, 4 Nov 2024 11:49:34 +0000 (13:49 +0200)
committer Georgi Gerganov <redacted>
Fri, 15 Nov 2024 13:21:04 +0000 (15:21 +0200)
diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal

index 3eb97663328d828b5911f561090a1bc69bf798b0..ff9d3749042be09bebf6bc55c857510eb8eba266 100644 (file)
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@@ -19,18 +19,12 @@ constexpr constant static float kvalues_iq4nl_f[16] = {
  // NOTE: this is not dequantizing - we are simply fitting the template
  template <typename type4x4>
  void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
-    float4x4 temp = *(((device float4x4 *)src));
-    for (int i = 0; i < 16; i++){
-        reg[i/4][i%4] = temp[i/4][i%4];
-    }
+    reg = (type4x4)(*src);
  }
  
  template <typename type4x4>
  void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
-    half4x4 temp = *(((device half4x4 *)src));
-    for (int i = 0; i < 16; i++){
-        reg[i/4][i%4] = temp[i/4][i%4];
-    }
+    reg = (type4x4)(*src);
  }
  
  template <typename type4x4>
author	Georgi Gerganov <redacted>
	Mon, 4 Nov 2024 11:49:34 +0000 (13:49 +0200)
committer	Georgi Gerganov <redacted>
	Fri, 15 Nov 2024 13:21:04 +0000 (15:21 +0200)