ggml : add coverage measurement for Clang, increase test coverage, F16 ggml_sum ...

author goerch <redacted>

Sun, 23 Jul 2023 16:35:43 +0000 (18:35 +0200)

committer GitHub <redacted>

Sun, 23 Jul 2023 16:35:43 +0000 (19:35 +0300)
author goerch <redacted>
Sun, 23 Jul 2023 16:35:43 +0000 (18:35 +0200)
committer GitHub <redacted>
Sun, 23 Jul 2023 16:35:43 +0000 (19:35 +0300)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml

index 8332deff48e86026c34d40e62e450c306d050694..e635257c10341ed22f4a99506896a161ebf3d576 100644 (file)
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -22,6 +22,12 @@ jobs:
      steps:
      - uses: actions/checkout@v2
  
+    - name: Dependencies for Ubuntu
+      if: matrix.os == 'ubuntu-latest'
+      run: |
+        sudo apt-get update
+        sudo apt-get install llvm
+
      - name: Set GGML_N_THREADS for Ubuntu
        run: echo "GGML_N_THREADS=2" >> $GITHUB_ENV
        if: matrix.os == 'ubuntu-latest'
@@ -35,7 +41,7 @@ jobs:
  
      - name: Configure CMake
        working-directory: ./build
-      run: cmake ..
+      run: cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DGGML_TEST_COVERAGE=ON ..
  
      - name: Build
        working-directory: ./build
@@ -44,3 +50,19 @@ jobs:
      - name: Test
        working-directory: ./build
        run: ctest --verbose --timeout 900
+
+    - name: Test Coverage for Ubuntu
+      if: matrix.os == 'ubuntu-latest'
+      working-directory: ./build
+      run: |
+        llvm-profdata merge -sparse tests/*.profraw -o ggml.profdata
+        llvm-cov      report ./bin/test-grad0 -instr-profile=ggml.profdata
+        llvm-cov      report ./bin/test-opt   -instr-profile=ggml.profdata
+
+    - name: Test Coverage for MacOS
+      if: matrix.os == 'macos-latest'
+      working-directory: ./build
+      run: |
+        xcrun llvm-profdata merge -sparse tests/*.profraw -o ggml.profdata
+        xcrun llvm-cov      report ./bin/test-grad0 -instr-profile=ggml.profdata
+        xcrun llvm-cov      report ./bin/test-opt   -instr-profile=ggml.profdata
diff --git a/CMakeLists.txt b/CMakeLists.txt

index 0b774a432e9f75146d67575d3c2642bfa191f9e4..af078e8d86f193ca48b5cad232ff99269ee5089c 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,6 +25,8 @@ option(GGML_SANITIZE_UNDEFINED      "ggml: enable undefined sanitizer" OFF)
  option(GGML_BUILD_TESTS             "ggml: build tests"    ${GGML_STANDALONE})
  option(GGML_BUILD_EXAMPLES          "ggml: build examples" ${GGML_STANDALONE})
  
+option(GGML_TEST_COVERAGE           "ggml: enable test coverage" OFF)
+
  option(GGML_PERF                    "ggml: enable perf timings"          OFF)
  option(GGML_NO_ACCELERATE           "ggml: disable Accelerate framework" OFF)
  option(GGML_OPENBLAS                "ggml: use OpenBLAS"                 OFF)
@@ -67,6 +69,17 @@ if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
      set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo")
  endif ()
  
+if (GGML_BUILD_TESTS)
+    if (GGML_TEST_COVERAGE)
+        if (CMAKE_C_COMPILER_ID MATCHES "Clang")
+            set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fprofile-instr-generate -fcoverage-mapping")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-instr-generate -fcoverage-mapping")
+        else()
+            message(WARNING "Test coverage is only supported for Clang")
+        endif()
+    endif()
+endif()
+
  add_subdirectory(src)
  
  if (GGML_BUILD_TESTS)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt

index 9862a0d33b4164a67516ec23e62cea49f34233f9..e37060794381db9a77eddb1d7f408e6cc9d6b88b 100644 (file)
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -118,7 +118,6 @@ else()
      endif()
  endif()
  
-
  # ggml
  
  set(TARGET ggml)
@@ -183,6 +182,7 @@ if (GGML_CLBLAST)
          message(WARNING "clBLAST not found")
      endif()
  endif()
+
  if (GGML_CUBLAS)
      cmake_minimum_required(VERSION 3.17)
  
diff --git a/src/ggml.c b/src/ggml.c

index 333e88450ae9f960f8818556bf1eab9d27095d0d..90f32a57500b29c9a89d11a0147b29b210186bfb 100644 (file)
--- a/src/ggml.c
+++ b/src/ggml.c
@@ -3605,7 +3605,7 @@ inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
  #endif
  }
  
-inline static void ggml_vec_sum_ggf(const int n, ggml_float * s, const float * x) {
+inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
      ggml_float sum = 0.0;
      for (int i = 0; i < n; ++i) {
          sum += (ggml_float)x[i];
@@ -3613,6 +3613,14 @@ inline static void ggml_vec_sum_ggf(const int n, ggml_float * s, const float * x
      *s = sum;
  }
  
+inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
+    float sum = 0.0f;
+    for (int i = 0; i < n; ++i) {
+        sum += GGML_FP16_TO_FP32(x[i]);
+    }
+    *s = sum;
+}
+
  inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
  #ifndef GGML_USE_ACCELERATE
      float max = -INFINITY;
@@ -9298,7 +9306,7 @@ static void ggml_compute_forward_sum_f32(
      for (int64_t i03 = 0; i03 < ne03; i03++) {
          for (int64_t i02 = 0; i02 < ne02; i02++) {
              for (int64_t i01 = 0; i01 < ne01; i01++) {
-                ggml_vec_sum_ggf(ne00,
+                ggml_vec_sum_f32_ggf(ne00,
                          &row_sum,
                          (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
                  sum += row_sum;
@@ -9308,6 +9316,38 @@ static void ggml_compute_forward_sum_f32(
      ((float *) dst->data)[0] = sum;
  }
  
+static void ggml_compute_forward_sum_f16(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+          struct ggml_tensor * dst) {
+    assert(params->ith == 0);
+    assert(ggml_is_scalar(dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    assert(src0->nb[0] == sizeof(ggml_fp16_t));
+
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb);
+
+    float sum = 0;
+    float row_sum = 0;
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = 0; i01 < ne01; i01++) {
+                ggml_vec_sum_f16_ggf(ne00,
+                    &row_sum,
+                    (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
+                sum += row_sum;
+            }
+        }
+    }
+    ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum);
+}
+
  static void ggml_compute_forward_sum(
          const struct ggml_compute_params * params,
          const struct ggml_tensor * src0,
@@ -9317,6 +9357,10 @@ static void ggml_compute_forward_sum(
              {
                  ggml_compute_forward_sum_f32(params, src0, dst);
              } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_sum_f16(params, src0, dst);
+            } break;
          default:
              {
                  GGML_ASSERT(false);
@@ -15159,7 +15203,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
          case GGML_OP_RELU:
              {
                  if (src0->grad) {
-                    src0->grad = ggml_sub_impl(ctx,
+                    src0->grad = ggml_add_impl(ctx,
                              src0->grad,
                              ggml_mul(ctx,
                                  ggml_step(ctx, src0),
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt

index 25cec2fac2ace744abb52b68db159680fecd2022..af99e6a8a1346cacde4c94b5aedcb0565cdcc4be 100644 (file)
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -26,68 +26,68 @@ endif()
  
  if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
      message(STATUS "ARM detected")
-    #set(GGML_C_FLAGS "${GGML_C_FLAGS} -mcpu=apple-m1")
+    #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=apple-m1")
  elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
      message(STATUS "PPC64 detected")
      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpower9-vector")
  else()
      message(STATUS "x86 detected")
-    #set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx -mavx2 -mfma -mf16c")
+    #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c")
      if (UNAME_S MATCHES "Darwin")
          execute_process(COMMAND sysctl machdep.cpu.features OUTPUT_VARIABLE AVX1_M)
          if (AVX1_M MATCHES "AVX1.0")
-            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx")
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
          endif()
          execute_process(COMMAND sysctl machdep.cpu.leaf7_features OUTPUT_VARIABLE AVX2_M)
          if (AVX2_M MATCHES "AVX2")
-            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx2")
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
          endif()
          if (AVX1_M MATCHES "FMA")
-            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mfma")
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
          endif()
-        set(GGML_C_FLAGS "${GGML_C_FLAGS} -mf16c")
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
      elseif (UNAME_S MATCHES "Linux")
          message(STATUS "Linux detected")
          execute_process(COMMAND grep "avx " /proc/cpuinfo OUTPUT_VARIABLE AVX1_M)
          if (AVX1_M MATCHES "avx")
-            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx")
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
          endif()
          execute_process(COMMAND grep "avx2 " /proc/cpuinfo OUTPUT_VARIABLE AVX2_M)
          if (AVX2_M MATCHES "avx2")
-            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx2")
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
          endif()
          execute_process(COMMAND grep "fma " /proc/cpuinfo OUTPUT_VARIABLE FMA_M)
          if (FMA_M MATCHES "fma")
-            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mfma")
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
          endif()
          execute_process(COMMAND grep "f16c " /proc/cpuinfo OUTPUT_VARIABLE F16C_M)
          if (F16C_M MATCHES "f16c")
-            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mf16c")
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
          endif()
          execute_process(COMMAND grep "sse3 " /proc/cpuinfo OUTPUT_VARIABLE SSE3_M)
          if (SSE3_M MATCHES "sse3")
-            set(GGML_C_FLAGS "${GGML_C_FLAGS} -msse3")
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3")
          endif()
      elseif (UNAME_S MATCHES "Haiku")
          message(STATUS "Haiku detected")
         execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX " OUTPUT_VARIABLE AVX1_M)
          if (AVX1_M MATCHES "avx")
-            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx")
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
          endif()
         execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX2 " OUTPUT_VARIABLE AVX2_M)
          if (AVX2_M MATCHES "avx2")
-            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx2")
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
          endif()
         execute_process(COMMAND sysinfo -cpu COMMAND grep "FMA " OUTPUT_VARIABLE FMA_M)
          if (FMA_M MATCHES "fma")
-            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mfma")
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
          endif()
         execute_process(COMMAND sysinfo -cpu COMMAND grep "F16C " OUTPUT_VARIABLE F16C_M)
          if (F16C_M MATCHES "f16c")
-            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mf16c")
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
          endif()
      else()
-        set(GGML_C_FLAGS  "${GGML_C_FLAGS} -mfma -mf16c -mavx -mavx2")
+        set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} -mfma -mf16c -mavx -mavx2")
      endif()
  endif()
  
@@ -142,8 +142,6 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86")
      set(TEST_TARGET test-vec1)
      add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
      target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-    #set_target_properties(${TEST_TARGET} PROPERTIES COMPILE_FLAGS "-mavx -mavx2 -mfma -mf16c")
-    set_target_properties(${TEST_TARGET} PROPERTIES COMPILE_FLAGS ${GGML_C_FLAGS})
  endif()
  
  #
@@ -161,6 +159,7 @@ set(TEST_TARGET test-grad0)
  add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
  target_link_libraries(${TEST_TARGET} PRIVATE ggml)
  add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
+set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
  
  #
  # test-opt
@@ -169,6 +168,7 @@ set(TEST_TARGET test-opt)
  add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
  target_link_libraries(${TEST_TARGET} PRIVATE ggml)
  add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
+set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
  
  #
  # test-quantize-fns
@@ -177,6 +177,7 @@ set(TEST_TARGET test-quantize-fns)
  add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
  target_link_libraries(${TEST_TARGET} PRIVATE ggml)
  add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
+set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
  
  #
  # test-quantize-perf
@@ -185,6 +186,7 @@ set(TEST_TARGET test-quantize-perf)
  add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
  target_link_libraries(${TEST_TARGET} PRIVATE ggml)
  add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
+set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
  
  #
  # test-mul-mat0
@@ -194,6 +196,7 @@ add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
  target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS})
  target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS})
  add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
+set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
  
  #
  # test-mul-mat1 (arm)
@@ -214,6 +217,7 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" AND NOT GGML_NO_ACCELERATE)
      target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS})
      target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS})
      add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> 128 128 128)
+    set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
  endif()
  
  #
@@ -223,6 +227,7 @@ set(TEST_TARGET test-mul-mat2)
  add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
  target_link_libraries(${TEST_TARGET} PRIVATE ggml)
  add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
+set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
  
  #
  # test0
@@ -231,6 +236,7 @@ set(TEST_TARGET test0)
  add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
  target_link_libraries(${TEST_TARGET} PRIVATE ggml)
  add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
+set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
  
  #
  # test1
@@ -242,6 +248,7 @@ if (MSVC)
      target_link_options(${TEST_TARGET} PRIVATE "/STACK: 8388608") # 8MB
  endif()
  add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
+set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
  
  #
  # test2
@@ -250,6 +257,7 @@ set(TEST_TARGET test2)
  add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
  target_link_libraries(${TEST_TARGET} PRIVATE ggml)
  add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
+set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
  
  #
  # test3
@@ -258,6 +266,7 @@ set(TEST_TARGET test3)
  add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
  target_link_libraries(${TEST_TARGET} PRIVATE ggml)
  add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
+set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
  
  #
  # test-pool
@@ -266,6 +275,7 @@ set(TEST_TARGET test-pool)
  add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
  target_link_libraries(${TEST_TARGET} PRIVATE ggml)
  add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
+set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
  
  #
  # test-svd0 (arm/x86)
diff --git a/tests/test-grad0.c b/tests/test-grad0.c

index 01467bc184372e5c36c069c22a6791d4ec6f2163..7e03b5426d57ca1a6ca441601e1b41610c7dfd15 100644 (file)
--- a/tests/test-grad0.c
+++ b/tests/test-grad0.c
@@ -64,7 +64,7 @@ void get_random_dims(int64_t * dims, int ndims) {
      }
  }
  
-struct ggml_tensor * get_random_tensor(
+struct ggml_tensor * get_random_tensor_f32(
          struct ggml_context * ctx0,
          int ndims,
          int64_t ne[],
@@ -112,7 +112,55 @@ struct ggml_tensor * get_random_tensor(
      return result;
  }
  
-struct ggml_tensor * get_random_tensor_int(
+struct ggml_tensor * get_random_tensor_f16(
+        struct ggml_context * ctx0,
+        int ndims,
+        int64_t ne[],
+        float fmin,
+        float fmax) {
+    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F16, ndims, ne);
+
+    switch (ndims) {
+        case 1:
+            for (int i0 = 0; i0 < ne[0]; i0++) {
+                ((ggml_fp16_t *)result->data)[i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
+            }
+            break;
+        case 2:
+            for (int i1 = 0; i1 < ne[1]; i1++) {
+                for (int i0 = 0; i0 < ne[0]; i0++) {
+                    ((ggml_fp16_t *)result->data)[i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
+                }
+            }
+            break;
+        case 3:
+            for (int i2 = 0; i2 < ne[2]; i2++) {
+                for (int i1 = 0; i1 < ne[1]; i1++) {
+                    for (int i0 = 0; i0 < ne[0]; i0++) {
+                        ((ggml_fp16_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
+                    }
+                }
+            }
+            break;
+        case 4:
+            for (int i3 = 0; i3 < ne[3]; i3++) {
+                for (int i2 = 0; i2 < ne[2]; i2++) {
+                    for (int i1 = 0; i1 < ne[1]; i1++) {
+                        for (int i0 = 0; i0 < ne[0]; i0++) {
+                            ((ggml_fp16_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
+                        }
+                    }
+                }
+            }
+            break;
+        default:
+            assert(false);
+    };
+
+    return result;
+}
+
+struct ggml_tensor * get_random_tensor_i32(
          struct ggml_context * ctx0,
          int ndims,
          int64_t ne[],
@@ -161,20 +209,39 @@ struct ggml_tensor * get_random_tensor_int(
  }
  
  float get_element(const struct ggml_tensor * t, int idx) {
-    if (t->type == GGML_TYPE_F32) {
-        return ((float *)t->data)[idx];
-    }
-
-    if (t->type == GGML_TYPE_I32) {
-        return ((int32_t *)t->data)[idx];
+    switch (t->type) {
+        case GGML_TYPE_F32:
+            return ((float *)t->data)[idx];
+        case GGML_TYPE_I32:
+            return ((int32_t *)t->data)[idx];
+        case GGML_TYPE_F16:
+            return ggml_fp16_to_fp32(((ggml_fp16_t *)t->data)[idx]);
+        case GGML_TYPE_I16:
+            return ((int16_t *)t->data)[idx];
+        default:
+            assert(false);
      }
-
-    assert(false);
      return INFINITY;
  }
  
  void set_element(struct ggml_tensor * t, int idx, float value) {
-    ((float *)t->data)[idx] = value;
+    switch (t->type) {
+        case GGML_TYPE_F32:
+            ((float *)t->data)[idx] = value;
+            break;
+        case GGML_TYPE_I32:
+            ((int32_t *)t->data)[idx] = value;
+            break;
+        case GGML_TYPE_F16:
+            ((ggml_fp16_t*)t->data)[idx] = ggml_fp32_to_fp16(value);
+            break;
+        case GGML_TYPE_I16:
+            ((int16_t *)t->data)[idx] = value;
+            break;
+        default:
+            assert(false);
+    }
+    ;
  }
  
  void print_elements(const char* label, const struct ggml_tensor * t) {
@@ -392,19 +459,35 @@ int main(int argc, const char ** argv) {
  
          struct ggml_tensor * x[MAX_NARGS];
  
-        // add
+        // add f32
          {
              const int nargs = 2;
  
              for (int ndims = 1; ndims <= 4; ++ndims) {
                  for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                      ggml_set_param(ctx0, x[i]);
                  }
  
                  struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
  
-                check_gradient("add", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f);
+                check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f);
+            }
+        }
+
+        // add f16
+        {
+            const int nargs = 2;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
+
+                check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f);
              }
          }
  
@@ -414,7 +497,7 @@ int main(int argc, const char ** argv) {
  
              for (int ndims = 1; ndims <= 4; ++ndims) {
                  for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                      ggml_set_param(ctx0, x[i]);
                  }
  
@@ -430,7 +513,7 @@ int main(int argc, const char ** argv) {
  
              for (int ndims = 1; ndims <= 4; ++ndims) {
                  for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                      ggml_set_param(ctx0, x[i]);
                  }
  
@@ -446,7 +529,7 @@ int main(int argc, const char ** argv) {
  
              for (int ndims = 1; ndims <= 4; ++ndims) {
                  for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, 0.5f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 0.5f, 1.0f);
                      ggml_set_param(ctx0, x[i]);
                  }
  
@@ -462,7 +545,7 @@ int main(int argc, const char ** argv) {
  
              for (int ndims = 1; ndims <= 2; ++ndims) {
                  for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                      ggml_set_param(ctx0, x[i]);
                  }
  
@@ -478,7 +561,7 @@ int main(int argc, const char ** argv) {
  
              for (int ndims = 1; ndims <= 2; ++ndims) {
                  for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
                      ggml_set_param(ctx0, x[i]);
                  }
  
@@ -494,7 +577,7 @@ int main(int argc, const char ** argv) {
  
              for (int ndims = 1; ndims <= 2; ++ndims) {
                  for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
                      ggml_set_param(ctx0, x[i]);
                  }
  
@@ -510,7 +593,7 @@ int main(int argc, const char ** argv) {
  
              for (int ndims = 1; ndims <= 2; ++ndims) {
                  for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                      ggml_set_param(ctx0, x[i]);
                  }
  
@@ -527,7 +610,7 @@ int main(int argc, const char ** argv) {
  
              for (int ndims = 1; ndims <= 4; ++ndims) {
                  for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                      ggml_set_param(ctx0, x[i]);
                  }
  
@@ -537,6 +620,40 @@ int main(int argc, const char ** argv) {
              }
          }
  
+        // mean, not yet fully implemented
+        if(0)
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0]));
+
+                check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
+        // argmax
+        if (0)
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0]));
+
+                check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
          // repeat
          {
              int64_t ne2[4];
@@ -549,15 +666,36 @@ int main(int argc, const char ** argv) {
  
              const int nargs = 1;
              for (int ndims = 1; ndims <= 2; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
-                x[1] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                  ggml_set_param(ctx0, x[0]);
  
                  struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1]))));
  
                  check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
              }
+        }
  
+        // repeat back
+        {
+            int64_t ne2[4];
+            get_random_dims(ne2, 4);
+
+            ne2[0] = ne[0] * ne2[0];
+            ne2[1] = ne[1] * ne2[1];
+            ne2[2] = 1;
+            ne2[3] = 1;
+
+            const int nargs = 1;
+            for (int ndims = 1; ndims <= 2; ++ndims) {
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
+                ggml_set_param(ctx0, x[0]);
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0]))));
+
+                check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
+            }
          }
  
          // abs (finite differences do not work)
@@ -566,7 +704,7 @@ int main(int argc, const char ** argv) {
  
          //    for (int ndims = 1; ndims <= 2; ++ndims) {
          //        for (int i = 0; i < nargs; ++i) {
-        //            x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+        //            x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
          //            ggml_set_param(ctx0, x[i]);
          //        }
  
@@ -576,17 +714,82 @@ int main(int argc, const char ** argv) {
          //    }
          //}
  
+        // sgn
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0]));
+
+                check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
+        // neg
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0]));
+
+                check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
+        // step
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0]));
+
+                check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
+        // tanh, not yet fully implemented
+        if(0)
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0]));
+
+                check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
          // mul_mat
          {
              const int nargs = 2;
  
              for (int ndims = 2; ndims <= 2; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                  {
                      int64_t ne2[4];
                      get_random_dims(ne2, 4);
                      ne2[0] = ne[0];
-                    x[1] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
+                    x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                  }
  
                  ggml_set_param(ctx0, x[0]);
@@ -602,13 +805,63 @@ int main(int argc, const char ** argv) {
              }
          }
  
+        // elu, not yet fully implemented
+        if(0)
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0]));
+
+                check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
+        // relu
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0]));
+
+                check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // gelu, not yet fully implemented
+        if(0)
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0]));
+
+                check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
          // silu
          {
              const int nargs = 1;
  
              for (int ndims = 1; ndims <= 2; ++ndims) {
                  for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                      ggml_set_param(ctx0, x[i]);
                  }
  
@@ -629,7 +882,7 @@ int main(int argc, const char ** argv) {
  
              for (int ndims = 1; ndims <= 2; ++ndims) {
                  for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                      ggml_set_param(ctx0, x[i]);
                  }
  
@@ -647,8 +900,8 @@ int main(int argc, const char ** argv) {
              ne2[0] = 1;
  
              for (int ndims = 1; ndims <= 2; ++ndims) {
-                x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  
                  ggml_set_param(ctx0, x[0]);
                  ggml_set_param(ctx0, x[1]);
@@ -659,20 +912,37 @@ int main(int argc, const char ** argv) {
              }
          }
  
-        // cpy
+        // cpy f32
          {
              const int nargs = 2;
  
              for (int ndims = 1; ndims <= 2; ++ndims) {
                  for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                      ggml_set_param(ctx0, x[i]);
                  }
                  // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
  
                  struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
  
-                check_gradient("cpy", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // cpy f16
+        {
+            const int nargs = 2;
+
+            for (int ndims = 1; ndims <= 2; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
+
+                check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
              }
          }
  
@@ -689,8 +959,8 @@ int main(int argc, const char ** argv) {
                  for (int i = 0; i < ndims; ++i) {
                      ne2[0] *= ne[i];
                  }
-                x[0] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
-                x[1] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                  ggml_set_param(ctx0, x[0]);
  
  
@@ -712,8 +982,8 @@ int main(int argc, const char ** argv) {
                  for (int i = 0; i < ndims; ++i) {
                      ne2[0] *= ne[i];
                  }
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
-                x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                  ggml_set_param(ctx0, x[0]);
  
  
@@ -729,7 +999,7 @@ int main(int argc, const char ** argv) {
              const int nargs = 2;
              for (int ndims = 1; ndims <= 4; ++ndims) {
  
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                  ggml_set_param(ctx0, x[0]);
  
                  get_random_dims(ne2, 1);
@@ -737,7 +1007,7 @@ int main(int argc, const char ** argv) {
                      get_random_dims(ne2, 1);
                  }
  
-                x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                  ggml_set_param(ctx0, x[1]);
  
                  const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
@@ -758,7 +1028,7 @@ int main(int argc, const char ** argv) {
              const int nargs = 2;
              for (int ndims = 2; ndims <= 4; ++ndims) {
  
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                  ggml_set_param(ctx0, x[0]);
  
                  get_random_dims(ne2, 2);
@@ -766,7 +1036,7 @@ int main(int argc, const char ** argv) {
                      get_random_dims(ne2, 2);
                  }
  
-                x[1] = get_random_tensor(ctx0, 2, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
                  ggml_set_param(ctx0, x[1]);
  
                  max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
@@ -790,7 +1060,7 @@ int main(int argc, const char ** argv) {
              const int nargs = 2;
              for (int ndims = 3; ndims <= 4; ++ndims) {
  
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                  ggml_set_param(ctx0, x[0]);
  
                  get_random_dims(ne2, 3);
@@ -798,7 +1068,7 @@ int main(int argc, const char ** argv) {
                      get_random_dims(ne2, 3);
                  }
  
-                x[1] = get_random_tensor(ctx0, 3, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 3, ne2, -1.0f, 1.0f);
                  ggml_set_param(ctx0, x[1]);
  
                  max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
@@ -824,7 +1094,7 @@ int main(int argc, const char ** argv) {
              const int nargs = 2;
              for (int ndims = 4; ndims <= 4; ++ndims) {
  
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                  ggml_set_param(ctx0, x[0]);
  
                  get_random_dims(ne2, 4);
@@ -832,7 +1102,7 @@ int main(int argc, const char ** argv) {
                      get_random_dims(ne2, 4);
                  }
  
-                x[1] = get_random_tensor(ctx0, 4, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
                  ggml_set_param(ctx0, x[1]);
  
                  max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
@@ -858,7 +1128,7 @@ int main(int argc, const char ** argv) {
              const int nargs = 2;
              for (int ndims = 1; ndims <= 4; ++ndims) {
  
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                  ggml_set_param(ctx0, x[0]);
  
                  get_random_dims(ne2, 1);
@@ -866,7 +1136,7 @@ int main(int argc, const char ** argv) {
                      get_random_dims(ne2, 1);
                  }
  
-                x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                  ggml_set_param(ctx0, x[1]);
  
                  const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
@@ -887,7 +1157,7 @@ int main(int argc, const char ** argv) {
              const int nargs = 1;
              for (int ndims = 2; ndims <= 4; ++ndims) {
  
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                  ggml_set_param(ctx0, x[0]);
  
                  get_random_dims(ne2, 2);
@@ -895,7 +1165,7 @@ int main(int argc, const char ** argv) {
                      get_random_dims(ne2, 2);
                  }
  
-                x[1] = get_random_tensor(ctx0, 2, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
                  ggml_set_param(ctx0, x[1]);
  
                  max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
@@ -915,7 +1185,7 @@ int main(int argc, const char ** argv) {
              const int nargs = 1;
              for (int ndims = 1; ndims <= 4; ++ndims) {
  
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  
                  ggml_set_param(ctx0, x[0]);
  
@@ -941,7 +1211,7 @@ int main(int argc, const char ** argv) {
              const int nargs = 1;
              for (int ndims = 1; ndims <= 4; ++ndims) {
  
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  
                  get_random_dims(ne2, 2);
                  while (ne2[0]*ne2[1] > ggml_nelements(x[0])) {
@@ -971,7 +1241,7 @@ int main(int argc, const char ** argv) {
              const int nargs = 1;
              for (int ndims = 1; ndims <= 4; ++ndims) {
  
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  
                  get_random_dims(ne2, 3);
                  while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) {
@@ -1010,7 +1280,7 @@ int main(int argc, const char ** argv) {
                  for (int i=ndims; i<4; ++i) {
                      ne2[i] = 1;
                  }
-                x[0] = get_random_tensor(ctx0, 4, ne2, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
  
                  ggml_set_param(ctx0, x[0]);
  
@@ -1043,7 +1313,7 @@ int main(int argc, const char ** argv) {
                  for (int i=ndims; i<4; ++i) {
                      ne2[i] = 1;
                  }
-                x[0] = get_random_tensor(ctx0, 4, ne2, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
  
                  ggml_set_param(ctx0, x[0]);
  
@@ -1060,8 +1330,8 @@ int main(int argc, const char ** argv) {
              int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1};
              const int nargs = 1;
              const int ndims = 2;
-            x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
-            x[1] = get_random_tensor_int(ctx0, 1, ne3, 0, ne2[1]);
+            x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
+            x[1] = get_random_tensor_i32(ctx0, 1, ne3, 0, ne2[1]);
  
              ggml_set_param(ctx0, x[0]);
  
@@ -1075,7 +1345,7 @@ int main(int argc, const char ** argv) {
              const int nargs = 1;
              const int ndims = 2;
  
-            x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
              ggml_set_param(ctx0, x[0]);
  
              int n_past = irand(ne[0]);
@@ -1090,7 +1360,7 @@ int main(int argc, const char ** argv) {
              const int nargs = 1;
              const int ndims = 2;
  
-            x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
              ggml_set_param(ctx0, x[0]);
  
              int n_past = irand(ne[0]);
@@ -1108,7 +1378,7 @@ int main(int argc, const char ** argv) {
              get_random_dims(ne2, 4);
  
              for (int ndims = 1; ndims <= 3; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                  ggml_set_param(ctx0, x[0]);
  
                  struct ggml_tensor * f = ggml_sum(ctx0, ggml_soft_max(ctx0, x[0]));
@@ -1125,8 +1395,8 @@ int main(int argc, const char ** argv) {
              get_random_dims(ne2, 4);
  
              for (int ndims = 1; ndims <= 3; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
-                x[1] = get_random_tensor(ctx0, ndims, ne2, 0.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
                  ggml_set_param(ctx0, x[0]);
  
                  struct ggml_tensor * f = ggml_sum(ctx0, ggml_cross_entropy_loss(ctx0, x[0], x[1]));
@@ -1136,7 +1406,41 @@ int main(int argc, const char ** argv) {
              }
          }
  
-        // rope
+        // rope f32
+        {
+            const int nargs = 1;
+
+            int64_t ne2[4];
+            get_random_dims(ne2, 4);
+            ne2[0] += ne2[0] % 2;
+            int n_rot = ne2[0];
+
+            for (int ndims = 3; ndims <= 4; ++ndims) {
+                for (int mode = 0; mode < 4; ++mode) {
+                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
+                        x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
+
+                        ggml_set_param(ctx0, x[0]);
+
+                        const bool skip_past = (mode & 1);
+                        if (skip_past) {
+                            // we have no past, so this would have to work on uninitialized memory.
+                            // we only test the gradients here;
+                            // skip_past should have no influence on gradient computation.
+                            // so when other modes work, we assume that this does as well.
+                            continue;
+                        }
+
+                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0));
+
+                        GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
+                        check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
+                    }
+                }
+            }
+        }
+
+        // rope f16
          {
              const int nargs = 1;
  
@@ -1148,7 +1452,7 @@ int main(int argc, const char ** argv) {
              for (int ndims = 3; ndims <= 4; ++ndims) {
                  for (int mode = 0; mode < 4; ++mode) {
                      for (int n_past = 1; n_past < ne2[2]; ++n_past) {
-                        x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
+                        x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f);
  
                          ggml_set_param(ctx0, x[0]);
  
@@ -1163,14 +1467,55 @@ int main(int argc, const char ** argv) {
  
                          struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0));
  
-                        GGML_PRINT_DEBUG("rope: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
-                        check_gradient("rope", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
+                        GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
+                        check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
                      }
                  }
              }
          }
  
-        // flash_attn
+        // flash_attn f32
+        {
+            const int nargs = 3;
+
+            int64_t ne2[4];
+
+            get_random_dims(ne2, 4);
+            int64_t D = ne2[0];
+            int64_t N = ne2[1];
+            int64_t M = ne2[2] + N;
+            int64_t B = ne2[3];
+
+            for (int masked = 0; masked <= 1; ++masked) {
+                for (int ndims = 2; ndims <= 4; ++ndims) {
+                    int64_t neq[4] = { D, N, B, ne[3] };
+                    int64_t nek[4] = { D, M, B, ne[3] };
+                    int64_t nev[4] = { M, D, B, ne[3] };
+                    if (ndims == 2) {
+                        neq[2] = 1; neq[3] = 1;
+                        nek[2] = 1; nek[3] = 1;
+                        nev[2] = 1; nev[3] = 1;
+                    } else if (ndims == 3) {
+                        neq[3] = 1;
+                        nek[3] = 1;
+                        nev[3] = 1;
+                    }
+                    x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
+                    x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
+                    x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
+                    ggml_set_param(ctx0, x[0]);
+                    ggml_set_param(ctx0, x[1]);
+                    ggml_set_param(ctx0, x[2]);
+
+                    struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
+
+                    check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
+                }
+            }
+        }
+
+        // flash_attn f16, not yet fully implemented
+        if(0)
          {
              const int nargs = 3;
  
@@ -1196,16 +1541,16 @@ int main(int argc, const char ** argv) {
                          nek[3] = 1;
                          nev[3] = 1;
                      }
-                    x[0] = get_random_tensor(ctx0, ndims, neq, -0.1250f, 0.1250f);
-                    x[1] = get_random_tensor(ctx0, ndims, nek, -0.1250f, 0.1250f);
-                    x[2] = get_random_tensor(ctx0, ndims, nev, -0.1250f, 0.1250f);
+                    x[0] = get_random_tensor_f16(ctx0, ndims, neq, -0.1250f, 0.1250f);
+                    x[1] = get_random_tensor_f16(ctx0, ndims, nek, -0.1250f, 0.1250f);
+                    x[2] = get_random_tensor_f16(ctx0, ndims, nev, -0.1250f, 0.1250f);
                      ggml_set_param(ctx0, x[0]);
                      ggml_set_param(ctx0, x[1]);
                      ggml_set_param(ctx0, x[2]);
  
                      struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
  
-                    check_gradient("flash_attn", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
+                    check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
                  }
              }
          }
diff --git a/tests/test-opt.c b/tests/test-opt.c

index 5531814c48c997e1d2e0d5e563ffa9235877fbf1..4eef62bcfb96b11eec2b8cf4845d577db93525f7 100644 (file)
--- a/tests/test-opt.c
+++ b/tests/test-opt.c
@@ -125,9 +125,9 @@ int main(void) {
      };
      struct ggml_context * ctx = ggml_init(params);
  
-    int64_t ne1[4] = {4, 1024, 1, 1};
-    int64_t ne2[4] = {4, 2048, 1, 1};;
-    int64_t ne3[4] = {1024, 2048, 1, 1};
+    int64_t ne1[4] = {4, 128, 1, 1};
+    int64_t ne2[4] = {4, 256, 1, 1};;
+    int64_t ne3[4] = {128, 256, 1, 1};
  
      struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1);
      struct ggml_tensor * b = get_random_tensor(ctx, 2, ne2, -1, +1);
author	goerch <redacted>
	Sun, 23 Jul 2023 16:35:43 +0000 (18:35 +0200)
committer	GitHub <redacted>
	Sun, 23 Jul 2023 16:35:43 +0000 (19:35 +0300)
.github/workflows/ci.yml		patch \| blob \| history
CMakeLists.txt		patch \| blob \| history
src/CMakeLists.txt		patch \| blob \| history
src/ggml.c		patch \| blob \| history
tests/CMakeLists.txt		patch \| blob \| history
tests/test-grad0.c		patch \| blob \| history
tests/test-opt.c		patch \| blob \| history