struct ggml_object;
struct ggml_context;
+ struct ggml_cgraph;
// NOTE: always add types at the end of the enum to keep backward compatibility
enum ggml_type {
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
};
- // ggml object
- struct ggml_object {
- size_t offs;
- size_t size;
-
- struct ggml_object * next;
-
- enum ggml_object_type type;
-
- char padding[4];
- };
-
- static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
-
// n-dimensional tensor
struct ggml_tensor {
- enum ggml_type type;
+ enum ggml_type type;
GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
struct ggml_threadpool; // forward declaration, see ggml.c
- typedef struct ggml_threadpool * ggml_threadpool_t;
+ typedef struct ggml_threadpool * ggml_threadpool_t;
// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
void * abort_callback_data;
};
- enum ggml_cgraph_eval_order {
- GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
- GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
- GGML_CGRAPH_EVAL_ORDER_COUNT
- };
-
- typedef uint32_t ggml_bitset_t;
-
- struct ggml_hash_set {
- size_t size;
- ggml_bitset_t * used; // whether or not the keys are in use i.e. set
- struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
- };
-
- // computation graph
- struct ggml_cgraph {
- int size;
- int n_nodes;
- int n_leafs;
-
- struct ggml_tensor ** nodes;
- struct ggml_tensor ** grads;
- struct ggml_tensor ** leafs;
-
- struct ggml_hash_set visited_hash_set;
-
- enum ggml_cgraph_eval_order order;
- };
-
// scratch buffer
struct ggml_scratch {
size_t offs;
typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
- #define GGML_N_TASKS_MAX -1
-
GGML_API struct ggml_tensor * ggml_map_custom1(
struct ggml_context * ctx,
struct ggml_tensor * a,
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
-
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate, bool keep);
float wd); // weight decay
// graph allocation in a context
- GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
- GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
- GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
- GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1);
- GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
- GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
- GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
+ GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
+ GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
+ GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+ GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
+ GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
+ GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
+
+ GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph);
+ GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
+ GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph);
+ GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
+
+ GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API size_t ggml_graph_overhead(void);
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
- GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
- GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params *p, int n_threads);
- GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
- GGML_API struct ggml_threadpool* ggml_threadpool_new (struct ggml_threadpool_params * params);
- GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
- GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
- GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
- GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
+ GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
+ GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
+ GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
+ GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
+ GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
+ GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
+ GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
+ GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
#include "ggml-quants.h"
#include "ggml-impl.h"
+#include "ggml-cpu-impl.h"
#include <math.h>
#include <string.h>
+#include "ggml-impl.h"
#include "ggml-blas.h"
#include "ggml-backend-impl.h"
#include <cstring>
#include <mutex>
+#include "ggml-impl.h"
#include "ggml-backend-impl.h"
#include "ggml-cann/aclnn_ops.h"
#include "ggml-cann/common.h"
--- /dev/null
+#pragma once
+
+// GGML CPU internal header
+
+#include "ggml.h"
+#include "ggml-impl.h"
+#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
+//#include <stddef.h>
+#include <stdbool.h>
+#include <string.h> // memcpy
+#include <math.h> // fabsf
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_MSC_VER)
+
+#define m512bh(p) p
+#define m512i(p) p
+
+#else
+
+#define m512bh(p) (__m512bh)(p)
+#define m512i(p) (__m512i)(p)
+
+#endif
+
+/**
+ * Converts brain16 to float32.
+ *
+ * The bfloat16 floating point format has the following structure:
+ *
+ * ┌sign
+ * │
+ * │ ┌exponent
+ * │ │
+ * │ │ ┌mantissa
+ * │ │ │
+ * │┌──┴───┐┌─┴───┐
+ * 0b0000000000000000 brain16
+ *
+ * Since bf16 has the same number of exponent bits as a 32bit float,
+ * encoding and decoding numbers becomes relatively straightforward.
+ *
+ * ┌sign
+ * │
+ * │ ┌exponent
+ * │ │
+ * │ │ ┌mantissa
+ * │ │ │
+ * │┌──┴───┐┌─┴───────────────────┐
+ * 0b00000000000000000000000000000000 IEEE binary32
+ *
+ * For comparison, the standard fp16 format has fewer exponent bits.
+ *
+ * ┌sign
+ * │
+ * │ ┌exponent
+ * │ │
+ * │ │ ┌mantissa
+ * │ │ │
+ * │┌─┴─┐┌─┴──────┐
+ * 0b0000000000000000 IEEE binary16
+ *
+ * @see IEEE 754-2008
+ */
+static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
+ union {
+ float f;
+ uint32_t i;
+ } u;
+ u.i = (uint32_t)h.bits << 16;
+ return u.f;
+}
+
+/**
+ * Converts float32 to brain16.
+ *
+ * This is binary identical with Google Brain float conversion.
+ * Floats shall round to nearest even, and NANs shall be quiet.
+ * Subnormals aren't flushed to zero, except perhaps when used.
+ * This code should vectorize nicely if using modern compilers.
+ */
+static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
+ ggml_bf16_t h;
+ union {
+ float f;
+ uint32_t i;
+ } u;
+ u.f = s;
+ if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
+ h.bits = (u.i >> 16) | 64; /* force to quiet */
+ return h;
+ }
+ h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
+ return h;
+}
+
+#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
+#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
+
+// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
+#ifndef __FMA__
+#define __FMA__
+#endif
+#ifndef __F16C__
+#define __F16C__
+#endif
+#endif
+
+// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
+#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
+#ifndef __SSE3__
+#define __SSE3__
+#endif
+#ifndef __SSSE3__
+#define __SSSE3__
+#endif
+#endif
+
+#if defined(__ARM_FEATURE_SVE)
+#include <arm_sve.h>
+#include <sys/prctl.h>
+#endif
+
+// 16-bit float
+// on Arm, we use __fp16
+// on x86, we use uint16_t
+#if defined(__ARM_NEON)
+
+// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
+//
+// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
+//
+#include <arm_neon.h>
+
+#ifdef _MSC_VER
+
+typedef uint16_t ggml_fp16_internal_t;
+
+#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
+
+#else
+
+typedef __fp16 ggml_fp16_internal_t;
+
+#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
+
+#endif // _MSC_VER
+
+#if !defined(__aarch64__)
+
+// 32-bit ARM compatibility
+
+// vaddlvq_s16
+// vpaddq_s16
+// vpaddq_s32
+// vaddvq_s32
+// vaddvq_f32
+// vmaxvq_f32
+// vcvtnq_s32_f32
+// vzip1_u8
+// vzip2_u8
+
+inline static int32_t vaddlvq_s16(int16x8_t v) {
+ int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
+ return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
+}
+
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+ int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+ int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+ return vcombine_s16(a0, b0);
+}
+
+inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
+ int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
+ int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
+ return vcombine_s32(a0, b0);
+}
+
+inline static int32_t vaddvq_s32(int32x4_t v) {
+ return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+}
+
+inline static float vaddvq_f32(float32x4_t v) {
+ return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
+}
+
+inline static float vmaxvq_f32(float32x4_t v) {
+ return
+ MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
+ MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
+}
+
+inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
+ int32x4_t res;
+
+ res[0] = roundf(vgetq_lane_f32(v, 0));
+ res[1] = roundf(vgetq_lane_f32(v, 1));
+ res[2] = roundf(vgetq_lane_f32(v, 2));
+ res[3] = roundf(vgetq_lane_f32(v, 3));
+
+ return res;
+}
+
+inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
+ uint8x8_t res;
+
+ res[0] = a[0]; res[1] = b[0];
+ res[2] = a[1]; res[3] = b[1];
+ res[4] = a[2]; res[5] = b[2];
+ res[6] = a[3]; res[7] = b[3];
+
+ return res;
+}
+
+inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
+ uint8x8_t res;
+
+ res[0] = a[4]; res[1] = b[4];
+ res[2] = a[5]; res[3] = b[5];
+ res[4] = a[6]; res[5] = b[6];
+ res[6] = a[7]; res[7] = b[7];
+
+ return res;
+}
+
+// vld1q_s16_x2
+// vld1q_u8_x2
+// vld1q_u8_x4
+// vld1q_s8_x2
+// vld1q_s8_x4
+// TODO: double-check these work correctly
+
+typedef struct ggml_int16x8x2_t {
+ int16x8_t val[2];
+} ggml_int16x8x2_t;
+
+inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
+ ggml_int16x8x2_t res;
+
+ res.val[0] = vld1q_s16(ptr + 0);
+ res.val[1] = vld1q_s16(ptr + 8);
+
+ return res;
+}
+
+typedef struct ggml_uint8x16x2_t {
+ uint8x16_t val[2];
+} ggml_uint8x16x2_t;
+
+inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
+ ggml_uint8x16x2_t res;
+
+ res.val[0] = vld1q_u8(ptr + 0);
+ res.val[1] = vld1q_u8(ptr + 16);
+
+ return res;
+}
+
+typedef struct ggml_uint8x16x4_t {
+ uint8x16_t val[4];
+} ggml_uint8x16x4_t;
+
+inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
+ ggml_uint8x16x4_t res;
+
+ res.val[0] = vld1q_u8(ptr + 0);
+ res.val[1] = vld1q_u8(ptr + 16);
+ res.val[2] = vld1q_u8(ptr + 32);
+ res.val[3] = vld1q_u8(ptr + 48);
+
+ return res;
+}
+
+typedef struct ggml_int8x16x2_t {
+ int8x16_t val[2];
+} ggml_int8x16x2_t;
+
+inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
+ ggml_int8x16x2_t res;
+
+ res.val[0] = vld1q_s8(ptr + 0);
+ res.val[1] = vld1q_s8(ptr + 16);
+
+ return res;
+}
+
+typedef struct ggml_int8x16x4_t {
+ int8x16_t val[4];
+} ggml_int8x16x4_t;
+
+inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
+ ggml_int8x16x4_t res;
+
+ res.val[0] = vld1q_s8(ptr + 0);
+ res.val[1] = vld1q_s8(ptr + 16);
+ res.val[2] = vld1q_s8(ptr + 32);
+ res.val[3] = vld1q_s8(ptr + 48);
+
+ return res;
+}
+
+// NOTE: not tested
+inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
+ int8x16_t res;
+
+ res[ 0] = a[b[ 0]];
+ res[ 1] = a[b[ 1]];
+ res[ 2] = a[b[ 2]];
+ res[ 3] = a[b[ 3]];
+ res[ 4] = a[b[ 4]];
+ res[ 5] = a[b[ 5]];
+ res[ 6] = a[b[ 6]];
+ res[ 7] = a[b[ 7]];
+ res[ 8] = a[b[ 8]];
+ res[ 9] = a[b[ 9]];
+ res[10] = a[b[10]];
+ res[11] = a[b[11]];
+ res[12] = a[b[12]];
+ res[13] = a[b[13]];
+ res[14] = a[b[14]];
+ res[15] = a[b[15]];
+
+ return res;
+}
+
+// NOTE: not tested
+inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
+ uint8x16_t res;
+
+ res[ 0] = a[b[ 0]];
+ res[ 1] = a[b[ 1]];
+ res[ 2] = a[b[ 2]];
+ res[ 3] = a[b[ 3]];
+ res[ 4] = a[b[ 4]];
+ res[ 5] = a[b[ 5]];
+ res[ 6] = a[b[ 6]];
+ res[ 7] = a[b[ 7]];
+ res[ 8] = a[b[ 8]];
+ res[ 9] = a[b[ 9]];
+ res[10] = a[b[10]];
+ res[11] = a[b[11]];
+ res[12] = a[b[12]];
+ res[13] = a[b[13]];
+ res[14] = a[b[14]];
+ res[15] = a[b[15]];
+
+ return res;
+}
+
+#else
+
+#define ggml_int16x8x2_t int16x8x2_t
+#define ggml_uint8x16x2_t uint8x16x2_t
+#define ggml_uint8x16x4_t uint8x16x4_t
+#define ggml_int8x16x2_t int8x16x2_t
+#define ggml_int8x16x4_t int8x16x4_t
+
+#define ggml_vld1q_s16_x2 vld1q_s16_x2
+#define ggml_vld1q_u8_x2 vld1q_u8_x2
+#define ggml_vld1q_u8_x4 vld1q_u8_x4
+#define ggml_vld1q_s8_x2 vld1q_s8_x2
+#define ggml_vld1q_s8_x4 vld1q_s8_x4
+#define ggml_vqtbl1q_s8 vqtbl1q_s8
+#define ggml_vqtbl1q_u8 vqtbl1q_u8
+
+#endif // !defined(__aarch64__)
+
+#if !defined(__ARM_FEATURE_DOTPROD)
+
+inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
+ const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
+ const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
+
+ return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
+}
+
+#else
+
+#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
+
+#endif // !defined(__ARM_FEATURE_DOTPROD)
+
+#endif // defined(__ARM_NEON)
+
+#if defined(__ARM_NEON) && !defined(_MSC_VER)
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+
+#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+ ggml_fp16_internal_t tmp;
+ memcpy(&tmp, &h, sizeof(ggml_fp16_t));
+ return (float)tmp;
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+ ggml_fp16_t res;
+ ggml_fp16_internal_t tmp = f;
+ memcpy(&res, &tmp, sizeof(ggml_fp16_t));
+ return res;
+}
+
+#else
+
+#ifdef __wasm_simd128__
+#include <wasm_simd128.h>
+#else
+#ifdef __POWER9_VECTOR__
+#include <altivec.h>
+#undef bool
+#define bool _Bool
+#else
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <intrin.h>
+#else
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
+#if !defined(__riscv)
+#include <immintrin.h>
+#endif
+#endif
+#endif
+#endif
+#endif
+
+#ifdef __riscv_v_intrinsic
+#include <riscv_vector.h>
+#endif
+
+#if defined(__loongarch64)
+#if defined(__loongarch_asx)
+#include <lasxintrin.h>
+#endif
+#if defined(__loongarch_sx)
+#include <lsxintrin.h>
+#endif
+#endif
+
+#if defined(__loongarch_asx)
+
+typedef union {
+ int32_t i;
+ float f;
+} ft_union;
+
+/* float type data load instructions */
+static __m128 __lsx_vreplfr2vr_s(float val) {
+ ft_union fi_tmpval = {.f = val};
+ return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+}
+
+static __m256 __lasx_xvreplfr2vr_s(float val) {
+ ft_union fi_tmpval = {.f = val};
+ return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+}
+#endif
+
+#ifdef __F16C__
+
+#ifdef _MSC_VER
+#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+#else
+#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+#endif
+
+#elif defined(__POWER9_VECTOR__)
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+/* the inline asm below is about 12% faster than the lookup method */
+#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+ register float f;
+ register double d;
+ __asm__(
+ "mtfprd %0,%2\n"
+ "xscvhpdp %0,%0\n"
+ "frsp %1,%0\n" :
+ /* temp */ "=d"(d),
+ /* out */ "=f"(f):
+ /* in */ "r"(h));
+ return f;
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+ register double d;
+ register ggml_fp16_t r;
+ __asm__( /* xscvdphp can work on double or single precision */
+ "xscvdphp %0,%2\n"
+ "mffprd %1,%0\n" :
+ /* temp */ "=d"(d),
+ /* out */ "=r"(r):
+ /* in */ "f"(f));
+ return r;
+}
+
+#else
+
+// FP16 <-> FP32
+// ref: https://github.com/Maratyszcza/FP16
+
+static inline float fp32_from_bits(uint32_t w) {
+ union {
+ uint32_t as_bits;
+ float as_value;
+ } fp32;
+ fp32.as_bits = w;
+ return fp32.as_value;
+}
+
+static inline uint32_t fp32_to_bits(float f) {
+ union {
+ float as_value;
+ uint32_t as_bits;
+ } fp32;
+ fp32.as_value = f;
+ return fp32.as_bits;
+}
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+ const uint32_t w = (uint32_t) h << 16;
+ const uint32_t sign = w & UINT32_C(0x80000000);
+ const uint32_t two_w = w + w;
+
+ const uint32_t exp_offset = UINT32_C(0xE0) << 23;
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+ const float exp_scale = 0x1.0p-112f;
+#else
+ const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
+#endif
+ const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
+
+ const uint32_t magic_mask = UINT32_C(126) << 23;
+ const float magic_bias = 0.5f;
+ const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
+
+ const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
+ const uint32_t result = sign |
+ (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
+ return fp32_from_bits(result);
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+ const float scale_to_inf = 0x1.0p+112f;
+ const float scale_to_zero = 0x1.0p-110f;
+#else
+ const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
+ const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
+#endif
+ float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
+
+ const uint32_t w = fp32_to_bits(f);
+ const uint32_t shl1_w = w + w;
+ const uint32_t sign = w & UINT32_C(0x80000000);
+ uint32_t bias = shl1_w & UINT32_C(0xFF000000);
+ if (bias < UINT32_C(0x71000000)) {
+ bias = UINT32_C(0x71000000);
+ }
+
+ base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
+ const uint32_t bits = fp32_to_bits(base);
+ const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
+ const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
+ const uint32_t nonsign = exp_bits + mantissa_bits;
+ return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
+}
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+
+#endif // __F16C__
+
+#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
+
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif // __ARM_FEATURE_SVE
+
+// precomputed f32 table for f16 (256 KB)
+// defined in ggml.c, initialized in ggml_init()
+extern float ggml_table_f32_f16[1 << 16];
+
+// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
+// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
+// This is also true for POWER9.
+#if !defined(GGML_FP16_TO_FP32)
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+ uint16_t s;
+ memcpy(&s, &f, sizeof(uint16_t));
+ return ggml_table_f32_f16[s];
+}
+
+#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
+#endif
+
+#if !defined(GGML_FP32_TO_FP16)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
#include "ggml-cuda.h"
-#include "ggml.h"
+#include "ggml-impl.h"
#include "ggml-backend-impl.h"
#include "ggml-cuda/common.cuh"
#pragma once
-#include "ggml.h"
-
// GGML internal header
+#include "ggml.h"
+
#include <assert.h>
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
-#include <stddef.h>
#include <stdbool.h>
-#include <string.h> // memcpy
-#include <math.h> // fabsf
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
-#if defined(_MSC_VER)
-
-#define m512bh(p) p
-#define m512i(p) p
-
-#else
-
-#define m512bh(p) (__m512bh)(p)
-#define m512i(p) (__m512i)(p)
-
-#endif
-
-/**
- * Converts brain16 to float32.
- *
- * The bfloat16 floating point format has the following structure:
- *
- * ┌sign
- * │
- * │ ┌exponent
- * │ │
- * │ │ ┌mantissa
- * │ │ │
- * │┌──┴───┐┌─┴───┐
- * 0b0000000000000000 brain16
- *
- * Since bf16 has the same number of exponent bits as a 32bit float,
- * encoding and decoding numbers becomes relatively straightforward.
- *
- * ┌sign
- * │
- * │ ┌exponent
- * │ │
- * │ │ ┌mantissa
- * │ │ │
- * │┌──┴───┐┌─┴───────────────────┐
- * 0b00000000000000000000000000000000 IEEE binary32
- *
- * For comparison, the standard fp16 format has fewer exponent bits.
- *
- * ┌sign
- * │
- * │ ┌exponent
- * │ │
- * │ │ ┌mantissa
- * │ │ │
- * │┌─┴─┐┌─┴──────┐
- * 0b0000000000000000 IEEE binary16
- *
- * @see IEEE 754-2008
- */
-static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
- union {
- float f;
- uint32_t i;
- } u;
- u.i = (uint32_t)h.bits << 16;
- return u.f;
-}
-
-/**
- * Converts float32 to brain16.
- *
- * This is binary identical with Google Brain float conversion.
- * Floats shall round to nearest even, and NANs shall be quiet.
- * Subnormals aren't flushed to zero, except perhaps when used.
- * This code should vectorize nicely if using modern compilers.
- */
-static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
- ggml_bf16_t h;
- union {
- float f;
- uint32_t i;
- } u;
- u.f = s;
- if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
- h.bits = (u.i >> 16) | 64; /* force to quiet */
- return h;
- }
- h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
- return h;
-}
-
-#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
-#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
// static_assert should be a #define, but if it's not,
// fall back to the _Static_assert C11 keyword.
// if C99 - static_assert is noop
#endif
#endif
-// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
-#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
-#ifndef __FMA__
-#define __FMA__
-#endif
-#ifndef __F16C__
-#define __F16C__
-#endif
-#endif
-
-// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
-#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
-#ifndef __SSE3__
-#define __SSE3__
-#endif
-#ifndef __SSSE3__
-#define __SSSE3__
-#endif
-#endif
-
-#if defined(__ARM_FEATURE_SVE)
-#include <arm_sve.h>
-#include <sys/prctl.h>
-#endif
-
-// 16-bit float
-// on Arm, we use __fp16
-// on x86, we use uint16_t
-#if defined(__ARM_NEON)
-
-// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
-//
-// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
-//
-#include <arm_neon.h>
-
-#ifdef _MSC_VER
-
-typedef uint16_t ggml_fp16_internal_t;
-
-#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
-
-#else
-
-typedef __fp16 ggml_fp16_internal_t;
-
-#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
-
-#endif // _MSC_VER
-
-#if !defined(__aarch64__)
-
-// 32-bit ARM compatibility
-
-// vaddlvq_s16
-// vpaddq_s16
-// vpaddq_s32
-// vaddvq_s32
-// vaddvq_f32
-// vmaxvq_f32
-// vcvtnq_s32_f32
-// vzip1_u8
-// vzip2_u8
-
-inline static int32_t vaddlvq_s16(int16x8_t v) {
- int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
- return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
-}
-
-inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
- int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
- int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
- return vcombine_s16(a0, b0);
-}
-
-inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
- int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
- int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
- return vcombine_s32(a0, b0);
-}
-
-inline static int32_t vaddvq_s32(int32x4_t v) {
- return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
-}
-
-inline static float vaddvq_f32(float32x4_t v) {
- return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
-}
-
-inline static float vmaxvq_f32(float32x4_t v) {
- return
- MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
- MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-
-inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
- int32x4_t res;
-
- res[0] = roundf(vgetq_lane_f32(v, 0));
- res[1] = roundf(vgetq_lane_f32(v, 1));
- res[2] = roundf(vgetq_lane_f32(v, 2));
- res[3] = roundf(vgetq_lane_f32(v, 3));
-
- return res;
-}
-
-inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
- uint8x8_t res;
-
- res[0] = a[0]; res[1] = b[0];
- res[2] = a[1]; res[3] = b[1];
- res[4] = a[2]; res[5] = b[2];
- res[6] = a[3]; res[7] = b[3];
-
- return res;
-}
-
-inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
- uint8x8_t res;
-
- res[0] = a[4]; res[1] = b[4];
- res[2] = a[5]; res[3] = b[5];
- res[4] = a[6]; res[5] = b[6];
- res[6] = a[7]; res[7] = b[7];
-
- return res;
-}
-
-// vld1q_s16_x2
-// vld1q_u8_x2
-// vld1q_u8_x4
-// vld1q_s8_x2
-// vld1q_s8_x4
-// TODO: double-check these work correctly
-
-typedef struct ggml_int16x8x2_t {
- int16x8_t val[2];
-} ggml_int16x8x2_t;
-
-inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
- ggml_int16x8x2_t res;
-
- res.val[0] = vld1q_s16(ptr + 0);
- res.val[1] = vld1q_s16(ptr + 8);
-
- return res;
-}
-
-typedef struct ggml_uint8x16x2_t {
- uint8x16_t val[2];
-} ggml_uint8x16x2_t;
-
-inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
- ggml_uint8x16x2_t res;
-
- res.val[0] = vld1q_u8(ptr + 0);
- res.val[1] = vld1q_u8(ptr + 16);
-
- return res;
-}
-
-typedef struct ggml_uint8x16x4_t {
- uint8x16_t val[4];
-} ggml_uint8x16x4_t;
-
-inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
- ggml_uint8x16x4_t res;
-
- res.val[0] = vld1q_u8(ptr + 0);
- res.val[1] = vld1q_u8(ptr + 16);
- res.val[2] = vld1q_u8(ptr + 32);
- res.val[3] = vld1q_u8(ptr + 48);
-
- return res;
-}
-
-typedef struct ggml_int8x16x2_t {
- int8x16_t val[2];
-} ggml_int8x16x2_t;
-
-inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
- ggml_int8x16x2_t res;
-
- res.val[0] = vld1q_s8(ptr + 0);
- res.val[1] = vld1q_s8(ptr + 16);
-
- return res;
-}
-
-typedef struct ggml_int8x16x4_t {
- int8x16_t val[4];
-} ggml_int8x16x4_t;
-
-inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
- ggml_int8x16x4_t res;
-
- res.val[0] = vld1q_s8(ptr + 0);
- res.val[1] = vld1q_s8(ptr + 16);
- res.val[2] = vld1q_s8(ptr + 32);
- res.val[3] = vld1q_s8(ptr + 48);
-
- return res;
-}
-
-// NOTE: not tested
-inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
- int8x16_t res;
-
- res[ 0] = a[b[ 0]];
- res[ 1] = a[b[ 1]];
- res[ 2] = a[b[ 2]];
- res[ 3] = a[b[ 3]];
- res[ 4] = a[b[ 4]];
- res[ 5] = a[b[ 5]];
- res[ 6] = a[b[ 6]];
- res[ 7] = a[b[ 7]];
- res[ 8] = a[b[ 8]];
- res[ 9] = a[b[ 9]];
- res[10] = a[b[10]];
- res[11] = a[b[11]];
- res[12] = a[b[12]];
- res[13] = a[b[13]];
- res[14] = a[b[14]];
- res[15] = a[b[15]];
-
- return res;
-}
-
-// NOTE: not tested
-inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
- uint8x16_t res;
-
- res[ 0] = a[b[ 0]];
- res[ 1] = a[b[ 1]];
- res[ 2] = a[b[ 2]];
- res[ 3] = a[b[ 3]];
- res[ 4] = a[b[ 4]];
- res[ 5] = a[b[ 5]];
- res[ 6] = a[b[ 6]];
- res[ 7] = a[b[ 7]];
- res[ 8] = a[b[ 8]];
- res[ 9] = a[b[ 9]];
- res[10] = a[b[10]];
- res[11] = a[b[11]];
- res[12] = a[b[12]];
- res[13] = a[b[13]];
- res[14] = a[b[14]];
- res[15] = a[b[15]];
-
- return res;
-}
-
-#else
-
-#define ggml_int16x8x2_t int16x8x2_t
-#define ggml_uint8x16x2_t uint8x16x2_t
-#define ggml_uint8x16x4_t uint8x16x4_t
-#define ggml_int8x16x2_t int8x16x2_t
-#define ggml_int8x16x4_t int8x16x4_t
-
-#define ggml_vld1q_s16_x2 vld1q_s16_x2
-#define ggml_vld1q_u8_x2 vld1q_u8_x2
-#define ggml_vld1q_u8_x4 vld1q_u8_x4
-#define ggml_vld1q_s8_x2 vld1q_s8_x2
-#define ggml_vld1q_s8_x4 vld1q_s8_x4
-#define ggml_vqtbl1q_s8 vqtbl1q_s8
-#define ggml_vqtbl1q_u8 vqtbl1q_u8
-
-#endif // !defined(__aarch64__)
-
-#if !defined(__ARM_FEATURE_DOTPROD)
-
-inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
- const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
- const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
-
- return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
-}
-
-#else
-
-#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
-
-#endif // !defined(__ARM_FEATURE_DOTPROD)
-
-#endif // defined(__ARM_NEON)
-
-#if defined(__ARM_NEON) && !defined(_MSC_VER)
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
- ggml_fp16_internal_t tmp;
- memcpy(&tmp, &h, sizeof(ggml_fp16_t));
- return (float)tmp;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
- ggml_fp16_t res;
- ggml_fp16_internal_t tmp = f;
- memcpy(&res, &tmp, sizeof(ggml_fp16_t));
- return res;
-}
-
-#else
-
-#ifdef __wasm_simd128__
-#include <wasm_simd128.h>
-#else
-#ifdef __POWER9_VECTOR__
-#include <altivec.h>
-#undef bool
-#define bool _Bool
-#else
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <intrin.h>
-#else
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
-#if !defined(__riscv)
-#include <immintrin.h>
-#endif
-#endif
-#endif
-#endif
-#endif
-
-#ifdef __riscv_v_intrinsic
-#include <riscv_vector.h>
-#endif
-
-#if defined(__loongarch64)
-#if defined(__loongarch_asx)
-#include <lasxintrin.h>
-#endif
-#if defined(__loongarch_sx)
-#include <lsxintrin.h>
-#endif
-#endif
-
-#if defined(__loongarch_asx)
-
-typedef union {
- int32_t i;
- float f;
-} ft_union;
-
-/* float type data load instructions */
-static __m128 __lsx_vreplfr2vr_s(float val) {
- ft_union fi_tmpval = {.f = val};
- return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
-}
-
-static __m256 __lasx_xvreplfr2vr_s(float val) {
- ft_union fi_tmpval = {.f = val};
- return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
-}
-#endif
-
-#ifdef __F16C__
-
-#ifdef _MSC_VER
-#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
-#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
-#else
-#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
-#endif
-
-#elif defined(__POWER9_VECTOR__)
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-/* the inline asm below is about 12% faster than the lookup method */
-#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
- register float f;
- register double d;
- __asm__(
- "mtfprd %0,%2\n"
- "xscvhpdp %0,%0\n"
- "frsp %1,%0\n" :
- /* temp */ "=d"(d),
- /* out */ "=f"(f):
- /* in */ "r"(h));
- return f;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
- register double d;
- register ggml_fp16_t r;
- __asm__( /* xscvdphp can work on double or single precision */
- "xscvdphp %0,%2\n"
- "mffprd %1,%0\n" :
- /* temp */ "=d"(d),
- /* out */ "=r"(r):
- /* in */ "f"(f));
- return r;
-}
-
-#else
-
-// FP16 <-> FP32
-// ref: https://github.com/Maratyszcza/FP16
-
-static inline float fp32_from_bits(uint32_t w) {
- union {
- uint32_t as_bits;
- float as_value;
- } fp32;
- fp32.as_bits = w;
- return fp32.as_value;
-}
-
-static inline uint32_t fp32_to_bits(float f) {
- union {
- float as_value;
- uint32_t as_bits;
- } fp32;
- fp32.as_value = f;
- return fp32.as_bits;
-}
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
- const uint32_t w = (uint32_t) h << 16;
- const uint32_t sign = w & UINT32_C(0x80000000);
- const uint32_t two_w = w + w;
-
- const uint32_t exp_offset = UINT32_C(0xE0) << 23;
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
- const float exp_scale = 0x1.0p-112f;
-#else
- const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
-#endif
- const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
-
- const uint32_t magic_mask = UINT32_C(126) << 23;
- const float magic_bias = 0.5f;
- const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
-
- const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
- const uint32_t result = sign |
- (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
- return fp32_from_bits(result);
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
- const float scale_to_inf = 0x1.0p+112f;
- const float scale_to_zero = 0x1.0p-110f;
-#else
- const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
- const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
-#endif
- float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
-
- const uint32_t w = fp32_to_bits(f);
- const uint32_t shl1_w = w + w;
- const uint32_t sign = w & UINT32_C(0x80000000);
- uint32_t bias = shl1_w & UINT32_C(0xFF000000);
- if (bias < UINT32_C(0x71000000)) {
- bias = UINT32_C(0x71000000);
- }
-
- base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
- const uint32_t bits = fp32_to_bits(base);
- const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
- const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
- const uint32_t nonsign = exp_bits + mantissa_bits;
- return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
-}
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#endif // __F16C__
-
-#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
-
-#ifdef __ARM_FEATURE_SVE
-#include <arm_sve.h>
-#endif // __ARM_FEATURE_SVE
-
-// precomputed f32 table for f16 (256 KB)
-// defined in ggml.c, initialized in ggml_init()
-extern float ggml_table_f32_f16[1 << 16];
-
-// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
-// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
-// This is also true for POWER9.
-#if !defined(GGML_FP16_TO_FP32)
-inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
- uint16_t s;
- memcpy(&s, &f, sizeof(uint16_t));
- return ggml_table_f32_f16[s];
-}
-
-#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
-#endif
-
-#if !defined(GGML_FP32_TO_FP16)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-#endif
-
// bitset
+typedef uint32_t ggml_bitset_t;
+
static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
#define GGML_HASHSET_FULL ((size_t)-1)
#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
+struct ggml_hash_set {
+ size_t size;
+ ggml_bitset_t * used; // whether or not the keys are in use i.e. set
+ struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
+};
+
struct ggml_hash_set ggml_hash_set_new(size_t size);
void ggml_hash_set_free(struct ggml_hash_set * hash_set);
GGML_ABORT("fatal error");
}
+// computation graph
+
+enum ggml_cgraph_eval_order {
+ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
+ GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
+ GGML_CGRAPH_EVAL_ORDER_COUNT
+};
+
+struct ggml_cgraph {
+ int size;
+ int n_nodes;
+ int n_leafs;
+
+ struct ggml_tensor ** nodes;
+ struct ggml_tensor ** grads;
+ struct ggml_tensor ** leafs;
+
+ struct ggml_hash_set visited_hash_set;
+
+ enum ggml_cgraph_eval_order order;
+};
+
+struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
+
#ifdef __cplusplus
}
#endif
-#include "ggml.h"
+#include "ggml-impl.h"
#include "ggml-backend.h"
#include "ggml-backend-impl.h"
#include "ggml-kompute.h"
#import "ggml-metal.h"
+#import "ggml-impl.h"
#import "ggml-backend-impl.h"
-#import "ggml.h"
#import <Foundation/Foundation.h>
// create multiple command buffers and enqueue them
// then, we encode the graph into the command buffers in parallel
- const int n_nodes = gf->n_nodes;
+ const int n_nodes = gf->n_nodes;
const int n_cb = ctx->n_cb;
const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
#include "ggml-quants.h"
#include "ggml-impl.h"
+#include "ggml-cpu-impl.h"
#include <math.h>
#include "ggml-rpc.h"
-#include "ggml.h"
+#include "ggml-impl.h"
#include "ggml-backend-impl.h"
#include <cinttypes>
#include <sycl/half_type.hpp>
#include "ggml-sycl.h"
-#include "ggml.h"
+#include "ggml-impl.h"
#include "ggml-backend-impl.h"
#include "ggml-sycl/backend.hpp"
#include <memory>
#include <mutex>
-#include "ggml.h"
+#include "ggml-impl.h"
#include "ggml-backend-impl.h"
#include "ggml-vulkan-shaders.hpp"
#include "ggml-backend.h"
#include "ggml-impl.h"
+#include "ggml-cpu-impl.h"
#include "ggml-quants.h"
#include "ggml.h"
#include "ggml-aarch64.h"
#define GGML_DEBUG 0
#define GGML_GELU_FP16
#define GGML_GELU_QUICK_FP16
+#define GGML_N_TASKS_MAX (-1)
#define GGML_SOFT_MAX_UNROLL 4
#define GGML_VEC_DOT_UNROLL 2
#define GGML_F32x4_ADD vaddq_f32
#define GGML_F32x4_MUL vmulq_f32
#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
-#define GGML_F32x4_REDUCE(res, x) \
-{ \
- int offset = GGML_F32_ARR >> 1; \
- for (int i = 0; i < offset; ++i) { \
- x[i] = vaddq_f32(x[i], x[offset+i]); \
- } \
- offset >>= 1; \
- for (int i = 0; i < offset; ++i) { \
- x[i] = vaddq_f32(x[i], x[offset+i]); \
- } \
- offset >>= 1; \
- for (int i = 0; i < offset; ++i) { \
- x[i] = vaddq_f32(x[i], x[offset+i]); \
- } \
- res = GGML_F32x4_REDUCE_ONE(x[0]); \
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
+ } \
+ (res) = GGML_F32x4_REDUCE_ONE((x)[0]); \
}
#define GGML_F32_VEC GGML_F32x4
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
#define GGML_F16x8_ADD vaddq_f16
#define GGML_F16x8_MUL vmulq_f16
- #define GGML_F16x8_REDUCE(res, x) \
- do { \
- int offset = GGML_F16_ARR >> 1; \
- for (int i = 0; i < offset; ++i) { \
- x[i] = vaddq_f16(x[i], x[offset+i]); \
- } \
- offset >>= 1; \
- for (int i = 0; i < offset; ++i) { \
- x[i] = vaddq_f16(x[i], x[offset+i]); \
- } \
- offset >>= 1; \
- for (int i = 0; i < offset; ++i) { \
- x[i] = vaddq_f16(x[i], x[offset+i]); \
- } \
- const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
- const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
- res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
+ #define GGML_F16x8_REDUCE(res, x) \
+ do { \
+ int offset = GGML_F16_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
+ } \
+ const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
+ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
+ (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
} while (0)
#define GGML_F16_VEC GGML_F16x8
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
#define GGML_F16_VEC_SET1 GGML_F16x8_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
- #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i])
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i])
#define GGML_F16_VEC_FMA GGML_F16x8_FMA
#define GGML_F16_VEC_ADD GGML_F16x8_ADD
#define GGML_F16_VEC_MUL GGML_F16x8_MUL
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
#endif
+//
+// ggml object
+//
+
+struct ggml_object {
+ size_t offs;
+ size_t size;
+
+ struct ggml_object * next;
+
+ enum ggml_object_type type;
+
+ char padding[4];
+};
+
+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
//
// ggml context
//
ggml_hash_set_reset(&cgraph->visited_hash_set);
}
+int ggml_graph_size(struct ggml_cgraph * cgraph) {
+ return cgraph->size;
+}
+
+struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) {
+ if (i < 0) {
+ GGML_ASSERT(cgraph->n_nodes + i >= 0);
+ return cgraph->nodes[cgraph->n_nodes + i];
+ }
+
+ GGML_ASSERT(i < cgraph->n_nodes);
+ return cgraph->nodes[i];
+}
+
+struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) {
+ return cgraph->nodes;
+}
+
+int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) {
+ return cgraph->n_nodes;
+}
+
+void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
+ GGML_ASSERT(cgraph->size > cgraph->n_nodes);
+ cgraph->nodes[cgraph->n_nodes] = tensor;
+ cgraph->n_nodes++;
+}
+
// Android's libc implementation "bionic" does not support setting affinity
#if defined(__gnu_linux__)
static void set_numa_thread_affinity(int thread_n) {
// add sentinels as graph nodes so that they are checked in the callback
for (ggml_tensor * sentinel : sentinels) {
- gf->nodes[gf->n_nodes++] = sentinel;
+ ggml_graph_add_node(gf, sentinel);
}
// randomize tensors
// duplicate the op
size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
- int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1;
+ int n_runs = std::min((size_t) ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
for (int i = 1; i < n_runs; i++) {
- gf->nodes[gf->n_nodes++] = out;
+ ggml_graph_add_node(gf, out);
}
// calculate memory
}
return size;
};
- for (int i = 0; i < gf->n_nodes; i++) {
- if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) {
+ for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
+ if (ggml_is_view_op(ggml_graph_node(gf, i)->op) || ggml_graph_node(gf, i) == out) {
continue;
}
- mem += tensor_op_size(gf->nodes[i]);
+ mem += tensor_op_size(ggml_graph_node(gf, i));
}
// run
ggml_graph_cpy(gf, gb);
ggml_build_backward_expand(ctx, gf, gb, false, false);
if (expect.size() != 1 || expect[0] != 0.0f) {
- GGML_ASSERT(gb->n_nodes > gf->n_nodes);
+ GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE);
}