//
#include <arm_neon.h>
-#if !defined(__aarch64__)
-inline static int32_t vaddvq_s16(int16x8_t v) {
- return
- (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
- (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
- (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
- (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-}
-
-inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
- int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
- int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
- return vcombine_s16(a0, b0);
-}
-
-inline static int32_t vaddvq_s32(int32x4_t v) {
- return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
-}
-#endif
-
#else
#ifdef __wasm_simd128__
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
-#if !defined(__riscv) && !defined(__s390__)
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
+#if !defined(__riscv)
#include <immintrin.h>
#endif
#endif
#endif
#endif
#endif
+#endif
#ifdef __riscv_v_intrinsic
#include <riscv_vector.h>
#undef MIN
#undef MAX
+
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
#if defined(__ARM_NEON)
-
#if !defined(__aarch64__)
+// 64-bit compatibility: fallbacks for AArch64-only intrinsics on 32-bit ARM builds
+
+// vaddvq_s16
+// vpaddq_s16
+// vaddvq_s32
+// vaddvq_f32
+// vmaxvq_f32
+// vcvtnq_s32_f32
+
+inline static int32_t vaddvq_s16(int16x8_t v) {
+ return
+ (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+ (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+ (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+ (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+ int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+ int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+ return vcombine_s16(a0, b0);
+}
+
inline static int32_t vaddvq_s32(int32x4_t v) {
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
}
return res;
}
+// vld1q_s16_x2
+// vld1q_u8_x2
+// vld1q_u8_x4
+// vld1q_s8_x2
+// vld1q_s8_x4
+// TODO: double-check these work correctly
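+// note: on AArch64 the native vld1q_*_x2/x4 intrinsics are used directly via
+// the #defines in the #else branch below; the ggml_-prefixed wrappers here
+// emulate them with plain vld1q loads for 32-bit ARM builds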
+
+typedef struct ggml_int16x8x2_t {
+ int16x8_t val[2];
+} ggml_int16x8x2_t;
+
+inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
+ ggml_int16x8x2_t res;
+
+ res.val[0] = vld1q_s16(ptr + 0);
+ res.val[1] = vld1q_s16(ptr + 8);
+
+ return res;
+}
+
+typedef struct ggml_uint8x16x2_t {
+ uint8x16_t val[2];
+} ggml_uint8x16x2_t;
+
+inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
+ ggml_uint8x16x2_t res;
+
+ res.val[0] = vld1q_u8(ptr + 0);
+ res.val[1] = vld1q_u8(ptr + 16);
+
+ return res;
+}
+
+typedef struct ggml_uint8x16x4_t {
+ uint8x16_t val[4];
+} ggml_uint8x16x4_t;
+
+inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
+ ggml_uint8x16x4_t res;
+
+ res.val[0] = vld1q_u8(ptr + 0);
+ res.val[1] = vld1q_u8(ptr + 16);
+ res.val[2] = vld1q_u8(ptr + 32);
+ res.val[3] = vld1q_u8(ptr + 48);
+
+ return res;
+}
+
+typedef struct ggml_int8x16x2_t {
+ int8x16_t val[2];
+} ggml_int8x16x2_t;
+
+inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
+ ggml_int8x16x2_t res;
+
+ res.val[0] = vld1q_s8(ptr + 0);
+ res.val[1] = vld1q_s8(ptr + 16);
+
+ return res;
+}
+
+typedef struct ggml_int8x16x4_t {
+ int8x16_t val[4];
+} ggml_int8x16x4_t;
+
+inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
+ ggml_int8x16x4_t res;
+
+ res.val[0] = vld1q_s8(ptr + 0);
+ res.val[1] = vld1q_s8(ptr + 16);
+ res.val[2] = vld1q_s8(ptr + 32);
+ res.val[3] = vld1q_s8(ptr + 48);
+
+ return res;
+}
+
+#else
+
+#define ggml_int16x8x2_t int16x8x2_t
+#define ggml_uint8x16x2_t uint8x16x2_t
+#define ggml_uint8x16x4_t uint8x16x4_t
+#define ggml_int8x16x2_t int8x16x2_t
+#define ggml_int8x16x4_t int8x16x4_t
+
+#define ggml_vld1q_s16_x2 vld1q_s16_x2
+#define ggml_vld1q_u8_x2 vld1q_u8_x2
+#define ggml_vld1q_u8_x4 vld1q_u8_x4
+#define ggml_vld1q_s8_x2 vld1q_s8_x2
+#define ggml_vld1q_s8_x4 vld1q_s8_x4
+
#endif
#endif
const int32x4_t vzero = vdupq_n_s32(0);
#endif
- int8x16x2_t q2bytes;
+ ggml_int8x16x2_t q2bytes;
uint8_t aux[16];
float sum = 0;
vst1q_u8(aux, scales);
const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4);
- const int16x8x2_t q8sums = vld1q_s16_x2(y[i].bsums);
- const int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))};
+ const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
+ const ggml_int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))};
const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])),
vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0])));
const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])),
#endif
#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
- q8bytes = vld1q_s8_x2(q8); q8 += 32;\
+ q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[0], (shift)), m3));\
q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\
MULTIPLY_ACCUM_WITH_SCALE((index));
for (int j = 0; j < QK_K/128; ++j) {
- const uint8x16x2_t q2bits = vld1q_u8_x2(q2); q2 += 32;
+ const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32;
- int8x16x2_t q8bytes = vld1q_s8_x2(q8); q8 += 32;
+ ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3));
q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3));
MULTIPLY_ACCUM_WITH_SCALE(0);
const int32x4_t vzero = vdupq_n_s32(0);
#endif
- int8x16x4_t q2bytes;
+ ggml_int8x16x4_t q2bytes;
uint32_t aux32[2];
const uint8_t * scales = (const uint8_t *)aux32;
const uint8x16_t q2bits = vld1q_u8(q2);
- const int8x16x4_t q8bytes = vld1q_s8_x4(q8);
+ const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);
q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits, m3));
q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 2), m3));
const uint8x16_t m3 = vshlq_n_u8(m0, 3);
const int8_t m32 = 32;
- int8x16x4_t q3bytes;
+ ggml_int8x16x4_t q3bytes;
float sum = 0;
const uint8_t * restrict qh = x[i].hmask;
const int8_t * restrict q8 = y[i].qs;
- uint8x16x2_t qhbits = vld1q_u8_x2(qh);
+ ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
- uint8x16x4_t q3h;
+ ggml_uint8x16x4_t q3h;
int32_t isum = 0;
for (int j = 0; j < QK_K/128; ++j) {
- const uint8x16x2_t q3bits = vld1q_u8_x2(q3); q3 += 32;
- const int8x16x4_t q8bytes_1 = vld1q_s8_x4(q8); q8 += 64;
- const int8x16x4_t q8bytes_2 = vld1q_s8_x4(q8); q8 += 64;
+ const ggml_uint8x16x2_t q3bits = ggml_vld1q_u8_x2(q3); q3 += 32;
+ const ggml_int8x16x4_t q8bytes_1 = ggml_vld1q_s8_x4(q8); q8 += 64;
+ const ggml_int8x16x4_t q8bytes_2 = ggml_vld1q_s8_x4(q8); q8 += 64;
q3h.val[0] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[0]), 2);
q3h.val[1] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[1]), 2);
const uint8x16_t m3b = vdupq_n_u8(0x3);
const uint8x16_t mh = vdupq_n_u8(4);
- int8x16x4_t q3bytes;
+ ggml_int8x16x4_t q3bytes;
uint16_t aux16[2];
int8_t * scales = (int8_t *)aux16;
for (int i = 0; i < nb; ++i) {
- uint8x16x4_t q3h;
+ ggml_uint8x16x4_t q3h;
const uint8x8_t hbits = vld1_u8(x[i].hmask);
const uint8x16_t q3bits = vld1q_u8(x[i].qs);
- const int8x16x4_t q8bytes = vld1q_s8_x4(y[i].qs);
+ const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(y[i].qs);
const uint16_t a = *(const uint16_t *)x[i].scales;
aux16[0] = a & 0x0f0f;
const int32x4_t mzero = vdupq_n_s32(0);
#endif
- int8x16x2_t q4bytes;
- int8x16x2_t q8bytes;
+ ggml_int8x16x2_t q4bytes;
+ ggml_int8x16x2_t q8bytes;
float sumf = 0;
for (int j = 0; j < QK_K/64; ++j) {
- const uint8x16x2_t q4bits = vld1q_u8_x2(q4); q4 += 32;
+ const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
#ifdef __ARM_FEATURE_DOTPROD
- q8bytes = vld1q_s8_x2(q8); q8 += 32;
+ q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
sumi1 += vaddvq_s32(p1) * scales[2*j+0];
- q8bytes = vld1q_s8_x2(q8); q8 += 32;
+ q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
sumi2 += vaddvq_s32(p2) * scales[2*j+1];
#else
- q8bytes = vld1q_s8_x2(q8); q8 += 32;
+ q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
sumi1 += vaddvq_s16(vaddq_s16(p0, p1)) * scales[2*j+0];
- q8bytes = vld1q_s8_x2(q8); q8 += 32;
+ q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
float sumf = 0;
- int8x16x2_t q4bytes;
- int8x16x4_t q8bytes;
+ ggml_int8x16x2_t q4bytes;
+ ggml_int8x16x4_t q8bytes;
float sum_mins = 0.f;
const float d = y[i].d * (float)x[i].d[0];
- const uint8x16x2_t q4bits = vld1q_u8_x2(q4);
+ const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);
#ifdef __ARM_FEATURE_DOTPROD
- q8bytes = vld1q_s8_x4(q8);
+ q8bytes = ggml_vld1q_s8_x4(q8);
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
const int32_t sumi2 = vaddvq_s32(p2) * scales[1];
#else
- q8bytes = vld1q_s8_x4(q8);
+ q8bytes = ggml_vld1q_s8_x4(q8);
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
const int32x4_t mzero = vdupq_n_s32(0);
#endif
- int8x16x4_t q5bytes;
+ ggml_int8x16x4_t q5bytes;
float sumf = 0;
const uint8_t * restrict qh = x[i].qh;
const int8_t * restrict q8 = y[i].qs;
- uint8x16x2_t qhbits = vld1q_u8_x2(qh);
+ ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
- uint8x16x4_t q5h;
+ ggml_uint8x16x4_t q5h;
int32_t sumi = 0;
for (int j = 0; j < QK_K/64; ++j) {
- const uint8x16x2_t q5bits = vld1q_u8_x2(q5); q5 += 32;
- const int8x16x4_t q8bytes = vld1q_s8_x4(q8); q8 += 64;
+ const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5); q5 += 32;
+ const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
q5h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
q5h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
const int32x4_t mzero = vdupq_n_s32(0);
#endif
- int8x16x4_t q5bytes;
- uint8x16x4_t q5h;
+ ggml_int8x16x4_t q5bytes;
+ ggml_uint8x16x4_t q5h;
float sumf = 0;
const uint8x8_t qhbits = vld1_u8(qh);
- const uint8x16x2_t q5bits = vld1q_u8_x2(q5);
- const int8x16x4_t q8bytes = vld1q_s8_x4(q8);
+ const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5);
+ const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);
const uint8x16_t htmp = vcombine_u8(qhbits, vshr_n_u8(qhbits, 1));
q5h.val[0] = vbicq_u8(mh, vshlq_n_u8(htmp, 4));
const uint8x16_t mone = vdupq_n_u8(3);
- int8x16x4_t q6bytes;
- uint8x16x4_t q6h;
+ ggml_int8x16x4_t q6bytes;
+ ggml_uint8x16x4_t q6h;
for (int i = 0; i < nb; ++i) {
const int8_t * restrict scale = x[i].scales;
- const int16x8x2_t q8sums = vld1q_s16_x2(y[i].bsums);
+ const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
const int8x16_t scales = vld1q_s8(scale);
- const int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))};
+ const ggml_int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))};
const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
for (int j = 0; j < QK_K/128; ++j) {
- uint8x16x2_t qhbits = vld1q_u8_x2(qh); qh += 32;
- uint8x16x4_t q6bits = vld1q_u8_x4(q6); q6 += 64;
- int8x16x4_t q8bytes = vld1q_s8_x4(q8); q8 += 64;
+ ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); qh += 32;
+ ggml_uint8x16x4_t q6bits = ggml_vld1q_u8_x4(q6); q6 += 64;
+ ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
q6h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
scale += 2;
#endif
- q8bytes = vld1q_s8_x4(q8); q8 += 64;
+ q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
shifted = vshrq_n_u8(qhbits.val[0], 4);
q6h.val[0] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
const uint8x16_t mone = vdupq_n_u8(3);
- int8x16x4_t q6bytes;
- uint8x16x4_t q6h;
+ ggml_int8x16x4_t q6bytes;
+ ggml_uint8x16x4_t q6h;
for (int i = 0; i < nb; ++i) {
int32_t isum = 0;
- uint8x16_t qhbits = vld1q_u8(qh);
- uint8x16x2_t q6bits = vld1q_u8_x2(q6);
- int8x16x4_t q8bytes = vld1q_s8_x4(q8);
+ uint8x16_t qhbits = vld1q_u8(qh);
+ ggml_uint8x16x2_t q6bits = ggml_vld1q_u8_x2(q6);
+ ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);
q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits), 4);
uint8x16_t shifted = vshrq_n_u8(qhbits, 2);
// floating point type used to accumulate sums
typedef double ggml_float;
+#undef MIN
+#undef MAX
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
//
// global data
//
// simd mappings
//
+#if defined(__ARM_NEON)
+#if !defined(__aarch64__)
+
+// 64-bit compatibility: fallbacks for AArch64-only intrinsics on 32-bit ARM builds
+
+inline static float vaddvq_f32(float32x4_t v) {
+ return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
+}
+
+#endif
+#endif
+
// we define a common set of C macros which map to specific intrinsics based on the current architecture
// we then implement the fundamental computation operations below using only these macros
// adding support for new architectures requires defining the corresponding SIMD macros
"ROPE_BACK",
"ALIBI",
"CLAMP",
- "CONV_1D",
- "CONV_1D_STAGE_0",
- "CONV_1D_STAGE_1",
"CONV_TRANSPOSE_1D",
- "CONV_2D",
- "CONV_2D_STAGE_0",
- "CONV_2D_STAGE_1",
+ "IM2COL",
"CONV_TRANSPOSE_2D",
"POOL_1D",
"POOL_2D",
"CROSS_ENTROPY_LOSS_BACK",
};
-static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
"rope_back(x)",
"alibi(x)",
"clamp(x)",
- "conv_1d(x)",
- "conv_1d_stage_0(x)",
- "conv_1d_stage_1(x)",
"conv_transpose_1d(x)",
- "conv_2d(x)",
- "conv_2d_stage_0(x)",
- "conv_2d_stage_1(x)",
+ "im2col(x)",
"conv_transpose_2d(x)",
"pool_1d(x)",
"pool_2d(x)",
"cross_entropy_loss_back(x,y)",
};
-static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
p[GGML_OP_GET_ROWS_BACK ] = true;
p[GGML_OP_DIAG_MASK_INF ] = true;
p[GGML_OP_DIAG_MASK_ZERO ] = true;
- p[GGML_OP_CONV_1D ] = true;
- p[GGML_OP_CONV_1D_STAGE_0 ] = true;
- p[GGML_OP_CONV_1D_STAGE_1 ] = true;
p[GGML_OP_CONV_TRANSPOSE_1D ] = true;
- p[GGML_OP_CONV_2D ] = true;
- p[GGML_OP_CONV_2D_STAGE_0 ] = true;
- p[GGML_OP_CONV_2D_STAGE_1 ] = true;
p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
p[GGML_OP_FLASH_ATTN_BACK ] = true;
p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
}
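// Worked example of the formula above (illustrative values): ins = 10, ks = 3,
// s = 1, p = 0, d = 1 gives (10 + 0 - 1*(3 - 1) - 1)/1 + 1 = 8 outputs ("valid"
// convolution); with p = 1 the count becomes 10 ("same" convolution).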
-// im2col: [N, IC, IL] => [N, OL, IC*K]
-// a: [OC,IC, K]
-// b: [N, IC, IL]
-// result: [N, OL, IC*K]
-static struct ggml_tensor * ggml_conv_1d_stage_0(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b,
- int s0,
- int p0,
- int d0) {
- GGML_ASSERT(a->ne[1] == b->ne[1]);
- bool is_node = false;
-
- if (a->grad || b->grad) {
- GGML_ASSERT(false); // TODO: implement backward
- is_node = true;
- }
-
- const int64_t OL = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
-
- const int64_t ne[4] = {
- a->ne[1] * a->ne[0],
- OL,
- b->ne[2],
- 1,
- };
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
-
- int32_t params[] = { s0, p0, d0 };
- ggml_set_op_params(result, params, sizeof(params));
-
- result->op = GGML_OP_CONV_1D_STAGE_0;
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
- result->src[0] = a;
- result->src[1] = b;
-
- return result;
-}
-
-// ggml_conv_1d_stage_1
-
-// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
-// a: [OC, IC, K]
-// b: [N, OL, IC * K]
-// result: [N, OC, OL]
-static struct ggml_tensor * ggml_conv_1d_stage_1(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b) {
-
- bool is_node = false;
-
- if (a->grad || b->grad) {
- GGML_ASSERT(false); // TODO: implement backward
- is_node = true;
- }
-
- const int64_t ne[4] = {
- b->ne[1],
- a->ne[2],
- b->ne[2],
- 1,
- };
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
- result->op = GGML_OP_CONV_1D_STAGE_1;
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
- result->src[0] = a;
- result->src[1] = b;
-
- return result;
-}
-
-// ggml_conv_1d
-
GGML_API struct ggml_tensor * ggml_conv_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int s0,
int p0,
int d0) {
- struct ggml_tensor * result = ggml_conv_1d_stage_0(ctx, a, b, s0, p0, d0);
- result = ggml_conv_1d_stage_1(ctx, a, result);
- return result;
-}
-
-// GGML_API struct ggml_tensor * ggml_conv_1d(
-// struct ggml_context * ctx,
-// struct ggml_tensor * a,
-// struct ggml_tensor * b,
-// int s0,
-// int p0,
-// int d0) {
-// GGML_ASSERT(ggml_is_matrix(b));
-// GGML_ASSERT(a->ne[1] == b->ne[1]);
-// bool is_node = false;
-
-// if (a->grad || b->grad) {
-// GGML_ASSERT(false); // TODO: implement backward
-// is_node = true;
-// }
+ struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K]
-// const int64_t ne[4] = {
-// ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
-// a->ne[2], 1, 1,
-// };
-// struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+ struct ggml_tensor * result =
+ ggml_mul_mat(ctx,
+ ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K]
+ ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2])); // [OC,IC, K] => [OC, IC * K]
-// int32_t params[] = { s0, p0, d0 };
-// ggml_set_op_params(result, params, sizeof(params));
+ result = ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL]
-// result->op = GGML_OP_CONV_1D;
-// result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-// result->src[0] = a;
-// result->src[1] = b;
-
-// return result;
-// }
+ return result;
+}
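+// Illustrative shape walk-through (example values, not taken from the code):
+// a kernel a = [OC=8, IC=4, K=3] and input b = [N=1, IC=4, IL=16] with
+// s0 = d0 = p0 = 1 give OL = (16 + 2 - 2 - 1)/1 + 1 = 16, so im2col is
+// [N=1, OL=16, IC*K=12]; the two reshapes expose the shared IC*K dimension,
+// ggml_mul_mat contracts it (8 output channels for each of the 16 positions),
+// and the final reshape lays the result out as [N=1, OC=8, OL=16].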
// ggml_conv_1d_ph
// a: [OC,IC, KH, KW]
// b: [N, IC, IH, IW]
// result: [N, OH, OW, IC*KH*KW]
-static struct ggml_tensor * ggml_conv_2d_stage_0(
+struct ggml_tensor * ggml_im2col(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int p0,
int p1,
int d0,
- int d1) {
+ int d1,
+ bool is_2D) {
- GGML_ASSERT(a->ne[2] == b->ne[2]);
+ if (is_2D) {
+ GGML_ASSERT(a->ne[2] == b->ne[2]);
+ } else {
+ GGML_ASSERT(a->ne[1] == b->ne[1]);
+ }
bool is_node = false;
if (a->grad || b->grad) {
is_node = true;
}
- const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
- const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
+ const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
+ const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
const int64_t ne[4] = {
- a->ne[2] * a->ne[1] * a->ne[0],
+ is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
OW,
- OH,
- b->ne[3],
+ is_2D ? OH : b->ne[2],
+ is_2D ? b->ne[3] : 1,
};
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
- int32_t params[] = { s0, s1, p0, p1, d0, d1 };
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
+ int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
ggml_set_op_params(result, params, sizeof(params));
- result->op = GGML_OP_CONV_2D_STAGE_0;
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
- result->src[0] = a;
- result->src[1] = b;
-
- return result;
-
-}
-
-// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
-// a: [OC, IC, KH, KW]
-// b: [N, OH, OW, IC * KH * KW]
-// result: [N, OC, OH, OW]
-static struct ggml_tensor * ggml_conv_2d_stage_1(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b) {
-
- bool is_node = false;
-
- if (a->grad || b->grad) {
- GGML_ASSERT(false); // TODO: implement backward
- is_node = true;
- }
-
- const int64_t ne[4] = {
- b->ne[1],
- b->ne[2],
- a->ne[3],
- b->ne[3],
- };
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
- result->op = GGML_OP_CONV_2D_STAGE_1;
+ result->op = GGML_OP_IM2COL;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
-
}
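// Result layout: [N, OH, OW, IC*KH*KW] when is_2D, and [N, OL, IC*K] otherwise
// (the batch comes from b->ne[2] in the 1D case and ne[3] stays 1). op_params
// now carries 7 int32 values, { s0, s1, p0, p1, d0, d1, is_2D }, and the
// compute kernel reads the is_2D flag from index 6.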
// a: [OC,IC, KH, KW]
// b: [N, IC, IH, IW]
// result: [N, OC, OH, OW]
struct ggml_tensor * ggml_conv_2d(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b,
- int s0,
- int s1,
- int p0,
- int p1,
- int d0,
- int d1) {
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int s0,
+ int s1,
+ int p0,
+ int p1,
+ int d0,
+ int d1) {
+ struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
- struct ggml_tensor * result = ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW]
- result = ggml_conv_2d_stage_1(ctx, a, result);
+ struct ggml_tensor * result =
+ ggml_mul_mat(ctx,
+ ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
+ ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])); // [OC,IC, KH, KW] => [OC, IC * KH * KW]
- return result;
+ result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW]
+ return result;
}
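// Example with concrete values: a = [OC=16, IC=3, KH=3, KW=3] and
// b = [N=1, IC=3, IH=32, IW=32] with s = p = d = 1 give OH = OW = 32; im2col
// is [1, 32, 32, 27], the mul_mat contracts the IC*KH*KW = 27 values gathered
// per output position against the 16 filters, and the final reshape yields
// [N=1, OC=16, OH=32, OW=32].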
// ggml_conv_2d_sk_p0
// TODO: find the optimal values for these
if (ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) &&
+ src0->type == GGML_TYPE_F32 &&
+ src1->type == GGML_TYPE_F32 &&
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
/*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(type));
- GGML_ASSERT(nb10 == sizeof(float));
+ GGML_ASSERT(nb10 == ggml_type_size(src1->type));
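+ // src1 is no longer required to be F32 here; presumably this is what allows
+ // the F16 output of ggml_im2col to be fed directly into ggml_mul_mat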
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
}
}
-// ggml_compute_forward_conv_1d
+// ggml_compute_forward_conv_transpose_1d
-static void ggml_compute_forward_conv_1d_f16_f32(
+static void ggml_compute_forward_conv_transpose_1d_f16_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
const int ith = params->ith;
const int nth = params->nth;
- const int nk = ne00;
-
- // size of the convolution row - the kernel size unrolled across all input channels
- const int ew0 = nk*ne01;
-
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
- const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
- const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
+ const int nk = ne00*ne01*ne02;
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) {
memset(params->wdata, 0, params->wsize);
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+ // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
+ {
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
- for (int64_t i11 = 0; i11 < ne11; i11++) {
- const float * const src = (float *)((char *) src1->data + i11*nb11);
- ggml_fp16_t * dst_data = wdata;
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
+ ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
+ dst_data[i00*ne02 + i02] = src[i00];
+ }
+ }
+ }
+ }
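+ // wdata now holds the kernel with the flattened index
+ // (i01*ne00 + i00)*ne02 + i02, i.e. for each (output channel, kernel tap)
+ // pair the ne02 input-channel values are contiguous, which is what the
+ // ggml_vec_dot_f16 calls below rely on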
- for (int64_t i0 = 0; i0 < ne0; i0++) {
- for (int64_t ik = 0; ik < nk; ik++) {
- const int idx0 = i0*s0 + ik*d0 - p0;
+ // permute source data (src1) from (L x Cin) to (Cin x L)
+ {
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
+ ggml_fp16_t * dst_data = wdata;
- if(!(idx0 < 0 || idx0 >= ne10)) {
- dst_data[i0*ew0 + i11*nk + ik] = GGML_FP32_TO_FP16(src[idx0]);
- }
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
+ const float * const src = (float *)((char *) src1->data + i11*nb11);
+ for (int64_t i10 = 0; i10 < ne10; i10++) {
+ dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]);
}
}
}
+ // need to zero dst since we are accumulating into it
+ memset(dst->data, 0, ggml_nbytes(dst));
+
return;
}
return;
}
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+
// total rows in dst
- const int nr = ne2;
+ const int nr = ne1;
// rows per thread
const int dr = (nr + nth - 1)/nth;
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-
- for (int i2 = 0; i2 < ne2; i2++) {
- for (int i1 = ir0; i1 < ir1; i1++) {
- float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1);
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+ ggml_fp16_t * const wdata_src = wdata + nk;
- for (int i0 = 0; i0 < ne0; i0++) {
- ggml_vec_dot_f16(ew0, dst_data + i0,
- (ggml_fp16_t *) ((char *) src0->data + i1*nb02),
- (ggml_fp16_t *) wdata + i2*nb2 + i0*ew0);
+ for (int i1 = ir0; i1 < ir1; i1++) {
+ float * dst_data = (float *)((char *) dst->data + i1*nb1);
+ ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00;
+ for (int i10 = 0; i10 < ne10; i10++) {
+ const int i1n = i10*ne11;
+ for (int i00 = 0; i00 < ne00; i00++) {
+ float v = 0;
+ ggml_vec_dot_f16(ne02, &v,
+ (ggml_fp16_t *) wdata_src + i1n,
+ (ggml_fp16_t *) wdata_kernel + i00*ne02);
+ dst_data[i10*s0 + i00] += v;
}
}
}
}
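// In the loop above, each input position i10 scatters into output positions
// i10*s0 .. i10*s0 + ne00 - 1: for every kernel tap i00, v is the dot product
// over the ne02 input channels, accumulated into dst row i1 (one row per
// output channel). This is why dst is zeroed during GGML_TASK_INIT.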
-static void ggml_compute_forward_conv_1d_f32(
+static void ggml_compute_forward_conv_transpose_1d_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
const int ith = params->ith;
const int nth = params->nth;
- const int nk = ne00;
-
- const int ew0 = nk*ne01;
-
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
- const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
- const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
+ const int nk = ne00*ne01*ne02;
GGML_ASSERT(nb00 == sizeof(float));
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) {
memset(params->wdata, 0, params->wsize);
- float * const wdata = (float *) params->wdata + 0;
+ // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
+ {
+ float * const wdata = (float *) params->wdata + 0;
- for (int64_t i11 = 0; i11 < ne11; i11++) {
- const float * const src = (float *)((char *) src1->data + i11*nb11);
- float * dst_data = wdata;
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
+ const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
+ float * dst_data = wdata + i01*ne00*ne02;
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
+ dst_data[i00*ne02 + i02] = src[i00];
+ }
+ }
+ }
+ }
- for (int64_t i0 = 0; i0 < ne0; i0++) {
- for (int64_t ik = 0; ik < nk; ik++) {
- const int idx0 = i0*s0 + ik*d0 - p0;
+ // prepare source data (src1)
+ {
+ float * const wdata = (float *) params->wdata + nk;
+ float * dst_data = wdata;
- if(!(idx0 < 0 || idx0 >= ne10)) {
- dst_data[i0*ew0 + i11*nk + ik] = src[idx0];
- }
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
+ const float * const src = (float *)((char *) src1->data + i11*nb11);
+ for (int64_t i10 = 0; i10 < ne10; i10++) {
+ dst_data[i10*ne11 + i11] = src[i10];
}
}
}
+ // need to zero dst since we are accumulating into it
+ memset(dst->data, 0, ggml_nbytes(dst));
+
return;
}
return;
}
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+
// total rows in dst
- const int nr = ne02;
+ const int nr = ne1;
// rows per thread
const int dr = (nr + nth - 1)/nth;
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
- float * const wdata = (float *) params->wdata + 0;
-
- for (int i2 = 0; i2 < ne2; i2++) {
- for (int i1 = ir0; i1 < ir1; i1++) {
- float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1);
+ float * const wdata = (float *) params->wdata + 0;
+ float * const wdata_src = wdata + nk;
- for (int i0 = 0; i0 < ne0; i0++) {
- ggml_vec_dot_f32(ew0, dst_data + i0,
- (float *) ((char *) src0->data + i1*nb02),
- (float *) wdata + i2*nb2 + i0*ew0);
+ for (int i1 = ir0; i1 < ir1; i1++) {
+ float * dst_data = (float *)((char *) dst->data + i1*nb1);
+ float * wdata_kernel = wdata + i1*ne02*ne00;
+ for (int i10 = 0; i10 < ne10; i10++) {
+ const int i1n = i10*ne11;
+ for (int i00 = 0; i00 < ne00; i00++) {
+ float v = 0;
+ ggml_vec_dot_f32(ne02, &v,
+ wdata_src + i1n,
+ wdata_kernel + i00*ne02);
+ dst_data[i10*s0 + i00] += v;
}
}
}
}
-// TODO: reuse ggml_mul_mat or implement ggml_im2col and remove stage_0 and stage_1
-static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k,
- ggml_fp16_t * A,
- ggml_fp16_t * B,
- float * C,
- const int ith, const int nth) {
- // does not seem to make a difference
- int64_t m0, m1, n0, n1;
- // patches per thread
- if (m > n) {
- n0 = 0;
- n1 = n;
-
- // total patches in dst
- const int np = m;
-
- // patches per thread
- const int dp = (np + nth - 1)/nth;
-
- // patch range for this thread
- m0 = dp*ith;
- m1 = MIN(m0 + dp, np);
- } else {
- m0 = 0;
- m1 = m;
-
- // total patches in dst
- const int np = n;
-
- // patches per thread
- const int dp = (np + nth - 1)/nth;
-
- // patch range for this thread
- n0 = dp*ith;
- n1 = MIN(n0 + dp, np);
- }
-
- // block-tiling attempt
- int64_t blck_n = 16;
- int64_t blck_m = 16;
-
- // int64_t CACHE_SIZE = 2 * 1024 * 1024; // 2MB
- // int64_t blck_size = CACHE_SIZE / (sizeof(float) + 2 * sizeof(ggml_fp16_t) * K);
- // if (blck_size > 0) {
- // blck_0 = 4;
- // blck_1 = blck_size / blck_0;
- // if (blck_1 < 0) {
- // blck_1 = 1;
- // }
- // // blck_0 = (int64_t)sqrt(blck_size);
- // // blck_1 = blck_0;
- // }
- // // printf("%zd %zd %zd %zd\n", blck_size, K, blck_0, blck_1);
-
- for (int j = n0; j < n1; j+=blck_n) {
- for (int i = m0; i < m1; i+=blck_m) {
- // printf("i j k => %d %d %d\n", i, j, K);
- for (int ii = i; ii < i + blck_m && ii < m1; ii++) {
- for (int jj = j; jj < j + blck_n && jj < n1; jj++) {
- ggml_vec_dot_f16(k,
- C + ii*n + jj,
- A + ii * k,
- B + jj * k);
- }
- }
- }
+static void ggml_compute_forward_conv_transpose_1d(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F16:
+ {
+ ggml_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst);
+ } break;
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
}
}
-// src0: kernel [OC, IC, K]
-// src1: signal [N, IC, IL]
-// dst: result [N, OL, IC*K]
-static void ggml_compute_forward_conv_1d_stage_0_f32(
+// src0: kernel [OC, IC, KH, KW]
+// src1: image [N, IC, IH, IW]
+// dst: result [N, OH, OW, IC*KH*KW]
+static void ggml_compute_forward_im2col_f16(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
GGML_TENSOR_BINARY_OP_LOCALS;
- const int64_t N = ne12;
- const int64_t IC = ne11;
- const int64_t IL = ne10;
-
- const int64_t K = ne00;
-
- const int64_t OL = ne1;
+ const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+ const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+ const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+ const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
+ const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
+ const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
+ const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
const int ith = params->ith;
const int nth = params->nth;
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
- const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
- const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
+ const int64_t N = is_2D ? ne13 : ne12;
+ const int64_t IC = is_2D ? ne12 : ne11;
+ const int64_t IH = is_2D ? ne11 : 1;
+ const int64_t IW = ne10;
+
+ const int64_t KH = is_2D ? ne01 : 1;
+ const int64_t KW = ne00;
+
+ const int64_t OH = is_2D ? ne2 : 1;
+ const int64_t OW = ne1;
+
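+ // strides into src1: ofs0 steps over the batch dimension and ofs1 over the
+ // input channels (for the 1D case these shift down by one dimension)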
+ int ofs0 = is_2D ? nb13 : nb12;
+ int ofs1 = is_2D ? nb12 : nb11;
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) {
- memset(dst->data, 0, ggml_nbytes(dst));
return;
}
return;
}
- // im2col: [N, IC, IL] => [N, OL, IC*K]
+ // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
{
ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
for (int64_t in = 0; in < N; in++) {
- for (int64_t iol = 0; iol < OL; iol++) {
- for (int64_t iic = ith; iic < IC; iic+=nth) {
-
- // micro kernel
- ggml_fp16_t * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K]
- const float * const src_data = (float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL]
-
- for (int64_t ik = 0; ik < K; ik++) {
- const int64_t iil = iol*s0 + ik*d0 - p0;
-
- if (!(iil < 0 || iil >= IL)) {
- dst_data[iic*K + ik] = GGML_FP32_TO_FP16(src_data[iil]);
- }
- }
- }
- }
- }
- }
-}
-
-// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
-// src0: [OC, IC, K]
-// src1: [N, OL, IC * K]
-// result: [N, OC, OL]
-static void ggml_compute_forward_conv_1d_stage_1_f16(
- const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
- GGML_ASSERT(src1->type == GGML_TYPE_F16);
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
- int64_t t0 = ggml_perf_time_us();
- UNUSED(t0);
-
- if (params->type == GGML_TASK_INIT) {
- return;
- }
-
- if (params->type == GGML_TASK_FINALIZE) {
- return;
- }
-
- GGML_TENSOR_BINARY_OP_LOCALS;
-
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
- GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
- GGML_ASSERT(nb0 == sizeof(float));
-
- const int N = ne12;
- const int OL = ne11;
-
- const int OC = ne02;
- const int IC = ne01;
- const int K = ne00;
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- int64_t m = OC;
- int64_t n = OL;
- int64_t k = IC * K;
-
- // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
- for (int i = 0; i < N; i++) {
- ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
- ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k]
- float * C = (float *)dst->data + i * m * n; // [m, n]
-
- gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
- }
-}
-
-static void ggml_compute_forward_conv_1d(
- const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
- switch(src0->type) {
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_conv_1d_f16_f32(params, src0, src1, dst);
- } break;
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_conv_1d_f32(params, src0, src1, dst);
- } break;
- default:
- {
- GGML_ASSERT(false);
- } break;
- }
-}
-
-static void ggml_compute_forward_conv_1d_stage_0(
- const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
- switch(src0->type) {
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_conv_1d_stage_0_f32(params, src0, src1, dst);
- } break;
- default:
- {
- GGML_ASSERT(false);
- } break;
- }
-}
-
-static void ggml_compute_forward_conv_1d_stage_1(
- const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
- switch(src0->type) {
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_conv_1d_stage_1_f16(params, src0, src1, dst);
- } break;
- default:
- {
- GGML_ASSERT(false);
- } break;
- }
-}
-
-// ggml_compute_forward_conv_transpose_1d
-
-static void ggml_compute_forward_conv_transpose_1d_f16_f32(
- const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
- int64_t t0 = ggml_perf_time_us();
- UNUSED(t0);
-
- GGML_TENSOR_BINARY_OP_LOCALS
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- const int nk = ne00*ne01*ne02;
-
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
- GGML_ASSERT(nb10 == sizeof(float));
-
- if (params->type == GGML_TASK_INIT) {
- memset(params->wdata, 0, params->wsize);
-
- // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
- {
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-
- for (int64_t i02 = 0; i02 < ne02; i02++) {
- for (int64_t i01 = 0; i01 < ne01; i01++) {
- const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
- ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
- for (int64_t i00 = 0; i00 < ne00; i00++) {
- dst_data[i00*ne02 + i02] = src[i00];
- }
- }
- }
- }
-
- // permute source data (src1) from (L x Cin) to (Cin x L)
- {
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
- ggml_fp16_t * dst_data = wdata;
-
- for (int64_t i11 = 0; i11 < ne11; i11++) {
- const float * const src = (float *)((char *) src1->data + i11*nb11);
- for (int64_t i10 = 0; i10 < ne10; i10++) {
- dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]);
- }
- }
- }
-
- // need to zero dst since we are accumulating into it
- memset(dst->data, 0, ggml_nbytes(dst));
-
- return;
- }
-
- if (params->type == GGML_TASK_FINALIZE) {
- return;
- }
-
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-
- // total rows in dst
- const int nr = ne1;
-
- // rows per thread
- const int dr = (nr + nth - 1)/nth;
-
- // row range for this thread
- const int ir0 = dr*ith;
- const int ir1 = MIN(ir0 + dr, nr);
-
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
- ggml_fp16_t * const wdata_src = wdata + nk;
-
- for (int i1 = ir0; i1 < ir1; i1++) {
- float * dst_data = (float *)((char *) dst->data + i1*nb1);
- ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00;
- for (int i10 = 0; i10 < ne10; i10++) {
- const int i1n = i10*ne11;
- for (int i00 = 0; i00 < ne00; i00++) {
- float v = 0;
- ggml_vec_dot_f16(ne02, &v,
- (ggml_fp16_t *) wdata_src + i1n,
- (ggml_fp16_t *) wdata_kernel + i00*ne02);
- dst_data[i10*s0 + i00] += v;
- }
- }
- }
-}
-
-static void ggml_compute_forward_conv_transpose_1d_f32(
- const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
- int64_t t0 = ggml_perf_time_us();
- UNUSED(t0);
-
- GGML_TENSOR_BINARY_OP_LOCALS
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- const int nk = ne00*ne01*ne02;
-
- GGML_ASSERT(nb00 == sizeof(float));
- GGML_ASSERT(nb10 == sizeof(float));
-
- if (params->type == GGML_TASK_INIT) {
- memset(params->wdata, 0, params->wsize);
-
- // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
- {
- float * const wdata = (float *) params->wdata + 0;
-
- for (int64_t i02 = 0; i02 < ne02; i02++) {
- for (int64_t i01 = 0; i01 < ne01; i01++) {
- const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
- float * dst_data = wdata + i01*ne00*ne02;
- for (int64_t i00 = 0; i00 < ne00; i00++) {
- dst_data[i00*ne02 + i02] = src[i00];
- }
- }
- }
- }
-
- // prepare source data (src1)
- {
- float * const wdata = (float *) params->wdata + nk;
- float * dst_data = wdata;
-
- for (int64_t i11 = 0; i11 < ne11; i11++) {
- const float * const src = (float *)((char *) src1->data + i11*nb11);
- for (int64_t i10 = 0; i10 < ne10; i10++) {
- dst_data[i10*ne11 + i11] = src[i10];
- }
- }
- }
-
- // need to zero dst since we are accumulating into it
- memset(dst->data, 0, ggml_nbytes(dst));
-
- return;
- }
-
- if (params->type == GGML_TASK_FINALIZE) {
- return;
- }
-
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-
- // total rows in dst
- const int nr = ne1;
-
- // rows per thread
- const int dr = (nr + nth - 1)/nth;
-
- // row range for this thread
- const int ir0 = dr*ith;
- const int ir1 = MIN(ir0 + dr, nr);
-
- float * const wdata = (float *) params->wdata + 0;
- float * const wdata_src = wdata + nk;
-
- for (int i1 = ir0; i1 < ir1; i1++) {
- float * dst_data = (float *)((char *) dst->data + i1*nb1);
- float * wdata_kernel = wdata + i1*ne02*ne00;
- for (int i10 = 0; i10 < ne10; i10++) {
- const int i1n = i10*ne11;
- for (int i00 = 0; i00 < ne00; i00++) {
- float v = 0;
- ggml_vec_dot_f32(ne02, &v,
- wdata_src + i1n,
- wdata_kernel + i00*ne02);
- dst_data[i10*s0 + i00] += v;
- }
- }
- }
-}
-
-static void ggml_compute_forward_conv_transpose_1d(
- const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
- switch (src0->type) {
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst);
- } break;
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst);
- } break;
- default:
- {
- GGML_ASSERT(false);
- } break;
- }
-}
-
-// ggml_compute_forward_conv_2d
-
-// src0: kernel [OC, IC, KH, KW]
-// src1: image [N, IC, IH, IW]
-// dst: result [N, OH, OW, IC*KH*KW]
-static void ggml_compute_forward_conv_2d_stage_0_f32(
- const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
- GGML_ASSERT( dst->type == GGML_TYPE_F16);
-
- int64_t t0 = ggml_perf_time_us();
- UNUSED(t0);
-
- GGML_TENSOR_BINARY_OP_LOCALS;
-
- const int64_t N = ne13;
- const int64_t IC = ne12;
- const int64_t IH = ne11;
- const int64_t IW = ne10;
-
- // const int64_t OC = ne03;
- // const int64_t IC = ne02;
- const int64_t KH = ne01;
- const int64_t KW = ne00;
-
- const int64_t OH = ne2;
- const int64_t OW = ne1;
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
- const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
- const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
- const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
- const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
- const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
-
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
- GGML_ASSERT(nb10 == sizeof(float));
-
- if (params->type == GGML_TASK_INIT) {
- memset(dst->data, 0, ggml_nbytes(dst));
- return;
- }
-
- if (params->type == GGML_TASK_FINALIZE) {
- return;
- }
-
- // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
- {
- ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
-
- for (int64_t in = 0; in < N; in++) {
- for (int64_t ioh = 0; ioh < OH; ioh++) {
- for (int64_t iow = 0; iow < OW; iow++) {
- for (int64_t iic = ith; iic < IC; iic+=nth) {
+ for (int64_t ioh = 0; ioh < OH; ioh++) { // OH == 1 when !is_2D
+ for (int64_t iow = 0; iow < OW; iow++) {
+ for (int64_t iic = ith; iic < IC; iic += nth) {
// micro kernel
ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
- const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
+ const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
- for (int64_t ikh = 0; ikh < KH; ikh++) {
+ for (int64_t ikh = 0; ikh < KH; ikh++) { // KH == 1 when !is_2D
for (int64_t ikw = 0; ikw < KW; ikw++) {
const int64_t iiw = iow*s0 + ikw*d0 - p0;
const int64_t iih = ioh*s1 + ikh*d1 - p1;
- if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
+ if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+ dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
+ } else {
dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
}
}
}
}
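// Each dst row holds the IC*KH*KW values gathered for one (in, ioh, iow)
// position; threads split the channel loop (iic += nth), so they write
// disjoint slices of the same row and need no synchronization.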
-// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
-// src0: [OC, IC, KH, KW]
-// src1: [N, OH, OW, IC * KH * KW]
-// result: [N, OC, OH, OW]
-static void ggml_compute_forward_conv_2d_stage_1_f16(
- const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
- GGML_ASSERT(src1->type == GGML_TYPE_F16);
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
- int64_t t0 = ggml_perf_time_us();
- UNUSED(t0);
-
- if (params->type == GGML_TASK_INIT) {
- return;
- }
-
- if (params->type == GGML_TASK_FINALIZE) {
- return;
- }
-
- GGML_TENSOR_BINARY_OP_LOCALS;
-
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
- GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
- GGML_ASSERT(nb0 == sizeof(float));
-
- const int N = ne13;
- const int OH = ne12;
- const int OW = ne11;
-
- const int OC = ne03;
- const int IC = ne02;
- const int KH = ne01;
- const int KW = ne00;
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- int64_t m = OC;
- int64_t n = OH * OW;
- int64_t k = IC * KH * KW;
-
- // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
- for (int i = 0; i < N; i++) {
- ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
- ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k]
- float * C = (float *)dst->data + i * m * n; // [m, n]
-
- gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
- }
-}
-
-static void ggml_compute_forward_conv_2d_f16_f32(
- const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
- int64_t t0 = ggml_perf_time_us();
- UNUSED(t0);
-
- GGML_TENSOR_BINARY_OP_LOCALS
-
- // src1: image [N, IC, IH, IW]
- // src0: kernel [OC, IC, KH, KW]
- // dst: result [N, OC, OH, OW]
- // ne12: IC
- // ne0: OW
- // ne1: OH
- // nk0: KW
- // nk1: KH
- // ne13: N
-
- const int N = ne13;
- const int IC = ne12;
- const int IH = ne11;
- const int IW = ne10;
-
- const int OC = ne03;
- // const int IC = ne02;
- const int KH = ne01;
- const int KW = ne00;
-
- const int OH = ne1;
- const int OW = ne0;
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- // const int nk0 = ne00;
- // const int nk1 = ne01;
-
- // size of the convolution row - the kernel size unrolled across all channels
- // const int ew0 = nk0*nk1*ne02;
- // ew0: IC*KH*KW
-
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
- const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
- const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
- const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
- const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
- const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
-
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
- GGML_ASSERT(nb10 == sizeof(float));
-
- if (params->type == GGML_TASK_INIT) {
- memset(params->wdata, 0, params->wsize);
-
- // prepare source data (src1)
- // im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW]
-
- {
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-
- for (int in = 0; in < N; in++) {
- for (int iic = 0; iic < IC; iic++) {
- for (int ioh = 0; ioh < OH; ioh++) {
- for (int iow = 0; iow < OW; iow++) {
-
- // micro kernel
- ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
- const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
-
- for (int ikh = 0; ikh < KH; ikh++) {
- for (int ikw = 0; ikw < KW; ikw++) {
- const int iiw = iow*s0 + ikw*d0 - p0;
- const int iih = ioh*s1 + ikh*d1 - p1;
-
- if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
- dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
- }
- }
- }
- }
- }
- }
- }
- }
-
- return;
- }
-
- if (params->type == GGML_TASK_FINALIZE) {
- return;
- }
-
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
- // wdata: [N*OH*OW, IC*KH*KW]
- // dst: result [N, OC, OH, OW]
- // src0: kernel [OC, IC, KH, KW]
-
- int64_t m = OC;
- int64_t n = OH * OW;
- int64_t k = IC * KH * KW;
-
- // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
- for (int i = 0; i < N; i++) {
- ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
- ggml_fp16_t * B = (ggml_fp16_t *)wdata + i * m * k; // [n, k]
- float * C = (float *)dst->data + i * m * n; // [m * k]
-
- gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
- }
-}
-
-static void ggml_compute_forward_conv_2d(
- const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
- switch (src0->type) {
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst);
- } break;
- case GGML_TYPE_F32:
- {
- //ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
- GGML_ASSERT(false);
- } break;
- default:
- {
- GGML_ASSERT(false);
- } break;
- }
-}
-
-static void ggml_compute_forward_conv_2d_stage_0(
- const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
- switch (src0->type) {
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst);
- } break;
- case GGML_TYPE_F32:
- {
- GGML_ASSERT(false);
- } break;
- default:
- {
- GGML_ASSERT(false);
- } break;
- }
-}
-
-static void ggml_compute_forward_conv_2d_stage_1(
+static void ggml_compute_forward_im2col(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
switch (src0->type) {
case GGML_TYPE_F16:
{
- ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst);
+ ggml_compute_forward_im2col_f16(params, src0, src1, dst);
} break;
case GGML_TYPE_F32:
{
{
ggml_compute_forward_clamp(params, tensor->src[0], tensor);
} break;
- case GGML_OP_CONV_1D:
- {
- ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor);
- } break;
- case GGML_OP_CONV_1D_STAGE_0:
- {
- ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
- } break;
- case GGML_OP_CONV_1D_STAGE_1:
- {
- ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
- } break;
case GGML_OP_CONV_TRANSPOSE_1D:
{
ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor);
} break;
- case GGML_OP_CONV_2D:
- {
- ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
- } break;
- case GGML_OP_CONV_2D_STAGE_0:
- {
- ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
- } break;
- case GGML_OP_CONV_2D_STAGE_1:
+ case GGML_OP_IM2COL:
{
- ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor);
} break;
case GGML_OP_CONV_TRANSPOSE_2D:
{
{
GGML_ASSERT(false); // TODO: not implemented
} break;
- case GGML_OP_CONV_1D:
- {
- GGML_ASSERT(false); // TODO: not implemented
- } break;
- case GGML_OP_CONV_1D_STAGE_0:
- {
- GGML_ASSERT(false); // TODO: not implemented
- } break;
- case GGML_OP_CONV_1D_STAGE_1:
- {
- GGML_ASSERT(false); // TODO: not implemented
- } break;
case GGML_OP_CONV_TRANSPOSE_1D:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
- case GGML_OP_CONV_2D:
- {
- GGML_ASSERT(false); // TODO: not implemented
- } break;
- case GGML_OP_CONV_2D_STAGE_0:
- {
- GGML_ASSERT(false); // TODO: not implemented
- } break;
- case GGML_OP_CONV_2D_STAGE_1:
+ case GGML_OP_IM2COL:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
{
n_tasks = 1; //TODO
} break;
- case GGML_OP_CONV_1D:
- {
- n_tasks = n_threads;
- } break;
- case GGML_OP_CONV_1D_STAGE_0:
- {
- n_tasks = n_threads;
- } break;
- case GGML_OP_CONV_1D_STAGE_1:
- {
- n_tasks = n_threads;
- } break;
case GGML_OP_CONV_TRANSPOSE_1D:
{
n_tasks = n_threads;
} break;
- case GGML_OP_CONV_2D:
- {
- n_tasks = n_threads;
- } break;
- case GGML_OP_CONV_2D_STAGE_0:
- {
- n_tasks = n_threads;
- } break;
- case GGML_OP_CONV_2D_STAGE_1:
+ case GGML_OP_IM2COL:
{
n_tasks = n_threads;
} break;
} break;
default:
{
+ printf("%s: op %s not implemented\n", __func__, ggml_op_name(node->op));
GGML_ASSERT(false);
} break;
}
cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
}
} break;
- case GGML_OP_CONV_1D:
- {
- GGML_ASSERT(node->src[0]->ne[3] == 1);
- GGML_ASSERT(node->src[1]->ne[2] == 1);
- GGML_ASSERT(node->src[1]->ne[3] == 1);
-
- const int64_t ne00 = node->src[0]->ne[0];
- const int64_t ne01 = node->src[0]->ne[1];
- const int64_t ne02 = node->src[0]->ne[2];
-
- const int64_t ne10 = node->src[1]->ne[0];
- const int64_t ne11 = node->src[1]->ne[1];
-
- const int64_t ne0 = node->ne[0];
- const int64_t ne1 = node->ne[1];
- const int64_t nk = ne00;
- const int64_t ew0 = nk * ne01;
-
- UNUSED(ne02);
- UNUSED(ne10);
- UNUSED(ne11);
-
- if (node->src[0]->type == GGML_TYPE_F16 &&
- node->src[1]->type == GGML_TYPE_F32) {
- cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
- } else if (node->src[0]->type == GGML_TYPE_F32 &&
- node->src[1]->type == GGML_TYPE_F32) {
- cur = sizeof(float)*(ne0*ne1*ew0);
- } else {
- GGML_ASSERT(false);
- }
- } break;
case GGML_OP_CONV_TRANSPOSE_1D:
{
GGML_ASSERT(node->src[0]->ne[3] == 1);
GGML_ASSERT(false);
}
} break;
- case GGML_OP_CONV_2D:
+ case GGML_OP_IM2COL:
{
- const int64_t ne00 = node->src[0]->ne[0]; // W
- const int64_t ne01 = node->src[0]->ne[1]; // H
- const int64_t ne02 = node->src[0]->ne[2]; // C
- const int64_t ne03 = node->src[0]->ne[3]; // N
-
- const int64_t ne10 = node->src[1]->ne[0]; // W
- const int64_t ne11 = node->src[1]->ne[1]; // H
- const int64_t ne12 = node->src[1]->ne[2]; // C
-
- const int64_t ne0 = node->ne[0];
- const int64_t ne1 = node->ne[1];
- const int64_t ne2 = node->ne[2];
- const int64_t ne3 = node->ne[3];
- const int64_t nk = ne00*ne01;
- const int64_t ew0 = nk * ne02;
-
- UNUSED(ne03);
- UNUSED(ne2);
-
- if (node->src[0]->type == GGML_TYPE_F16 &&
- node->src[1]->type == GGML_TYPE_F32) {
- // im2col: [N*OH*OW, IC*KH*KW]
- cur = sizeof(ggml_fp16_t)*(ne3*ne0*ne1*ew0);
- } else if (node->src[0]->type == GGML_TYPE_F32 &&
- node->src[1]->type == GGML_TYPE_F32) {
- cur = sizeof(float)* (ne10*ne11*ne12);
- } else {
- GGML_ASSERT(false);
- }
+ n_tasks = n_threads;
} break;
case GGML_OP_CONV_TRANSPOSE_2D:
{