return x;
}
+#define GGML_FP16_TO_FP32(x) (x)
+#define GGML_FP32_TO_FP16(x) (x)
+
#else
#ifdef __wasm_simd128__
return _cvtss_sh(f, 0);
}
+#define GGML_FP16_TO_FP32(x) _cvtsh_ss(x)
+#define GGML_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+
#else
static inline float fp32_from_bits(uint32_t w) {
const uint32_t nonsign = exp_bits + mantissa_bits;
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}
-#endif
-#endif
+
+// Generic fallback: no hardware fp16 conversion is used in this branch, so the
+// conversion macros simply forward to the ggml_fp16_to_fp32 / ggml_fp32_to_fp16
+// functions. The macro names must match the other branches exactly, because
+// every call site uses GGML_FP16_TO_FP32 / GGML_FP32_TO_FP16.
+#define GGML_FP16_TO_FP32(x) ggml_fp16_to_fp32(x)
+#define GGML_FP32_TO_FP16(x) ggml_fp32_to_fp16(x)
+
+#endif // __F16C__
+
+#endif // __ARM_NEON
//
// global data
// leftovers
for (int i = n32; i < n; ++i) {
- sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
+ sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
#elif defined(__AVX2__)
// AVX 256-bit
// leftovers
for (int i = n32; i < n; ++i) {
//GGML_ASSERT(false);
- sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
+ sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
#elif defined(__AVX__)
// AVX 256-bit
// leftovers
for (int i = n32; i < n; ++i) {
//GGML_ASSERT(false);
- sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
+ sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
#elif defined(__wasm_simd128__)
// WASM 128-bit
for (int i = 0; i < n16; i += 16) {
for (int k = 0; k < 16; ++k) {
- tx[k] = ggml_fp16_to_fp32(x[i + k]);
- ty[k] = ggml_fp16_to_fp32(y[i + k]);
+ tx[k] = GGML_FP16_TO_FP32(x[i + k]);
+ ty[k] = GGML_FP16_TO_FP32(y[i + k]);
}
x0 = wasm_v128_load(tx + 0);
// leftovers
for (int i = n16; i < n; ++i) {
//GGML_ASSERT(false);
- sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
+ sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
#else
for (int i = 0; i < n; ++i) {
- sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
+ sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
#endif
// leftovers
for (int i = n32; i < n; ++i) {
GGML_ASSERT(false);
- y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
+ y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#elif defined(__AVX2__)
// AVX 256-bit
// leftovers
for (int i = n32; i < n; ++i) {
GGML_ASSERT(false);
- y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
+ y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#elif defined(__AVX__)
// AVX 256-bit
// leftovers
for (int i = n32; i < n; ++i) {
GGML_ASSERT(false);
- y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
+ y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#elif defined(__wasm_simd128__)
// WASM SIMD 128-bit
for (int i = 0; i < n16; i += 16) {
for (int k = 0; k < 16; ++k) {
- tx[k] = ggml_fp16_to_fp32(x[i + k]);
- ty[k] = ggml_fp16_to_fp32(y[i + k]);
+ tx[k] = GGML_FP16_TO_FP32(x[i + k]);
+ ty[k] = GGML_FP16_TO_FP32(y[i + k]);
}
x0 = wasm_v128_load(tx + 0);
wasm_v128_store(ty + 12, y3);
for (int k = 0; k < 16; ++k) {
- y[i + k] = ggml_fp32_to_fp16(ty[k]);
+ y[i + k] = GGML_FP32_TO_FP16(ty[k]);
}
}
// leftovers
for (int i = n16; i < n; ++i) {
GGML_ASSERT(false);
- y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
+ y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#else
for (int i = 0; i < n; ++i) {
- y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
+ y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#endif
}
// Fills y[0..n) from x[0..n) using the table_gelu_f16 lookup table:
// each x[i] is converted to fp16, the fp16 bit pattern is copied into a
// uint16_t and used as the table index, and the table entry is converted
// back to float before being stored in y[i].
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
uint16_t t;
for (int i = 0; i < n; ++i) {
- ggml_fp16_t fp16 = ggml_fp32_to_fp16(x[i]);
+ ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
// memcpy reinterprets the fp16 bits as an integer index without a pointer cast
memcpy(&t, &fp16, sizeof(uint16_t));
- y[i] = ggml_fp16_to_fp32(table_gelu_f16[t]);
+ y[i] = GGML_FP16_TO_FP32(table_gelu_f16[t]);
}
}
#else
for (int i = 0; i < (1 << 16); ++i) {
uint16_t ui = i;
memcpy(&ii, &ui, sizeof(ii));
- const float f = ggml_fp16_to_fp32(ii);
- table_gelu_f16[i] = ggml_fp32_to_fp16(ggml_gelu_f32(f));
- table_exp_f16[i] = ggml_fp32_to_fp16(exp(f));
+ const float f = GGML_FP16_TO_FP32(ii);
+ table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
+ table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f));
}
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
- return ggml_fp16_to_fp32(((ggml_fp16_t *)(tensor->data))[i]);
+ return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
} break;
case GGML_TYPE_F32:
{
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
- ((ggml_fp16_t *)(tensor->data))[i] = ggml_fp32_to_fp16(value);
+ ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
} break;
case GGML_TYPE_F32:
{
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
- return ggml_fp16_to_fp32(((ggml_fp16_t *)(tensor->data))[i]);
+ return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
} break;
case GGML_TYPE_F32:
{
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
- ((ggml_fp16_t *)(tensor->data))[i] = ggml_fp32_to_fp16(value);
+ ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
} break;
case GGML_TYPE_F32:
{
for (int i00 = 0; i00 < ne00; i00++) {
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
- dst_ptr[id] = ggml_fp32_to_fp16(*src0_ptr);
+ dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
id++;
}
}
for (int i00 = 0; i00 < ne00; i00++) {
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
- dst_ptr[id] = ggml_fp32_to_fp16(*src0_ptr);
+ dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
id++;
}
}
int id = 0;
for (int i01 = 0; i01 < ne01; ++i01) {
for (int i00 = 0; i00 < ne00; ++i00) {
- wdata[id++] = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
+ wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
}
}
}
for (int i12 = 0; i12 < ne12; ++i12) {
for (int i11 = 0; i11 < ne11; ++i11) {
for (int i10 = 0; i10 < ne10; ++i10) {
- wdata[id++] = ggml_fp32_to_fp16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
+ wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
}
}
}
const int ic1 = MIN(ic0 + dc, ne);
for (int i = ic0; i < ic1; ++i) {
- ((float *) dst->data)[i] = ggml_fp16_to_fp32(wdata[i]);
+ ((float *) dst->data)[i] = GGML_FP16_TO_FP32(wdata[i]);
}
for (int k = 1; k < nth; k++) {
for (int i = ic0; i < ic1; ++i) {
- ((float *) dst->data)[i] += ggml_fp16_to_fp32(wdata[(ne + CACHE_LINE_SIZE_F32)*k + i]);
+ ((float *) dst->data)[i] += GGML_FP16_TO_FP32(wdata[(ne + CACHE_LINE_SIZE_F32)*k + i]);
}
}
for (int j = 0; j < nc; ++j) {
ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j];
- ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = ggml_fp16_to_fp32(v);
+ ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v);
}
}
}
p[i] = 0.0;
} else {
//const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
- ggml_fp16_t s = ggml_fp32_to_fp16(p[i] - max);
+ ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max);
memcpy(&ss, &s, sizeof(ss));
- const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
+ const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
sum += val;
p[i] = val;
}
const float * const src = (float *)((char *) src1->data + i11*nb11);
ggml_fp16_t * dst_data = wdata;
for (int i10 = 0; i10 < ne10; i10++) {
- dst_data[(i10 + nh)*ew0 + i11] = ggml_fp32_to_fp16(src[i10]);
+ dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
}
}
}
const float * const src = (float *)((char *) src1->data + i11*nb11);
ggml_fp16_t * dst_data = wdata;
for (int i10 = 0; i10 < ne10; i10++) {
- dst_data[(i10 + nh)*ew0 + i11] = ggml_fp32_to_fp16(src[i10]);
+ dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
}
}
}
S[i] = 0.0;
} else {
//const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
- ggml_fp16_t s = ggml_fp32_to_fp16(S[i] - max);
+ ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
memcpy(&ss, &s, sizeof(ss));
- const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
+ const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
sum += val;
S[i] = val;
}
S[i] = 0.0;
} else {
//const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
- ggml_fp16_t s = ggml_fp32_to_fp16(S[i] - max);
+ ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
memcpy(&ss, &s, sizeof(ss));
- const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
+ const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
sum += val;
S[i] = val;
}
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
for (int i = 0; i < M; i++) {
- S16[i] = ggml_fp32_to_fp16(S[i]);
+ S16[i] = GGML_FP32_TO_FP16(S[i]);
}
for (int ic = 0; ic < nev1; ++ic) {
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
for (int i = 0; i < M; i++) {
- S16[i] = ggml_fp32_to_fp16(S[i]);
+ S16[i] = GGML_FP32_TO_FP16(S[i]);
}
ggml_vec_gelu_f16(neb01, S16, S16);