From: Georgi Gerganov Date: Fri, 28 Apr 2023 17:47:27 +0000 (+0300) Subject: ggml : ggml_alibi() fixes (#113) X-Git-Tag: upstream/0.0.1642~1511 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=583c5a3ad6bdb041bff5ad161a49ff4d8fa52f10;p=pkg%2Fggml%2Fsources%2Fggml ggml : ggml_alibi() fixes (#113) --- diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 25628beb..c3754040 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -9,6 +9,7 @@ if (GGML_ALL_WARNINGS) -Wcast-qual \ -Wstrict-prototypes \ -Wpointer-arith \ + -Wdouble-promotion \ -Wno-unused-function \ ") else() diff --git a/src/ggml.c b/src/ggml.c index a9650086..53796bd9 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -6099,7 +6099,7 @@ struct ggml_tensor * ggml_alibi( //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); struct ggml_tensor * result = ggml_view_tensor(ctx, a); - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); ((int32_t *) b->data)[0] = n_past; ((int32_t *) b->data)[1] = n_head; @@ -9340,7 +9340,7 @@ static void ggml_compute_forward_alibi_f32( struct ggml_tensor * dst) { assert(params->ith == 0); assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); + assert(ggml_nelements(src1) == 2); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -9348,12 +9348,11 @@ static void ggml_compute_forward_alibi_f32( const int n_past = ((int32_t *) src1->data)[0]; const int n_head = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 const int ne1 = src0->ne[1]; // seq_len_without_past - const int ne2 = src0->ne[2]; // n_head -> this is k - const int ne3 = src0->ne[3]; // 1 -> bsz + //const int ne2 = src0->ne[2]; // n_head -> this is k + //const int ne3 = src0->ne[3]; // 1 -> bsz const int n = ggml_nrows(src0); const int ne2_ne3 = n/ne1; // ne2*ne3 @@ -9361,36 +9360,37 @@ static void ggml_compute_forward_alibi_f32( const int nb0 = src0->nb[0]; const int nb1 = src0->nb[1]; const int nb2 = src0->nb[2]; - const int nb3 = src0->nb[3]; + //const int nb3 = src0->nb[3]; assert(nb0 == sizeof(float)); assert(ne1+n_past == ne0); // add alibi to src0 (KQ_scaled) const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); - const float m0 = pow(2.0, -8.0 / n_heads_log2_floor); - const float m1 = pow(2.0, -4.0 / n_heads_log2_floor); + + const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor); + const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor); for (int i = 0; i < ne0; i++) { for (int j = 0; j < ne1; j++) { for (int k = 0; k < ne2_ne3; k++) { float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); - float * dst_data = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); + float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); // TODO: k*nb2 or k*nb3 float m_k; + if (k < n_heads_log2_floor) { - m_k = pow(m0, k + 1); + m_k = powf(m0, k + 1); } else { - m_k = pow(m1, 2 * (k - n_heads_log2_floor) + 1); + m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); } - //TODO: optimize - dst_data[0] = (j+1) * m_k + src[0]; + + pdst[0] = (j+1) * m_k + src[0]; } } } - } @@ -9401,7 +9401,7 @@ static void ggml_compute_forward_alibi_f16( struct ggml_tensor * dst) { assert(params->ith == 0); assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); + assert(ggml_nelements(src1) == 2); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -9409,12 +9409,11 @@ static void ggml_compute_forward_alibi_f16( const int n_past = ((int32_t *) src1->data)[0]; const int n_head = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 const int ne1 = src0->ne[1]; // seq_len_without_past - const int ne2 = src0->ne[2]; // n_head -> this is k - const int ne3 = src0->ne[3]; // 1 -> bsz + //const int ne2 = src0->ne[2]; // n_head -> this is k + //const int ne3 = src0->ne[3]; // 1 -> bsz const int n = ggml_nrows(src0); const int ne2_ne3 = n/ne1; // ne2*ne3 @@ -9422,36 +9421,38 @@ static void ggml_compute_forward_alibi_f16( const int nb0 = src0->nb[0]; const int nb1 = src0->nb[1]; const int nb2 = src0->nb[2]; - const int nb3 = src0->nb[3]; + //const int nb3 = src0->nb[3]; - assert(nb0 == sizeof(float)); + assert(nb0 == sizeof(ggml_fp16_t)); assert(ne1+n_past == ne0); // add alibi to src0 (KQ_scaled) const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); - const ggml_fp16_t m0 = pow(2.0, -8.0 / n_heads_log2_floor); - const ggml_fp16_t m1 = pow(2.0, -4.0 / n_heads_log2_floor); + + const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor); + const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor); for (int i = 0; i < ne0; i++) { for (int j = 0; j < ne1; j++) { for (int k = 0; k < ne2_ne3; k++) { - ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); - ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); + ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); + float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); // TODO: k*nb2 or k*nb3 - ggml_fp16_t m_k; + float m_k; + if (k < n_heads_log2_floor) { - m_k = pow(m0, k + 1); + m_k = powf(m0, k + 1); } else { - m_k = pow(m1, 2 * (k - n_heads_log2_floor) + 1); + m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); } - //TODO: optimize - dst_data[0] = (j+1) * m_k + src[0]; + + // we return F32 + pdst[0] = (j+1) * m_k + GGML_FP16_TO_FP32(src[0]); } } } - } static void ggml_compute_forward_alibi( @@ -9472,7 +9473,10 @@ static void ggml_compute_forward_alibi( case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_2: case GGML_TYPE_Q4_3: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: