//struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
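+ // the op's integer operands are passed in a small I32 tensor:
+ // b->data[0] = n_past, b->data[1] = n_head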
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
((int32_t *) b->data)[0] = n_past;
((int32_t *) b->data)[1] = n_head;
static void ggml_compute_forward_alibi_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(src1->type == GGML_TYPE_I32);
- assert(ggml_nelements(src1) == 3);
+ assert(ggml_nelements(src1) == 2);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const int n_past = ((int32_t *) src1->data)[0];
const int n_head = ((int32_t *) src1->data)[1];
- const int mode = ((int32_t *) src1->data)[2];
const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
const int ne1 = src0->ne[1]; // seq_len_without_past
- const int ne2 = src0->ne[2]; // n_head -> this is k
- const int ne3 = src0->ne[3]; // 1 -> bsz
+ //const int ne2 = src0->ne[2]; // n_head -> this is k
+ //const int ne3 = src0->ne[3]; // 1 -> bsz
const int n = ggml_nrows(src0);
const int ne2_ne3 = n/ne1; // ne2*ne3 = number of (head, batch) pairs
const int nb0 = src0->nb[0];
const int nb1 = src0->nb[1];
const int nb2 = src0->nb[2];
- const int nb3 = src0->nb[3];
+ //const int nb3 = src0->nb[3];
assert(nb0 == sizeof(float));
assert(ne1+n_past == ne0);
// add alibi to src0 (KQ_scaled)
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
- const float m0 = pow(2.0, -8.0 / n_heads_log2_floor);
- const float m1 = pow(2.0, -4.0 / n_heads_log2_floor);
+
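+ // ALiBi slopes (Press et al., "Train Short, Test Long"): the first
+ // n_heads_log2_floor heads take powers of m0; any remaining heads are
+ // interleaved using powers of m1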
+ const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
+ const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
for (int i = 0; i < ne0; i++) {
for (int j = 0; j < ne1; j++) {
for (int k = 0; k < ne2_ne3; k++) {
float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
- float * dst_data = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
+ float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
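+ // dst is a view of src0, so src0's strides are valid for dst->data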
// TODO: k*nb2 or k*nb3
float m_k;
+
if (k < n_heads_log2_floor) {
- m_k = pow(m0, k + 1);
+ m_k = powf(m0, k + 1);
} else {
- m_k = pow(m1, 2 * (k - n_heads_log2_floor) + 1);
+ m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
}
- //TODO: optimize
- dst_data[0] = (j+1) * m_k + src[0];
+
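+ // each element of row j gets a per-head bias of (j+1) * m_k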
+ pdst[0] = (j+1) * m_k + src[0];
}
}
}
-
}
static void ggml_compute_forward_alibi_f16(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(src1->type == GGML_TYPE_I32);
- assert(ggml_nelements(src1) == 3);
+ assert(ggml_nelements(src1) == 2);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const int n_past = ((int32_t *) src1->data)[0];
const int n_head = ((int32_t *) src1->data)[1];
- const int mode = ((int32_t *) src1->data)[2];
const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
const int ne1 = src0->ne[1]; // seq_len_without_past
- const int ne2 = src0->ne[2]; // n_head -> this is k
- const int ne3 = src0->ne[3]; // 1 -> bsz
+ //const int ne2 = src0->ne[2]; // n_head -> this is k
+ //const int ne3 = src0->ne[3]; // 1 -> bsz
const int n = ggml_nrows(src0);
const int ne2_ne3 = n/ne1; // ne2*ne3 = number of (head, batch) pairs
const int nb0 = src0->nb[0];
const int nb1 = src0->nb[1];
const int nb2 = src0->nb[2];
- const int nb3 = src0->nb[3];
+ //const int nb3 = src0->nb[3];
- assert(nb0 == sizeof(float));
+ assert(nb0 == sizeof(ggml_fp16_t));
assert(ne1+n_past == ne0);
// add alibi to src0 (KQ_scaled)
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
- const ggml_fp16_t m0 = pow(2.0, -8.0 / n_heads_log2_floor);
- const ggml_fp16_t m1 = pow(2.0, -4.0 / n_heads_log2_floor);
+
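+ // same slope scheme as the F32 path; the slopes stay in F32 precision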
+ const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
+ const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
for (int i = 0; i < ne0; i++) {
for (int j = 0; j < ne1; j++) {
for (int k = 0; k < ne2_ne3; k++) {
- ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
- ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
+ ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
+ float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
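+ // src is read as F16, the result is written out as F32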
// TODO: k*nb2 or k*nb3
- ggml_fp16_t m_k;
+ float m_k;
+
if (k < n_heads_log2_floor) {
- m_k = pow(m0, k + 1);
+ m_k = powf(m0, k + 1);
} else {
- m_k = pow(m1, 2 * (k - n_heads_log2_floor) + 1);
+ m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
}
- //TODO: optimize
- dst_data[0] = (j+1) * m_k + src[0];
+
+ // we return F32
+ pdst[0] = (j+1) * m_k + GGML_FP16_TO_FP32(src[0]);
}
}
}
-
}
static void ggml_compute_forward_alibi(
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_2:
case GGML_TYPE_Q4_3:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
+ case GGML_TYPE_Q8_1:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
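{
// remaining types are not supported by alibi
GGML_ASSERT(false);
} break;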