const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
- ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte
+ ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // wte
- ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g
- //ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b
+ ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // lmh_g
+ //ctx_size += ggml_row_size(GGML_TYPE_F32, n_vocab); // lmh_b
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
- ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
- ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
+ ctx_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
+ ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd*n_embd)); // c_attn_proj_b
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
- ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
+ ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
- ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
+ ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_mlp_proj_b
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
+ ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
+ ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
ctx_size += (6 + 16*n_layer)*512; // object overhead
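The pattern above recurs throughout these examples: the old per-element estimate ne*ggml_type_sizef(type) becomes ggml_row_size(type, ne), which returns the exact byte count as a size_t instead of a rounded float product. A minimal sketch of the accounting under that assumption, using only the public ggml.h API and placeholder hyperparameters that are not taken from this diff:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        const int64_t n_embd  = 4096;   // placeholder hparams, for illustration only
        const int64_t n_vocab = 50257;
        const enum ggml_type wtype = GGML_TYPE_Q4_0;

        size_t ctx_size = 0;
        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd);  // ln_f_g: exact F32 row size
        ctx_size += ggml_row_size(wtype, n_embd*n_vocab);  // wte: exact even for block-quantized types

        printf("estimated weight bytes: %zu\n", ctx_size);
        return 0;
    }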
struct ggml_tensor * KQ_scaled =
ggml_scale_inplace(ctx0,
KQ,
- ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
- );
+ 1.0f/sqrt(float(n_embd)/n_head));
// KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
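The second recurring change is that ggml_scale() and ggml_scale_inplace() now take the scale factor as a plain float instead of a 1-element tensor built with ggml_new_f32(). A small self-contained sketch of the new call, assuming the updated ggml.h; the tensor shape and hyperparameters are illustrative only:

    #include <math.h>
    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx0 = ggml_init(params);

        const int n_embd = 64, n_head = 8;

        struct ggml_tensor * KQ = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 4);
        ggml_set_f32(KQ, 1.0f);

        // before: ggml_scale_inplace(ctx0, KQ, ggml_new_f32(ctx0, scale));
        // after : the scale is passed directly as a float
        struct ggml_tensor * KQ_scaled =
            ggml_scale_inplace(ctx0, KQ, 1.0f/sqrtf((float) n_embd/n_head));

        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
        ggml_build_forward_expand(gf, KQ_scaled);
        ggml_graph_compute_with_ctx(ctx0, gf, 1);

        printf("KQ_scaled[0] = %f\n", ggml_get_f32_1d(KQ_scaled, 0));

        ggml_free(ctx0);
        return 0;
    }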
const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
- ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
- ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
- ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
+ ctx_size += ggml_row_size(wtype, n_vocab*n_embd); // wte
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_ctx*n_embd); // wpe
+ ctx_size += ggml_row_size(wtype, n_vocab*n_embd); // lm_head
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
- ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
- ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
+ ctx_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
- ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
+ ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_attn_proj_b
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
- ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
+ ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
- ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
+ ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_proj_b
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
+ ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
+ ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
ctx_size += (6 + 12*n_layer)*512; // object overhead
}
}
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- ggml_allocr_alloc(allocr, KQ_scale);
- if (!ggml_allocr_is_measure(allocr)) {
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
- }
-
// wte + wpe
struct ggml_tensor * inpL =
ggml_add(ctx0,
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
KQ,
- KQ_scale);
+ 1.0f/sqrtf(float(n_embd)/n_head));
// KQ_masked = mask_past(KQ_scaled)
// [n_past + N, N, 12]
}
}
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- ggml_allocr_alloc(allocr, KQ_scale);
- if (!ggml_allocr_is_measure(allocr)) {
- float s = 1.0f/sqrtf(float(n_embd)/n_head);
- ggml_backend_tensor_set(KQ_scale, &s, 0, sizeof(s));
- }
-
// wte + wpe
struct ggml_tensor * inpL =
ggml_add(ctx0,
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
KQ,
- KQ_scale);
+ 1.0f/sqrtf(float(n_embd)/n_head));
// KQ_masked = mask_past(KQ_scaled)
// [n_past + N, N, 12]
const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;
- buffer_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
- buffer_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
+ buffer_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
+ buffer_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
- buffer_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
- buffer_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
- buffer_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
+ buffer_size += ggml_row_size(wtype, n_vocab*n_embd); // wte
+ buffer_size += ggml_row_size(GGML_TYPE_F32, n_ctx*n_embd); // wpe
+ buffer_size += ggml_row_size(wtype, n_vocab*n_embd); // lm_head
- buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
- buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
+ buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
+ buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
- buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
- buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
+ buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
+ buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
- buffer_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
- buffer_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
+ buffer_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w
+ buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b
- buffer_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
- buffer_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
+ buffer_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
+ buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_attn_proj_b
- buffer_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
- buffer_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
+ buffer_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w
+ buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b
- buffer_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
- buffer_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
+ buffer_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w
+ buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_proj_b
buffer_size += (6 + 12*n_layer)*128; // alignment overhead
}
}
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- ggml_allocr_alloc(allocr, KQ_scale);
- if (!ggml_allocr_is_measure(allocr)) {
- float s = 1.0f/sqrtf(float(n_embd)/n_head);
- ggml_backend_tensor_set(KQ_scale, &s, 0, sizeof(s));
- }
-
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
ggml_set_name(KQ_mask, "KQ_mask");
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
KQ,
- KQ_scale);
+ 1.0f/sqrtf(float(n_embd)/n_head));
// KQ_masked = mask_past(KQ_scaled)
// [n_kv, n_tokens, 12]
const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
- ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
- ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
- ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
+ ctx_size += ggml_row_size(wtype, n_vocab*n_embd); // wte
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_ctx*n_embd); // wpe
+ ctx_size += ggml_row_size(wtype, n_vocab*n_embd); // lm_head
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
- ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
- ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
+ ctx_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
- ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
+ ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_attn_proj_b
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
- ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
+ ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
- ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
+ ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_proj_b
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
+ ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
+ ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
ctx_size += (6 + 12*n_layer)*512; // object overhead
// KQ_scaled = KQ / sqrt(n_embd/n_head)
// [n_past + N, N, 12]
- struct ggml_tensor * KQ_scaled =
- ggml_scale_inplace(ctx0,
- KQ,
- ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
- );
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head));
// KQ_masked = mask_past(KQ_scaled)
// [n_past + N, N, 12]
// inputs/constants
struct ggml_tensor * embd;
struct ggml_tensor * position;
- struct ggml_tensor * KQ_scale;
};
void init_backends(gpt2_model & model, const gpt_params & params) {
{
model.embd = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model.hparams.n_ctx);
model.position = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model.hparams.n_ctx);
- model.KQ_scale = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); // FIXME: should be in backend_kv, but also shouldn't matter
ggml_set_name(model.embd, "in/embd");
ggml_set_name(model.position, "in/position");
- ggml_set_name(model.KQ_scale, "KQ_scale");
// add input tensors to cpu backend
- size_t input_size = ggml_nbytes(model.embd) + ggml_nbytes(model.position) + ggml_nbytes(model.KQ_scale);
+ size_t input_size = ggml_nbytes(model.embd) + ggml_nbytes(model.position);
// FIXME: use cpu backend after sched impl
ggml_backend_t backend_input = params.n_gpu_layers >= model.hparams.n_layer ? backend_gpu : backend_cpu;
ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_input);
ggml_allocr_alloc(alloc, model.embd);
ggml_allocr_alloc(alloc, model.position);
- ggml_allocr_alloc(alloc, model.KQ_scale);
ggml_allocr_free(alloc);
-
- // initialize KQ_scale
- float s = 1.0f/sqrtf(float(model.hparams.n_embd)/model.hparams.n_head);
- ggml_backend_tensor_set(model.KQ_scale, &s, 0, sizeof(s));
}
return true;
}
//}
- struct ggml_tensor * KQ_scale = model.KQ_scale;
+ const float KQ_scale = 1.0f/sqrtf(float(model.hparams.n_embd)/model.hparams.n_head);
// wte + wpe
struct ggml_tensor * inpL =
// KQ_scaled = KQ / sqrt(n_embd/n_head)
// [n_past + N, N, 12]
- struct ggml_tensor * KQ_scaled =
- ggml_scale(ctx0,
- KQ,
- KQ_scale);
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
ggml_format_name(KQ_scaled, "l%d.KQ_scaled", il);
// KQ_masked = mask_past(KQ_scaled)
const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
- ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte
+ ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // wte
- ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g
- ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b
+ ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // lmh_g
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_vocab); // lmh_b
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_q_proj_w
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_k_proj_w
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_v_proj_w
+ ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_q_proj_w
+ ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_k_proj_w
+ ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_v_proj_w
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
+ ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
- ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
+ ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
- ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
+ ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_mlp_proj_b
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v
+ ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F16, n_embd); // memory_k
+ ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F16, n_embd); // memory_v
ctx_size += (5 + 10*n_layer)*512; // object overhead
struct ggml_tensor * KQ_scaled =
ggml_scale_inplace(ctx0,
KQ,
- ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
- );
+ 1.0f/sqrt(float(n_embd)/n_head));
// KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
const size_t n_ctx = hparams.n_ctx;
const size_t n_vocab = hparams.n_vocab;
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
- ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte
+ ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // wte
- ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g
- //ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b
+ ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // lmh_g
+ //ctx_size += ggml_row_size(GGML_TYPE_F32, n_vocab); // lmh_b
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
- ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
- ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
+ ctx_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
+ ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd*n_embd)); // c_attn_proj_b
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
- ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
+ ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
- ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
+ ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_mlp_proj_b
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
+ ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
+ ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
ctx_size += (6 + 16*n_layer)*1024; // object overhead
struct ggml_tensor * KQ_scaled =
ggml_scale_inplace(ctx0,
KQ,
- ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
- );
+ 1.0f/sqrt(float(n_embd)/n_head));
// KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
const int n_hidden = hparams.n_hidden;
const int n_classes = hparams.n_classes;
- ctx_size += n_input * n_hidden * ggml_type_sizef(GGML_TYPE_F32); // fc1 weight
- ctx_size += n_hidden * ggml_type_sizef(GGML_TYPE_F32); // fc1 bias
+ ctx_size += n_input * n_hidden * ggml_type_size(GGML_TYPE_F32); // fc1 weight
+ ctx_size += n_hidden * ggml_type_size(GGML_TYPE_F32); // fc1 bias
- ctx_size += n_hidden * n_classes * ggml_type_sizef(GGML_TYPE_F32); // fc2 weight
- ctx_size += n_classes * ggml_type_sizef(GGML_TYPE_F32); // fc2 bias
+ ctx_size += n_hidden * n_classes * ggml_type_size(GGML_TYPE_F32); // fc2 weight
+ ctx_size += n_classes * ggml_type_size(GGML_TYPE_F32); // fc2 bias
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
const size_t n_layer = hparams.n_layers;
const size_t n_vocab = hparams.n_vocab;
- ctx_size += n_embd * n_vocab * ggml_type_sizef(wtype); // wte_weight
- ctx_size += n_embd * ggml_type_sizef(GGML_TYPE_F32); // norm_f_weight
+ ctx_size += ggml_row_size(wtype, n_embd * n_vocab); // wte_weight
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // norm_f_weight
- ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ln_1_weight
- ctx_size += n_layer * (3 * n_embd * n_embd * ggml_type_sizef(wtype)); // attn_Wqkv_weight
- ctx_size += n_layer * (n_embd * n_embd * ggml_type_sizef(wtype)); // attn_out_proj_weight
- ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ln_2_weight
- ctx_size += n_layer * (4 * n_embd * n_embd * ggml_type_sizef(wtype)); // mlp_mlp_up_weight
- ctx_size += n_layer * (n_embd * n_embd * 4 * ggml_type_sizef(wtype)); // mlp_mlp_down_weight
+ ctx_size += n_layer * (ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_weight
- ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_k
- ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_v
+ ctx_size += n_layer * (ggml_row_size(wtype, 3 * n_embd * n_embd)); // attn_Wqkv_weight
+ ctx_size += n_layer * (ggml_row_size(wtype, n_embd * n_embd)); // attn_out_proj_weight
+
+ ctx_size += n_layer * (ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_weight
+
+ ctx_size += n_layer * (ggml_row_size(wtype, 4 * n_embd * n_embd)); // mlp_mlp_up_weight
+ ctx_size += n_layer * (ggml_row_size(wtype, 4 * n_embd * n_embd)); // mlp_mlp_down_weight
+
+ ctx_size += n_ctx * n_layer * ggml_row_size(GGML_TYPE_F16, n_embd); // memory_k
+ ctx_size += n_ctx * n_layer * ggml_row_size(GGML_TYPE_F16, n_embd); // memory_v
ctx_size += (1 + 6 * n_layer) * 512; // object overhead
// KQ_scaled = KQ / sqrt(n_embd/n_head)
struct ggml_tensor * KQ_scaled =
- ggml_scale(ctx0, KQ, ggml_new_f32(ctx0, 1.0f / sqrt(float(n_embd) / n_head)));
+ ggml_scale(ctx0, KQ, 1.0f / sqrt(float(n_embd) / n_head));
struct ggml_tensor * KQ_scaled_alibi =
ggml_alibi(ctx0, KQ_scaled, n_past, n_head, model.hparams.alibi_bias_max);
const int n_ctx = hparams.max_seq_len;
const int n_vocab = hparams.n_vocab;
- ctx_size += n_embd * n_vocab * ggml_type_sizef(wtype); // wte_weight
- ctx_size += n_embd * ggml_type_sizef(GGML_TYPE_F32); // ln_f_weight
+ ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // wte_weight
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_weight
- ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ln_1_weight
- ctx_size += n_layer * (3 * n_embd * n_embd * ggml_type_sizef(wtype)); // attn_Wqkv_weight
- ctx_size += n_layer * (n_embd * n_embd * ggml_type_sizef(wtype)); // attn_out_proj_weight
- ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ln_2_weight
- ctx_size += n_layer * (4 * n_embd * n_embd * ggml_type_sizef(wtype)); // mlp_mlp_up_weight
- ctx_size += n_layer * (n_embd * n_embd * 4 * ggml_type_sizef(wtype)); // mlp_mlp_down_weight
+ ctx_size += n_layer * (ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_weight
- ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_k
- ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_v
+ ctx_size += n_layer * (ggml_row_size(wtype, 3 * n_embd * n_embd)); // attn_Wqkv_weight
+ ctx_size += n_layer * (ggml_row_size(wtype, n_embd * n_embd)); // attn_out_proj_weight
+
+ ctx_size += n_layer * (ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_weight
+
+ ctx_size += n_layer * (ggml_row_size(wtype, 4 * n_embd * n_embd)); // mlp_mlp_up_weight
+ ctx_size += n_layer * (ggml_row_size(wtype, 4 * n_embd * n_embd)); // mlp_mlp_down_weight
+
+ ctx_size += n_ctx * n_layer * ggml_row_size(GGML_TYPE_F16, n_embd); // memory_k
+ ctx_size += n_ctx * n_layer * ggml_row_size(GGML_TYPE_F16, n_embd); // memory_v
ctx_size += (1 + 6 * n_layer) * 512; // object overhead
// KQ_scaled = KQ / sqrt(n_embd/n_head)
struct ggml_tensor * KQ_scaled =
- ggml_scale(ctx0, KQ, ggml_new_f32(ctx0, 1.0f / sqrt(float(n_embd) / n_head)));
+ ggml_scale(ctx0, KQ, 1.0f / sqrt(float(n_embd) / n_head));
struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8.0f);
// image encoder
{
- ctx_size += n_enc_state*n_img_embd*n_img_embd*ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += n_enc_state*n_img_embd*n_img_embd*ggml_type_size(GGML_TYPE_F32);
- ctx_size += n_enc_state*3*n_patch_size*n_patch_size*ggml_type_sizef(GGML_TYPE_F16);
- ctx_size += n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += n_enc_state*3*n_patch_size*n_patch_size*ggml_type_size(GGML_TYPE_F16);
+ ctx_size += n_enc_state*ggml_type_size(GGML_TYPE_F32);
- ctx_size += n_enc_state*n_enc_out_chans*1*1*ggml_type_sizef(GGML_TYPE_F16);
- ctx_size += n_enc_out_chans*n_enc_out_chans*3*3*ggml_type_sizef(GGML_TYPE_F16);
+ ctx_size += n_enc_state*n_enc_out_chans*1*1*ggml_type_size(GGML_TYPE_F16);
+ ctx_size += n_enc_out_chans*n_enc_out_chans*3*3*ggml_type_size(GGML_TYPE_F16);
- ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
- ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += n_enc_out_chans*ggml_type_size(GGML_TYPE_F32);
+ ctx_size += n_enc_out_chans*ggml_type_size(GGML_TYPE_F32);
- ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
- ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += n_enc_out_chans*ggml_type_size(GGML_TYPE_F32);
+ ctx_size += n_enc_out_chans*ggml_type_size(GGML_TYPE_F32);
}
// image encoder layers
{
- ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
- ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += n_enc_layer*n_enc_state*ggml_type_size(GGML_TYPE_F32);
+ ctx_size += n_enc_layer*n_enc_state*ggml_type_size(GGML_TYPE_F32);
- ctx_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_sizef(GGML_TYPE_F16);
- ctx_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_sizef(GGML_TYPE_F16);
+ ctx_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_size(GGML_TYPE_F16);
+ ctx_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_size(GGML_TYPE_F16);
- ctx_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_sizef(GGML_TYPE_F16);
- ctx_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_sizef(GGML_TYPE_F16);
+ ctx_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_size(GGML_TYPE_F16);
+ ctx_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_size(GGML_TYPE_F16);
- ctx_size += n_enc_layer*3*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
- ctx_size += n_enc_layer*3*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += n_enc_layer*3*n_enc_state*n_enc_state*ggml_type_size(GGML_TYPE_F16);
+ ctx_size += n_enc_layer*3*n_enc_state* ggml_type_size(GGML_TYPE_F32);
- ctx_size += n_enc_layer*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
- ctx_size += n_enc_layer*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += n_enc_layer*n_enc_state*n_enc_state*ggml_type_size(GGML_TYPE_F16);
+ ctx_size += n_enc_layer*n_enc_state* ggml_type_size(GGML_TYPE_F32);
- ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
- ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += n_enc_layer*n_enc_state*ggml_type_size(GGML_TYPE_F32);
+ ctx_size += n_enc_layer*n_enc_state*ggml_type_size(GGML_TYPE_F32);
- ctx_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
- ctx_size += n_enc_layer*4*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_size(GGML_TYPE_F16);
+ ctx_size += n_enc_layer*4*n_enc_state* ggml_type_size(GGML_TYPE_F32);
- ctx_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
- ctx_size += n_enc_layer*4*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_size(GGML_TYPE_F16);
+ ctx_size += n_enc_layer*4*n_enc_state* ggml_type_size(GGML_TYPE_F32);
}
ctx_size += (8 + 14*n_enc_layer)*ggml_tensor_overhead();
// prompt encoder
{
- ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16); // 2*(n_enc_out_chans/2)
+ ctx_size += n_enc_out_chans*ggml_type_size(GGML_TYPE_F16); // 2*(n_enc_out_chans/2)
- ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
- ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += n_enc_out_chans*ggml_type_size(GGML_TYPE_F32);
+ ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_size(GGML_TYPE_F32);
}
ctx_size += (2 + n_pt_embd)*ggml_tensor_overhead();
const int n_hypernet_mpls_count = 4;
// self_attn
- ctx_size += tfm_layers_count*qkv_count*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
- ctx_size += tfm_layers_count*qkv_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
- ctx_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += tfm_layers_count*qkv_count*n_enc_state*n_enc_state*ggml_type_size(GGML_TYPE_F16);
+ ctx_size += tfm_layers_count*qkv_count*n_enc_state* ggml_type_size(GGML_TYPE_F32);
+ ctx_size += tfm_layers_count*n_enc_state* ggml_type_size(GGML_TYPE_F32);
// all norms
- ctx_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
- ctx_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_size(GGML_TYPE_F32);
+ ctx_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_size(GGML_TYPE_F32);
// cross_attn_token_to_img
- ctx_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
- ctx_size += tfm_layers_count*qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32);
- ctx_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_size(GGML_TYPE_F16);
+ ctx_size += tfm_layers_count*qkv_count*(n_enc_state/2)* ggml_type_size(GGML_TYPE_F32);
+ ctx_size += tfm_layers_count*n_enc_state* ggml_type_size(GGML_TYPE_F32);
// mlp
- ctx_size += tfm_layers_count*8*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
- ctx_size += tfm_layers_count*8*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
- ctx_size += tfm_layers_count*n_enc_out_chans*8*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
- ctx_size += tfm_layers_count*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += tfm_layers_count*8*n_enc_out_chans*n_enc_out_chans*ggml_type_size(GGML_TYPE_F16);
+ ctx_size += tfm_layers_count*8*n_enc_out_chans* ggml_type_size(GGML_TYPE_F32);
+ ctx_size += tfm_layers_count*n_enc_out_chans*8*n_enc_out_chans*ggml_type_size(GGML_TYPE_F16);
+ ctx_size += tfm_layers_count*n_enc_out_chans* ggml_type_size(GGML_TYPE_F32);
// cross_attn_img_to_token
- ctx_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
- ctx_size += tfm_layers_count*qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32);
- ctx_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_size(GGML_TYPE_F16);
+ ctx_size += tfm_layers_count*qkv_count*(n_enc_state/2)* ggml_type_size(GGML_TYPE_F32);
+ ctx_size += tfm_layers_count*n_enc_state* ggml_type_size(GGML_TYPE_F32);
// transformer_final_attn_token_to_img
- ctx_size += qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
- ctx_size += qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32);
- ctx_size += n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_size(GGML_TYPE_F16);
+ ctx_size += qkv_count*(n_enc_state/2)* ggml_type_size(GGML_TYPE_F32);
+ ctx_size += n_enc_state* ggml_type_size(GGML_TYPE_F32);
// transformer_norm_final
- ctx_size += norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
- ctx_size += norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += norm_count*n_enc_state*ggml_type_size(GGML_TYPE_F32);
+ ctx_size += norm_count*n_enc_state*ggml_type_size(GGML_TYPE_F32);
// output_upscaling
- ctx_size += n_enc_out_chans*n_img_embd*2*2*ggml_type_sizef(GGML_TYPE_F16);
- ctx_size += 3*n_img_embd* ggml_type_sizef(GGML_TYPE_F32);
- ctx_size += n_enc_out_chans*n_img_embd*(n_img_embd/2)*2*2*ggml_type_sizef(GGML_TYPE_F16);
- ctx_size += (n_img_embd/2)* ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += n_enc_out_chans*n_img_embd*2*2*ggml_type_size(GGML_TYPE_F16);
+ ctx_size += 3*n_img_embd* ggml_type_size(GGML_TYPE_F32);
+ ctx_size += n_enc_out_chans*n_img_embd*(n_img_embd/2)*2*2*ggml_type_size(GGML_TYPE_F16);
+ ctx_size += (n_img_embd/2)* ggml_type_size(GGML_TYPE_F32);
// output_hypernetworks_mlps
- ctx_size += n_hypernet_mpls_count*2*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
- ctx_size += n_hypernet_mpls_count*2*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
- ctx_size += n_hypernet_mpls_count*n_enc_out_chans*(n_img_embd/2)*ggml_type_sizef(GGML_TYPE_F16);
- ctx_size += n_hypernet_mpls_count*(n_img_embd/2)* ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += n_hypernet_mpls_count*2*n_enc_out_chans*n_enc_out_chans*ggml_type_size(GGML_TYPE_F16);
+ ctx_size += n_hypernet_mpls_count*2*n_enc_out_chans* ggml_type_size(GGML_TYPE_F32);
+ ctx_size += n_hypernet_mpls_count*n_enc_out_chans*(n_img_embd/2)*ggml_type_size(GGML_TYPE_F16);
+ ctx_size += n_hypernet_mpls_count*(n_img_embd/2)* ggml_type_size(GGML_TYPE_F32);
// iou_prediction_head
- ctx_size += 2*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
- ctx_size += 2*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
- ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
- ctx_size += n_pt_embd* ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += 2*n_enc_out_chans*n_enc_out_chans*ggml_type_size(GGML_TYPE_F16);
+ ctx_size += 2*n_enc_out_chans* ggml_type_size(GGML_TYPE_F32);
+ ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_size(GGML_TYPE_F16);
+ ctx_size += n_pt_embd* ggml_type_size(GGML_TYPE_F32);
// iou_token_w
- ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += n_enc_out_chans*ggml_type_size(GGML_TYPE_F32);
// mask_tokens_w
- ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+ ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_size(GGML_TYPE_F32);
}
}
fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
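In contrast to the model-weight code above, the SAM and MNIST hunks keep a plain ggml_type_size() product: for F32/F16 the block size is 1, so the per-element size is already exact and agrees with ggml_row_size(). A tiny check of that assumption:

    #include <assert.h>
    #include "ggml.h"

    int main(void) {
        const int64_t ne = 1024;
        // non-quantized types have a block size of 1 element
        assert(ggml_blck_size(GGML_TYPE_F16) == 1);
        // so ne*ggml_type_size(type) and ggml_row_size(type, ne) give the same byte count
        assert(ne*ggml_type_size(GGML_TYPE_F16) == ggml_row_size(GGML_TYPE_F16, ne));
        return 0;
    }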
struct ggml_tensor * cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, enc.pe)), xy_embed_stacked);
- cur = ggml_scale(ctx0, cur, ggml_new_f32(ctx0, float(2.0*M_PI)));
+ cur = ggml_scale(ctx0, cur, float(2.0*M_PI));
// concat
// ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L192
struct ggml_tensor * KQ_scaled =
ggml_scale_inplace(ctx0,
KQ,
- ggml_new_f32(ctx0, 1.0f/sqrtf(n_enc_head_dim))
- );
+ 1.0f/sqrtf(n_enc_head_dim));
struct ggml_tensor * rw = ggml_get_rel_pos(ctx0, layer.rel_pos_w, W, W);
struct ggml_tensor * rh = ggml_get_rel_pos(ctx0, layer.rel_pos_h, H, H);
struct ggml_tensor * cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, enc.pe)), inp);
- cur = ggml_scale(ctx0, cur, ggml_new_f32(ctx0, float(2.0*M_PI)));
+ cur = ggml_scale(ctx0, cur, float(2.0*M_PI));
// concat
// ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L192
// Q * K
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
- struct ggml_tensor * KQ_scaled =
- ggml_scale_inplace(ctx0,
- KQ,
- ggml_new_f32(ctx0, 1.0f/sqrt(float(Q->ne[0]))));
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, 1.0f/sqrt(float(Q->ne[0])));
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);
const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head
const int kv_dim = kv_heads * head_dim;
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
- ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
- ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
- ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
+ ctx_size += n_vocab*ggml_row_size(wtype, n_embd); // wte
+ ctx_size += n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
+ ctx_size += n_vocab*ggml_row_size(wtype, n_embd); // lm_head
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
- ctx_size += n_layer*((n_embd + 2*kv_dim)*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w // TODO:
- ctx_size += n_layer*( (n_embd + 2*kv_dim)*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
+ ctx_size += n_layer*((n_embd + 2*kv_dim)*ggml_row_size(wtype, n_embd)); // c_attn_attn_w // TODO:
+ ctx_size += n_layer*((n_embd + 2*kv_dim)*ggml_row_size(GGML_TYPE_F32, 1)); // c_attn_attn_b
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
- ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
+ ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_attn_proj_b
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
- ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
+ ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
- ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
+ ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_mlp_proj_b
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
+ ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
+ ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
ctx_size += (6 + 12*n_layer)*512; // object overhead
// KQ_scaled = KQ / sqrt(n_embd/n_head)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_scaled =
- ggml_scale_inplace(ctx0,
- KQ,
- ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
- );
+ ggml_scale_inplace(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head));
// KQ_masked = mask_past(KQ_scaled)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_scaled =
ggml_scale_inplace(ctx0,
KQ,
- ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
- );
+ 1.0f/sqrt(float(n_embd)/n_head));
// KQ_masked = mask_past(KQ_scaled)
// [n_past + N, N, 12]
// measure the memory usage of a graph and prepare the allocr's internal data buffer
static void whisper_allocr_graph_init(struct whisper_allocr & allocr, ggml_backend_t backend, std::function<struct ggml_cgraph *()> && get_graph) {
- auto & alloc = allocr.alloc;
- auto & meta = allocr.meta;
+ auto & alloc = allocr.alloc;
+ auto & meta = allocr.meta;
alloc = ggml_allocr_new_measure_from_backend(backend);
ggml_cgraph * gf = ggml_new_graph_custom(ctx0, WHISPER_MAX_NODES, false);
- ggml_allocr * alloc = wstate.alloc_encode.alloc;
+ //ggml_allocr * alloc = wstate.alloc_encode.alloc;
//struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_ctx, n_state);
//ggml_allocr_alloc(alloc, cur);
//}
struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_conv);
- struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- ggml_allocr_alloc(alloc, KQscale);
-
- if (!ggml_allocr_is_measure(alloc)) {
- const float val = 1.0f/sqrtf(float(n_state)/n_head);
- ggml_backend_tensor_set(KQscale, &val, 0, sizeof(float));
- }
+ const float KQscale = 1.0f/sqrtf(float(n_state)/n_head);
// ===================================================================
// NOTE: experimenting with partial evaluation of the encoder (ignore)
Qcur = ggml_add(ctx0, Qcur, layer.attn_q_b);
- //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+ //Qcur = ggml_scale(ctx0, Qcur, pow(float(n_state)/n_head, -0.25));
// note: no bias for Key
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
layer.attn_k_w,
cur);
- //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+ //Kcur = ggml_scale(ctx0, Kcur, pow(float(n_state)/n_head, -0.25));
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
layer.attn_v_w,
ggml_cgraph * gf = ggml_new_graph(ctx0);
- ggml_allocr * alloc = wstate.alloc_cross.alloc;
+ //ggml_allocr * alloc = wstate.alloc_cross.alloc;
//struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
//ggml_allocr_alloc(alloc, cur);
//}
struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_enc);
- struct ggml_tensor * Kscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- ggml_allocr_alloc(alloc, Kscale);
-
- if (!ggml_allocr_is_measure(alloc)) {
- const float val = pow(float(n_state) / n_head, -0.25);
- ggml_backend_tensor_set(Kscale, &val, 0, sizeof(float));
- }
+ const float Kscale = pow(float(n_state) / n_head, -0.25);
for (int il = 0; il < model.hparams.n_text_layer; ++il) {
auto & layer = model.layers_decoder[il];
}
}
- struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- ggml_allocr_alloc(alloc, KQscale);
-
- if (!ggml_allocr_is_measure(alloc)) {
- const float val = pow(float(n_state)/n_head, -0.25);
- ggml_backend_tensor_set(KQscale, &val, 0, sizeof(float));
- }
+ const float KQscale = pow(float(n_state)/n_head, -0.25);
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
ggml_allocr_alloc(alloc, KQ_mask);
// multi-thread
- for (uint32_t k = 1; k <= n_threads; k++) {
+ for (int32_t k = 1; k <= n_threads; k++) {
char * src = (char *) malloc(size);
char * dst = (char *) malloc(size);
const int64_t t0 = ggml_time_us();
std::vector<std::thread> threads(k - 1);
- for (uint32_t th = 0; th < k - 1; ++th) {
+ for (int32_t th = 0; th < k - 1; ++th) {
threads[th] = std::thread(helper, th);
}
helper(k - 1);
- for (uint32_t th = 0; th < k - 1; ++th) {
+ for (int32_t th = 0; th < k - 1; ++th) {
threads[th].join();
}
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+ GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
// buffer
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
+ GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
//
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+#ifdef GGML_USE_CPU_HBM
+ GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+#endif
+
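The new buffer-type and buffer queries can be exercised directly against the CPU backend. A minimal sketch, assuming the ggml-backend.h API shown in this diff; ggml_backend_buft_alloc_buffer and ggml_backend_buffer_get_size are pre-existing helpers from the same header:

    #include <stdio.h>
    #include "ggml-backend.h"

    int main(void) {
        ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
        printf("cpu buft is_host: %d\n", ggml_backend_buft_is_host(buft)); // true for host memory

        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 1024);
        ggml_backend_buffer_clear(buf, 0); // new API: fill the whole buffer with a byte value
        printf("buffer is_host: %d, size: %zu\n",
               ggml_backend_buffer_is_host(buf), ggml_backend_buffer_get_size(buf));

        ggml_backend_buffer_free(buf);
        return 0;
    }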
//
// Backend registry
//
#if defined(__ARM_NEON) && defined(__CUDACC__)
typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON)
+#elif defined(__ARM_NEON) && !defined(_MSC_VER)
typedef __fp16 ggml_fp16_t;
#else
typedef uint16_t ggml_fp16_t;
GGML_TYPE_COUNT,
};
+ // precision
+ enum ggml_prec {
+ GGML_PREC_DEFAULT,
+ GGML_PREC_F32,
+ };
+
enum ggml_backend_type {
GGML_BACKEND_CPU = 0,
GGML_BACKEND_GPU = 10,
enum ggml_log_level {
GGML_LOG_LEVEL_ERROR = 2,
GGML_LOG_LEVEL_WARN = 3,
- GGML_LOG_LEVEL_INFO = 4
+ GGML_LOG_LEVEL_INFO = 4,
+ GGML_LOG_LEVEL_DEBUG = 5
};
// ggml object
struct ggml_backend_buffer * buffer;
- int n_dims;
int64_t ne[GGML_MAX_DIMS]; // number of elements
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
// nb[0] = ggml_type_size(type)
void * extra; // extra things e.g. for ggml-cuda.cu
- char padding[12];
+ char padding[8];
};
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
- GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
- GGML_API int ggml_blck_size (enum ggml_type type);
- GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
- GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+ GGML_API int ggml_blck_size(enum ggml_type type);
+ GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
+ GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+
+ GGML_DEPRECATED(
+ GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
+ "use ggml_row_size() instead");
GGML_API const char * ggml_type_name(enum ggml_type type);
GGML_API const char * ggml_op_name (enum ggml_op op);
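For callers migrating off the deprecated ggml_type_sizef(), the relationship between the three size helpers is the relevant part: ggml_row_size(type, ne) is ggml_type_size(type) bytes per block of ggml_blck_size(type) elements, computed in integer arithmetic. A quick sanity check under that assumption, using a block-quantized type:

    #include <assert.h>
    #include "ggml.h"

    int main(void) {
        const int64_t ne = 4096; // must be a multiple of the block size for quantized types
        assert(ne % ggml_blck_size(GGML_TYPE_Q4_K) == 0);

        const size_t row = ggml_row_size(GGML_TYPE_Q4_K, ne);
        const size_t ref = (ne/ggml_blck_size(GGML_TYPE_Q4_K))*ggml_type_size(GGML_TYPE_Q4_K);
        assert(row == ref); // exact, no float rounding as with the old ggml_type_sizef()
        return 0;
    }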
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
+ GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
// Context tensor enumeration and lookup
- GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
- GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
+ GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
+ GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
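ggml_get_first_tensor()/ggml_get_next_tensor() now take a const context, so read-only inspection code no longer needs a cast. A small sketch that walks a context and prints tensor sizes:

    #include <stdio.h>
    #include "ggml.h"

    static void dump_tensors(const struct ggml_context * ctx) {
        for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL;
             t = ggml_get_next_tensor(ctx, t)) {
            printf("%-16s %zu bytes\n", ggml_get_name(t), ggml_nbytes(t));
        }
    }

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        ggml_set_name(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8), "a");
        ggml_set_name(ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 4, 4), "b");

        dump_tensors(ctx);

        ggml_free(ctx);
        return 0;
    }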
struct ggml_tensor * a,
struct ggml_tensor * b);
+ // change the precision of a matrix multiplication
+ // set to GGML_PREC_F32 for higher precision (useful for phi-2)
+ GGML_API void ggml_mul_mat_set_prec(
+ struct ggml_tensor * a,
+ enum ggml_prec prec);
+
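A sketch of how the new precision hint is intended to be used while building a graph; a and b stand for any compatible matrices created in the same context:

    #include "ggml.h"

    // request F32 accumulation for a single matmul node (the header notes this is
    // useful for phi-2); every other node keeps GGML_PREC_DEFAULT
    static struct ggml_tensor * mul_mat_f32_prec(struct ggml_context * ctx,
                                                 struct ggml_tensor * a,
                                                 struct ggml_tensor * b) {
        struct ggml_tensor * cur = ggml_mul_mat(ctx, a, b);
        ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
        return cur;
    }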
// indirect matrix multiplication
// ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
GGML_API struct ggml_tensor * ggml_mul_mat_id(
GGML_API struct ggml_tensor * ggml_scale(
struct ggml_context * ctx,
struct ggml_tensor * a,
- struct ggml_tensor * b);
+ float s);
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_scale_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
- struct ggml_tensor * b);
+ float s);
// b -> view(a,offset,nb1,nb2,3), return modified a
GGML_API struct ggml_tensor * ggml_set(
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
- GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
- GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
- GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
- GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
+ GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
+ GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
+ GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
+ GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
+ GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);
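The new gguf_get_tensor_type() accessor pairs naturally with the existing enumeration functions. A small sketch that lists tensor names and types from a GGUF file; "model.gguf" is a placeholder path, not from this diff:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct gguf_init_params params = {
            /*.no_alloc =*/ true,  // only read metadata, do not allocate tensor data
            /*.ctx      =*/ NULL,
        };
        struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to open model.gguf\n");
            return 1;
        }

        for (int i = 0; i < gguf_get_n_tensors(ctx); ++i) {
            printf("%-40s %s\n",
                   gguf_get_tensor_name(ctx, i),
                   ggml_type_name(gguf_get_tensor_type(ctx, i)));
        }

        gguf_free(ctx);
        return 0;
    }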
// overrides existing values or adds a new one
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
// check if a tensor is allocated by this buffer
static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) {
- return tensor->buffer == alloc->buffer;
+ return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer);
}
static bool ggml_is_view(struct ggml_tensor * t) {
if (update_backend) {
view->backend = view->view_src->backend;
}
- view->buffer = view->view_src->buffer;
+ // views are initialized in the alloc buffer rather than the view_src buffer
+ view->buffer = alloc->buffer;
view->data = (char *)view->view_src->data + view->view_offs;
- // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
- // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
if (!alloc->measure) {
}
void ggml_allocr_free(ggml_allocr_t alloc) {
+ if (alloc == NULL) {
+ return;
+ }
+
ggml_gallocr_free(alloc->galloc);
ggml_tallocr_free(alloc->talloc);
free(alloc);
}
if (nbytes == 0) {
- fprintf(stderr, "%s: no tensors to allocate\n", __func__);
+ // all the tensors in the context are already allocated
return NULL;
}
} else {
ggml_backend_view_init(buffer, t);
}
+ } else {
+ if (t->view_src != NULL) {
+ // view of a pre-allocated tensor
+ ggml_backend_view_init(buffer, t);
+ }
}
}
size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+ // check if tensor data is in host memory
+ // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
+ bool (*is_host) (ggml_backend_buffer_type_t buft);
};
struct ggml_backend_buffer_type {
typedef void * ggml_backend_buffer_context_t;
struct ggml_backend_buffer_i {
- void (*free_buffer)(ggml_backend_buffer_t buffer);
+ void (*free_buffer) (ggml_backend_buffer_t buffer);
//void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
- void * (*get_base) (ggml_backend_buffer_t buffer);
- void (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
- void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
- void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+ void * (*get_base) (ggml_backend_buffer_t buffer);
+ void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// (optional) copy tensor between different buffer-type, allow for single-copy transfers
- void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
- void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+ void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+ void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+ void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
};
struct ggml_backend_buffer {
void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
- void (*synchronize) (ggml_backend_t backend);
+ void (*synchronize)(ggml_backend_t backend);
// compute graph with a plan
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
return buft->iface.supports_backend(buft, backend);
}
+bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
+ if (buft->iface.is_host) {
+ return buft->iface.is_host(buft);
+ }
+ return false;
+}
+
// backend buffer
ggml_backend_buffer_t ggml_backend_buffer_init(
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
}
+void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ buffer->iface.clear(buffer, value);
+}
+
+bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
+ return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
+}
+
ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
return buffer->buft;
}
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context);
- GGML_UNUSED(buffer);
}
static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_UNUSED(buffer);
}
+static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ memset(buffer->context, value, buffer->size);
+}
+
static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
/* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+ /* .clear = */ ggml_backend_cpu_buffer_clear,
};
// for buffers from ptr, free is not called
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
/* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+ /* .clear = */ ggml_backend_cpu_buffer_clear,
};
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
GGML_UNUSED(buft);
}
+static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+ return true;
+
+ GGML_UNUSED(buft);
+}
+
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
- static struct ggml_backend_buffer_type ggml_backend_buffer_type_cpu = {
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
/* .iface = */ {
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
},
/* .context = */ NULL,
};
- return &ggml_backend_buffer_type_cpu;
+ return &ggml_backend_cpu_buffer_type;
}
+#ifdef GGML_USE_CPU_HBM
+
+// buffer type HBM
+
+#include <hbwmalloc.h>
+
+static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ hbw_free(buffer->context);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ //void * ptr = hbw_malloc(size);
+ void * ptr;
+ int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
+ if (result != 0) {
+ fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
+ return NULL;
+ }
+
+ // FIXME: this is a hack to avoid having to implement a new buffer type
+ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+ buffer->buft = buft;
+ buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
+
+ return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
+ /* .iface = */ {
+ /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+ /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+ },
+ /* .context = */ NULL,
+ };
+
+ return &ggml_backend_cpu_buffer_type_hbm;
+}
+#endif
+
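// Illustrative sketch, assuming a build with -DGGML_USE_CPU_HBM on a system that
// provides hbwmalloc: the HBM type plugs into the generic buffer-type API, so
// allocation looks the same as for the regular CPU buffer type.
//
//   ggml_backend_buffer_type_t buft = ggml_backend_cpu_hbm_buffer_type();
//   ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 512u*1024*1024);
//   ggml_backend_buffer_clear(buf, 0);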
struct ggml_backend_cpu_context {
int n_threads;
void * work_data;
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
- cpu_plan->cgraph = *cgraph;
+ cpu_plan->cgraph = *cgraph; // FIXME: deep copy
if (cpu_plan->cplan.work_size > 0) {
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
// utils
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
GGML_ASSERT(tensor->buffer == NULL);
- GGML_ASSERT(tensor->data == NULL);
+ //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set, but still need to be initialized
GGML_ASSERT(tensor->view_src != NULL);
GGML_ASSERT(tensor->view_src->buffer != NULL);
GGML_ASSERT(tensor->view_src->data != NULL);
#define CUDA_R_16F HIPBLAS_R_16F
#define CUDA_R_32F HIPBLAS_R_32F
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasSetStream hipblasSetStream
#define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t
+#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
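+// GGML_HIP_UMA: allocate device memory as HIP managed (unified) memory instead of dedicated VRAM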
+#ifdef GGML_HIP_UMA
+#define cudaMalloc hipMallocManaged
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
+#else
#define cudaMalloc hipMalloc
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#endif
#define cudaMemcpy hipMemcpy
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
+#define __trap abort
#else
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
+// CUDA 10.2 does not have these macro definitions.
+#ifndef CUBLAS_TF32_TENSOR_OP_MATH
+#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
+#define CUBLAS_COMPUTE_16F CUDA_R_16F
+#define CUBLAS_COMPUTE_32F CUDA_R_32F
+#define cublasComputeType_t cudaDataType_t
+#endif
#endif // defined(GGML_USE_HIPBLAS)
#include "ggml-cuda.h"
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
+[[noreturn]]
+static __device__ void bad_arch() {
+ printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
+ __trap();
+
+ (void) bad_arch; // suppress unused function warning
+}
+
static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
// second part effectively subtracts 8 from each quant value
return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
#else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
// scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
#else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
// second part effectively subtracts 16 from each quant value
return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
#else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
#else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return d8_0*d8_1 * sumi;
#else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
// scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
#else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return dm2f.x*sumf_d - dm2f.y*sumf_m;
#else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
#else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return d3 * sumf;
#else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return d3*d8 * sumi;
#else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return dm4f.x*sumf_d - dm4f.y*sumf_m;
#else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return dm4f.x*sumf_d - dm4f.y*sumf_m;
#else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return dm5f.x*sumf_d - dm5f.y*sumf_m;
#else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return dm4f.x*sumf_d - dm4f.y*sumf_m;
#else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return d*sumf;
#else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return d6 * sumf_d;
#else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
return dall * sumf_d - dmin * sumf_m;
#else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif
return d * sumf_d;
#else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q4_0_q8_1_mul_mat;
- assert(false);
+ bad_arch();
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q4_1_q8_1_mul_mat;
- assert(false);
+ bad_arch();
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q5_0_q8_1_mul_mat;
- assert(false);
+ bad_arch();
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q5_1_q8_1_mul_mat;
- assert(false);
+ bad_arch();
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q8_0_q8_1_mul_mat;
- assert(false);
+ bad_arch();
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q2_K_q8_1_mul_mat;
- assert(false);
+ bad_arch();
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q3_K_q8_1_mul_mat;
- assert(false);
+ bad_arch();
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q4_K_q8_1_mul_mat;
- assert(false);
+ bad_arch();
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q5_K_q8_1_mul_mat;
- assert(false);
+ bad_arch();
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q6_K_q8_1_mul_mat;
- assert(false);
+ bad_arch();
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
const int ib = col / n_dims;
const int ic = col % n_dims;
- const int i = row*ncols + ib*n_dims + ic/2;
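+ // with partial RoPE (n_dims < ncols), columns past n_dims are not rotated; copy them through unchanged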
+ if (ib > 0) {
+ const int i = row*ncols + ib*n_dims + ic;
+
+ dst[i + 0] = x[i + 0];
+ dst[i + 1] = x[i + 1];
+
+ return;
+ }
+
+ const int i = row*ncols + ib*n_dims + ic/2;
const int i2 = row/p_delta_rows;
float cur_rot = inv_ndims * ic - ib;
break;
default:
// TODO: k-quants
+ fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
GGML_ASSERT(false);
break;
}
const int compute_capability = g_compute_capabilities[id];
- if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+ if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
half * src0_as_f16 = nullptr;
size_t src0_as = 0;
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
float scale;
- // HACK: support for ggml backend interface
- if (src1->backend == GGML_BACKEND_CPU) {
- scale = ((float *) src1->data)[0];
- } else {
- // TODO: pass pointer to kernel instead of copying to host
- CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
- }
+ memcpy(&scale, dst->op_params, sizeof(float));
scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
CUDA_CHECK(cudaGetLastError());
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
- const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
-
// dd = data device
float * src0_ddf = nullptr;
float * src1_ddf = nullptr;
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
}
- if (use_src1 && !src1_stays_on_host) {
+ if (use_src1) {
if (src1_on_device) {
src1_ddf = (float *) src1_extra->data_device[g_main_device];
} else {
}
#ifdef NDEBUG
+ for (int id = 0; id < g_device_count; ++id) {
+ CUDA_CHECK(ggml_cuda_set_device(id));
+ CUDA_CHECK(cudaDeviceSynchronize());
+ }
+
for (int id = 0; id < g_device_count; ++id) {
CUDA_CHECK(ggml_cuda_set_device(id));
const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];
- ggml_cuda_set_peer_access(ne11);
-
GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
}
static __global__ void k_compute_batched_ptrs(
- const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
+ const half * src0_as_f16, const half * src1_as_f16, char * dst,
const void ** ptrs_src, void ** ptrs_dst,
- int ne12, int ne13,
- int ne23,
- int nb02, int nb03,
- int nb12, int nb13,
- int nb2, int nb3,
- int r2, int r3) {
- int i13 = blockIdx.x * blockDim.x + threadIdx.x;
- int i12 = blockIdx.y * blockDim.y + threadIdx.y;
+ int64_t ne12, int64_t ne13,
+ int64_t ne23,
+ size_t nb02, size_t nb03,
+ size_t nb12, size_t nb13,
+ size_t nbd2, size_t nbd3,
+ int64_t r2, int64_t r3) {
+ int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
+ int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
if (i13 >= ne13 || i12 >= ne12) {
return;
}
- int i03 = i13 / r3;
- int i02 = i12 / r2;
+ int64_t i03 = i13 / r3;
+ int64_t i02 = i12 / r2;
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
- ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
+ ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
}
static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
size_t dst_as = 0;
- half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+
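+ // with GGML_PREC_DEFAULT the batched GEMM accumulates in FP16 into a temporary half buffer that is
+ // converted back to FP32 at the end; otherwise it accumulates in FP32 directly into dst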
+ half * dst_f16 = nullptr;
+ char * dst_t = nullptr;
+
+ cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
+ cudaDataType_t cu_data_type = CUDA_R_16F;
+
+ // dst strides
+ size_t nbd2 = dst->nb[2];
+ size_t nbd3 = dst->nb[3];
+
+ const half alpha_f16 = 1.0f;
+ const half beta_f16 = 0.0f;
+
+ const float alpha_f32 = 1.0f;
+ const float beta_f32 = 0.0f;
+
+ const void * alpha = &alpha_f16;
+ const void * beta = &beta_f16;
+
+ if (dst->op_params[0] == GGML_PREC_DEFAULT) {
+ dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+ dst_t = (char *) dst_f16;
+
+ nbd2 /= sizeof(float) / sizeof(half);
+ nbd3 /= sizeof(float) / sizeof(half);
+ } else {
+ dst_t = (char *) dst_ddf;
+
+ cu_compute_type = CUBLAS_COMPUTE_32F;
+ cu_data_type = CUDA_R_32F;
+
+ alpha = &alpha_f32;
+ beta = &beta_f32;
+ }
GGML_ASSERT(ne12 % ne02 == 0);
GGML_ASSERT(ne13 % ne03 == 0);
const int64_t r2 = ne12/ne02;
const int64_t r3 = ne13/ne03;
- const half alpha_f16 = 1.0f;
- const half beta_f16 = 0.0f;
-
#if 0
// use cublasGemmEx
{
int i02 = i12 / r2;
CUBLAS_CHECK(
- cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+ cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
- &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
- (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
- &beta_f16, ( char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
- CUBLAS_COMPUTE_16F,
+ alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
+ (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
+ beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
+ cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}
}
CUBLAS_CHECK(
cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
- &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
- (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
- &beta_f16, ( char *) dst_f16, CUDA_R_16F, ne01, dst->nb[2]/sizeof(float), // strideC
+ alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
+ (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
+ beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC
ne12*ne13,
- CUBLAS_COMPUTE_16F,
+ cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else {
// use cublasGemmBatchedEx
dim3 block_dims(ne13, ne12);
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
- src0_as_f16, src1_as_f16, dst_f16,
+ src0_as_f16, src1_as_f16, dst_t,
ptrs_src, ptrs_dst,
ne12, ne13,
ne23,
nb02, nb03,
nb12, nb13,
- dst->nb[2], dst->nb[3],
+ nbd2, nbd3,
r2, r3);
CUDA_CHECK(cudaGetLastError());
CUBLAS_CHECK(
cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
- &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
- (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
- &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
+ alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+ (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+ beta, ( void **) (ptrs_dst + 0*ne23), cu_data_type, ne01,
ne23,
- CUBLAS_COMPUTE_16F,
+ cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
if (ptrs_src_s != 0) {
}
#endif
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
- to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+ if (dst->op_params[0] == GGML_PREC_DEFAULT) {
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+ to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+
+ ggml_cuda_pool_free(dst_f16, dst_as);
+ }
ggml_cuda_pool_free(src1_as_f16, src1_as);
- ggml_cuda_pool_free(dst_f16, dst_as);
}
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
// TODO: mmq/mmv support
#endif
- GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
+ const int64_t nb11 = src1->nb[1];
+ const int64_t nb1 = dst->nb[1];
const struct ggml_tensor * ids = src0;
const int32_t id = ((int32_t *) dst->op_params)[0];
std::vector<char> ids_host(ggml_nbytes(ids));
+ const cudaStream_t stream = g_cudaStreams[g_main_device][0];
+
if (ids->backend == GGML_BACKEND_GPU) {
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
- CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+ CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
+ CUDA_CHECK(cudaStreamSynchronize(stream));
} else {
memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
}
ggml_tensor src1_row = *src1;
ggml_tensor dst_row = *dst;
- src1_row.ne[1] = 1;
- dst_row.ne[1] = 1;
-
- src1_row.nb[2] = src1_row.nb[1];
- dst_row.nb[2] = dst_row.nb[1];
-
- src1_row.nb[3] = src1_row.nb[1];
- dst_row.nb[3] = dst_row.nb[1];
+ src1_row.backend = GGML_BACKEND_GPU;
+ dst_row.backend = GGML_BACKEND_GPU;
src1_row.extra = &src1_row_extra;
dst_row.extra = &dst_row_extra;
+ char * src1_original = src1->backend == GGML_BACKEND_CPU ?
+ (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
+ char * dst_original = dst->backend == GGML_BACKEND_CPU ?
+ (char *) dst->data : (char *) dst_extra->data_device[g_main_device];
+
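+ // two dispatch paths: with a single src1 row, each expert matmul runs directly on a view of
+ // src1/dst; with multiple rows, the rows routed to each expert are first packed into a
+ // contiguous buffer, multiplied once per expert, and then scattered back to their original positions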
+ if (src1->ne[1] == 1) {
+ GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+ GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
+
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+ //int32_t row_id;
+ //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+ //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+ const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+ const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+
+ src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
+ src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
+
+ dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
+ dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
+
+ ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+ }
+ } else {
+ size_t as_src1, as_dst;
+ char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
+ char * dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst), &as_dst);
+
+ src1_row_extra.data_device[g_main_device] = src1_contiguous;
+ dst_row_extra.data_device[g_main_device] = dst_contiguous;
+
+ const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
+ cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
+ const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ?
+ cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;
- for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
- //int32_t row_id;
- //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
- //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+ for (int32_t row_id = 0; row_id < n_as; ++row_id) {
+ const struct ggml_tensor * src0_row = dst->src[row_id + 2];
- const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+ int64_t num_src1_rows = 0;
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+ const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
- GGML_ASSERT(row_id >= 0 && row_id < n_as);
+ if (row_id_i != row_id) {
+ continue;
+ }
+
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+ CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
+ nb11, src1_kind, stream));
+ num_src1_rows++;
+ }
+
+ if (num_src1_rows == 0) {
+ continue;
+ }
- const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+ src1_row.ne[1] = num_src1_rows;
+ dst_row.ne[1] = num_src1_rows;
- src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1];
- src1_row.data = (char *) src1->data + i01*src1->nb[1];
+ src1_row.nb[1] = nb11;
+ src1_row.nb[2] = num_src1_rows*nb11;
+ src1_row.nb[3] = num_src1_rows*nb11;
- dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1];
- dst_row.data = (char *) dst->data + i01*dst->nb[1];
+ dst_row.nb[1] = nb1;
+ dst_row.nb[2] = num_src1_rows*nb1;
+ dst_row.nb[3] = num_src1_rows*nb1;
- ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+ ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+
+ num_src1_rows = 0;
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+ const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+ if (row_id_i != row_id) {
+ continue;
+ }
+
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+ CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
+ nb1, dst_kind, stream));
+ num_src1_rows++;
+ }
+ }
+
+ ggml_cuda_pool_free(src1_contiguous, as_src1);
+ ggml_cuda_pool_free(dst_contiguous, as_dst);
+ }
+
+ if (dst->backend == GGML_BACKEND_CPU) {
+ CUDA_CHECK(cudaStreamSynchronize(stream));
}
}
(void) dst;
}
+static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+ return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
+}
+
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
const int64_t nrows = ggml_nrows(tensor);
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
if (ne0 % MATRIX_ROW_PADDING != 0) {
- size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
- * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
}
char * buf;
}
void ggml_cuda_free_data(struct ggml_tensor * tensor) {
- if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
+ if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
return;
}
ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
- const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
- tensor->op == GGML_OP_VIEW;
+ const bool inplace = tensor->view_src != nullptr;
- if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+ if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
size_t view_offset = 0;
if (tensor->op == GGML_OP_VIEW) {
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
- if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
+ if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
return false;
}
if (tensor->op == GGML_OP_MUL_MAT) {
if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
#ifndef NDEBUG
- fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+ fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
#endif
return false;
}
return false;
}
+ if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
+ ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
+ }
+
if (params->ith != 0) {
return true;
}
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
if (tensor->view_src != NULL && tensor->view_offs == 0) {
- assert(tensor->view_src->buffer->buft == buffer->buft); // TODO
+ assert(tensor->view_src->buffer->buft == buffer->buft);
tensor->backend = tensor->view_src->backend;
tensor->extra = tensor->view_src->extra;
return;
}
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
}
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
}
+static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+ ggml_cuda_set_device(ctx->device);
+ CUDA_CHECK(cudaDeviceSynchronize());
+
+ CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
+}
+
static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
/* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
/* .cpy_tensor_from = */ NULL,
/* .cpy_tensor_to = */ NULL,
+ /* .clear = */ ggml_backend_cuda_buffer_clear,
};
// cuda buffer type
if (ggml_is_quantized(tensor->type)) {
if (ne0 % MATRIX_ROW_PADDING != 0) {
- size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
- * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
}
}
UNUSED(buft);
}
-static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
+static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
+ /* .is_host = */ nullptr,
};
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
- static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
- static bool ggml_backend_buffer_type_cuda_initialized = false;
- if (!ggml_backend_buffer_type_cuda_initialized) {
+ static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
+
+ static bool ggml_backend_cuda_buffer_type_initialized = false;
+
+ if (!ggml_backend_cuda_buffer_type_initialized) {
for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
- ggml_backend_buffer_type_cuda[i] = {
- /* .iface = */ cuda_backend_buffer_type_interface,
+ ggml_backend_cuda_buffer_types[i] = {
+ /* .iface = */ ggml_backend_cuda_buffer_type_interface,
/* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
};
}
- ggml_backend_buffer_type_cuda_initialized = true;
+ ggml_backend_cuda_buffer_type_initialized = true;
}
- return &ggml_backend_buffer_type_cuda[device];
+ return &ggml_backend_cuda_buffer_types[device];
}
// host buffer type
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
- CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
- delete ctx;
+ CUDA_CHECK(cudaFreeHost(buffer->context));
}
static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
return buffer;
-
- UNUSED(buft);
}
-struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
- /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
- /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
- /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
- /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
-};
-
ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
- static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = {
- /* .iface = */ cuda_backend_host_buffer_type_interface,
+ static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
+ /* .iface = */ {
+ /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+ /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+ /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
+ /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+ },
/* .context = */ nullptr,
};
- return &ggml_backend_buffer_type_cuda_host;
+ return &ggml_backend_cuda_buffer_type_host;
}
// backend
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
+GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
+
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
+
GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
// helper to check if the device supports a specific family
@implementation GGMLMetalClass
@end
-ggml_log_callback ggml_metal_log_callback = NULL;
+
+static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+ fprintf(stderr, "%s", msg);
+
+ UNUSED(level);
+ UNUSED(user_data);
+}
+
+ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
void * ggml_metal_log_user_data = NULL;
void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
}
// temporarily defined here for compatibility between ggml-backend and the old API
-struct ggml_backend_metal_buffer_context {
- void * data;
+
+struct ggml_backend_metal_buffer {
+ void * data;
+ size_t size;
id<MTLBuffer> metal;
};
+struct ggml_backend_metal_buffer_context {
+ void * all_data;
+ size_t all_size;
+ bool owned;
+
+ // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
+ int n_buffers;
+ struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
+};
+
// finds the Metal buffer that contains the tensor data on the GPU device
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
// Metal buffer based on the host memory pointer
const int64_t tsize = ggml_nbytes(t);
+ ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
+
// compatibility with ggml-backend
- if (t->buffer && t->buffer->buft == ggml_backend_metal_buffer_type()) {
- struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) t->buffer->context;
+ if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) {
+ struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
+
+ // find the view that contains the tensor fully
+ for (int i = 0; i < buf_ctx->n_buffers; ++i) {
+ const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
- const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->data;
+ //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
+ if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
+ *offs = (size_t) ioffs;
- GGML_ASSERT(ioffs >= 0 && ioffs + tsize <= (int64_t) t->buffer->size);
+ //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
+
+ return buf_ctx->buffers[i].metal;
+ }
+ }
- *offs = (size_t) ioffs;
+ GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
- return buf_ctx->metal;
+ return nil;
}
// find the view that contains the tensor fully
{
GGML_ASSERT(ggml_is_contiguous(src0));
- const float scale = *(const float *) src1->data;
+ const float scale = *(const float *) dst->op_params;
int64_t n = ggml_nelements(dst);
[encoder setComputePipelineState:ctx->pipeline_scale];
}
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
- [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&scale length:sizeof(scale) atIndex:2];
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
// backend interface
+// default buffer
static id<MTLDevice> g_backend_device = nil;
static int g_backend_device_ref_count = 0;
static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
- return ctx->data;
+ return ctx->all_data;
}
static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
- [ctx->metal release];
+ for (int i = 0; i < ctx->n_buffers; i++) {
+ [ctx->buffers[i].metal release];
+ }
ggml_backend_metal_free_device();
- free(ctx->data);
- free(ctx);
+ if (ctx->owned) {
+ free(ctx->all_data);
+ }
- UNUSED(buffer);
+ free(ctx);
}
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
memcpy((char *)tensor->data + offset, data, size);
UNUSED(buffer);
}
static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
memcpy(data, (const char *)tensor->data + offset, size);
UNUSED(buffer);
UNUSED(buffer);
}
-static struct ggml_backend_buffer_i metal_backend_buffer_i = {
+static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+ memset(ctx->all_data, value, ctx->all_size);
+}
+
+static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
/* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
/* .get_base = */ ggml_backend_metal_buffer_get_base,
/* .init_tensor = */ NULL,
/* .get_tensor = */ ggml_backend_metal_buffer_get_tensor,
/* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from,
/* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to,
+ /* .clear = */ ggml_backend_metal_buffer_clear,
};
+// default buffer type
+
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
size_aligned += (size_page - (size_aligned % size_page));
}
- ctx->data = ggml_metal_host_malloc(size);
- ctx->metal = [ggml_backend_metal_get_device() newBufferWithBytesNoCopy:ctx->data
+ id<MTLDevice> device = ggml_backend_metal_get_device();
+
+ ctx->all_data = ggml_metal_host_malloc(size_aligned);
+ ctx->all_size = size_aligned;
+ ctx->owned = true;
+ ctx->n_buffers = 1;
+
+ ctx->buffers[0].data = ctx->all_data;
+ ctx->buffers[0].size = size;
+ ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
length:size_aligned
options:MTLResourceStorageModeShared
deallocator:nil];
- return ggml_backend_buffer_init(buft, metal_backend_buffer_i, ctx, size);
+ if (ctx->buffers[0].metal == nil) {
+ GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+ free(ctx);
+ ggml_backend_metal_free_device();
+ return NULL;
+ }
+
+ GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
+
+#if TARGET_OS_OSX
+ GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
+ device.currentAllocatedSize / 1024.0 / 1024.0,
+ device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+ if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
+ GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
+ } else {
+ GGML_METAL_LOG_INFO("\n");
+ }
+#else
+ GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
+#endif
+
+ return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
}
static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
- GGML_UNUSED(buft);
+ UNUSED(buft);
+}
+
+static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+ return true;
+
+ UNUSED(buft);
}
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
+ /* .is_host = */ ggml_backend_metal_buffer_type_is_host,
},
/* .context = */ NULL,
};
return &ggml_backend_buffer_type_metal;
}
+// buffer from ptr
+
+ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
+ struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
+
+ ctx->all_data = data;
+ ctx->all_size = size;
+ ctx->owned = false;
+ ctx->n_buffers = 0;
+
+ const size_t size_page = sysconf(_SC_PAGESIZE);
+ size_t size_aligned = size;
+ if ((size_aligned % size_page) != 0) {
+ size_aligned += (size_page - (size_aligned % size_page));
+ }
+
+ id<MTLDevice> device = ggml_backend_metal_get_device();
+
+ // the buffer fits into the max buffer size allowed by the device
+ if (size_aligned <= device.maxBufferLength) {
+ ctx->buffers[ctx->n_buffers].data = data;
+ ctx->buffers[ctx->n_buffers].size = size;
+
+ ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+ if (ctx->buffers[ctx->n_buffers].metal == nil) {
+ GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+ return false;
+ }
+
+ GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
+
+ ++ctx->n_buffers;
+ } else {
+ // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
+ // one of the views
+ const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
+ const size_t size_step = device.maxBufferLength - size_ovlp;
+ const size_t size_view = device.maxBufferLength;
+
+ for (size_t i = 0; i < size; i += size_step) {
+ const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
+
+ ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
+ ctx->buffers[ctx->n_buffers].size = size_step_aligned;
+
+ ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+ if (ctx->buffers[ctx->n_buffers].metal == nil) {
+ GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
+ return false;
+ }
+
+ GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, offs = %12ld", __func__, size_step_aligned / 1024.0 / 1024.0, i);
+ if (i + size_step < size) {
+ GGML_METAL_LOG_INFO("\n");
+ }
+
+ ++ctx->n_buffers;
+ }
+ }
+
+#if TARGET_OS_OSX
+ GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
+ device.currentAllocatedSize / 1024.0 / 1024.0,
+ device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+ if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
+ GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
+ } else {
+ GGML_METAL_LOG_INFO("\n");
+ }
+#else
+ GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
+#endif
+
+ return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
+}
+
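// Illustrative sketch, with assumed mapping variables: wrapping an mmap'd model file so
// Metal can address it without copying. max_size should be the size of the largest tensor
// so that the overlapping views created above can always contain it fully.
//
//   void * addr = mmap(NULL, file_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
//   ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr(addr, file_size, max_tensor_size);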
+// backend
+
static const char * ggml_backend_metal_name(ggml_backend_t backend) {
return "Metal";
free(backend);
}
-static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
- UNUSED(backend);
-}
-
static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
return ggml_backend_metal_buffer_type();
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_from_async = */ NULL,
/* .cpy_tensor_to_async = */ NULL,
- /* .synchronize = */ ggml_backend_metal_synchronize,
- /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm
+ /* .synchronize = */ NULL,
+ /* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_metal_graph_compute,
/* .supports_op = */ ggml_backend_metal_supports_op,
};
-// TODO: make a common log callback for all backends in ggml-backend
-static void ggml_backend_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
- fprintf(stderr, "%s", msg);
-
- UNUSED(level);
- UNUSED(user_data);
-}
-
ggml_backend_t ggml_backend_metal_init(void) {
- ggml_metal_log_set_callback(ggml_backend_log_callback, NULL);
-
struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
if (ctx == NULL) {
dst_data[1] = x0*sin_theta + x1*cos_theta;
}
} else {
- for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
- for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
+ for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) {
+ if (ic < n_dims) {
+ const int64_t ib = 0;
// simplified from `(ib * n_dims + ic) * inv_ndims`
const float cur_rot = inv_ndims*ic - ib;
dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+ } else {
+ const int64_t i0 = ic;
+
+ device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+ device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+ dst_data[0] = src[0];
+ dst_data[1] = src[1];
}
}
}
const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4);
const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
- const ggml_int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))};
+ const ggml_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}};
const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])),
vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0])));
const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])),
const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
const int8x16_t scales = vld1q_s8(scale);
- const ggml_int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))};
+ const ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}};
const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
}
-size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
- return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
-}
-
int ggml_blck_size(enum ggml_type type) {
return type_traits[type].blck_size;
}
return type_traits[type].type_size;
}
-float ggml_type_sizef(enum ggml_type type) {
- return ((float)(type_traits[type].type_size))/type_traits[type].blck_size;
+size_t ggml_row_size(enum ggml_type type, int64_t ne) {
+ assert(ne % ggml_blck_size(type) == 0);
+ return ggml_type_size(type)*ne/ggml_blck_size(type);
+}
+
+double ggml_type_sizef(enum ggml_type type) {
+ return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
}
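// Illustrative sketch, not part of the patch: ggml_row_size() returns an exact byte count
// for a whole row, which matters for block-quantized types where the old float-based
// ggml_type_sizef() estimate could round. The Q4_0 numbers assume the usual block layout
// of 32 weights per 18-byte block.
//
//   size_t row_f32  = ggml_row_size(GGML_TYPE_F32,  4096); // 4096 * 4 = 16384 bytes
//   size_t row_q4_0 = ggml_row_size(GGML_TYPE_Q4_0, 4096); // 4096/32 * 18 = 2304 bytes
//   // ne must be a multiple of the block size; the assert above enforces this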
const char * ggml_type_name(enum ggml_type type) {
return ggml_type_size(tensor->type);
}
-static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
+bool ggml_is_scalar(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
}
-static inline bool ggml_is_vector(const struct ggml_tensor * tensor) {
+bool ggml_is_vector(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
}
-static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {
+bool ggml_is_matrix(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return tensor->ne[2] == 1 && tensor->ne[3] == 1;
}
+bool ggml_is_3d(const struct ggml_tensor * tensor) {
+ return tensor->ne[3] == 1;
+}
+
+int ggml_n_dims(const struct ggml_tensor * tensor) {
+ for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
+ if (tensor->ne[i] > 1) {
+ return i + 1;
+ }
+ }
+ return 1;
+}
+
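// Illustrative migration sketch for assumed caller code: with the n_dims field removed from
// ggml_tensor, dimensionality checks go through the helpers above instead.
//
//   // old: if (tensor->n_dims == 2) { ... }
//   // new:
//   if (ggml_is_matrix(tensor)) { ... }   // ne[2] == 1 && ne[3] == 1
//   if (ggml_n_dims(tensor) <= 2) { ... } // trailing dims of size 1 are ignored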
static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
size_t max_size = 0;
- struct ggml_object * obj = ctx->objects_begin;
-
- while (obj != NULL) {
- if (obj->type == GGML_OBJECT_TENSOR) {
- struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
-
- const size_t size = ggml_nbytes(tensor);
-
- if (max_size < size) {
- max_size = size;
- }
- }
-
- obj = obj->next;
+ for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
+ max_size = MAX(max_size, ggml_nbytes(tensor));
}
return max_size;
view_src = view_src->view_src;
}
- size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
+ size_t data_size = ggml_row_size(type, ne[0]);
for (int i = 1; i < n_dims; i++) {
data_size *= ne[i];
}
/*.type =*/ type,
/*.backend =*/ GGML_BACKEND_CPU,
/*.buffer =*/ NULL,
- /*.n_dims =*/ n_dims,
/*.ne =*/ { 1, 1, 1, 1 },
/*.nb =*/ { 0, 0, 0, 0 },
/*.op =*/ GGML_OP_NONE,
}
struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
- return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne);
+ return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
}
static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
struct ggml_tensor * ggml_view_tensor(
struct ggml_context * ctx,
struct ggml_tensor * src) {
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0);
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, GGML_MAX_DIMS, src->ne, src, 0);
ggml_format_name(result, "%s (view)", src->name);
for (int i = 0; i < GGML_MAX_DIMS; i++) {
return result;
}
-struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
+struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
struct ggml_object * obj = ctx->objects_begin;
char * const mem_buffer = ctx->mem_buffer;
return NULL;
}
-struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
obj = obj->next;
is_node = true;
}
- struct ggml_tensor * result = ggml_new_tensor(ctx, type, a->n_dims, a->ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
result->op = GGML_OP_ADD;
- result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne) : NULL;
+ result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne) : NULL;
result->src[0] = a;
result->src[1] = b;
is_node = true;
}
- int64_t ne[4] = {1,1,1,1};
- for (int i=1; i<a->n_dims; ++i) {
+ int64_t ne[GGML_MAX_DIMS] = { 1 };
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
ne[i] = a->ne[i];
}
- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, a->n_dims, ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
result->op = GGML_OP_SUM_ROWS;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
is_node = true;
}
- int64_t ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] };
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne);
+ int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] };
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
result->op = GGML_OP_MEAN;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
is_node = true;
}
- int64_t ne[GGML_MAX_DIMS] = { a->ne[1], 1, 1, 1 };
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne);
+ struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);
result->op = GGML_OP_ARGMAX;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
is_node = true;
}
- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
result->op = GGML_OP_REPEAT;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
return a;
}
- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
result->op = GGML_OP_REPEAT_BACK;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
}
const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
result->op = GGML_OP_MUL_MAT;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
return result;
}
+void ggml_mul_mat_set_prec(
+ struct ggml_tensor * a,
+ enum ggml_prec prec) {
+ const int32_t prec_i32 = (int32_t) prec;
+
+ ggml_set_op_params_i32(a, 0, prec_i32);
+}
+
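// editor's usage sketch (hypothetical caller, not part of the diff): ggml_mul_mat_set_prec()
// records the requested precision in op-param slot 0 of a GGML_OP_MUL_MAT node. A caller that
// wants an F32 accumulator for a numerically sensitive matmul might look like this (assumes the
// GGML_PREC_F32 value from enum ggml_prec in ggml.h):
static struct ggml_tensor * mul_mat_f32_prec(struct ggml_context * ctx,
                                             struct ggml_tensor * a,
                                             struct ggml_tensor * b) {
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);
    ggml_mul_mat_set_prec(c, GGML_PREC_F32); // stored via ggml_set_op_params_i32(c, 0, ...)
    return c;
}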
// ggml_mul_mat_id
struct ggml_tensor * ggml_mul_mat_id(
}
const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
ggml_set_op_params_i32(result, 0, id);
ggml_set_op_params_i32(result, 1, n_as);
// a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
result->op = GGML_OP_OUT_PROD;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
static struct ggml_tensor * ggml_scale_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
- struct ggml_tensor * b,
+ float s,
bool inplace) {
- GGML_ASSERT(ggml_is_scalar(b));
GGML_ASSERT(ggml_is_padded_1d(a));
bool is_node = false;
- if (a->grad || b->grad) {
+ if (a->grad) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+ ggml_set_op_params(result, &s, sizeof(s));
+
result->op = GGML_OP_SCALE;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
- result->src[1] = b;
return result;
}
struct ggml_tensor * ggml_scale(
struct ggml_context * ctx,
struct ggml_tensor * a,
- struct ggml_tensor * b) {
- return ggml_scale_impl(ctx, a, b, false);
+ float s) {
+ return ggml_scale_impl(ctx, a, s, false);
}
struct ggml_tensor * ggml_scale_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
- struct ggml_tensor * b) {
- return ggml_scale_impl(ctx, a, b, true);
+ float s) {
+ return ggml_scale_impl(ctx, a, s, true);
}
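// editor's migration sketch (illustrative caller): the scale factor is now a plain float stored
// in op_params, so callers no longer allocate a one-element tensor for it, e.g.
//   before: cur = ggml_scale(ctx, cur, ggml_new_f32(ctx, 0.5f));
//   after : cur = ggml_scale(ctx, cur, 0.5f);
// The backward pass reads the factor back with memcpy(&s, tensor->op_params, sizeof(float)).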
// ggml_set
//GGML_ASSERT(false);
}
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0);
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
ggml_format_name(result, "%s (reshaped)", a->name);
result->op = GGML_OP_RESHAPE;
}
const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, MAX(a->n_dims, 2), ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne);
result->op = GGML_OP_DIAG;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
is_node = true;
}
- const int64_t ne[3] = {
+ const int64_t ne[2] = {
ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
a->ne[1],
};
enum ggml_sort_order order) {
bool is_node = false;
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, a->ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
ggml_set_op_params_i32(result, 0, (int32_t) order);
}
//struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, q->n_dims, q->ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne);
int32_t t = masked ? 1 : 0;
ggml_set_op_params(result, &t, sizeof(t));
}
//struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);
result->op = GGML_OP_FLASH_FF;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
const int np = npx*npy;
const int64_t ne[4] = { a->ne[0], w, w, np, };
-
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
int32_t params[] = { npx, npy, w };
const int ith = params->ith;
const int nth = params->nth;
-// TODO: OpenCL kernel support broadcast
#ifdef GGML_USE_CLBLAST
if (src1->backend == GGML_BACKEND_GPU) {
- GGML_ASSERT(ggml_are_same_shape(src0, src1));
+ // TODO: OpenCL kernel to support full broadcast
+ GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
if (ith == 0) {
ggml_cl_mul(src0, src1, dst);
}
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
+ GGML_ASSERT(eps > 0.0f);
+
// TODO: optimize
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
+ GGML_ASSERT(eps > 0.0f);
+
// TODO: optimize
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
}
#endif
-// off1 = offset in i11 and i1
-// cne1 = ne11 and ne1
-// in a normal matrix multiplication, off1 = 0 and cne1 = ne1
-// during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
static void ggml_compute_forward_mul_mat(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
- struct ggml_tensor * dst,
- int64_t off1, int64_t cne1) {
+ struct ggml_tensor * dst) {
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2;
- const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
- const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13);
- float * d = (float *) ((char *) dst->data + off1*nb1 + i12*nb2 + i13*nb3);
+ const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
+ const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
if (type != GGML_TYPE_F32) {
float * const wdata = params->wdata;
}
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
- cne1, ne01, ne10,
+ ne1, ne01, ne10,
1.0f, y, ne10,
x, ne00,
0.0f, d, ne01);
if (params->type == GGML_TASK_INIT) {
if (src1->type != vec_dot_type) {
char * wdata = params->wdata;
- const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
assert(params->wsize >= ne11*ne12*ne13*row_size);
assert(src1->type == GGML_TYPE_F32);
}
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
- const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
- const int64_t nr0 = ne01; // src0 rows
- const int64_t nr1 = cne1*ne12*ne13; // src1 rows
+ const int64_t nr0 = ne01; // src0 rows
+ const int64_t nr1 = ne1*ne12*ne13; // src1 rows
//printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
- const int64_t i13 = (ir1/(ne12*cne1));
- const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
- const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1;
+ const int64_t i13 = (ir1/(ne12*ne1));
+ const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
+ const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
// broadcast src0 into src1
const int64_t i03 = i13/r3;
static void ggml_compute_forward_mul_mat_id(
const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
+ const struct ggml_tensor * ids,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
- // during GGML_TASK_INIT the entire src1 is converted to vec_dot_type
- ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]);
- return;
- }
+ const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
+
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const enum ggml_type type = src0->type;
+
+ const bool src1_cont = ggml_is_contiguous(src1);
+
+ ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
+ enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
+ ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
+
+ GGML_ASSERT(ne0 == ne01);
+ GGML_ASSERT(ne1 == ne11);
+ GGML_ASSERT(ne2 == ne12);
+ GGML_ASSERT(ne3 == ne13);
+
+ // we don't support permuted src0 or src1
+ GGML_ASSERT(nb00 == ggml_type_size(type));
+ GGML_ASSERT(nb10 == ggml_type_size(src1->type));
- const struct ggml_tensor * ids = src0;
+ // dst cannot be transposed or permuted
+ GGML_ASSERT(nb0 == sizeof(float));
+ GGML_ASSERT(nb0 <= nb1);
+ GGML_ASSERT(nb1 <= nb2);
+ GGML_ASSERT(nb2 <= nb3);
+
+ // broadcast factors
+ const int64_t r2 = ne12/ne02;
+ const int64_t r3 = ne13/ne03;
+
+ // row groups
const int id = ggml_get_op_params_i32(dst, 0);
const int n_as = ggml_get_op_params_i32(dst, 1);
- for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
- const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+ char * wdata_src1_end = (src1->type == vec_dot_type) ?
+ (char *) params->wdata :
+ (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
+
+ int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+ int64_t * matrix_rows = matrix_row_counts + n_as; // [n_as][ne11]
+
+ #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
+
+ if (params->type == GGML_TASK_INIT) {
+ char * wdata = params->wdata;
+ if (src1->type != vec_dot_type) {
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+ assert(params->wsize >= ne11*ne12*ne13*row_size);
+ assert(src1->type == GGML_TYPE_F32);
+
+ for (int64_t i13 = 0; i13 < ne13; ++i13) {
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
+ from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
+ wdata += row_size;
+ }
+ }
+ }
+ }
+
+ // initialize matrix_row_counts
+ GGML_ASSERT(wdata == wdata_src1_end);
+ memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
+
+ // group rows by src0 matrix
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+ const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
- GGML_ASSERT(row_id >= 0 && row_id < n_as);
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
+ MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
+ matrix_row_counts[row_id] += 1;
+ }
- const struct ggml_tensor * src0_row = dst->src[row_id + 2];
- ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
+ return;
}
+
+ if (params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ // compute each matrix multiplication in sequence
+ for (int cur_a = 0; cur_a < n_as; ++cur_a) {
+ const int64_t cne1 = matrix_row_counts[cur_a];
+
+ if (cne1 == 0) {
+ continue;
+ }
+
+ const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
+
+ const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+ const int64_t nr0 = ne01; // src0 rows
+ const int64_t nr1 = cne1*ne12*ne13; // src1 rows
+
+ //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+
+ // distribute the thread work across the inner or outer loop based on which one is larger
+
+ const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+ const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
+
+ const int64_t ith0 = ith % nth0;
+ const int64_t ith1 = ith / nth0;
+
+ const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
+ const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
+
+ const int64_t ir010 = dr0*ith0;
+ const int64_t ir011 = MIN(ir010 + dr0, nr0);
+
+ const int64_t ir110 = dr1*ith1;
+ const int64_t ir111 = MIN(ir110 + dr1, nr1);
+
+ //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
+
+ // threads with no work simply yield (not sure if it helps)
+ if (ir010 >= ir011 || ir110 >= ir111) {
+ sched_yield();
+ continue;
+ }
+
+ assert(ne12 % ne02 == 0);
+ assert(ne13 % ne03 == 0);
+
+ // block-tiling attempt
+ const int64_t blck_0 = 16;
+ const int64_t blck_1 = 16;
+
+ // attempt to reduce false-sharing (does not seem to make a difference)
+ float tmp[16];
+
+ for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
+ for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
+ for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+ const int64_t i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
+ const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
+ const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
+ const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
+
+ // broadcast src0 into src1
+ const int64_t i03 = i13/r3;
+ const int64_t i02 = i12/r2;
+
+ const int64_t i1 = i11;
+ const int64_t i2 = i12;
+ const int64_t i3 = i13;
+
+ const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);
+
+ // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+ // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+ // the original src1 data pointer, so we should index using the indices directly
+ // TODO: this is a bit of a hack, we should probably have a better way to handle this
+ const char * src1_col = (const char *) wdata +
+ (src1_cont || src1->type != vec_dot_type
+ ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
+ : (i11*nb11 + i12*nb12 + i13*nb13));
+
+ float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+
+ //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+ // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+ //}
+
+ for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+ vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+ }
+ memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
+ }
+ }
+ }
+ }
+
+ #undef MMID_MATRIX_ROW
}
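// editor's sketch (simplified, standalone; the names are hypothetical): the INIT pass above
// groups the src1 rows by the src0 matrix selected in `ids`, so that the compute pass can run
// one matmul per matrix over only the rows routed to it. The helper below illustrates just the
// grouping step; it assumes <assert.h> and <string.h> are available.
static void mmid_group_rows(const int32_t * row_to_matrix, // selected matrix per src1 row
                            int64_t n_rows, int n_as,
                            int64_t * counts,              // [n_as]
                            int64_t * rows) {              // [n_as][n_rows]
    memset(counts, 0, n_as*sizeof(int64_t));
    for (int64_t i = 0; i < n_rows; ++i) {
        const int32_t m = row_to_matrix[i];
        assert(m >= 0 && m < n_as);
        rows[m*n_rows + counts[m]] = i;
        counts[m] += 1;
    }
}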
// ggml_compute_forward_out_prod
static void ggml_compute_forward_scale_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
- GGML_ASSERT(ggml_is_scalar(src1));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
// scale factor
- const float v = *(float *) src1->data;
+ float v;
+ memcpy(&v, dst->op_params, sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
static void ggml_compute_forward_scale(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
- ggml_compute_forward_scale_f32(params, src0, src1, dst);
+ ggml_compute_forward_scale_f32(params, src0, dst);
} break;
default:
{
}
} else {
// TODO: this might be wrong for ne0 != n_dims - need double check
- // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
+ // it seems we have to rope just the first n_dims elements and do nothing with the rest
+ // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
theta_base *= freq_scale;
- for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
- for (int64_t ic = 0; ic < n_dims; ic += 2) {
+ for (int64_t ic = 0; ic < ne0; ic += 2) {
+ if (ic < n_dims) {
+ const int64_t ib = 0;
+
// simplified from `(ib * n_dims + ic) * inv_ndims`
float cur_rot = inv_ndims * ic - ib;
dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+ } else {
+ const int64_t i0 = ic;
+
+ const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+ dst_data[0] = src[0];
+ dst_data[1] = src[1];
}
}
}
}
} else {
// TODO: this might be wrong for ne0 != n_dims - need double check
- // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
+ // it seems we have to rope just the first n_dims elements and do nothing with the rest
+ // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
theta_base *= freq_scale;
- for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
- for (int64_t ic = 0; ic < n_dims; ic += 2) {
+ for (int64_t ic = 0; ic < ne0; ic += 2) {
+ if (ic < n_dims) {
+ const int64_t ib = 0;
+
// simplified from `(ib * n_dims + ic) * inv_ndims`
float cur_rot = inv_ndims * ic - ib;
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+ } else {
+ const int64_t i0 = ic;
+
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+ ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+ dst_data[0] = src[0];
+ dst_data[1] = src[1];
}
}
}
} break;
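// editor's sketch (single row, f32, heavily simplified; ignores the frequency-scaling and
// correction handling of the real kernel, assumes <math.h>): the change above makes the neox
// path rotate only the first n_dims elements, pairing x[ic/2] with x[ic/2 + n_dims/2], and copy
// the remaining ne0 - n_dims elements through unchanged.
static void rope_neox_row_sketch(float * dst, const float * src,
                                 int64_t ne0, int64_t n_dims,
                                 float theta_base, float theta_scale) {
    float theta = theta_base;
    for (int64_t ic = 0; ic < ne0; ic += 2) {
        if (ic < n_dims) {
            const float cos_theta = cosf(theta);
            const float sin_theta = sinf(theta);
            theta *= theta_scale;

            const float x0 = src[ic/2];
            const float x1 = src[ic/2 + n_dims/2];

            dst[ic/2]            = x0*cos_theta - x1*sin_theta;
            dst[ic/2 + n_dims/2] = x0*sin_theta + x1*cos_theta;
        } else {
            dst[ic + 0] = src[ic + 0];
            dst[ic + 1] = src[ic + 1];
        }
    }
}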
case GGML_OP_MUL_MAT:
{
- ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]);
+ ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
} break;
case GGML_OP_MUL_MAT_ID:
{
} break;
case GGML_OP_SCALE:
{
- ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_scale(params, tensor->src[0], tensor);
} break;
case GGML_OP_SET:
{
return replacements->vals[i];
}
- struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne);
+ struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, GGML_MAX_DIMS, node->ne);
// insert clone into replacements
GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
if (ggml_hash_contains(zero_table, a)) {
- struct ggml_tensor * a_zero = ggml_scale(ctx, a, ggml_new_f32(ctx, 0));
+ struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f);
return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
} else {
return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
src0->grad,
ggml_scale(ctx,
ggml_mul(ctx, src0, tensor->grad),
- ggml_new_f32(ctx, 2.0f)),
+ 2.0f),
zero_table);
}
} break;
ggml_div(ctx,
tensor->grad,
tensor),
- ggml_new_f32(ctx, 0.5f)),
+ 0.5f),
zero_table);
}
} break;
{
// necessary for llama
if (src0->grad) {
+ float s;
+ memcpy(&s, tensor->op_params, sizeof(float));
+
src0->grad =
ggml_add_or_set(ctx,
src0->grad,
- ggml_scale_impl(ctx, tensor->grad, src1, false),
- zero_table);
- }
- if (src1->grad) {
- src1->grad =
- ggml_add_or_set(ctx,
- src1->grad,
- ggml_sum(ctx, ggml_mul_impl(ctx, tensor->grad, src0, false)),
+ ggml_scale_impl(ctx, tensor->grad, s, false),
zero_table);
}
} break;
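// note: the src1 gradient branch is dropped above because the scale factor is no longer a
// tensor input to the op; it is a constant stored in op_params, so only src0 receives a
// gradient in the GGML_OP_SCALE backward pass.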
const int n_past = ((int32_t *) tensor->op_params)[0];
src0->grad =
ggml_add_or_set(ctx, src0->grad,
+ /* ggml_diag_mask_inf_impl() shouldn't be here */
+ /* ref: https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
zero_table);
}
} break;
case GGML_OP_MUL_MAT_ID:
{
- // FIXME: blas
n_tasks = n_threads;
} break;
case GGML_OP_OUT_PROD:
} else
#endif
if (node->src[1]->type != vec_dot_type) {
- cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
+ cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
}
} break;
case GGML_OP_MUL_MAT_ID:
{
- const struct ggml_tensor * a = node->src[2];
- const struct ggml_tensor * b = node->src[1];
- const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
- if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
- if (a->type != GGML_TYPE_F32) {
- // here we need memory just for single 2D matrix from src0
- cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
- }
- } else
-#endif
- if (b->type != vec_dot_type) {
- cur = ggml_type_size(vec_dot_type)*ggml_nelements(b)/ggml_blck_size(vec_dot_type);
+ const struct ggml_tensor * src0 = node->src[2];
+ const struct ggml_tensor * src1 = node->src[1];
+ const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
+ if (src1->type != vec_dot_type) {
+ cur = ggml_row_size(vec_dot_type, ggml_nelements(src1));
}
+ const int n_as = ggml_get_op_params_i32(node, 1);
+ cur = GGML_PAD(cur, sizeof(int64_t)); // align
+ cur += n_as * sizeof(int64_t); // matrix_row_counts
+ cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
} break;
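// note (work-buffer layout): the MUL_MAT_ID size computed above is consumed by
// ggml_compute_forward_mul_mat_id() as
//   [ src1 converted to vec_dot_type (only if src1->type != vec_dot_type) ]
//   [ padding up to sizeof(int64_t) alignment                             ]
//   [ matrix_row_counts : n_as int64_t                                    ]
//   [ matrix_rows       : n_as * ne11 int64_t                             ]
// matching the wdata_src1_end / matrix_row_counts / matrix_rows pointers set up there.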
case GGML_OP_OUT_PROD:
{
fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
ggml_type_name(tensor->type),
ggml_op_name (tensor->op),
- tensor->n_dims,
+ ggml_n_dims(tensor),
ne[0], ne[1], ne[2], ne[3],
nb[0], nb[1], nb[2], nb[3],
tensor->data,
arg,
ggml_type_name(tensor->type),
ggml_op_name (tensor->op),
- tensor->n_dims,
+ ggml_n_dims(tensor),
ne[0], ne[1], ne[2], ne[3],
nb[0], nb[1], nb[2], nb[3],
tensor->data,
const uint32_t type = tensor->type;
const uint32_t op = tensor->op;
- const uint32_t n_dims = tensor->n_dims;
fwrite(&type, sizeof(uint32_t), 1, fout);
fwrite(&op, sizeof(uint32_t), 1, fout);
- fwrite(&n_dims, sizeof(uint32_t), 1, fout);
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
const uint64_t ne = tensor->ne[j];
const uint32_t type = tensor->type;
const uint32_t op = tensor->op;
- const uint32_t n_dims = tensor->n_dims;
fwrite(&type, sizeof(uint32_t), 1, fout);
fwrite(&op, sizeof(uint32_t), 1, fout);
- fwrite(&n_dims, sizeof(uint32_t), 1, fout);
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
const uint64_t ne = tensor->ne[j];
{
uint32_t type;
uint32_t op;
- uint32_t n_dims;
for (uint32_t i = 0; i < n_leafs; ++i) {
type = *(const uint32_t *) ptr; ptr += sizeof(type);
op = *(const uint32_t *) ptr; ptr += sizeof(op);
- n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
int64_t ne[GGML_MAX_DIMS];
size_t nb[GGML_MAX_DIMS];
nb[j] = nb_cur;
}
- struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+ struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
tensor->op = (enum ggml_op) op;
ptr += ggml_nbytes(tensor);
- fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+ fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
}
}
{
uint32_t type;
uint32_t op;
- uint32_t n_dims;
for (uint32_t i = 0; i < n_nodes; ++i) {
type = *(const uint32_t *) ptr; ptr += sizeof(type);
op = *(const uint32_t *) ptr; ptr += sizeof(op);
- n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
enum ggml_op eop = (enum ggml_op) op;
} break;
default:
{
- tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+ tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
tensor->op = eop;
} break;
result->nodes[i] = tensor;
- fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+ fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
}
}
}
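// note: with n_dims removed, the exported graph no longer stores a per-tensor rank; all
// GGML_MAX_DIMS ne[]/nb[] values are written as before, and the importer simply recreates every
// tensor with GGML_MAX_DIMS, since unused trailing dimensions are 1.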
fprintf(fp, "(%s)|", ggml_type_name(node->type));
}
- if (node->n_dims == 2) {
+ if (ggml_is_matrix(node)) {
fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
} else {
fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
int64_t i = 0;
for (int p = 0; p < np; ++p) {
const int64_t ne = ggml_nelements(ps[p]);
- const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
+ const float p_decay = ((ggml_n_dims(ps[p]) >= decay_min_ndim) ? decay : 0.0f) * sched;
for (int64_t j = 0; j < ne; ++j) {
float x = ggml_get_f32_1d(ps[p], j);
float g_ = g[i]*gnorm;
return NULL;
}
- const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
+ const size_t size_cur = ggml_row_size(info->type, ne);
ctx->size += GGML_PAD(size_cur, ctx->alignment);
}
return ctx->infos[i].name.data;
}
+enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
+ return ctx->infos[i].type;
+}
+
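// editor's usage sketch (hypothetical helper, not part of the diff; assumes <stdio.h>): the new
// accessor pairs with the existing gguf_get_n_tensors()/gguf_get_tensor_name() getters, e.g. to
// list the tensors in a file together with their types:
static void gguf_print_tensor_types(const struct gguf_context * ctx) {
    const int n_tensors = gguf_get_n_tensors(ctx);
    for (int i = 0; i < n_tensors; ++i) {
        printf("%-40s %s\n",
               gguf_get_tensor_name(ctx, i),
               ggml_type_name(gguf_get_tensor_type(ctx, i)));
    }
}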
// returns the index
static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
const int idx = gguf_find_key(ctx, key);
ctx->infos[idx].ne[i] = 1;
}
- ctx->infos[idx].n_dims = tensor->n_dims;
- for (int i = 0; i < tensor->n_dims; i++) {
+ ctx->infos[idx].n_dims = ggml_n_dims(tensor);
+ for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) {
ctx->infos[idx].ne[i] = tensor->ne[i];
}
ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
} else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) {
GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
- std::vector<uint8_t> dataq(ggml_type_size(tensor->type)*size/ggml_blck_size(tensor->type));
+ std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
int64_t hist[16];
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size, hist);
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
size_t bs = ggml_blck_size(t->type);
+ std::vector<float> vq(ggml_blck_size(t->type));
+ bool quantized = ggml_is_quantized(t->type);
// access elements by index to avoid gaps in views
for (int64_t i3 = 0; i3 < t->ne[3]; i3++) {
tv.push_back(*(float *) &buf[i]);
} else if (t->type == GGML_TYPE_I32) {
tv.push_back((float)*(int32_t *) &buf[i]);
- } else if (ggml_is_quantized(t->type)) {
- std::vector<float> vq(ggml_blck_size(t->type));
- tt.to_float(&buf[i], vq.data(), ggml_blck_size(t->type));
+ } else if (quantized) {
+ tt.to_float(&buf[i], vq.data(), bs);
tv.insert(tv.end(), vq.begin(), vq.end());
} else {
GGML_ASSERT(false);
struct test_scale : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne;
+ float scale;
std::string vars() override {
- return VARS_TO_STR2(type, ne);
+ return VARS_TO_STR3(type, ne, scale);
}
test_scale(ggml_type type = GGML_TYPE_F32,
- std::array<int64_t, 4> ne = {10, 10, 10, 10})
- : type(type), ne(ne) {}
+ std::array<int64_t, 4> ne = {10, 10, 10, 10},
+ float scale = 2.0f)
+ : type(type), ne(ne), scale(scale) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
- ggml_tensor * scale = ggml_new_tensor_1d(ctx, type, 1);
ggml_tensor * out = ggml_scale(ctx, a, scale);
return out;
}
test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512)); // neox (falcon 40B)
test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512)); // neox (falcon 40B)
test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512)); // neox (stablelm)
+ test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512)); // neox (phi-2)
}
test_cases.emplace_back(new test_alibi());
size_t buffer_size = 0;
{
- buffer_size += K * IC * OC * ggml_type_sizef(GGML_TYPE_F16); // tensor a
- buffer_size += IL * IC * N * ggml_type_sizef(GGML_TYPE_F32); // tensor b
+ buffer_size += K * IC * OC * ggml_type_size(GGML_TYPE_F16); // tensor a
+ buffer_size += IL * IC * N * ggml_type_size(GGML_TYPE_F32); // tensor b
buffer_size += 1024; // overhead
}
size_t buffer_size = 0;
{
- buffer_size += KW * KH * IC * OC * ggml_type_sizef(GGML_TYPE_F16); // tensor a
- buffer_size += IW * IH * IC * N * ggml_type_sizef(GGML_TYPE_F32); // tensor b
+ buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F16); // tensor a
+ buffer_size += IW * IH * IC * N * ggml_type_size(GGML_TYPE_F32); // tensor b
buffer_size += 1024; // overhead
}
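// note: the float-returning ggml_type_sizef() is replaced throughout this diff; ggml_type_size()
// gives the exact byte size of one block of the type (identical for the non-quantized F16/F32
// tensors used in these examples), and ggml_row_size() covers the quantized case where the
// element count has to be divided by the block size.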
// scale
{
srand(seed);
- const int nargs = 2;
+ const int nargs = 1;
int64_t ne2[4];
ne2[0] = 1;
for (int ndims = 1; ndims <= 2; ++ndims) {
- x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+ const float s = -1.0f + 2.0f*frand();
+
ggml_set_param(ctx0, x[0]);
- ggml_set_param(ctx0, x[1]);
- struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], x[1]));
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], s));
check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
}
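// note: with the new ggml_scale() the factor is a constant rather than a trainable tensor, so
// the gradient test drops the second argument (nargs 2 -> 1) and draws the factor as a plain
// random float instead of a random one-element tensor.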
ggml_add1(ctx0,
ggml_scale(ctx0,
ggml_soft_max(ctx0, x[0]),
- ggml_new_f32(ctx0, 1.0f - eps)),
+ 1.0f - eps),
ggml_new_f32(ctx0, eps))));
check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY);
void load_model(test_model & model, float* a, float* b, int M, int N, int K, bool use_gpu = false) {
size_t buffer_size = 0;
{
- buffer_size += (M * N) * ggml_type_sizef(GGML_TYPE_F32); // tensor a
- buffer_size += (N * K) * ggml_type_sizef(GGML_TYPE_F32); // tensor b
+ buffer_size += (M * N) * ggml_type_size(GGML_TYPE_F32); // tensor a
+ buffer_size += (N * K) * ggml_type_size(GGML_TYPE_F32); // tensor b
buffer_size += 1024; // overhead
}
qfns.from_float_reference(test_data1, test_q1, size);
return test_q1[0];
};
- size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+ size_t quantized_size = ggml_row_size(type, size);
benchmark_function(size, quantized_size, iterations, quantize_fn);
}
printf("\n");
qfns.from_float(test_data1, test_q1, size);
return test_q1[0];
};
- size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+ size_t quantized_size = ggml_row_size(type, size);
benchmark_function(size, quantized_size, iterations, quantize_fn);
}
printf("\n");
qfns.to_float(test_q1, test_out, size);
return test_out[0];
};
- size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+ size_t quantized_size = ggml_row_size(type, size);
benchmark_function(size, quantized_size, iterations, quantize_fn);
}
printf("\n");
vdot.from_float(test_data1, test_q1, size);
return test_q1[0];
};
- size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+ size_t quantized_size = ggml_row_size(type, size);
benchmark_function(size, quantized_size, iterations, quantize_fn);
}
printf("\n");
qfns.vec_dot(size, &result, test_q1, test_q2);
return result;
};
- size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+ size_t quantized_size = ggml_row_size(type, size);
benchmark_function(size, quantized_size, iterations, quantize_fn);
}
printf("\n");
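// note (helper semantics, for reference): ggml_row_size(type, ne) returns the byte size of ne
// elements of the given type, i.e. ggml_type_size(type)*ne/ggml_blck_size(type) with ne expected
// to be a multiple of the block size, which is exactly the open-coded expression it replaces in
// the benchmarks above.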
struct ggml_tensor * t2 = ggml_new_tensor_2d(ctx0, GGML_TYPE_I16, 10, 20);
struct ggml_tensor * t3 = ggml_new_tensor_3d(ctx0, GGML_TYPE_I32, 10, 20, 30);
- GGML_ASSERT(t1->n_dims == 1);
+ GGML_ASSERT(ggml_n_dims(t1) == 1);
GGML_ASSERT(t1->ne[0] == 10);
GGML_ASSERT(t1->nb[1] == 10*sizeof(float));
- GGML_ASSERT(t2->n_dims == 2);
+ GGML_ASSERT(ggml_n_dims(t2) == 2);
GGML_ASSERT(t2->ne[0] == 10);
GGML_ASSERT(t2->ne[1] == 20);
GGML_ASSERT(t2->nb[1] == 10*sizeof(int16_t));
GGML_ASSERT(t2->nb[2] == 10*20*sizeof(int16_t));
- GGML_ASSERT(t3->n_dims == 3);
+ GGML_ASSERT(ggml_n_dims(t3) == 3);
GGML_ASSERT(t3->ne[0] == 10);
GGML_ASSERT(t3->ne[1] == 20);
GGML_ASSERT(t3->ne[2] == 30);