enum projector_type {
PROJECTOR_TYPE_MLP,
+ PROJECTOR_TYPE_MLP_NORM,
PROJECTOR_TYPE_LDP,
PROJECTOR_TYPE_UNKNOWN,
};
struct ggml_tensor * projection;
// LLaVA projection
- struct ggml_tensor * mm_0_w;
- struct ggml_tensor * mm_0_b;
- struct ggml_tensor * mm_2_w;
- struct ggml_tensor * mm_2_b;
+ struct ggml_tensor * mm_0_w = NULL;
+ struct ggml_tensor * mm_0_b = NULL;
+ struct ggml_tensor * mm_2_w = NULL;
+ struct ggml_tensor * mm_2_b = NULL;
+
+ // Yi-type models with MLP + normalization projection
+ struct ggml_tensor * mm_1_w = NULL; // Yi-type models use projector layers 0, 1, 3, 4
+ struct ggml_tensor * mm_1_b = NULL;
+ struct ggml_tensor * mm_3_w = NULL;
+ struct ggml_tensor * mm_3_b = NULL;
+ struct ggml_tensor * mm_4_w = NULL;
+ struct ggml_tensor * mm_4_b = NULL;
// MobileVLM projection
struct ggml_tensor * mm_model_mlp_1_w;
// pre-layernorm
{
embeddings = ggml_norm(ctx0, embeddings, eps);
+ ggml_set_name(embeddings, "pre_ln");
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
}
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+
+ } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
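+ // First linear layer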
+ embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+ embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+ // First LayerNorm
+ embeddings = ggml_norm(ctx0, embeddings, eps);
+ embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w), model.mm_1_b);
+
+ // GELU activation
+ embeddings = ggml_gelu(ctx0, embeddings);
+
+ // Second linear layer
+ embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
+ embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
+
+ // Second LayerNorm
+ embeddings = ggml_norm(ctx0, embeddings, eps);
+ embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w), model.mm_4_b);
}
else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projector
else {
new_clip->proj_type = PROJECTOR_TYPE_MLP;
}
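+ // a plain MLP projector that also contains a layer-3 tensor is the Yi-style MLP + norm variant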
+ if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
+ if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
+ new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
+ }
+ }
}
#ifdef GGML_USE_CUBLAS
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
// LLaVA projection
- if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
+ if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
- vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
- vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
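+ // not every projector layer exists in every variant; missing tensors are simply skipped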
+ try {
+ // Yi-type llava
+ vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
+ vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
+ } catch (std::runtime_error & e) { }
+ try {
+ // missing in Yi-type llava
+ vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
+ vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+ } catch (std::runtime_error & e) { }
+ try {
+ // Yi-type llava
+ vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
+ vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
+ } catch (std::runtime_error & e) { }
+ try {
+ // Yi-type llava
+ vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
+ vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
+ } catch (std::runtime_error & e) { }
}
else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projection
}
else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
return ctx->vision_model.mm_2_b->ne[0];
+ } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
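+ // mm_3 is the last linear layer of the MLP + norm projector; the final LayerNorm (mm_4) keeps the same width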
+ return ctx->vision_model.mm_3_b->ne[0];
}
else {
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
- // llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
- eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos);
+ std::string system_prompt, user_prompt;
+ size_t image_pos = prompt.find("<image>");
+ if (image_pos != std::string::npos) {
+ // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
+
+ system_prompt = prompt.substr(0, image_pos);
+ user_prompt = prompt.substr(image_pos + std::string("<image>").length());
+ // replace literal "\n" escape sequences with real newlines in both prompts, in case -e was not used on the templating string
+ size_t pos = 0;
+ while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) {
+ user_prompt.replace(pos, 2, "\n");
+ pos += 1; // advance past the inserted newline
+ }
+ pos = 0; // reset before scanning the system prompt
+ while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) {
+ system_prompt.replace(pos, 2, "\n");
+ pos += 1; // advance past the inserted newline
+ }
+
+ printf("system_prompt: %s\n", system_prompt.c_str());
+ printf("user_prompt: %s\n", user_prompt.c_str());
+ } else {
+ // llava-1.5 native mode
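+ // chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"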
+ system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
+ user_prompt = prompt + "\nASSISTANT:";
+ }
+
+ eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
- eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);
+ eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
// generate the response
for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
if (strcmp(tmp, "</s>") == 0) break;
+ if (strstr(tmp, "###")) break; // Yi-VL models emit "###" as a turn separator
printf("%s", tmp);
fflush(stdout);