Revert "llava : add a MobileVLM_V2-1.7B backup (#6152)"

author Georgi Gerganov <redacted>

Wed, 20 Mar 2024 11:29:49 +0000 (13:29 +0200)

committer Georgi Gerganov <redacted>

Wed, 20 Mar 2024 11:29:49 +0000 (13:29 +0200)
author Georgi Gerganov <redacted>
Wed, 20 Mar 2024 11:29:49 +0000 (13:29 +0200)
committer Georgi Gerganov <redacted>
Wed, 20 Mar 2024 11:29:49 +0000 (13:29 +0200)
diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md

index c1f361d1704f472f94ccefded2baf4b182860bab..9eba791dadfef8a67d92c903c02d13cb48966e63 100644 (file)
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -1,13 +1,11 @@
  # MobileVLM
  
-Currently this implementation supports [MobileVLM-1.7B](https://huggingface.co/mtgv/MobileVLM-1.7B) / [MobileVLM_V2-1.7B](https://huggingface.co/mtgv/MobileVLM_V2-1.7B) variants.
+Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.
  
  for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM)
  
  The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.
  
-Notice: The overall process of model inference for both **MobilVLM** and **MobilVLM_V2** models is the same, but the process of model conversion  is a little different. Therefore, using MobiVLM as an example, the different conversion step will be shown.
-
  ## Usage
  Build with cmake or run `make llava-cli` to build it.
  
@@ -36,7 +34,7 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
  python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
  ```
  
-3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** the arg is `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
+3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
  
  ```sh
  python ./examples/llava/convert-image-encoder-to-gguf \
@@ -46,14 +44,6 @@ python ./examples/llava/convert-image-encoder-to-gguf \
      --projector-type ldp
  ```
  
-```sh
-python ./examples/llava/convert-image-encoder-to-gguf \
-    -m path/to/clip-vit-large-patch14-336 \
-    --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
-    --output-dir path/to/MobileVLM-1.7B_V2 \
-    --projector-type ldpv2
-```
-
  4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
  
  ```sh
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp

index 6d7b4950f5c7066851f5474fe1bd21238fc098e8..690bca2eb77329ae851e32fc3ca0351fcf53fd6f 100644 (file)
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -119,7 +119,6 @@ static std::string format(const char * fmt, ...) {
  #define TN_LLAVA_PROJ      "mm.%d.%s"
  #define TN_MVLM_PROJ_MLP   "mm.model.mlp.%d.%s"
  #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
-#define TN_MVLM_PROJ_PEG   "mm.model.peg.%d.%s"
  #define TN_IMAGE_NEWLINE   "model.image_newline"
  
  
@@ -127,14 +126,12 @@ enum projector_type {
      PROJECTOR_TYPE_MLP,
      PROJECTOR_TYPE_MLP_NORM,
      PROJECTOR_TYPE_LDP,
-    PROJECTOR_TYPE_LDPV2,
      PROJECTOR_TYPE_UNKNOWN,
  };
  
  static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
      { PROJECTOR_TYPE_MLP, "mlp" },
      { PROJECTOR_TYPE_LDP, "ldp" },
-    { PROJECTOR_TYPE_LDPV2, "ldpv2"},
  };
  
  
@@ -810,29 +807,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
              }
              embeddings = block_1;
          }
-        else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2)
-        {
-            int n_patch = 24;
-            struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
-            mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
-            mlp_0 = ggml_gelu(ctx0, mlp_0);
-            struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
-            mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
-            // mlp_2 ne = [2048, 576, 1, 1]
-            // // AVG Pool Layer 2*2, strides = 2
-            mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
-            // mlp_2 ne = [576, 2048, 1, 1]
-            mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
-            // mlp_2 ne [24, 24, 2048, 1]
-            mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
-            // weight ne = [3, 3, 2048, 1]
-            struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
-            peg_0 = ggml_add(ctx0, peg_0, mlp_2);
-            peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
-            peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
-            peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
-            embeddings = peg_0;
-        }
          else {
              GGML_ASSERT(false);
          }
@@ -1203,18 +1177,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
              vision_model.mm_model_block_2_block_2_0_w   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
              vision_model.mm_model_block_2_block_2_1_w   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
              vision_model.mm_model_block_2_block_2_1_b   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
-        }
-        else if (new_clip->proj_type == PROJECTOR_TYPE_LDPV2)
-        {
-            // MobilVLM_V2 projection
-            vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "weight"));
-            vision_model.mm_model_mlp_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "bias"));
-            vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "weight"));
-            vision_model.mm_model_mlp_2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "bias"));
-            vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
-            vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
-        }
-        else {
+        } else {
              std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
              throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
          }
@@ -2003,9 +1966,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
      if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
          return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
      }
-    if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
-        return ctx->vision_model.mm_model_peg_0_b->ne[0];
-    }
      if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
          return ctx->vision_model.mm_2_b->ne[0];
      }
diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py

index b00bf7c6d0b59804db18a7ef416cb286af6fa055..c69f89ac2b60ec413d99776f37716de69e011216 100644 (file)
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@@ -1,7 +1,6 @@
  import argparse
  import os
  import json
-import re
  
  import torch
  import numpy as np
@@ -39,11 +38,9 @@ def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: b
  def get_tensor_name(name: str) -> str:
      if "projection" in name:
          return name
+
      if "mm_projector" in name:
-        name = name.replace("model.mm_projector", "mm")
-        name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
-        name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
-        return name
+        return name.replace("model.mm_projector", "mm")
  
      return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
  
@@ -86,7 +83,7 @@ ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
  ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
                  help="The clip model is from openclip (for ViT-SO400M type))")
  ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
-ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
+ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
  ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
  # Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
  # Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
author	Georgi Gerganov <redacted>
	Wed, 20 Mar 2024 11:29:49 +0000 (13:29 +0200)
committer	Georgi Gerganov <redacted>
	Wed, 20 Mar 2024 11:29:49 +0000 (13:29 +0200)
examples/llava/MobileVLM-README.md		patch | blob | history
examples/llava/clip.cpp		patch | blob | history
examples/llava/convert-image-encoder-to-gguf.py		patch | blob | history