git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
export-lora : throw error if lora is quantized (#9002)
author    Xuan Son Nguyen <redacted>
Tue, 13 Aug 2024 09:41:14 +0000 (11:41 +0200)
committer GitHub <redacted>
Tue, 13 Aug 2024 09:41:14 +0000 (11:41 +0200)
examples/export-lora/README.md
examples/export-lora/export-lora.cpp

diff --git a/examples/export-lora/README.md b/examples/export-lora/README.md
index 91c33c34acaa936befd0fd8f402c6672d21b21c1..7dce99c9a9e613bc1c575b520293e30016515683 100644
--- a/examples/export-lora/README.md
+++ b/examples/export-lora/README.md
@@ -17,9 +17,9 @@ For example:
 
 ```bash
 ./bin/llama-export-lora \
-    -m open-llama-3b-v2-q8_0.gguf \
-    -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
-    --lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.gguf
+    -m open-llama-3b-v2.gguf \
+    -o open-llama-3b-v2-english2tokipona-chat.gguf \
+    --lora lora-open-llama-3b-v2-english2tokipona-chat-LATEST.gguf
 ```
 
 Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters:
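As an illustration of stacking adapters (the model and adapter file names below are placeholders, not files shipped with the repository):

```bash
./bin/llama-export-lora \
    -m base-model-f16.gguf \
    -o base-model-f16-merged.gguf \
    --lora adapter-one-f16.gguf \
    --lora-scaled adapter-two-f16.gguf 0.5
```

Note that, as of this commit, the adapter files themselves must be f16 or f32; a quantized adapter makes the tool exit with an error (see the check added to `export-lora.cpp` below).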
diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
index 3176d6e26ef8b59b8bd889859fd63cf414fd642a..c7e5ca78845ee243c6fcbcb8fdb73551f2c685c3 100644
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
 
 static bool g_verbose = false;
 
+struct tensor_transformation {
+    struct ggml_tensor * in;
+    struct ggml_tensor * out;
+    bool is_copy;
+};
+
 static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
     int id = gguf_find_key(ctx_gguf, key.c_str());
     return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
@@ -198,8 +204,7 @@ struct lora_merge_ctx {
         }
 
         // mapping base tensor to out tensor (same shape with base, but different type)
-        // if out_tensor == nullptr, we only copy it
-        std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors;
+        std::vector<tensor_transformation> trans;
         for (auto & it : base_model.tensors) {
             bool t_a = true;
             bool t_b = true;
@@ -212,14 +217,22 @@ struct lora_merge_ctx {
                 // only copy
                 struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
                 ggml_set_name(cpy_tensor, base_tensor->name);
-                base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr));
+                trans.push_back({
+                    cpy_tensor,
+                    cpy_tensor,
+                    true,
+                });
                 gguf_add_tensor(ctx_out, cpy_tensor);
             } else if (t_a && t_b) {
                 // need merging
                 struct ggml_tensor * out_tensor = ggml_new_tensor(
                     ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
                 ggml_set_name(out_tensor, base_tensor->name);
-                base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor));
+                trans.push_back({
+                    base_tensor,
+                    out_tensor,
+                    false,
+                });
                 gguf_add_tensor(ctx_out, out_tensor);
             } else {
                 throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
@@ -234,12 +247,12 @@ struct lora_merge_ctx {
 
         // process base model tensors
         size_t n_merged = 0;
-        for (auto & it : base_to_out_tensors) {
-            if (it.second != nullptr) {
-                merge_tensor(it.first, it.second);
+        for (auto & it : trans) {
+            if (!it.is_copy) {
+                merge_tensor(it.in, it.out);
                 n_merged++;
             } else {
-                copy_tensor(it.first);
+                copy_tensor(it.in);
             }
         }
 
@@ -252,7 +265,7 @@ struct lora_merge_ctx {
         }
 
         printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
-        printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size());
+        printf("%s : wrote %ld tensors to output file\n", __func__, trans.size());
     }
 
     void copy_tensor(struct ggml_tensor * base) {
@@ -285,6 +298,10 @@ struct lora_merge_ctx {
         for (size_t i = 0; i < adapters.size(); ++i) {
             auto t_a = adapters[i]->get_tensor(name_lora_a);
             auto t_b = adapters[i]->get_tensor(name_lora_b);
+            // TODO: add support for quantized lora
+            if (ggml_is_quantized(t_a->type) || ggml_is_quantized(t_b->type)) {
+                throw std::runtime_error("quantized LoRA adapters are not supported, please retry with f16 or f32");
+            }
             inp_a[i] = ggml_dup_tensor(ctx, t_a);
             inp_b[i] = ggml_dup_tensor(ctx, t_b);
         }
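Beyond the quantization check, the patch replaces the `std::pair<ggml_tensor *, ggml_tensor *>` bookkeeping, where a `nullptr` second element meant "copy only", with the explicit `tensor_transformation` struct. A minimal standalone sketch of that pattern, using a stand-in `fake_tensor` type instead of `ggml_tensor` so it compiles on its own:

```cpp
// Sketch of the copy-vs-merge bookkeeping used above (illustrative only;
// fake_tensor stands in for ggml_tensor so the example is self-contained).
#include <cstdio>
#include <string>
#include <vector>

struct fake_tensor { std::string name; };

struct tensor_transformation {
    fake_tensor * in;
    fake_tensor * out;
    bool is_copy;
};

int main() {
    fake_tensor base        {"blk.0.attn_q.weight"};
    fake_tensor merged      {"blk.0.attn_q.weight"};   // same name/shape, possibly different type
    fake_tensor passthrough {"token_embd.weight"};

    std::vector<tensor_transformation> trans = {
        { &base,        &merged,      false }, // has lora_a/lora_b -> needs merging
        { &passthrough, &passthrough, true  }, // no adapter tensors -> copied verbatim
    };

    size_t n_merged = 0;
    for (auto & it : trans) {
        if (!it.is_copy) {
            printf("merge: %s\n", it.in->name.c_str());
            n_merged++;
        } else {
            printf("copy : %s\n", it.in->name.c_str());
        }
    }
    printf("merged %zu of %zu tensors\n", n_merged, trans.size());
    return 0;
}
```

Carrying an explicit `is_copy` flag keeps the copy-versus-merge decision visible at the loop that consumes the list, rather than encoding it in a null pointer.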