From: Xuan Son Nguyen
Date: Mon, 15 Jul 2024 18:50:47 +0000 (+0200)
Subject: Refactor lora adapter support (llama/8332)
X-Git-Tag: upstream/0.0.1642~513
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=295450e73422bfec3d5294f3962fb05f9e845df8;p=pkg%2Fggml%2Fsources%2Fggml

Refactor lora adapter support (llama/8332)

* lora: load to devide buft

* add patch tensor function

* correct tensor patch

* llama_lora_adapter_apply

* correct ggml_backend_tensor_copy

* add llm_build_mm

* fix auto merge

* update based on review comments

* add convert script

* no more transpose A

* add f16 convert

* add metadata check

* add sanity check

* fix ftype

* add requirements

* fix requirements

* fix outfile

* conversion: only allow selected models

* fix types

* cuda : do not use dmmv if the tensor does not have enough cols

* llama : lora fixes

* do not disable mmap with lora

Co-authored-by: slaren

* llm_build_lora_mm_id

* convert_lora : MoE LoRA conversion support

* convert_lora : prefer safetensors, similarly to convert_hf

* convert_hf : simplify modify_tensors for InternLM2

* convert_lora : lazy conversion

* llama : load and use alpha from LoRA adapters

* llama : use llm_build_lora_mm in most model graphs

* auto scale

* Revert "auto scale"

This reverts commit 42415a4874e0f963e4aca6796ea5dfb97cd17464.

* remove redundant params

* Apply suggestions from code review

Co-authored-by: slaren

* change kv metadata

* move add_type to __init__

* convert_hf : move add_type to main()

* convert_lora : use the GGUFWriter from Model instead of overwriting it

---------

Co-authored-by: slaren
Co-authored-by: Francis Couture-Harpin
---

diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu
index ed784ea1..39e345b6 100644
--- a/src/ggml-cuda.cu
+++ b/src/ggml-cuda.cu
@@ -1876,7 +1876,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
 
     bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
+        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2
+        && src1->ne[1] == 1;
     bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
diff --git a/src/ggml.c b/src/ggml.c
index f2c58c37..9e2b1d8c 100644
--- a/src/ggml.c
+++ b/src/ggml.c
@@ -19477,7 +19477,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
 
     fprintf(fp, "digraph G {\n");
     fprintf(fp, "  newrank = true;\n");
-    fprintf(fp, "  rankdir = LR;\n");
+    fprintf(fp, "  rankdir = TB;\n");
 
     for (int i = 0; i < gb->n_nodes; i++) {
         struct ggml_tensor * node = gb->nodes[i];
@@ -19539,7 +19539,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
         }
 
         fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
-        if (ggml_nelements(node) < 5) {
+        if (ggml_nelements(node) < 5 && node->data != NULL) {
             fprintf(fp, " | (");
             for (int j = 0; j < ggml_nelements(node); j++) {
                 if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
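
The CUDA hunk above tightens when the dequantize-mul-mat-vec path is taken: besides being divisible by GGML_CUDA_DMMV_X, src0 must now be at least two blocks wide, which rules out the narrow LoRA matrices introduced by this refactor. Below is a minimal standalone sketch of just the shape conditions; the GGML_CUDA_DMMV_X value of 32 and the `would_use_dmmv` helper are illustrative assumptions, not part of the patch, and the quantization/type checks from the real predicate are omitted.

```c
// Hypothetical sketch (not part of the patch): mirrors the updated dmmv
// shape check, assuming the default GGML_CUDA_DMMV_X of 32.
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define GGML_CUDA_DMMV_X 32  // block width assumed here for illustration

// would_use_dmmv is an illustrative helper, not a ggml API.
static bool would_use_dmmv(int64_t src0_ne0, int64_t src1_ne1) {
    return src0_ne0 % GGML_CUDA_DMMV_X == 0   // columns must be a multiple of the block width
        && src0_ne0 >= GGML_CUDA_DMMV_X * 2   // new condition: at least two blocks wide
        && src1_ne1 == 1;                     // src1 is a single column (a vector)
}

int main(void) {
    // A rank-32 LoRA matrix has only 32 columns: divisible by 32,
    // but below the new 2*GGML_CUDA_DMMV_X floor, so dmmv is skipped.
    printf("ne0=32   -> %s\n", would_use_dmmv(32, 1)   ? "dmmv" : "other kernel");
    printf("ne0=4096 -> %s\n", would_use_dmmv(4096, 1) ? "dmmv" : "other kernel");
    return 0;
}
```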