from math import prod
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
-from transformers import AutoConfig
+from transformers import AutoConfig, AutoTokenizer
import torch
# reuse model definitions from convert_hf_to_gguf.py
from convert_hf_to_gguf import LazyTorchTensor, ModelBase
+from gguf.constants import GGUFValueType
+
logger = logging.getLogger("lora-to-gguf")
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
def set_gguf_parameters(self):
+ logger.debug("GGUF KV: %s = %d", gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
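+ # aLoRA (activated LoRA) adapters only take effect once an invocation token
+ # sequence appears in the prompt; record that sequence in the GGUF metadata
+ # so the runtime can detect it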
+ alora_invocation_tokens = lparams.get("alora_invocation_tokens")
+ invocation_string = lparams.get("invocation_string")
+ if invocation_string and not alora_invocation_tokens:
+ logger.debug("Tokenizing invocation_string -> alora_invocation_tokens")
+ base_model_path_or_id = hparams.get("_name_or_path")
+ try:
+ tokenizer = AutoTokenizer.from_pretrained(base_model_path_or_id)
+ except ValueError:
+ logger.error("Unable to load tokenizer from %s", base_model_path_or_id)
+ raise
+ # NOTE: There's an off-by-one with the older aLoRAs where
+ # the invocation string includes the "<|start_of_turn|>"
+ # token, but the adapters themselves were trained to
+ # activate _after_ that first token, so we drop it here.
+ alora_invocation_tokens = tokenizer(invocation_string)["input_ids"][1:]
+ if alora_invocation_tokens:
+ logger.debug("GGUF KV: %s = %s", gguf.Keys.Adapter.ALORA_INVOCATION_TOKENS, alora_invocation_tokens)
+ self.gguf_writer.add_key_value(
+ gguf.Keys.Adapter.ALORA_INVOCATION_TOKENS,
+ alora_invocation_tokens,
+ GGUFValueType.ARRAY,
+ GGUFValueType.UINT32,
+ )
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
# Never add extra tensors (e.g. rope_freqs) for LoRA adapters
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
int32_t n_predict = -1; // new tokens to predict
- int32_t n_indent = 0; // mininum line indentation for the generated text in number of whitespace characters
+ int32_t n_indent = 0; // minimum line indentation for the generated text in number of whitespace characters
int64_t t_max_prompt_ms = -1; // TODO: implement
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
common_speculative * spec = nullptr;
std::vector<common_adapter_lora_info> lora;
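+ // index into the prompt where the alora invocation sequence starts (-1 when no alora is active)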
+ int32_t alora_invocation_start = -1;
// the index relative to completion multi-task request
size_t index = 0;
// clear speculative decoding stats
n_draft_total = 0;
n_draft_accepted = 0;
+
+ // clear alora start
+ alora_invocation_start = -1;
}
bool need_embd() const {
slot.prompt_tokens = std::move(task.prompt_tokens);
if (!are_lora_equal(slot.params.lora, slot.lora)) {
- // if lora is changed, we cannot reuse cached tokens
- slot.cache_tokens.clear();
+ // if lora has changed, check to see if the cache should be cleared
+ if (lora_should_clear_cache(slot.lora, slot.params.lora)) {
+ SLT_INF(slot, "clearing cache for lora change. %zu loras -> %zu loras\n", slot.lora.size(), slot.params.lora.size());
+ slot.cache_tokens.clear();
+ } else {
+ SLT_INF(slot, "keeping cache for alora. %zu target loras\n", slot.params.lora.size());
+ }
slot.lora = slot.params.lora;
}
+ // if using alora, make sure only a single one is requested and active
+ size_t alora_invocation_start = slot.prompt_tokens.size(); // past-the-end sentinel until the invocation sequence is found
+ if (lora_all_alora(slot.lora)) {
+
+ const auto & enabled_ids = lora_get_enabled_ids(slot.lora);
+ // TODO: This will error out if a user requests two aloras, but only
+ // provides the activation string for one. We could instead search
+ // for all requested alora activation strings and then either keep
+ // only the last one or reject if multiple are found.
+ if (enabled_ids.size() != 1) {
+ send_error(task, "Cannot run multiple aLoRAs in a single request", ERROR_TYPE_INVALID_REQUEST);
+ return false;
+ }
+ const auto & lora = slot.lora[enabled_ids[0]].ptr;
+
+ // get the pointer and count for the invocation tokens
+ const uint64_t n_invocation_tokens = llama_adapter_get_alora_n_invocation_tokens(lora);
+ const llama_token * invocation_tokens = llama_adapter_get_alora_invocation_tokens (lora);
+
+ // scan backwards through the prompt tokens to find the last
+ // occurrence of the invocation sequence
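+ // e.g. for invocation tokens [A, B, C] and a prompt ending in [..., A, B, C, x, y]:
+ // the scan walks y and x (resetting match_idx), then matches C (match_idx 2 -> 1),
+ // B (1 -> 0), and finally A, so alora_invocation_start is set to A's index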
+ int match_idx = static_cast<int>(n_invocation_tokens) - 1;
+ for (int i = slot.prompt_tokens.size() - 1; i >= 0; --i) {
+ // the token in this position matches the next token to find in
+ // the invocation sequence
+ if (slot.prompt_tokens[i] == invocation_tokens[match_idx]) {
+ // if it's a full match, we've found the start
+ if (match_idx == 0) {
+ alora_invocation_start = i;
+ break;
+ }
+ // otherwise, check the next token in the sequence
+ --match_idx;
+ } else {
+ // no match at this position, so restart the match from the end of the sequence
+ match_idx = static_cast<int>(n_invocation_tokens) - 1;
+ }
+ }
+
+ // if the activation string is not found, disable the alora
+ if (alora_invocation_start == slot.prompt_tokens.size()) {
+ SLT_DBG(slot, "alora %zu requested, but not found. deactivating\n", enabled_ids[0]);
+ slot.lora[enabled_ids[0]].scale = 0.0f;
+ } else {
+ SLT_DBG(slot, "alora %zu activated starting at %zu\n", enabled_ids[0], alora_invocation_start);
+ slot.alora_invocation_start = alora_invocation_start;
+ }
+ }
+
if (!slot.prompt_tokens.validate(ctx)) {
send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST);
return false;
int32_t n_ubatch = llama_n_ubatch(ctx);
// next, batch any pending prompts without exceeding n_batch
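+ // when an alora has uncached tokens before its invocation sequence, its scale is
+ // stashed in alora_scale and zeroed so those tokens are decoded without the adapter,
+ // then restored once the adapters have been applied for the batch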
+ float alora_scale = -1.0f;
+ size_t alora_disabled_id = 0;
if (params_base.cont_batching || batch.n_tokens == 0) {
for (auto & slot : slots) {
// check if we can batch this slot with the previous one
// reuse any previously computed tokens that are common with the new prompt
slot.n_past = slot.cache_tokens.get_common_prefix(prompt_tokens);
+ // if an alora is invoked, don't reuse the cache past the invocation start
+ if (slot.alora_invocation_start >= 0) {
+ SLT_DBG(slot, "only caching to alora invocation start (n_past=%d, alora_invocation_start=%d)\n", slot.n_past, slot.alora_invocation_start);
+ slot.n_past = std::min(slot.n_past, slot.alora_invocation_start - 1);
+ }
+
// reuse chunks from the cached prompt by shifting their KV cache in the new position
if (params_base.n_cache_reuse > 0) {
size_t head_c = slot.n_past; // cache
slot.n_prompt_tokens_processed += n_pos;
}
+ // If using an alora, there may be uncached tokens that come
+ // before the invocation sequence. When this happens, the
+ // tokens before the invocation sequence need to be
+ // processed without the adapter in a separate batch, then
+ // the adapter needs to be enabled for the remaining tokens.
+ if (lora_all_alora(slot.lora) && slot.alora_invocation_start - 1 > slot.n_past) {
+ SLT_DBG(slot, "processing pre-alora tokens without the adapter (n_past = %d, alora_invocation_start = %d)\n", slot.n_past, slot.alora_invocation_start);
+ const auto & enabled_loras = lora_get_enabled_ids(slot.lora);
+ GGML_ASSERT(enabled_loras.size() == 1);
+ alora_scale = slot.lora[enabled_loras[0]].scale;
+ slot.lora[enabled_loras[0]].scale = 0.0f;
+ alora_disabled_id = enabled_loras[0];
+ }
+
// add prompt tokens for processing in the current batch
while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
// get next token to process
break; // end of text chunk
}
+ // if this is an alora request with pre-invocation
+ // tokens that are not cached, we need to stop filling
+ // this batch at those pre-invocation tokens.
+ if (alora_scale > 0 && slot.n_past == slot.alora_invocation_start - 1) {
+ SLT_DBG(slot, "stop prompt batch filling at (n_past = %d, alora_invocation_start = %d)\n", slot.n_past, slot.alora_invocation_start);
+ break;
+ }
+
// embedding requires all tokens in the batch to be output
const bool need_embd = server_task_type_need_embd(slot.task_type);
// apply lora, only need to do it once per batch
common_set_adapter_lora(ctx, slot_batched->lora);
+ // if the lora is temporarily disabled for an alora, re-enable it
+ // for next time
+ if (alora_scale > 0.0f) {
+ SRV_DBG("re-enabling alora with scale %f\n", alora_scale);
+ slot_batched->lora[alora_disabled_id].scale = alora_scale;
+ }
+
llama_set_embeddings(ctx, slot_batched->need_embd());
}
const auto & loras = ctx_server.params_base.lora_adapters;
for (size_t i = 0; i < loras.size(); ++i) {
auto & lora = loras[i];
- result.push_back({
+ json entry = {
{"id", i},
{"path", lora.path},
{"scale", lora.scale},
{"task_name", lora.task_name},
{"prompt_prefix", lora.prompt_prefix},
- });
+ };
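+ // aloras additionally report their invocation string and token ids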
+ std::string alora_invocation_string = "";
+ const uint64_t n_alora_tokens = llama_adapter_get_alora_n_invocation_tokens(lora.ptr);
+ std::vector<llama_token> alora_invocation_tokens;
+ if (n_alora_tokens) {
+ const llama_token * alora_tokens = llama_adapter_get_alora_invocation_tokens(lora.ptr);
+ for (uint64_t tok_idx = 0; tok_idx < n_alora_tokens; ++tok_idx) {
+ alora_invocation_string += common_token_to_piece(ctx_server.ctx, alora_tokens[tok_idx]);
+ alora_invocation_tokens.push_back(alora_tokens[tok_idx]);
+ }
+ entry["alora_invocation_string"] = alora_invocation_string;
+ entry["alora_invocation_tokens"] = alora_invocation_tokens;
+ }
+ result.push_back(std::move(entry));
}
res_ok(res, result);
res.status = 200; // HTTP OK