`tool-call`: fix llama 3.x and functionary 3.2, play nice w/ pydantic_ai package...

author Olivier Chafik <redacted>

Fri, 31 Jan 2025 14:15:25 +0000 (14:15 +0000)

committer GitHub <redacted>

Fri, 31 Jan 2025 14:15:25 +0000 (14:15 +0000)
author Olivier Chafik <redacted>
Fri, 31 Jan 2025 14:15:25 +0000 (14:15 +0000)
committer GitHub <redacted>
Fri, 31 Jan 2025 14:15:25 +0000 (14:15 +0000)
diff --git a/common/chat-template.hpp b/common/chat-template.hpp

index 75ba5d938f8cff5cd5093ee45fc0fa7867e918e3..58e119a3bcdb37a7e8f532184a12e875976a2e2e 100644 (file)
--- a/common/chat-template.hpp
+++ b/common/chat-template.hpp
@@ -283,10 +283,12 @@ class chat_template {
                      message["role"] = "user";
                      auto obj = json {
                          {"tool_response", {
-                            {"tool", message.at("name")},
                              {"content", message.at("content")},
                          }},
                      };
+                    if (message.contains("name")) {
+                        obj["tool_response"]["name"] = message.at("name");
+                    }
                      if (message.contains("tool_call_id")) {
                          obj["tool_response"]["tool_call_id"] = message.at("tool_call_id");
                      }
diff --git a/common/chat.cpp b/common/chat.cpp

index d9a654892ca2adcc5a056a8d4298b9ecf20c77d5..58db12af9a0c233124c1496c86b3b5da08caeda1 100644 (file)
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -384,14 +384,19 @@ static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const com
              tool_rules.push_back(
                  builder.add_rule(
                      name + "-call",
-                    "\"{\" ( \"\\\"type\\\": \\\"function\\\", \" | space ) "
+                    "\"{\" space "
+                    "( \"\\\"type\\\":\" space \"\\\"function\\\",\" space )? "
                      "\"\\\"name\\\": \\\"" + name + "\\\", \\\"parameters\\\": \" " +
                          builder.add_schema(name + "-args", parameters) +
                      " \"}\""));
              data.grammar_triggers.push_back({"{\"name\": \"" + name + "\"", /* .at_start = */ true});
          });
          data.grammar_triggers.push_back({"{\"name\":", /* .at_start = */ true});
+        data.grammar_triggers.push_back({"{\n  \"name\":", /* .at_start = */ true});
+        data.grammar_triggers.push_back({"{\n    \"name\":", /* .at_start = */ true});
          data.grammar_triggers.push_back({"{\"type\": \"function\"", /* .at_start = */ true});
+        data.grammar_triggers.push_back({"{\n  \"type\": \"function\"", /* .at_start = */ true});
+        data.grammar_triggers.push_back({"{\n    \"type\": \"function\"", /* .at_start = */ true});
          if (!builtin_tools.empty()) {
              data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false});
          }
@@ -586,9 +591,17 @@ static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & in
          }
      }
      // TODO: tighten & simplify.
-    auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex);
-    res.content = content;
-    return res;
+    try {
+        auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex);
+        res.content = content + res.content;
+        return res;
+    } catch (const std::exception & e) {
+        LOG_ERR("Failed to parse functionary v3.2 input: %s\n", e.what());
+        common_chat_msg res;
+        res.role = "assistant";
+        res.content = input;
+        return res;
+    }
  }
  
  static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
diff --git a/examples/server/README.md b/examples/server/README.md

index ce1ae88585a2b89835ad833a4ae46128eaec18bc..276b43013960059b16131d5db64c3154d37ee65d 100644 (file)
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -126,7 +126,7 @@ The project is under active development, and we are [looking for feedback and co
  | `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
  | `--grammar-file FNAME` | file to read grammar from |
  | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
-| `--jinja` | Enable experimental Jinja templating engine (needed for tool use) |
+| `--jinja` | Enable experimental Jinja templating engine (required for tool use) |
  
  **Example-specific params**
  
@@ -1069,7 +1069,7 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
  
  *Options:*
  
-See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
+See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). llama.cpp `/completion`-specific features such as `mirostat` are also supported.
  
  The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name",  "type": "string" }, "date": { "title": "Date",  "type": "string" }, "participants": { "items": {"type": "string" }, "title": "Participants",  "type": "string" } } } }`), similar to other OpenAI-inspired API providers.
  
@@ -1117,17 +1117,111 @@ curl http://localhost:8080/v1/chat/completions \
  }'
  ```
  
-... and even tool usage (needs `--jinja` flag):
+*Tool call support*
  
-  ```shell
-  llama-server --jinja -hfr lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF -hff Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf -fa
+[Function calling](https://platform.openai.com/docs/guides/function-calling) is supported for all models (see https://github.com/ggerganov/llama.cpp/pull/9639):
+
+- Requires `--jinja` flag
+- Native tool call formats supported:
+  - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
+  - Functionary v3.1 / v3.2
+  - Hermes 2/3, Qwen 2.5
+  - Mistral Nemo
+  - Firefunction v2
+  - DeepSeek R1 (WIP / seems reluctant to call any tools?)
+
+  <details>
+  <summary>Show some common templates and which format handler they use</summary>
+
+  | Template | Format |
+  |----------|--------|
+  | CohereForAI-c4ai-command-r-plus-default.jinja | generic tool calls |
+  | CohereForAI-c4ai-command-r-plus-rag.jinja | generic tool calls |
+  | CohereForAI-c4ai-command-r-plus-tool_use.jinja | generic tool calls |
+  | MiniMaxAI-MiniMax-Text-01.jinja | generic tool calls |
+  | NexaAIDev-Octopus-v2.jinja | generic tool calls |
+  | NousResearch-Hermes-2-Pro-Llama-3-8B-default.jinja | generic tool calls |
+  | NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja | hermes 2 pro tool calls |
+  | NousResearch-Hermes-2-Pro-Mistral-7B-default.jinja | generic tool calls |
+  | NousResearch-Hermes-2-Pro-Mistral-7B-tool_use.jinja | hermes 2 pro tool calls |
+  | NousResearch-Hermes-3-Llama-3.1-70B-default.jinja | generic tool calls |
+  | NousResearch-Hermes-3-Llama-3.1-70B-tool_use.jinja | hermes 2 pro tool calls |
+  | OrionStarAI-Orion-14B-Chat.jinja | generic tool calls |
+  | Qwen-QwQ-32B-Preview.jinja | hermes 2 pro tool calls |
+  | Qwen-Qwen2-7B-Instruct.jinja | generic tool calls |
+  | Qwen-Qwen2-VL-7B-Instruct.jinja | generic tool calls |
+  | Qwen-Qwen2.5-7B-Instruct.jinja | hermes 2 pro tool calls |
+  | Qwen-Qwen2.5-Math-7B-Instruct.jinja | hermes 2 pro tool calls |
+  | TheBloke-FusionNet_34Bx2_MoE-AWQ.jinja | generic tool calls |
+  | abacusai-Fewshot-Metamath-OrcaVicuna-Mistral.jinja | generic tool calls |
+  | bofenghuang-vigogne-2-70b-chat.jinja | generic tool calls |
+  | databricks-dbrx-instruct.jinja | generic tool calls |
+  | deepseek-ai-DeepSeek-Coder-V2-Instruct.jinja | generic tool calls |
+  | deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja | deepseek r1 tool calls |
+  | deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja | deepseek r1 tool calls |
+  | deepseek-ai-DeepSeek-R1-Distill-Qwen-7B.jinja | deepseek r1 tool calls |
+  | deepseek-ai-DeepSeek-V2.5.jinja | deepseek r1 tool calls |
+  | deepseek-ai-deepseek-coder-33b-instruct.jinja | generic tool calls |
+  | google-gemma-2-2b-it.jinja | generic tool calls |
+  | google-gemma-7b-it.jinja | generic tool calls |
+  | indischepartij-MiniCPM-3B-OpenHermes-2.5-v2.jinja | generic tool calls |
+  | mattshumer-Reflection-Llama-3.1-70B.jinja | generic tool calls |
+  | meetkai-functionary-medium-v3.2.jinja | functionary v3.2 tool calls |
+  | meta-llama-Llama-3.1-8B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) |
+  | meta-llama-Llama-3.2-3B-Instruct.jinja | llama 3.x tool calls |
+  | meta-llama-Llama-3.3-70B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) |
+  | meta-llama-Meta-Llama-3.1-8B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) |
+  | microsoft-Phi-3-medium-4k-instruct.jinja | generic tool calls |
+  | microsoft-Phi-3-mini-4k-instruct.jinja | generic tool calls |
+  | microsoft-Phi-3-small-8k-instruct.jinja | generic tool calls |
+  | microsoft-Phi-3.5-mini-instruct.jinja | generic tool calls |
+  | microsoft-Phi-3.5-vision-instruct.jinja | generic tool calls |
+  | mistralai-Mistral-7B-Instruct-v0.2.jinja | generic tool calls |
+  | mistralai-Mistral-Large-Instruct-2407.jinja | mistral nemo tool calls |
+  | mistralai-Mistral-Large-Instruct-2411.jinja | generic tool calls |
+  | mistralai-Mistral-Nemo-Instruct-2407.jinja | mistral nemo tool calls |
+  | mistralai-Mixtral-8x7B-Instruct-v0.1.jinja | generic tool calls |
+  | mlabonne-AlphaMonarch-7B.jinja | generic tool calls |
+  | nvidia-Llama-3.1-Nemotron-70B-Instruct-HF.jinja | llama 3.x tool calls (w/ builtin tools) |
+  | openchat-openchat-3.5-0106.jinja | generic tool calls |
+  | teknium-OpenHermes-2.5-Mistral-7B.jinja | generic tool calls |
+
+  This table can be generated with:
  
-  # https://huggingface.co/meetkai/functionary-medium-v3.2
-  llama-server --jinja -hfr bartowski/functionary-medium-v3.2-GGUF -hff functionary-medium-v3.2-IQ4_XS.gguf -fa
+  ```bash
+  ./build/bin/test-chat ../minja/build/tests/*.jinja 2>/dev/null
+  ```
+  </details>
  
-  # https://huggingface.co/meetkai/functionary-medium-v3.1
-  llama-server --jinja -hfr meetkai/functionary-medium-v3.1-GGUF -hff functionary-medium-llama-3.1.Q4_0.gguf -fa
+- Generic tool call is supported when the template isn't recognized by native format handlers (you'll see `Chat format: Generic` in the logs).
+  - Use `--chat-template-file` to override the template when appropriate (see examples below)
+  - Generic support may consume more tokens and be less efficient than a model's native format.
  
+- Run with:
+
+  ```shell
+  # Native support:
+  llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M
+  llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M
+  llama-server --jinja -fa -hf bartowski/Llama-3.2-3B-Instruct-GGUF:Q6_K
+  llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M
+  llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \
+    --chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B )
+
+  # Native support requires the right template for these GGUFs:
+  llama-server --jinja -fa -hf bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M \
+    --chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use )
+  llama-server --jinja -fa -hf bartowski/firefunction-v2-GGUF -hff firefunction-v2-IQ1_M.gguf \
+    --chat-template-file <( python scripts/get_chat_template.py fireworks-ai/firellama-3-firefunction-v2 )
+
+  # Generic format support
+  llama-server --jinja -fa -hf bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M
+  llama-server --jinja -fa -hf bartowski/gemma-2-2b-it-GGUF:Q4_K_M
+  ```
+
+- Test in CLI:
+
+  ```bash
    curl http://localhost:8080/v1/chat/completions -d '{
      "model": "gpt-3.5-turbo",
      "tools": [
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

index e7daceef1db31fb34647a621fb283e0fc2087624..3451e96a2b163d3ee791a07df27af96e2c8f35e4 100644 (file)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -345,7 +345,7 @@ struct server_task {
              auto it = data.find("chat_format");
              if (it != data.end()) {
                  params.oaicompat_chat_format = static_cast<common_chat_format>(it->get<int>());
-                LOG_DBG("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
+                LOG_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
              } else {
                  params.oaicompat_chat_format = defaults.oaicompat_chat_format;
              }
@@ -697,6 +697,7 @@ struct server_task_result_cmpl_final : server_task_result {
          std::string finish_reason = "length";
          common_chat_msg message;
          if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
+            LOG_DBG("Parsing chat message: %s\n", content.c_str());
              message = common_chat_parse(content, oaicompat_chat_format);
              finish_reason = message.tool_calls.empty() ? "stop" : "tool_calls";
          } else {
@@ -713,7 +714,7 @@ struct server_task_result_cmpl_final : server_task_result {
                          {"name", tc.name},
                          {"arguments", tc.arguments},
                      }},
-                    {"id", tc.id.empty() ? json() : json(tc.id)},
+                    {"id", tc.id},
                  });
              }
          }
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp

index 94e189457a2fb607d82ba5d0344d062362d4955d..bfe623c4c31f8ddd27a7787f71223cc6d924b0ad 100644 (file)
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -641,6 +641,10 @@ static json oaicompat_completion_params_parse(
          inputs.tools = tools;
          inputs.tool_choice = tool_choice;
          inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
+        if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) {
+            LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n");
+            inputs.parallel_tool_calls = false;
+        }
          inputs.stream = stream;
          // TODO: support mixing schema w/ tools beyond generic format.
          inputs.json_schema = json_value(llama_params, "json_schema", json());
author	Olivier Chafik <redacted>
	Fri, 31 Jan 2025 14:15:25 +0000 (14:15 +0000)
committer	GitHub <redacted>
	Fri, 31 Jan 2025 14:15:25 +0000 (14:15 +0000)
common/chat-template.hpp		patch \| blob \| history
common/chat.cpp		patch \| blob \| history
examples/server/README.md		patch \| blob \| history
examples/server/server.cpp		patch \| blob \| history
examples/server/utils.hpp		patch \| blob \| history