/common/arg.* @ggerganov
/common/base64.hpp.* @ggerganov
/common/build-info.* @ggerganov
+/common/chat-peg-parser.* @aldehir
/common/common.* @ggerganov
/common/console.* @ggerganov
/common/http.* @angt
/common/llguidance.* @ggerganov
/common/log.* @ggerganov
+/common/peg-parser.* @aldehir
/common/sampling.* @ggerganov
/common/speculative.* @ggerganov
+/common/unicode.* @aldehir
/convert_*.py @CISC
/examples/batched.swift/ @ggerganov
/examples/batched/ @ggerganov
chat-parser.h
chat-parser-xml-toolcall.h
chat-parser-xml-toolcall.cpp
+ chat-peg-parser.cpp
+ chat-peg-parser.h
chat.cpp
chat.h
common.cpp
log.h
ngram-cache.cpp
ngram-cache.h
+ peg-parser.cpp
+ peg-parser.h
regex-partial.cpp
regex-partial.h
sampling.cpp
sampling.h
speculative.cpp
speculative.h
+ unicode.cpp
+ unicode.h
)
if (BUILD_SHARED_LIBS)
#include "chat-parser.h"
+#include "chat-peg-parser.h"
#include "common.h"
#include "log.h"
+#include "peg-parser.h"
#include "regex-partial.h"
#include <algorithm>
}
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
+ if (syntax.format == COMMON_CHAT_FORMAT_PEG_SIMPLE ||
+ syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE ||
+ syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) {
+ return common_chat_peg_parse(syntax.parser, input, is_partial, syntax);
+ }
common_chat_msg_parser builder(input, is_partial, syntax);
try {
common_chat_parse(builder);
}
return msg;
}
+
+common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
+ if (parser.empty()) {
+ throw std::runtime_error("Failed to parse due to missing parser definition.");
+ }
+
+ LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(syntax.format), input.c_str());
+
+ common_peg_parse_context ctx(input, is_partial);
+ auto result = parser.parse(ctx);
+ if (result.fail()) {
+ throw std::runtime_error(std::string("Failed to parse input at pos ") + std::to_string(result.end));
+ }
+
+ common_chat_msg msg;
+ msg.role = "assistant";
+
+ if (syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE) {
+ auto mapper = common_chat_peg_native_mapper(msg);
+ mapper.from_ast(ctx.ast, result);
+ } else if (syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) {
+ auto mapper = common_chat_peg_constructed_mapper(msg);
+ mapper.from_ast(ctx.ast, result);
+ } else {
+ // Generic mapper
+ auto mapper = common_chat_peg_mapper(msg);
+ mapper.from_ast(ctx.ast, result);
+ }
+ if (!is_partial) {
+ LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
+ }
+ return msg;
+}
--- /dev/null
+#include "chat-peg-parser.h"
+
+#include <nlohmann/json.hpp>
+
+using json = nlohmann::json;
+
// Returns the prefix of `sv` that remains after dropping every trailing whitespace byte
// (as classified by std::isspace). The view still refers to the caller's buffer.
static std::string_view trim_trailing_space(std::string_view sv) {
    size_t keep = sv.size();
    while (keep > 0 && std::isspace(static_cast<unsigned char>(sv[keep - 1]))) {
        --keep;
    }
    sv.remove_suffix(sv.size() - keep);
    return sv;
}
+
+void common_chat_peg_mapper::from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result) {
+ arena.visit(result, [this](const common_peg_ast_node & node) {
+ map(node);
+ });
+}
+
+void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
+ bool is_reasoning = node.tag == common_chat_peg_builder::REASONING;
+ bool is_content = node.tag == common_chat_peg_builder::CONTENT;
+
+ if (is_reasoning) {
+ result.reasoning_content = std::string(trim_trailing_space(node.text));
+ }
+
+ if (is_content) {
+ result.content = std::string(trim_trailing_space(node.text));
+ }
+}
+
+void common_chat_peg_native_mapper::map(const common_peg_ast_node & node) {
+ common_chat_peg_mapper::map(node);
+
+ bool is_tool_open = node.tag == common_chat_peg_native_builder::TOOL_OPEN;
+ bool is_tool_name = node.tag == common_chat_peg_native_builder::TOOL_NAME;
+ bool is_tool_id = node.tag == common_chat_peg_native_builder::TOOL_ID;
+ bool is_tool_args = node.tag == common_chat_peg_native_builder::TOOL_ARGS;
+
+ if (is_tool_open) {
+ result.tool_calls.emplace_back();
+ current_tool = &result.tool_calls.back();
+ }
+
+ if (is_tool_id && current_tool) {
+ current_tool->id = std::string(trim_trailing_space(node.text));
+ }
+
+ if (is_tool_name && current_tool) {
+ current_tool->name = std::string(trim_trailing_space(node.text));
+ }
+
+ if (is_tool_args && current_tool) {
+ current_tool->arguments = std::string(trim_trailing_space(node.text));
+ }
+}
+
+void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
+ common_chat_peg_mapper::map(node);
+
+ bool is_tool_open = node.tag == common_chat_peg_constructed_builder::TOOL_OPEN;
+ bool is_tool_name = node.tag == common_chat_peg_constructed_builder::TOOL_NAME;
+ bool is_tool_close = node.tag == common_chat_peg_constructed_builder::TOOL_CLOSE;
+ bool is_arg_open = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_OPEN;
+ bool is_arg_close = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_CLOSE;
+ bool is_arg_name = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_NAME;
+ bool is_arg_string = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_STRING_VALUE;
+ bool is_arg_json = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_JSON_VALUE;
+
+ if (is_tool_open) {
+ result.tool_calls.emplace_back();
+ current_tool = &result.tool_calls.back();
+ arg_count = 0;
+ }
+
+ if (is_tool_name) {
+ current_tool->name = std::string(node.text);
+ current_tool->arguments = "{";
+ }
+
+ if (is_arg_open) {
+ needs_closing_quote = false;
+ }
+
+ if (is_arg_name && current_tool) {
+ if (arg_count > 0) {
+ current_tool->arguments += ",";
+ }
+ current_tool->arguments += json(trim_trailing_space(node.text)).dump() + ":";
+ ++arg_count;
+ }
+
+ if (is_arg_string && current_tool) {
+ // Serialize to JSON, but exclude the end quote
+ std::string dumped = json(node.text).dump();
+ current_tool->arguments += dumped.substr(0, dumped.size() - 1);
+ needs_closing_quote = true;
+ }
+
+ if (is_arg_close && current_tool) {
+ if (needs_closing_quote) {
+ current_tool->arguments += "\"";
+ }
+ }
+
+ if (is_arg_json && current_tool) {
+ current_tool->arguments += std::string(trim_trailing_space(node.text));
+ }
+
+ if (is_tool_close && current_tool) {
+ current_tool->arguments += "}";
+ }
+}
--- /dev/null
+#pragma once
+
+#include "chat.h"
+#include "peg-parser.h"
+
+class common_chat_peg_builder : public common_peg_parser_builder {
+ public:
+ static constexpr const char * REASONING_BLOCK = "reasoning-block";
+ static constexpr const char * REASONING = "reasoning";
+ static constexpr const char * CONTENT = "content";
+
+ common_peg_parser reasoning_block(const common_peg_parser & p) { return tag(REASONING_BLOCK, p); }
+ common_peg_parser reasoning(const common_peg_parser & p) { return tag(REASONING, p); }
+ common_peg_parser content(const common_peg_parser & p) { return tag(CONTENT, p); }
+};
+
+inline common_peg_arena build_chat_peg_parser(const std::function<common_peg_parser(common_chat_peg_builder & builder)> & fn) {
+ common_chat_peg_builder builder;
+ builder.set_root(fn(builder));
+ return builder.build();
+}
+
+class common_chat_peg_mapper {
+ public:
+ common_chat_msg & result;
+
+ common_chat_peg_mapper(common_chat_msg & msg) : result(msg) {}
+
+ virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
+ virtual void map(const common_peg_ast_node & node);
+};
+
+class common_chat_peg_native_builder : public common_chat_peg_builder {
+ public:
+ static constexpr const char * TOOL = "tool";
+ static constexpr const char * TOOL_OPEN = "tool-open";
+ static constexpr const char * TOOL_CLOSE = "tool-close";
+ static constexpr const char * TOOL_ID = "tool-id";
+ static constexpr const char * TOOL_NAME = "tool-name";
+ static constexpr const char * TOOL_ARGS = "tool-args";
+
+ common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
+ common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); }
+ common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
+ common_peg_parser tool_id(const common_peg_parser & p) { return atomic(tag(TOOL_ID, p)); }
+ common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
+ common_peg_parser tool_args(const common_peg_parser & p) { return tag(TOOL_ARGS, p); }
+};
+
+class common_chat_peg_native_mapper : public common_chat_peg_mapper {
+ common_chat_tool_call * current_tool;
+
+ public:
+ common_chat_peg_native_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
+
+ void map(const common_peg_ast_node & node) override;
+};
+
+inline common_peg_arena build_chat_peg_native_parser(const std::function<common_peg_parser(common_chat_peg_native_builder & builder)> & fn) {
+ common_chat_peg_native_builder builder;
+ builder.set_root(fn(builder));
+ return builder.build();
+}
+
+class common_chat_peg_constructed_builder : public common_chat_peg_builder {
+ public:
+ static constexpr const char * TOOL = "tool";
+ static constexpr const char * TOOL_OPEN = "tool-open";
+ static constexpr const char * TOOL_CLOSE = "tool-close";
+ static constexpr const char * TOOL_NAME = "tool-name";
+ static constexpr const char * TOOL_ARG = "tool-arg";
+ static constexpr const char * TOOL_ARG_OPEN = "tool-arg-open";
+ static constexpr const char * TOOL_ARG_CLOSE = "tool-arg-close";
+ static constexpr const char * TOOL_ARG_NAME = "tool-arg-name";
+ static constexpr const char * TOOL_ARG_STRING_VALUE = "tool-arg-string-value";
+ static constexpr const char * TOOL_ARG_JSON_VALUE = "tool-arg-json-value";
+
+ common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
+ common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); }
+ common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
+ common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
+ common_peg_parser tool_arg(const common_peg_parser & p) { return tag(TOOL_ARG, p); }
+ common_peg_parser tool_arg_open(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_OPEN, p)); }
+ common_peg_parser tool_arg_close(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_CLOSE, p)); }
+ common_peg_parser tool_arg_name(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_NAME, p)); }
+ common_peg_parser tool_arg_string_value(const common_peg_parser & p) { return tag(TOOL_ARG_STRING_VALUE, p); }
+ common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return tag(TOOL_ARG_JSON_VALUE, p); }
+};
+
+class common_chat_peg_constructed_mapper : public common_chat_peg_mapper {
+ common_chat_tool_call * current_tool;
+ int arg_count = 0;
+ bool needs_closing_quote = false;
+
+ public:
+ common_chat_peg_constructed_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
+
+ void map(const common_peg_ast_node & node) override;
+};
+
+inline common_peg_arena build_chat_peg_constructed_parser(const std::function<common_peg_parser(common_chat_peg_constructed_builder & builder)> & fn) {
+ common_chat_peg_constructed_builder builder;
+ builder.set_root(fn(builder));
+ return builder.build();
+}
case COMMON_CHAT_FORMAT_QWEN3_CODER_XML: return "Qwen3 Coder";
case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
+ case COMMON_CHAT_FORMAT_PEG_SIMPLE: return "peg-simple";
+ case COMMON_CHAT_FORMAT_PEG_NATIVE: return "peg-native";
+ case COMMON_CHAT_FORMAT_PEG_CONSTRUCTED: return "peg-constructed";
default:
throw std::runtime_error("Unknown chat format");
}
#pragma once
#include "common.h"
+#include "peg-parser.h"
#include <functional>
#include <chrono>
#include <string>
COMMON_CHAT_FORMAT_APRIEL_1_5,
COMMON_CHAT_FORMAT_XIAOMI_MIMO,
+ // These are intended to be parsed by the PEG parser
+ COMMON_CHAT_FORMAT_PEG_SIMPLE,
+ COMMON_CHAT_FORMAT_PEG_NATIVE,
+ COMMON_CHAT_FORMAT_PEG_CONSTRUCTED,
+
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};
std::vector<common_grammar_trigger> grammar_triggers;
std::vector<std::string> preserved_tokens;
std::vector<std::string> additional_stops;
+ std::string parser;
};
struct common_chat_syntax {
bool reasoning_in_content = false;
bool thinking_forced_open = false;
bool parse_tool_calls = true;
+ common_peg_arena parser = {};
};
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
const char* common_reasoning_format_name(common_reasoning_format format);
common_reasoning_format common_reasoning_format_from_name(const std::string & format);
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
+common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_syntax & syntax);
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
--- /dev/null
+#include "common.h"
+#include "peg-parser.h"
+#include "json-schema-to-grammar.h"
+#include "unicode.h"
+
+#include <nlohmann/json.hpp>
+
+#include <algorithm>
+#include <initializer_list>
+#include <map>
+#include <memory>
+#include <regex>
+#include <stdexcept>
+#include <unordered_set>
+
+// Trick to catch missing branches
+template <typename T>
+inline constexpr bool is_always_false_v = false;
+
+const char * common_peg_parse_result_type_name(common_peg_parse_result_type type) {
+ switch (type) {
+ case COMMON_PEG_PARSE_RESULT_FAIL: return "fail";
+ case COMMON_PEG_PARSE_RESULT_SUCCESS: return "success";
+ case COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT: return "need_more_input";
+ default: return "unknown";
+ }
+}
+
// True when `c` is an ASCII hexadecimal digit: 0-9, a-f or A-F.
static bool is_hex_digit(const char c) {
    if ('0' <= c && c <= '9') {
        return true;
    }
    if ('a' <= c && c <= 'f') {
        return true;
    }
    return 'A' <= c && c <= 'F';
}
+
// Trie for matching multiple literals.
// This is used in common_peg_until_parser and to build a GBNF exclusion grammar
//
// Nodes live in a flat vector and refer to each other by index; index 0 is the root.
// Keys are raw bytes (unsigned char), so multi-byte UTF-8 literals are matched byte-wise.
struct trie {
    struct node {
        size_t depth = 0;                          // distance from the root, in bytes
        std::map<unsigned char, size_t> children;  // next byte -> index into `nodes`
        bool is_word = false;                      // true if an inserted word ends at this node
    };

    std::vector<node> nodes;

    trie(const std::vector<std::string> & words) {
        create_node(); // root node
        for (const auto & w : words) {
            insert(w);
        }
    }

    enum match_result { NO_MATCH, PARTIAL_MATCH, COMPLETE_MATCH };

    // Check if a delimiter starts at the given position.
    //   COMPLETE_MATCH: some inserted word is a prefix of sv[start_pos..] (reported as soon as
    //                   the shortest such word has been consumed)
    //   PARTIAL_MATCH:  sv ended while at least one byte had matched a word prefix
    //   NO_MATCH:       a byte did not continue any word, or there was nothing to scan
    match_result check_at(std::string_view sv, size_t start_pos) const {
        size_t current = 0; // Start at root

        for (size_t pos = start_pos; pos < sv.size(); ++pos) {
            const auto & children = nodes[current].children;
            // Look the byte up as unsigned so bytes >= 0x80 match the stored keys explicitly.
            const auto it = children.find(static_cast<unsigned char>(sv[pos]));
            if (it == children.end()) {
                // Can't continue matching
                return NO_MATCH;
            }

            current = it->second;

            // Check if we've matched a complete word
            if (nodes[current].is_word) {
                return COMPLETE_MATCH;
            }
        }

        // Reached end of input: away from the root means we are in the middle of a potential
        // match, at the root means nothing matched at all.
        return current != 0 ? PARTIAL_MATCH : NO_MATCH;
    }

    struct prefix_and_next {
        std::string prefix;
        std::string next_chars;
    };

    // For every non-terminal node that has children, reports the byte prefix leading to it
    // together with the set of bytes that may follow (in ascending byte order).
    std::vector<prefix_and_next> collect_prefix_and_next() const {
        std::string prefix;
        std::vector<prefix_and_next> result;
        collect_prefix_and_next(0, prefix, result);
        return result;
    }

  private:
    // Depth-first walk; `prefix` is extended on the way down and restored on the way back.
    void collect_prefix_and_next(size_t index, std::string & prefix, std::vector<prefix_and_next> & out) const {
        const node & current = nodes[index];

        if (!current.is_word && !current.children.empty()) {
            std::string chars;
            chars.reserve(current.children.size());
            for (const auto & p : current.children) {
                chars.push_back(static_cast<char>(p.first));
            }
            out.emplace_back(prefix_and_next{prefix, chars});
        }

        for (const auto & p : current.children) {
            prefix.push_back(static_cast<char>(p.first));
            collect_prefix_and_next(p.second, prefix, out);
            prefix.pop_back();
        }
    }

    size_t create_node() {
        size_t index = nodes.size();
        nodes.emplace_back();
        return index;
    }

    void insert(const std::string & word) {
        size_t current = 0;
        for (unsigned char ch : word) {
            auto it = nodes[current].children.find(ch);
            if (it == nodes[current].children.end()) {
                // create_node() may reallocate `nodes`, so always re-index instead of
                // holding references across the call.
                size_t child = create_node();
                nodes[child].depth = nodes[current].depth + 1;
                nodes[current].children[ch] = child;
                current = child;
            } else {
                current = it->second;
            }
        }
        nodes[current].is_word = true;
    }
};
+
// Decodes exactly `hex_count` hexadecimal digits of `str` starting at `pos`.
// Returns {value, hex_count} on success. Returns {0, 0} when fewer than `hex_count` bytes are
// available, when any of them is not a hex digit, or when `hex_count` is not positive.
static std::pair<uint32_t, size_t> parse_hex_escape(const std::string & str, size_t pos, int hex_count) {
    // Written as a subtraction so the bounds check cannot wrap around.
    if (hex_count <= 0 || pos > str.length() || static_cast<size_t>(hex_count) > str.length() - pos) {
        return {0, 0};
    }

    uint32_t value = 0;
    for (int i = 0; i < hex_count; i++) {
        const char c = str[pos + i];

        uint32_t digit = 0;
        if ('0' <= c && c <= '9') {
            digit = static_cast<uint32_t>(c - '0');
        } else if ('a' <= c && c <= 'f') {
            digit = static_cast<uint32_t>(c - 'a' + 10);
        } else if ('A' <= c && c <= 'F') {
            digit = static_cast<uint32_t>(c - 'A' + 10);
        } else {
            // Not a hex digit: the whole escape is invalid
            return {0, 0};
        }

        value = (value << 4) + digit;
    }
    return {value, static_cast<size_t>(hex_count)};
}
+
+static std::pair<uint32_t, size_t> parse_char_class_char(const std::string & content, size_t pos) {
+ if (content[pos] == '\\' && pos + 1 < content.length()) {
+ switch (content[pos + 1]) {
+ case 'x': {
+ auto result = parse_hex_escape(content, pos + 2, 2);
+ if (result.second > 0) {
+ return {result.first, 2 + result.second};
+ }
+ // Invalid escape, treat as literal 'x'
+ return {static_cast<uint32_t>('x'), 2};
+ }
+ case 'u': {
+ auto result = parse_hex_escape(content, pos + 2, 4);
+ if (result.second > 0) {
+ return {result.first, 2 + result.second};
+ }
+ // Invalid escape, treat as literal 'u'
+ return {static_cast<uint32_t>('u'), 2};
+ }
+ case 'U': {
+ auto result = parse_hex_escape(content, pos + 2, 8);
+ if (result.second > 0) {
+ return {result.first, 2 + result.second};
+ }
+ // Invalid escape, treat as literal 'U'
+ return {static_cast<uint32_t>('U'), 2};
+ }
+ case 'n': return {'\n', 2};
+ case 't': return {'\t', 2};
+ case 'r': return {'\r', 2};
+ case '\\': return {'\\', 2};
+ case ']': return {']', 2};
+ case '[': return {'[', 2};
+ default: return {static_cast<uint32_t>(content[pos + 1]), 2};
+ }
+ }
+
+ // Regular character - return as codepoint
+ return {static_cast<uint32_t>(static_cast<unsigned char>(content[pos])), 1};
+}
+
+static std::pair<std::vector<common_peg_chars_parser::char_range>, bool> parse_char_classes(const std::string & classes) {
+ std::vector<common_peg_chars_parser::char_range> ranges;
+ bool negated = false;
+
+ std::string content = classes;
+ if (content.front() == '[') {
+ content = content.substr(1);
+ }
+
+ if (content.back() == ']') {
+ content.pop_back();
+ }
+
+ // Check for negation
+ if (!content.empty() && content.front() == '^') {
+ negated = true;
+ content = content.substr(1);
+ }
+
+ size_t i = 0;
+ while (i < content.length()) {
+ auto [start, start_len] = parse_char_class_char(content, i);
+ i += start_len;
+
+ if (i + 1 < content.length() && content[i] == '-') {
+ // Range detected
+ auto [end, end_len] = parse_char_class_char(content, i + 1);
+ ranges.push_back(common_peg_chars_parser::char_range{start, end});
+ i += 1 + end_len;
+ } else {
+ ranges.push_back(common_peg_chars_parser::char_range{start, start});
+ }
+ }
+
+ return {ranges, negated};
+}
+
+void common_peg_ast_arena::visit(common_peg_ast_id id, const common_peg_ast_visitor & visitor) const {
+ if (id == COMMON_PEG_INVALID_AST_ID) {
+ return;
+ }
+ const auto & node = get(id);
+ visitor(node);
+ for (const auto & child : node.children) {
+ visit(child, visitor);
+ }
+}
+
+void common_peg_ast_arena::visit(const common_peg_parse_result & result, const common_peg_ast_visitor & visitor) const {
+ for (const auto & node : result.nodes) {
+ visit(node, visitor);
+ }
+}
+
+struct parser_executor;
+
+common_peg_parser_id common_peg_arena::add_parser(common_peg_parser_variant parser) {
+ common_peg_parser_id id = parsers_.size();
+ parsers_.push_back(std::move(parser));
+ return id;
+}
+
+void common_peg_arena::add_rule(const std::string & name, common_peg_parser_id id) {
+ rules_[name] = id;
+}
+
+common_peg_parser_id common_peg_arena::get_rule(const std::string & name) const {
+ auto it = rules_.find(name);
+ if (it == rules_.end()) {
+ throw std::runtime_error("Rule not found: " + name);
+ }
+ return it->second;
+}
+
+struct parser_executor {
+ const common_peg_arena & arena;
+ common_peg_parse_context & ctx;
+ size_t start_pos;
+
+ parser_executor(const common_peg_arena & arena, common_peg_parse_context & ctx, size_t start)
+ : arena(arena), ctx(ctx), start_pos(start) {}
+
+ common_peg_parse_result operator()(const common_peg_epsilon_parser & /* p */) const {
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos);
+ }
+
+ common_peg_parse_result operator()(const common_peg_start_parser & /* p */) const {
+ return common_peg_parse_result(
+ start_pos == 0 ? COMMON_PEG_PARSE_RESULT_SUCCESS : COMMON_PEG_PARSE_RESULT_FAIL,
+ start_pos
+ );
+ }
+
+ common_peg_parse_result operator()(const common_peg_end_parser & /* p */) const {
+ return common_peg_parse_result(
+ start_pos >= ctx.input.size() ? COMMON_PEG_PARSE_RESULT_SUCCESS : COMMON_PEG_PARSE_RESULT_FAIL,
+ start_pos
+ );
+ }
+
+ common_peg_parse_result operator()(const common_peg_literal_parser & p) {
+ auto pos = start_pos;
+ for (auto i = 0u; i < p.literal.size(); ++i) {
+ if (pos >= ctx.input.size()) {
+ if (!ctx.is_partial) {
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
+ }
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
+ }
+ if (ctx.input[pos] != p.literal[i]) {
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
+ }
+ ++pos;
+ }
+
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
+ }
+
+ common_peg_parse_result operator()(const common_peg_sequence_parser & p) {
+ auto pos = start_pos;
+ std::vector<common_peg_ast_id> nodes;
+
+ for (const auto & child_id : p.children) {
+ auto result = arena.parse(child_id, ctx, pos);
+ if (result.fail()) {
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, result.end);
+ }
+
+ if (!result.nodes.empty()) {
+ nodes.insert(nodes.end(), result.nodes.begin(), result.nodes.end());
+ }
+
+ if (result.need_more_input()) {
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end, std::move(nodes));
+ }
+
+ pos = result.end;
+ }
+
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos, std::move(nodes));
+ }
+
+ common_peg_parse_result operator()(const common_peg_choice_parser & p) {
+ auto pos = start_pos;
+ for (const auto & child_id : p.children) {
+ auto result = arena.parse(child_id, ctx, pos);
+ if (!result.fail()) {
+ return result;
+ }
+ }
+
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
+ }
+
+ common_peg_parse_result operator()(const common_peg_repetition_parser & p) {
+ auto pos = start_pos;
+ int match_count = 0;
+ std::vector<common_peg_ast_id> nodes;
+
+ // Try to match up to max_count times (or unlimited if max_count is -1)
+ while (p.max_count == -1 || match_count < p.max_count) {
+ if (pos >= ctx.input.size()) {
+ break;
+ }
+
+ auto result = arena.parse(p.child, ctx, pos);
+
+ if (result.success()) {
+ // Prevent infinite loop on empty matches
+ if (result.end == pos) {
+ break;
+ }
+
+ if (!result.nodes.empty()) {
+ nodes.insert(nodes.end(), result.nodes.begin(), result.nodes.end());
+ }
+
+ pos = result.end;
+ match_count++;
+ continue;
+ }
+
+ if (result.need_more_input()) {
+ if (!result.nodes.empty()) {
+ nodes.insert(nodes.end(), result.nodes.begin(), result.nodes.end());
+ }
+
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end, std::move(nodes));
+ }
+
+ // Child failed - stop trying
+ break;
+ }
+
+ // Check if we got enough matches
+ if (p.min_count > 0 && match_count < p.min_count) {
+ if (pos >= ctx.input.size() && ctx.is_partial) {
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos, std::move(nodes));
+ }
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
+ }
+
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos, std::move(nodes));
+ }
+
+ common_peg_parse_result operator()(const common_peg_and_parser & p) {
+ auto result = arena.parse(p.child, ctx, start_pos);
+ // Pass result but don't consume input
+ return common_peg_parse_result(result.type, start_pos);
+ }
+
+ common_peg_parse_result operator()(const common_peg_not_parser & p) {
+ auto result = arena.parse(p.child, ctx, start_pos);
+
+ if (result.success()) {
+ // Fail if the underlying parser matches
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
+ }
+
+ if (result.need_more_input()) {
+ // Propagate - need to know what child would match before negating
+ return result;
+ }
+
+ // Child failed, so negation succeeds
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos);
+ }
+
+ common_peg_parse_result operator()(const common_peg_any_parser & /* p */) const {
+ // Parse a single UTF-8 codepoint (not just a single byte)
+ auto result = parse_utf8_codepoint(ctx.input, start_pos);
+
+ if (result.status == utf8_parse_result::INCOMPLETE) {
+ if (!ctx.is_partial) {
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
+ }
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos);
+ }
+ if (result.status == utf8_parse_result::INVALID) {
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
+ }
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, start_pos + result.bytes_consumed);
+ }
+
+ common_peg_parse_result operator()(const common_peg_space_parser & /* p */) {
+ auto pos = start_pos;
+ while (pos < ctx.input.size()) {
+ auto c = static_cast<unsigned char>(ctx.input[pos]);
+ if (std::isspace(c)) {
+ ++pos;
+ } else {
+ break;
+ }
+ }
+
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
+ }
+
+ common_peg_parse_result operator()(const common_peg_chars_parser & p) const {
+ auto pos = start_pos;
+ int match_count = 0;
+
+ // Try to match up to max_count times (or unlimited if max_count is -1)
+ while (p.max_count == -1 || match_count < p.max_count) {
+ auto result = parse_utf8_codepoint(ctx.input, pos);
+
+ if (result.status == utf8_parse_result::INCOMPLETE) {
+ if (match_count >= p.min_count) {
+ // We have enough matches, succeed with what we have
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
+ }
+ // Not enough matches yet
+ if (!ctx.is_partial) {
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
+ }
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
+ }
+
+ if (result.status == utf8_parse_result::INVALID) {
+ // Malformed UTF-8 in input
+ if (match_count >= p.min_count) {
+ // We have enough matches, succeed up to here
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
+ }
+ // Not enough matches, fail
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
+ }
+
+ // Check if this codepoint matches our character class
+ bool matches = false;
+ for (const auto & range : p.ranges) {
+ if (range.contains(result.codepoint)) {
+ matches = true;
+ break;
+ }
+ }
+
+ // If negated, invert the match result
+ if (p.negated) {
+ matches = !matches;
+ }
+
+ if (matches) {
+ pos += result.bytes_consumed;
+ ++match_count;
+ } else {
+ // Character doesn't match, stop matching
+ break;
+ }
+ }
+
+ // Check if we got enough matches
+ if (match_count < p.min_count) {
+ if (pos >= ctx.input.size() && ctx.is_partial) {
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
+ }
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
+ }
+
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
+ }
+
+ static common_peg_parse_result handle_escape_sequence(common_peg_parse_context & ctx, size_t start, size_t & pos) {
+ ++pos; // consume '\'
+ if (pos >= ctx.input.size()) {
+ if (!ctx.is_partial) {
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
+ }
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos);
+ }
+
+ switch (ctx.input[pos]) {
+ case '"':
+ case '\\':
+ case '/':
+ case 'b':
+ case 'f':
+ case 'n':
+ case 'r':
+ case 't':
+ ++pos;
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos);
+ case 'u':
+ return handle_unicode_escape(ctx, start, pos);
+ default:
+ // Invalid escape sequence
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
+ }
+ }
+
+ static common_peg_parse_result handle_unicode_escape(common_peg_parse_context & ctx, size_t start, size_t & pos) {
+ ++pos; // consume 'u'
+ for (int i = 0; i < 4; ++i) {
+ if (pos >= ctx.input.size()) {
+ if (!ctx.is_partial) {
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
+ }
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos);
+ }
+ if (!is_hex_digit(ctx.input[pos])) {
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
+ }
+ ++pos;
+ }
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos);
+ }
+
+ common_peg_parse_result operator()(const common_peg_json_string_parser & /* p */) {
+ auto pos = start_pos;
+
+ // Parse string content (without quotes)
+ while (pos < ctx.input.size()) {
+ char c = ctx.input[pos];
+
+ if (c == '"') {
+ // Found closing quote - success (don't consume it)
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
+ }
+
+ if (c == '\\') {
+ auto result = handle_escape_sequence(ctx, start_pos, pos);
+ if (!result.success()) {
+ return result;
+ }
+ } else {
+ auto utf8_result = parse_utf8_codepoint(ctx.input, pos);
+
+ if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
+ if (!ctx.is_partial) {
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
+ }
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
+ }
+
+ if (utf8_result.status == utf8_parse_result::INVALID) {
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
+ }
+
+ pos += utf8_result.bytes_consumed;
+ }
+ }
+
+ // Reached end without finding closing quote
+ if (!ctx.is_partial) {
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
+ }
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
+ }
+
+ // Matches everything from start_pos up to (but not including) the first position where one of
+ // p.delimiters starts. The scan advances one UTF-8 codepoint at a time.
+ //
+ // Outcomes, as implemented below:
+ // - FAIL: an invalid UTF-8 sequence, or a truncated one while ctx.is_partial is false.
+ // - NEED_MORE_INPUT: a truncated UTF-8 sequence, or the end of the input, while ctx.is_partial is true.
+ // - SUCCESS: a delimiter matched completely or partially at the current position, or the end
+ //   of a non-partial input was reached.
+ common_peg_parse_result operator()(const common_peg_until_parser & p) const {
+ // The trie over the delimiters is rebuilt on every invocation of this parser
+ trie matcher(p.delimiters);
+
+ // Scan input and check for delimiters
+ size_t pos = start_pos;
+ // End of the last codepoint that was fully consumed without starting a delimiter
+ size_t last_valid_pos = start_pos;
+
+ while (pos < ctx.input.size()) {
+ auto utf8_result = parse_utf8_codepoint(ctx.input, pos);
+
+ if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
+ // Incomplete UTF-8 sequence
+ if (!ctx.is_partial) {
+ // Input is complete but UTF-8 is incomplete = malformed
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
+ }
+ // Return what we have so far (before incomplete sequence)
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, last_valid_pos);
+ }
+
+ if (utf8_result.status == utf8_parse_result::INVALID) {
+ // Malformed UTF-8
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
+ }
+
+ // Check if a delimiter starts at this position
+ auto match = matcher.check_at(ctx.input, pos);
+
+ if (match == trie::COMPLETE_MATCH) {
+ // Found a complete delimiter, return everything before it
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
+ }
+
+ if (match == trie::PARTIAL_MATCH) {
+ // Found a partial match extending to end of input, return everything before it
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
+ }
+
+ // No delimiter starts here: consume the codepoint and remember the new safe position
+ pos += utf8_result.bytes_consumed;
+ last_valid_pos = pos;
+ }
+
+ if (last_valid_pos == ctx.input.size() && ctx.is_partial) {
+ // Reached the end of a partial stream, there might still be more input that we need to consume.
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, last_valid_pos);
+ }
+ return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, last_valid_pos);
+ }
+
+ // Parsing a schema node only runs its child; the schema itself is not consulted here.
+ common_peg_parse_result operator()(const common_peg_schema_parser & p) {
+     const auto child_id = p.child;
+     return arena.parse(child_id, ctx, start_pos);
+ }
+
+ // Runs the rule's child and, unless it failed, wraps the outcome in a single AST node
+ // named after the rule (with an empty tag) that adopts the child's nodes.
+ common_peg_parse_result operator()(const common_peg_rule_parser & p) {
+     auto child_result = arena.parse(p.child, ctx, start_pos);
+     if (child_result.fail()) {
+         // Failures pass through untouched and create no AST node
+         return child_result;
+     }
+
+     // The matched text is only sliced out when the match starts inside the input
+     std::string_view matched;
+     if (child_result.start < ctx.input.size()) {
+         const size_t length = child_result.end - child_result.start;
+         matched = std::string_view(ctx.input).substr(child_result.start, length);
+     }
+
+     // Read the flag before the child's nodes are moved into the new node
+     const bool incomplete = child_result.need_more_input();
+
+     auto node_id = ctx.ast.add_node(
+         p.name,
+         "",
+         child_result.start,
+         child_result.end,
+         matched,
+         std::move(child_result.nodes),
+         incomplete
+     );
+
+     return common_peg_parse_result(child_result.type, child_result.start, child_result.end, { node_id });
+ }
+
+ // Runs the tagged child and, unless it failed, wraps the outcome in a single AST node
+ // carrying the tag (with an empty rule name) that adopts the child's nodes.
+ common_peg_parse_result operator()(const common_peg_tag_parser & p) {
+     auto child_result = arena.parse(p.child, ctx, start_pos);
+     if (child_result.fail()) {
+         // Failures pass through untouched and create no AST node
+         return child_result;
+     }
+
+     // The matched text is only sliced out when the match starts inside the input
+     std::string_view matched;
+     if (child_result.start < ctx.input.size()) {
+         const size_t length = child_result.end - child_result.start;
+         matched = std::string_view(ctx.input).substr(child_result.start, length);
+     }
+
+     // Read the flag before the child's nodes are moved into the new node
+     const bool incomplete = child_result.need_more_input();
+
+     auto node_id = ctx.ast.add_node(
+         "",
+         p.tag,
+         child_result.start,
+         child_result.end,
+         matched,
+         std::move(child_result.nodes),
+         incomplete
+     );
+
+     return common_peg_parse_result(child_result.type, child_result.start, child_result.end, { node_id });
+ }
+
+ // Looks the referenced rule up by name in the arena and runs it at the current position.
+ common_peg_parse_result operator()(const common_peg_ref_parser & p) {
+     return arena.parse(arena.get_rule(p.name), ctx, start_pos);
+ }
+
+ // Runs the child; when the child still needs more input, its AST nodes are dropped so
+ // that nothing from the unfinished match propagates upwards.
+ common_peg_parse_result operator()(const common_peg_atomic_parser & p) {
+     auto outcome = arena.parse(p.child, ctx, start_pos);
+     if (!outcome.need_more_input()) {
+         return outcome;
+     }
+     outcome.nodes.clear();
+     return outcome;
+ }
+};
+
+ // Parses `ctx` starting at `start` using the arena's root parser.
+ // Throws std::runtime_error when no root parser has been set.
+ common_peg_parse_result common_peg_arena::parse(common_peg_parse_context & ctx, size_t start) const {
+     const bool has_root = root_ != COMMON_PEG_INVALID_PARSER_ID;
+     if (has_root) {
+         return parse(root_, ctx, start);
+     }
+     throw std::runtime_error("No root parser set");
+ }
+
+ // Runs the parser stored under `id` at position `start` by dispatching a parser_executor
+ // over the stored variant. `parsers_.at` is used for the lookup.
+ common_peg_parse_result common_peg_arena::parse(common_peg_parser_id id, common_peg_parse_context & ctx, size_t start) const {
+     parser_executor executor(*this, ctx, start);
+     return std::visit(executor, parsers_.at(id));
+ }
+
+ // Maps a parser id to the id of the rule it refers to when the parser is a ref;
+ // any other parser id is returned unchanged.
+ common_peg_parser_id common_peg_arena::resolve_ref(common_peg_parser_id id) {
+     const auto & candidate = parsers_.at(id);
+     const auto * ref = std::get_if<common_peg_ref_parser>(&candidate);
+     if (ref == nullptr) {
+         return id;
+     }
+     return get_rule(ref->name);
+ }
+
+ // Rewrites every child link (and the root) that points at a ref parser so that it points
+ // directly at the referenced rule instead.
+ void common_peg_arena::resolve_refs() {
+     for (auto & parser : parsers_) {
+         std::visit([this](auto & p) {
+             using T = std::decay_t<decltype(p)>;
+
+             // Parsers holding a list of children
+             constexpr bool has_children =
+                 std::is_same_v<T, common_peg_sequence_parser> ||
+                 std::is_same_v<T, common_peg_choice_parser>;
+
+             // Parsers holding exactly one child
+             constexpr bool has_single_child =
+                 std::is_same_v<T, common_peg_repetition_parser> ||
+                 std::is_same_v<T, common_peg_and_parser> ||
+                 std::is_same_v<T, common_peg_not_parser> ||
+                 std::is_same_v<T, common_peg_tag_parser> ||
+                 std::is_same_v<T, common_peg_atomic_parser> ||
+                 std::is_same_v<T, common_peg_rule_parser> ||
+                 std::is_same_v<T, common_peg_schema_parser>;
+
+             // Parsers without children
+             constexpr bool is_leaf =
+                 std::is_same_v<T, common_peg_epsilon_parser> ||
+                 std::is_same_v<T, common_peg_start_parser> ||
+                 std::is_same_v<T, common_peg_end_parser> ||
+                 std::is_same_v<T, common_peg_ref_parser> ||
+                 std::is_same_v<T, common_peg_until_parser> ||
+                 std::is_same_v<T, common_peg_literal_parser> ||
+                 std::is_same_v<T, common_peg_json_string_parser> ||
+                 std::is_same_v<T, common_peg_chars_parser> ||
+                 std::is_same_v<T, common_peg_any_parser> ||
+                 std::is_same_v<T, common_peg_space_parser>;
+
+             if constexpr (has_children) {
+                 for (auto & child : p.children) {
+                     child = resolve_ref(child);
+                 }
+             } else if constexpr (has_single_child) {
+                 p.child = resolve_ref(p.child);
+             } else if constexpr (is_leaf) {
+                 // Nothing to rewrite
+             } else {
+                 static_assert(is_always_false_v<T>);
+             }
+         }, parser);
+     }
+
+     // The root may itself be a ref
+     if (root_ == COMMON_PEG_INVALID_PARSER_ID) {
+         return;
+     }
+     root_ = resolve_ref(root_);
+ }
+
+ // Renders the parser reachable from `id` as a nested, human-readable string (debugging aid).
+ //
+ // Child links are followed recursively. Because resolve_refs() replaces refs with direct rule
+ // links, a recursive grammar contains cycles; a rule that is already being expanded further up
+ // the current path is therefore printed as "Ref(name)" instead of being expanded again.
+ // `parsers_.at` is used for every lookup.
+ std::string common_peg_arena::dump(common_peg_parser_id id) const {
+     // Names of the rules that are currently being expanded on the path from `id`
+     std::unordered_set<std::string> open_rules;
+
+     std::function<std::string(common_peg_parser_id)> render = [&](common_peg_parser_id node_id) -> std::string {
+         const auto & parser = parsers_.at(node_id);
+
+         return std::visit([&](const auto & p) -> std::string {
+             using T = std::decay_t<decltype(p)>;
+
+             if constexpr (std::is_same_v<T, common_peg_epsilon_parser>) {
+                 return "Epsilon";
+             } else if constexpr (std::is_same_v<T, common_peg_start_parser>) {
+                 return "Start";
+             } else if constexpr (std::is_same_v<T, common_peg_end_parser>) {
+                 return "End";
+             } else if constexpr (std::is_same_v<T, common_peg_literal_parser>) {
+                 return "Literal(" + p.literal + ")";
+             } else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
+                 std::vector<std::string> parts;
+                 for (const auto & child : p.children) {
+                     parts.push_back(render(child));
+                 }
+                 return "Sequence(" + string_join(parts, ", ") + ")";
+             } else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
+                 std::vector<std::string> parts;
+                 for (const auto & child : p.children) {
+                     parts.push_back(render(child));
+                 }
+                 return "Choice(" + string_join(parts, ", ") + ")";
+             } else if constexpr (std::is_same_v<T, common_peg_repetition_parser>) {
+                 if (p.max_count == -1) {
+                     return "Repetition(" + render(p.child) + ", " + std::to_string(p.min_count) + ", unbounded)";
+                 }
+                 return "Repetition(" + render(p.child) + ", " + std::to_string(p.min_count) + ", " + std::to_string(p.max_count) + ")";
+             } else if constexpr (std::is_same_v<T, common_peg_and_parser>) {
+                 return "And(" + render(p.child) + ")";
+             } else if constexpr (std::is_same_v<T, common_peg_not_parser>) {
+                 return "Not(" + render(p.child) + ")";
+             } else if constexpr (std::is_same_v<T, common_peg_any_parser>) {
+                 return "Any";
+             } else if constexpr (std::is_same_v<T, common_peg_space_parser>) {
+                 return "Space";
+             } else if constexpr (std::is_same_v<T, common_peg_chars_parser>) {
+                 if (p.max_count == -1) {
+                     return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", unbounded)";
+                 }
+                 return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", " + std::to_string(p.max_count) + ")";
+             } else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
+                 return "JsonString()";
+             } else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
+                 return "Until(" + string_join(p.delimiters, " | ") + ")";
+             } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
+                 return "Schema(" + render(p.child) + ", " + (p.schema ? p.schema->dump() : "null") + ")";
+             } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
+                 // Break cycles: a rule already open on this path is printed by name only
+                 if (!open_rules.insert(p.name).second) {
+                     return "Ref(" + p.name + ")";
+                 }
+                 std::string body = render(p.child);
+                 open_rules.erase(p.name);
+                 return "Rule(" + p.name + ", " + body + ")";
+             } else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
+                 return "Ref(" + p.name + ")";
+             } else if constexpr (std::is_same_v<T, common_peg_tag_parser>) {
+                 return "Tag(" + p.tag + ", " + render(p.child) + ")";
+             } else if constexpr (std::is_same_v<T, common_peg_atomic_parser>) {
+                 return "Atomic(" + render(p.child) + ")";
+             } else {
+                 return "Unknown";
+             }
+         }, parser);
+     };
+
+     return render(id);
+ }
+
+ // Copy assignment: only the parser id is taken over from `other`.
+ common_peg_parser & common_peg_parser::operator=(const common_peg_parser & other) {
+     this->id_ = other.id_;
+     return *this;
+ }
+
+ // Replaces this parser with the sequence "this, then other".
+ common_peg_parser & common_peg_parser::operator+=(const common_peg_parser & other) {
+     const common_peg_parser_id lhs = id_;
+     const common_peg_parser_id rhs = other.id_;
+     id_ = builder_.sequence({lhs, rhs});
+     return *this;
+ }
+
+ // Replaces this parser with the choice "this, else other".
+ common_peg_parser & common_peg_parser::operator|=(const common_peg_parser & other) {
+     const common_peg_parser_id lhs = id_;
+     const common_peg_parser_id rhs = other.id_;
+     id_ = builder_.choice({lhs, rhs});
+     return *this;
+ }
+
+ // Builds the sequence "this, then other" without modifying either operand.
+ common_peg_parser common_peg_parser::operator+(const common_peg_parser & other) const {
+     const common_peg_parser_id lhs = id_;
+     const common_peg_parser_id rhs = other.id_;
+     return builder_.sequence({lhs, rhs});
+ }
+
+ // Builds the choice "this, else other" without modifying either operand.
+ common_peg_parser common_peg_parser::operator|(const common_peg_parser & other) const {
+     const common_peg_parser_id lhs = id_;
+     const common_peg_parser_id rhs = other.id_;
+     return builder_.choice({lhs, rhs});
+ }
+
+ // Builds the sequence "this, space, other": like operator+ but with the builder's
+ // space parser inserted between the two operands.
+ common_peg_parser common_peg_parser::operator<<(const common_peg_parser & other) const {
+     return builder_.sequence({
+         id_,
+         builder_.space(),
+         other.id_
+     });
+ }
+
+ // Sequence of this parser followed by the literal `str`.
+ common_peg_parser common_peg_parser::operator+(const char * str) const {
+     const common_peg_parser rhs = builder_.literal(str);
+     return *this + rhs;
+ }
+
+ // Sequence of this parser followed by the literal `str`.
+ common_peg_parser common_peg_parser::operator+(const std::string & str) const {
+     const common_peg_parser rhs = builder_.literal(str);
+     return *this + rhs;
+ }
+
+ // Space-separated sequence of this parser followed by the literal `str`.
+ common_peg_parser common_peg_parser::operator<<(const char * str) const {
+     const common_peg_parser rhs = builder_.literal(str);
+     return *this << rhs;
+ }
+
+ // Space-separated sequence of this parser followed by the literal `str`.
+ common_peg_parser common_peg_parser::operator<<(const std::string & str) const {
+     const common_peg_parser rhs = builder_.literal(str);
+     return *this << rhs;
+ }
+
+ // Choice between this parser and the literal `str`.
+ common_peg_parser common_peg_parser::operator|(const char * str) const {
+     const common_peg_parser rhs = builder_.literal(str);
+     return *this | rhs;
+ }
+
+ // Choice between this parser and the literal `str`.
+ common_peg_parser common_peg_parser::operator|(const std::string & str) const {
+     const common_peg_parser rhs = builder_.literal(str);
+     return *this | rhs;
+ }
+
+ // Sequence of the literal `str` followed by `p`; the literal is created in p's builder.
+ common_peg_parser operator+(const char * str, const common_peg_parser & p) {
+     const common_peg_parser lhs = p.builder().literal(str);
+     return lhs + p;
+ }
+
+ // std::string convenience overload; forwards the C string to the const char * overload.
+ common_peg_parser operator+(const std::string & str, const common_peg_parser & p) {
+     const char * text = str.c_str();
+     return operator+(text, p);
+ }
+
+ // Space-separated sequence of the literal `str` followed by `p`; the literal is created
+ // in p's builder.
+ common_peg_parser operator<<(const char * str, const common_peg_parser & p) {
+     const common_peg_parser lhs = p.builder().literal(str);
+     return lhs << p;
+ }
+
+ // std::string convenience overload; forwards the C string to the const char * overload.
+ common_peg_parser operator<<(const std::string & str, const common_peg_parser & p) {
+     const char * text = str.c_str();
+     return operator<<(text, p);
+ }
+
+ // Choice between the literal `str` and `p`; the literal is created in p's builder.
+ common_peg_parser operator|(const char * str, const common_peg_parser & p) {
+     const common_peg_parser lhs = p.builder().literal(str);
+     return lhs | p;
+ }
+
+ // std::string convenience overload; forwards the C string to the const char * overload.
+ common_peg_parser operator|(const std::string & str, const common_peg_parser & p) {
+     const char * text = str.c_str();
+     return operator|(text, p);
+ }
+
+ // Normalizes a rule name: every run of characters other than ASCII letters, digits and '-'
+ // is collapsed into a single '-'.
+ static std::string rule_name(const std::string & name) {
+     // Compiled once and reused by every call
+     static const std::regex invalid_rule_chars_re("[^a-zA-Z0-9-]+");
+
+     std::string sanitized = std::regex_replace(name, invalid_rule_chars_re, "-");
+     return sanitized;
+ }
+
+ // Nothing to set up here: the constructor has no initializer list and an empty body.
+ common_peg_parser_builder::common_peg_parser_builder() {}
+
+ // Creates a sequence parser from the given ids. Children that are themselves sequences are
+ // spliced in place (one level), so the stored child list never directly nests a sequence
+ // that was passed in.
+ common_peg_parser common_peg_parser_builder::sequence(const std::vector<common_peg_parser_id> & parsers) {
+     std::vector<common_peg_parser_id> flat_ids;
+     for (const auto & id : parsers) {
+         const auto & node = arena_.get(id);
+         const auto * nested = std::get_if<common_peg_sequence_parser>(&node);
+         if (nested == nullptr) {
+             flat_ids.push_back(id);
+             continue;
+         }
+         flat_ids.insert(flat_ids.end(), nested->children.begin(), nested->children.end());
+     }
+     return wrap(arena_.add_parser(common_peg_sequence_parser{flat_ids}));
+ }
+
+ // Convenience overload: extracts the ids of the given parsers and delegates to the
+ // id-based overload.
+ common_peg_parser common_peg_parser_builder::sequence(const std::vector<common_peg_parser> & parsers) {
+     std::vector<common_peg_parser_id> ids;
+     ids.reserve(parsers.size());
+     for (size_t i = 0; i < parsers.size(); ++i) {
+         ids.push_back(parsers[i].id());
+     }
+     return sequence(ids);
+ }
+
+ // Convenience overload for brace-enclosed parser lists: extracts the ids and delegates to
+ // the id-based overload.
+ common_peg_parser common_peg_parser_builder::sequence(std::initializer_list<common_peg_parser> parsers) {
+     std::vector<common_peg_parser_id> ids;
+     ids.reserve(parsers.size());
+     for (auto it = parsers.begin(); it != parsers.end(); ++it) {
+         ids.push_back(it->id());
+     }
+     return sequence(ids);
+ }
+
+ // Creates a choice parser from the given ids. Alternatives that are themselves choices are
+ // spliced in place (one level), so the stored alternative list never directly nests a
+ // choice that was passed in.
+ common_peg_parser common_peg_parser_builder::choice(const std::vector<common_peg_parser_id> & parsers) {
+     std::vector<common_peg_parser_id> flat_ids;
+     for (const auto & id : parsers) {
+         const auto & node = arena_.get(id);
+         const auto * nested = std::get_if<common_peg_choice_parser>(&node);
+         if (nested == nullptr) {
+             flat_ids.push_back(id);
+             continue;
+         }
+         flat_ids.insert(flat_ids.end(), nested->children.begin(), nested->children.end());
+     }
+     return wrap(arena_.add_parser(common_peg_choice_parser{flat_ids}));
+ }
+
+ // Convenience overload: extracts the ids of the given parsers and delegates to the
+ // id-based overload.
+ common_peg_parser common_peg_parser_builder::choice(const std::vector<common_peg_parser> & parsers) {
+     std::vector<common_peg_parser_id> ids;
+     ids.reserve(parsers.size());
+     for (size_t i = 0; i < parsers.size(); ++i) {
+         ids.push_back(parsers[i].id());
+     }
+     return choice(ids);
+ }
+
+ // Convenience overload for brace-enclosed parser lists: extracts the ids and delegates to
+ // the id-based overload.
+ common_peg_parser common_peg_parser_builder::choice(std::initializer_list<common_peg_parser> parsers) {
+     std::vector<common_peg_parser_id> ids;
+     ids.reserve(parsers.size());
+     for (auto it = parsers.begin(); it != parsers.end(); ++it) {
+         ids.push_back(it->id());
+     }
+     return choice(ids);
+ }
+
+ // Creates a character-class parser. `classes` is parsed into ranges plus a negation flag by
+ // parse_char_classes; the original pattern text is stored alongside them together with the
+ // min/max repetition counts.
+ common_peg_parser common_peg_parser_builder::chars(const std::string & classes, int min, int max) {
+     auto [char_ranges, is_negated] = parse_char_classes(classes);
+     const auto node_id = arena_.add_parser(common_peg_chars_parser{classes, char_ranges, is_negated, min, max});
+     return wrap(node_id);
+ }
+
+ // Wraps `p` in a schema parser. The schema is copied into shared ownership and stored with
+ // the given name and raw flag.
+ common_peg_parser common_peg_parser_builder::schema(const common_peg_parser & p, const std::string & name, const nlohmann::ordered_json & schema, bool raw) {
+     auto shared_schema = std::make_shared<nlohmann::ordered_json>(schema);
+     const auto node_id = arena_.add_parser(common_peg_schema_parser{p.id(), name, shared_schema, raw});
+     return wrap(node_id);
+ }
+
+ // Registers `p` as a named rule and returns a reference to it. The name is normalized with
+ // rule_name() before it is used for the rule parser, the registration and the reference.
+ common_peg_parser common_peg_parser_builder::rule(const std::string & name, const common_peg_parser & p, bool trigger) {
+     auto normalized = rule_name(name);
+
+     common_peg_rule_parser rule_node{normalized, p.id(), trigger};
+     auto rule_id = arena_.add_parser(rule_node);
+     arena_.add_rule(normalized, rule_id);
+
+     return ref(normalized);
+ }
+
+ // Registers a named rule whose body is produced lazily by `builder_fn` and returns a
+ // reference to it. If a rule with the (normalized) name already exists, `builder_fn` is not
+ // called. A placeholder rule is registered before `builder_fn` runs so that the body may
+ // refer to the rule by name, i.e. recursively.
+ common_peg_parser common_peg_parser_builder::rule(const std::string & name, const std::function<common_peg_parser()> & builder_fn, bool trigger) {
+     auto normalized = rule_name(name);
+
+     if (!arena_.has_rule(normalized)) {
+         // Temporary body, registered only so the name resolves while the real body is built
+         auto stand_in = any();
+         auto stand_in_rule_id = arena_.add_parser(common_peg_rule_parser{normalized, stand_in.id(), trigger});
+         arena_.add_rule(normalized, stand_in_rule_id);
+
+         // Build the real body, then point the name at the final rule parser
+         auto body = builder_fn();
+         auto final_rule_id = arena_.add_parser(common_peg_rule_parser{normalized, body.id(), trigger});
+         arena_.rules_[normalized] = final_rule_id;
+     }
+
+     return ref(normalized);
+ }
+
+ // Marks `p` as the root parser of the arena under construction.
+ void common_peg_parser_builder::set_root(const common_peg_parser & p) {
+     const auto root_id = p.id();
+     arena_.set_root(root_id);
+ }
+
+ // Finalizes the grammar: resolves refs inside the arena, then hands the arena over to the
+ // caller by moving it out of the builder.
+ common_peg_arena common_peg_parser_builder::build() {
+     arena_.resolve_refs();
+     common_peg_arena finished = std::move(arena_);
+     return finished;
+ }
+
+// JSON parsers
+ // Rule "json-number": optional '-', integer part ("0" or a non-zero digit followed by
+ // digits), optional fraction, optional exponent, then trailing space.
+ common_peg_parser common_peg_parser_builder::json_number() {
+     return rule("json-number", [this]() {
+         // Sub-parsers are created in the same order as their nesting requires
+         auto nonzero_digit = chars("[1-9]", 1, 1);
+         auto digit_run = chars("[0-9]");
+
+         auto zero = literal("0");
+         auto more_digits = chars("[0-9]", 0, -1);
+         auto nonzero_int = sequence({nonzero_digit, more_digits});
+         auto integer = choice({zero, nonzero_int});
+
+         auto dot = literal(".");
+         auto fraction = sequence({dot, digit_run});
+
+         auto exp_marker = choice({literal("e"), literal("E")});
+         auto exp_sign = optional(chars("[+-]", 1, 1));
+         auto exponent = sequence({exp_marker, exp_sign, digit_run});
+
+         auto minus = optional(literal("-"));
+         auto maybe_fraction = optional(fraction);
+         auto maybe_exponent = optional(exponent);
+         auto trailing_space = space();
+         return sequence({minus, integer, maybe_fraction, maybe_exponent, trailing_space});
+     });
+ }
+
+ // Rule "json-string": opening quote, string content, closing quote, then trailing space.
+ common_peg_parser common_peg_parser_builder::json_string() {
+     return rule("json-string", [this]() {
+         auto open_quote = literal("\"");
+         auto content = json_string_content();
+         auto close_quote = literal("\"");
+         auto trailing_space = space();
+         return sequence({open_quote, content, close_quote, trailing_space});
+     });
+ }
+
+ // Rule "json-bool": the literal "true" or "false", then trailing space.
+ common_peg_parser common_peg_parser_builder::json_bool() {
+     return rule("json-bool", [this]() {
+         auto value = choice({literal("true"), literal("false")});
+         auto trailing_space = space();
+         return sequence({value, trailing_space});
+     });
+ }
+
+ // Rule "json-null": the literal "null", then trailing space.
+ common_peg_parser common_peg_parser_builder::json_null() {
+     return rule("json-null", [this]() {
+         auto value = literal("null");
+         auto trailing_space = space();
+         return sequence({value, trailing_space});
+     });
+ }
+
+ // Rule "json-object": '{', then either '}' directly or a comma-separated list of
+ // "string : value" members followed by '}', with optional space between the tokens.
+ common_peg_parser common_peg_parser_builder::json_object() {
+     return rule("json-object", [this]() {
+         auto ws = space();
+
+         // One "key : value" member
+         auto key = json_string();
+         auto colon = literal(":");
+         auto value = json();
+         auto member = sequence({key, ws, colon, ws, value});
+
+         // First member plus any number of ", member" continuations
+         auto comma = literal(",");
+         auto next_member = sequence({ws, comma, ws, member});
+         auto more_members = zero_or_more(next_member);
+         auto members = sequence({member, more_members});
+
+         auto open_brace = literal("{");
+         auto close_when_empty = literal("}");
+         auto close_after_members = literal("}");
+         auto filled = sequence({members, ws, close_after_members});
+         auto rest = choice({close_when_empty, filled});
+
+         return sequence({open_brace, ws, rest, ws});
+     });
+ }
+
+ // Rule "json-array": '[', then either ']' directly or a comma-separated list of JSON
+ // values followed by ']', with optional space between the tokens.
+ common_peg_parser common_peg_parser_builder::json_array() {
+     return rule("json-array", [this]() {
+         auto ws = space();
+
+         // First element plus any number of ", element" continuations
+         auto first = json();
+         auto comma = literal(",");
+         auto following = json();
+         auto next_element = sequence({comma, ws, following});
+         auto more_elements = zero_or_more(next_element);
+         auto elements = sequence({first, more_elements});
+
+         auto open_bracket = literal("[");
+         auto close_when_empty = literal("]");
+         auto close_after_elements = literal("]");
+         auto filled = sequence({elements, ws, close_after_elements});
+         auto rest = choice({close_when_empty, filled});
+
+         return sequence({open_bracket, ws, rest, ws});
+     });
+ }
+
+ // Rule "json-value": any JSON value, tried in the order object, array, string, number,
+ // bool, null.
+ common_peg_parser common_peg_parser_builder::json() {
+     return rule("json-value", [this]() {
+         auto object = json_object();
+         auto array = json_array();
+         auto string = json_string();
+         auto number = json_number();
+         auto boolean = json_bool();
+         auto null = json_null();
+         return choice({object, array, string, number, boolean, null});
+     });
+ }
+
+ // Creates a parser node for the content of a JSON string; the surrounding quotes are
+ // added by json_string(), not here.
+ common_peg_parser common_peg_parser_builder::json_string_content() {
+     const auto node_id = arena_.add_parser(common_peg_json_string_parser{});
+     return wrap(node_id);
+ }
+
+ // Builds the parser for one object member with a fixed key: the quoted key, ':', and the
+ // value parser `p`, with optional space around the colon. `key` is inserted between the
+ // quotes verbatim.
+ common_peg_parser common_peg_parser_builder::json_member(const std::string & key, const common_peg_parser & p) {
+     auto ws = space();
+     auto quoted_key = literal("\"" + key + "\"");
+     auto colon = literal(":");
+     return sequence({quoted_key, ws, colon, ws, p});
+ }
+
+
+ // Escapes a single character for use inside a GBNF character class ("[...]").
+ //
+ // Newline, tab, carriage return, backslash and the square brackets get their backslash
+ // escapes. '-' is emitted as the hex escape "\x2D": written raw between two other class
+ // members it would be read as a range operator (e.g. "[^+-a]" excludes everything from '+'
+ // to 'a'). Every other character is returned unchanged.
+ static std::string gbnf_escape_char_class(char c) {
+     switch (c) {
+         case '\n': return "\\n";
+         case '\t': return "\\t";
+         case '\r': return "\\r";
+         case '\\': return "\\\\";
+         case ']':  return "\\]";
+         case '[':  return "\\[";
+         case '-':  return "\\x2D";
+         default:   return std::string(1, c);
+     }
+ }
+
+ // Builds a GBNF expression of the form "(alt | alt | ...)*" from the given strings.
+ // Each alternative comes from one (prefix, next_chars) piece reported by the trie: the
+ // prefix as a literal (omitted when empty) followed by a negated character class of the
+ // piece's next characters.
+ static std::string gbnf_excluding_pattern(const std::vector<std::string> & strings) {
+     trie matcher(strings);
+     auto pieces = matcher.collect_prefix_and_next();
+
+     std::string alternatives;
+     for (size_t idx = 0; idx < pieces.size(); ++idx) {
+         const auto & piece = pieces[idx];
+
+         // Negated class listing the piece's next characters
+         std::string excluded;
+         excluded.reserve(piece.next_chars.size());
+         for (const auto & ch : piece.next_chars) {
+             excluded += gbnf_escape_char_class(ch);
+         }
+         const std::string negated_class = "[^" + excluded + "]";
+
+         if (idx != 0) {
+             alternatives += " | ";
+         }
+         if (piece.prefix.empty()) {
+             alternatives += negated_class;
+         } else {
+             alternatives += gbnf_format_literal(piece.prefix) + " " + negated_class;
+         }
+     }
+
+     return "(" + alternatives + ")*";
+ }
+
+ // Returns the names of all rule parsers reachable from `rule` by following child links and
+ // refs. Each rule is expanded only the first time its name is seen, which also keeps
+ // recursive grammars from looping.
+ static std::unordered_set<std::string> collect_reachable_rules(
+     const common_peg_arena & arena,
+     const common_peg_parser_id & rule
+ ) {
+     std::unordered_set<std::string> reachable;
+
+     std::function<void(common_peg_parser_id)> walk = [&](common_peg_parser_id id) {
+         const auto & parser = arena.get(id);
+
+         std::visit([&](const auto & node) {
+             using T = std::decay_t<decltype(node)>;
+
+             // Parsers holding a list of children
+             constexpr bool has_children =
+                 std::is_same_v<T, common_peg_sequence_parser> ||
+                 std::is_same_v<T, common_peg_choice_parser>;
+
+             // Parsers holding exactly one child (rules are handled separately)
+             constexpr bool has_single_child =
+                 std::is_same_v<T, common_peg_repetition_parser> ||
+                 std::is_same_v<T, common_peg_and_parser> ||
+                 std::is_same_v<T, common_peg_not_parser> ||
+                 std::is_same_v<T, common_peg_tag_parser> ||
+                 std::is_same_v<T, common_peg_atomic_parser> ||
+                 std::is_same_v<T, common_peg_schema_parser>;
+
+             // Parsers without children
+             constexpr bool is_leaf =
+                 std::is_same_v<T, common_peg_epsilon_parser> ||
+                 std::is_same_v<T, common_peg_start_parser> ||
+                 std::is_same_v<T, common_peg_end_parser> ||
+                 std::is_same_v<T, common_peg_until_parser> ||
+                 std::is_same_v<T, common_peg_literal_parser> ||
+                 std::is_same_v<T, common_peg_chars_parser> ||
+                 std::is_same_v<T, common_peg_space_parser> ||
+                 std::is_same_v<T, common_peg_any_parser> ||
+                 std::is_same_v<T, common_peg_json_string_parser>;
+
+             if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
+                 // Only descend the first time this rule name is recorded
+                 const bool first_visit = reachable.insert(node.name).second;
+                 if (first_visit) {
+                     walk(node.child);
+                 }
+             } else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
+                 // Follow the reference to the rule it names
+                 walk(arena.get_rule(node.name));
+             } else if constexpr (has_children) {
+                 for (auto child : node.children) {
+                     walk(child);
+                 }
+             } else if constexpr (has_single_child) {
+                 walk(node.child);
+             } else if constexpr (is_leaf) {
+                 // Nothing below this parser
+             } else {
+                 static_assert(is_always_false_v<T>);
+             }
+         }, parser);
+     };
+
+     walk(rule);
+     return reachable;
+ }
+
+// GBNF generation implementation
+ // Emits GBNF rules for this arena through `builder`.
+ //
+ // - lazy == false: rules reachable from the root are emitted, followed by a "root" rule
+ //   generated from the root parser (when a root is set).
+ // - lazy == true: only rules reachable from rules flagged as triggers are emitted, and
+ //   "root" becomes the alternation of the trigger rule names in sorted order.
+ //
+ // Lookahead (and/not), epsilon, start and end produce an empty string because they have no
+ // GBNF counterpart here. Tag, atomic and pass-through schema nodes emit exactly their
+ // child's grammar; when such a wrapper hides a choice or sequence, the result is
+ // parenthesized just as if the choice or sequence were the direct child, so that operator
+ // precedence in the generated grammar matches the parser structure.
+ void common_peg_arena::build_grammar(const common_grammar_builder & builder, bool lazy) const {
+     // True when a schema node emits its child's grammar instead of a schema-derived rule
+     auto schema_uses_child = [](const common_peg_schema_parser & s) -> bool {
+         if (!s.schema) {
+             return true;
+         }
+         // TODO: Implement more comprehensive grammar generation for raw strings.
+         // For now, use the grammar emitted from the underlying parser.
+         return s.raw && s.schema->contains("type") && s.schema->at("type").is_string() && s.schema->at("type") == "string";
+     };
+
+     // Skips over wrappers that emit exactly their child's grammar and returns the parser
+     // whose grammar text actually ends up in the output
+     auto emitted_node = [&](common_peg_parser_id id) -> const common_peg_parser_variant & {
+         const common_peg_parser_variant * node = &parsers_.at(id);
+         for (;;) {
+             if (auto tag = std::get_if<common_peg_tag_parser>(node)) {
+                 node = &parsers_.at(tag->child);
+             } else if (auto atomic = std::get_if<common_peg_atomic_parser>(node)) {
+                 node = &parsers_.at(atomic->child);
+             } else if (auto schema = std::get_if<common_peg_schema_parser>(node); schema && schema_uses_child(*schema)) {
+                 node = &parsers_.at(schema->child);
+             } else {
+                 return *node;
+             }
+         }
+     };
+
+     auto emits_choice = [&](common_peg_parser_id id) -> bool {
+         return std::holds_alternative<common_peg_choice_parser>(emitted_node(id));
+     };
+
+     auto emits_sequence = [&](common_peg_parser_id id) -> bool {
+         return std::holds_alternative<common_peg_sequence_parser>(emitted_node(id));
+     };
+
+     // Appends the GBNF repetition suffix for [min_count, max_count]; -1 means unbounded
+     auto quantified = [](const std::string & base, int min_count, int max_count) -> std::string {
+         if (min_count == 0 && max_count == 1) {
+             return base + "?";
+         }
+         if (min_count == 0 && max_count == -1) {
+             return base + "*";
+         }
+         if (min_count == 1 && max_count == -1) {
+             return base + "+";
+         }
+         if (max_count == -1) {
+             return base + "{" + std::to_string(min_count) + ",}";
+         }
+         if (min_count == max_count) {
+             if (min_count == 1) {
+                 return base;
+             }
+             return base + "{" + std::to_string(min_count) + "}";
+         }
+         return base + "{" + std::to_string(min_count) + "," + std::to_string(max_count) + "}";
+     };
+
+     // Generate GBNF for a parser
+     std::function<std::string(common_peg_parser_id)> to_gbnf = [&](common_peg_parser_id id) -> std::string {
+         const auto & parser = parsers_.at(id);
+
+         return std::visit([&](const auto & p) -> std::string {
+             using T = std::decay_t<decltype(p)>;
+
+             if constexpr (std::is_same_v<T, common_peg_epsilon_parser> ||
+                           std::is_same_v<T, common_peg_start_parser> ||
+                           std::is_same_v<T, common_peg_end_parser>) {
+                 return "";
+             } else if constexpr (std::is_same_v<T, common_peg_literal_parser>) {
+                 return gbnf_format_literal(p.literal);
+             } else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
+                 std::string s;
+                 for (const auto & child : p.children) {
+                     if (!s.empty()) {
+                         s += " ";
+                     }
+                     auto child_gbnf = to_gbnf(child);
+                     if (emits_choice(child) || emits_sequence(child)) {
+                         s += "(" + child_gbnf + ")";
+                     } else {
+                         s += child_gbnf;
+                     }
+                 }
+                 return s;
+             } else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
+                 std::string s;
+                 for (const auto & child : p.children) {
+                     if (!s.empty()) {
+                         s += " | ";
+                     }
+                     auto child_gbnf = to_gbnf(child);
+                     if (emits_choice(child)) {
+                         s += "(" + child_gbnf + ")";
+                     } else {
+                         s += child_gbnf;
+                     }
+                 }
+                 return s;
+             } else if constexpr (std::is_same_v<T, common_peg_repetition_parser>) {
+                 auto child_gbnf = to_gbnf(p.child);
+                 if (emits_choice(p.child) || emits_sequence(p.child)) {
+                     child_gbnf = "(" + child_gbnf + ")";
+                 }
+                 return quantified(child_gbnf, p.min_count, p.max_count);
+             } else if constexpr (std::is_same_v<T, common_peg_and_parser> || std::is_same_v<T, common_peg_not_parser>) {
+                 return ""; // Lookahead not supported in GBNF
+             } else if constexpr (std::is_same_v<T, common_peg_any_parser>) {
+                 return ".";
+             } else if constexpr (std::is_same_v<T, common_peg_space_parser>) {
+                 return "space";
+             } else if constexpr (std::is_same_v<T, common_peg_chars_parser>) {
+                 return quantified(p.pattern, p.min_count, p.max_count);
+             } else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
+                 return R"(( [^"\\] | "\\" ( ["\\/ bfnrt] | "u" [0-9a-fA-F]{4} ) )*)";
+             } else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
+                 if (p.delimiters.empty()) {
+                     return ".*";
+                 }
+                 return gbnf_excluding_pattern(p.delimiters);
+             } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
+                 if (schema_uses_child(p)) {
+                     return to_gbnf(p.child);
+                 }
+                 return builder.add_schema(p.name, *p.schema);
+             } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
+                 // Rules are referenced by name; their bodies are emitted separately below
+                 return p.name;
+             } else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
+                 // Refs should not exist after flattening, but kept just in case
+                 return p.name;
+             } else if constexpr (std::is_same_v<T, common_peg_tag_parser>) {
+                 return to_gbnf(p.child);
+             } else if constexpr (std::is_same_v<T, common_peg_atomic_parser>) {
+                 return to_gbnf(p.child);
+             } else {
+                 static_assert(is_always_false_v<T>);
+             }
+         }, parser);
+     };
+
+     // Collect reachable rules
+     std::unordered_set<std::string> reachable_rules;
+
+     if (lazy) {
+         // Collect rules reachable from trigger rules
+         for (const auto & [name, id] : rules_) {
+             const auto & parser = parsers_.at(id);
+             if (auto rule = std::get_if<common_peg_rule_parser>(&parser)) {
+                 if (rule->trigger) {
+                     // Mark trigger as reachable and visit it
+                     reachable_rules.insert(name);
+                     auto add_rules = collect_reachable_rules(*this, id);
+                     reachable_rules.insert(add_rules.begin(), add_rules.end());
+                 }
+             }
+         }
+     } else {
+         // Collect rules reachable from root
+         reachable_rules = collect_reachable_rules(*this, root_);
+     }
+
+     // Create GBNF rules for all reachable rules
+     for (const auto & [name, rule_id] : rules_) {
+         if (reachable_rules.find(name) == reachable_rules.end()) {
+             continue;
+         }
+
+         const auto & parser = parsers_.at(rule_id);
+         if (auto rule = std::get_if<common_peg_rule_parser>(&parser)) {
+             builder.add_rule(rule->name, to_gbnf(rule->child));
+         }
+     }
+
+     if (lazy) {
+         // Generate root rule from trigger rules only
+         std::vector<std::string> trigger_names;
+         for (const auto & [name, rule_id] : rules_) {
+             const auto & parser = parsers_.at(rule_id);
+             if (auto rule = std::get_if<common_peg_rule_parser>(&parser)) {
+                 if (rule->trigger) {
+                     trigger_names.push_back(rule->name);
+                 }
+             }
+         }
+
+         // Sort for predictable order
+         std::sort(trigger_names.begin(), trigger_names.end());
+         builder.add_rule("root", string_join(trigger_names, " | "));
+     } else if (root_ != COMMON_PEG_INVALID_PARSER_ID) {
+         builder.add_rule("root", to_gbnf(root_));
+     }
+ }
+
+ // Converts one parser variant into a JSON object. Every object carries a "type"
+ // discriminator plus the fields of that parser; child links are written as parser ids.
+ // A schema parser without a schema is written with "schema": null.
+ //
+ // The trailing static_assert makes the build fail if a parser type is added to the variant
+ // without a serialization branch, instead of silently falling off the end of the visitor.
+ static nlohmann::json serialize_parser_variant(const common_peg_parser_variant & variant) {
+     using json = nlohmann::json;
+
+     return std::visit([](const auto & p) -> json {
+         using T = std::decay_t<decltype(p)>;
+
+         if constexpr (std::is_same_v<T, common_peg_epsilon_parser>) {
+             return json{{"type", "epsilon"}};
+         } else if constexpr (std::is_same_v<T, common_peg_start_parser>) {
+             return json{{"type", "start"}};
+         } else if constexpr (std::is_same_v<T, common_peg_end_parser>) {
+             return json{{"type", "end"}};
+         } else if constexpr (std::is_same_v<T, common_peg_literal_parser>) {
+             return json{{"type", "literal"}, {"literal", p.literal}};
+         } else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
+             return json{{"type", "sequence"}, {"children", p.children}};
+         } else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
+             return json{{"type", "choice"}, {"children", p.children}};
+         } else if constexpr (std::is_same_v<T, common_peg_repetition_parser>) {
+             return json{
+                 {"type", "repetition"},
+                 {"child", p.child},
+                 {"min_count", p.min_count},
+                 {"max_count", p.max_count}
+             };
+         } else if constexpr (std::is_same_v<T, common_peg_and_parser>) {
+             return json{{"type", "and"}, {"child", p.child}};
+         } else if constexpr (std::is_same_v<T, common_peg_not_parser>) {
+             return json{{"type", "not"}, {"child", p.child}};
+         } else if constexpr (std::is_same_v<T, common_peg_any_parser>) {
+             return json{{"type", "any"}};
+         } else if constexpr (std::is_same_v<T, common_peg_space_parser>) {
+             return json{{"type", "space"}};
+         } else if constexpr (std::is_same_v<T, common_peg_chars_parser>) {
+             // Ranges are written as an array of {start, end} objects
+             json ranges = json::array();
+             for (const auto & range : p.ranges) {
+                 ranges.push_back({{"start", range.start}, {"end", range.end}});
+             }
+             return json{
+                 {"type", "chars"},
+                 {"pattern", p.pattern},
+                 {"ranges", ranges},
+                 {"negated", p.negated},
+                 {"min_count", p.min_count},
+                 {"max_count", p.max_count}
+             };
+         } else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
+             return json{{"type", "json_string"}};
+         } else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
+             return json{{"type", "until"}, {"delimiters", p.delimiters}};
+         } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
+             return json{
+                 {"type", "schema"},
+                 {"child", p.child},
+                 {"name", p.name},
+                 {"schema", p.schema ? *p.schema : nullptr},
+                 {"raw", p.raw}
+             };
+         } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
+             return json{
+                 {"type", "rule"},
+                 {"name", p.name},
+                 {"child", p.child},
+                 {"trigger", p.trigger}
+             };
+         } else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
+             return json{{"type", "ref"}, {"name", p.name}};
+         } else if constexpr (std::is_same_v<T, common_peg_atomic_parser>) {
+             return json{{"type", "atomic"}, {"child", p.child}};
+         } else if constexpr (std::is_same_v<T, common_peg_tag_parser>) {
+             return json{
+                 {"type", "tag"},
+                 {"child", p.child},
+                 {"tag", p.tag}
+             };
+         } else {
+             static_assert(is_always_false_v<T>);
+         }
+     }, variant);
+ }
+
+ // Serializes the whole arena: "parsers" is the array of serialized parser variants in
+ // storage order, "rules" the rule table and "root" the root parser id.
+ nlohmann::json common_peg_arena::to_json() const {
+     nlohmann::json serialized_parsers = nlohmann::json::array();
+     for (const auto & entry : parsers_) {
+         serialized_parsers.push_back(serialize_parser_variant(entry));
+     }
+
+     return nlohmann::json{
+         {"parsers", serialized_parsers},
+         {"rules", rules_},
+         {"root", root_}
+     };
+ }
+
+static common_peg_parser_variant deserialize_parser_variant(const nlohmann::json & j) {
+ if (!j.contains("type") || !j["type"].is_string()) {
+ throw std::runtime_error("Parser variant JSON missing or invalid 'type' field");
+ }
+
+ std::string type = j["type"];
+
+ if (type == "epsilon") {
+ return common_peg_epsilon_parser{};
+ }
+ if (type == "start") {
+ return common_peg_start_parser{};
+ }
+ if (type == "end") {
+ return common_peg_end_parser{};
+ }
+ if (type == "literal") {
+ if (!j.contains("literal") || !j["literal"].is_string()) {
+ throw std::runtime_error("literal parser missing or invalid 'literal' field");
+ }
+ return common_peg_literal_parser{j["literal"]};
+ }
+ if (type == "sequence") {
+ if (!j.contains("children") || !j["children"].is_array()) {
+ throw std::runtime_error("sequence parser missing or invalid 'children' field");
+ }
+ return common_peg_sequence_parser{j["children"].get<std::vector<common_peg_parser_id>>()};
+ }
+ if (type == "choice") {
+ if (!j.contains("children") || !j["children"].is_array()) {
+ throw std::runtime_error("choice parser missing or invalid 'children' field");
+ }
+ return common_peg_choice_parser{j["children"].get<std::vector<common_peg_parser_id>>()};
+ }
+ if (type == "repetition") {
+ if (!j.contains("child") || !j.contains("min_count") || !j.contains("max_count")) {
+ throw std::runtime_error("repetition parser missing required fields");
+ }
+ return common_peg_repetition_parser{
+ j["child"].get<common_peg_parser_id>(),
+ j["min_count"].get<int>(),
+ j["max_count"].get<int>()
+ };
+ }
+ if (type == "and") {
+ if (!j.contains("child")) {
+ throw std::runtime_error("and parser missing 'child' field");
+ }
+ return common_peg_and_parser{j["child"].get<common_peg_parser_id>()};
+ }
+ if (type == "not") {
+ if (!j.contains("child")) {
+ throw std::runtime_error("not parser missing 'child' field");
+ }
+ return common_peg_not_parser{j["child"].get<common_peg_parser_id>()};
+ }
+ if (type == "any") {
+ return common_peg_any_parser{};
+ }
+ if (type == "space") {
+ return common_peg_space_parser{};
+ }
+ if (type == "chars") {
+ if (!j.contains("pattern") || !j.contains("ranges") || !j.contains("negated") ||
+ !j.contains("min_count") || !j.contains("max_count")) {
+ throw std::runtime_error("chars parser missing required fields");
+ }
+ common_peg_chars_parser parser;
+ parser.pattern = j["pattern"];
+ parser.negated = j["negated"];
+ parser.min_count = j["min_count"];
+ parser.max_count = j["max_count"];
+ for (const auto & range_json : j["ranges"]) {
+ if (!range_json.contains("start") || !range_json.contains("end")) {
+ throw std::runtime_error("char_range missing 'start' or 'end' field");
+ }
+ parser.ranges.push_back({
+ range_json["start"].get<uint32_t>(),
+ range_json["end"].get<uint32_t>()
+ });
+ }
+ return parser;
+ }
+ if (type == "json_string") {
+ return common_peg_json_string_parser{};
+ }
+ if (type == "until") {
+ if (!j.contains("delimiters") || !j["delimiters"].is_array()) {
+ throw std::runtime_error("until parser missing or invalid 'delimiters' field");
+ }
+ return common_peg_until_parser{j["delimiters"].get<std::vector<std::string>>()};
+ }
+ if (type == "schema") {
+ if (!j.contains("child") || !j.contains("name") || !j.contains("schema") || !j.contains("raw")) {
+ throw std::runtime_error("schema parser missing required fields");
+ }
+ common_peg_schema_parser parser;
+ parser.child = j["child"].get<common_peg_parser_id>();
+ parser.name = j["name"];
+ if (!j["schema"].is_null()) {
+ parser.schema = std::make_shared<nlohmann::ordered_json>(j["schema"]);
+ }
+ parser.raw = j["raw"].get<bool>();
+ return parser;
+ }
+ if (type == "rule") {
+ if (!j.contains("name") || !j.contains("child") || !j.contains("trigger")) {
+ throw std::runtime_error("rule parser missing required fields");
+ }
+ return common_peg_rule_parser{
+ j["name"].get<std::string>(),
+ j["child"].get<common_peg_parser_id>(),
+ j["trigger"].get<bool>()
+ };
+ }
+ if (type == "ref") {
+ if (!j.contains("name") || !j["name"].is_string()) {
+ throw std::runtime_error("ref parser missing or invalid 'name' field");
+ }
+ return common_peg_ref_parser{j["name"]};
+ }
+ if (type == "atomic") {
+ if (!j.contains("child")) {
+ throw std::runtime_error("tag parser missing required fields");
+ }
+ return common_peg_atomic_parser{
+ j["child"].get<common_peg_parser_id>(),
+ };
+ }
+ if (type == "tag") {
+ if (!j.contains("child") || !j.contains("tag")) {
+ throw std::runtime_error("tag parser missing required fields");
+ }
+ return common_peg_tag_parser{
+ j["child"].get<common_peg_parser_id>(),
+ j["tag"].get<std::string>(),
+ };
+ }
+
+ throw std::runtime_error("Unknown parser type: " + type);
+}
+
+// Rebuilds an arena from its JSON form: a "parsers" array (one entry per
+// parser variant), a "rules" object mapping rule names to parser IDs, and a
+// "root" parser ID.
+//
+// Throws std::runtime_error when one of those top-level fields is missing or
+// has the wrong kind, or when a rule or the root refers to a parser ID that is
+// not in the arena. A root equal to COMMON_PEG_INVALID_PARSER_ID is accepted.
+common_peg_arena common_peg_arena::from_json(const nlohmann::json & j) {
+    const bool has_parsers = j.contains("parsers") && j["parsers"].is_array();
+    if (!has_parsers) {
+        throw std::runtime_error("JSON missing or invalid 'parsers' array");
+    }
+    const bool has_rules = j.contains("rules") && j["rules"].is_object();
+    if (!has_rules) {
+        throw std::runtime_error("JSON missing or invalid 'rules' object");
+    }
+    if (!j.contains("root")) {
+        throw std::runtime_error("JSON missing 'root' field");
+    }
+
+    common_peg_arena result;
+
+    // Parsers are stored positionally, so their index in the array is their ID.
+    const auto & serialized_parsers = j["parsers"];
+    result.parsers_.reserve(serialized_parsers.size());
+    for (auto it = serialized_parsers.begin(); it != serialized_parsers.end(); ++it) {
+        result.parsers_.push_back(deserialize_parser_variant(*it));
+    }
+    const size_t parser_count = result.parsers_.size();
+
+    result.rules_ = j["rules"].get<std::unordered_map<std::string, common_peg_parser_id>>();
+    for (const auto & rule : result.rules_) {
+        if (rule.second >= parser_count) {
+            throw std::runtime_error("Rule '" + rule.first + "' references invalid parser ID: " + std::to_string(rule.second));
+        }
+    }
+
+    result.root_ = j["root"].get<common_peg_parser_id>();
+    const bool root_is_unset = result.root_ == COMMON_PEG_INVALID_PARSER_ID;
+    if (!root_is_unset && result.root_ >= parser_count) {
+        throw std::runtime_error("Root references invalid parser ID: " + std::to_string(result.root_));
+    }
+
+    return result;
+}
+
+// Serializes this arena to text: the JSON produced by to_json(), dumped to a
+// string. The result is the format load() reads back.
+std::string common_peg_arena::save() const {
+    const nlohmann::json serialized = to_json();
+    return serialized.dump();
+}
+
+// Replaces the contents of this arena with the one described by `data`, which
+// must be JSON text in the layout from_json() accepts. *this is assigned only
+// after parsing and validation succeed; failures propagate as exceptions.
+void common_peg_arena::load(const std::string & data) {
+    const nlohmann::json parsed = nlohmann::json::parse(data);
+    *this = from_json(parsed);
+}
+
+// Convenience entry point: hands a fresh builder to `fn`, installs the parser
+// that `fn` returns as the root, and returns the arena produced by build().
+common_peg_arena build_peg_parser(const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn) {
+    common_peg_parser_builder peg_builder;
+    const common_peg_parser root = fn(peg_builder);
+    peg_builder.set_root(root);
+    return peg_builder.build();
+}
--- /dev/null
+#pragma once
+
+#include <nlohmann/json_fwd.hpp>
+
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <initializer_list>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <variant>
+#include <vector>
+
+struct common_grammar_builder;
+
+class common_peg_parser_builder;
+
+using common_peg_parser_id = size_t;
+constexpr common_peg_parser_id COMMON_PEG_INVALID_PARSER_ID = static_cast<common_peg_parser_id>(-1);
+
+using common_peg_ast_id = size_t;
+constexpr common_peg_ast_id COMMON_PEG_INVALID_AST_ID = static_cast<common_peg_ast_id>(-1);
+
+// Lightweight wrapper around common_peg_parser_id for convenience; pairs the ID with the builder passed at construction so parsers can be combined with operators.
+class common_peg_parser {
+ common_peg_parser_id id_; // ID of the wrapped parser
+ common_peg_parser_builder & builder_; // held by reference: the builder must outlive this handle
+
+ public:
+ common_peg_parser(const common_peg_parser & other) : id_(other.id_), builder_(other.builder_) {}
+ common_peg_parser(common_peg_parser_id id, common_peg_parser_builder & builder) : id_(id), builder_(builder) {}
+
+ common_peg_parser & operator=(const common_peg_parser & other); // user-declared: the reference member rules out the implicit copy assignment
+ common_peg_parser & operator+=(const common_peg_parser & other); // NOTE(review): presumably the in-place form of operator+ (sequence); defined elsewhere, confirm
+ common_peg_parser & operator|=(const common_peg_parser & other); // NOTE(review): presumably the in-place form of operator| (choice); defined elsewhere, confirm
+
+ operator common_peg_parser_id() const { return id_; } // implicit, so a handle can be passed wherever a parser ID is expected
+ common_peg_parser_id id() const { return id_; }
+
+ common_peg_parser_builder & builder() const { return builder_; }
+
+ // Creates a sequence
+ common_peg_parser operator+(const common_peg_parser & other) const;
+
+ // Creates a sequence separated by spaces.
+ common_peg_parser operator<<(const common_peg_parser & other) const;
+
+ // Creates a choice
+ common_peg_parser operator|(const common_peg_parser & other) const;
+ // String overloads of the operators above. NOTE(review): presumably the string is wrapped as a literal parser; definitions are not visible here, confirm.
+ common_peg_parser operator+(const char * str) const;
+ common_peg_parser operator+(const std::string & str) const;
+ common_peg_parser operator<<(const char * str) const;
+ common_peg_parser operator<<(const std::string & str) const;
+ common_peg_parser operator|(const char * str) const;
+ common_peg_parser operator|(const std::string & str) const;
+};
+
+common_peg_parser operator+(const char * str, const common_peg_parser & p);
+common_peg_parser operator+(const std::string & str, const common_peg_parser & p);
+common_peg_parser operator<<(const char * str, const common_peg_parser & p);
+common_peg_parser operator<<(const std::string & str, const common_peg_parser & p);
+common_peg_parser operator|(const char * str, const common_peg_parser & p);
+common_peg_parser operator|(const std::string & str, const common_peg_parser & p);
+
+enum common_peg_parse_result_type { // Outcome of a parse attempt; queried through common_peg_parse_result::fail()/success()/need_more_input().
+ COMMON_PEG_PARSE_RESULT_FAIL = 0, // also the default type of a default-constructed common_peg_parse_result
+ COMMON_PEG_PARSE_RESULT_SUCCESS = 1,
+ COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT = 2, // the partial-parse outcome referred to by the atomic() documentation
+};
+
+const char * common_peg_parse_result_type_name(common_peg_parse_result_type type);
+
+struct common_peg_ast_node { // One AST node; created and stored by common_peg_ast_arena::add_node, which fills the members in declaration order.
+ common_peg_ast_id id; // index of this node in its arena (add_node assigns the arena size before insertion)
+ std::string rule; // NOTE(review): presumably the name of the rule that produced the node; confirm in peg-parser.cpp
+ std::string tag; // NOTE(review): presumably the label given to tag(); confirm what untagged nodes carry
+ size_t start; // NOTE(review): start/end look like offsets into the parsed input; confirm whether end is exclusive
+ size_t end;
+ std::string_view text; // non-owning view: the viewed characters must outlive the node (presumably they belong to the parse context's input; confirm)
+ std::vector<common_peg_ast_id> children; // IDs of child nodes
+
+ bool is_partial = false;
+};
+
+struct common_peg_parse_result;
+
+using common_peg_ast_visitor = std::function<void(const common_peg_ast_node & node)>;
+
+// Append-only storage for AST nodes. A node's ID is its index in the arena.
+class common_peg_ast_arena {
+    std::vector<common_peg_ast_node> nodes_;
+
+  public:
+    // Appends a node built from the arguments and returns its ID, which is the
+    // arena size before the insertion. `text` is stored as a non-owning view.
+    common_peg_ast_id add_node(
+        const std::string & rule,
+        const std::string & tag,
+        size_t start,
+        size_t end,
+        std::string_view text,
+        std::vector<common_peg_ast_id> children,
+        bool is_partial = false
+    ) {
+        common_peg_ast_node node{nodes_.size(), rule, tag, start, end, text, std::move(children), is_partial};
+        const common_peg_ast_id new_id = node.id;
+        nodes_.push_back(std::move(node));
+        return new_id;
+    }
+
+    // Bounds-checked lookup: std::vector::at throws std::out_of_range for an unknown ID.
+    const common_peg_ast_node & get(common_peg_ast_id id) const {
+        return nodes_.at(id);
+    }
+
+    // Number of nodes currently stored.
+    size_t size() const {
+        return nodes_.size();
+    }
+
+    // Drops every node.
+    void clear() {
+        nodes_.clear();
+    }
+
+    // Traversal helpers, defined out of line.
+    void visit(common_peg_ast_id id, const common_peg_ast_visitor & visitor) const;
+    void visit(const common_peg_parse_result & result, const common_peg_ast_visitor & visitor) const;
+};
+
+// Result of running a parser: the outcome, the [start, end] positions it
+// reports, and the IDs of the AST nodes it produced.
+struct common_peg_parse_result {
+    common_peg_parse_result_type type = COMMON_PEG_PARSE_RESULT_FAIL;
+    size_t start = 0;
+    size_t end   = 0;
+
+    std::vector<common_peg_ast_id> nodes;
+
+    // A default-constructed result is a failure at position 0 with no nodes.
+    common_peg_parse_result() = default;
+
+    // Result that consumed nothing: end is set equal to start.
+    common_peg_parse_result(common_peg_parse_result_type result_type, size_t start_pos)
+        : common_peg_parse_result(result_type, start_pos, start_pos) {}
+
+    common_peg_parse_result(common_peg_parse_result_type result_type, size_t start_pos, size_t end_pos)
+        : type(result_type), start(start_pos), end(end_pos) {}
+
+    common_peg_parse_result(common_peg_parse_result_type result_type, size_t start_pos, size_t end_pos, std::vector<common_peg_ast_id> ast_nodes)
+        : type(result_type), start(start_pos), end(end_pos), nodes(std::move(ast_nodes)) {}
+
+    // Exactly one of the three predicates below is true for any result.
+    bool fail() const {
+        return type == COMMON_PEG_PARSE_RESULT_FAIL;
+    }
+    bool need_more_input() const {
+        return type == COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT;
+    }
+    bool success() const {
+        return type == COMMON_PEG_PARSE_RESULT_SUCCESS;
+    }
+};
+
+// State for one parse run: a copy of the text to parse, the is_partial flag,
+// the AST arena that nodes are added to, and a depth counter starting at 0.
+struct common_peg_parse_context {
+    std::string input;
+    bool is_partial = false;
+    common_peg_ast_arena ast;
+
+    int parse_depth = 0;
+
+    // Empty input, not partial.
+    common_peg_parse_context() = default;
+
+    // Copies `input`; not partial.
+    common_peg_parse_context(const std::string & input)
+        : input(input) {}
+
+    // Copies `input` and records whether it should be treated as partial.
+    common_peg_parse_context(const std::string & input, bool is_partial)
+        : input(input), is_partial(is_partial) {}
+};
+
+class common_peg_arena;
+
+// Parser variants: plain data describing one parser each. Members that name other parsers hold IDs into the owning common_peg_arena.
+struct common_peg_epsilon_parser {}; // matches nothing, always succeeds (builder: eps())
+
+struct common_peg_start_parser {}; // matches the start of the input (builder: start())
+
+struct common_peg_end_parser {}; // matches the end of the input (builder: end())
+
+struct common_peg_literal_parser { // matches an exact literal string (builder: literal())
+ std::string literal;
+};
+
+struct common_peg_sequence_parser { // children matched in order, all must succeed (builder: sequence())
+ std::vector<common_peg_parser_id> children;
+};
+
+struct common_peg_choice_parser { // first child that succeeds wins (builder: choice())
+ std::vector<common_peg_parser_id> children;
+};
+
+struct common_peg_repetition_parser { // child repeated between min_count and max_count times (builder: repeat(), optional(), zero_or_more(), one_or_more())
+ common_peg_parser_id child;
+ int min_count;
+ int max_count; // -1 for unbounded
+};
+
+struct common_peg_and_parser { // positive lookahead (builder: peek())
+ common_peg_parser_id child;
+};
+
+struct common_peg_not_parser { // negative lookahead (builder: negate())
+ common_peg_parser_id child;
+};
+
+struct common_peg_any_parser {}; // matches any single character (builder: any())
+
+struct common_peg_space_parser {}; // matches zero or more whitespace characters (builder: space())
+
+struct common_peg_chars_parser { // repetitions of characters from a character class (builder: chars())
+ struct char_range {
+ uint32_t start;
+ uint32_t end;
+ bool contains(uint32_t codepoint) const { return codepoint >= start && codepoint <= end; } // both bounds inclusive
+ };
+
+ std::string pattern; // NOTE(review): presumably the class text passed to chars(); confirm
+ std::vector<char_range> ranges;
+ bool negated; // NOTE(review): presumably true when the class is negated; confirm
+ int min_count;
+ int max_count; // -1 for unbounded
+};
+
+struct common_peg_json_string_parser {}; // NOTE(review): presumably backs json_string()/json_string_content(); confirm in peg-parser.cpp
+
+struct common_peg_until_parser { // matches until one of the delimiters is found, delimiter not consumed (builder: until(), until_one_of())
+ std::vector<std::string> delimiters; // rest() passes an empty list
+};
+
+struct common_peg_schema_parser { // wraps a parser with JSON schema metadata for grammar generation (builder: schema())
+ common_peg_parser_id child;
+ std::string name;
+ std::shared_ptr<nlohmann::ordered_json> schema; // left null by the JSON deserializer when the serialized schema is null
+
+ // Indicates if the GBNF should accept a raw string that matches the schema.
+ bool raw;
+};
+
+struct common_peg_rule_parser { // named rule (builder: rule(), trigger_rule())
+ std::string name;
+ common_peg_parser_id child;
+ bool trigger; // marks the rule as an entry point for lazy grammar generation
+};
+
+struct common_peg_ref_parser { // reference to a named rule, resolved during build() (builder: ref())
+ std::string name;
+};
+
+struct common_peg_atomic_parser { // suppresses AST nodes for partial parses of the child (builder: atomic())
+ common_peg_parser_id child;
+};
+
+struct common_peg_tag_parser { // creates a tagged AST node for the child (builder: tag())
+ common_peg_parser_id child;
+ std::string tag;
+};
+
+// Variant holding all parser types
+using common_peg_parser_variant = std::variant<
+ common_peg_epsilon_parser,
+ common_peg_start_parser,
+ common_peg_end_parser,
+ common_peg_literal_parser,
+ common_peg_sequence_parser,
+ common_peg_choice_parser,
+ common_peg_repetition_parser,
+ common_peg_and_parser,
+ common_peg_not_parser,
+ common_peg_any_parser,
+ common_peg_space_parser,
+ common_peg_chars_parser,
+ common_peg_json_string_parser,
+ common_peg_until_parser,
+ common_peg_schema_parser,
+ common_peg_rule_parser,
+ common_peg_ref_parser,
+ common_peg_atomic_parser,
+ common_peg_tag_parser
+>;
+
+class common_peg_arena { // Owns the parser variants of a grammar; a common_peg_parser_id is an index into parsers_.
+ std::vector<common_peg_parser_variant> parsers_;
+ std::unordered_map<std::string, common_peg_parser_id> rules_; // rule name -> parser ID
+ common_peg_parser_id root_ = COMMON_PEG_INVALID_PARSER_ID; // stays invalid until set_root() or from_json()/load() supplies a root
+
+ public:
+ const common_peg_parser_variant & get(common_peg_parser_id id) const { return parsers_.at(id); } // bounds-checked: std::vector::at throws std::out_of_range
+ common_peg_parser_variant & get(common_peg_parser_id id) { return parsers_.at(id); }
+
+ size_t size() const { return parsers_.size(); }
+ bool empty() const { return parsers_.empty(); }
+
+ common_peg_parser_id get_rule(const std::string & name) const; // NOTE(review): behavior for an unknown name is defined elsewhere; check with has_rule() first
+ bool has_rule(const std::string & name) const { return rules_.find(name) != rules_.end(); }
+
+ common_peg_parser_id root() const { return root_; }
+ void set_root(common_peg_parser_id id) { root_ = id; } // no validation of `id` here
+
+ common_peg_parse_result parse(common_peg_parse_context & ctx, size_t start = 0) const; // NOTE(review): presumably parses with root(); confirm in peg-parser.cpp
+ common_peg_parse_result parse(common_peg_parser_id id, common_peg_parse_context & ctx, size_t start) const;
+
+ void resolve_refs(); // NOTE(review): presumably resolves ref parsers against rules_; confirm
+
+ void build_grammar(const common_grammar_builder & builder, bool lazy = false) const; // NOTE(review): lazy presumably restricts output to trigger rules and their descendants; confirm
+
+ std::string dump(common_peg_parser_id id) const;
+
+ nlohmann::json to_json() const;
+ static common_peg_arena from_json(const nlohmann::json & j); // throws std::runtime_error on missing fields or out-of-range rule/root IDs
+
+ std::string save() const; // returns to_json().dump()
+ void load(const std::string & data); // replaces *this with from_json(nlohmann::json::parse(data))
+
+ friend class common_peg_parser_builder; // the builder calls the private add_parser()
+
+ private:
+ common_peg_parser_id add_parser(common_peg_parser_variant parser);
+ void add_rule(const std::string & name, common_peg_parser_id id);
+
+ common_peg_parser_id resolve_ref(common_peg_parser_id id);
+};
+
+class common_peg_parser_builder { // Builds a common_peg_arena: the inline combinators below add a parser variant to arena_ and return a handle bound to this builder.
+ common_peg_arena arena_; // arena under construction
+
+ common_peg_parser wrap(common_peg_parser_id id) { return common_peg_parser(id, *this); }
+ common_peg_parser add(const common_peg_parser_variant & p) { return wrap(arena_.add_parser(p)); }
+
+ public:
+ common_peg_parser_builder();
+
+ // Matches nothing, always succeeds.
+ // S -> ε
+ common_peg_parser eps() { return add(common_peg_epsilon_parser{}); }
+
+ // Matches the start of the input.
+ // S -> ^
+ common_peg_parser start() { return add(common_peg_start_parser{}); }
+
+ // Matches the end of the input.
+ // S -> $
+ common_peg_parser end() { return add(common_peg_end_parser{}); }
+
+ // Matches an exact literal string.
+ // S -> "hello"
+ common_peg_parser literal(const std::string & literal) { return add(common_peg_literal_parser{literal}); }
+
+ // Matches a sequence of parsers in order, all must succeed.
+ // S -> A B C
+ common_peg_parser sequence() { return add(common_peg_sequence_parser{}); }
+ common_peg_parser sequence(const std::vector<common_peg_parser_id> & parsers);
+ common_peg_parser sequence(const std::vector<common_peg_parser> & parsers);
+ common_peg_parser sequence(std::initializer_list<common_peg_parser> parsers);
+
+ // Matches the first parser that succeeds from a list of alternatives.
+ // S -> A | B | C
+ common_peg_parser choice() { return add(common_peg_choice_parser{}); }
+ common_peg_parser choice(const std::vector<common_peg_parser_id> & parsers);
+ common_peg_parser choice(const std::vector<common_peg_parser> & parsers);
+ common_peg_parser choice(std::initializer_list<common_peg_parser> parsers);
+
+ // Matches one or more repetitions of a parser.
+ // S -> A+
+ common_peg_parser one_or_more(const common_peg_parser & p) { return repeat(p, 1, -1); }
+
+ // Matches zero or more repetitions of a parser, always succeeds.
+ // S -> A*
+ common_peg_parser zero_or_more(const common_peg_parser & p) { return repeat(p, 0, -1); }
+
+ // Matches zero or one occurrence of a parser, always succeeds.
+ // S -> A?
+ common_peg_parser optional(const common_peg_parser & p) { return repeat(p, 0, 1); }
+
+ // Positive lookahead: succeeds if child parser succeeds, consumes no input.
+ // S -> &A
+ common_peg_parser peek(const common_peg_parser & p) { return add(common_peg_and_parser{p}); }
+
+ // Negative lookahead: succeeds if child parser fails, consumes no input.
+ // S -> !A
+ common_peg_parser negate(const common_peg_parser & p) { return add(common_peg_not_parser{p}); }
+
+ // Matches any single character.
+ // S -> .
+ common_peg_parser any() { return add(common_peg_any_parser{}); }
+
+ // Matches between min and max repetitions of characters from a character class.
+ // S -> [a-z]{m,n}
+ //
+ // Use -1 for max to represent unbounded repetition (equivalent to {m,})
+ common_peg_parser chars(const std::string & classes, int min = 1, int max = -1);
+
+ // Creates a lightweight reference to a named rule (resolved during build()).
+ // Use this for forward references in recursive grammars.
+ // expr_ref -> expr
+ common_peg_parser ref(const std::string & name) { return add(common_peg_ref_parser{name}); }
+
+ // Matches zero or more whitespace characters (space, tab, newline).
+ // S -> [ \t\n]*
+ common_peg_parser space() { return add(common_peg_space_parser{}); }
+
+ // Matches all characters until a delimiter is found (delimiter not consumed).
+ // S -> (!delim .)*
+ common_peg_parser until(const std::string & delimiter) { return add(common_peg_until_parser{{delimiter}}); }
+
+ // Matches all characters until one of the delimiters in the list is found (delimiter not consumed).
+ // S -> (!delim .)*
+ common_peg_parser until_one_of(const std::vector<std::string> & delimiters) { return add(common_peg_until_parser{delimiters}); }
+
+ // Matches everything remaining (an until parser with an empty delimiter list).
+ // S -> .*
+ common_peg_parser rest() { return until_one_of({}); }
+
+ // Matches between min and max repetitions of a parser (inclusive).
+ // S -> A{m,n}
+ // Use -1 for max to represent unbounded repetition (equivalent to {m,})
+ common_peg_parser repeat(const common_peg_parser & p, int min, int max) { return add(common_peg_repetition_parser{p, min,max}); }
+
+ // Matches exactly n repetitions of a parser.
+ // S -> A{n}
+ common_peg_parser repeat(const common_peg_parser & p, int n) { return repeat(p, n, n); }
+
+ // Creates a complete JSON parser supporting objects, arrays, strings, numbers, booleans, and null.
+ // value -> object | array | string | number | true | false | null
+ common_peg_parser json();
+ common_peg_parser json_object();
+ common_peg_parser json_string();
+ common_peg_parser json_array();
+ common_peg_parser json_number();
+ common_peg_parser json_bool();
+ common_peg_parser json_null();
+
+ // Matches JSON string content without the surrounding quotes.
+ // Useful for extracting content within a JSON string.
+ common_peg_parser json_string_content();
+
+ // Matches a JSON object member with a key and associated parser as the
+ // value.
+ common_peg_parser json_member(const std::string & key, const common_peg_parser & p);
+
+ // Wraps a parser with JSON schema metadata for grammar generation.
+ // Used internally to convert JSON schemas to GBNF grammar rules.
+ common_peg_parser schema(const common_peg_parser & p, const std::string & name, const nlohmann::ordered_json & schema, bool raw = false);
+
+ // Creates a named rule, stores it in the grammar, and returns a ref.
+ // If trigger=true, marks this rule as an entry point for lazy grammar generation.
+ // auto json = p.rule("json", json_obj | json_arr | ...)
+ common_peg_parser rule(const std::string & name, const common_peg_parser & p, bool trigger = false);
+
+ // Creates a named rule using a builder function, and returns a ref.
+ // If trigger=true, marks this rule as an entry point for lazy grammar generation.
+ // auto json = p.rule("json", [&]() { return json_object() | json_array() | ... })
+ common_peg_parser rule(const std::string & name, const std::function<common_peg_parser()> & builder, bool trigger = false);
+
+ // Creates a trigger rule. When generating a lazy grammar from the parser,
+ // only trigger rules and their descendants are emitted.
+ common_peg_parser trigger_rule(const std::string & name, const common_peg_parser & p) { return rule(name, p, true); }
+ common_peg_parser trigger_rule(const std::string & name, const std::function<common_peg_parser()> & builder) { return rule(name, builder, true); }
+
+ // Creates an atomic parser. Atomic parsers do not create an AST node if
+ // the child results in a partial parse, i.e. COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT.
+ // This is intended for situations where partial output is undesirable.
+ common_peg_parser atomic(const common_peg_parser & p) { return add(common_peg_atomic_parser{p}); }
+
+ // Tags create nodes in the generated AST for semantic purposes.
+ // Unlike rules, you can tag multiple nodes with the same tag.
+ common_peg_parser tag(const std::string & tag, const common_peg_parser & p) { return add(common_peg_tag_parser{p.id(), tag}); }
+
+ void set_root(const common_peg_parser & p);
+
+ common_peg_arena build();
+};
+
+// Helper function for building parsers
+common_peg_arena build_peg_parser(const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn);
--- /dev/null
+#include "unicode.h"
+
+// implementation adopted from src/unicode.cpp
+
+// Returns the length in bytes (1-4) that the lead byte announces for a UTF-8
+// sequence, judged only by its top four bits. No validation is done here:
+// continuation bytes (0x80-0xBF) report 1 and every byte from 0xF0 up reports 4.
+size_t utf8_sequence_length(unsigned char first_byte) {
+    const unsigned top_nibble = (first_byte & 0xFFu) >> 4;
+
+    if (top_nibble < 0xC) {
+        return 1; // 0x00-0xBF: ASCII, or a stray continuation byte
+    }
+    if (top_nibble < 0xE) {
+        return 2; // 0xC0-0xDF
+    }
+    if (top_nibble == 0xE) {
+        return 3; // 0xE0-0xEF
+    }
+    return 4;     // 0xF0-0xFF
+}
+
+// Decodes the UTF-8 sequence that starts at input[offset].
+//
+// Returns:
+//   SUCCESS    - codepoint and bytes_consumed (1-4) describe the decoded sequence.
+//   INCOMPLETE - offset is at or past the end of input, or the input ends before
+//                the sequence is complete and every byte seen so far is well formed.
+//   INVALID    - the lead byte is a continuation byte (10xxxxxx) or 11111xxx, or
+//                one of the continuation bytes that is present is not 10xxxxxx.
+//
+// Continuation bytes are validated in order as far as the input reaches, so a
+// truncated sequence that is already malformed is reported as INVALID instead
+// of INCOMPLETE; a streaming caller never waits for more input on data that
+// cannot become valid.
+//
+// Only the byte structure is checked: overlong encodings, surrogate code points
+// and values above U+10FFFF are decoded as-is.
+utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset) {
+    if (offset >= input.size()) {
+        return utf8_parse_result(utf8_parse_result::INCOMPLETE);
+    }
+
+    // Work on unsigned bytes so masks and shifts never see a negative char.
+    const auto lead = static_cast<unsigned char>(input[offset]);
+
+    // ASCII fast path
+    if (lead < 0x80) {
+        return utf8_parse_result(utf8_parse_result::SUCCESS, lead, 1);
+    }
+
+    // Classify the lead byte: total sequence length and its payload bits.
+    size_t   length    = 0;
+    uint32_t codepoint = 0;
+    if ((lead & 0xe0) == 0xc0) {
+        length    = 2;
+        codepoint = lead & 0x1f;
+    } else if ((lead & 0xf0) == 0xe0) {
+        length    = 3;
+        codepoint = lead & 0x0f;
+    } else if ((lead & 0xf8) == 0xf0) {
+        length    = 4;
+        codepoint = lead & 0x07;
+    } else {
+        // Continuation byte used as a lead byte, or 11111xxx.
+        return utf8_parse_result(utf8_parse_result::INVALID);
+    }
+
+    for (size_t i = 1; i < length; ++i) {
+        if (offset + i >= input.size()) {
+            // Everything available so far was well formed; more bytes are needed.
+            return utf8_parse_result(utf8_parse_result::INCOMPLETE);
+        }
+        const auto next = static_cast<unsigned char>(input[offset + i]);
+        if ((next & 0xc0) != 0x80) {
+            return utf8_parse_result(utf8_parse_result::INVALID);
+        }
+        codepoint = (codepoint << 6) | (next & 0x3f);
+    }
+
+    return utf8_parse_result(utf8_parse_result::SUCCESS, codepoint, length);
+}
--- /dev/null
+#pragma once
+
+#include <cstdint>
+#include <string_view>
+
+// UTF-8 parsing utilities for streaming-aware unicode support
+
+struct utf8_parse_result { // Outcome of decoding one code point with parse_utf8_codepoint().
+ uint32_t codepoint; // Decoded codepoint (only valid if status == SUCCESS)
+ size_t bytes_consumed; // How many bytes this codepoint uses (1-4); left at 0 when parse_utf8_codepoint() reports INCOMPLETE or INVALID
+ enum status { SUCCESS, INCOMPLETE, INVALID } status; // INCOMPLETE: the input ended before the sequence did; INVALID: malformed lead or continuation byte
+
+ utf8_parse_result(enum status s, uint32_t cp = 0, size_t bytes = 0) // `enum status` is spelled out because the member `status` hides the enum name
+ : codepoint(cp), bytes_consumed(bytes), status(s) {}
+};
+
+// Determine the expected length (1-4) of a UTF-8 sequence from its first byte.
+// Invalid first bytes are not rejected: continuation bytes (0x80-0xBF) yield 1 and 0xF8-0xFF yield 4.
+size_t utf8_sequence_length(unsigned char first_byte);
+
+// Parse a single UTF-8 codepoint from input
+utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset);
--- /dev/null
+# Parsing Model Output
+
+The `common` library contains a PEG parser implementation suitable for parsing
+model output.
+
+Types with the prefix `common_peg_*` are intended for general use and may have
+applications beyond parsing model output, such as parsing user-provided regex
+patterns.
+
+Types with the prefix `common_chat_peg_*` are specialized helpers for model
+output.
+
+The parser features:
+
+- Partial parsing of streaming input
+- Built-in JSON parsers
+- AST generation with semantics via "tagged" nodes
+
+## Example
+
+Below is a contrived example demonstrating how to use the PEG parser to parse
+output from a model that emits arguments as JSON.
+
+```cpp
+auto parser = build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
+ // Build a choice of all available tools
+ auto tool_choice = p.choice();
+ for (const auto & tool : tools) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ const auto & schema = function.at("parameters");
+
+ auto tool_name = p.json_member("name", "\"" + p.literal(name) + "\"");
+ auto tool_args = p.json_member("arguments", p.schema(p.json(), "tool-" + name + "-schema", schema));
+
+ tool_choice |= p.rule("tool-" + name, "{" << tool_name << "," << tool_args << "}");
+ }
+
+ // Define the tool call structure: <tool_call>[{tool}]</tool_call>
+ auto tool_call = p.trigger_rule("tool-call",
+ p.sequence({
+ p.literal("<tool_call>["),
+ tool_choice,
+ p.literal("]</tool_call>")
+ })
+ );
+
+ // Parser accepts content, optionally followed by a tool call
+ return p.sequence({
+ p.content(p.until("<tool_call>")),
+ p.optional(tool_call),
+ p.end()
+ });
+});
+```
+
+For a more complete example, see `test_example_native()` in
+[tests/test-chat-peg-parser.cpp](tests/test-chat-peg-parser.cpp).
+
+## Parsers/Combinators
+
+### Basic Matchers
+
+- **`eps()`** - Matches nothing and always succeeds (epsilon/empty match)
+- **`start()`** - Matches the start of input (anchor `^`)
+- **`end()`** - Matches the end of input (anchor `$`)
+- **`literal(string)`** - Matches an exact literal string
+- **`any()`** - Matches any single character (`.`)
+
+### Combinators
+
+- **`sequence(...)`** - Matches parsers in order; all must succeed
+- **`choice(...)`** - Matches the first parser that succeeds from alternatives (ordered choice)
+- **`one_or_more(p)`** - Matches one or more repetitions (`+`)
+- **`zero_or_more(p)`** - Matches zero or more repetitions (`*`)
+- **`optional(p)`** - Matches zero or one occurrence (`?`)
+- **`repeat(p, min, max)`** - Matches between min and max repetitions (use `-1` for unbounded)
+- **`repeat(p, n)`** - Matches exactly n repetitions
+
+### Lookahead
+
+- **`peek(p)`** - Positive lookahead: succeeds if parser succeeds without consuming input (`&`)
+- **`negate(p)`** - Negative lookahead: succeeds if parser fails without consuming input (`!`)
+
+### Character Classes & Utilities
+
+- **`chars(classes, min, max)`** - Matches repetitions of characters from a character class
+- **`space()`** - Matches zero or more whitespace characters (space, tab, newline)
+- **`until(delimiter)`** - Matches characters until delimiter is found (delimiter not consumed)
+- **`until_one_of(delimiters)`** - Matches characters until any delimiter in the list is found
+- **`rest()`** - Matches everything remaining (`.*`)
+
+### JSON Parsers
+
+- **`json()`** - Complete JSON parser (objects, arrays, strings, numbers, booleans, null)
+- **`json_object()`** - JSON object parser
+- **`json_array()`** - JSON array parser
+- **`json_string()`** - JSON string parser
+- **`json_number()`** - JSON number parser
+- **`json_bool()`** - JSON boolean parser
+- **`json_null()`** - JSON null parser
+- **`json_string_content()`** - JSON string content without surrounding quotes
+- **`json_member(key, p)`** - JSON object member with specific key and value parser
+
+### Grammar Building
+
+- **`ref(name)`** - Creates a lightweight reference to a named rule (for recursive grammars)
+- **`rule(name, p, trigger)`** - Creates a named rule and returns a reference
+- **`trigger_rule(name, p)`** - Creates a trigger rule (entry point for lazy grammar generation)
+- **`schema(p, name, schema, raw)`** - Wraps parser with JSON schema metadata for grammar generation
+
+### AST Control
+
+- **`atomic(p)`** - Prevents AST node creation for partial parses
+- **`tag(tag, p)`** - Creates AST nodes with semantic tags (multiple nodes can share tags)
+
+## GBNF Grammar Generation
+
+The PEG parser also acts as a convenient DSL for generating GBNF grammars, with
+some exceptions.
+
+```cpp
+data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ foreach_function(params.tools, [&](const json & fn) {
+ builder.resolve_refs(fn.at("parameters"));
+ });
+ parser.build_grammar(builder, data.grammar_lazy);
+});
+```
+
+The notable exception is the `negate(p)` lookahead parser, which cannot be
+expressed in a context-free grammar and therefore does not produce a rule. Its
+usage should be limited and preferably hidden behind a `schema()` parser. In many
+cases, `until(delimiter)` or `until_one_of(delimiters)` is a better choice.
+
+Another limitation is that the PEG parser requires an unambiguous grammar. In
+contrast, the `llama-grammar` implementation can support ambiguous grammars,
+though they are difficult to parse.
+
+### Lazy Grammars
+
+During lazy grammar generation, only rules reachable from a
+`trigger_rule(name, p)` are emitted in the grammar. All trigger rules are added
+as alternatives in the root rule. It is still necessary to define trigger
+patterns, as the parser has no interaction with grammar sampling.
+
+### JSON Schema
+
+The `schema(p, name, schema, raw)` parser will use the `json-schema-to-grammar`
+implementation to generate the grammar instead of the underlying parser.
+
+The `raw` option emits a grammar suitable for a raw string instead of a JSON
+string. In other words, it won't be wrapped in quotes or require escaping
+quotes. It should only be used when `type == "string"`.
+
+The downside is that it can potentially lead to ambiguous grammars. For
+example, if a user provides the pattern `^.*$`, the following grammar may be
+generated:
+
+```
+root ::= "<arg>" .* "</arg>"
+```
+
+This creates an ambiguous grammar that cannot be parsed by the PEG parser. To
+help mitigate this, if `.*` is found in the pattern, the grammar from the
+underlying parser will be emitted instead.
+
+## Common AST Shapes for Chat Parsing
+
+Most model output can be placed in one of the following categories:
+
+- Content only
+- Tool calling with arguments emitted as a single JSON object
+- Tool calling with arguments emitted as separate entities, either XML
+ (Qwen3-Coder, MiniMax M2) or pseudo-function calls (LFM2)
+
+To provide broad coverage,
+[`common/chat-peg-parser.h`](common/chat-peg-parser.h) contains builders and
+mappers that help create parsers and visitors/extractors for these types. They
+require parsers to tag nodes to conform to an AST "shape". This normalization
+makes it easy to extract information and generalize parsing.
+
+### Simple
+
+The `common_chat_peg_builder` builds a `simple` parser that supports
+content-only models with optional reasoning.
+
+- **`reasoning(p)`** - Tag node for extracting `reasoning_content`
+- **`content(p)`** - Tag node for extracting `content`
+
+```cpp
+build_chat_peg_parser([&](common_chat_peg_builder & p) {
+ return p.sequence({
+ p.optional("<think>" + p.reasoning(p.until("</think>")) + "</think>"),
+ p.content(p.until("<tool_call>")),
+ p.end()
+ });
+});
+```
+
+Use `common_chat_peg_mapper` to extract the content. Note that this is already
+done for you in `common_chat_peg_parse` when
+`syntax.format == COMMON_CHAT_FORMAT_PEG_SIMPLE`.
+
+```cpp
+auto result = parser.parse(ctx);
+
+common_chat_msg msg;
+auto mapper = common_chat_peg_mapper(msg);
+mapper.from_ast(ctx.ast, result);
+```
+
+### Native
+
+The `common_chat_peg_native_builder` builds a `native` parser suitable for
+models that emit tool arguments as a direct JSON object.
+
+- **`reasoning(p)`** - Tag node for `reasoning_content`
+- **`content(p)`** - Tag node for `content`
+- **`tool(p)`** - Tag entirety of a single tool call
+- **`tool_open(p)`** - Tag start of a tool call
+- **`tool_close(p)`** - Tag end of a tool call
+- **`tool_id(p)`** - Tag the tool call ID (optional)
+- **`tool_name(p)`** - Tag the tool name
+- **`tool_args(p)`** - Tag the tool arguments
+
+```cpp
+build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
+ auto get_weather_tool = p.tool(p.sequence({
+ p.tool_open(p.literal("{")),
+ p.json_member("name", "\"" + p.tool_name(p.literal("get_weather")) + "\""),
+ p.literal(","),
+ p.json_member("arguments", p.tool_args(p.json())),
+ p.tool_close(p.literal("}"))
+ }));
+
+ return p.sequence({
+ p.content(p.until("<tool_call>")),
+ p.literal("<tool_call>"),
+ get_weather_tool,
+ p.literal("</tool_call>"),
+ p.end()
+ });
+});
+```
+
+### Constructed
+
+The `common_chat_peg_constructed_builder` builds a `constructed` parser
+suitable for models that emit tool arguments as separate entities, such as XML
+tags.
+
+- **`reasoning(p)`** - Tag node for `reasoning_content`
+- **`content(p)`** - Tag node for `content`
+- **`tool(p)`** - Tag entirety of a single tool call
+- **`tool_open(p)`** - Tag start of a tool call
+- **`tool_close(p)`** - Tag end of a tool call
+- **`tool_name(p)`** - Tag the tool name
+- **`tool_arg(p)`** - Tag a complete tool argument (name + value)
+- **`tool_arg_open(p)`** - Tag start of a tool argument
+- **`tool_arg_close(p)`** - Tag end of a tool argument
+- **`tool_arg_name(p)`** - Tag the argument name
+- **`tool_arg_string_value(p)`** - Tag string value for the argument
+- **`tool_arg_json_value(p)`** - Tag JSON value for the argument
+
+```cpp
+build_chat_peg_constructed_parser([&](common_chat_peg_constructed_builder & p) {
+ auto location_arg = p.tool_arg(
+ p.tool_arg_open("<parameter name=\"" + p.tool_arg_name(p.literal("location")) + "\">"),
+ p.tool_arg_string_value(p.until("</parameter>")),
+ p.tool_arg_close(p.literal("</parameter>"))
+ );
+
+ auto get_weather_tool = p.tool(p.sequence({
+ p.tool_open("<function name=\"" + p.tool_name(p.literal("get_weather")) + "\">"),
+ location_arg,
+ p.tool_close(p.literal("</function>"))
+ }));
+
+ return p.sequence({
+ p.content(p.until("<tool_call>")),
+ p.literal("<tool_call>"),
+ get_weather_tool,
+ p.literal("</tool_call>"),
+ p.end()
+ });
+});
+```
*.o
ggml-common.h
**/*.swp
+!peg-parser
llama_add_compile_flags()
function(llama_build source)
+ set(TEST_SOURCES ${source} ${ARGN}) # primary source plus any extra source files passed after it
+
if (DEFINED LLAMA_TEST_NAME)
set(TEST_TARGET ${LLAMA_TEST_NAME})
else()
get_filename_component(TEST_TARGET ${source} NAME_WE)
endif()
- add_executable(${TEST_TARGET} ${source})
+ add_executable(${TEST_TARGET} ${TEST_SOURCES})
target_link_libraries(${TEST_TARGET} PRIVATE common)
install(TARGETS ${TEST_TARGET} RUNTIME)
endfunction()
set(multiValueArgs ARGS)
cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+ set(TEST_SOURCES ${source} ${LLAMA_TEST_UNPARSED_ARGUMENTS} get-model.cpp)
+
if (NOT DEFINED LLAMA_TEST_LABEL)
set(LLAMA_TEST_LABEL "main")
endif()
get_filename_component(TEST_TARGET ${source} NAME_WE)
endif()
- add_executable(${TEST_TARGET} ${source} get-model.cpp)
+ add_executable(${TEST_TARGET} ${TEST_SOURCES})
install(TARGETS ${TEST_TARGET} RUNTIME)
target_link_libraries(${TEST_TARGET} PRIVATE common)
endif()
llama_build_and_test(test-chat-parser.cpp)
+llama_build_and_test(test-chat-peg-parser.cpp peg-parser/simple-tokenize.cpp)
llama_build_and_test(test-chat-template.cpp)
llama_build_and_test(test-json-partial.cpp)
llama_build_and_test(test-log.cpp)
+llama_build_and_test(
+ test-peg-parser.cpp
+ peg-parser/simple-tokenize.cpp
+ peg-parser/test-basic.cpp
+ peg-parser/test-gbnf-generation.cpp
+ peg-parser/test-json-parser.cpp
+ peg-parser/test-json-serialization.cpp
+ peg-parser/test-unicode.cpp
+ peg-parser/testing.h
+ peg-parser/tests.h
+)
llama_build_and_test(test-regex-partial.cpp)
if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
--- /dev/null
+#include "simple-tokenize.h"
+
+// Breaks `input` into consecutive chunks. A new chunk begins at every
+// delimiter character (space, newline, tab, or one of { } , [ " ] . < > = /),
+// and the delimiter itself is kept as the first character of the chunk it
+// starts. Concatenating the returned chunks reproduces `input` exactly.
+std::vector<std::string> simple_tokenize(const std::string & input) {
+    static const std::string delimiters = " \n\t{},[\"].<>=/";
+
+    std::vector<std::string> chunks;
+    std::string pending;
+
+    for (const char ch : input) {
+        const bool starts_new_chunk = delimiters.find(ch) != std::string::npos;
+        // Flush what has been collected so far before the delimiter opens
+        // the next chunk; consecutive delimiters each get their own chunk.
+        if (starts_new_chunk && !pending.empty()) {
+            chunks.push_back(pending);
+            pending.clear();
+        }
+        pending.push_back(ch);
+    }
+
+    if (!pending.empty()) {
+        chunks.push_back(pending);
+    }
+
+    return chunks;
+}
--- /dev/null
+#pragma once
+
+#include <string>
+#include <vector>
+// Splits the input into consecutive chunks; each whitespace/punctuation delimiter starts a new chunk and stays at its front.
+std::vector<std::string> simple_tokenize(const std::string &);
--- /dev/null
+#include "tests.h"
+// Basic PEG parser checks: character classes, optional(), partial-input results for each combinator, and recursive rules.
+void test_basic(testing & t) {
+ t.test("chars", [](testing & t) {
+ // Test common escape sequences - newline
+ t.test("escape_sequence_newline", [](testing &t) {
+ auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[\\n\\t\\\\]"); });
+
+ common_peg_parse_context ctx;
+ common_peg_parse_result result;
+
+ ctx = common_peg_parse_context("\n");
+ result = common_chat_combinator_parser.parse(ctx);
+ t.assert_equal("escape_sequence_newline", true, result.success());
+ });
+
+ // Test common escape sequences - tab
+ t.test("escape_sequence_tab", [](testing &t) {
+ auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[\\n\\t\\\\]"); });
+
+ common_peg_parse_context ctx;
+ common_peg_parse_result result;
+
+ ctx = common_peg_parse_context("\t");
+ result = common_chat_combinator_parser.parse(ctx);
+ t.assert_equal("escape_sequence_tab", true, result.success());
+ });
+
+ // Test common escape sequences - backslash
+ t.test("escape_sequence_backslash", [](testing &t) {
+ auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[\\n\\t\\\\]"); });
+
+ common_peg_parse_context ctx;
+ common_peg_parse_result result;
+
+ ctx = common_peg_parse_context("\\");
+ result = common_chat_combinator_parser.parse(ctx);
+ t.assert_equal("escape_sequence_backslash", true, result.success());
+ });
+
+ // Test common escape sequences - space is not in the class, so the match should fail
+ t.test("escape_sequence_space_fail", [](testing &t) {
+ auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[\\n\\t\\\\]"); });
+
+ common_peg_parse_context ctx;
+ common_peg_parse_result result;
+
+ ctx = common_peg_parse_context(" ");
+ result = common_chat_combinator_parser.parse(ctx);
+ t.assert_equal("escape_sequence_space_fail", true, result.fail());
+ });
+
+ // Test escaped dash - 'a' should succeed
+ t.test("escaped_dash_a", [](testing &t) {
+ auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[a\\-z]"); });
+
+ common_peg_parse_context ctx;
+ common_peg_parse_result result;
+
+ ctx = common_peg_parse_context("a");
+ result = common_chat_combinator_parser.parse(ctx);
+ t.assert_equal("escaped_dash_a", true, result.success());
+ });
+
+ // Test escaped dash - '-' should succeed (literal dash)
+ t.test("escaped_dash_literal", [](testing &t) {
+ auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[a\\-z]"); });
+
+ common_peg_parse_context ctx;
+ common_peg_parse_result result;
+
+ ctx = common_peg_parse_context("-");
+ result = common_chat_combinator_parser.parse(ctx);
+ t.assert_equal("escaped_dash_literal", true, result.success());
+ });
+
+ // Test escaped dash - 'z' should succeed
+ t.test("escaped_dash_z", [](testing &t) {
+ auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[a\\-z]"); });
+
+ common_peg_parse_context ctx;
+ common_peg_parse_result result;
+
+ ctx = common_peg_parse_context("z");
+ result = common_chat_combinator_parser.parse(ctx);
+ t.assert_equal("escaped_dash_z", true, result.success());
+ });
+
+ // Test escaped dash - 'b' should NOT match (since \- is literal dash, not range)
+ t.test("escaped_dash_b_fail", [](testing &t) {
+ auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[a\\-z]"); });
+
+ common_peg_parse_context ctx;
+ common_peg_parse_result result;
+
+ ctx = common_peg_parse_context("b");
+ result = common_chat_combinator_parser.parse(ctx);
+ t.assert_equal("escaped_dash_b_fail", true, result.fail());
+ });
+ });
+
+ // optional(): the trailing " world" may be present, absent, or still arriving
+ t.test("optional", [](testing & t) {
+ // Full match with optional part present
+ t.test("optional_present", [](testing &t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+ return p.literal("hello") + p.optional(p.literal(" world"));
+ });
+
+ auto ctx = common_peg_parse_context("hello world");
+ auto result = parser.parse(ctx);
+ t.assert_equal("optional_present", true, result.success());
+ t.assert_equal("optional_present_end", 11u, result.end);
+ });
+
+ // Full match with optional part absent
+ t.test("optional_absent", [](testing &t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+ return p.literal("hello") + p.optional(p.literal(" world"));
+ });
+
+ auto ctx = common_peg_parse_context("hello", false);
+ auto result = parser.parse(ctx);
+ t.assert_equal("optional_absent", true, result.success());
+ t.assert_equal("optional_absent_end", 5u, result.end);
+ });
+
+ // Partial match - waiting for more input to determine if optional matches
+ t.test("partial_match_need_more", [](testing &t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+ return p.literal("hello") + p.optional(p.literal(" world"));
+ });
+
+ auto ctx = common_peg_parse_context("hello ", true);
+ auto result = parser.parse(ctx);
+ t.assert_equal("partial_match_need_more", true, result.need_more_input());
+ });
+ });
+
+ t.test("partial parsing", [](testing & t) {
+ // Literals - Basic Success
+ t.test("literal_success", [&](testing & t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("hello"); });
+
+ common_peg_parse_context ctx;
+ common_peg_parse_result result;
+
+ ctx = common_peg_parse_context("hello");
+ result = parser.parse(ctx);
+ t.assert_equal("literal_success", true, result.success());
+ });
+
+ // Char Classes - Basic Lowercase Success
+ t.test("char_class_lowercase_success", [&](testing & t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z"); });
+
+ common_peg_parse_context ctx;
+ common_peg_parse_result result;
+
+ ctx = common_peg_parse_context("a");
+ result = parser.parse(ctx);
+ t.assert_equal("char_class_lowercase_success", true, result.success());
+ });
+
+ // Char Classes - Uppercase Fail
+ t.test("char_class_uppercase_fail", [&](testing & t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z"); });
+
+ common_peg_parse_context ctx;
+ common_peg_parse_result result;
+
+ ctx = common_peg_parse_context("A");
+ result = parser.parse(ctx);
+ t.assert_equal("char_class_uppercase_fail", true, result.fail());
+ });
+
+ // Char Classes with Dash - Lowercase Success
+ t.test("char_class_with_dash_lowercase", [&](testing & t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z-"); });
+
+ common_peg_parse_context ctx;
+ common_peg_parse_result result;
+
+ ctx = common_peg_parse_context("f");
+ result = parser.parse(ctx);
+ t.assert_equal("char_class_with_dash_lowercase", true, result.success());
+ });
+
+ // Char Classes with Dash - Literal Dash Success
+ t.test("char_class_with_dash_literal_dash", [&](testing & t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z-"); });
+
+ common_peg_parse_context ctx;
+ common_peg_parse_result result;
+
+ ctx = common_peg_parse_context("-");
+ result = parser.parse(ctx);
+ t.assert_equal("char_class_with_dash_literal_dash", true, result.success());
+ });
+
+ // Char Classes with Dash - Uppercase Fail
+ t.test("char_class_with_dash_uppercase_fail", [&](testing & t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z-"); });
+
+ common_peg_parse_context ctx;
+ common_peg_parse_result result;
+
+ ctx = common_peg_parse_context("A");
+ result = parser.parse(ctx);
+ t.assert_equal("char_class_with_dash_uppercase_fail", true, result.fail());
+ });
+
+ // Sequences - Partial Match 1
+ t.test("sequence_partial_match_1", [&](testing & t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("<think>") + p.literal("</think>"); });
+
+ auto ctx = common_peg_parse_context("<thi", true);
+ auto result = parser.parse(ctx);
+ t.assert_equal("sequence_partial_match_1", true, result.need_more_input());
+ });
+
+ // Sequences - Partial Match 2
+ t.test("sequence_partial_match_2", [&](testing & t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("begin") + p.literal("end"); });
+
+ auto ctx = common_peg_parse_context("begin", true);
+ auto result = parser.parse(ctx);
+ t.assert_equal("sequence_partial_match_2", true, result.need_more_input());
+ });
+
+ // Sequences - Partial Match 3
+ t.test("sequence_partial_match_3", [&](testing & t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("<think>") + p.literal("</think>"); });
+
+ auto ctx = common_peg_parse_context("<think></", true);
+ auto result = parser.parse(ctx);
+ t.assert_equal("sequence_partial_match_3", true, result.need_more_input());
+ });
+
+ // Sequences - Full Match
+ t.test("sequence_full_match", [&](testing & t) {
+ auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("hello") + p.literal("world"); });
+
+ auto ctx = common_peg_parse_context("helloworld", false);
+ auto result = common_chat_combinator_parser.parse(ctx);
+ t.assert_equal("sequence_full_match", true, result.success());
+ });
+
+ // Sequences - No Match (the text after <think> cannot begin </think>, so this fails even in partial mode)
+ t.test("sequence_no_match", [&](testing & t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("<think>") + p.literal("</think>"); });
+
+ auto ctx = common_peg_parse_context("<think>I am common_chat_combinator_parser", true);
+ auto result = parser.parse(ctx);
+ t.assert_equal("sequence_no_match", true, result.fail());
+ });
+
+ // Choices - Partial Match 1
+ t.test("choices_partial_match_1", [&](testing & t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("option1") | p.literal("option2"); });
+
+ auto ctx = common_peg_parse_context("opt", true);
+ auto result = parser.parse(ctx);
+ t.assert_equal("choices_partial_match_1", true, result.need_more_input());
+ });
+
+ // Choices - Partial Match 2
+ t.test("choices_partial_match_2", [&](testing & t) {
+ auto parser =
+ build_peg_parser([](common_peg_parser_builder & p) { return p.literal("choice_a") | p.literal("choice_b"); });
+
+ auto ctx = common_peg_parse_context("choice", true);
+ auto result = parser.parse(ctx);
+ t.assert_equal("choices_partial_match_2", true, result.need_more_input());
+ });
+
+ // Choices - Full Match 1
+ t.test("choices_full_match_1", [&](testing & t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("first") | p.literal("second"); });
+
+ auto ctx = common_peg_parse_context("first", false);
+ auto result = parser.parse(ctx);
+ t.assert_equal("choices_full_match_1", true, result.success());
+ });
+
+ // Choices - Full Match 2
+ t.test("choices_full_match_2", [&](testing & t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("alpha") | p.literal("beta"); });
+
+ auto ctx = common_peg_parse_context("beta", false);
+ auto result = parser.parse(ctx);
+ t.assert_equal("choices_full_match_2", true, result.success());
+ });
+
+ // Choices - No Match
+ t.test("choices_no_match", [&](testing & t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("good") | p.literal("better"); });
+
+ auto ctx = common_peg_parse_context("best", false);
+ auto result = parser.parse(ctx);
+ t.assert_equal("choices_no_match", true, result.fail());
+ });
+
+ // Zero or More - Partial Match 1
+ t.test("zero_or_more_partial_match_1", [&](testing & t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.zero_or_more(p.literal("ab")); });
+
+ auto ctx = common_peg_parse_context("a", true);
+ auto result = parser.parse(ctx);
+ t.assert_equal("zero_or_more_partial_match_1", true, result.need_more_input());
+ });
+
+ // Zero or More - Partial Match 2
+ t.test("zero_or_more_partial_match_2", [&](testing & t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.zero_or_more(p.literal("xy")); });
+
+ auto ctx = common_peg_parse_context("xyx", true);
+ auto result = parser.parse(ctx);
+ t.assert_equal("zero_or_more_partial_match_2", true, result.need_more_input());
+ });
+
+ // Zero or More - Full Match
+ t.test("zero_or_more_full_match", [&](testing & t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.zero_or_more(p.literal("test")); });
+
+ auto ctx = common_peg_parse_context("test", false);
+ auto result = parser.parse(ctx);
+ t.assert_equal("zero_or_more_full_match", true, result.success());
+ });
+
+ // One or More - Partial Match 1
+ t.test("one_or_more_partial_match_1", [&](testing & t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("repeat")); });
+
+ auto ctx = common_peg_parse_context("rep", true);
+ auto result = parser.parse(ctx);
+ t.assert_equal("one_or_more_partial_match_1", true, result.need_more_input());
+ });
+
+ // One or More - Partial Match 2
+ t.test("one_or_more_partial_match_2", [&](testing & t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("ab")); });
+
+ auto ctx = common_peg_parse_context("aba", true);
+ auto result = parser.parse(ctx);
+ t.assert_equal("one_or_more_partial_match_2", true, result.need_more_input());
+ });
+
+ // One or More - Full Match
+ t.test("one_or_more_full_match", [&](testing & t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("single")); });
+
+ auto ctx = common_peg_parse_context("single", false);
+ auto result = parser.parse(ctx);
+ t.assert_equal("one_or_more_full_match", true, result.success());
+ });
+
+ // One or More - No Match (the literal "()" never occurs in the input, so not even one repetition matches)
+ t.test("one_or_more_no_match", [&](testing & t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("()")); });
+
+ auto ctx = common_peg_parse_context("success", false);
+ auto result = parser.parse(ctx);
+ t.assert_equal("one_or_more_no_match", true, result.fail());
+ });
+ });
+
+ // Rules refer to each other by name: "list" references "value" via p.ref() before "value" is defined
+ t.test("recursive rules", [](testing &t) {
+ // Test simple number
+ t.test("simple_number", [](testing &t) {
+ auto value_parser = build_peg_parser([](common_peg_parser_builder & p) {
+ p.rule("number", p.chars("0-9"));
+ p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+ return p.rule("value", p.ref("number") | p.ref("list"));
+ });
+
+ common_peg_parse_context ctx("1", false);
+ auto result = value_parser.parse(ctx);
+
+ t.assert_equal("result_is_success", true, result.success());
+ });
+
+ // Test simple list
+ t.test("simple_list", [](testing &t) {
+ auto value_parser = build_peg_parser([](common_peg_parser_builder & p) {
+ p.rule("number", p.chars("0-9"));
+ p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+ return p.rule("value", p.ref("number") | p.ref("list"));
+ });
+
+ common_peg_parse_context ctx("[1]", false);
+ auto result = value_parser.parse(ctx);
+
+ t.assert_equal("result_is_success", true, result.success());
+ });
+
+ // Test nested list
+ t.test("nested_list", [](testing &t) {
+ auto value_parser = build_peg_parser([](common_peg_parser_builder & p) {
+ p.rule("number", p.chars("0-9"));
+ p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+ return p.rule("value", p.ref("number") | p.ref("list"));
+ });
+
+ common_peg_parse_context ctx("[[2]]", false);
+ auto result = value_parser.parse(ctx);
+
+ t.assert_equal("result_is_success", true, result.success());
+ });
+
+ // Test deeply nested list
+ t.test("deeply_nested_list", [](testing &t) {
+ auto value_parser = build_peg_parser([](common_peg_parser_builder & p) {
+ p.rule("number", p.chars("0-9"));
+ p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+ return p.rule("value", p.ref("number") | p.ref("list"));
+ });
+
+ common_peg_parse_context ctx("[[[3]]]", false);
+ auto result = value_parser.parse(ctx);
+
+ t.assert_equal("result_is_success", true, result.success());
+ });
+
+ // Test need_more_input match
+ t.test("need_more_input_match", [](testing &t) {
+ auto value_parser = build_peg_parser([](common_peg_parser_builder & p) {
+ p.rule("number", p.chars("0-9"));
+ p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+ return p.rule("value", p.ref("number") | p.ref("list"));
+ });
+
+ common_peg_parse_context ctx("[[", true);
+ auto result = value_parser.parse(ctx);
+
+ t.assert_equal("result_is_need_more_input", true, result.need_more_input());
+ });
+
+ // Test no match
+ t.test("no_match", [](testing &t) {
+ auto value_parser = build_peg_parser([](common_peg_parser_builder & p) {
+ p.rule("number", p.chars("0-9"));
+ p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+ return p.rule("value", p.ref("number") | p.ref("list"));
+ });
+
+ common_peg_parse_context ctx("[a]", false);
+ auto result = value_parser.parse(ctx);
+
+ t.assert_equal("result_is_fail", true, result.fail());
+ });
+ });
+}
--- /dev/null
+#include "tests.h"
+
+#include "json-schema-to-grammar.h"
+
+#include <regex>
+
+// Removes the whitespace run at the very start of `s` and the run following
+// each newline, keeping the newline itself. Because `\s` also matches
+// newlines, blank lines directly after a newline are consumed as well.
+static std::string trim_leading_space(const std::string & s) {
+    // Group 1 captures the line boundary (start of text or a newline) so the
+    // replacement can put it back after the whitespace is dropped.
+    static const std::regex line_start_ws(R"((^|\n)\s+)");
+    std::string trimmed = std::regex_replace(s, line_start_ws, "$1");
+    return trimmed;
+}
+
+// Compares two GBNF grammars after passing both through trim_leading_space(),
+// so expected text can be written as an indented raw string.
+static void assert_gbnf_equal(testing & t, const std::string & expected, const std::string & actual) {
+    const std::string expected_trimmed = trim_leading_space(expected);
+    const std::string actual_trimmed   = trim_leading_space(actual);
+    t.assert_equal("gbnf are equal", expected_trimmed, actual_trimmed);
+}
+// Checks the GBNF text emitted via parser.build_grammar() for each PEG construct; assert_gbnf_equal trims leading whitespace on both sides before comparing.
+void test_gbnf_generation(testing &t) {
+ t.test("literal grammar generation", [](testing &t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+ return p.literal("hello");
+ });
+
+ auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+ parser.build_grammar(builder);
+ });
+
+ assert_gbnf_equal(t, R"""(
+ root ::= "hello"
+ space ::= | " " | "\n"{1,2} [ \t]{0,20}
+ )""", gbnf);
+ });
+
+ t.test("char class grammar", [](testing &t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+ return p.chars("[a-z]", 1, 1);
+ });
+
+ auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+ parser.build_grammar(builder);
+ });
+
+ assert_gbnf_equal(t, R"""(
+ root ::= [a-z]
+ space ::= | " " | "\n"{1,2} [ \t]{0,20}
+ )""", gbnf);
+ });
+
+ t.test("sequence grammar", [](testing &t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+ return p.literal("hello") + p.literal(" ") + p.literal("world");
+ });
+
+ auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+ parser.build_grammar(builder);
+ });
+
+ assert_gbnf_equal(t, R"""(
+ root ::= "hello" " " "world"
+ space ::= | " " | "\n"{1,2} [ \t]{0,20}
+ )""", gbnf);
+ });
+
+ t.test("choice grammar", [](testing &t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+ return p.literal("cat") | p.literal("dog");
+ });
+
+ auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+ parser.build_grammar(builder);
+ });
+
+ assert_gbnf_equal(t, R"""(
+ root ::= "cat" | "dog"
+ space ::= | " " | "\n"{1,2} [ \t]{0,20}
+ )""", gbnf);
+ });
+
+ t.test("one_or_more grammar", [](testing &t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+ return p.one_or_more(p.literal("a"));
+ });
+
+ auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+ parser.build_grammar(builder);
+ });
+
+ assert_gbnf_equal(t, R"""(
+ root ::= "a"+
+ space ::= | " " | "\n"{1,2} [ \t]{0,20}
+ )""", gbnf);
+ });
+
+ t.test("zero_or_more grammar", [](testing &t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+ return p.zero_or_more(p.literal("a"));
+ });
+
+ auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+ parser.build_grammar(builder);
+ });
+
+ assert_gbnf_equal(t, R"""(
+ root ::= "a"*
+ space ::= | " " | "\n"{1,2} [ \t]{0,20}
+ )""", gbnf);
+ });
+
+ t.test("optional grammar", [](testing &t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+ return p.literal("hello") + p.optional(p.literal(" world"));
+ });
+
+ auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+ parser.build_grammar(builder);
+ });
+
+ assert_gbnf_equal(t, R"""(
+ root ::= "hello" " world"?
+ space ::= | " " | "\n"{1,2} [ \t]{0,20}
+ )""", gbnf);
+ });
+
+ t.test("until grammar", [](testing &t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+ return p.until("</tag>");
+ });
+
+ auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+ parser.build_grammar(builder);
+ });
+
+ assert_gbnf_equal(t, R"""(
+ root ::= ([^<] | "<" [^/] | "</" [^t] | "</t" [^a] | "</ta" [^g] | "</tag" [^>])*
+ space ::= | " " | "\n"{1,2} [ \t]{0,20}
+ )""", gbnf);
+ });
+
+ t.test("complex expressions with parentheses", [](testing &t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+ return p.one_or_more(p.literal("a") | p.literal("b"));
+ });
+
+ auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+ parser.build_grammar(builder);
+ });
+
+ assert_gbnf_equal(t, R"""(
+ root ::= ("a" | "b")+
+ space ::= | " " | "\n"{1,2} [ \t]{0,20}
+ )""", gbnf);
+ });
+
+ t.test("rule references", [](testing &t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+ auto digit = p.rule("digit", p.chars("[0-9]", 1, 1));
+ return p.one_or_more(digit);
+ });
+
+ auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+ parser.build_grammar(builder);
+ });
+
+ assert_gbnf_equal(t, R"""(
+ digit ::= [0-9]
+ root ::= digit+
+ space ::= | " " | "\n"{1,2} [ \t]{0,20}
+ )""", gbnf);
+ });
+
+ t.test("escaping in literals", [](testing &t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+ return p.literal("hello\nworld\n!");
+ });
+
+ auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+ parser.build_grammar(builder);
+ });
+
+ assert_gbnf_equal(t, R"""(
+ root ::= "hello\nworld\n!"
+ space ::= | " " | "\n"{1,2} [ \t]{0,20}
+ )""", gbnf);
+ });
+
+ t.test("operator<< (whitespace insertion)", [](testing &t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+ return p.literal("hello") << p.literal("world");
+ });
+
+ auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+ parser.build_grammar(builder);
+ });
+
+ assert_gbnf_equal(t, R"""(
+ root ::= "hello" space "world"
+ space ::= | " " | "\n"{1,2} [ \t]{0,20}
+ )""", gbnf);
+ });
+
+ t.test("emit only reachable rules", [](testing &t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+ p.rule("orphan", p.literal("orphan"));
+ return p.literal("hello") + p.rule("child", p.literal(" world"));
+ });
+
+ auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+ parser.build_grammar(builder);
+ });
+
+ assert_gbnf_equal(t, R"""(
+ child ::= " world"
+ root ::= "hello" child
+ space ::= | " " | "\n"{1,2} [ \t]{0,20}
+ )""", gbnf);
+ });
+
+ t.test("emit only trigger rules (and references)", [](testing &t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+ auto rule1 = p.rule("rule-1", p.literal("a") + p.ref("rule-2"));
+ p.rule("rule-2", p.literal("b") + p.ref("rule-3"), true);
+ p.rule("rule-3", p.literal("c") + p.ref("rule-4"));
+ p.rule("rule-4", p.literal("d"), true);
+ return rule1;
+ });
+
+ auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+ parser.build_grammar(builder);
+ });
+
+ assert_gbnf_equal(t, R"""(
+ root ::= rule-1
+ rule-1 ::= "a" rule-2
+ rule-2 ::= "b" rule-3
+ rule-3 ::= "c" rule-4
+ rule-4 ::= "d"
+ space ::= | " " | "\n"{1,2} [ \t]{0,20}
+ )""", gbnf);
+ // Second build passes `true` to build_grammar: root is expected to become rule-2 | rule-4 (the rules registered with a trailing `true`) and rule-1 to be dropped
+ auto gbnf_lazy = build_grammar([&](const common_grammar_builder & builder) {
+ parser.build_grammar(builder, true);
+ });
+
+ assert_gbnf_equal(t, R"""(
+ root ::= rule-2 | rule-4
+ rule-2 ::= "b" rule-3
+ rule-3 ::= "c" rule-4
+ rule-4 ::= "d"
+ space ::= | " " | "\n"{1,2} [ \t]{0,20}
+ )""", gbnf_lazy);
+ });
+}
--- /dev/null
+#include "tests.h"
+// Exercises the builder's json() and json_member() parsers on complete, truncated (partial-mode), and non-matching input.
+void test_json_parser(testing &t) {
+ // Test parsing a simple JSON object
+ t.test("simple JSON object parsing", [](testing &t) {
+ auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); });
+
+ std::string input = R"({"name": "test", "value": 42, "flag": true})";
+ common_peg_parse_context ctx(input);
+
+ auto result = json.parse(ctx);
+
+ t.assert_equal("result_is_success", true, result.success());
+ t.assert_equal("result_end", input.size(), result.end);
+ });
+
+ // Test parsing a JSON array with mixed types
+ t.test("JSON array with mixed types", [](testing &t) {
+ auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); });
+
+ std::string input = R"([1, "hello", true, null, 3.14])";
+ common_peg_parse_context ctx(input);
+
+ auto result = json.parse(ctx);
+
+ t.assert_equal("result_is_success", true, result.success());
+ t.assert_equal("result_end", input.size(), result.end);
+ });
+
+ // Test parsing nested JSON with objects and arrays
+ t.test("nested JSON with objects and arrays", [](testing &t) {
+ auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); });
+
+ std::string input =
+ R"({"users": [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}], "count": 2, "metadata": {"version": "1.0", "tags": ["admin", "user"]}})";
+ common_peg_parse_context ctx(input);
+
+ auto result = json.parse(ctx);
+
+ t.assert_equal("result_is_success", true, result.success());
+ t.assert_equal("result_end", input.size(), result.end);
+ });
+
+ // Test need_more_input() parsing - incomplete object
+ t.test("need_more_input() parsing - incomplete object", [](testing &t) {
+ auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); });
+
+ std::string input = R"({"name": "test", "value": )";
+ common_peg_parse_context ctx(input, true);
+
+ auto result = json.parse(ctx);
+
+ t.assert_equal("result_is_need_more_input", true, result.need_more_input());
+ });
+
+ // Test need_more_input() parsing - incomplete array
+ t.test("need_more_input() parsing - incomplete array", [](testing &t) {
+ auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); });
+
+ std::string input = R"([1, 2, 3, )";
+ common_peg_parse_context ctx(input, true);
+
+ auto result = json.parse(ctx);
+
+ t.assert_equal("result_is_need_more_input", true, result.need_more_input());
+ });
+
+ // Test need_more_input() parsing - incomplete nested structure
+ t.test("need_more_input() parsing - incomplete nested structure", [](testing &t) {
+ auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); });
+
+ std::string input = R"({"data": {"nested": )";
+ common_peg_parse_context ctx(input, true);
+
+ auto result = json.parse(ctx);
+
+ t.assert_equal("result_is_need_more_input", true, result.need_more_input());
+ });
+ // json_member("name", value): expected to match a "name": <value> pair, as the three inputs below show; one parser is shared by all three sub-tests
+ t.test("object member", [](testing &t) {
+ auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+ return p.json_member("name", "\"" + p.chars("[a-z]") + "\"");
+ });
+
+ t.test("success", [&](testing &t) {
+ std::string input = R"("name": "bob")";
+ common_peg_parse_context ctx(input, false);
+
+ auto result = parser.parse(ctx);
+ t.assert_true("success", result.success());
+ });
+
+ t.test("partial", [&](testing &t) {
+ std::string input = R"("name": "bo)";
+ common_peg_parse_context ctx(input, true);
+
+ auto result = parser.parse(ctx);
+ t.assert_true("need more input", result.need_more_input());
+ });
+
+ t.test("failed", [&](testing &t) {
+ std::string input = R"([])";
+ common_peg_parse_context ctx(input, false);
+
+ auto result = parser.parse(ctx);
+ t.assert_true("fail", result.fail());
+ });
+ });
+}
--- /dev/null
+#include "tests.h"
+
+// Round-trips a parser through its JSON form and checks that the rebuilt
+// parser behaves like the original one on the same input. Also benchmarks
+// the deserialization step.
+void test_json_serialization(testing &t) {
+    auto original = build_peg_parser([](common_peg_parser_builder & p) {
+        return "<tool_call>" + p.json() + "</tool_call>";
+    });
+
+    auto json_serialized = original.to_json().dump();
+
+    t.test("compare before/after", [&](testing &t) {
+        auto deserialized = common_peg_arena::from_json(nlohmann::json::parse(json_serialized));
+
+        // Test complex JSON. The grammar starts and ends with the <tool_call>
+        // literals, so the input has to carry them for the success path to be
+        // exercised at all.
+        std::string input =
+            "<tool_call>"
+            R"({"name": "test", "values": [1, 2, 3], "nested": {"a": true}})"
+            "</tool_call>";
+        common_peg_parse_context ctx1(input);
+        common_peg_parse_context ctx2(input);
+
+        auto result1 = original.parse(ctx1);
+        auto result2 = deserialized.parse(ctx2);
+
+        // Pin the original to success so that the two comparisons below
+        // cannot pass merely because both parsers rejected the input.
+        t.assert_true("original_succeeds", result1.success());
+        t.assert_equal("both_succeed", result1.success(), result2.success());
+        t.assert_equal("same_end_pos", result1.end, result2.end);
+    });
+
+    t.bench("deserialize", [&]() {
+        auto deserialized = common_peg_arena::from_json(nlohmann::json::parse(json_serialized));
+    }, 100);
+}
--- /dev/null
+#include "tests.h"
+
+#include "peg-parser.h"
+
+#include <string>
+#include <sstream>
+#include <iomanip>
+#include <cctype>
+
+// Asserts that two parse result types are the same, reporting them by name.
+// The names are copied into std::string first so that the `==` used by
+// testing::assert_equal compares text by value, whatever string type
+// common_peg_parse_result_type_name() returns.
+static void assert_result_equal(testing & t, common_peg_parse_result_type expected, common_peg_parse_result_type actual) {
+    const std::string expected_name(common_peg_parse_result_type_name(expected));
+    const std::string actual_name(common_peg_parse_result_type_name(actual));
+    t.assert_equal(expected_name, actual_name);
+}
+
+// Renders a byte string for display in test names: printable characters are
+// emitted unchanged, every other byte as a zero-padded two-digit hex escape
+// of the form \xNN.
+static std::string hex_dump(const std::string& str) {
+    std::ostringstream rendered;
+    // Base and fill are sticky; they only influence the integer insertions
+    // below, character insertions are unaffected.
+    rendered << std::hex << std::setfill('0');
+    for (const unsigned char byte : str) {
+        if (!std::isprint(byte)) {
+            rendered << "\\x" << std::setw(2) << static_cast<int>(byte);
+            continue;
+        }
+        rendered << byte;
+    }
+    return rendered.str();
+}
+
+namespace {
+
+// One table-driven case: the raw input bytes, the text the parser is expected
+// to have matched, and the expected result type.
+struct unicode_test_case {
+    std::string input;
+    std::string expected_text;
+    common_peg_parse_result_type expected_result;
+};
+
+// How run_unicode_cases() parses each input and when it compares the matched
+// text against unicode_test_case::expected_text.
+struct unicode_check {
+    bool   is_partial;         // second argument of common_peg_parse_context
+    bool   text_on_success;    // compare matched text when the parse succeeds
+    bool   text_on_need_more;  // compare matched text when more input is needed
+    size_t trailing;           // bytes dropped from the end of the match before comparing
+};
+
+// Partial input; matched text is checked on success and on need-more-input.
+const unicode_check check_partial_text        = { true,  true,  true,  0 };
+// Partial input; matched text is checked only on need-more-input.
+const unicode_check check_partial_need_more   = { true,  false, true,  0 };
+// Complete input; matched text is checked on success.
+const unicode_check check_complete_text       = { false, true,  false, 0 };
+// Complete input; matched text is checked on success, minus the closing quote.
+const unicode_check check_complete_text_quote = { false, true,  false, 1 };
+// Complete input; only the result type is checked.
+const unicode_check check_complete_type_only  = { false, false, false, 0 };
+
+// Runs one sub-test per case, named "case <index>: <hex dump of the input>".
+// Each sub-test parses the input with `parser`, asserts the result type and,
+// when `check` asks for it, compares the matched span
+// [result.start, result.end - check.trailing) with the expected text.
+template <typename Parser>
+void run_unicode_cases(testing & t, const Parser & parser, const std::vector<unicode_test_case> & cases, const unicode_check & check) {
+    for (size_t i = 0; i < cases.size(); i++) {
+        const auto & tc = cases[i];
+        std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+        t.test(test_name, [&](testing &t) {
+            common_peg_parse_context ctx(tc.input, check.is_partial);
+            auto result = parser.parse(ctx);
+
+            // Assert result type matches
+            assert_result_equal(t, tc.expected_result, result.type);
+
+            // Assert matched text for the outcomes this group cares about
+            const bool compare_text =
+                (check.text_on_success   && result.success()) ||
+                (check.text_on_need_more && result.need_more_input());
+            if (compare_text) {
+                std::string matched = tc.input.substr(result.start, result.end - result.start - check.trailing);
+                t.assert_equal(tc.expected_text, matched);
+            }
+        });
+    }
+}
+
+} // namespace
+
+// UTF-8 handling tests for the PEG parser primitives: any(), chars() with
+// unicode ranges, until() and json_string_content(). Every group is a table
+// of inputs fed through run_unicode_cases().
+void test_unicode(testing &t) {
+    t.test("any", [](testing &t) {
+        const std::vector<unicode_test_case> test_cases {
+            // Valid UTF-8 sequences
+            {"Hello", "Hello", COMMON_PEG_PARSE_RESULT_SUCCESS},
+            {std::string("Caf\xC3\xA9"), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+            {std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+            {std::string("\xF0\x9F\x9A\x80"), std::string("\xF0\x9F\x9A\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+            // Incomplete UTF-8 sequences (partial bytes at end)
+            {std::string("Caf\xC3"), "Caf", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+            {std::string("\xE4\xBD"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+            {std::string("\xF0\x9F\x9A"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+            // Invalid/malformed UTF-8 sequences
+            {std::string("\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+            {std::string("Hello\x80World"), "Hello", COMMON_PEG_PARSE_RESULT_FAIL},
+            {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+        };
+
+        auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+            return p.sequence({p.one_or_more(p.any()), p.end()});
+        });
+
+        run_unicode_cases(t, parser, test_cases, check_partial_text);
+    });
+
+    t.test("char classes", [](testing &t) {
+        t.test("unicode range U+4E00-U+9FFF (CJK)", [](testing &t) {
+            const std::vector<unicode_test_case> test_cases {
+                // Within range - CJK Unified Ideographs
+                {std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00
+                {std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60
+                {std::string("\xE5\xA5\xBD"), std::string("\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+597D
+                {std::string("\xE9\xBF\xBF"), std::string("\xE9\xBF\xBF"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+9FFF
+
+                // Outside range - should fail
+                {"a", "", COMMON_PEG_PARSE_RESULT_FAIL}, // ASCII
+                {std::string("\xE4\xB7\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+4DFF (before range)
+                {std::string("\xEA\x80\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+A000 (after range)
+
+                // Incomplete sequences in range
+                {std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete U+4E00
+                {std::string("\xE5\xA5"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete U+597D
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.sequence({p.chars(R"([\u4E00-\u9FFF])"), p.end()});
+            });
+
+            run_unicode_cases(t, parser, test_cases, check_partial_text);
+        });
+
+        t.test("unicode range U+1F600-U+1F64F (emoticons)", [](testing &t) {
+            const std::vector<unicode_test_case> test_cases {
+                // Within range - Emoticons (all 4-byte UTF-8)
+                {std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600
+                {std::string("\xF0\x9F\x98\x81"), std::string("\xF0\x9F\x98\x81"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F601
+                {std::string("\xF0\x9F\x99\x8F"), std::string("\xF0\x9F\x99\x8F"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F64F
+
+                // Outside range
+                {std::string("\xF0\x9F\x97\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F5FF (before range)
+                {std::string("\xF0\x9F\x99\x90"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F650 (after range)
+                {std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680 (outside range)
+
+                // Incomplete sequences
+                {std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete emoji
+                {std::string("\xF0\x9F"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Very incomplete
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.sequence({p.chars(R"([\U0001F600-\U0001F64F])"), p.end()});
+            });
+
+            run_unicode_cases(t, parser, test_cases, check_partial_text);
+        });
+
+        t.test("mixed unicode ranges", [](testing &t) {
+            const std::vector<unicode_test_case> test_cases {
+                // Match CJK
+                {std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00
+                {std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60
+
+                // Match emoticons
+                {std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600
+
+                // Match ASCII digits
+                {"5", "5", COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Don't match outside any range
+                {"a", "", COMMON_PEG_PARSE_RESULT_FAIL},
+                {std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680
+
+                // Incomplete
+                {std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+                {std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.sequence({p.chars(R"([\u4E00-\u9FFF\U0001F600-\U0001F64F0-9])"), p.end()});
+            });
+
+            run_unicode_cases(t, parser, test_cases, check_partial_text);
+        });
+    });
+
+    t.test("until parser", [](testing &t) {
+        t.test("ASCII delimiter with Unicode content", [](testing &t) {
+            const std::vector<unicode_test_case> test_cases {
+                // CJK characters before delimiter
+                {std::string("\xE4\xBD\xA0\xE5\xA5\xBD</tag>"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Emoji before delimiter
+                {std::string("\xF0\x9F\x98\x80</tag>"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Mixed content
+                {std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!</tag>"), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.until("</tag>");
+            });
+
+            run_unicode_cases(t, parser, test_cases, check_complete_text);
+        });
+
+        t.test("incomplete UTF-8 at end", [](testing &t) {
+            const std::vector<unicode_test_case> test_cases {
+                // Incomplete emoji at end, no delimiter
+                {std::string("content\xF0\x9F\x98"), std::string("content"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+                // Incomplete CJK at end, no delimiter
+                {std::string("hello\xE4\xB8"), std::string("hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+                // Complete content, no delimiter (should consume all valid UTF-8)
+                {std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.until("</tag>");
+            });
+
+            run_unicode_cases(t, parser, test_cases, check_partial_text);
+        });
+
+        t.test("malformed UTF-8", [](testing &t) {
+            const std::vector<unicode_test_case> test_cases {
+                // Invalid UTF-8 bytes
+                {std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+
+                // Continuation byte without lead byte
+                {std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+
+                // Invalid continuation byte
+                {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.until("</tag>");
+            });
+
+            run_unicode_cases(t, parser, test_cases, check_complete_type_only);
+        });
+    });
+
+    t.test("json_string parser", [](testing &t) {
+        t.test("valid UTF-8 characters", [](testing &t) {
+            const std::vector<unicode_test_case> test_cases {
+                // ASCII only
+                {"Hello World\"", "Hello World", COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // 2-byte UTF-8 (accented characters)
+                {std::string("Caf\xC3\xA9\""), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // 3-byte UTF-8 (CJK)
+                {std::string("\xE4\xBD\xA0\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // 4-byte UTF-8 (emoji)
+                {std::string("\xF0\x9F\x98\x80\""), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Mixed content
+                {std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!\""), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+            };
+
+            // The parser does not depend on the case, so it is built once per
+            // group instead of once per sub-test.
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.sequence({p.json_string_content(), p.literal("\"")});
+            });
+
+            run_unicode_cases(t, parser, test_cases, check_complete_text_quote);
+        });
+
+        t.test("incomplete UTF-8", [](testing &t) {
+            const std::vector<unicode_test_case> test_cases {
+                // Incomplete 2-byte sequence
+                {std::string("Caf\xC3"), std::string("Caf"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+                // Incomplete 3-byte sequence
+                {std::string("Hello\xE4\xB8"), std::string("Hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+                // Incomplete 4-byte sequence
+                {std::string("Text\xF0\x9F\x98"), std::string("Text"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+                // Incomplete at very start
+                {std::string("\xE4\xBD"), std::string(""), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.json_string_content();
+            });
+
+            run_unicode_cases(t, parser, test_cases, check_partial_need_more);
+        });
+
+        t.test("malformed UTF-8", [](testing &t) {
+            const std::vector<unicode_test_case> test_cases {
+                // Invalid UTF-8 bytes
+                {std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+
+                // Continuation byte without lead byte
+                {std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+
+                // Invalid continuation byte
+                {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+
+                // Overlong encoding (security issue)
+                {std::string("\xC0\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.json_string_content();
+            });
+
+            run_unicode_cases(t, parser, test_cases, check_complete_type_only);
+        });
+
+        t.test("escape sequences with UTF-8", [](testing &t) {
+            const std::vector<unicode_test_case> test_cases {
+                // Unicode escape sequence
+                {"Hello\\u0041\"", "Hello\\u0041", COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Mix of UTF-8 and escape sequences
+                {std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Escaped quote in UTF-8 string
+                {std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.sequence({p.json_string_content(), p.literal("\"")});
+            });
+
+            run_unicode_cases(t, parser, test_cases, check_complete_text_quote);
+        });
+    });
+}
--- /dev/null
+#pragma once
+
+#include "common.h"
+
+#include <chrono>
+#include <exception>
+#include <iostream>
+#include <string>
+#include <regex>
+#include <vector>
+
+// Minimal hierarchical test harness. A test is identified by the stack of
+// enclosing test names; progress lines, assertion failures and the final
+// summary are written to `out`.
+struct testing {
+    std::ostream &out;
+    std::vector<std::string> stack;   // names of the tests currently running, outermost first
+    std::regex filter;
+    bool filter_tests = false;        // set by set_filter()
+    bool throw_exception = false;     // rethrow exceptions after they have been recorded
+    bool verbose = false;             // enables log()
+    int tests = 0;
+    int assertions = 0;
+    int failures = 0;
+    int unnamed = 0;                  // counter used to name anonymous tests and benches
+    int exceptions = 0;
+
+    // Column at which the [PASS]/[FAIL] marker is aligned.
+    static constexpr std::size_t status_column = 80;
+
+    explicit testing(std::ostream &os = std::cout) : out(os) {}
+
+    // Two spaces per nesting level below the outermost test.
+    std::string indent() const {
+        if (stack.empty()) {
+            return "";
+        }
+        return std::string((stack.size() - 1) * 2, ' ');
+    }
+
+    // Names on the stack joined with '.'.
+    std::string full_name() const {
+        return string_join(stack, ".");
+    }
+
+    // Writes `msg` only when verbose output is enabled.
+    void log(const std::string & msg) {
+        if (verbose) {
+            out << indent() << "  " << msg << "\n";
+        }
+    }
+
+    // Installs a filter regex; std::regex's constructor throws
+    // std::regex_error when `re` is not a valid pattern.
+    void set_filter(const std::string & re) {
+        filter = std::regex(re);
+        filter_tests = true;
+    }
+
+    // True when the test on top of the stack should run. The filter is
+    // applied with std::regex_match, i.e. it must match the whole dot-joined
+    // name, and it is consulted at every nesting level by test()/bench().
+    bool should_run() const {
+        if (filter_tests) {
+            if (!std::regex_match(full_name(), filter)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    // Calls f() and records any exception as a failure; the exception is
+    // rethrown only when throw_exception is set.
+    template <typename F>
+    void run_with_exceptions(F &&f, const char *ctx) {
+        try {
+            f();
+        } catch (const std::exception &e) {
+            ++failures;
+            ++exceptions;
+            out << indent() << "UNHANDLED EXCEPTION (" << ctx << "): " << e.what() << "\n";
+            if (throw_exception) {
+                throw;
+            }
+        } catch (...) {
+            ++failures;
+            ++exceptions;
+            out << indent() << "UNHANDLED EXCEPTION (" << ctx << "): unknown\n";
+            if (throw_exception) {
+                throw;
+            }
+        }
+    }
+
+    // Prints "<label> (<details>)" followed by [PASS] or [FAIL], padded so
+    // the marker starts at status_column whenever the line is short enough.
+    void print_result(const std::string &label, int new_failures, int new_assertions, const std::string &extra = "") const {
+        std::string line = indent() + label;
+
+        std::string details;
+        if (new_assertions > 0) {
+            if (new_failures == 0) {
+                details = std::to_string(new_assertions) + " assertion(s)";
+            } else {
+                details = std::to_string(new_failures) + " of " +
+                          std::to_string(new_assertions) + " assertion(s) failed";
+            }
+        }
+        if (!extra.empty()) {
+            if (!details.empty()) {
+                details += ", ";
+            }
+            details += extra;
+        }
+
+        if (!details.empty()) {
+            line += " (" + details + ")";
+        }
+
+        std::string status = (new_failures == 0) ? "[PASS]" : "[FAIL]";
+
+        if (line.size() + 1 < status_column) {
+            line.append(status_column - line.size(), ' ');
+        } else {
+            line.push_back(' ');
+        }
+
+        out << line << status << "\n";
+    }
+
+    // Runs f(*this) as a named test nested under the current stack and
+    // prints a result line for the assertions made while it ran.
+    template <typename F>
+    void test(const std::string &name, F f) {
+        stack.push_back(name);
+        if (!should_run()) {
+            stack.pop_back();
+            return;
+        }
+
+        ++tests;
+        out << indent() << name << "\n";
+
+        int before_failures = failures;
+        int before_assertions = assertions;
+
+        run_with_exceptions([&] { f(*this); }, "test");
+
+        int new_failures = failures - before_failures;
+        int new_assertions = assertions - before_assertions;
+
+        print_result(name, new_failures, new_assertions);
+
+        stack.pop_back();
+    }
+
+    // Same as test(name, f) with a generated name "test #<n>".
+    template <typename F>
+    void test(F f) {
+        test("test #" + std::to_string(++unnamed), f);
+    }
+
+    // Times `iterations` calls of f() and prints the average duration in
+    // microseconds together with the resulting call rate.
+    template <typename F>
+    void bench(const std::string &name, F f, int iterations = 100) {
+        stack.push_back(name);
+        if (!should_run()) {
+            stack.pop_back();
+            return;
+        }
+
+        ++tests;
+        out << indent() << "[bench] " << name << "\n";
+
+        int before_failures = failures;
+        int before_assertions = assertions;
+
+        // steady_clock is monotonic, which is what interval timing needs.
+        using clock = std::chrono::steady_clock;
+
+        // Accumulate at the clock's native resolution: truncating every
+        // sample to microseconds before summing would report zero for
+        // bodies that take less than a microsecond.
+        clock::duration total = clock::duration::zero();
+
+        run_with_exceptions([&] {
+            for (int i = 0; i < iterations; i++) {
+                auto start = clock::now();
+                f();
+                total += clock::now() - start;
+            }
+        }, "bench");
+
+        // A non-positive iteration count runs nothing; divide by 1 rather
+        // than by zero so the averages are simply reported as 0.
+        const int divisor = iterations > 0 ? iterations : 1;
+
+        auto avg_elapsed = std::chrono::duration_cast<std::chrono::microseconds>(total).count() / divisor;
+        auto avg_elapsed_s = std::chrono::duration_cast<std::chrono::duration<double>>(total).count() / divisor;
+        auto rate = (avg_elapsed_s > 0.0) ? (1.0 / avg_elapsed_s) : 0.0;
+
+        int new_failures = failures - before_failures;
+        int new_assertions = assertions - before_assertions;
+
+        // long long keeps the conversion in range for very fast bodies,
+        // whose rate can exceed what an int can hold.
+        std::string extra =
+            "n=" + std::to_string(iterations) +
+            " avg=" + std::to_string(avg_elapsed) + "us" +
+            " rate=" + std::to_string(static_cast<long long>(rate)) + "/s";
+
+        print_result("[bench] " + name, new_failures, new_assertions, extra);
+
+        stack.pop_back();
+    }
+
+    // Same as bench(name, f, iterations) with a generated name "bench #<n>".
+    template <typename F>
+    void bench(F f, int iterations = 100) {
+        bench("bench #" + std::to_string(++unnamed), f, iterations);
+    }
+
+    // Assertions
+    // Each one counts as an assertion, records and prints a failure when it
+    // does not hold, and returns whether it held.
+    bool assert_true(bool cond) {
+        return assert_true("", cond);
+    }
+
+    bool assert_true(const std::string &msg, bool cond) {
+        ++assertions;
+        if (!cond) {
+            ++failures;
+            out << indent() << "ASSERT TRUE FAILED";
+            if (!msg.empty()) {
+                out << " : " << msg;
+            }
+            out << "\n";
+            return false;
+        }
+        return true;
+    }
+
+    template <typename A, typename B>
+    bool assert_equal(const A &expected, const B &actual) {
+        return assert_equal("", expected, actual);
+    }
+
+    // Compares with `actual == expected`; both values must be streamable
+    // because they are printed when the comparison fails.
+    template <typename A, typename B>
+    bool assert_equal(const std::string &msg, const A &expected, const B &actual) {
+        ++assertions;
+        if (!(actual == expected)) {
+            ++failures;
+            out << indent() << "ASSERT EQUAL FAILED";
+            if (!msg.empty()) {
+                out << " : " << msg;
+            }
+            out << "\n";
+
+            out << indent() << "  expected: " << expected << "\n";
+            out << indent() << "  actual  : " << actual << "\n";
+            return false;
+        }
+        return true;
+    }
+
+    // Print summary and return an exit code (0 when nothing failed, else 1)
+    int summary() const {
+        out << "\n";
+        out << "tests      : " << tests << "\n";
+        out << "assertions : " << assertions << "\n";
+        out << "failures   : " << failures << "\n";
+        out << "exceptions : " << exceptions << "\n";
+        return failures == 0 ? 0 : 1;
+    }
+};
--- /dev/null
+#pragma once
+
+// Common includes for all test files
+#include <nlohmann/json.hpp>
+#include <string>
+#include <vector>
+
+#include "testing.h"
+#include "peg-parser.h"
+#include "chat-peg-parser.h"
+#include "simple-tokenize.h"
+
+struct bench_tool_call { // Plain record: an id, a name and JSON arguments. NOTE(review): presumably one tool call used by benchmarks — confirm against its users.
+ std::string id;
+ std::string name;
+ nlohmann::ordered_json args; // ordered_json, so key order is kept as inserted
+};
+
+// Test function declarations
+void test_basic(testing &t);
+void test_json_parser(testing &t);
+void test_gbnf_generation(testing &t);
+void test_unicode(testing &t);
+void test_json_serialization(testing &t);
--- /dev/null
+#include <string>
+#include <iostream>
+#include <numeric>
+
+#include "chat-parser.h"
+#include "chat-peg-parser.h"
+#include "chat.h"
+#include "common.h"
+#include "json-schema-to-grammar.h"
+#include "peg-parser.h"
+#include "peg-parser/testing.h"
+#include "peg-parser/simple-tokenize.h"
+#include "nlohmann/json.hpp"
+
+using json = nlohmann::ordered_json;
+
+static json create_tools();
+static void test_example_native(testing & t);
+static void test_example_qwen3_coder(testing & t);
+static void test_command7_parser_compare(testing & t);
+
+// Entry point of the chat PEG parser tests.
+// An optional first argument is a regex selecting the tests to run, and
+// LLAMA_TEST_VERBOSE=1 in the environment enables verbose logging.
+// Returns the exit code of testing::summary(), or 1 when the filter is not a
+// valid regular expression.
+int main(int argc, char *argv[]) {
+    testing t(std::cout);
+    if (argc >= 2) {
+        try {
+            t.set_filter(argv[1]);
+        } catch (const std::regex_error & e) {
+            // A malformed pattern is a usage error; report it instead of
+            // letting the exception terminate the process.
+            std::cerr << "invalid test filter '" << argv[1] << "': " << e.what() << "\n";
+            return 1;
+        }
+    }
+
+    const char * verbose = getenv("LLAMA_TEST_VERBOSE");
+    if (verbose) {
+        t.verbose = std::string(verbose) == "1";
+    }
+
+    t.test("native", test_example_native);
+    t.test("qwen3 coder", test_example_qwen3_coder);
+    t.test("comparison", test_command7_parser_compare);
+
+    return t.summary();
+}
+
+// Builds the JSON array of function-tool definitions used as test input:
+// get_current_weather, get_forecast and search_knowledge_base, in that order.
+static json create_tools() {
+    // Property schemas that both weather tools declare identically.
+    const json location_property = {
+        {"type", "string"},
+        {"description", "The city and state, e.g. San Francisco, CA"}
+    };
+    const json unit_property = {
+        {"type", "string"},
+        {"enum", {"celsius", "fahrenheit"}},
+        {"description", "The temperature unit to use. Infer this from the users location."}
+    };
+    const json weather_required = json::array({"location", "unit"});
+
+    const json weather_tool = {
+        {"type", "function"},
+        {"function", {
+            {"name", "get_current_weather"},
+            {"description", "Get the current weather in a given location"},
+            {"parameters", {
+                {"type", "object"},
+                {"properties", {
+                    {"location", location_property},
+                    {"unit", unit_property}
+                }},
+                {"required", weather_required},
+            }},
+        }}
+    };
+
+    const json forecast_tool = {
+        {"type", "function"},
+        {"function", {
+            {"name", "get_forecast"},
+            {"description", "Get the weather forecast for a given location"},
+            {"parameters", {
+                {"type", "object"},
+                {"properties", {
+                    {"location", location_property},
+                    {"unit", unit_property},
+                    {"days", {
+                        {"type", "integer"},
+                        {"description", "Number of days to forecast (1-10)"},
+                        {"minimum", 1},
+                        {"maximum", 10}
+                    }}
+                }},
+                {"required", weather_required},
+            }},
+        }}
+    };
+
+    // The search tool additionally forbids extra properties and sets "strict".
+    const json search_tool = {
+        {"type", "function"},
+        {"function", {
+            {"name", "search_knowledge_base"},
+            {"description", "Search the internal technical documentation knowledge base."},
+            {"parameters", {
+                {"type", "object"},
+                {"properties", {
+                    {"query", {
+                        {"type", "string"},
+                        {"description", "The search query string."}
+                    }},
+                    {"max_results", {
+                        {"type", "integer"},
+                        {"description", "The maximum number of results to return."},
+                        {"default", 5}
+                    }},
+                    {"category", {
+                        {"type", "string"},
+                        {"enum", {"api", "troubleshooting", "billing", "general"}},
+                        {"description", "Filter search by specific category."}
+                    }}
+                }},
+                {"required", {"query", "category"}},
+                {"additionalProperties", false}
+            }},
+            {"strict", true}
+        }}
+    };
+
+    return json::array({weather_tool, forecast_tool, search_tool});
+}
+
+struct tool_argument { // Describes one argument of a tool. NOTE(review): no use is visible in this part of the file — confirm where it is consumed.
+ std::string name;
+ std::string type; // presumably the JSON-schema "type" of the argument — TODO confirm
+ bool is_required;
+ json schema; // `json` is nlohmann::ordered_json (file-level alias)
+};
+
+struct tool_definition { // A tool name together with its arguments and a schema. NOTE(review): no use is visible in this part of the file — confirm where it is consumed.
+ std::string name;
+ std::vector<tool_argument> arguments;
+ json schema; // `json` is nlohmann::ordered_json (file-level alias)
+};
+
+// Test fictitious model output that emits arguments as JSON.
+static void test_example_native(testing & t) {
+ struct test_case {
+ // Parameters
+ std::string name;
+ json tools;
+ common_chat_tool_choice tool_choice;
+ common_reasoning_format reasoning_format;
+ json json_schema;
+ bool parallel_tool_calls;
+ bool thinking_forced_open;
+ std::string input;
+
+ // Expect
+ std::string expect_reasoning;
+ std::string expect_content;
+ std::vector<common_chat_tool_call> expect_tool_calls;
+ };
+
+ auto build_parser = [](const test_case & tc) {
+ return build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
+ auto reasoning_in_content = (tc.reasoning_format == COMMON_REASONING_FORMAT_NONE);
+ auto reasoning = p.eps();
+ if (tc.thinking_forced_open) {
+ // If thinking is forced open, expect a closing tag
+ reasoning = p.reasoning(p.until("</think>")) + "</think>" + p.space();
+ } else {
+ // Otherwise, optionally accept thinking wrapped in tags
+ reasoning = p.optional("<think>" + p.reasoning(p.until("</think>")) + "</think>" + p.space());
+ }
+
+ // tool calling parser
+ if (tc.tools.is_array() && !tc.tools.empty()) {
+ auto tools = p.choice();
+ for (const auto & tool : tc.tools) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ const auto & schema = function.at("parameters");
+
+ auto tool_name = p.json_member("name", "\"" + p.tool_name(p.literal(name)) + "\"");
+ auto tool_args = p.json_member("arguments", p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema)));
+
+ tools |= p.rule("tool-" + name, p.tool_open(p.literal("{")) << tool_name << "," << tool_args << "}");
+ };
+
+ auto parallel_calls = p.eps();
+ if (tc.parallel_tool_calls) {
+ parallel_calls = p.zero_or_more("," << tools);
+ }
+
+ auto tool_call = p.trigger_rule("tool-call",
+ p.sequence({
+ p.literal("<tool_call>["),
+ tools,
+ parallel_calls,
+ p.literal("]</tool_call>")
+ })
+ );
+
+ return p.sequence({
+ (reasoning_in_content ? p.eps() : reasoning),
+ p.content(p.until("<tool_call>")),
+ p.optional(p.space() + tool_call),
+ p.space(),
+ p.end()
+ });
+ }
+
+ // response_format parser
+ if (tc.json_schema.is_object() && !tc.json_schema.empty()) {
+ return p.sequence({
+ (reasoning_in_content ? p.eps() : reasoning),
+ p.content(p.schema(p.json(), "response-output", tc.json_schema)),
+ p.space(),
+ p.end()
+ });
+ }
+
+ // Content-only parser
+ return p.sequence({
+ (reasoning_in_content ? p.eps() : reasoning),
+ p.content(p.rest()),
+ p.end()
+ });
+ });
+ };
+
+ std::vector<test_case> test_cases = std::vector<test_case>{
+ {
+ /* .name = */ "content with thinking_forced_open = false",
+ /* .tools = */ {},
+ /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
+ /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
+ /* .json_schema = */ {},
+ /* .parallel_tool_calls = */ false,
+ /* .thinking_forced_open = */ false,
+ /* .input = */ (
+ "<think>The user said hello, I must say hello back</think>\nHello"
+ ),
+ /* .expect_reasoning = */ "The user said hello, I must say hello back",
+ /* .expect_content = */ "Hello",
+ /* .expect_tool_calls = */ {},
+ },
+ {
+ /* .name = */ "content with thinking_forced_open = false and no reasoning",
+ /* .tools = */ {},
+ /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
+ /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
+ /* .json_schema = */ {},
+ /* .parallel_tool_calls = */ false,
+ /* .thinking_forced_open = */ false,
+ /* .input = */ (
+ "Hello"
+ ),
+ /* .expect_reasoning = */ "",
+ /* .expect_content = */ "Hello",
+ /* .expect_tool_calls = */ {},
+ },
+ {
+ /* .name = */ "content with thinking_forced_open = false and reasoning_format = none",
+ /* .tools = */ {},
+ /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
+ /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE,
+ /* .json_schema = */ {},
+ /* .parallel_tool_calls = */ false,
+ /* .thinking_forced_open = */ true,
+ /* .input = */ (
+ "<think>The user said hello, I must say hello back</think>\nHello"
+ ),
+ /* .expect_reasoning = */ "",
+ /* .expect_content = */ "<think>The user said hello, I must say hello back</think>\nHello",
+ /* .expect_tool_calls = */ {},
+ },
+ {
+ /* .name = */ "content with thinking_forced_open = true",
+ /* .tools = */ {},
+ /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
+ /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
+ /* .json_schema = */ {},
+ /* .parallel_tool_calls = */ false,
+ /* .thinking_forced_open = */ true,
+ /* .input = */ (
+ "The user said hello, I must say hello back</think>\nHello"
+ ),
+ /* .expect_reasoning = */ "The user said hello, I must say hello back",
+ /* .expect_content = */ "Hello",
+ /* .expect_tool_calls = */ {},
+ },
+ {
+ /* .name = */ "content with thinking_forced_open = true and reasoning_format = none",
+ /* .tools = */ {},
+ /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
+ /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE,
+ /* .json_schema = */ {},
+ /* .parallel_tool_calls = */ false,
+ /* .thinking_forced_open = */ true,
+ /* .input = */ (
+ "The user said hello, I must say hello back</think>\nHello"
+ ),
+ /* .expect_reasoning = */ "",
+ /* .expect_content = */ "The user said hello, I must say hello back</think>\nHello",
+ /* .expect_tool_calls = */ {},
+ },
+ {
+ /* .name = */ "tools with tool_choice = auto and no parallel_tool_calls",
+ /* .tools = */ create_tools(),
+ /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_AUTO,
+ /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
+ /* .json_schema = */ {},
+ /* .parallel_tool_calls = */ false,
+ /* .thinking_forced_open = */ true,
+ /* .input = */ (
+ "I must get the weather in New York</think>\n"
+ "<tool_call>["
+ R"({"name": "get_current_weather", "arguments": {"location": "New York City, NY", "unit": "fahrenheit"}})"
+ "]</tool_call>"
+ ),
+ /* .expect_reasoning = */ "I must get the weather in New York",
+ /* .expect_content = */ "",
+ /* .expect_tool_calls = */ {{
+ /* .name = */ "get_current_weather",
+ /* .arguments = */ R"({"location": "New York City, NY", "unit": "fahrenheit"})",
+ /* .id = */ "",
+ }},
+ },
+ {
+ /* .name = */ "tools with tool_choice = auto and parallel_tool_calls",
+ /* .tools = */ create_tools(),
+ /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_AUTO,
+ /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
+ /* .json_schema = */ {},
+ /* .parallel_tool_calls = */ true,
+ /* .thinking_forced_open = */ true,
+ /* .input = */ (
+ "I must get the weather in New York and San Francisco and a 3 day forecast of each.</think>\nLet me search that for you."
+ "<tool_call>["
+ R"({"name": "get_current_weather", "arguments": {"location": "New York City, NY", "unit": "fahrenheit"}})"
+ ", "
+ R"({"name": "get_current_weather", "arguments": {"location": "San Francisco, CA", "unit": "fahrenheit"}})"
+ ", "
+ R"({"name": "get_forecast", "arguments": {"location": "New York City, NY", "unit": "fahrenheit", "days": 3}})"
+ ", "
+ R"({"name": "get_forecast", "arguments": {"location": "San Francisco, CA", "unit": "fahrenheit", "days": 3}})"
+ "]</tool_call>"
+ ),
+ /* .expect_reasoning = */ "I must get the weather in New York and San Francisco and a 3 day forecast of each.",
+ /* .expect_content = */ "Let me search that for you.",
+ /* .expect_tool_calls = */ {{
+ /* .name = */ "get_current_weather",
+ /* .arguments = */ R"({"location": "New York City, NY", "unit": "fahrenheit"})",
+ /* .id = */ "",
+ }, {
+ /* .name = */ "get_current_weather",
+ /* .arguments = */ R"({"location": "San Francisco, CA", "unit": "fahrenheit"})",
+ /* .id = */ "",
+ }, {
+ /* .name = */ "get_forecast",
+ /* .arguments = */ R"({"location": "New York City, NY", "unit": "fahrenheit", "days": 3})",
+ /* .id = */ "",
+ }, {
+ /* .name = */ "get_forecast",
+ /* .arguments = */ R"({"location": "San Francisco, CA", "unit": "fahrenheit", "days": 3})",
+ /* .id = */ "",
+ }},
+ },
+ {
+ /* .name = */ "response_format with thinking_forced_open = true",
+ /* .tools = */ {},
+ /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
+ /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
+ /* .json_schema = */ {
+ {"type", "object"},
+ {"properties", {
+ {"invoice_number", {{"type", "string"}}},
+ {"amount", {{"type", "number"}}},
+ {"due_date", {{"type", "string"}}}
+ }},
+ {"required", {"invoice_number", "amount", "due_date"}}
+ },
+ /* .parallel_tool_calls = */ false,
+ /* .thinking_forced_open = */ true,
+ /* .input = */ (
+ "I must produce the invoice in the requested format</think>\n"
+ R"({"invoice_number": "INV-2025-001", "amount": 1250.50, "due_date": "2025-12-31"})"
+ ),
+ /* .expect_reasoning = */ "I must produce the invoice in the requested format",
+ /* .expect_content = */ R"({"invoice_number": "INV-2025-001", "amount": 1250.50, "due_date": "2025-12-31"})",
+ /* .expect_tool_calls = */ {},
+ },
+ };
+
+ for (const auto & tc : test_cases) {
+ t.test(tc.name, [&](testing & t) {
+ auto parser = build_parser(tc);
+ auto lazy = !tc.tools.empty() && tc.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+ auto grammar = build_grammar([&](const common_grammar_builder & builder) {
+ for (auto const & def : tc.tools) {
+ auto function = def.at("function");
+ auto parameters = function.at("parameters");
+ builder.resolve_refs(parameters);
+ };
+ parser.build_grammar(builder, lazy);
+ });
+
+ t.log("Grammar:");
+ for (auto const & line : string_split(grammar, "\n")) {
+ t.log(line);
+ }
+
+ common_peg_parse_context ctx(tc.input, false);
+ auto result = parser.parse(ctx);
+
+ t.assert_true("success", result.success());
+
+ common_chat_msg msg;
+ auto mapper = common_chat_peg_native_mapper(msg);
+ mapper.from_ast(ctx.ast, result);
+
+ t.assert_equal("content equal", tc.expect_content, msg.content);
+ t.assert_equal("reasoning equal", tc.expect_reasoning, msg.reasoning_content);
+ t.assert_equal("number of tool calls", tc.expect_tool_calls.size(), msg.tool_calls.size());
+ for (auto i = 0u; i < std::min(tc.expect_tool_calls.size(), msg.tool_calls.size()); i++) {
+ t.assert_equal("tool name", tc.expect_tool_calls[i].name, msg.tool_calls[i].name);
+ t.assert_equal("tool args", tc.expect_tool_calls[i].arguments, msg.tool_calls[i].arguments);
+ }
+ });
+ }
+}
+
+// Example: a tool-call parser in the Qwen3-Coder style, assembled with the "constructed"
+// PEG builder.
+//
+// The grammar accepts free-form content followed by zero or more blocks of the form
+//
+//   <tool_call>
+//   <function=NAME>
+//   <parameter=ARG>value</parameter>
+//   </function>
+//   </tool_call>
+//
+// The test logs the grammar generated from the parser, then replays a sample response
+// token by token, checking that no prefix makes the parser fail and that diffing each
+// parsed message against the previous one does not throw.
+static void test_example_qwen3_coder(testing & t) {
+    auto tools = create_tools();
+    auto parser = build_chat_peg_constructed_parser([&](common_chat_peg_constructed_builder & p) {
+        // Everything up to the first "<tool_call>" is plain content.
+        auto content = p.rule("content", p.content(p.until("<tool_call>")));
+
+        // One rule per tool definition.
+        std::vector<common_peg_parser> tool_parsers;
+        for (auto const & def : tools) {
+            auto function = def.at("function");
+            std::string name = function.at("name");
+            auto parameters = function.at("parameters");
+            auto properties = parameters.at("properties");
+
+            // "required" is a sibling of "properties" inside the parameters schema, so look
+            // there first. The function-level lookup is kept as a fallback so that a
+            // definition carrying the list at the top level behaves as before.
+            std::set<std::string> required_properties;
+            if (parameters.contains("required")) {
+                parameters.at("required").get_to(required_properties);
+            } else if (function.contains("required")) {
+                function.at("required").get_to(required_properties);
+            }
+
+            // One parser per declared parameter; parameters that are not required are
+            // wrapped in optional().
+            std::vector<common_peg_parser> arg_parsers;
+            for (const auto & [param_name, param_schema] : properties.items()) {
+                bool is_required = required_properties.find(param_name) != required_properties.end();
+                auto type = param_schema.value("type", "object");
+
+                // String-typed arguments are captured as text up to the closing tag; every
+                // other type is parsed as a JSON value.
+                auto arg = p.tool_arg(p.sequence({
+                    p.tool_arg_open("<parameter=" + p.tool_arg_name(p.literal(param_name)) + ">"),
+                    (type == "string" ?
+                        p.tool_arg_string_value(
+                            p.schema(
+                                p.until_one_of({
+                                    "</parameter>\n<parameter=",
+                                    "</parameter>\n</function>"
+                                }),
+                                "tool-" + name + "-arg-" + param_name + "-schema",
+                                param_schema,
+                                true
+                            )
+                        ) : p.tool_arg_json_value(
+                            p.schema(
+                                p.json(),
+                                "tool-" + name + "-arg-" + param_name + "-schema",
+                                param_schema
+                            )
+                        )
+                    ),
+                    // The closing tag must be followed by another parameter or by the end
+                    // of the function; peek() checks this without consuming it.
+                    p.tool_arg_close(
+                        "</parameter>\n" +
+                        p.peek(p.literal("<parameter=") | p.literal("</function>"))
+                    )
+                }));
+
+                arg_parsers.push_back(is_required ?
+                    p.rule("tool-" + name + "-arg-" + param_name, arg) :
+                    p.optional(p.rule("tool-" + name + "-arg-" + param_name, arg)));
+            }
+
+            tool_parsers.push_back(p.rule("tool-" + name,
+                p.tool_open("<function=" + p.tool_name(p.literal(name)) + ">")
+                << p.sequence(arg_parsers)
+                << p.tool_close(p.literal("</function>"))
+            ));
+        }
+
+        auto tool_call = p.trigger_rule("tool-call",
+            "<tool_call>"
+            << p.choice(tool_parsers)
+            << "</tool_call>"
+        );
+
+        return content + p.zero_or_more(p.space() + tool_call) + p.end();
+    });
+
+    // Generate the grammar from the parser. resolve_refs() is handed a copy of each tool's
+    // parameter schema before the parser emits its rules.
+    auto grammar = build_grammar([&](const common_grammar_builder & builder) {
+        for (auto const & def : tools) {
+            auto function = def.at("function");
+            auto parameters = function.at("parameters");
+            builder.resolve_refs(parameters);
+        }
+        parser.build_grammar(builder);
+    });
+
+    t.log("Grammar:");
+    for (auto const & line : string_split(grammar, "\n")) {
+        t.log(line);
+    }
+
+    t.test("incremental parsing", [&](testing &t) {
+        std::string input =
+            "Let me search the knowledge base for cat pictures."
+            "<tool_call>\n"
+            "<function=search_knowledge_base>\n"
+            "<parameter=query>cat pictures</parameter>\n"
+            "<parameter=category>general</parameter>\n"
+            "</function>\n"
+            "</tool_call>";
+
+        std::vector<std::string> tokens = simple_tokenize(input);
+
+        common_chat_msg prev;
+        // Running prefix of the input, extended by one token per iteration instead of
+        // being rebuilt from the first token every time.
+        std::string in;
+        for (auto it = tokens.begin(); it != tokens.end(); ++it) {
+            in += *it;
+
+            // Every prefix except the complete input is parsed as partial.
+            common_peg_parse_context ctx(in, it + 1 < tokens.end());
+
+            auto result = parser.parse(ctx);
+            if (!t.assert_equal("not fail", false, result.fail())) {
+                t.log(in.substr(0, result.end) + "[failed->]" + in.substr(result.end));
+            }
+
+            common_chat_msg msg;
+            auto mapper = common_chat_peg_constructed_mapper(msg);
+            mapper.from_ast(ctx.ast, result);
+
+            //t.log("Input: " + input);
+            t.log("===========================================");
+            t.log("Iteration " + std::to_string(in.size()));
+            t.log("Reasoning: " + msg.reasoning_content);
+            t.log("Content  : " + msg.content);
+            for (const auto & tc : msg.tool_calls) {
+                t.log("Tool name: " + tc.name);
+                t.log("Tool args: " + tc.arguments);
+            }
+
+            try {
+                // This shouldn't emit any runtime errors
+                auto diffs = common_chat_msg_diff::compute_diffs(prev, msg);
+            } catch(const std::exception & e) {
+                t.log(in.substr(0, result.end) + "[failed->]" + in.substr(result.end));
+                t.assert_true(std::string("failed with ") + e.what(), false);
+            }
+
+            prev = msg;
+        }
+    });
+}
+
+// Runs one synthetic response through both the PEG-based parser and the legacy
+// hand-written parser, first as plain tests and then as benchmarks, each on the complete
+// input and on token-by-token growing prefixes.
+//
+// The response uses <|START_THINKING|>...<|END_THINKING|> for reasoning, followed by either
+// <|START_ACTION|>[ {tool call}, ... ]<|END_ACTION|> or <|START_RESPONSE|>...<|END_RESPONSE|>.
+//
+// NOTE(review): neither path asserts on the parsed message; the results are only printed
+// when print_results is set, and every call here passes false.
+void test_command7_parser_compare(testing & t) {
+    auto parser = build_chat_peg_native_parser([](common_chat_peg_native_builder & p) {
+        auto thinking = p.reasoning_block(
+            "<|START_THINKING|>" << p.reasoning(p.until("<|END_THINKING|>")) << "<|END_THINKING|>");
+
+        auto response = "<|START_RESPONSE|>" << p.content(p.until("<|END_RESPONSE|>")) << "<|END_RESPONSE|>";
+
+        // The three fields a tool call object may carry; the choice rule below accepts them
+        // in any order.
+        auto tool_call_id = p.atomic("\"tool_call_id\"" << (":" << ("\"" + p.tool_id(p.json_string_content()) + "\"")));
+        auto tool_call_name = p.atomic("\"tool_name\"" << (":" << ("\"" + p.tool_name(p.json_string_content()) + "\"")));
+        auto tool_call_args = "\"parameters\"" << (":" << p.tool_args(p.json()));
+
+        auto tool_call_fields = p.rule("tool-call-fields", tool_call_id | tool_call_name | tool_call_args);
+        auto tool_call = p.rule("tool-call", p.tool(
+            p.tool_open(p.literal("{"))
+            << tool_call_fields
+            << p.zero_or_more( p.literal(",") << tool_call_fields)
+            << p.tool_close(p.literal("}"))
+        ));
+
+        auto tool_calls = p.rule("tool-calls",
+            "<|START_ACTION|>"
+            << ("[" << tool_call << p.zero_or_more(p.literal(",") << tool_call) << "]")
+            << "<|END_ACTION|>");
+
+        // '+' binds tighter than '<<', so p.end() is attached to the (tool_calls | response)
+        // alternative before the optional thinking block is prepended.
+        return p.optional(thinking) << (tool_calls | response) + p.end();
+    });
+
+    // Parse `input` with the PEG parser `p`, map the AST onto a chat message and
+    // optionally print it.
+    auto test_current = [&](const common_peg_arena & p, const std::string & input, bool is_partial, bool print_results) {
+        common_peg_parse_context ctx(input, is_partial);
+        auto result = p.parse(ctx);
+
+        common_chat_msg msg;
+        auto mapper = common_chat_peg_native_mapper(msg);
+        mapper.from_ast(ctx.ast, result);
+
+        if (print_results) {
+            std::cout << "== Parsed (new) ==\n";
+            std::cout << "=== Reasoning ===\n";
+            std::cout << msg.reasoning_content << "\n";
+            std::cout << "\n\n=== Content ===\n";
+            std::cout << msg.content << "\n";
+            std::cout << "\n\n=== Tool Calls ===\n";
+            for (const auto & tc : msg.tool_calls) {
+                std::cout << "id: " << tc.id << "\n";
+                std::cout << "name: " << tc.name << "\n";
+                std::cout << "args: " << tc.arguments << "\n";
+            }
+        }
+    };
+
+    // Parse `input` with the legacy parser. Throws common_chat_msg_partial_exception when
+    // a tool call or the response block is incomplete.
+    auto test_legacy = [&](const std::string & input, bool need_more_input, bool print_results) {
+        // Original common_chat_combinator_parser taken from chat.cpp
+        common_chat_msg_parser builder(
+            input,
+            /* .is_partial = */ need_more_input,
+            {
+                /* .format = */ COMMON_CHAT_FORMAT_GENERIC,
+                /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
+                /* .reasoning_in_content = */ false,
+                /* .thinking_forced_open = */ false,
+            }
+        );
+
+        builder.try_parse_reasoning("<|START_THINKING|>", "<|END_THINKING|>");
+
+        static const common_regex start_action_regex("<\\|START_ACTION\\|>");
+        static const common_regex end_action_regex("<\\|END_ACTION\\|>");
+        static const common_regex start_response_regex("<\\|START_RESPONSE\\|>");
+        static const common_regex end_response_regex("<\\|END_RESPONSE\\|>");
+
+        if (builder.try_find_regex(start_action_regex)) {
+            // If we didn't extract thoughts, prelude includes them.
+            auto tool_calls = builder.consume_json_with_dumped_args({ { "parameters" } });
+            for (const auto & tool_call : tool_calls.value) {
+                std::string name = tool_call.contains("tool_name") ? tool_call.at("tool_name") : "";
+                std::string id = tool_call.contains("tool_call_id") ? tool_call.at("tool_call_id") : "";
+                std::string arguments = tool_call.contains("parameters") ? tool_call.at("parameters") : "";
+                if (!builder.add_tool_call(name, id, arguments) || tool_calls.is_partial) {
+                    throw common_chat_msg_partial_exception("incomplete tool call");
+                }
+            }
+            if (tool_calls.is_partial) {
+                throw common_chat_msg_partial_exception("incomplete tool call");
+            }
+            builder.consume_regex(end_action_regex);
+        } else if (builder.try_find_regex(start_response_regex)) {
+            if (!builder.try_find_regex(end_response_regex)) {
+                builder.add_content(builder.consume_rest());
+                throw common_chat_msg_partial_exception(end_response_regex.str());
+            }
+        } else {
+            builder.add_content(builder.consume_rest());
+        }
+
+        if (print_results) {
+            std::cout << "== Parsed (legacy) ==\n";
+            std::cout << "=== Reasoning ===\n";
+            std::cout << builder.result().reasoning_content << "\n";
+            std::cout << "\n\n=== Content ===\n";
+            std::cout << builder.result().content << "\n";
+            std::cout << "\n\n=== Tool Calls ===\n";
+            for (const auto & tc : builder.result().tool_calls) {
+                std::cout << "id: " << tc.id << "\n";
+                std::cout << "name: " << tc.name << "\n";
+                std::cout << "args: " << tc.arguments << "\n";
+            }
+        }
+    };
+
+    // Sample data: one reasoning text and one tool call (id, name, parameters).
+    std::string reasoning = "To plan an effective trip to Japan that includes both historical sites and modern attractions within a "
+            "budget of $4000 for a two-week stay, we need to:\n\n"
+            "1. Identify key historical sites and modern attractions in Japan.\n"
+            "2. Find affordable accommodation options that provide a balance between comfort and cost.\n"
+            "3. Determine the best modes of transportation for getting around Japan.\n"
+            "4. Create a day-by-day itinerary that ensures the user gets to see a variety of attractions without "
+            "overspending.\n"
+            "5. Provide a detailed cost breakdown that includes accommodation, transportation, meals, and entry fees "
+            "to attractions.";
+
+    std::vector<std::tuple<std::string, std::string, nlohmann::json>> tool_calls = {{
+        "call_0",
+        "plan_trip",
+        nlohmann::json::parse(R"({
+            "destination": "Japan",
+            "duration": 14,
+            "budget": 4000,
+            "interests": ["historical sites", "modern attractions"],
+            "accommodation_preferences": "affordable",
+            "transportation_preferences": "efficient",
+            "meal_preferences": "local cuisine"
+        })")
+    }};
+
+    std::vector<std::string> tokens;
+
+    // Build tokens
+    if (!reasoning.empty()) {
+        auto tokenized = simple_tokenize(reasoning);
+        tokens.emplace_back("<|START_THINKING|>");
+        tokens.insert(tokens.end(), tokenized.begin(), tokenized.end());
+        tokens.emplace_back("<|END_THINKING|>");
+    }
+
+    if (!tool_calls.empty()) {
+        tokens.emplace_back("<|START_ACTION|>");
+
+        auto json = nlohmann::json::array();
+        for (const auto & tc : tool_calls) {
+            auto tc_json = nlohmann::json::object();
+            tc_json["tool_call_id"] = std::get<0>(tc);
+            tc_json["tool_name"] = std::get<1>(tc);
+            tc_json["parameters"] = std::get<2>(tc);
+            json.push_back(tc_json);
+        }
+
+        // Serialize without indentation and with non-ASCII characters escaped.
+        auto tokenized = simple_tokenize(json.dump(-1, ' ', true));
+        tokens.insert(tokens.end(), tokenized.begin(), tokenized.end());
+
+        tokens.emplace_back("<|END_ACTION|>");
+    }
+
+    std::string input = std::accumulate(tokens.begin(), tokens.end(), std::string());
+
+    // Run tests
+    t.test("legacy_parse", [&](testing & /* t */) {
+        test_legacy(input, false, false);
+    });
+
+    t.test("current_parse", [&](testing & /* t */) {
+        test_current(parser, input, false, false);
+    });
+
+    // Run benchmarks
+    // NOTE(review): the trailing numeric argument to bench() is presumably an iteration
+    // count; confirm against the testing harness.
+    t.bench("legacy_parse_benchmark complete", [&]() {
+        test_legacy(input, false, false);
+    });
+
+    t.bench("legacy_parse_benchmark incremental", [&]() {
+        std::string in;
+        for (auto i = 0u; i < tokens.size(); i++) {
+            in += tokens[i];
+
+            try {
+                test_legacy(in, i + 1 < tokens.size(), false);
+            } catch (const common_chat_msg_partial_exception & /* e */) {
+                // Do nothing, this is expected
+            }
+        }
+    }, 20);
+
+    t.bench("current_parse_benchmark complete", [&]() {
+        test_current(parser, input, false, false);
+    }, 100);
+
+    t.bench("current_parse_benchmark incremental", [&]() {
+        std::string in;
+        for (auto i = 0u; i < tokens.size(); i++) {
+            in += tokens[i];
+            test_current(parser, in, i + 1 < tokens.size(), false);
+        }
+    }, 20);
+}
--- /dev/null
+#include <cstdlib>
+#include <string>
+#include <iostream>
+
+#include "peg-parser/tests.h"
+
+// Entry point of the PEG parser test binary: configures the harness from the command line
+// and the environment, runs the registered suites and returns the harness summary as the
+// process exit status.
+int main(int argc, char *argv[]) {
+    testing harness(std::cout);
+
+    // The first command-line argument, when present, is handed to the harness as a filter.
+    const bool has_filter_arg = argc >= 2;
+    if (has_filter_arg) {
+        harness.set_filter(argv[1]);
+    }
+
+    // LLAMA_TEST_VERBOSE only touches the verbose flag when the variable is set:
+    // "1" turns it on, any other value turns it off.
+    if (const char * verbose_env = getenv("LLAMA_TEST_VERBOSE"); verbose_env != nullptr) {
+        harness.verbose = (std::string(verbose_env) == "1");
+    }
+
+    harness.test("basic", test_basic);
+    harness.test("unicode", test_unicode);
+    harness.test("json", test_json_parser);
+    harness.test("gbnf", test_gbnf_generation);
+    harness.test("serialization", test_json_serialization);
+
+    return harness.summary();
+}
for (const auto & stop : chat_params.additional_stops) {
llama_params["stop"].push_back(stop);
}
+ if (!chat_params.parser.empty()) {
+ llama_params["chat_parser"] = chat_params.parser;
+ }
// Handle "n" field
int n_choices = json_value(body, "n", 1);
params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
params.oaicompat_chat_syntax.thinking_forced_open = json_value(data, "thinking_forced_open", false);
params.oaicompat_chat_syntax.parse_tool_calls = json_value(data, "parse_tool_calls", false);
+ if (data.contains("chat_parser")) {
+ params.oaicompat_chat_syntax.parser.load(data.at("chat_parser").get<std::string>());
+ }
}
{