#include "json-schema-to-grammar.h"
+#include "common.h"
+
#include <algorithm>
#include <fstream>
#include <map>
using json = nlohmann::ordered_json;
-template <typename Iterator>
-static std::string join(Iterator begin, Iterator end, const std::string & separator);
-
-static std::string repeat(const std::string & str, size_t n);
-
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
auto has_max = max_items != std::numeric_limits<int>::max();
if (sub_len > 0) {
auto from_sub = from.substr(i + 1);
auto to_sub = to.substr(i + 1);
- auto sub_zeros = repeat("0", sub_len);
- auto sub_nines = repeat("9", sub_len);
+ auto sub_zeros = string_repeat("0", sub_len);
+ auto sub_nines = string_repeat("9", sub_len);
auto to_reached = false;
out << "(";
auto max_digits = max_s.length();
for (auto digits = min_digits; digits < max_digits; digits++) {
- uniform_range(min_s, repeat("9", digits));
- min_s = "1" + repeat("0", digits);
+ uniform_range(min_s, string_repeat("9", digits));
+ min_s = "1" + string_repeat("0", digits);
out << " | ";
}
uniform_range(min_s, max_s);
std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
-template <typename Iterator>
-std::string join(Iterator begin, Iterator end, const std::string & separator) {
- std::ostringstream result;
- if (begin != end) {
- result << *begin;
- for (Iterator it = begin + 1; it != end; ++it) {
- result << separator << *it;
- }
- }
- return result.str();
-}
-
-static std::vector<std::string> split(const std::string & str, const std::string & delimiter) {
- std::vector<std::string> tokens;
- size_t start = 0;
- size_t end = str.find(delimiter);
-
- while (end != std::string::npos) {
- tokens.push_back(str.substr(start, end - start));
- start = end + delimiter.length();
- end = str.find(delimiter, start);
- }
-
- tokens.push_back(str.substr(start));
-
- return tokens;
-}
-
-static std::string repeat(const std::string & str, size_t n) {
- if (n == 0) {
- return "";
- }
-
- std::string result;
- result.reserve(str.length() * n);
-
- for (size_t i = 0; i < n; ++i) {
- result += str;
- }
-
- return result;
-}
-
static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch &)> & replacement) {
std::smatch match;
std::string result;
class SchemaConverter {
private:
+ friend std::string build_grammar(const std::function<void(const llama_grammar_builder &)> & cb);
std::function<json(const std::string &)> _fetch_json;
bool _dotall;
std::map<std::string, std::string> _rules;
for (size_t i = 0; i < alt_schemas.size(); i++) {
rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i)));
}
- return join(rules.begin(), rules.end(), " | ");
+ return string_join(rules, " | ");
}
std::string _visit_pattern(const std::string & pattern, const std::string & name) {
for (const auto & item : ret) {
results.push_back(to_rule(item));
}
- return std::make_pair(join(results.begin(), results.end(), " "), false);
+ return std::make_pair(string_join(results, " "), false);
};
while (i < length) {
}
curly_brackets += '}';
i++;
- auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
+ auto nums = string_split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
int min_times = 0;
int max_times = std::numeric_limits<int>::max();
try {
return;
}
std::string pointer = ref.substr(ref.find('#') + 1);
- std::vector<std::string> tokens = split(pointer, "/");
+ std::vector<std::string> tokens = string_split(pointer, "/");
for (size_t i = 1; i < tokens.size(); ++i) {
std::string sel = tokens[i];
if (target.is_null() || !target.contains(sel)) {
for (const auto & v : schema["enum"]) {
enum_values.push_back(_generate_constant_rule(v));
}
- return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space");
+ return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
} else if ((schema_type.is_null() || schema_type == "object")
&& (schema.contains("properties") ||
(schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
void check_errors() {
if (!_errors.empty()) {
- throw std::runtime_error("JSON schema conversion failed:\n" + join(_errors.begin(), _errors.end(), "\n"));
+ throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
}
if (!_warnings.empty()) {
- fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", join(_warnings.begin(), _warnings.end(), "; ").c_str());
+ fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
}
}
};
std::string json_schema_to_grammar(const json & schema) {
- SchemaConverter converter([](const std::string &) { return json::object(); }, /* dotall= */ false);
- auto copy = schema;
- converter.resolve_refs(copy, "input");
- converter.visit(copy, "");
+ return build_grammar([&](const llama_grammar_builder & callbacks) {
+ auto copy = schema;
+ callbacks.resolve_refs(copy);
+ callbacks.add_schema("", copy);
+ });
+}
+
+std::string build_grammar(const std::function<void(const llama_grammar_builder &)> & cb) {
+ SchemaConverter converter([&](const std::string &) { return json(); }, /* dotall= */ false);
+ llama_grammar_builder builder {
+ /* .add_rule = */ [&](const std::string & name, const std::string & rule) {
+ return converter._add_rule(name, rule);
+ },
+ /* .add_schema = */ [&](const std::string & name, const nlohmann::ordered_json & schema) {
+ return converter.visit(schema, name == "root" ? "" : name);
+ },
+ /* .resolve_refs = */ [&](nlohmann::ordered_json & schema) {
+ converter.resolve_refs(schema, "");
+ }
+ };
+ cb(builder);
converter.check_errors();
return converter.format_grammar();
}