From 139cc621e90b4f61830515c3c124cf35b3d7a6dc Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Fri, 28 Jun 2024 09:26:45 +0100 Subject: [PATCH] `json`: restore default additionalProperties to false, fix some pattern escapes (#8180) * json: expand ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS charset * json: revert default of additionalProperties to false * Update README.md --- common/json-schema-to-grammar.cpp | 4 +- examples/json_schema_to_grammar.py | 6 +-- .../server/public/json-schema-to-grammar.mjs | 4 +- grammars/README.md | 37 ++++++++++++------ tests/test-grammar-integration.cpp | 39 ++++++++++++++++++- tests/test-json-schema-to-grammar.cpp | 31 ++------------- 6 files changed, 73 insertions(+), 48 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 2f233e2e7..881eb49e3 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -316,7 +316,7 @@ std::unordered_map GRAMMAR_LITERAL_ESCAPES = { }; std::unordered_set NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'}; -std::unordered_set ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'}; +std::unordered_set ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'}; template std::string join(Iterator begin, Iterator end, const std::string & separator) { @@ -720,7 +720,7 @@ private: } prop_names.push_back(prop_name); } - if (!(additional_properties.is_boolean() && !additional_properties.get())) { + if ((additional_properties.is_boolean() && additional_properties.get()) || additional_properties.is_object()) { std::string sub_name = name + (name.empty() ? "" : "-") + "additional"; std::string value_rule = additional_properties.is_object() ? visit(additional_properties, sub_name + "-value") diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index 92f6e3d47..072a230f7 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -231,7 +231,7 @@ GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]') GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'} NON_LITERAL_SET = set('|.()[]{}*+?') -ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?') +ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('^$.[]()|{}*+?') class SchemaConverter: @@ -602,7 +602,7 @@ class SchemaConverter: else: add_component(t, is_required=True) - return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=[])) + return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None)) elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema): items = schema.get('items') or schema['prefixItems'] @@ -691,7 +691,7 @@ class SchemaConverter: required_props = [k for k in sorted_props if k in required] optional_props = [k for k in sorted_props if k not in required] - if additional_properties != False: + if additional_properties is not None and additional_properties != False: sub_name = f'{name}{"-" if name else ""}additional' value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \ self._add_primitive('value', PRIMITIVE_RULES['value']) diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs index 06d76edde..7267f3f9c 100644 --- a/examples/server/public/json-schema-to-grammar.mjs +++ b/examples/server/public/json-schema-to-grammar.mjs @@ -259,7 +259,7 @@ const GRAMMAR_RANGE_LITERAL_ESCAPE_RE = /[\n\r"\]\-\\]/g; const GRAMMAR_LITERAL_ESCAPES = { '\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]' }; const NON_LITERAL_SET = new Set('|.()[]{}*+?'); -const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('[]()|{}*+?'); +const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('^$.[]()|{}*+?'); export class SchemaConverter { constructor(options) { @@ -751,7 +751,7 @@ export class SchemaConverter { const requiredProps = sortedProps.filter(k => required.has(k)); const optionalProps = sortedProps.filter(k => !required.has(k)); - if (additionalProperties !== false) { + if (additionalProperties) { const subName = `${name ?? ''}${name ? '-' : ''}additional`; const valueRule = additionalProperties != null && typeof additionalProperties === 'object' ? this.visit(additionalProperties, `${subName}-value`) diff --git a/grammars/README.md b/grammars/README.md index 40f666240..886023f77 100644 --- a/grammars/README.md +++ b/grammars/README.md @@ -182,6 +182,8 @@ space ::= | " " | "\n" [ \t]{0,20} Here is also a list of known limitations (contributions welcome): +- `additionalProperties` defaults to `false` (produces faster grammars + reduces hallucinations). +- `"additionalProperties": true` may produce keys that contain unescaped newlines. - Unsupported features are skipped silently. It is currently advised to use the command-line Python converter (see above) to see any warnings, and to inspect the resulting grammar / test it w/ [llama-gbnf-validator](../examples/gbnf-validator/gbnf-validator.cpp). - Can't mix `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703) - [prefixItems](https://json-schema.org/draft/2020-12/json-schema-core#name-prefixitems) is broken (but [items](https://json-schema.org/draft/2020-12/json-schema-core#name-items) works) @@ -203,10 +205,11 @@ And a non-exhaustive list of other unsupported features that are unlikely to be ### A word about additionalProperties > [!WARNING] -> By default, `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties), which you might not want / not expect, and which will make sampling slower (not just because of the extra tokens, but also generates a slower grammar). -> You can set `"additionalProperties": false` on the schema of any object to ensure only properties listed in `properties` are generated (not needed for non-`object` types, e.g. `array` or `string`). +> The JSON schemas spec states `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties) by default. +> Since this is slow and seems prone to hallucinations, we default to no additional properties. +> You can set `"additionalProperties": true` in the the schema of any object to explicitly allow additional properties. -If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can disable additional properties with the `extra` config on each model class: +If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can enable additional properties with the `extra` config on each model class: ```python # pip install pydantic @@ -215,14 +218,14 @@ from typing import Annotated, List from pydantic import BaseModel, Extra, Field class QAPair(BaseModel): class Config: - extra = 'forbid' # triggers additionalProperties: false in the JSON schema + extra = 'allow' # triggers additionalProperties: true in the JSON schema question: str concise_answer: str justification: str class Summary(BaseModel): class Config: - extra = 'forbid' + extra = 'allow' key_facts: List[Annotated[str, Field(pattern='- .{5,}')]] question_answers: List[Annotated[List[QAPair], Field(min_items=5)]] @@ -236,7 +239,7 @@ print(json.dumps(Summary.model_json_schema(), indent=2)) { "$defs": { "QAPair": { - "additionalProperties": false, + "additionalProperties": true, "properties": { "question": { "title": "Question", @@ -260,7 +263,7 @@ print(json.dumps(Summary.model_json_schema(), indent=2)) "type": "object" } }, - "additionalProperties": false, + "additionalProperties": true, "properties": { "key_facts": { "items": { @@ -292,30 +295,40 @@ print(json.dumps(Summary.model_json_schema(), indent=2)) ``` ``` -QAPair ::= "{" space QAPair-question-kv "," space QAPair-concise-answer-kv "," space QAPair-justification-kv "}" space +QAPair ::= "{" space QAPair-question-kv "," space QAPair-concise-answer-kv "," space QAPair-justification-kv ( "," space ( QAPair-additional-kv ( "," space QAPair-additional-kv )* ) )? "}" space +QAPair-additional-k ::= ["] ( [c] ([o] ([n] ([c] ([i] ([s] ([e] ([_] ([a] ([n] ([s] ([w] ([e] ([r] char+ | [^"r] char*) | [^"e] char*) | [^"w] char*) | [^"s] char*) | [^"n] char*) | [^"a] char*) | [^"_] char*) | [^"e] char*) | [^"s] char*) | [^"i] char*) | [^"c] char*) | [^"n] char*) | [^"o] char*) | [j] ([u] ([s] ([t] ([i] ([f] ([i] ([c] ([a] ([t] ([i] ([o] ([n] char+ | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"a] char*) | [^"c] char*) | [^"i] char*) | [^"f] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"u] char*) | [q] ([u] ([e] ([s] ([t] ([i] ([o] ([n] char+ | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"e] char*) | [^"u] char*) | [^"cjq] char* )? ["] space +QAPair-additional-kv ::= QAPair-additional-k ":" space value QAPair-concise-answer-kv ::= "\"concise_answer\"" space ":" space string QAPair-justification-kv ::= "\"justification\"" space ":" space string QAPair-question-kv ::= "\"question\"" space ":" space string +additional-k ::= ["] ( [k] ([e] ([y] ([_] ([f] ([a] ([c] ([t] ([s] char+ | [^"s] char*) | [^"t] char*) | [^"c] char*) | [^"a] char*) | [^"f] char*) | [^"_] char*) | [^"y] char*) | [^"e] char*) | [q] ([u] ([e] ([s] ([t] ([i] ([o] ([n] ([_] ([a] ([n] ([s] ([w] ([e] ([r] ([s] char+ | [^"s] char*) | [^"r] char*) | [^"e] char*) | [^"w] char*) | [^"s] char*) | [^"n] char*) | [^"a] char*) | [^"_] char*) | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"e] char*) | [^"u] char*) | [^"kq] char* )? ["] space +additional-kv ::= additional-k ":" space value +array ::= "[" space ( value ("," space value)* )? "]" space +boolean ::= ("true" | "false") space char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) +decimal-part ::= [0-9]{1,16} dot ::= [^\x0A\x0D] +integral-part ::= [0] | [1-9] [0-9]{0,15} key-facts ::= "[" space (key-facts-item ("," space key-facts-item)*)? "]" space key-facts-item ::= "\"" "- " key-facts-item-1{5,} "\"" space key-facts-item-1 ::= dot key-facts-kv ::= "\"key_facts\"" space ":" space key-facts +null ::= "null" space +number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space +object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space question-answers ::= "[" space (question-answers-item ("," space question-answers-item)*)? "]" space question-answers-item ::= "[" space question-answers-item-item ("," space question-answers-item-item){4,} "]" space question-answers-item-item ::= QAPair question-answers-kv ::= "\"question_answers\"" space ":" space question-answers -root ::= "{" space key-facts-kv "," space question-answers-kv "}" space +root ::= "{" space key-facts-kv "," space question-answers-kv ( "," space ( additional-kv ( "," space additional-kv )* ) )? "}" space space ::= | " " | "\n" [ \t]{0,20} string ::= "\"" char* "\"" space +value ::= object | array | string | number | boolean | null ``` -If you're using [Zod](https://zod.dev/), you can make your objects explicitly strict w/ `z.object(...).strict()` or `z.strictObject(...)`. - -Note however that [zod-to-json-schema](https://github.com/StefanTerdell/zod-to-json-schema) currently always seems to set `"additionalProperties": false` anyway (even w/ zod schemas on which `nonstrict()` / `passthrough()` was called). +If you're using [Zod](https://zod.dev/), you can make your objects to explicitly allow extra properties w/ `nonstrict()` / `passthrough()` (or explicitly no extra props w/ `z.object(...).strict()` or `z.strictObject(...)`) but note that [zod-to-json-schema](https://github.com/StefanTerdell/zod-to-json-schema) currently always sets `"additionalProperties": false` anyway. ```js import { z } from 'zod'; diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 0e21dc795..975658f79 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -993,6 +993,40 @@ static void test_json_schema() { } ); + test_schema( + "simple pattern", + // Schema + R"""({ + "pattern": "^[a-zA-Z0-9_-]*$" + })""", + // Passing strings + { + R"""("")""", + R"""("He_llo-12")""", + }, + // Failing strings + { + R"""("!")""", + R"""("Hello World")""", + } + ); + + test_schema( + "pattern with escapes", + // Schema + R"""({ + "pattern": "^a\\^\\$\\.\\[\\]\\(\\)\\|\\{\\}\\*\\+\\?b$" + })""", + // Passing strings + { + R"""("a^$.[]()|{}*+?b")""", + }, + // Failing strings + { + R"""("ab")""", + } + ); + test_schema( "", // Schema @@ -1062,8 +1096,6 @@ static void test_json_schema() { R"""({ "number": 1600, "street_name": "Pennsylvania" })""", // "By extension, even an empty object is valid" R"""({})""", - // "By default, providing additional properties is valid" - R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""", R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""", }, // Failing strings @@ -1074,6 +1106,9 @@ static void test_json_schema() { R"""({ "street_name": "Pennsylvania", "number": 1600 })""", // Reorder properties R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""", + // "Additional properties default to false for generation, even though the spec says true. + R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""", + } ); diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index 3aaa11833..720a949c7 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -1120,28 +1120,15 @@ static void test_all(const std::string & lang, std::function