mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-11 21:10:24 +01:00
examples : generate JSON according to schema (#1887)
* examples : add JSON schema grammars * complete JSON grammar * ensure primitive types can be used as root of schema * support integer type and adjust usage text
This commit is contained in:
parent
468ea24fb4
commit
8183159cf3
132
examples/json-schema-to-grammar.py
Normal file
132
examples/json-schema-to-grammar.py
Normal file
@ -0,0 +1,132 @@
|
|||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# whitespace is constrained to a single space char to prevent model "running away" in
# whitespace. Also maybe improves generation quality?
SPACE_RULE = '" "?'

# Grammar rule bodies for the JSON schema primitive types. Every rule ends with
# `space` so that optional whitespace is consumed after the value.
PRIMITIVE_RULES = {
    'boolean': '("true" | "false") space',
    'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
    'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
    'string': r''' "\"" (
        [^"\\] |
        "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
      )* "\"" space ''',
    'null': '"null" space',
}

# Any run of characters outside [a-zA-Z0-9-] is collapsed to "-" in rule names.
INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
# Characters that must be escaped inside a double-quoted grammar literal. The
# backslash is included because json.dumps() emits backslash escapes (\" \\ \n ...)
# that have to reach the generated JSON text verbatim instead of being
# interpreted as grammar-level escapes.
GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\\]')
GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '\\': '\\\\'}


class SchemaConverter:
    """Translate a (subset of) JSON schema into named grammar rules.

    Rules are accumulated in ``self._rules`` (rule name -> rule body) by
    ``visit`` and rendered as ``name ::= body`` lines by ``format_grammar``.
    """

    def __init__(self, prop_order):
        """Create a converter.

        prop_order: mapping of property name -> sort position; properties
            missing from the mapping are ordered after the listed ones,
            alphabetically.
        """
        self._prop_order = prop_order
        self._rules = {'space': SPACE_RULE}

    def _format_literal(self, literal):
        """Return a quoted grammar literal matching the JSON encoding of `literal`."""
        escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
            lambda m: GRAMMAR_LITERAL_ESCAPES[m.group(0)], json.dumps(literal)
        )
        return f'"{escaped}"'

    def _add_rule(self, name, rule):
        """Register `rule` under a sanitized form of `name` and return the key used.

        If the sanitized name is already taken by a different rule body, the
        first free numeric suffix (name0, name1, ...) is used instead.
        """
        esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
        if esc_name not in self._rules or self._rules[esc_name] == rule:
            key = esc_name
        else:
            i = 0
            while f'{esc_name}{i}' in self._rules:
                i += 1
            key = f'{esc_name}{i}'
        self._rules[key] = rule
        return key

    @staticmethod
    def _child_name(parent, child):
        """Join a parent rule name and a child component with "-" (no prefix at the root)."""
        return f'{parent}{"-" if parent else ""}{child}'

    def visit(self, schema, name):
        """Add the rule(s) for `schema` and return the name of its top rule.

        schema: dict holding the JSON schema node.
        name: rule-name prefix for this node; the empty string denotes the
            schema root, whose rule is called "root".

        Raises ValueError when the schema uses an unsupported construct.
        """
        schema_type = schema.get('type')
        rule_name = name or 'root'

        if 'oneOf' in schema or 'anyOf' in schema:
            alternatives = schema.get('oneOf') or schema.get('anyOf')
            if not alternatives:
                # an empty alternation would yield an empty, invalid rule
                raise ValueError(f'Unrecognized schema: {schema}')
            rule = ' | '.join(
                self.visit(alt_schema, self._child_name(name, i))
                for i, alt_schema in enumerate(alternatives)
            )
            return self._add_rule(rule_name, rule)

        if 'const' in schema:
            return self._add_rule(rule_name, self._format_literal(schema['const']))

        if 'enum' in schema:
            rule = ' | '.join(self._format_literal(v) for v in schema['enum'])
            return self._add_rule(rule_name, rule)

        if schema_type == 'object' and 'properties' in schema:
            # TODO: `required` keyword
            prop_order = self._prop_order
            prop_pairs = sorted(
                schema['properties'].items(),
                # sort by position in prop_order (if specified) then by key
                key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]),
            )

            parts = ['"{" space']
            for i, (prop_name, prop_schema) in enumerate(prop_pairs):
                prop_rule_name = self.visit(prop_schema, self._child_name(name, prop_name))
                if i > 0:
                    parts.append('"," space')
                parts.append(
                    f'{self._format_literal(prop_name)} space ":" space {prop_rule_name}'
                )
            parts.append('"}" space')
            return self._add_rule(rule_name, ' '.join(parts))

        if schema_type == 'array' and 'items' in schema:
            # TODO `prefixItems` keyword
            item_rule_name = self.visit(schema['items'], self._child_name(name, 'item'))
            rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space'
            return self._add_rule(rule_name, rule)

        # Explicit raise instead of assert: asserts vanish under `python -O`.
        # The isinstance check keeps an unhashable `type` (e.g. a list) from
        # surfacing as an unrelated TypeError.
        if not isinstance(schema_type, str) or schema_type not in PRIMITIVE_RULES:
            raise ValueError(f'Unrecognized schema: {schema}')
        # Primitive rules are shared under their type name, except when the
        # primitive is itself the root of the schema.
        return self._add_rule(
            'root' if rule_name == 'root' else schema_type,
            PRIMITIVE_RULES[schema_type]
        )

    def format_grammar(self):
        """Render all collected rules, one `name ::= rule` per line, in insertion order."""
        return '\n'.join(f'{name} ::= {rule}' for name, rule in self._rules.items())
|
||||||
|
|
||||||
|
|
||||||
|
def main(args_in = None):
    """Command-line entry point: read a JSON schema and print the generated grammar.

    args_in: optional list of argument strings; when None, argparse falls back
        to the process command line.
    """
    parser = argparse.ArgumentParser(
        description='''
            Generates a grammar (suitable for use in ./main) that produces JSON conforming to a
            given JSON schema. Only a subset of JSON schema features are supported; more may be
            added in the future.
        ''',
    )
    parser.add_argument(
        '--prop-order',
        default=[],
        type=lambda s: s.split(','),
        help='''
            comma-separated property names defining the order of precedence for object properties;
            properties not specified here are given lower precedence than those that are, and are
            sorted alphabetically
        '''
    )
    parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)')
    args = parser.parse_args(args_in)

    if args.schema == '-':
        schema = json.load(sys.stdin)
    else:
        # Context manager so the file handle is closed deterministically; JSON
        # text is UTF-8, so do not depend on the platform default encoding.
        with open(args.schema, encoding='utf-8') as schema_file:
            schema = json.load(schema_file)

    # property name -> position, as consumed by SchemaConverter for sorting
    prop_order = {name: idx for idx, name in enumerate(args.prop_order)}
    converter = SchemaConverter(prop_order)
    converter.visit(schema, '')
    print(converter.format_grammar())


if __name__ == '__main__':
    main()
|
@ -1,19 +1,17 @@
|
|||||||
# Grammar for subset of JSON - doesn't support full string or number syntax
|
|
||||||
|
|
||||||
root ::= object
|
root ::= object
|
||||||
value ::= object | array | string | number | boolean | "null"
|
value ::= object | array | string | number | ("true" | "false" | "null") ws
|
||||||
|
|
||||||
object ::=
|
object ::=
|
||||||
"{" ws (
|
"{" ws (
|
||||||
string ":" ws value
|
string ":" ws value
|
||||||
("," ws string ":" ws value)*
|
("," ws string ":" ws value)*
|
||||||
)? "}"
|
)? "}" ws
|
||||||
|
|
||||||
array ::=
|
array ::=
|
||||||
"[" ws (
|
"[" ws (
|
||||||
value
|
value
|
||||||
("," ws value)*
|
("," ws value)*
|
||||||
)? "]"
|
)? "]" ws
|
||||||
|
|
||||||
string ::=
|
string ::=
|
||||||
"\"" (
|
"\"" (
|
||||||
@ -21,9 +19,7 @@ string ::=
|
|||||||
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
|
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
|
||||||
)* "\"" ws
|
)* "\"" ws
|
||||||
|
|
||||||
# Only plain integers currently
|
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
|
||||||
number ::= "-"? [0-9]+ ws
|
|
||||||
boolean ::= ("true" | "false") ws
|
|
||||||
|
|
||||||
# Optional space: by convention, applied in this grammar after literal chars when allowed
|
# Optional space: by convention, applied in this grammar after literal chars when allowed
|
||||||
ws ::= ([ \t\n] ws)?
|
ws ::= ([ \t\n] ws)?
|
||||||
|
Loading…
x
Reference in New Issue
Block a user