From 52ee4540c0f2e11d52c839db6eb51d014ce060e1 Mon Sep 17 00:00:00 2001 From: Maximilian Winter Date: Fri, 12 Jan 2024 20:46:45 +0100 Subject: [PATCH] examples : add pydantic models to GBNF grammar generator (#4883) * Create pydantic-models-to-grammar.py * Added some comments for usage * Refactored Grammar Generator Added example and usage instruction. * Update pydantic_models_to_grammar.py * Update pydantic-models-to-grammar-examples.py * Renamed module and imported it. * Update pydantic-models-to-grammar.py * Renamed file and fixed grammar generator issue. --- .../pydantic-models-to-grammar-examples.py | 136 ++ examples/pydantic_models_to_grammar.py | 1151 +++++++++++++++++ 2 files changed, 1287 insertions(+) create mode 100644 examples/pydantic-models-to-grammar-examples.py create mode 100644 examples/pydantic_models_to_grammar.py diff --git a/examples/pydantic-models-to-grammar-examples.py b/examples/pydantic-models-to-grammar-examples.py new file mode 100644 index 000000000..a8a4919cf --- /dev/null +++ b/examples/pydantic-models-to-grammar-examples.py @@ -0,0 +1,136 @@ +# Function calling example using pydantic models. + +import json +from enum import Enum +from typing import Union, Optional + +import requests +from pydantic import BaseModel, Field + +import importlib +from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation + +# Function to get completion on the llama.cpp server with grammar. +def create_completion(prompt, grammar): + headers = {"Content-Type": "application/json"} + data = {"prompt": prompt, "grammar": grammar} + + response = requests.post("http://127.0.0.1:8080/completion", headers=headers, json=data) + data = response.json() + + print(data["content"]) + return data["content"] + + +# A function for the agent to send a message to the user. +class SendMessageToUser(BaseModel): + """ + Send a message to the User. + """ + chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.") + message: str = Field(..., description="Message you want to send to the user.") + + def run(self): + print(self.message) + + +# Enum for the calculator function. +class MathOperation(Enum): + ADD = "add" + SUBTRACT = "subtract" + MULTIPLY = "multiply" + DIVIDE = "divide" + + +# Very simple calculator tool for the agent. +class Calculator(BaseModel): + """ + Perform a math operation on two numbers. + """ + number_one: Union[int, float] = Field(..., description="First number.") + operation: MathOperation = Field(..., description="Math operation to perform.") + number_two: Union[int, float] = Field(..., description="Second number.") + + def run(self): + if self.operation == MathOperation.ADD: + return self.number_one + self.number_two + elif self.operation == MathOperation.SUBTRACT: + return self.number_one - self.number_two + elif self.operation == MathOperation.MULTIPLY: + return self.number_one * self.number_two + elif self.operation == MathOperation.DIVIDE: + return self.number_one / self.number_two + else: + raise ValueError("Unknown operation.") + + +# Here the grammar gets generated by passing the available function models to generate_gbnf_grammar_and_documentation function. This also generates a documentation usable by the LLM. +# pydantic_model_list is the list of pydanitc models +# outer_object_name is an optional name for an outer object around the actual model object. Like a "function" object with "function_parameters" which contains the actual model object. If None, no outer object will be generated +# outer_object_content is the name of outer object content. +# model_prefix is the optional prefix for models in the documentation. (Default="Output Model") +# fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields") +gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( + pydantic_model_list=[SendMessageToUser, Calculator], outer_object_name="function", + outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters") + +print(gbnf_grammar) +print(documentation) + +system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation + +user_message = "What is 42 * 42?" +prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant" + +text = create_completion(prompt=prompt, grammar=gbnf_grammar) +# This should output something like this: +# { +# "function": "calculator", +# "function_parameters": { +# "number_one": 42, +# "operation": "multiply", +# "number_two": 42 +# } +# } +function_dictionary = json.loads(text) +if function_dictionary["function"] == "calculator": + function_parameters = {**function_dictionary["function_parameters"]} + + print(Calculator(**function_parameters).run()) + # This should output: 1764 + + +# A example structured output based on pydantic models. The LLM will create an entry for a Book database out of an unstructured text. +class Category(Enum): + """ + The category of the book. + """ + Fiction = "Fiction" + NonFiction = "Non-Fiction" + + +class Book(BaseModel): + """ + Represents an entry about a book. + """ + title: str = Field(..., description="Title of the book.") + author: str = Field(..., description="Author of the book.") + published_year: Optional[int] = Field(..., description="Publishing year of the book.") + keywords: list[str] = Field(..., description="A list of keywords.") + category: Category = Field(..., description="Category of the book.") + summary: str = Field(..., description="Summary of the book.") + + +# We need no additional parameters other than our list of pydantic models. +gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation([Book]) + +system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation + +text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands.""" +prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant" + +text = create_completion(prompt=prompt, grammar=gbnf_grammar) + +json_data = json.loads(text) + +print(Book(**json_data)) diff --git a/examples/pydantic_models_to_grammar.py b/examples/pydantic_models_to_grammar.py new file mode 100644 index 000000000..41b98fdc1 --- /dev/null +++ b/examples/pydantic_models_to_grammar.py @@ -0,0 +1,1151 @@ +import inspect +import json +from copy import copy +from inspect import isclass, getdoc +from types import NoneType + +from pydantic import BaseModel, create_model, Field +from typing import Any, Type, List, get_args, get_origin, Tuple, Union, Optional, _GenericAlias +from enum import Enum +from typing import get_type_hints, Callable +import re + + +class PydanticDataType(Enum): + """ + Defines the data types supported by the grammar_generator. + + Attributes: + STRING (str): Represents a string data type. + BOOLEAN (str): Represents a boolean data type. + INTEGER (str): Represents an integer data type. + FLOAT (str): Represents a float data type. + OBJECT (str): Represents an object data type. + ARRAY (str): Represents an array data type. + ENUM (str): Represents an enum data type. + CUSTOM_CLASS (str): Represents a custom class data type. + """ + STRING = "string" + TRIPLE_QUOTED_STRING = "triple_quoted_string" + MARKDOWN_STRING = "markdown_string" + BOOLEAN = "boolean" + INTEGER = "integer" + FLOAT = "float" + OBJECT = "object" + ARRAY = "array" + ENUM = "enum" + ANY = "any" + NULL = "null" + CUSTOM_CLASS = "custom-class" + CUSTOM_DICT = "custom-dict" + SET = "set" + + +def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str: + if isclass(pydantic_type) and issubclass(pydantic_type, str): + return PydanticDataType.STRING.value + elif isclass(pydantic_type) and issubclass(pydantic_type, bool): + return PydanticDataType.BOOLEAN.value + elif isclass(pydantic_type) and issubclass(pydantic_type, int): + return PydanticDataType.INTEGER.value + elif isclass(pydantic_type) and issubclass(pydantic_type, float): + return PydanticDataType.FLOAT.value + elif isclass(pydantic_type) and issubclass(pydantic_type, Enum): + return PydanticDataType.ENUM.value + + elif isclass(pydantic_type) and issubclass(pydantic_type, BaseModel): + return format_model_and_field_name(pydantic_type.__name__) + elif get_origin(pydantic_type) == list: + element_type = get_args(pydantic_type)[0] + return f"{map_pydantic_type_to_gbnf(element_type)}-list" + elif get_origin(pydantic_type) == set: + element_type = get_args(pydantic_type)[0] + return f"{map_pydantic_type_to_gbnf(element_type)}-set" + elif get_origin(pydantic_type) == Union: + union_types = get_args(pydantic_type) + union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types] + return f"union-{'-or-'.join(union_rules)}" + elif get_origin(pydantic_type) == Optional: + element_type = get_args(pydantic_type)[0] + return f"optional-{map_pydantic_type_to_gbnf(element_type)}" + elif isclass(pydantic_type): + return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(pydantic_type.__name__)}" + elif get_origin(pydantic_type) == dict: + key_type, value_type = get_args(pydantic_type) + return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}" + else: + return "unknown" + + +def format_model_and_field_name(model_name: str) -> str: + parts = re.findall('[A-Z][^A-Z]*', model_name) + if not parts: # Check if the list is empty + return model_name.lower().replace("_", "-") + return '-'.join(part.lower().replace("_", "-") for part in parts) + + +def generate_list_rule(element_type): + """ + Generate a GBNF rule for a list of a given element type. + + :param element_type: The type of the elements in the list (e.g., 'string'). + :return: A string representing the GBNF rule for a list of the given type. + """ + rule_name = f"{map_pydantic_type_to_gbnf(element_type)}-list" + element_rule = map_pydantic_type_to_gbnf(element_type) + list_rule = fr'{rule_name} ::= "[" {element_rule} ("," {element_rule})* "]"' + return list_rule + + +def get_members_structure(cls, rule_name): + if issubclass(cls, Enum): + # Handle Enum types + members = [f'\"\\\"{member.value}\\\"\"' for name, member in cls.__members__.items()] + return f"{cls.__name__.lower()} ::= " + " | ".join(members) + if cls.__annotations__ and cls.__annotations__ != {}: + result = f'{rule_name} ::= "{{"' + type_list_rules = [] + # Modify this comprehension + members = [f' \"\\\"{name}\\\"\" ":" {map_pydantic_type_to_gbnf(param_type)}' + for name, param_type in cls.__annotations__.items() + if name != 'self'] + + result += '"," '.join(members) + result += ' "}"' + return result, type_list_rules + elif rule_name == "custom-class-any": + result = f'{rule_name} ::= ' + result += 'value' + type_list_rules = [] + return result, type_list_rules + else: + init_signature = inspect.signature(cls.__init__) + parameters = init_signature.parameters + result = f'{rule_name} ::= "{{"' + type_list_rules = [] + # Modify this comprehension too + members = [f' \"\\\"{name}\\\"\" ":" {map_pydantic_type_to_gbnf(param.annotation)}' + for name, param in parameters.items() + if name != 'self' and param.annotation != inspect.Parameter.empty] + + result += '", "'.join(members) + result += ' "}"' + return result, type_list_rules + + +def regex_to_gbnf(regex_pattern: str) -> str: + """ + Translate a basic regex pattern to a GBNF rule. + Note: This function handles only a subset of simple regex patterns. + """ + gbnf_rule = regex_pattern + + # Translate common regex components to GBNF + gbnf_rule = gbnf_rule.replace('\\d', '[0-9]') + gbnf_rule = gbnf_rule.replace('\\s', '[ \t\n]') + + # Handle quantifiers and other regex syntax that is similar in GBNF + # (e.g., '*', '+', '?', character classes) + + return gbnf_rule + + +def generate_gbnf_integer_rules(max_digit=None, min_digit=None): + """ + + Generate GBNF Integer Rules + + Generates GBNF (Generalized Backus-Naur Form) rules for integers based on the given maximum and minimum digits. + + Parameters: + max_digit (int): The maximum number of digits for the integer. Default is None. + min_digit (int): The minimum number of digits for the integer. Default is None. + + Returns: + integer_rule (str): The identifier for the integer rule generated. + additional_rules (list): A list of additional rules generated based on the given maximum and minimum digits. + + """ + additional_rules = [] + + # Define the rule identifier based on max_digit and min_digit + integer_rule = "integer-part" + if max_digit is not None: + integer_rule += f"-max{max_digit}" + if min_digit is not None: + integer_rule += f"-min{min_digit}" + + # Handling Integer Rules + if max_digit is not None or min_digit is not None: + # Start with an empty rule part + integer_rule_part = '' + + # Add mandatory digits as per min_digit + if min_digit is not None: + integer_rule_part += '[0-9] ' * min_digit + + # Add optional digits up to max_digit + if max_digit is not None: + optional_digits = max_digit - (min_digit if min_digit is not None else 0) + integer_rule_part += ''.join(['[0-9]? ' for _ in range(optional_digits)]) + + # Trim the rule part and append it to additional rules + integer_rule_part = integer_rule_part.strip() + if integer_rule_part: + additional_rules.append(f'{integer_rule} ::= {integer_rule_part}') + + return integer_rule, additional_rules + + +def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None, min_precision=None): + """ + Generate GBNF float rules based on the given constraints. + + :param max_digit: Maximum number of digits in the integer part (default: None) + :param min_digit: Minimum number of digits in the integer part (default: None) + :param max_precision: Maximum number of digits in the fractional part (default: None) + :param min_precision: Minimum number of digits in the fractional part (default: None) + :return: A tuple containing the float rule and additional rules as a list + + Example Usage: + max_digit = 3 + min_digit = 1 + max_precision = 2 + min_precision = 1 + generate_gbnf_float_rules(max_digit, min_digit, max_precision, min_precision) + + Output: + ('float-3-1-2-1', ['integer-part-max3-min1 ::= [0-9] [0-9] [0-9]?', 'fractional-part-max2-min1 ::= [0-9] [0-9]?', 'float-3-1-2-1 ::= integer-part-max3-min1 "." fractional-part-max2-min + *1']) + + Note: + GBNF stands for Generalized Backus-Naur Form, which is a notation technique to specify the syntax of programming languages or other formal grammars. + """ + additional_rules = [] + + # Define the integer part rule + integer_part_rule = "integer-part" + (f"-max{max_digit}" if max_digit is not None else "") + ( + f"-min{min_digit}" if min_digit is not None else "") + + # Define the fractional part rule based on precision constraints + fractional_part_rule = "fractional-part" + fractional_rule_part = '' + if max_precision is not None or min_precision is not None: + fractional_part_rule += (f"-max{max_precision}" if max_precision is not None else "") + ( + f"-min{min_precision}" if min_precision is not None else "") + # Minimum number of digits + fractional_rule_part = '[0-9]' * (min_precision if min_precision is not None else 1) + # Optional additional digits + fractional_rule_part += ''.join([' [0-9]?'] * ( + (max_precision - (min_precision if min_precision is not None else 1)) if max_precision is not None else 0)) + additional_rules.append(f'{fractional_part_rule} ::= {fractional_rule_part}') + + # Define the float rule + float_rule = f"float-{max_digit if max_digit is not None else 'X'}-{min_digit if min_digit is not None else 'X'}-{max_precision if max_precision is not None else 'X'}-{min_precision if min_precision is not None else 'X'}" + additional_rules.append(f'{float_rule} ::= {integer_part_rule} "." {fractional_part_rule}') + + # Generating the integer part rule definition, if necessary + if max_digit is not None or min_digit is not None: + integer_rule_part = '[0-9]' + if min_digit is not None and min_digit > 1: + integer_rule_part += ' [0-9]' * (min_digit - 1) + if max_digit is not None: + integer_rule_part += ''.join([' [0-9]?'] * (max_digit - (min_digit if min_digit is not None else 1))) + additional_rules.append(f'{integer_part_rule} ::= {integer_rule_part.strip()}') + + return float_rule, additional_rules + + +def generate_gbnf_rule_for_type(model_name, field_name, + field_type, is_optional, processed_models, created_rules, + field_info=None) -> \ + Tuple[str, list]: + """ + Generate GBNF rule for a given field type. + + :param model_name: Name of the model. + + :param field_name: Name of the field. + :param field_type: Type of the field. + :param is_optional: Whether the field is optional. + :param processed_models: List of processed models. + :param created_rules: List of created rules. + :param field_info: Additional information about the field (optional). + + :return: Tuple containing the GBNF type and a list of additional rules. + :rtype: Tuple[str, list] + """ + rules = [] + + field_name = format_model_and_field_name(field_name) + gbnf_type = map_pydantic_type_to_gbnf(field_type) + + if isclass(field_type) and issubclass(field_type, BaseModel): + nested_model_name = format_model_and_field_name(field_type.__name__) + nested_model_rules = generate_gbnf_grammar(field_type, processed_models, created_rules) + rules.extend(nested_model_rules) + gbnf_type, rules = nested_model_name, rules + elif isclass(field_type) and issubclass(field_type, Enum): + enum_values = [f'\"\\\"{e.value}\\\"\"' for e in field_type] # Adding escaped quotes + enum_rule = f"{model_name}-{field_name} ::= {' | '.join(enum_values)}" + rules.append(enum_rule) + gbnf_type, rules = model_name + "-" + field_name, rules + elif get_origin(field_type) == list or field_type == list: # Array + element_type = get_args(field_type)[0] + element_rule_name, additional_rules = generate_gbnf_rule_for_type(model_name, + f"{field_name}-element", + element_type, is_optional, processed_models, + created_rules) + rules.extend(additional_rules) + array_rule = f"""{model_name}-{field_name} ::= "[" ws {element_rule_name} ("," ws {element_rule_name})* "]" """ + rules.append(array_rule) + gbnf_type, rules = model_name + "-" + field_name, rules + + elif get_origin(field_type) == set or field_type == set: # Array + element_type = get_args(field_type)[0] + element_rule_name, additional_rules = generate_gbnf_rule_for_type(model_name, + f"{field_name}-element", + element_type, is_optional, processed_models, + created_rules) + rules.extend(additional_rules) + array_rule = f"""{model_name}-{field_name} ::= "[" ws {element_rule_name} ("," ws {element_rule_name})* "]" """ + rules.append(array_rule) + gbnf_type, rules = model_name + "-" + field_name, rules + + elif gbnf_type.startswith("custom-class-"): + nested_model_rules, field_types = get_members_structure(field_type, gbnf_type) + rules.append(nested_model_rules) + elif gbnf_type.startswith("custom-dict-"): + key_type, value_type = get_args(field_type) + + additional_key_type, additional_key_rules = generate_gbnf_rule_for_type(model_name, + f"{field_name}-key-type", + key_type, is_optional, processed_models, + created_rules) + additional_value_type, additional_value_rules = generate_gbnf_rule_for_type(model_name, + f"{field_name}-value-type", + value_type, is_optional, + processed_models, created_rules) + gbnf_type = fr'{gbnf_type} ::= "{{" ( {additional_key_type} ":" {additional_value_type} ("," {additional_key_type} ":" {additional_value_type})* )? "}}" ' + + rules.extend(additional_key_rules) + rules.extend(additional_value_rules) + elif gbnf_type.startswith("union-"): + union_types = get_args(field_type) + union_rules = [] + + for union_type in union_types: + if isinstance(union_type, _GenericAlias): + union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(model_name, + field_name, union_type, + False, + processed_models, created_rules) + union_rules.append(union_gbnf_type) + rules.extend(union_rules_list) + + + elif not issubclass(union_type, NoneType): + union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(model_name, + field_name, union_type, + False, + processed_models, created_rules) + union_rules.append(union_gbnf_type) + rules.extend(union_rules_list) + + # Defining the union grammar rule separately + if len(union_rules) == 1: + union_grammar_rule = f"{model_name}-{field_name}-optional ::= {' | '.join(union_rules)} | null" + else: + union_grammar_rule = f"{model_name}-{field_name}-union ::= {' | '.join(union_rules)}" + rules.append(union_grammar_rule) + if len(union_rules) == 1: + gbnf_type = f"{model_name}-{field_name}-optional" + else: + gbnf_type = f"{model_name}-{field_name}-union" + elif isclass(field_type) and issubclass(field_type, str): + if field_info and hasattr(field_info, 'json_schema_extra') and field_info.json_schema_extra is not None: + + triple_quoted_string = field_info.json_schema_extra.get('triple_quoted_string', False) + markdown_string = field_info.json_schema_extra.get('markdown_string', False) + + gbnf_type = PydanticDataType.TRIPLE_QUOTED_STRING.value if triple_quoted_string else PydanticDataType.STRING.value + gbnf_type = PydanticDataType.MARKDOWN_STRING.value if markdown_string else gbnf_type + + elif field_info and hasattr(field_info, 'pattern'): + # Convert regex pattern to grammar rule + regex_pattern = field_info.regex.pattern + gbnf_type = f"pattern-{field_name} ::= {regex_to_gbnf(regex_pattern)}" + else: + gbnf_type = PydanticDataType.STRING.value + + elif isclass(field_type) and issubclass(field_type, float) and field_info and hasattr(field_info, + 'json_schema_extra') and field_info.json_schema_extra is not None: + # Retrieve precision attributes for floats + max_precision = field_info.json_schema_extra.get('max_precision') if field_info and hasattr(field_info, + 'json_schema_extra') else None + min_precision = field_info.json_schema_extra.get('min_precision') if field_info and hasattr(field_info, + 'json_schema_extra') else None + max_digits = field_info.json_schema_extra.get('max_digit') if field_info and hasattr(field_info, + 'json_schema_extra') else None + min_digits = field_info.json_schema_extra.get('min_digit') if field_info and hasattr(field_info, + 'json_schema_extra') else None + + # Generate GBNF rule for float with given attributes + gbnf_type, rules = generate_gbnf_float_rules(max_digit=max_digits, min_digit=min_digits, + max_precision=max_precision, + min_precision=min_precision) + + elif isclass(field_type) and issubclass(field_type, int) and field_info and hasattr(field_info, + 'json_schema_extra') and field_info.json_schema_extra is not None: + # Retrieve digit attributes for integers + max_digits = field_info.json_schema_extra.get('max_digit') if field_info and hasattr(field_info, + 'json_schema_extra') else None + min_digits = field_info.json_schema_extra.get('min_digit') if field_info and hasattr(field_info, + 'json_schema_extra') else None + + # Generate GBNF rule for integer with given attributes + gbnf_type, rules = generate_gbnf_integer_rules(max_digit=max_digits, min_digit=min_digits) + else: + gbnf_type, rules = gbnf_type, [] + + if gbnf_type not in created_rules: + return gbnf_type, rules + else: + if gbnf_type in created_rules: + return gbnf_type, rules + + +def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created_rules: dict) -> (list, bool, bool): + """ + + Generate GBnF Grammar + + Generates a GBnF grammar for a given model. + + :param model: A Pydantic model class to generate the grammar for. Must be a subclass of BaseModel. + :param processed_models: A set of already processed models to prevent infinite recursion. + :param created_rules: A dict containing already created rules to prevent duplicates. + :return: A list of GBnF grammar rules in string format. And two booleans indicating if an extra markdown or triple quoted string is in the grammar. + Example Usage: + ``` + model = MyModel + processed_models = set() + created_rules = dict() + + gbnf_grammar = generate_gbnf_grammar(model, processed_models, created_rules) + ``` + """ + if model in processed_models: + return [] + + processed_models.add(model) + model_name = format_model_and_field_name(model.__name__) + + if not issubclass(model, BaseModel): + # For non-Pydantic classes, generate model_fields from __annotations__ or __init__ + if hasattr(model, '__annotations__') and model.__annotations__: + model_fields = {name: (typ, ...) for name, typ in model.__annotations__.items()} + else: + init_signature = inspect.signature(model.__init__) + parameters = init_signature.parameters + model_fields = {name: (param.annotation, param.default) for name, param in parameters.items() + if name != 'self'} + else: + # For Pydantic models, use model_fields and check for ellipsis (required fields) + model_fields = model.__annotations__ + + model_rule_parts = [] + nested_rules = [] + has_markdown_code_block = False + has_triple_quoted_string = False + look_for_markdown_code_block = False + look_for_triple_quoted_string = False + for field_name, field_info in model_fields.items(): + if not issubclass(model, BaseModel): + field_type, default_value = field_info + # Check if the field is optional (not required) + is_optional = (default_value is not inspect.Parameter.empty) and (default_value is not Ellipsis) + else: + field_type = field_info + field_info = model.model_fields[field_name] + is_optional = field_info.is_required is False and get_origin(field_type) is Optional + rule_name, additional_rules = generate_gbnf_rule_for_type(model_name, + format_model_and_field_name(field_name), + field_type, is_optional, + processed_models, created_rules, field_info) + look_for_markdown_code_block = True if rule_name == "markdown_string" else False + look_for_triple_quoted_string = True if rule_name == "triple_quoted_string" else False + if not look_for_markdown_code_block and not look_for_triple_quoted_string: + if rule_name not in created_rules: + created_rules[rule_name] = additional_rules + model_rule_parts.append(f' ws \"\\\"{field_name}\\\"\" ": " {rule_name}') # Adding escaped quotes + nested_rules.extend(additional_rules) + else: + has_triple_quoted_string = look_for_markdown_code_block + has_markdown_code_block = look_for_triple_quoted_string + + fields_joined = r' "," "\n" '.join(model_rule_parts) + model_rule = fr'{model_name} ::= "{{" "\n" {fields_joined} "\n" ws "}}"' + + if look_for_markdown_code_block or look_for_triple_quoted_string: + model_rule += ' ws "}"' + + if has_triple_quoted_string: + model_rule += '"\\n" triple-quoted-string' + if has_markdown_code_block: + model_rule += '"\\n" markdown-code-block' + all_rules = [model_rule] + nested_rules + + return all_rules, has_markdown_code_block, has_triple_quoted_string + + +def generate_gbnf_grammar_from_pydantic_models(models: List[Type[BaseModel]], outer_object_name: str = None, + outer_object_content: str = None, list_of_outputs: bool = False) -> str: + """ + Generate GBNF Grammar from Pydantic Models. + + This method takes a list of Pydantic models and uses them to generate a GBNF grammar string. The generated grammar string can be used for parsing and validating data using the generated + * grammar. + + Parameters: + models (List[Type[BaseModel]]): A list of Pydantic models to generate the grammar from. + outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. + outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. + list_of_outputs (str, optional): Allows a list of output objects + Returns: + str: The generated GBNF grammar string. + + Examples: + models = [UserModel, PostModel] + grammar = generate_gbnf_grammar_from_pydantic(models) + print(grammar) + # Output: + # root ::= UserModel | PostModel + # ... + """ + processed_models = set() + all_rules = [] + created_rules = {} + if outer_object_name is None: + + for model in models: + model_rules, _, _ = generate_gbnf_grammar(model, + processed_models, created_rules) + all_rules.extend(model_rules) + + if list_of_outputs: + root_rule = r'root ::= ws "[" grammar-models ("," grammar-models)* "]"' + "\n" + else: + root_rule = r'root ::= ws grammar-models' + "\n" + root_rule += "grammar-models ::= " + " | ".join( + [format_model_and_field_name(model.__name__) for model in models]) + all_rules.insert(0, root_rule) + return "\n".join(all_rules) + elif outer_object_name is not None: + if list_of_outputs: + root_rule = fr'root ::= ws "[" {format_model_and_field_name(outer_object_name)} ("," {format_model_and_field_name(outer_object_name)})* "]"' + "\n" + else: + root_rule = f"root ::= {format_model_and_field_name(outer_object_name)}\n" + + model_rule = fr'{format_model_and_field_name(outer_object_name)} ::= ws "{{" ws "\"{outer_object_name}\"" ": " grammar-models' + + fields_joined = " | ".join( + [fr'{format_model_and_field_name(model.__name__)}-grammar-model' for model in models]) + + grammar_model_rules = f'\ngrammar-models ::= {fields_joined}' + mod_rules = [] + for model in models: + mod_rule = fr'{format_model_and_field_name(model.__name__)}-grammar-model ::= ws' + mod_rule += fr'"\"{format_model_and_field_name(model.__name__)}\"" "," ws "\"{outer_object_content}\"" ws ":" ws {format_model_and_field_name(model.__name__)}' + '\n' + mod_rules.append(mod_rule) + grammar_model_rules += "\n" + "\n".join(mod_rules) + look_for_markdown_code_block = False + look_for_triple_quoted_string = False + for model in models: + model_rules, markdown_block, triple_quoted_string = generate_gbnf_grammar(model, + processed_models, created_rules) + all_rules.extend(model_rules) + if markdown_block: + look_for_markdown_code_block = True + + if triple_quoted_string: + look_for_triple_quoted_string = True + + if not look_for_markdown_code_block and not look_for_triple_quoted_string: + model_rule += ' ws "}"' + all_rules.insert(0, root_rule + model_rule + grammar_model_rules) + return "\n".join(all_rules) + + +def get_primitive_grammar(grammar): + """ + Returns the needed GBNF primitive grammar for a given GBNF grammar string. + + Args: + grammar (str): The string containing the GBNF grammar. + + Returns: + str: GBNF primitive grammar string. + """ + type_list = [] + if "string-list" in grammar: + type_list.append(str) + if "boolean-list" in grammar: + type_list.append(bool) + if "integer-list" in grammar: + type_list.append(int) + if "float-list" in grammar: + type_list.append(float) + additional_grammar = [generate_list_rule(t) for t in type_list] + primitive_grammar = r""" +boolean ::= "true" | "false" +null ::= "null" +string ::= "\"" ( + [^"\\] | + "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) + )* "\"" ws +ws ::= ([ \t\n] ws)? +float ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws + +integer ::= [0-9]+""" + + any_block = "" + if "custom-class-any" in grammar: + any_block = ''' +value ::= object | array | string | number | boolean | null + +object ::= + "{" ws ( + string ":" ws value + ("," ws string ":" ws value)* + )? "}" ws + +array ::= + "[" ws ( + value + ("," ws value)* + )? "]" ws + +number ::= integer | float''' + + markdown_code_block_grammar = "" + if "markdown-code-block" in grammar: + markdown_code_block_grammar = r''' +markdown-code-block ::= opening-triple-ticks markdown-code-block-content closing-triple-ticks +markdown-code-block-content ::= ( [^`] | "`" [^`] | "`" "`" [^`] )* +opening-triple-ticks ::= "```" "python" "\n" | "```" "c" "\n" | "```" "cpp" "\n" | "```" "txt" "\n" | "```" "text" "\n" | "```" "json" "\n" | "```" "javascript" "\n" | "```" "css" "\n" | "```" "html" "\n" | "```" "markdown" "\n" +closing-triple-ticks ::= "```" "\n"''' + + if "triple-quoted-string" in grammar: + markdown_code_block_grammar = r""" +triple-quoted-string ::= triple-quotes triple-quoted-string-content triple-quotes +triple-quoted-string-content ::= ( [^'] | "'" [^'] | "'" "'" [^'] )* +triple-quotes ::= "'''" """ + return "\n" + '\n'.join(additional_grammar) + any_block + primitive_grammar + markdown_code_block_grammar + + +def generate_field_markdown(field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1) -> str: + indent = ' ' * depth + field_markdown = f"{indent}- **{field_name}** (`{field_type.__name__}`): " + + # Extracting field description from Pydantic Field using __model_fields__ + field_info = model.model_fields.get(field_name) + field_description = field_info.description if field_info and field_info.description else "No description available." + + field_markdown += field_description + '\n' + + # Handling nested BaseModel fields + if isclass(field_type) and issubclass(field_type, BaseModel): + field_markdown += f"{indent} - Details:\n" + for name, type_ in field_type.__annotations__.items(): + field_markdown += generate_field_markdown(name, type_, field_type, depth + 2) + + return field_markdown + + +def generate_markdown_report(pydantic_models: List[Type[BaseModel]]) -> str: + markdown = "" + for model in pydantic_models: + markdown += f"### {format_model_and_field_name(model.__name__)}\n" + + # Check if the model's docstring is different from BaseModel's docstring + class_doc = getdoc(model) + base_class_doc = getdoc(BaseModel) + class_description = class_doc if class_doc and class_doc != base_class_doc else "No specific description available." + + markdown += f"{class_description}\n\n" + markdown += "#### Fields\n" + + if isclass(model) and issubclass(model, BaseModel): + for name, field_type in model.__annotations__.items(): + markdown += generate_field_markdown(format_model_and_field_name(name), field_type, model) + markdown += "\n" + + return markdown + + +def format_json_example(example: dict, depth: int) -> str: + """ + Format a JSON example into a readable string with indentation. + + Args: + example (dict): JSON example to be formatted. + depth (int): Indentation depth. + + Returns: + str: Formatted JSON example string. + """ + indent = ' ' * depth + formatted_example = '{\n' + for key, value in example.items(): + value_text = f"'{value}'" if isinstance(value, str) else value + formatted_example += f"{indent}{key}: {value_text},\n" + formatted_example = formatted_example.rstrip(',\n') + '\n' + indent + '}' + return formatted_example + + +def generate_text_documentation(pydantic_models: List[Type[BaseModel]], model_prefix="Model", + fields_prefix="Fields", documentation_with_field_description=True) -> str: + """ + Generate text documentation for a list of Pydantic models. + + Args: + pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes. + model_prefix (str): Prefix for the model section. + fields_prefix (str): Prefix for the fields section. + documentation_with_field_description (bool): Include field descriptions in the documentation. + + Returns: + str: Generated text documentation. + """ + documentation = "" + pyd_models = [(model, True) for model in pydantic_models] + for model, add_prefix in pyd_models: + if add_prefix: + documentation += f"{model_prefix}: {format_model_and_field_name(model.__name__)}\n" + else: + documentation += f"Model: {format_model_and_field_name(model.__name__)}\n" + + # Handling multi-line model description with proper indentation + + class_doc = getdoc(model) + base_class_doc = getdoc(BaseModel) + class_description = class_doc if class_doc and class_doc != base_class_doc else "" + if class_description != "": + documentation += " Description: " + documentation += "\n" + format_multiline_description(class_description, 2) + "\n" + + if add_prefix: + # Indenting the fields section + documentation += f" {fields_prefix}:\n" + else: + documentation += f" Fields:\n" + if isclass(model) and issubclass(model, BaseModel): + for name, field_type in model.__annotations__.items(): + # if name == "markdown_code_block": + # continue + if get_origin(field_type) == list: + element_type = get_args(field_type)[0] + if isclass(element_type) and issubclass(element_type, BaseModel): + pyd_models.append((element_type, False)) + if get_origin(field_type) == Union: + element_types = get_args(field_type) + for element_type in element_types: + if isclass(element_type) and issubclass(element_type, BaseModel): + pyd_models.append((element_type, False)) + documentation += generate_field_text(name, field_type, model, + documentation_with_field_description=documentation_with_field_description) + documentation += "\n" + + if hasattr(model, 'Config') and hasattr(model.Config, + 'json_schema_extra') and 'example' in model.Config.json_schema_extra: + documentation += f" Expected Example Output for {format_model_and_field_name(model.__name__)}:\n" + json_example = json.dumps(model.Config.json_schema_extra['example']) + documentation += format_multiline_description(json_example, 2) + "\n" + + return documentation + + +def generate_field_text(field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1, + documentation_with_field_description=True) -> str: + """ + Generate text documentation for a Pydantic model field. + + Args: + field_name (str): Name of the field. + field_type (Type[Any]): Type of the field. + model (Type[BaseModel]): Pydantic model class. + depth (int): Indentation depth in the documentation. + documentation_with_field_description (bool): Include field descriptions in the documentation. + + Returns: + str: Generated text documentation for the field. + """ + indent = ' ' * depth + + field_info = model.model_fields.get(field_name) + field_description = field_info.description if field_info and field_info.description else "" + + if get_origin(field_type) == list: + element_type = get_args(field_type)[0] + field_text = f"{indent}{field_name} ({format_model_and_field_name(field_type.__name__)} of {format_model_and_field_name(element_type.__name__)})" + if field_description != "": + field_text += ":\n" + else: + field_text += "\n" + elif get_origin(field_type) == Union: + element_types = get_args(field_type) + types = [] + for element_type in element_types: + types.append(format_model_and_field_name(element_type.__name__)) + field_text = f"{indent}{field_name} ({' or '.join(types)})" + if field_description != "": + field_text += ":\n" + else: + field_text += "\n" + else: + field_text = f"{indent}{field_name} ({format_model_and_field_name(field_type.__name__)})" + if field_description != "": + field_text += ":\n" + else: + field_text += "\n" + + if not documentation_with_field_description: + return field_text + + if field_description != "": + field_text += f"{indent} Description: " + field_description + "\n" + + # Check for and include field-specific examples if available + if hasattr(model, 'Config') and hasattr(model.Config, + 'json_schema_extra') and 'example' in model.Config.json_schema_extra: + field_example = model.Config.json_schema_extra['example'].get(field_name) + if field_example is not None: + example_text = f"'{field_example}'" if isinstance(field_example, str) else field_example + field_text += f"{indent} Example: {example_text}\n" + + if isclass(field_type) and issubclass(field_type, BaseModel): + field_text += f"{indent} Details:\n" + for name, type_ in field_type.__annotations__.items(): + field_text += generate_field_text(name, type_, field_type, depth + 2) + + return field_text + + +def format_multiline_description(description: str, indent_level: int) -> str: + """ + Format a multiline description with proper indentation. + + Args: + description (str): Multiline description. + indent_level (int): Indentation level. + + Returns: + str: Formatted multiline description. + """ + indent = ' ' * indent_level + return indent + description.replace('\n', '\n' + indent) + + +def save_gbnf_grammar_and_documentation(grammar, documentation, grammar_file_path="./grammar.gbnf", + documentation_file_path="./grammar_documentation.md"): + """ + Save GBNF grammar and documentation to specified files. + + Args: + grammar (str): GBNF grammar string. + documentation (str): Documentation string. + grammar_file_path (str): File path to save the GBNF grammar. + documentation_file_path (str): File path to save the documentation. + + Returns: + None + """ + try: + with open(grammar_file_path, 'w') as file: + file.write(grammar + get_primitive_grammar(grammar)) + print(f"Grammar successfully saved to {grammar_file_path}") + except IOError as e: + print(f"An error occurred while saving the grammar file: {e}") + + try: + with open(documentation_file_path, 'w') as file: + file.write(documentation) + print(f"Documentation successfully saved to {documentation_file_path}") + except IOError as e: + print(f"An error occurred while saving the documentation file: {e}") + + +def remove_empty_lines(string): + """ + Remove empty lines from a string. + + Args: + string (str): Input string. + + Returns: + str: String with empty lines removed. + """ + lines = string.splitlines() + non_empty_lines = [line for line in lines if line.strip() != ""] + string_no_empty_lines = "\n".join(non_empty_lines) + return string_no_empty_lines + + +def generate_and_save_gbnf_grammar_and_documentation(pydantic_model_list, + grammar_file_path="./generated_grammar.gbnf", + documentation_file_path="./generated_grammar_documentation.md", + outer_object_name: str = None, + outer_object_content: str = None, + model_prefix: str = "Output Model", + fields_prefix: str = "Output Fields", + list_of_outputs: bool = False, + documentation_with_field_description=True): + """ + Generate GBNF grammar and documentation, and save them to specified files. + + Args: + pydantic_model_list: List of Pydantic model classes. + grammar_file_path (str): File path to save the generated GBNF grammar. + documentation_file_path (str): File path to save the generated documentation. + outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. + outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. + model_prefix (str): Prefix for the model section in the documentation. + fields_prefix (str): Prefix for the fields section in the documentation. + list_of_outputs (bool): Whether the output is a list of items. + documentation_with_field_description (bool): Include field descriptions in the documentation. + + Returns: + None + """ + documentation = generate_text_documentation(pydantic_model_list, model_prefix, fields_prefix, + documentation_with_field_description=documentation_with_field_description) + grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, + outer_object_content, list_of_outputs) + grammar = remove_empty_lines(grammar) + save_gbnf_grammar_and_documentation(grammar, documentation, grammar_file_path, documentation_file_path) + + +def generate_gbnf_grammar_and_documentation(pydantic_model_list, outer_object_name: str = None, + outer_object_content: str = None, + model_prefix: str = "Output Model", + fields_prefix: str = "Output Fields", list_of_outputs: bool = False, + documentation_with_field_description=True): + """ + Generate GBNF grammar and documentation for a list of Pydantic models. + + Args: + pydantic_model_list: List of Pydantic model classes. + outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. + outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. + model_prefix (str): Prefix for the model section in the documentation. + fields_prefix (str): Prefix for the fields section in the documentation. + list_of_outputs (bool): Whether the output is a list of items. + documentation_with_field_description (bool): Include field descriptions in the documentation. + + Returns: + tuple: GBNF grammar string, documentation string. + """ + documentation = generate_text_documentation(copy(pydantic_model_list), model_prefix, fields_prefix, + documentation_with_field_description=documentation_with_field_description) + grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, + outer_object_content, list_of_outputs) + grammar = remove_empty_lines(grammar + get_primitive_grammar(grammar)) + return grammar, documentation + + +def generate_gbnf_grammar_and_documentation_from_dictionaries(dictionaries: List[dict], + outer_object_name: str = None, + outer_object_content: str = None, + model_prefix: str = "Output Model", + fields_prefix: str = "Output Fields", + list_of_outputs: bool = False, + documentation_with_field_description=True): + """ + Generate GBNF grammar and documentation from a list of dictionaries. + + Args: + dictionaries (List[dict]): List of dictionaries representing Pydantic models. + outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. + outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. + model_prefix (str): Prefix for the model section in the documentation. + fields_prefix (str): Prefix for the fields section in the documentation. + list_of_outputs (bool): Whether the output is a list of items. + documentation_with_field_description (bool): Include field descriptions in the documentation. + + Returns: + tuple: GBNF grammar string, documentation string. + """ + pydantic_model_list = create_dynamic_models_from_dictionaries(dictionaries) + documentation = generate_text_documentation(copy(pydantic_model_list), model_prefix, fields_prefix, + documentation_with_field_description=documentation_with_field_description) + grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, + outer_object_content, list_of_outputs) + grammar = remove_empty_lines(grammar + get_primitive_grammar(grammar)) + return grammar, documentation + + +def create_dynamic_model_from_function(func: Callable): + """ + Creates a dynamic Pydantic model from a given function's type hints and adds the function as a 'run' method. + + Args: + func (Callable): A function with type hints from which to create the model. + + Returns: + A dynamic Pydantic model class with the provided function as a 'run' method. + """ + # Extracting type hints from the provided function + type_hints = get_type_hints(func) + type_hints.pop('return', None) + + # Handling default values and annotations + dynamic_fields = {} + defaults = getattr(func, '__defaults__', ()) or () + defaults_index = len(type_hints) - len(defaults) + + for index, (name, typ) in enumerate(type_hints.items()): + if index >= defaults_index: + default_value = defaults[index - defaults_index] + dynamic_fields[name] = (typ, default_value) + else: + dynamic_fields[name] = (typ, ...) + + # Creating the dynamic model + dynamicModel = create_model(f'{func.__name__}', **dynamic_fields) + + dynamicModel.__doc__ = getdoc(func) + + # Wrapping the original function to handle instance 'self' + def run_method_wrapper(self): + func_args = {name: getattr(self, name) for name in type_hints} + return func(**func_args) + + # Adding the wrapped function as a 'run' method + setattr(dynamicModel, 'run', run_method_wrapper) + + return dynamicModel + + +def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable): + """ + Add a 'run' method to a dynamic Pydantic model, using the provided function. + + Args: + - model (Type[BaseModel]): Dynamic Pydantic model class. + - func (Callable): Function to be added as a 'run' method to the model. + + Returns: + - Type[BaseModel]: Pydantic model class with the added 'run' method. + """ + + def run_method_wrapper(self): + func_args = {name: getattr(self, name) for name in model.model_fields} + return func(**func_args) + + # Adding the wrapped function as a 'run' method + setattr(model, 'run', run_method_wrapper) + + return model + + +def create_dynamic_models_from_dictionaries(dictionaries: List[dict]): + """ + Create a list of dynamic Pydantic model classes from a list of dictionaries. + + Args: + - dictionaries (List[dict]): List of dictionaries representing model structures. + + Returns: + - List[Type[BaseModel]]: List of generated dynamic Pydantic model classes. + """ + dynamic_models = [] + for func in dictionaries: + model_name = format_model_and_field_name(func.get("name", "")) + dyn_model = convert_dictionary_to_to_pydantic_model(func, model_name) + dynamic_models.append(dyn_model) + return dynamic_models + + +def map_grammar_names_to_pydantic_model_class(pydantic_model_list): + output = {} + for model in pydantic_model_list: + output[format_model_and_field_name(model.__name__)] = model + + return output + + +from enum import Enum + + +def json_schema_to_python_types(schema): + type_map = { + 'any': Any, + 'string': str, + 'number': float, + 'integer': int, + 'boolean': bool, + 'array': list, + } + return type_map[schema] + + +def list_to_enum(enum_name, values): + return Enum(enum_name, {value: value for value in values}) + + +def convert_dictionary_to_to_pydantic_model(dictionary: dict, model_name: str = 'CustomModel') -> Type[BaseModel]: + """ + Convert a dictionary to a Pydantic model class. + + Args: + - dictionary (dict): Dictionary representing the model structure. + - model_name (str): Name of the generated Pydantic model. + + Returns: + - Type[BaseModel]: Generated Pydantic model class. + """ + fields = {} + + if "properties" in dictionary: + for field_name, field_data in dictionary.get("properties", {}).items(): + if field_data == 'object': + submodel = convert_dictionary_to_to_pydantic_model(dictionary, f'{model_name}_{field_name}') + fields[field_name] = (submodel, ...) + else: + field_type = field_data.get('type', 'str') + + if field_data.get("enum", []): + fields[field_name] = (list_to_enum(field_name, field_data.get("enum", [])), ...) + if field_type == "array": + items = field_data.get("items", {}) + if items != {}: + array = {"properties": items} + array_type = convert_dictionary_to_to_pydantic_model(array, f'{model_name}_{field_name}_items') + fields[field_name] = (List[array_type], ...) + else: + fields[field_name] = (list, ...) + elif field_type == 'object': + submodel = convert_dictionary_to_to_pydantic_model(field_data, f'{model_name}_{field_name}') + fields[field_name] = (submodel, ...) + else: + field_type = json_schema_to_python_types(field_type) + fields[field_name] = (field_type, ...) + if "function" in dictionary: + + for field_name, field_data in dictionary.get("function", {}).items(): + if field_name == "name": + model_name = field_data + elif field_name == "description": + fields["__doc__"] = field_data + elif field_name == "parameters": + return convert_dictionary_to_to_pydantic_model(field_data, f'{model_name}') + if "parameters" in dictionary: + field_data = {"function": dictionary} + return convert_dictionary_to_to_pydantic_model(field_data, f'{model_name}') + + custom_model = create_model(model_name, **fields) + return custom_model + + +