mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-11 21:10:24 +01:00
3fd62a6b1c
* py : type-check all Python scripts with Pyright * server-tests : use trailing slash in openai base_url * server-tests : add more type annotations * server-tests : strip "chat" from base_url in oai_chat_completions * server-tests : model metadata is a dict * ci : disable pip cache in type-check workflow The cache is not shared between branches, and it's 250MB in size, so it would become quite a big part of the 10GB cache limit of the repo. * py : fix new type errors from master branch * tests : fix test-tokenizer-random.py Apparently, gcc applies optimisations even when pre-processing, which confuses pycparser. * ci : only show warnings and errors in python type-check The "information" level otherwise has entries from 'examples/pydantic_models_to_grammar.py', which could be confusing for someone trying to figure out what failed, considering that these messages can safely be ignored even though they look like errors.
83 lines
3.1 KiB
Python
83 lines
3.1 KiB
Python
# Usage:
|
|
#! ./llama-server -m some-model.gguf &
|
|
#! pip install pydantic
|
|
#! python json_schema_pydantic_example.py
|
|
|
|
from pydantic import BaseModel, Field, TypeAdapter
|
|
from annotated_types import MinLen
|
|
from typing import Annotated, List, Optional
|
|
import json, requests
|
|
|
|
if True:

    def create_completion(*, response_model=None, endpoint="http://localhost:8080/v1/chat/completions", messages, **kwargs):
        '''
        Creates a chat completion using an OpenAI-compatible endpoint w/ JSON schema support
        (llama.cpp server, llama-cpp-python, Anyscale / Together...)

        The response_model param takes a type (+ supports Pydantic) and behaves just as w/ Instructor (see below)

        Args:
            response_model: optional type, handed to pydantic's TypeAdapter. When set, its
                JSON schema is sent as the request's response_format, a system message
                describing the schema is prepended to the conversation, and the reply
                is validated against the type.
            endpoint: URL the chat completion request is POSTed to.
            messages: list of chat messages; the caller's list is not modified.
            **kwargs: extra fields copied verbatim into the request body (e.g. model).
                A response_format may be passed here only when response_model is not set.

        Returns:
            The value produced by validating the reply against response_model when it
            is set, otherwise the raw content of the first choice's message.

        Raises:
            TypeError: if both response_model and a response_format kwarg are given.
            Exception: carrying the server's message when the response has an "error" entry.
        '''
        # Fail early and clearly: the schema-derived response_format and a
        # caller-supplied one cannot both be honoured.
        if response_model and "response_format" in kwargs:
            raise TypeError(
                "create_completion() got both 'response_model' and 'response_format'; pass only one")

        # Start from the caller's extra fields so that a plain response_format
        # (without response_model) is forwarded instead of colliding with ours.
        payload = dict(kwargs)
        type_adapter = None

        if response_model:
            type_adapter = TypeAdapter(response_model)
            schema = type_adapter.json_schema()
            # Build a new list rather than mutating the caller's messages.
            messages = [{
                "role": "system",
                "content": f"You respond in JSON format with the following schema: {json.dumps(schema, indent=2)}"
            }] + messages
            # Only sent when there is a schema: no `"response_format": null` otherwise.
            payload["response_format"] = {"type": "json_object", "schema": schema}

        payload["messages"] = messages

        data = requests.post(endpoint, headers={"Content-Type": "application/json"},
                             json=payload).json()
        if 'error' in data:
            raise Exception(data['error']['message'])

        content = data["choices"][0]["message"]["content"]
        return type_adapter.validate_json(content) if type_adapter else content

else:

    # This alternative branch uses Instructor + OpenAI client lib.
    # Instructor supports streamed iterable responses, retry & more.
    # (see https://python.useinstructor.com/)
    #! pip install instructor openai
    import instructor
    import openai
    client = instructor.patch(
        openai.OpenAI(api_key="123", base_url="http://localhost:8080"),
        mode=instructor.Mode.JSON_SCHEMA)
    create_completion = client.chat.completions.create
|
|
|
|
|
|
if __name__ == '__main__':

    # Demo: ask the server for a nested, schema-constrained summary and print
    # the validated result.

    # Local import: ConfigDict is only needed by the example models below.
    from pydantic import ConfigDict

    # One question/answer item of a summary.
    class QAPair(BaseModel):
        # triggers additionalProperties: false in the JSON schema
        # (model_config replaces the nested `class Config`, deprecated in Pydantic v2)
        model_config = ConfigDict(extra='forbid')
        question: str
        concise_answer: str
        justification: str
        stars: Annotated[int, Field(ge=1, le=5)]

    # Recursive summary node: each section may carry nested sub-sections.
    class PyramidalSummary(BaseModel):
        # triggers additionalProperties: false in the JSON schema
        model_config = ConfigDict(extra='forbid')
        title: str
        summary: str
        question_answers: Annotated[List[QAPair], MinLen(2)]
        # Forward reference by name: the class is still being defined here.
        sub_sections: Optional[Annotated[List['PyramidalSummary'], MinLen(2)]]

    # Plain (non-f) string: the prompt has no placeholders to interpolate.
    prompt = (
        "\n"
        "You are a highly efficient corporate document summarizer.\n"
        "Create a pyramidal summary of an imaginary internal document about our company processes\n"
        "(starting high-level, going down to each sub sections).\n"
        "Keep questions short, and answers even shorter (trivia / quizz style).\n"
    )

    print("# Summary\n", create_completion(
        model="...",
        response_model=PyramidalSummary,
        messages=[{
            "role": "user",
            "content": prompt
        }]))
|