Support for sending images into OpenAI chat API (#4827)

kabachuha 2023-12-23 04:45:53 +03:00 committed by GitHub
parent 8956f3ebe2
commit dbe438564e
2 changed files with 74 additions and 2 deletions


@@ -67,8 +67,56 @@ This extension uses the following parameters (from `settings.json`):
## Usage through API
### Chat completions endpoint
#### With an image URL
```shell
curl http://127.0.0.1:5000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"messages": [
{
"role": "user",
"image_url": "https://avatars.githubusercontent.com/u/112222186?v=4"
},
{
"role": "user",
"content": "What is unusual about this image?"
}
]
}'
```
#### With a Base64 image
```python
import base64

import requests

# Read the image and encode it as base64
with open('image.jpg', 'rb') as f:
    img_base64 = base64.b64encode(f.read()).decode('utf-8')

data = {
    "messages": [
        {
            "role": "user",
            "image_url": f"data:image/jpeg;base64,{img_base64}"
        },
        {
            "role": "user",
            "content": "What is unusual about this image?"
        }
    ]
}

response = requests.post('http://127.0.0.1:5000/v1/chat/completions', json=data)
print(response.text)
```
You can run multimodal inference through the API by embedding images directly in the prompt. Images are embedded like so: `f'<img src="data:image/jpeg;base64,{img_str}">'`, where `img_str` is base64-encoded JPEG data. Note that you will need to launch `server.py` with the arguments `--api --extensions multimodal`.
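For reference, a minimal sketch of building such an embedded-image prompt by hand (the file name and the question are illustrative):

```python
import base64

# Read a local JPEG and encode it as base64 ('image.jpg' is an assumed example file)
with open('image.jpg', 'rb') as f:
    img_str = base64.b64encode(f.read()).decode('utf-8')

# Embed the image using the tag format the multimodal extension expects
prompt = f'<img src="data:image/jpeg;base64,{img_str}">\nWhat is unusual about this image?'
```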
### Completions endpoint
Python example:
```Python


@@ -1,10 +1,15 @@
import base64
import copy
import re
import time
from collections import deque
from io import BytesIO

import requests
import tiktoken
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import LogitsProcessor, LogitsProcessorList

from extensions.openai.errors import InvalidRequestError
@@ -140,7 +145,25 @@ def convert_history(history):
    system_message = ""

    for entry in history:
        if "image_url" in entry:
            image_url = entry['image_url']
            if "base64" in image_url:
                # Data-URI images arrive base64-encoded: strip the header and decode the payload
                image_url = re.sub('^data:image/.+;base64,', '', image_url)
                img = Image.open(BytesIO(base64.b64decode(image_url)))
            else:
                # Otherwise treat the value as a plain URL and download the image
                try:
                    my_res = requests.get(image_url)
                    img = Image.open(BytesIO(my_res.content))
                except Exception:
                    raise InvalidRequestError(message="Image cannot be loaded from the URL!", param='messages')

            # Re-encode as JPEG and embed the image in the prompt as an <img> tag
            buffered = BytesIO()
            img.save(buffered, format="JPEG")
            img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
            content = f'<img src="data:image/jpeg;base64,{img_str}">'
        else:
            content = entry["content"]

        role = entry["role"]

        if role == "user":
@@ -182,7 +205,8 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False) -
raise InvalidRequestError(message="messages: missing role", param='messages') raise InvalidRequestError(message="messages: missing role", param='messages')
elif m['role'] == 'function': elif m['role'] == 'function':
raise InvalidRequestError(message="role: function is not supported.", param='messages') raise InvalidRequestError(message="role: function is not supported.", param='messages')
if 'content' not in m:
if 'content' not in m and "image_url" not in m:
raise InvalidRequestError(message="messages: missing content", param='messages') raise InvalidRequestError(message="messages: missing content", param='messages')
# Chat Completions # Chat Completions