diff --git a/extensions/multimodal/README.md b/extensions/multimodal/README.md
index 87183587..b176eca3 100644
--- a/extensions/multimodal/README.md
+++ b/extensions/multimodal/README.md
@@ -67,8 +67,56 @@ This extension uses the following parameters (from `settings.json`):
 
 ## Usage through API
 
+### Chat completions endpoint
+
+#### With an image URL
+
+```shell
+curl http://127.0.0.1:5000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "messages": [
+      {
+        "role": "user",
+        "image_url": "https://avatars.githubusercontent.com/u/112222186?v=4"
+      },
+      {
+        "role": "user",
+        "content": "What is unusual about this image?"
+      }
+    ]
+  }'
+```
+
+#### With a Base64 image
+
+```python
+import base64
+import json
+import requests
+
+with open('image.jpg', 'rb') as img:
+    img_bytes = img.read()
+img_base64 = base64.b64encode(img_bytes).decode('utf-8')
+data = { "messages": [
+    {
+        "role": "user",
+        "image_url": f"data:image/jpeg;base64,{img_base64}"
+    },
+    {
+        "role": "user",
+        "content": "What is unusual about this image?"
+    }
+  ]
+}
+response = requests.post('http://127.0.0.1:5000/v1/chat/completions', json=data)
+print(response.text)
+```
+
 You can run the multimodal inference through the API by adding images to the prompt. Images are embedded like so: `f'<img src="data:image/jpeg;base64,{img_str}">'`, where `img_str` is base64-encoded JPEG data. Note that you will need to launch `server.py` with the arguments `--api --extensions multimodal`.
 
+### Completions endpoint
+
 Python example:
 
 ```Python
diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 70cdfe48..26017f37 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -1,10 +1,15 @@
+import base64
 import copy
+import re
 import time
 from collections import deque
+from io import BytesIO
 
+import requests
 import tiktoken
 import torch
 import torch.nn.functional as F
+from PIL import Image
 from transformers import LogitsProcessor, LogitsProcessorList
 
 from extensions.openai.errors import InvalidRequestError
@@ -140,7 +145,25 @@ def convert_history(history):
     system_message = ""
 
     for entry in history:
-        content = entry["content"]
+        if "image_url" in entry:
+            image_url = entry['image_url']
+            if "base64" in image_url:
+                image_url = re.sub('^data:image/.+;base64,', '', image_url)
+                img = Image.open(BytesIO(base64.b64decode(image_url)))
+            else:
+                try:
+                    my_res = requests.get(image_url)
+                    img = Image.open(BytesIO(my_res.content))
+                except Exception:
+                    raise Exception('Image cannot be loaded from the URL!')
+
+            buffered = BytesIO()
+            img.save(buffered, format="JPEG")
+            img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
+            content = f'<img src="data:image/jpeg;base64,{img_str}">'
+        else:
+            content = entry["content"]
+
         role = entry["role"]
 
         if role == "user":
@@ -182,7 +205,8 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False) -> dict:
             raise InvalidRequestError(message="messages: missing role", param='messages')
         elif m['role'] == 'function':
             raise InvalidRequestError(message="role: function is not supported.", param='messages')
-        if 'content' not in m:
+
+        if 'content' not in m and "image_url" not in m:
             raise InvalidRequestError(message="messages: missing content", param='messages')
 
     # Chat Completions
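
For reference, below is a minimal, self-contained sketch of the image-handling step this patch adds to `convert_history()`. The helper name `image_entry_to_embed` is hypothetical and not part of the extension's API; it only illustrates how an `image_url` message entry, whether a data URI or a plain URL, could be turned into the `<img src="data:image/jpeg;base64,...">` string that the multimodal extension expects in the prompt. It checks for the `data:image/` prefix rather than the patch's looser `"base64" in image_url` test.

```python
import base64
import re
from io import BytesIO

import requests
from PIL import Image


def image_entry_to_embed(entry: dict) -> str:
    """Turn a message entry carrying an 'image_url' into an <img> prompt embed (illustrative sketch)."""
    image_url = entry['image_url']
    if image_url.startswith('data:image/'):
        # Inline image: strip the data-URI prefix and decode the base64 payload.
        payload = re.sub('^data:image/.+;base64,', '', image_url)
        img = Image.open(BytesIO(base64.b64decode(payload)))
    else:
        # Remote image: fetch it over HTTP(S).
        response = requests.get(image_url, timeout=10)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))

    # Re-encode as JPEG so the prompt always carries base64 JPEG data, matching the
    # embed format described in the multimodal README. convert("RGB") avoids save
    # errors for images that have an alpha channel (e.g. RGBA PNGs).
    buffered = BytesIO()
    img.convert('RGB').save(buffered, format='JPEG')
    img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
    return f'<img src="data:image/jpeg;base64,{img_str}">'


if __name__ == '__main__':
    entry = {
        "role": "user",
        "image_url": "https://avatars.githubusercontent.com/u/112222186?v=4",
    }
    print(image_entry_to_embed(entry)[:80] + '...')
```

Re-encoding everything to JPEG keeps the embedded prompt format uniform regardless of what the client sent, at the cost of dropping transparency and re-compressing the image.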