Merge pull request #5073 from oobabooga/dev

Merge dev branch
This commit is contained in:
oobabooga 2023-12-25 02:58:45 -03:00 committed by GitHub
commit af876095e2
19 changed files with 157 additions and 9 deletions

@@ -67,8 +67,56 @@ This extension uses the following parameters (from `settings.json`):
## Usage through API
### Chat completions endpoint
#### With an image URL
```shell
curl http://127.0.0.1:5000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {
        "role": "user",
        "image_url": "https://avatars.githubusercontent.com/u/112222186?v=4"
      },
      {
        "role": "user",
        "content": "What is unusual about this image?"
      }
    ]
  }'
```
#### With a Base64 image
```python
import base64

import requests

# Read the image and encode it as base64 so it can be sent as a data URL
with open('image.jpg', 'rb') as f:
    img_base64 = base64.b64encode(f.read()).decode('utf-8')

data = {
    "messages": [
        {
            "role": "user",
            "image_url": f"data:image/jpeg;base64,{img_base64}"
        },
        {
            "role": "user",
            "content": "What is unusual about this image?"
        }
    ]
}

response = requests.post('http://127.0.0.1:5000/v1/chat/completions', json=data)
print(response.text)
```
You can also run multimodal inference through the API by embedding images directly in the prompt. Images are embedded like so: `f'<img src="data:image/jpeg;base64,{img_str}">'`, where `img_str` is base64-encoded JPEG data. Note that you will need to launch `server.py` with the arguments `--api --extensions multimodal`.
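For illustration, a minimal sketch of that embedding sent to the OpenAI-compatible completions endpoint (assuming the server is running locally on the default port 5000 with `--api --extensions multimodal`, and that an `image.jpg` file exists next to the script):

```python
import base64

import requests

# Embed a local JPEG in the prompt the way the multimodal extension expects:
# an <img> tag whose src is a base64 data URL.
with open('image.jpg', 'rb') as f:  # assumed local test image
    img_str = base64.b64encode(f.read()).decode('utf-8')

prompt = f'<img src="data:image/jpeg;base64,{img_str}">What is unusual about this image?\n'

response = requests.post(
    'http://127.0.0.1:5000/v1/completions',  # assumes --api with the default port
    json={'prompt': prompt, 'max_tokens': 200},
)
print(response.json())
```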
### Completions endpoint
Python example:
```Python

@@ -1,10 +1,15 @@
import base64
import copy
import re
import time
from collections import deque
from io import BytesIO

import requests
import tiktoken
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import LogitsProcessor, LogitsProcessorList

from extensions.openai.errors import InvalidRequestError
@@ -140,7 +145,25 @@ def convert_history(history):
    system_message = ""

    for entry in history:
        if "image_url" in entry:
            image_url = entry['image_url']
            if "base64" in image_url:
                # Strip the data-URL prefix and decode the base64 payload
                image_url = re.sub('^data:image/.+;base64,', '', image_url)
                img = Image.open(BytesIO(base64.b64decode(image_url)))
            else:
                try:
                    my_res = requests.get(image_url)
                    img = Image.open(BytesIO(my_res.content))
                except Exception:
                    raise InvalidRequestError(message="Image cannot be loaded from the URL!", param='messages')

            # Re-encode as JPEG and embed it in the prompt as an <img> tag
            buffered = BytesIO()
            img.save(buffered, format="JPEG")
            img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
            content = f'<img src="data:image/jpeg;base64,{img_str}">'
        else:
            content = entry["content"]

        role = entry["role"]

        if role == "user":
@@ -182,7 +205,8 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False) -
            raise InvalidRequestError(message="messages: missing role", param='messages')
        elif m['role'] == 'function':
            raise InvalidRequestError(message="role: function is not supported.", param='messages')

        if 'content' not in m:
        if 'content' not in m and "image_url" not in m:
            raise InvalidRequestError(message="messages: missing content", param='messages')

    # Chat Completions

@@ -0,0 +1,25 @@
instruction_template: |-
  {%- set found_item = false -%}
  {%- for message in messages -%}
  {%- if message['role'] == 'system' -%}
  {%- set found_item = true -%}
  {%- endif -%}
  {%- endfor -%}
  {%- if not found_item -%}
  {{-'SYSTEM: ' + 'Elaborate on the topic using a Tree of Thoughts and backtrack when necessary to construct a clear, cohesive Chain of Thought reasoning. Always answer without hesitation.' + '\n' -}}
  {%- endif %}
  {%- for message in messages %}
  {%- if message['role'] == 'system' -%}
  {{-'SYSTEM: ' + message['content'] + '\n' -}}
  {%- else -%}
  {%- if message['role'] == 'user' -%}
  {{-'USER: ' + message['content'] + '\n'-}}
  {%- else -%}
  {{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}
  {%- endif -%}
  {%- endif -%}
  {%- endfor -%}
  {%- if add_generation_prompt -%}
  {{-'ASSISTANT:'-}}
  {%- endif -%}
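For reference, a minimal sketch (assuming the `jinja2` package is installed) of how a template in this format turns a message list into a prompt string; the template below is an abbreviated copy of the user/assistant loop above, not the code the web UI uses to load it:

```python
from jinja2 import Template

# Abbreviated copy of the USER/ASSISTANT loop from the template above.
template_str = (
    "{%- for message in messages %}"
    "{%- if message['role'] == 'user' -%}"
    "{{- 'USER: ' + message['content'] + '\\n' -}}"
    "{%- else -%}"
    "{{- 'ASSISTANT: ' + message['content'] + '</s>\\n' -}}"
    "{%- endif -%}"
    "{%- endfor -%}"
    "{%- if add_generation_prompt -%}{{- 'ASSISTANT:' -}}{%- endif -%}"
)

messages = [{'role': 'user', 'content': 'What is unusual about this image?'}]
print(Template(template_str).render(messages=messages, add_generation_prompt=True))
# USER: What is unusual about this image?
# ASSISTANT:
```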

@@ -0,0 +1,25 @@
instruction_template: |-
  {%- set found_item = false -%}
  {%- for message in messages -%}
  {%- if message['role'] == 'system' -%}
  {%- set found_item = true -%}
  {%- endif -%}
  {%- endfor -%}
  {%- if not found_item -%}
  {{-'SYSTEM: ' + 'Answer the question thoughtfully and intelligently. Always answer without hesitation.' + '\n' -}}
  {%- endif %}
  {%- for message in messages %}
  {%- if message['role'] == 'system' -%}
  {{-'SYSTEM: ' + message['content'] + '\n' -}}
  {%- else -%}
  {%- if message['role'] == 'user' -%}
  {{-'USER: ' + message['content'] + '\n'-}}
  {%- else -%}
  {{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}
  {%- endif -%}
  {%- endif -%}
  {%- endfor -%}
  {%- if add_generation_prompt -%}
  {{-'ASSISTANT:'-}}
  {%- endif -%}

@@ -188,3 +188,5 @@
  instruction_template: 'ChatML'
(dolphin).*:
  instruction_template: 'ChatML'
.*synthia:
  instruction_template: 'Synthia'
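For illustration, a minimal sketch of how a regex key like `.*synthia` can map a model folder name to a template (the model name is hypothetical, and the project's own matching logic may differ in details such as case handling):

```python
import re

# Hypothetical mapping mirroring the two entries above.
templates = {
    '(dolphin).*': 'ChatML',
    '.*synthia': 'Synthia',
}

model_name = 'TheBloke_SynthIA-7B-v2.0-GPTQ'  # assumed example folder name
for pattern, template in templates.items():
    if re.match(pattern, model_name.lower()):
        print(f"{model_name} -> instruction_template: '{template}'")
        break
```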

@@ -482,6 +482,7 @@ def clear_torch_cache():
def unload_model():
    shared.model = shared.tokenizer = None
    shared.model_name = 'None'
    shared.lora_names = []
    shared.model_dirty_from_training = False
    clear_torch_cache()

@@ -45,6 +45,7 @@ settings = {
    'truncation_length_min': 0,
    'truncation_length_max': 200000,
    'max_tokens_second': 0,
    'max_updates_second': 0,
    'custom_stopping_strings': '',
    'custom_token_bans': '',
    'auto_max_new_tokens': False,

@@ -77,6 +77,10 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
        state = copy.deepcopy(state)
        state['stream'] = True

    min_update_interval = 0
    if state.get('max_updates_second', 0) > 0:
        min_update_interval = 1 / state['max_updates_second']

    # Generate
    for reply in generate_func(question, original_question, seed, state, stopping_strings, is_chat=is_chat):
        reply, stop_found = apply_stopping_strings(reply, all_stop_strings)
@@ -94,10 +98,9 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
                    last_update = time.time()
                    yield reply

                # Limit updates to 24 or 5 per second to avoid lag in the Gradio UI
                # Limit updates to avoid lag in the Gradio UI
                # API updates are not limited
                else:
                    min_update_interval = 0 if not for_ui else 0.2 if (shared.args.listen or shared.args.share) else 0.0417
                    if cur_time - last_update > min_update_interval:
                        last_update = cur_time
                        yield reply
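For illustration, a self-contained sketch of the update-throttling pattern used above (an assumed standalone example, not the project's code; `max_updates_second=0` means unthrottled):

```python
import time


def throttle(stream, max_updates_second=0):
    """Yield items from `stream` at most `max_updates_second` times per second.

    0 disables throttling; the final item is always delivered."""
    min_update_interval = 1 / max_updates_second if max_updates_second > 0 else 0
    last_update = -1
    skipped = None
    for item in stream:
        cur_time = time.time()
        if cur_time - last_update > min_update_interval:
            last_update = cur_time
            skipped = None
            yield item
        else:
            skipped = item

    if skipped is not None:
        yield skipped  # flush the last update if it was rate-limited away


# Throttle a fast fake token stream to at most 5 UI updates per second.
for update in throttle((f"partial reply {i}" for i in range(1000)), max_updates_second=5):
    print(update)
```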
@@ -265,8 +268,15 @@ def apply_stopping_strings(reply, all_stop_strings):
def get_reply_from_output_ids(output_ids, state, starting_from=0):
    reply = decode(output_ids[starting_from:], state['skip_special_tokens'])

    if (hasattr(shared.tokenizer, 'convert_ids_to_tokens') and len(output_ids) > starting_from and shared.tokenizer.convert_ids_to_tokens(int(output_ids[starting_from])).startswith('▁')) and not reply.startswith(' '):
    # Handle tokenizers that do not add the leading space for the first token
    if (hasattr(shared.tokenizer, 'convert_ids_to_tokens') and len(output_ids) > starting_from) and not reply.startswith(' '):
        first_token = shared.tokenizer.convert_ids_to_tokens(int(output_ids[starting_from]))
        if isinstance(first_token, (bytes,)):
            first_token = first_token.decode('utf8')

        if first_token.startswith('▁'):
            reply = ' ' + reply

    return reply

@@ -110,6 +110,7 @@ def list_interface_input_elements():
        'max_new_tokens',
        'auto_max_new_tokens',
        'max_tokens_second',
        'max_updates_second',
        'seed',
        'temperature',
        'temperature_last',

@@ -66,7 +66,9 @@ def create_ui(default_preset):
    with gr.Row():
        with gr.Column():
            shared.gradio['truncation_length'] = gr.Slider(value=get_truncation_length(), minimum=shared.settings['truncation_length_min'], maximum=shared.settings['truncation_length_max'], step=256, label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. Most models require this to be at most 2048.')
            shared.gradio['max_tokens_second'] = gr.Slider(value=shared.settings['max_tokens_second'], minimum=0, maximum=20, step=1, label='Maximum number of tokens/second', info='To make text readable in real time.')
            shared.gradio['max_tokens_second'] = gr.Slider(value=shared.settings['max_tokens_second'], minimum=0, maximum=20, step=1, label='Maximum tokens/second', info='To make text readable in real time.')
            shared.gradio['max_updates_second'] = gr.Slider(value=shared.settings['max_updates_second'], minimum=0, maximum=24, step=1, label='Maximum UI updates/second', info='Set this if you experience lag in the UI during streaming.')
            shared.gradio['custom_stopping_strings'] = gr.Textbox(lines=1, value=shared.settings["custom_stopping_strings"] or None, label='Custom stopping strings', info='In addition to the defaults. Written between "" and separated by commas.', placeholder='"\\n", "\\nYou:"')
            shared.gradio['custom_token_bans'] = gr.Textbox(value=shared.settings['custom_token_bans'] or None, label='Custom token bans', info='Specific token IDs to ban from generating, comma-separated. The IDs can be found in the Default or Notebook tab.')

@@ -5,6 +5,7 @@ einops
exllamav2==0.0.11; platform_system != "Darwin" and platform_machine != "x86_64"
gradio==3.50.*
hqq==0.1.1.post1
lm_eval==0.3.0
markdown
numpy==1.24.*
optimum==1.16.*
@@ -98,4 +99,4 @@ https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_
https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.27+cu121-py3-none-any.whl
autoawq==0.1.7; platform_system == "Linux" or platform_system == "Windows"
autoawq==0.1.8; platform_system == "Linux" or platform_system == "Windows"

@@ -5,6 +5,7 @@ einops
exllamav2==0.0.11; platform_system == "Windows" or python_version < "3.10" or python_version > "3.11" or platform_machine != "x86_64"
gradio==3.50.*
hqq==0.1.1.post1
lm_eval==0.3.0
markdown
numpy==1.24.*
optimum==1.16.*

@@ -5,6 +5,7 @@ einops
exllamav2==0.0.11; platform_system == "Windows" or python_version < "3.10" or python_version > "3.11" or platform_machine != "x86_64"
gradio==3.50.*
hqq==0.1.1.post1
lm_eval==0.3.0
markdown
numpy==1.24.*
optimum==1.16.*

@@ -5,6 +5,7 @@ einops
exllamav2==0.0.11
gradio==3.50.*
hqq==0.1.1.post1
lm_eval==0.3.0
markdown
numpy==1.24.*
optimum==1.16.*

@@ -5,6 +5,7 @@ einops
exllamav2==0.0.11
gradio==3.50.*
hqq==0.1.1.post1
lm_eval==0.3.0
markdown
numpy==1.24.*
optimum==1.16.*

@@ -5,6 +5,7 @@ einops
exllamav2==0.0.11
gradio==3.50.*
hqq==0.1.1.post1
lm_eval==0.3.0
markdown
numpy==1.24.*
optimum==1.16.*

@@ -5,6 +5,7 @@ einops
exllamav2==0.0.11
gradio==3.50.*
hqq==0.1.1.post1
lm_eval==0.3.0
markdown
numpy==1.24.*
optimum==1.16.*

@@ -5,6 +5,7 @@ einops
exllamav2==0.0.11; platform_system != "Darwin" and platform_machine != "x86_64"
gradio==3.50.*
hqq==0.1.1.post1
lm_eval==0.3.0
markdown
numpy==1.24.*
optimum==1.16.*
@@ -98,4 +99,4 @@ https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_
https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX/ctransformers-0.2.27+cu121-py3-none-any.whl
autoawq==0.1.7; platform_system == "Linux" or platform_system == "Windows"
autoawq==0.1.8; platform_system == "Linux" or platform_system == "Windows"

@@ -5,6 +5,7 @@ einops
exllamav2==0.0.11
gradio==3.50.*
hqq==0.1.1.post1
lm_eval==0.3.0
markdown
numpy==1.24.*
optimum==1.16.*