Merge branch 'dev' into TheLounger-style_improvements

oobabooga 2023-12-24 09:24:55 -08:00
commit 4aeebfc571
22 changed files with 290 additions and 298 deletions


@ -237,7 +237,7 @@ List of command-line flags
| `--no_use_fast` | Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast. |
| `--use_flash_attention_2` | Set use_flash_attention_2=True while loading the model. |
#### bitsandbytes 4-bit
⚠️ Requires minimum compute of 7.0 on Windows at the moment.
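For reference, a hypothetical launch command combining flags from the table above with the bitsandbytes 4-bit options (the model name is a placeholder):

```shell
python server.py --model MyModel-7B --load-in-4bit --use_double_quant --use_flash_attention_2
```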


@ -67,8 +67,56 @@ This extension uses the following parameters (from `settings.json`):
## Usage through API
### Chat completions endpoint
#### With an image URL
```shell
curl http://127.0.0.1:5000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"messages": [
{
"role": "user",
"image_url": "https://avatars.githubusercontent.com/u/112222186?v=4"
},
{
"role": "user",
"content": "What is unusual about this image?"
}
]
}'
```
#### With a Base64 image
```python
import base64
import json
import requests
img = open('image.jpg', 'rb')
img_bytes = img.read()
img_base64 = base64.b64encode(img_bytes).decode('utf-8')
data = { "messages": [
{
"role": "user",
"image_url": f"data:image/jpeg;base64,{img_base64}"
},
{
"role": "user",
"content": "what is unusual about this image?"
}
]
}
response = requests.post('http://127.0.0.1:5000/v1/chat/completions', json=data)
print(response.text)
```
You can run multimodal inference through the API by embedding images in the prompt. Images are embedded like so: `f'<img src="data:image/jpeg;base64,{img_str}">'`, where `img_str` is base-64 JPEG data. Note that you will need to launch `server.py` with the arguments `--api --extensions multimodal`.
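A minimal sketch of building such an embedded-image prompt (standard library only; `image.jpg` is a placeholder path):

```python
import base64

# Read a local image and encode it as base64 JPEG data
with open('image.jpg', 'rb') as f:
    img_str = base64.b64encode(f.read()).decode('utf-8')

# Embed the image in the prompt exactly as described above
prompt = f'<img src="data:image/jpeg;base64,{img_str}">\nWhat is unusual about this image?'
```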
### Completions endpoint
Python example:


@ -1,10 +1,15 @@
import base64
import copy
import re
import time
from collections import deque
from io import BytesIO

import requests
import tiktoken
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import LogitsProcessor, LogitsProcessorList

from extensions.openai.errors import InvalidRequestError
@ -140,7 +145,25 @@ def convert_history(history):
system_message = ""
for entry in history:
if "image_url" in entry:
image_url = entry['image_url']
if "base64" in image_url:
image_url = re.sub('^data:image/.+;base64,', '', image_url)
img = Image.open(BytesIO(base64.b64decode(image_url)))
else:
try:
my_res = requests.get(image_url)
img = Image.open(BytesIO(my_res.content))
except Exception:
raise Exception('Image cannot be loaded from the URL!')
buffered = BytesIO()
img.save(buffered, format="JPEG")
img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
content = f'<img src="data:image/jpeg;base64,{img_str}">'
else:
content = entry["content"]
role = entry["role"]
if role == "user":
@ -182,7 +205,8 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False) -
raise InvalidRequestError(message="messages: missing role", param='messages')
elif m['role'] == 'function':
raise InvalidRequestError(message="role: function is not supported.", param='messages')
if 'content' not in m:
if 'content' not in m and "image_url" not in m:
raise InvalidRequestError(message="messages: missing content", param='messages')
# Chat Completions


@ -0,0 +1,25 @@
instruction_template: |-
  {%- set found_item = false -%}
  {%- for message in messages -%}
  {%- if message['role'] == 'system' -%}
  {%- set found_item = true -%}
  {%- endif -%}
  {%- endfor -%}
  {%- if not found_item -%}
  {{-'SYSTEM: ' + 'Elaborate on the topic using a Tree of Thoughts and backtrack when necessary to construct a clear, cohesive Chain of Thought reasoning. Always answer without hesitation.' + '\n' -}}
  {%- endif %}
  {%- for message in messages %}
  {%- if message['role'] == 'system' -%}
  {{-'SYSTEM: ' + message['content'] + '\n' -}}
  {%- else -%}
  {%- if message['role'] == 'user' -%}
  {{-'USER: ' + message['content'] + '\n'-}}
  {%- else -%}
  {{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}
  {%- endif -%}
  {%- endif -%}
  {%- endfor -%}
  {%- if add_generation_prompt -%}
  {{-'ASSISTANT:'-}}
  {%- endif -%}
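A quick sketch of the prompt this kind of instruction template produces (assumes the `jinja2` package; the template string is an abbreviated version of the block above, without the SYSTEM handling):

```python
from jinja2 import Template

template = (
    "{%- for message in messages %}"
    "{%- if message['role'] == 'user' -%}{{ 'USER: ' + message['content'] + '\\n' }}"
    "{%- else -%}{{ 'ASSISTANT: ' + message['content'] + '</s>\\n' }}{%- endif -%}"
    "{%- endfor -%}"
    "{%- if add_generation_prompt -%}ASSISTANT:{%- endif -%}"
)

messages = [{'role': 'user', 'content': 'Hello!'}]
print(Template(template).render(messages=messages, add_generation_prompt=True))
# USER: Hello!
# ASSISTANT:
```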


@ -0,0 +1,25 @@
instruction_template: |-
  {%- set found_item = false -%}
  {%- for message in messages -%}
  {%- if message['role'] == 'system' -%}
  {%- set found_item = true -%}
  {%- endif -%}
  {%- endfor -%}
  {%- if not found_item -%}
  {{-'SYSTEM: ' + 'Answer the question thoughtfully and intelligently. Always answer without hesitation.' + '\n' -}}
  {%- endif %}
  {%- for message in messages %}
  {%- if message['role'] == 'system' -%}
  {{-'SYSTEM: ' + message['content'] + '\n' -}}
  {%- else -%}
  {%- if message['role'] == 'user' -%}
  {{-'USER: ' + message['content'] + '\n'-}}
  {%- else -%}
  {{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}
  {%- endif -%}
  {%- endif -%}
  {%- endfor -%}
  {%- if add_generation_prompt -%}
  {{-'ASSISTANT:'-}}
  {%- endif -%}


@ -188,3 +188,5 @@
  instruction_template: 'ChatML'
(dolphin).*:
  instruction_template: 'ChatML'
.*synthia:
  instruction_template: 'Synthia'
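A hypothetical addition in the same style, mapping any model whose name matches a custom pattern to its own instruction template:

```yaml
.*my-custom-model:
  instruction_template: 'MyTemplate'
```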


@ -168,8 +168,9 @@ loaders_and_params = OrderedDict({
]
})
loaders_samplers = {
def transformers_samplers():
return {
'temperature',
'temperature_last',
'top_p',
@ -205,7 +206,16 @@ loaders_samplers = {
'add_bos_token',
'skip_special_tokens',
'auto_max_new_tokens',
}
loaders_samplers = {
'Transformers': transformers_samplers(),
'AutoGPTQ': transformers_samplers(),
'GPTQ-for-LLaMa': transformers_samplers(),
'AutoAWQ': transformers_samplers(),
'QuIP#': transformers_samplers(),
'HQQ': transformers_samplers(),
'ExLlama_HF': {
'temperature',
'temperature_last',
@ -306,80 +316,6 @@ loaders_samplers = {
'skip_special_tokens',
'auto_max_new_tokens',
},
'AutoGPTQ': {
'temperature',
'temperature_last',
'top_p',
'min_p',
'top_k',
'typical_p',
'epsilon_cutoff',
'eta_cutoff',
'tfs',
'top_a',
'repetition_penalty',
'presence_penalty',
'frequency_penalty',
'repetition_penalty_range',
'encoder_repetition_penalty',
'no_repeat_ngram_size',
'min_length',
'seed',
'do_sample',
'penalty_alpha',
'num_beams',
'length_penalty',
'early_stopping',
'mirostat_mode',
'mirostat_tau',
'mirostat_eta',
'grammar_file_row',
'grammar_string',
'guidance_scale',
'negative_prompt',
'ban_eos_token',
'custom_token_bans',
'add_bos_token',
'skip_special_tokens',
'auto_max_new_tokens',
},
'GPTQ-for-LLaMa': {
'temperature',
'temperature_last',
'top_p',
'min_p',
'top_k',
'typical_p',
'epsilon_cutoff',
'eta_cutoff',
'tfs',
'top_a',
'repetition_penalty',
'presence_penalty',
'frequency_penalty',
'repetition_penalty_range',
'encoder_repetition_penalty',
'no_repeat_ngram_size',
'min_length',
'seed',
'do_sample',
'penalty_alpha',
'num_beams',
'length_penalty',
'early_stopping',
'mirostat_mode',
'mirostat_tau',
'mirostat_eta',
'grammar_file_row',
'grammar_string',
'guidance_scale',
'negative_prompt',
'ban_eos_token',
'custom_token_bans',
'add_bos_token',
'skip_special_tokens',
'auto_max_new_tokens',
},
'llama.cpp': {
'temperature',
'top_p',
@ -439,117 +375,6 @@ loaders_samplers = {
'repetition_penalty',
'repetition_penalty_range',
},
'AutoAWQ': {
'temperature',
'temperature_last',
'top_p',
'min_p',
'top_k',
'typical_p',
'epsilon_cutoff',
'eta_cutoff',
'tfs',
'top_a',
'repetition_penalty',
'presence_penalty',
'frequency_penalty',
'repetition_penalty_range',
'encoder_repetition_penalty',
'no_repeat_ngram_size',
'min_length',
'seed',
'do_sample',
'penalty_alpha',
'num_beams',
'length_penalty',
'early_stopping',
'mirostat_mode',
'mirostat_tau',
'mirostat_eta',
'grammar_file_row',
'grammar_string',
'guidance_scale',
'negative_prompt',
'ban_eos_token',
'custom_token_bans',
'add_bos_token',
'skip_special_tokens',
'auto_max_new_tokens',
},
'QuIP#': {
'temperature',
'temperature_last',
'top_p',
'min_p',
'top_k',
'typical_p',
'epsilon_cutoff',
'eta_cutoff',
'tfs',
'top_a',
'repetition_penalty',
'presence_penalty',
'frequency_penalty',
'repetition_penalty_range',
'encoder_repetition_penalty',
'no_repeat_ngram_size',
'min_length',
'seed',
'do_sample',
'penalty_alpha',
'num_beams',
'length_penalty',
'early_stopping',
'mirostat_mode',
'mirostat_tau',
'mirostat_eta',
'grammar_file_row',
'grammar_string',
'guidance_scale',
'negative_prompt',
'ban_eos_token',
'custom_token_bans',
'add_bos_token',
'skip_special_tokens',
'auto_max_new_tokens',
},
'HQQ': {
'temperature',
'temperature_last',
'top_p',
'min_p',
'top_k',
'typical_p',
'epsilon_cutoff',
'eta_cutoff',
'tfs',
'top_a',
'repetition_penalty',
'presence_penalty',
'frequency_penalty',
'repetition_penalty_range',
'encoder_repetition_penalty',
'no_repeat_ngram_size',
'min_length',
'seed',
'do_sample',
'penalty_alpha',
'num_beams',
'length_penalty',
'early_stopping',
'mirostat_mode',
'mirostat_tau',
'mirostat_eta',
'grammar_file_row',
'grammar_string',
'guidance_scale',
'negative_prompt',
'ban_eos_token',
'custom_token_bans',
'add_bos_token',
'skip_special_tokens',
'auto_max_new_tokens',
},
}
loaders_model_types = {


@ -482,6 +482,7 @@ def clear_torch_cache():
def unload_model():
shared.model = shared.tokenizer = None
shared.model_name = 'None'
shared.lora_names = []
shared.model_dirty_from_training = False
clear_torch_cache()


@ -45,6 +45,7 @@ settings = {
'truncation_length_min': 0,
'truncation_length_max': 200000,
'max_tokens_second': 0,
'max_updates_second': 0,
'custom_stopping_strings': '',
'custom_token_bans': '',
'auto_max_new_tokens': False,
@ -64,137 +65,155 @@ settings = {
'default_extensions': ['gallery'],
}
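A hypothetical `settings.yaml` fragment using the new option (10 is an arbitrary example value):

```yaml
# Throttle the Gradio UI to at most 10 updates per second while streaming
max_updates_second: 10
```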
parser = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=54))
# Parser copied from https://github.com/vladmandic/automatic
parser = argparse.ArgumentParser(description="Text generation web UI", conflict_handler='resolve', add_help=True, formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=55, indent_increment=2, width=200))
# Basic settings
group = parser.add_argument_group('Basic settings')
group.add_argument('--multi-user', action='store_true', help='Multi-user mode. Chat histories are not saved or automatically loaded. Warning: this is likely not safe for sharing publicly.')
group.add_argument('--character', type=str, help='The name of the character to load in chat mode by default.')
group.add_argument('--model', type=str, help='Name of the model to load by default.')
group.add_argument('--lora', type=str, nargs='+', help='The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces.')
group.add_argument('--model-dir', type=str, default='models/', help='Path to directory with all the models.')
group.add_argument('--lora-dir', type=str, default='loras/', help='Path to directory with all the loras.')
group.add_argument('--model-menu', action='store_true', help='Show a model menu in the terminal when the web UI is first launched.')
group.add_argument('--settings', type=str, help='Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this file will be loaded by default without the need to use the --settings flag.')
group.add_argument('--extensions', type=str, nargs='+', help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.')
group.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.')
group.add_argument('--chat-buttons', action='store_true', help='Show buttons on the chat tab instead of a hover menu.')

# Model loader
group = parser.add_argument_group('Model loader')
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlama_HF, ExLlamav2_HF, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ExLlama, ExLlamav2, ctransformers, QuIP#.')

# Transformers/Accelerate
group = parser.add_argument_group('Transformers/Accelerate')
group.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.')
group.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
group.add_argument('--gpu-memory', type=str, nargs='+', help='Maximum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs. You can also set values in MiB like --gpu-memory 3500MiB.')
group.add_argument('--cpu-memory', type=str, help='Maximum CPU memory in GiB to allocate for offloaded weights. Same as above.')
group.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
group.add_argument('--disk-cache-dir', type=str, default='cache', help='Directory to save the disk cache to. Defaults to "cache".')
group.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision (using bitsandbytes).')
group.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.')
group.add_argument('--no-cache', action='store_true', help='Set use_cache to False while generating text. This reduces VRAM usage slightly, but it comes at a performance cost.')
group.add_argument('--xformers', action='store_true', help='Use xformer\'s memory efficient attention. This is really old and probably doesn\'t do anything.')
group.add_argument('--sdp-attention', action='store_true', help='Use PyTorch 2.0\'s SDP attention. Same as above.')
group.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.')
group.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.')
group.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Use this if you have any problems related to use_fast.')
group.add_argument('--use_flash_attention_2', action='store_true', help='Set use_flash_attention_2=True while loading the model.')

# bitsandbytes 4-bit
group = parser.add_argument_group('bitsandbytes 4-bit')
group.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision (using bitsandbytes).')
group.add_argument('--use_double_quant', action='store_true', help='use_double_quant for 4-bit.')
group.add_argument('--compute_dtype', type=str, default='float16', help='compute dtype for 4-bit. Valid options: bfloat16, float16, float32.')
group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for 4-bit. Valid options: nf4, fp4.')

# llama.cpp
group = parser.add_argument_group('llama.cpp')
group.add_argument('--tensorcores', action='store_true', help='Use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards. NVIDIA only.')
group.add_argument('--n_ctx', type=int, default=2048, help='Size of the prompt context.')
group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
group.add_argument('--no_mul_mat_q', action='store_true', help='Disable the mulmat kernels.')
group.add_argument('--n_batch', type=int, default=512, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
group.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.')
group.add_argument('--tensor_split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 18,17.')
group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
group.add_argument('--logits_all', action='store_true', help='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
group.add_argument('--no_offload_kqv', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
group.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.')

# ExLlama
group = parser.add_argument_group('ExLlama')
group.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.')
group.add_argument('--max_seq_len', type=int, default=2048, help='Maximum sequence length.')
group.add_argument('--cfg-cache', action='store_true', help='ExLlama_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader, but not necessary for CFG with base ExLlama.')
group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.')
group.add_argument('--cache_8bit', action='store_true', help='Use 8-bit cache to save VRAM.')
group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.')

# AutoGPTQ
group = parser.add_argument_group('AutoGPTQ')
group.add_argument('--triton', action='store_true', help='Use triton.')
group.add_argument('--no_inject_fused_attention', action='store_true', help='Disable the use of fused attention, which will use less VRAM at the cost of slower inference.')
group.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference.')
group.add_argument('--no_use_cuda_fp16', action='store_true', help='This can make models faster on some systems.')
group.add_argument('--desc_act', action='store_true', help='For models that do not have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.')
group.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel, which can improve inference speed on some systems.')
group.add_argument('--disable_exllamav2', action='store_true', help='Disable ExLlamav2 kernel.')

# GPTQ-for-LLaMa
group = parser.add_argument_group('GPTQ-for-LLaMa')
group.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
group.add_argument('--model_type', type=str, help='Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported.')
group.add_argument('--groupsize', type=int, default=-1, help='Group size.')
group.add_argument('--pre_layer', type=int, nargs='+', help='The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated by spaces, eg --pre_layer 30 60.')
group.add_argument('--checkpoint', type=str, help='The path to the quantized checkpoint file. If not specified, it will be automatically detected.')
group.add_argument('--monkey-patch', action='store_true', help='Apply the monkey patch for using LoRAs with quantized models.')

# HQQ
group = parser.add_argument_group('HQQ')
group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')

# DeepSpeed
group = parser.add_argument_group('DeepSpeed')
group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.')
group.add_argument('--nvme-offload-dir', type=str, help='DeepSpeed: Directory to use for ZeRO-3 NVME offloading.')
group.add_argument('--local_rank', type=int, default=0, help='DeepSpeed: Optional argument for distributed setups.')

# RWKV
group = parser.add_argument_group('RWKV')
group.add_argument('--rwkv-strategy', type=str, default=None, help='RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8".')
group.add_argument('--rwkv-cuda-on', action='store_true', help='RWKV: Compile the CUDA kernel for better performance.')

# RoPE
group = parser.add_argument_group('RoPE')
group.add_argument('--alpha_value', type=float, default=1, help='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.')
group.add_argument('--rope_freq_base', type=int, default=0, help='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63).')
group.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.")

# Gradio
group = parser.add_argument_group('Gradio')
group.add_argument('--listen', action='store_true', help='Make the web UI reachable from your local network.')
group.add_argument('--listen-port', type=int, help='The listening port that the server will use.')
group.add_argument('--listen-host', type=str, help='The hostname that the server will use.')
group.add_argument('--share', action='store_true', help='Create a public URL. This is useful for running the web UI on Google Colab or similar.')
group.add_argument('--auto-launch', action='store_true', default=False, help='Open the web UI in the default browser upon launch.')
group.add_argument('--gradio-auth', type=str, help='Set Gradio authentication password in the format "username:password". Multiple credentials can also be supplied with "u1:p1,u2:p2,u3:p3".', default=None)
group.add_argument('--gradio-auth-path', type=str, help='Set the Gradio authentication file path. The file should contain one or more user:password pairs in the same format as above.', default=None)
group.add_argument('--ssl-keyfile', type=str, help='The path to the SSL certificate key file.', default=None)
group.add_argument('--ssl-certfile', type=str, help='The path to the SSL certificate cert file.', default=None)

# API
group = parser.add_argument_group('API')
group.add_argument('--api', action='store_true', help='Enable the API extension.')
group.add_argument('--public-api', action='store_true', help='Create a public URL for the API using Cloudfare.')
group.add_argument('--public-api-id', type=str, help='Tunnel ID for named Cloudflare Tunnel. Use together with public-api option.', default=None)
group.add_argument('--api-port', type=int, default=5000, help='The listening port for the API.')
group.add_argument('--api-key', type=str, default='', help='API authentication key.')
group.add_argument('--admin-key', type=str, default='', help='API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key.')
group.add_argument('--nowebui', action='store_true', help='Do not launch the Gradio UI. Useful for launching the API in standalone mode.')

# Multimodal
group = parser.add_argument_group('Multimodal')
group.add_argument('--multimodal-pipeline', type=str, default=None, help='The multimodal pipeline to use. Examples: llava-7b, llava-13b.')

# Deprecated parameters
group = parser.add_argument_group('Deprecated')
group.add_argument('--notebook', action='store_true', help='DEPRECATED')
group.add_argument('--chat', action='store_true', help='DEPRECATED')
group.add_argument('--no-stream', action='store_true', help='DEPRECATED')
group.add_argument('--mul_mat_q', action='store_true', help='DEPRECATED')
group.add_argument('--api-blocking-port', type=int, default=5000, help='DEPRECATED')
group.add_argument('--api-streaming-port', type=int, default=5005, help='DEPRECATED')
group.add_argument('--llama_cpp_seed', type=int, default=0, help='DEPRECATED')
group.add_argument('--use_fast', action='store_true', help='DEPRECATED')

args = parser.parse_args()
args_defaults = parser.parse_args([])
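For illustration, a hypothetical launch command combining flags defined above, starting the OpenAI-compatible API without the Gradio UI (the key value is a placeholder):

```shell
python server.py --api --nowebui --api-port 5000 --api-key mysecretkey
```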


@ -77,6 +77,10 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
state = copy.deepcopy(state)
state['stream'] = True
min_update_interval = 0
if state.get('max_updates_second', 0) > 0:
min_update_interval = 1 / state['max_updates_second']
# Generate
for reply in generate_func(question, original_question, seed, state, stopping_strings, is_chat=is_chat):
reply, stop_found = apply_stopping_strings(reply, all_stop_strings)
@ -94,10 +98,9 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
last_update = time.time()
yield reply
# Limit updates to avoid lag in the Gradio UI
# API updates are not limited
else:
min_update_interval = 0 if not for_ui else 0.2 if (shared.args.listen or shared.args.share) else 0.0417
if cur_time - last_update > min_update_interval:
last_update = cur_time
yield reply
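A self-contained sketch of the throttling pattern above (a toy generator, not the web UI's actual code): a chunk is forwarded only if at least `1 / max_updates_second` seconds have passed since the last update; chunks arriving faster are simply skipped here, whereas the real loop keeps accumulating the reply and always yields the final version.

```python
import time

def throttled(chunks, max_updates_second=0):
    # 0 disables throttling, matching the behavior of the setting above
    min_update_interval = 1 / max_updates_second if max_updates_second > 0 else 0
    last_update = 0.0
    for chunk in chunks:
        cur_time = time.time()
        if cur_time - last_update > min_update_interval:
            last_update = cur_time
            yield chunk

def produce():
    for i in range(50):
        time.sleep(0.01)  # a new chunk roughly every 10 ms
        yield f'token {i}'

for piece in throttled(produce(), max_updates_second=5):
    print(piece)  # at most ~5 lines are printed per second
```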
@ -265,7 +268,14 @@ def apply_stopping_strings(reply, all_stop_strings):
def get_reply_from_output_ids(output_ids, state, starting_from=0):
reply = decode(output_ids[starting_from:], state['skip_special_tokens'])
if (hasattr(shared.tokenizer, 'convert_ids_to_tokens') and len(output_ids) > starting_from and shared.tokenizer.convert_ids_to_tokens(int(output_ids[starting_from])).startswith('▁')) and not reply.startswith(' '):
# Handle tokenizers that do not add the leading space for the first token
if (hasattr(shared.tokenizer, 'convert_ids_to_tokens') and len(output_ids) > starting_from) and not reply.startswith(' '):
first_token = shared.tokenizer.convert_ids_to_tokens(int(output_ids[starting_from]))
if isinstance(first_token, (bytes,)):
first_token = first_token.decode('utf8')
if first_token.startswith('▁'):
reply = ' ' + reply
return reply
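A self-contained sketch of the leading-space repair above, using a hypothetical StubTokenizer that mimics a SentencePiece tokenizer (the metaspace marker '▁' denotes a leading space that `decode` drops for the first token):

```python
class StubTokenizer:
    # Hypothetical stand-in for a SentencePiece-style tokenizer
    vocab = {1: '▁Hello', 2: '▁world'}

    def convert_ids_to_tokens(self, token_id):
        return self.vocab[token_id]

    def decode(self, ids):
        text = ''.join(self.vocab[i] for i in ids).replace('▁', ' ')
        return text.lstrip()  # the dropped leading space that the fix restores

tokenizer = StubTokenizer()
output_ids = [1, 2]

reply = tokenizer.decode(output_ids)  # 'Hello world', leading space lost
first_token = tokenizer.convert_ids_to_tokens(output_ids[0])
if first_token.startswith('▁') and not reply.startswith(' '):
    reply = ' ' + reply

print(repr(reply))  # ' Hello world'
```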


@ -110,6 +110,7 @@ def list_interface_input_elements():
'max_new_tokens',
'auto_max_new_tokens',
'max_tokens_second',
'max_updates_second',
'seed',
'temperature',
'temperature_last',


@ -66,7 +66,9 @@ def create_ui(default_preset):
with gr.Row():
with gr.Column():
shared.gradio['truncation_length'] = gr.Slider(value=get_truncation_length(), minimum=shared.settings['truncation_length_min'], maximum=shared.settings['truncation_length_max'], step=256, label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. Most models require this to be at most 2048.')
shared.gradio['max_tokens_second'] = gr.Slider(value=shared.settings['max_tokens_second'], minimum=0, maximum=20, step=1, label='Maximum tokens/second', info='To make text readable in real time.')
shared.gradio['max_updates_second'] = gr.Slider(value=shared.settings['max_updates_second'], minimum=0, maximum=20, step=1, label='Maximum UI updates/second', info='Set this if you experience lag in the UI during streaming.')
shared.gradio['custom_stopping_strings'] = gr.Textbox(lines=1, value=shared.settings["custom_stopping_strings"] or None, label='Custom stopping strings', info='In addition to the defaults. Written between "" and separated by commas.', placeholder='"\\n", "\\nYou:"')
shared.gradio['custom_token_bans'] = gr.Textbox(value=shared.settings['custom_token_bans'] or None, label='Custom token bans', info='Specific token IDs to ban from generating, comma-separated. The IDs can be found in the Default or Notebook tab.')


@ -5,6 +5,7 @@ einops
exllamav2==0.0.11; platform_system != "Darwin" and platform_machine != "x86_64"
gradio==3.50.*
hqq==0.1.1.post1
lm_eval==0.3.0
markdown
numpy==1.24.*
optimum==1.16.*


@ -5,6 +5,7 @@ einops
exllamav2==0.0.11; platform_system == "Windows" or python_version < "3.10" or python_version > "3.11" or platform_machine != "x86_64"
gradio==3.50.*
hqq==0.1.1.post1
lm_eval==0.3.0
markdown
numpy==1.24.*
optimum==1.16.*


@ -5,6 +5,7 @@ einops
exllamav2==0.0.11; platform_system == "Windows" or python_version < "3.10" or python_version > "3.11" or platform_machine != "x86_64"
gradio==3.50.*
hqq==0.1.1.post1
lm_eval==0.3.0
markdown
numpy==1.24.*
optimum==1.16.*


@ -5,6 +5,7 @@ einops
exllamav2==0.0.11
gradio==3.50.*
hqq==0.1.1.post1
lm_eval==0.3.0
markdown
numpy==1.24.*
optimum==1.16.*


@ -5,6 +5,7 @@ einops
exllamav2==0.0.11
gradio==3.50.*
hqq==0.1.1.post1
lm_eval==0.3.0
markdown
numpy==1.24.*
optimum==1.16.*


@ -5,6 +5,7 @@ einops
exllamav2==0.0.11
gradio==3.50.*
hqq==0.1.1.post1
lm_eval==0.3.0
markdown
numpy==1.24.*
optimum==1.16.*


@ -5,6 +5,7 @@ einops
exllamav2==0.0.11
gradio==3.50.*
hqq==0.1.1.post1
lm_eval==0.3.0
markdown
numpy==1.24.*
optimum==1.16.*


@ -5,6 +5,7 @@ einops
exllamav2==0.0.11; platform_system != "Darwin" and platform_machine != "x86_64"
gradio==3.50.*
hqq==0.1.1.post1
lm_eval==0.3.0
markdown
numpy==1.24.*
optimum==1.16.*


@ -5,6 +5,7 @@ einops
exllamav2==0.0.11
gradio==3.50.*
hqq==0.1.1.post1
lm_eval==0.3.0
markdown
numpy==1.24.*
optimum==1.16.*


@ -1,6 +1,8 @@
import os
import warnings
from modules import shared
import accelerate  # This early import makes Intel GPUs happy
import modules.one_click_installer_check
@ -36,7 +38,6 @@ import yaml
import modules.extensions as extensions_module
from modules import (
chat,
shared,
training,
ui,
ui_chat,