mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-22 16:17:57 +01:00
Merge branch 'main' into fix/api-reload
This commit is contained in:
commit
bfe960731f
1
.gitignore
vendored
1
.gitignore
vendored
@ -2,6 +2,7 @@ cache/*
|
|||||||
characters/*
|
characters/*
|
||||||
extensions/silero_tts/outputs/*
|
extensions/silero_tts/outputs/*
|
||||||
extensions/elevenlabs_tts/outputs/*
|
extensions/elevenlabs_tts/outputs/*
|
||||||
|
extensions/sd_api_pictures/outputs/*
|
||||||
logs/*
|
logs/*
|
||||||
loras/*
|
loras/*
|
||||||
models/*
|
models/*
|
||||||
|
24
README.md
24
README.md
@ -84,10 +84,6 @@ pip install -r requirements.txt
|
|||||||
>
|
>
|
||||||
> For bitsandbytes and `--load-in-8bit` to work on Linux/WSL, this dirty fix is currently necessary: https://github.com/oobabooga/text-generation-webui/issues/400#issuecomment-1474876859
|
> For bitsandbytes and `--load-in-8bit` to work on Linux/WSL, this dirty fix is currently necessary: https://github.com/oobabooga/text-generation-webui/issues/400#issuecomment-1474876859
|
||||||
|
|
||||||
### Alternative: native Windows installation
|
|
||||||
|
|
||||||
As an alternative to the recommended WSL method, you can install the web UI natively on Windows using this guide. It will be a lot harder and the performance may be slower: [Installation instructions for human beings](https://github.com/oobabooga/text-generation-webui/wiki/Installation-instructions-for-human-beings).
|
|
||||||
|
|
||||||
### Alternative: one-click installers
|
### Alternative: one-click installers
|
||||||
|
|
||||||
[oobabooga-windows.zip](https://github.com/oobabooga/one-click-installers/archive/refs/heads/oobabooga-windows.zip)
|
[oobabooga-windows.zip](https://github.com/oobabooga/one-click-installers/archive/refs/heads/oobabooga-windows.zip)
|
||||||
@ -101,7 +97,13 @@ Just download the zip above, extract it, and double click on "install". The web
|
|||||||
|
|
||||||
Source codes: https://github.com/oobabooga/one-click-installers
|
Source codes: https://github.com/oobabooga/one-click-installers
|
||||||
|
|
||||||
This method lags behind the newest developments and does not support 8-bit mode on Windows without additional set up: https://github.com/oobabooga/text-generation-webui/issues/147#issuecomment-1456040134, https://github.com/oobabooga/text-generation-webui/issues/20#issuecomment-1411650652
|
> **Note**
|
||||||
|
>
|
||||||
|
> To get 8-bit and 4-bit models working in your 1-click Windows installation, you can use the [one-click-bandaid](https://github.com/ClayShoaf/oobabooga-one-click-bandaid).
|
||||||
|
|
||||||
|
### Alternative: native Windows installation
|
||||||
|
|
||||||
|
As an alternative to the recommended WSL method, you can install the web UI natively on Windows using this guide. It will be a lot harder and the performance may be slower: [Installation instructions for human beings](https://github.com/oobabooga/text-generation-webui/wiki/Installation-instructions-for-human-beings).
|
||||||
|
|
||||||
### Alternative: Docker
|
### Alternative: Docker
|
||||||
|
|
||||||
@ -175,15 +177,17 @@ Optionally, you can use the following command-line flags:
|
|||||||
| `--cpu` | Use the CPU to generate text.|
|
| `--cpu` | Use the CPU to generate text.|
|
||||||
| `--load-in-8bit` | Load the model with 8-bit precision.|
|
| `--load-in-8bit` | Load the model with 8-bit precision.|
|
||||||
| `--load-in-4bit` | DEPRECATED: use `--gptq-bits 4` instead. |
|
| `--load-in-4bit` | DEPRECATED: use `--gptq-bits 4` instead. |
|
||||||
| `--gptq-bits GPTQ_BITS` | Load a pre-quantized model with specified precision. 2, 3, 4 and 8 (bit) are supported. Currently only works with LLaMA and OPT. |
|
| `--gptq-bits GPTQ_BITS` | GPTQ: Load a pre-quantized model with specified precision. 2, 3, 4 and 8 (bit) are supported. Currently only works with LLaMA and OPT. |
|
||||||
| `--gptq-model-type MODEL_TYPE` | Model type of pre-quantized model. Currently only LLaMa and OPT are supported. |
|
| `--gptq-model-type MODEL_TYPE` | GPTQ: Model type of pre-quantized model. Currently only LLaMa and OPT are supported. |
|
||||||
|
| `--gptq-pre-layer GPTQ_PRE_LAYER` | GPTQ: The number of layers to preload. |
|
||||||
| `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. |
|
| `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. |
|
||||||
| `--auto-devices` | Automatically split the model across the available GPU(s) and CPU.|
|
| `--auto-devices` | Automatically split the model across the available GPU(s) and CPU.|
|
||||||
| `--disk` | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. |
|
| `--disk` | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. |
|
||||||
| `--disk-cache-dir DISK_CACHE_DIR` | Directory to save the disk cache to. Defaults to `cache/`. |
|
| `--disk-cache-dir DISK_CACHE_DIR` | Directory to save the disk cache to. Defaults to `cache/`. |
|
||||||
| `--gpu-memory GPU_MEMORY [GPU_MEMORY ...]` | Maxmimum GPU memory in GiB to be allocated per GPU. Example: `--gpu-memory 10` for a single GPU, `--gpu-memory 10 5` for two GPUs. |
|
| `--gpu-memory GPU_MEMORY [GPU_MEMORY ...]` | Maximum GPU memory in GiB to be allocated per GPU. Example: `--gpu-memory 10` for a single GPU, `--gpu-memory 10 5` for two GPUs. You can also set values in MiB like `--gpu-memory 3500MiB`. |
|
||||||
| `--cpu-memory CPU_MEMORY` | Maximum CPU memory in GiB to allocate for offloaded weights. Must be an integer number. Defaults to 99.|
|
| `--cpu-memory CPU_MEMORY` | Maximum CPU memory in GiB to allocate for offloaded weights. Must be an integer number. Defaults to 99.|
|
||||||
| `--flexgen` | Enable the use of FlexGen offloading. |
|
| `--no-cache` | Set `use_cache` to False while generating text. This reduces the VRAM usage a bit with a performance cost. |
|
||||||
|
| `--flexgen` | Enable the use of FlexGen offloading. |
|
||||||
| `--percent PERCENT [PERCENT ...]` | FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0). |
|
| `--percent PERCENT [PERCENT ...]` | FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0). |
|
||||||
| `--compress-weight` | FlexGen: Whether to compress weight (default: False).|
|
| `--compress-weight` | FlexGen: Whether to compress weight (default: False).|
|
||||||
| `--pin-weight [PIN_WEIGHT]` | FlexGen: whether to pin weights (setting this to False reduces CPU memory by 20%). |
|
| `--pin-weight [PIN_WEIGHT]` | FlexGen: whether to pin weights (setting this to False reduces CPU memory by 20%). |
|
||||||
@ -201,7 +205,7 @@ Optionally, you can use the following command-line flags:
|
|||||||
| `--auto-launch` | Open the web UI in the default browser upon launch. |
|
| `--auto-launch` | Open the web UI in the default browser upon launch. |
|
||||||
| `--verbose` | Print the prompts to the terminal. |
|
| `--verbose` | Print the prompts to the terminal. |
|
||||||
|
|
||||||
Out of memory errors? [Check this guide](https://github.com/oobabooga/text-generation-webui/wiki/Low-VRAM-guide).
|
Out of memory errors? [Check the low VRAM guide](https://github.com/oobabooga/text-generation-webui/wiki/Low-VRAM-guide).
|
||||||
|
|
||||||
## Presets
|
## Presets
|
||||||
|
|
||||||
|
@ -34,6 +34,7 @@ async def run(context):
|
|||||||
'penalty_alpha': 0,
|
'penalty_alpha': 0,
|
||||||
'length_penalty': 1,
|
'length_penalty': 1,
|
||||||
'early_stopping': False,
|
'early_stopping': False,
|
||||||
|
'seed': -1,
|
||||||
}
|
}
|
||||||
session = random_hash()
|
session = random_hash()
|
||||||
|
|
||||||
@ -44,14 +45,14 @@ async def run(context):
|
|||||||
case "send_hash":
|
case "send_hash":
|
||||||
await websocket.send(json.dumps({
|
await websocket.send(json.dumps({
|
||||||
"session_hash": session,
|
"session_hash": session,
|
||||||
"fn_index": 7
|
"fn_index": 12
|
||||||
}))
|
}))
|
||||||
case "estimation":
|
case "estimation":
|
||||||
pass
|
pass
|
||||||
case "send_data":
|
case "send_data":
|
||||||
await websocket.send(json.dumps({
|
await websocket.send(json.dumps({
|
||||||
"session_hash": session,
|
"session_hash": session,
|
||||||
"fn_index": 7,
|
"fn_index": 12,
|
||||||
"data": [
|
"data": [
|
||||||
context,
|
context,
|
||||||
params['max_new_tokens'],
|
params['max_new_tokens'],
|
||||||
@ -68,6 +69,7 @@ async def run(context):
|
|||||||
params['penalty_alpha'],
|
params['penalty_alpha'],
|
||||||
params['length_penalty'],
|
params['length_penalty'],
|
||||||
params['early_stopping'],
|
params['early_stopping'],
|
||||||
|
params['seed'],
|
||||||
]
|
]
|
||||||
}))
|
}))
|
||||||
case "process_starts":
|
case "process_starts":
|
||||||
|
@ -32,6 +32,7 @@ params = {
|
|||||||
'penalty_alpha': 0,
|
'penalty_alpha': 0,
|
||||||
'length_penalty': 1,
|
'length_penalty': 1,
|
||||||
'early_stopping': False,
|
'early_stopping': False,
|
||||||
|
'seed': -1,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Input prompt
|
# Input prompt
|
||||||
@ -54,6 +55,7 @@ response = requests.post(f"http://{server}:7860/run/textgen", json={
|
|||||||
params['penalty_alpha'],
|
params['penalty_alpha'],
|
||||||
params['length_penalty'],
|
params['length_penalty'],
|
||||||
params['early_stopping'],
|
params['early_stopping'],
|
||||||
|
params['seed'],
|
||||||
]
|
]
|
||||||
}).json()
|
}).json()
|
||||||
|
|
||||||
|
@ -50,3 +50,7 @@ ol li p, ul li p {
|
|||||||
#main, #parameters, #chat-settings, #interface-mode, #lora {
|
#main, #parameters, #chat-settings, #interface-mode, #lora {
|
||||||
border: 0;
|
border: 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.gradio-container-3-18-0 .prose * h1, h2, h3, h4 {
|
||||||
|
color: white;
|
||||||
|
}
|
||||||
|
@ -116,10 +116,11 @@ def get_download_links_from_huggingface(model, branch):
|
|||||||
|
|
||||||
is_pytorch = re.match("(pytorch|adapter)_model.*\.bin", fname)
|
is_pytorch = re.match("(pytorch|adapter)_model.*\.bin", fname)
|
||||||
is_safetensors = re.match("model.*\.safetensors", fname)
|
is_safetensors = re.match("model.*\.safetensors", fname)
|
||||||
|
is_pt = re.match(".*\.pt", fname)
|
||||||
is_tokenizer = re.match("tokenizer.*\.model", fname)
|
is_tokenizer = re.match("tokenizer.*\.model", fname)
|
||||||
is_text = re.match(".*\.(txt|json)", fname) or is_tokenizer
|
is_text = re.match(".*\.(txt|json|py)", fname) or is_tokenizer
|
||||||
|
|
||||||
if any((is_pytorch, is_safetensors, is_text, is_tokenizer)):
|
if any((is_pytorch, is_safetensors, is_pt, is_tokenizer, is_text)):
|
||||||
if is_text:
|
if is_text:
|
||||||
links.append(f"https://huggingface.co/{model}/resolve/{branch}/{fname}")
|
links.append(f"https://huggingface.co/{model}/resolve/{branch}/{fname}")
|
||||||
classifications.append('text')
|
classifications.append('text')
|
||||||
@ -132,7 +133,8 @@ def get_download_links_from_huggingface(model, branch):
|
|||||||
elif is_pytorch:
|
elif is_pytorch:
|
||||||
has_pytorch = True
|
has_pytorch = True
|
||||||
classifications.append('pytorch')
|
classifications.append('pytorch')
|
||||||
|
elif is_pt:
|
||||||
|
classifications.append('pt')
|
||||||
|
|
||||||
cursor = base64.b64encode(f'{{"file_name":"{dict[-1]["path"]}"}}'.encode()) + b':50'
|
cursor = base64.b64encode(f'{{"file_name":"{dict[-1]["path"]}"}}'.encode()) + b':50'
|
||||||
cursor = base64.b64encode(cursor)
|
cursor = base64.b64encode(cursor)
|
||||||
|
@ -57,6 +57,7 @@ class Handler(BaseHTTPRequestHandler):
|
|||||||
penalty_alpha=0,
|
penalty_alpha=0,
|
||||||
length_penalty=1,
|
length_penalty=1,
|
||||||
early_stopping=False,
|
early_stopping=False,
|
||||||
|
seed=-1,
|
||||||
)
|
)
|
||||||
|
|
||||||
answer = ''
|
answer = ''
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import gradio as gr
|
import gradio as gr
|
||||||
|
import modules.shared as shared
|
||||||
from elevenlabslib import ElevenLabsUser
|
from elevenlabslib import ElevenLabsUser
|
||||||
from elevenlabslib.helpers import save_bytes_to_path
|
from elevenlabslib.helpers import save_bytes_to_path
|
||||||
|
|
||||||
@ -15,6 +17,9 @@ wav_idx = 0
|
|||||||
user = ElevenLabsUser(params['api_key'])
|
user = ElevenLabsUser(params['api_key'])
|
||||||
user_info = None
|
user_info = None
|
||||||
|
|
||||||
|
if not shared.args.no_stream:
|
||||||
|
print("Please add --no-stream. This extension is not meant to be used with streaming.")
|
||||||
|
raise ValueError
|
||||||
|
|
||||||
# Check if the API is valid and refresh the UI accordingly.
|
# Check if the API is valid and refresh the UI accordingly.
|
||||||
def check_valid_api():
|
def check_valid_api():
|
||||||
@ -47,14 +52,9 @@ def refresh_voices():
|
|||||||
return
|
return
|
||||||
|
|
||||||
def remove_surrounded_chars(string):
|
def remove_surrounded_chars(string):
|
||||||
new_string = ""
|
# this expression matches to 'as few symbols as possible (0 upwards) between any asterisks' OR
|
||||||
in_star = False
|
# 'as few symbols as possible (0 upwards) between an asterisk and the end of the string'
|
||||||
for char in string:
|
return re.sub('\*[^\*]*?(\*|$)','',string)
|
||||||
if char == '*':
|
|
||||||
in_star = not in_star
|
|
||||||
elif not in_star:
|
|
||||||
new_string += char
|
|
||||||
return new_string
|
|
||||||
|
|
||||||
def input_modifier(string):
|
def input_modifier(string):
|
||||||
"""
|
"""
|
||||||
|
179
extensions/sd_api_pictures/script.py
Normal file
179
extensions/sd_api_pictures/script.py
Normal file
@ -0,0 +1,179 @@
|
|||||||
|
import base64
|
||||||
|
import io
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import gradio as gr
|
||||||
|
import modules.chat as chat
|
||||||
|
import modules.shared as shared
|
||||||
|
import requests
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
torch._C._jit_set_profiling_mode(False)
|
||||||
|
|
||||||
|
# Parameters which can be customized in settings.json of the web UI.
params = {
    'enable_SD_api': False,
    'address': 'http://127.0.0.1:7860',
    'save_img': False,
    'SD_model': 'NeverEndingDream',  # not really used right now
    'prompt_prefix': '(Masterpiece:1.1), (solo:1.3), detailed, intricate, colorful',
    'negative_prompt': '(worst quality, low quality:1.3)',
    'side_length': 512,
    'restore_faces': False,
}

# TODO: get with http://{address}/sdapi/v1/sd-models and allow user to select
SD_models = ['NeverEndingDream']

# Value of the --no-stream flag at import time. input_modifier() forces the
# flag to True while a picture is pending and output_modifier() writes this
# saved value back afterwards.
streaming_state = shared.args.no_stream

# When True, the next model response is turned into a picture.
picture_response = False

# Running counter used to number the image files saved to the outputs folder.
pic_id = 0
|
||||||
|
|
||||||
|
def remove_surrounded_chars(string):
    """
    Remove every asterisk-delimited span from the given string.

    A span starts at an asterisk and runs up to and including the next
    asterisk; if there is no closing asterisk the span runs to the end of
    the string. The delimiters themselves are removed as well.
    """
    # The pattern matches 'as few symbols as possible (0 upwards) between any
    # asterisks' OR 'as few symbols as possible (0 upwards) between an
    # asterisk and the end of the string'.
    # A raw string is used so that `\*` reaches the regex engine untouched
    # instead of being an invalid string escape sequence.
    return re.sub(r'\*[^*]*?(\*|$)', '', string)
|
||||||
|
|
||||||
|
# I don't even need input_hijack for this as visible text will be committed to history as the unmodified string
|
||||||
|
def input_modifier(string):
    """
    This function is applied to your text inputs before
    they are fed into the model.

    When the integration is enabled and the text contains both a command
    keyword and a medium keyword (plain substring matches on the lowercased
    text), the input is replaced by a request for a description, the next
    response is flagged as a picture and streaming is turned off.
    Otherwise the input is returned unchanged.
    """
    global picture_response

    if not params['enable_SD_api']:
        return string

    lowered = string.lower()

    # TODO: refactor out to separate handler and also replace detection with a regexp
    has_command = any(word in lowered for word in ('send', 'mail', 'me'))
    has_medium = any(word in lowered for word in ('image', 'pic', 'picture', 'photo'))
    if not (has_command and has_medium):
        return string

    # A command signature and a medium signature were found: trigger the generation.
    picture_response = True
    # Disable streaming cause otherwise the SD-generated picture would return as a dud
    shared.args.no_stream = True
    shared.processing_message = "*Is sending a picture...*"

    # The focus of the image should be on the sending character.
    if any(word in lowered for word in ('yourself', 'own')):
        return "Please provide a detailed and vivid description of how you look and what you are wearing"

    return "Please provide a detailed description of your surroundings, how you look and the situation you're in and what you are doing right now"
|
||||||
|
|
||||||
|
# Get and save the Stable Diffusion-generated picture
|
||||||
|
def get_SD_pictures(description):
    """
    Request pictures for `description` from the Stable Diffusion API and
    return them as HTML.

    The prompt is the configured prefix followed by `description`. Every
    image in the response is optionally saved at full size under
    extensions/sd_api_pictures/outputs/ (when params['save_img'] is set),
    then shrunk to at most 300x300 and embedded as a base64 JPEG <img> tag.

    Returns the concatenated <img> tags, one per line.
    Raises requests.HTTPError if the API answers with an error status.
    """
    global pic_id

    import html  # local import: only needed to escape the alt text below

    payload = {
        "prompt": params['prompt_prefix'] + description,
        "seed": -1,
        "sampler_name": "DPM++ 2M Karras",
        "steps": 32,
        "cfg_scale": 7,
        "width": params['side_length'],
        "height": params['side_length'],
        "restore_faces": params['restore_faces'],
        "negative_prompt": params['negative_prompt'],
    }

    response = requests.post(url=f'{params["address"]}/sdapi/v1/txt2img', json=payload)
    # Fail with a clear HTTP error instead of a KeyError on 'images' below.
    response.raise_for_status()
    r = response.json()

    # The description is model output; escape it so it cannot break out of
    # the alt attribute or inject markup into the chat HTML.
    alt_text = html.escape(description, quote=True)

    tags = []
    for img_str in r['images']:
        image = Image.open(io.BytesIO(base64.b64decode(img_str.split(",", 1)[0])))
        if params['save_img']:
            output_file = Path(f'extensions/sd_api_pictures/outputs/{pic_id:06d}.png')
            # The outputs folder is not guaranteed to exist on a fresh checkout.
            output_file.parent.mkdir(parents=True, exist_ok=True)
            image.save(output_file.as_posix())
            pic_id += 1

        # lower the resolution of received images for the chat, otherwise the log size
        # gets out of control quickly with all the base64 values in visible history
        image.thumbnail((300, 300))
        buffered = io.BytesIO()
        image.save(buffered, format="JPEG")
        img_str = "data:image/jpeg;base64," + base64.b64encode(buffered.getvalue()).decode()
        tags.append(f'<img src="{img_str}" alt="{alt_text}">\n')

    return "".join(tags)
|
||||||
|
|
||||||
|
# TODO: how do I make the UI history ignore the resulting pictures (I don't want HTML to appear in history)
|
||||||
|
# and replace it with 'text' for the purposes of logging?
|
||||||
|
def output_modifier(string):
    """
    This function is applied to the model outputs.

    When a picture response is pending, the reply is cleaned up (text
    between asterisks, double quotes and newlines are removed), sent to
    Stable Diffusion as the image description, and replaced by the
    resulting image HTML followed by the description. Otherwise the reply
    is returned unchanged.
    """
    global picture_response

    if not picture_response:
        return string

    string = remove_surrounded_chars(string)
    string = string.replace('"', '')
    # Strip both the opening and the closing curly quote.
    string = string.replace('“', '')
    string = string.replace('”', '')
    string = string.replace('\n', ' ')
    string = string.strip()

    if string == '':
        string = 'no viable description in reply, try regenerating'

    # I can't for the love of all that's holy get the name from shared.gradio['name1'], so for now it will be like this
    text = f'*Description: "{string}"*'

    try:
        image = get_SD_pictures(string)
    finally:
        # Always restore the normal chat state, even if the request failed;
        # otherwise every later reply would be treated as a picture and
        # streaming would stay disabled.
        picture_response = False
        shared.processing_message = "*Is typing...*"
        shared.args.no_stream = streaming_state

    return image + "\n" + text
|
||||||
|
|
||||||
|
def bot_prefix_modifier(string):
    """
    Hook for the Bot's prefix text, which is only applied in chat mode
    and can be used to bias the Bot's behavior.

    This extension does not alter the prefix: it is handed back as-is.
    """
    prefix = string
    return prefix
|
||||||
|
|
||||||
|
def force_pic():
    """Flag the next model response to be turned into a picture."""
    global picture_response
    picture_response = True
|
||||||
|
|
||||||
|
def ui():
    """
    Build the extension's Gradio controls and wire them to `params`.

    Each control's change event writes its new value into the matching
    `params` entry. The two buttons flag the next response as a picture;
    the second one additionally triggers a chat generation right away.
    """
    # Gradio elements
    with gr.Accordion("Stable Diffusion api integration", open=True):
        with gr.Row():
            with gr.Column():
                enable = gr.Checkbox(value=params['enable_SD_api'], label='Activate SD Api integration')
                save_img = gr.Checkbox(value=params['save_img'], label='Keep original received images in the outputs subdir')
            with gr.Column():
                address = gr.Textbox(placeholder=params['address'], value=params['address'], label='Stable Diffusion host address')

        with gr.Row():
            force_btn = gr.Button("Force the next response to be a picture")
            generate_now_btn = gr.Button("Generate an image response to the input")

        with gr.Accordion("Generation parameters", open=False):
            prompt_prefix = gr.Textbox(placeholder=params['prompt_prefix'], value=params['prompt_prefix'], label='Prompt Prefix (best used to describe the look of the character)')
            with gr.Row():
                negative_prompt = gr.Textbox(placeholder=params['negative_prompt'], value=params['negative_prompt'], label='Negative Prompt')
                dimensions = gr.Slider(256, 702, value=params['side_length'], step=64, label='Image dimensions')
                # model = gr.Dropdown(value=SD_models[0], choices=SD_models, label='Model')

    # Event functions to update the parameters in the backend
    enable.change(lambda x: params.update({"enable_SD_api": x}), enable, None)
    save_img.change(lambda x: params.update({"save_img": x}), save_img, None)
    address.change(lambda x: params.update({"address": x}), address, None)
    prompt_prefix.change(lambda x: params.update({"prompt_prefix": x}), prompt_prefix, None)
    negative_prompt.change(lambda x: params.update({"negative_prompt": x}), negative_prompt, None)
    dimensions.change(lambda x: params.update({"side_length": x}), dimensions, None)
    # model.change(lambda x: params.update({"SD_model": x}), model, None)

    force_btn.click(force_pic)
    generate_now_btn.click(force_pic)
    # Reference the wrapper directly; eval() on a constant string added nothing.
    generate_now_btn.click(chat.cai_chatbot_wrapper, shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream)
|
@ -2,11 +2,11 @@ import base64
|
|||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
import gradio as gr
|
import gradio as gr
|
||||||
import torch
|
|
||||||
from transformers import BlipForConditionalGeneration, BlipProcessor
|
|
||||||
|
|
||||||
import modules.chat as chat
|
import modules.chat as chat
|
||||||
import modules.shared as shared
|
import modules.shared as shared
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
from transformers import BlipForConditionalGeneration, BlipProcessor
|
||||||
|
|
||||||
# If 'state' is True, will hijack the next chat generation with
|
# If 'state' is True, will hijack the next chat generation with
|
||||||
# custom input text given by 'value' in the format [text, visible_text]
|
# custom input text given by 'value' in the format [text, visible_text]
|
||||||
@ -25,10 +25,12 @@ def caption_image(raw_image):
|
|||||||
|
|
||||||
def generate_chat_picture(picture, name1, name2):
|
def generate_chat_picture(picture, name1, name2):
|
||||||
text = f'*{name1} sends {name2} a picture that contains the following: "{caption_image(picture)}"*'
|
text = f'*{name1} sends {name2} a picture that contains the following: "{caption_image(picture)}"*'
|
||||||
|
# lower the resolution of sent images for the chat, otherwise the log size gets out of control quickly with all the base64 values in visible history
|
||||||
|
picture.thumbnail((300, 300))
|
||||||
buffer = BytesIO()
|
buffer = BytesIO()
|
||||||
picture.save(buffer, format="JPEG")
|
picture.save(buffer, format="JPEG")
|
||||||
img_str = base64.b64encode(buffer.getvalue()).decode('utf-8')
|
img_str = base64.b64encode(buffer.getvalue()).decode('utf-8')
|
||||||
visible_text = f'<img src="data:image/jpeg;base64,{img_str}">'
|
visible_text = f'<img src="data:image/jpeg;base64,{img_str}" alt="{text}">'
|
||||||
return text, visible_text
|
return text, visible_text
|
||||||
|
|
||||||
def ui():
|
def ui():
|
||||||
|
@ -1,11 +1,11 @@
|
|||||||
|
import re
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import gradio as gr
|
import gradio as gr
|
||||||
import torch
|
|
||||||
|
|
||||||
import modules.chat as chat
|
import modules.chat as chat
|
||||||
import modules.shared as shared
|
import modules.shared as shared
|
||||||
|
import torch
|
||||||
|
|
||||||
torch._C._jit_set_profiling_mode(False)
|
torch._C._jit_set_profiling_mode(False)
|
||||||
|
|
||||||
@ -46,14 +46,9 @@ def load_model():
|
|||||||
model = load_model()
|
model = load_model()
|
||||||
|
|
||||||
def remove_surrounded_chars(string):
|
def remove_surrounded_chars(string):
|
||||||
new_string = ""
|
# this expression matches to 'as few symbols as possible (0 upwards) between any asterisks' OR
|
||||||
in_star = False
|
# 'as few symbols as possible (0 upwards) between an asterisk and the end of the string'
|
||||||
for char in string:
|
return re.sub('\*[^\*]*?(\*|$)','',string)
|
||||||
if char == '*':
|
|
||||||
in_star = not in_star
|
|
||||||
elif not in_star:
|
|
||||||
new_string += char
|
|
||||||
return new_string
|
|
||||||
|
|
||||||
def remove_tts_from_history(name1, name2):
|
def remove_tts_from_history(name1, name2):
|
||||||
for i, entry in enumerate(shared.history['internal']):
|
for i, entry in enumerate(shared.history['internal']):
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
import re
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@ -8,6 +9,7 @@ import modules.shared as shared
|
|||||||
|
|
||||||
sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
|
sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
|
||||||
import llama
|
import llama
|
||||||
|
import llama_inference_offload
|
||||||
import opt
|
import opt
|
||||||
|
|
||||||
|
|
||||||
@ -23,7 +25,10 @@ def load_quantized(model_name):
|
|||||||
model_type = shared.args.gptq_model_type.lower()
|
model_type = shared.args.gptq_model_type.lower()
|
||||||
|
|
||||||
if model_type == 'llama':
|
if model_type == 'llama':
|
||||||
load_quant = llama.load_quant
|
if not shared.args.gptq_pre_layer:
|
||||||
|
load_quant = llama.load_quant
|
||||||
|
else:
|
||||||
|
load_quant = llama_inference_offload.load_quant
|
||||||
elif model_type == 'opt':
|
elif model_type == 'opt':
|
||||||
load_quant = opt.load_quant
|
load_quant = opt.load_quant
|
||||||
else:
|
else:
|
||||||
@ -52,20 +57,28 @@ def load_quantized(model_name):
|
|||||||
print(f"Could not find {pt_model}, exiting...")
|
print(f"Could not find {pt_model}, exiting...")
|
||||||
exit()
|
exit()
|
||||||
|
|
||||||
model = load_quant(str(path_to_model), str(pt_path), shared.args.gptq_bits)
|
# qwopqwop200's offload
|
||||||
|
if shared.args.gptq_pre_layer:
|
||||||
# Multiple GPUs or GPU+CPU
|
model = load_quant(str(path_to_model), str(pt_path), shared.args.gptq_bits, shared.args.gptq_pre_layer)
|
||||||
if shared.args.gpu_memory:
|
|
||||||
max_memory = {}
|
|
||||||
for i in range(len(shared.args.gpu_memory)):
|
|
||||||
max_memory[i] = f"{shared.args.gpu_memory[i]}GiB"
|
|
||||||
max_memory['cpu'] = f"{shared.args.cpu_memory or '99'}GiB"
|
|
||||||
|
|
||||||
device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LlamaDecoderLayer"])
|
|
||||||
model = accelerate.dispatch_model(model, device_map=device_map)
|
|
||||||
|
|
||||||
# Single GPU
|
|
||||||
else:
|
else:
|
||||||
model = model.to(torch.device('cuda:0'))
|
model = load_quant(str(path_to_model), str(pt_path), shared.args.gptq_bits)
|
||||||
|
|
||||||
|
# accelerate offload (doesn't work properly)
|
||||||
|
if shared.args.gpu_memory:
|
||||||
|
memory_map = list(map(lambda x : x.strip(), shared.args.gpu_memory))
|
||||||
|
max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
|
||||||
|
max_memory = {}
|
||||||
|
for i in range(len(memory_map)):
|
||||||
|
max_memory[i] = f'{memory_map[i]}GiB' if not re.match('.*ib$', memory_map[i].lower()) else memory_map[i]
|
||||||
|
max_memory['cpu'] = max_cpu_memory
|
||||||
|
|
||||||
|
device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LlamaDecoderLayer"])
|
||||||
|
print("Using the following device map for the 4-bit model:", device_map)
|
||||||
|
# https://huggingface.co/docs/accelerate/package_reference/big_modeling#accelerate.dispatch_model
|
||||||
|
model = accelerate.dispatch_model(model, device_map=device_map, offload_buffers=True)
|
||||||
|
|
||||||
|
# No offload
|
||||||
|
elif not shared.args.cpu:
|
||||||
|
model = model.to(torch.device('cuda:0'))
|
||||||
|
|
||||||
return model
|
return model
|
||||||
|
@ -2,21 +2,36 @@ from pathlib import Path
|
|||||||
|
|
||||||
import modules.shared as shared
|
import modules.shared as shared
|
||||||
from modules.models import load_model
|
from modules.models import load_model
|
||||||
|
from modules.text_generation import clear_torch_cache
|
||||||
|
|
||||||
|
|
||||||
|
def reload_model():
    """
    Drop the currently loaded model and tokenizer, clear the torch cache,
    then load the model named by shared.model_name again.
    """
    # Release both references before clearing the cache and reloading.
    shared.model = None
    shared.tokenizer = None
    clear_torch_cache()

    reloaded = load_model(shared.model_name)
    shared.model, shared.tokenizer = reloaded
|
||||||
|
|
||||||
def add_lora_to_model(lora_name):
|
def add_lora_to_model(lora_name):
|
||||||
|
|
||||||
from peft import PeftModel
|
from peft import PeftModel
|
||||||
|
|
||||||
# Is there a more efficient way of returning to the base model?
|
# If a LoRA had been previously loaded, or if we want
|
||||||
if lora_name == "None":
|
# to unload a LoRA, reload the model
|
||||||
print("Reloading the model to remove the LoRA...")
|
if shared.lora_name != "None" or lora_name == "None":
|
||||||
shared.model, shared.tokenizer = load_model(shared.model_name)
|
reload_model()
|
||||||
else:
|
shared.lora_name = lora_name
|
||||||
# Why doesn't this work in 16-bit mode?
|
|
||||||
print(f"Adding the LoRA {lora_name} to the model...")
|
|
||||||
|
|
||||||
|
if lora_name != "None":
|
||||||
|
print(f"Adding the LoRA {lora_name} to the model...")
|
||||||
params = {}
|
params = {}
|
||||||
#params['device_map'] = {'': 0}
|
if not shared.args.cpu:
|
||||||
#params['dtype'] = shared.model.dtype
|
params['dtype'] = shared.model.dtype
|
||||||
|
if hasattr(shared.model, "hf_device_map"):
|
||||||
|
params['device_map'] = {"base_model.model."+k: v for k, v in shared.model.hf_device_map.items()}
|
||||||
|
elif shared.args.load_in_8bit:
|
||||||
|
params['device_map'] = {'': 0}
|
||||||
|
|
||||||
shared.model = PeftModel.from_pretrained(shared.model, Path(f"loras/{lora_name}"), **params)
|
shared.model = PeftModel.from_pretrained(shared.model, Path(f"loras/{lora_name}"), **params)
|
||||||
|
if not shared.args.load_in_8bit and not shared.args.cpu:
|
||||||
|
shared.model.half()
|
||||||
|
if not hasattr(shared.model, "hf_device_map"):
|
||||||
|
shared.model.cuda()
|
||||||
|
@ -45,11 +45,11 @@ class RWKVModel:
|
|||||||
token_stop = token_stop
|
token_stop = token_stop
|
||||||
)
|
)
|
||||||
|
|
||||||
return context+self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
|
return self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
|
||||||
|
|
||||||
def generate_with_streaming(self, **kwargs):
|
def generate_with_streaming(self, **kwargs):
|
||||||
with Iteratorize(self.generate, kwargs, callback=None) as generator:
|
with Iteratorize(self.generate, kwargs, callback=None) as generator:
|
||||||
reply = kwargs['context']
|
reply = ''
|
||||||
for token in generator:
|
for token in generator:
|
||||||
reply += token
|
reply += token
|
||||||
yield reply
|
yield reply
|
||||||
|
@ -11,24 +11,22 @@ import modules.shared as shared
|
|||||||
# Copied from https://github.com/PygmalionAI/gradio-ui/
|
# Copied from https://github.com/PygmalionAI/gradio-ui/
|
||||||
class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria):
|
class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria):
|
||||||
|
|
||||||
def __init__(self, sentinel_token_ids: torch.LongTensor,
|
def __init__(self, sentinel_token_ids: list[torch.LongTensor], starting_idx: int):
|
||||||
starting_idx: int):
|
|
||||||
transformers.StoppingCriteria.__init__(self)
|
transformers.StoppingCriteria.__init__(self)
|
||||||
self.sentinel_token_ids = sentinel_token_ids
|
self.sentinel_token_ids = sentinel_token_ids
|
||||||
self.starting_idx = starting_idx
|
self.starting_idx = starting_idx
|
||||||
|
|
||||||
def __call__(self, input_ids: torch.LongTensor,
|
def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool:
|
||||||
_scores: torch.FloatTensor) -> bool:
|
|
||||||
for sample in input_ids:
|
for sample in input_ids:
|
||||||
trimmed_sample = sample[self.starting_idx:]
|
trimmed_sample = sample[self.starting_idx:]
|
||||||
# Can't unfold, output is still too tiny. Skip.
|
|
||||||
if trimmed_sample.shape[-1] < self.sentinel_token_ids.shape[-1]:
|
|
||||||
continue
|
|
||||||
|
|
||||||
for window in trimmed_sample.unfold(
|
for i in range(len(self.sentinel_token_ids)):
|
||||||
0, self.sentinel_token_ids.shape[-1], 1):
|
# Can't unfold, output is still too tiny. Skip.
|
||||||
if torch.all(torch.eq(self.sentinel_token_ids, window)):
|
if trimmed_sample.shape[-1] < self.sentinel_token_ids[i].shape[-1]:
|
||||||
return True
|
continue
|
||||||
|
for window in trimmed_sample.unfold(0, self.sentinel_token_ids[i].shape[-1], 1):
|
||||||
|
if torch.all(torch.eq(self.sentinel_token_ids[i], window)):
|
||||||
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
class Stream(transformers.StoppingCriteria):
|
class Stream(transformers.StoppingCriteria):
|
||||||
|
@ -51,47 +51,37 @@ def generate_chat_prompt(user_input, max_new_tokens, name1, name2, context, chat
|
|||||||
prompt = ''.join(rows)
|
prompt = ''.join(rows)
|
||||||
return prompt
|
return prompt
|
||||||
|
|
||||||
def extract_message_from_reply(question, reply, name1, name2, check, impersonate=False):
|
def extract_message_from_reply(reply, name1, name2, check):
|
||||||
next_character_found = False
|
next_character_found = False
|
||||||
|
|
||||||
asker = name1 if not impersonate else name2
|
|
||||||
replier = name2 if not impersonate else name1
|
|
||||||
|
|
||||||
previous_idx = [m.start() for m in re.finditer(f"(^|\n){re.escape(replier)}:", question)]
|
|
||||||
idx = [m.start() for m in re.finditer(f"(^|\n){re.escape(replier)}:", reply)]
|
|
||||||
idx = idx[max(len(previous_idx)-1, 0)]
|
|
||||||
|
|
||||||
if not impersonate:
|
|
||||||
reply = reply[idx + 1 + len(apply_extensions(f"{replier}:", "bot_prefix")):]
|
|
||||||
else:
|
|
||||||
reply = reply[idx + 1 + len(f"{replier}:"):]
|
|
||||||
|
|
||||||
if check:
|
if check:
|
||||||
lines = reply.split('\n')
|
lines = reply.split('\n')
|
||||||
reply = lines[0].strip()
|
reply = lines[0].strip()
|
||||||
if len(lines) > 1:
|
if len(lines) > 1:
|
||||||
next_character_found = True
|
next_character_found = True
|
||||||
else:
|
else:
|
||||||
idx = reply.find(f"\n{asker}:")
|
for string in [f"\n{name1}:", f"\n{name2}:"]:
|
||||||
if idx != -1:
|
idx = reply.find(string)
|
||||||
reply = reply[:idx]
|
if idx != -1:
|
||||||
next_character_found = True
|
reply = reply[:idx]
|
||||||
reply = fix_newlines(reply)
|
next_character_found = True
|
||||||
|
|
||||||
# If something like "\nYo" is generated just before "\nYou:"
|
# If something like "\nYo" is generated just before "\nYou:"
|
||||||
# is completed, trim it
|
# is completed, trim it
|
||||||
next_turn = f"\n{asker}:"
|
if not next_character_found:
|
||||||
for j in range(len(next_turn)-1, 0, -1):
|
for string in [f"\n{name1}:", f"\n{name2}:"]:
|
||||||
if reply[-j:] == next_turn[:j]:
|
for j in range(len(string)-1, 0, -1):
|
||||||
reply = reply[:-j]
|
if reply[-j:] == string[:j]:
|
||||||
break
|
reply = reply[:-j]
|
||||||
|
break
|
||||||
|
|
||||||
|
reply = fix_newlines(reply)
|
||||||
return reply, next_character_found
|
return reply, next_character_found
|
||||||
|
|
||||||
def stop_everything_event():
|
def stop_everything_event():
|
||||||
shared.stop_everything = True
|
shared.stop_everything = True
|
||||||
|
|
||||||
def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1, regenerate=False):
|
def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1, regenerate=False):
|
||||||
shared.stop_everything = False
|
shared.stop_everything = False
|
||||||
just_started = True
|
just_started = True
|
||||||
eos_token = '\n' if check else None
|
eos_token = '\n' if check else None
|
||||||
@ -125,12 +115,13 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical
|
|||||||
yield shared.history['visible']+[[visible_text, shared.processing_message]]
|
yield shared.history['visible']+[[visible_text, shared.processing_message]]
|
||||||
|
|
||||||
# Generate
|
# Generate
|
||||||
reply = ''
|
cumulative_reply = ''
|
||||||
for i in range(chat_generation_attempts):
|
for i in range(chat_generation_attempts):
|
||||||
for reply in generate_reply(f"{prompt}{' ' if len(reply) > 0 else ''}{reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name1}:"):
|
for reply in generate_reply(f"{prompt}{' ' if len(cumulative_reply) > 0 else ''}{cumulative_reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_strings=[f"\n{name1}:", f"\n{name2}:"]):
|
||||||
|
reply = cumulative_reply + reply
|
||||||
|
|
||||||
# Extracting the reply
|
# Extracting the reply
|
||||||
reply, next_character_found = extract_message_from_reply(prompt, reply, name1, name2, check)
|
reply, next_character_found = extract_message_from_reply(reply, name1, name2, check)
|
||||||
visible_reply = re.sub("(<USER>|<user>|{{user}})", name1_original, reply)
|
visible_reply = re.sub("(<USER>|<user>|{{user}})", name1_original, reply)
|
||||||
visible_reply = apply_extensions(visible_reply, "output")
|
visible_reply = apply_extensions(visible_reply, "output")
|
||||||
if shared.args.chat:
|
if shared.args.chat:
|
||||||
@ -152,9 +143,11 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical
|
|||||||
if next_character_found:
|
if next_character_found:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
cumulative_reply = reply
|
||||||
|
|
||||||
yield shared.history['visible']
|
yield shared.history['visible']
|
||||||
|
|
||||||
def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1):
|
def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1):
|
||||||
eos_token = '\n' if check else None
|
eos_token = '\n' if check else None
|
||||||
|
|
||||||
if 'pygmalion' in shared.model_name.lower():
|
if 'pygmalion' in shared.model_name.lower():
|
||||||
@ -162,22 +155,27 @@ def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typ
|
|||||||
|
|
||||||
prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size, impersonate=True)
|
prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size, impersonate=True)
|
||||||
|
|
||||||
reply = ''
|
|
||||||
# Yield *Is typing...*
|
# Yield *Is typing...*
|
||||||
yield shared.processing_message
|
yield shared.processing_message
|
||||||
|
|
||||||
|
cumulative_reply = ''
|
||||||
for i in range(chat_generation_attempts):
|
for i in range(chat_generation_attempts):
|
||||||
for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name2}:"):
|
for reply in generate_reply(f"{prompt}{' ' if len(cumulative_reply) > 0 else ''}{cumulative_reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_strings=[f"\n{name1}:", f"\n{name2}:"]):
|
||||||
reply, next_character_found = extract_message_from_reply(prompt, reply, name1, name2, check, impersonate=True)
|
reply = cumulative_reply + reply
|
||||||
|
reply, next_character_found = extract_message_from_reply(reply, name1, name2, check)
|
||||||
yield reply
|
yield reply
|
||||||
if next_character_found:
|
if next_character_found:
|
||||||
break
|
break
|
||||||
yield reply
|
|
||||||
|
|
||||||
def cai_chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1):
|
cumulative_reply = reply
|
||||||
for _history in chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts):
|
|
||||||
|
yield reply
|
||||||
|
|
||||||
|
def cai_chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1):
|
||||||
|
for _history in chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, name1, name2, context, check, chat_prompt_size, chat_generation_attempts):
|
||||||
yield generate_chat_html(_history, name1, name2, shared.character)
|
yield generate_chat_html(_history, name1, name2, shared.character)
|
||||||
|
|
||||||
def regenerate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1):
|
def regenerate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1):
|
||||||
if (shared.character != 'None' and len(shared.history['visible']) == 1) or len(shared.history['internal']) == 0:
|
if (shared.character != 'None' and len(shared.history['visible']) == 1) or len(shared.history['internal']) == 0:
|
||||||
yield generate_chat_output(shared.history['visible'], name1, name2, shared.character)
|
yield generate_chat_output(shared.history['visible'], name1, name2, shared.character)
|
||||||
else:
|
else:
|
||||||
@ -185,7 +183,7 @@ def regenerate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typi
|
|||||||
last_internal = shared.history['internal'].pop()
|
last_internal = shared.history['internal'].pop()
|
||||||
# Yield '*Is typing...*'
|
# Yield '*Is typing...*'
|
||||||
yield generate_chat_output(shared.history['visible']+[[last_visible[0], shared.processing_message]], name1, name2, shared.character)
|
yield generate_chat_output(shared.history['visible']+[[last_visible[0], shared.processing_message]], name1, name2, shared.character)
|
||||||
for _history in chatbot_wrapper(last_internal[0], max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts, regenerate=True):
|
for _history in chatbot_wrapper(last_internal[0], max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, name1, name2, context, check, chat_prompt_size, chat_generation_attempts, regenerate=True):
|
||||||
if shared.args.cai_chat:
|
if shared.args.cai_chat:
|
||||||
shared.history['visible'][-1] = [last_visible[0], _history[-1][1]]
|
shared.history['visible'][-1] = [last_visible[0], _history[-1][1]]
|
||||||
else:
|
else:
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import time
|
import time
|
||||||
import zipfile
|
import zipfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -120,11 +121,12 @@ def load_model(model_name):
|
|||||||
params["torch_dtype"] = torch.float16
|
params["torch_dtype"] = torch.float16
|
||||||
|
|
||||||
if shared.args.gpu_memory:
|
if shared.args.gpu_memory:
|
||||||
memory_map = shared.args.gpu_memory
|
memory_map = list(map(lambda x : x.strip(), shared.args.gpu_memory))
|
||||||
|
max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
|
||||||
max_memory = {}
|
max_memory = {}
|
||||||
for i in range(len(memory_map)):
|
for i in range(len(memory_map)):
|
||||||
max_memory[i] = f'{memory_map[i]}GiB'
|
max_memory[i] = f'{memory_map[i]}GiB' if not re.match('.*ib$', memory_map[i].lower()) else memory_map[i]
|
||||||
max_memory['cpu'] = f'{shared.args.cpu_memory or 99}GiB'
|
max_memory['cpu'] = max_cpu_memory
|
||||||
params['max_memory'] = max_memory
|
params['max_memory'] = max_memory
|
||||||
elif shared.args.auto_devices:
|
elif shared.args.auto_devices:
|
||||||
total_mem = (torch.cuda.get_device_properties(0).total_memory / (1024*1024))
|
total_mem = (torch.cuda.get_device_properties(0).total_memory / (1024*1024))
|
||||||
|
@ -27,9 +27,9 @@ settings = {
|
|||||||
'max_new_tokens': 200,
|
'max_new_tokens': 200,
|
||||||
'max_new_tokens_min': 1,
|
'max_new_tokens_min': 1,
|
||||||
'max_new_tokens_max': 2000,
|
'max_new_tokens_max': 2000,
|
||||||
'name1': 'Person 1',
|
'name1': 'You',
|
||||||
'name2': 'Person 2',
|
'name2': 'Assistant',
|
||||||
'context': 'This is a conversation between two people.',
|
'context': 'This is a conversation with your Assistant. The Assistant is very helpful and is eager to chat with you and answer your questions.',
|
||||||
'stop_at_newline': False,
|
'stop_at_newline': False,
|
||||||
'chat_prompt_size': 2048,
|
'chat_prompt_size': 2048,
|
||||||
'chat_prompt_size_min': 0,
|
'chat_prompt_size_min': 0,
|
||||||
@ -56,7 +56,7 @@ settings = {
|
|||||||
},
|
},
|
||||||
'lora_prompts': {
|
'lora_prompts': {
|
||||||
'default': 'Common sense questions and answers\n\nQuestion: \nFactual answer:',
|
'default': 'Common sense questions and answers\n\nQuestion: \nFactual answer:',
|
||||||
'alpaca-lora-7b': "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n### Instruction:\nWrite a poem about the transformers Python library. \nMention the word \"large language models\" in that poem.\n### Response:\n"
|
'(alpaca-lora-7b|alpaca-lora-13b|alpaca-lora-30b)': "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n### Instruction:\nWrite a poem about the transformers Python library. \nMention the word \"large language models\" in that poem.\n### Response:\n"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -79,14 +79,16 @@ parser.add_argument('--cai-chat', action='store_true', help='Launch the web UI i
|
|||||||
parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text.')
|
parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text.')
|
||||||
parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.')
|
parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.')
|
||||||
parser.add_argument('--load-in-4bit', action='store_true', help='DEPRECATED: use --gptq-bits 4 instead.')
|
parser.add_argument('--load-in-4bit', action='store_true', help='DEPRECATED: use --gptq-bits 4 instead.')
|
||||||
parser.add_argument('--gptq-bits', type=int, default=0, help='Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA and OPT.')
|
parser.add_argument('--gptq-bits', type=int, default=0, help='GPTQ: Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA and OPT.')
|
||||||
parser.add_argument('--gptq-model-type', type=str, help='Model type of pre-quantized model. Currently only LLaMa and OPT are supported.')
|
parser.add_argument('--gptq-model-type', type=str, help='GPTQ: Model type of pre-quantized model. Currently only LLaMa and OPT are supported.')
|
||||||
|
parser.add_argument('--gptq-pre-layer', type=int, default=0, help='GPTQ: The number of layers to preload.')
|
||||||
parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.')
|
parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.')
|
||||||
parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
|
parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
|
||||||
parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
|
parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
|
||||||
parser.add_argument('--disk-cache-dir', type=str, default="cache", help='Directory to save the disk cache to. Defaults to "cache".')
|
parser.add_argument('--disk-cache-dir', type=str, default="cache", help='Directory to save the disk cache to. Defaults to "cache".')
|
||||||
parser.add_argument('--gpu-memory', type=int, nargs="+", help='Maxmimum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs.')
|
parser.add_argument('--gpu-memory', type=str, nargs="+", help='Maxmimum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs.')
|
||||||
parser.add_argument('--cpu-memory', type=int, help='Maximum CPU memory in GiB to allocate for offloaded weights. Must be an integer number. Defaults to 99.')
|
parser.add_argument('--cpu-memory', type=str, help='Maximum CPU memory in GiB to allocate for offloaded weights. Must be an integer number. Defaults to 99.')
|
||||||
|
parser.add_argument('--no-cache', action='store_true', help='Set use_cache to False while generating text. This reduces the VRAM usage a bit at a performance cost.')
|
||||||
parser.add_argument('--flexgen', action='store_true', help='Enable the use of FlexGen offloading.')
|
parser.add_argument('--flexgen', action='store_true', help='Enable the use of FlexGen offloading.')
|
||||||
parser.add_argument('--percent', type=int, nargs="+", default=[0, 100, 100, 0, 100, 0], help='FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0).')
|
parser.add_argument('--percent', type=int, nargs="+", default=[0, 100, 100, 0, 100, 0], help='FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0).')
|
||||||
parser.add_argument("--compress-weight", action="store_true", help="FlexGen: activate weight compression.")
|
parser.add_argument("--compress-weight", action="store_true", help="FlexGen: activate weight compression.")
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
import gc
|
import gc
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
|
import traceback
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
@ -92,30 +93,16 @@ def clear_torch_cache():
|
|||||||
if not shared.args.cpu:
|
if not shared.args.cpu:
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=None, stopping_string=None):
|
def set_manual_seed(seed):
|
||||||
clear_torch_cache()
|
if seed != -1:
|
||||||
t0 = time.time()
|
torch.manual_seed(seed)
|
||||||
|
if torch.cuda.is_available():
|
||||||
|
torch.cuda.manual_seed_all(seed)
|
||||||
|
|
||||||
# These models are not part of Hugging Face, so we handle them
|
def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=None, stopping_strings=[]):
|
||||||
# separately and terminate the function call earlier
|
clear_torch_cache()
|
||||||
if shared.is_RWKV:
|
set_manual_seed(seed)
|
||||||
try:
|
t0 = time.time()
|
||||||
if shared.args.no_stream:
|
|
||||||
reply = shared.model.generate(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k)
|
|
||||||
yield formatted_outputs(reply, shared.model_name)
|
|
||||||
else:
|
|
||||||
if not (shared.args.chat or shared.args.cai_chat):
|
|
||||||
yield formatted_outputs(question, shared.model_name)
|
|
||||||
# RWKV has proper streaming, which is very nice.
|
|
||||||
# No need to generate 8 tokens at a time.
|
|
||||||
for reply in shared.model.generate_with_streaming(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k):
|
|
||||||
yield formatted_outputs(reply, shared.model_name)
|
|
||||||
finally:
|
|
||||||
t1 = time.time()
|
|
||||||
output = encode(reply)[0]
|
|
||||||
input_ids = encode(question)
|
|
||||||
print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)")
|
|
||||||
return
|
|
||||||
|
|
||||||
original_question = question
|
original_question = question
|
||||||
if not (shared.args.chat or shared.args.cai_chat):
|
if not (shared.args.chat or shared.args.cai_chat):
|
||||||
@ -123,17 +110,46 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
|
|||||||
if shared.args.verbose:
|
if shared.args.verbose:
|
||||||
print(f"\n\n{question}\n--------------------\n")
|
print(f"\n\n{question}\n--------------------\n")
|
||||||
|
|
||||||
|
# These models are not part of Hugging Face, so we handle them
|
||||||
|
# separately and terminate the function call earlier
|
||||||
|
if shared.is_RWKV:
|
||||||
|
try:
|
||||||
|
if shared.args.no_stream:
|
||||||
|
reply = shared.model.generate(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k)
|
||||||
|
if not (shared.args.chat or shared.args.cai_chat):
|
||||||
|
reply = original_question + apply_extensions(reply, "output")
|
||||||
|
yield formatted_outputs(reply, shared.model_name)
|
||||||
|
else:
|
||||||
|
if not (shared.args.chat or shared.args.cai_chat):
|
||||||
|
yield formatted_outputs(question, shared.model_name)
|
||||||
|
|
||||||
|
# RWKV has proper streaming, which is very nice.
|
||||||
|
# No need to generate 8 tokens at a time.
|
||||||
|
for reply in shared.model.generate_with_streaming(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k):
|
||||||
|
if not (shared.args.chat or shared.args.cai_chat):
|
||||||
|
reply = original_question + apply_extensions(reply, "output")
|
||||||
|
yield formatted_outputs(reply, shared.model_name)
|
||||||
|
|
||||||
|
except Exception:
|
||||||
|
traceback.print_exc()
|
||||||
|
finally:
|
||||||
|
t1 = time.time()
|
||||||
|
output = encode(reply)[0]
|
||||||
|
input_ids = encode(question)
|
||||||
|
print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)")
|
||||||
|
return
|
||||||
|
|
||||||
input_ids = encode(question, max_new_tokens)
|
input_ids = encode(question, max_new_tokens)
|
||||||
original_input_ids = input_ids
|
original_input_ids = input_ids
|
||||||
output = input_ids[0]
|
output = input_ids[0]
|
||||||
|
|
||||||
cuda = not any((shared.args.cpu, shared.args.deepspeed, shared.args.flexgen))
|
cuda = not any((shared.args.cpu, shared.args.deepspeed, shared.args.flexgen))
|
||||||
eos_token_ids = [shared.tokenizer.eos_token_id] if shared.tokenizer.eos_token_id is not None else []
|
eos_token_ids = [shared.tokenizer.eos_token_id] if shared.tokenizer.eos_token_id is not None else []
|
||||||
if eos_token is not None:
|
if eos_token is not None:
|
||||||
eos_token_ids.append(int(encode(eos_token)[0][-1]))
|
eos_token_ids.append(int(encode(eos_token)[0][-1]))
|
||||||
stopping_criteria_list = transformers.StoppingCriteriaList()
|
stopping_criteria_list = transformers.StoppingCriteriaList()
|
||||||
if stopping_string is not None:
|
if type(stopping_strings) is list and len(stopping_strings) > 0:
|
||||||
# Copied from https://github.com/PygmalionAI/gradio-ui/blob/master/src/model.py
|
t = [encode(string, 0, add_special_tokens=False) for string in stopping_strings]
|
||||||
t = encode(stopping_string, 0, add_special_tokens=False)
|
|
||||||
stopping_criteria_list.append(_SentinelTokenStoppingCriteria(sentinel_token_ids=t, starting_idx=len(input_ids[0])))
|
stopping_criteria_list.append(_SentinelTokenStoppingCriteria(sentinel_token_ids=t, starting_idx=len(input_ids[0])))
|
||||||
|
|
||||||
generate_params = {}
|
generate_params = {}
|
||||||
@ -163,6 +179,8 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
|
|||||||
"temperature": temperature,
|
"temperature": temperature,
|
||||||
"stop": eos_token_ids[-1],
|
"stop": eos_token_ids[-1],
|
||||||
})
|
})
|
||||||
|
if shared.args.no_cache:
|
||||||
|
generate_params.update({"use_cache": False})
|
||||||
if shared.args.deepspeed:
|
if shared.args.deepspeed:
|
||||||
generate_params.update({"synced_gpus": True})
|
generate_params.update({"synced_gpus": True})
|
||||||
if shared.soft_prompt:
|
if shared.soft_prompt:
|
||||||
@ -182,9 +200,10 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
|
|||||||
if shared.soft_prompt:
|
if shared.soft_prompt:
|
||||||
output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
|
output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
|
||||||
|
|
||||||
reply = decode(output)
|
new_tokens = len(output) - len(input_ids[0])
|
||||||
|
reply = decode(output[-new_tokens:])
|
||||||
if not (shared.args.chat or shared.args.cai_chat):
|
if not (shared.args.chat or shared.args.cai_chat):
|
||||||
reply = original_question + apply_extensions(reply[len(question):], "output")
|
reply = original_question + apply_extensions(reply, "output")
|
||||||
|
|
||||||
yield formatted_outputs(reply, shared.model_name)
|
yield formatted_outputs(reply, shared.model_name)
|
||||||
|
|
||||||
@ -207,10 +226,11 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
|
|||||||
for output in generator:
|
for output in generator:
|
||||||
if shared.soft_prompt:
|
if shared.soft_prompt:
|
||||||
output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
|
output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
|
||||||
reply = decode(output)
|
|
||||||
|
|
||||||
|
new_tokens = len(output) - len(input_ids[0])
|
||||||
|
reply = decode(output[-new_tokens:])
|
||||||
if not (shared.args.chat or shared.args.cai_chat):
|
if not (shared.args.chat or shared.args.cai_chat):
|
||||||
reply = original_question + apply_extensions(reply[len(question):], "output")
|
reply = original_question + apply_extensions(reply, "output")
|
||||||
|
|
||||||
if output[-1] in eos_token_ids:
|
if output[-1] in eos_token_ids:
|
||||||
break
|
break
|
||||||
@ -226,10 +246,11 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
|
|||||||
output = shared.model.generate(**generate_params)[0]
|
output = shared.model.generate(**generate_params)[0]
|
||||||
if shared.soft_prompt:
|
if shared.soft_prompt:
|
||||||
output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
|
output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
|
||||||
reply = decode(output)
|
|
||||||
|
|
||||||
|
new_tokens = len(output) - len(original_input_ids[0])
|
||||||
|
reply = decode(output[-new_tokens:])
|
||||||
if not (shared.args.chat or shared.args.cai_chat):
|
if not (shared.args.chat or shared.args.cai_chat):
|
||||||
reply = original_question + apply_extensions(reply[len(question):], "output")
|
reply = original_question + apply_extensions(reply, "output")
|
||||||
|
|
||||||
if np.count_nonzero(np.isin(input_ids[0], eos_token_ids)) < np.count_nonzero(np.isin(output, eos_token_ids)):
|
if np.count_nonzero(np.isin(input_ids[0], eos_token_ids)) < np.count_nonzero(np.isin(output, eos_token_ids)):
|
||||||
break
|
break
|
||||||
@ -238,9 +259,15 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
|
|||||||
input_ids = np.reshape(output, (1, output.shape[0]))
|
input_ids = np.reshape(output, (1, output.shape[0]))
|
||||||
if shared.soft_prompt:
|
if shared.soft_prompt:
|
||||||
inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids)
|
inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids)
|
||||||
|
generate_params.update({"inputs_embeds": inputs_embeds})
|
||||||
|
generate_params.update({"inputs": filler_input_ids})
|
||||||
|
else:
|
||||||
|
generate_params.update({"inputs": input_ids})
|
||||||
|
|
||||||
yield formatted_outputs(reply, shared.model_name)
|
yield formatted_outputs(reply, shared.model_name)
|
||||||
|
|
||||||
|
except Exception:
|
||||||
|
traceback.print_exc()
|
||||||
finally:
|
finally:
|
||||||
t1 = time.time()
|
t1 = time.time()
|
||||||
print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(original_input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(original_input_ids[0])} tokens)")
|
print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(original_input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(original_input_ids[0])} tokens)")
|
||||||
|
@ -1,12 +1,7 @@
|
|||||||
do_sample=True
|
do_sample=True
|
||||||
temperature=1
|
top_p=0.5
|
||||||
top_p=1
|
top_k=40
|
||||||
typical_p=1
|
temperature=0.7
|
||||||
repetition_penalty=1
|
repetition_penalty=1.2
|
||||||
top_k=50
|
typical_p=1.0
|
||||||
num_beams=1
|
|
||||||
penalty_alpha=0
|
|
||||||
min_length=0
|
|
||||||
length_penalty=1
|
|
||||||
no_repeat_ngram_size=0
|
|
||||||
early_stopping=False
|
early_stopping=False
|
||||||
|
@ -1,6 +0,0 @@
|
|||||||
do_sample=True
|
|
||||||
top_p=0.9
|
|
||||||
top_k=50
|
|
||||||
temperature=1.39
|
|
||||||
repetition_penalty=1.08
|
|
||||||
typical_p=0.2
|
|
@ -6,6 +6,7 @@ markdown
|
|||||||
numpy
|
numpy
|
||||||
peft==0.2.0
|
peft==0.2.0
|
||||||
requests
|
requests
|
||||||
|
rwkv==0.7.0
|
||||||
safetensors==0.3.0
|
safetensors==0.3.0
|
||||||
sentencepiece
|
sentencepiece
|
||||||
tqdm
|
tqdm
|
||||||
|
38
server.py
38
server.py
@ -1,4 +1,3 @@
|
|||||||
import gc
|
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
@ -8,7 +7,6 @@ import zipfile
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import gradio as gr
|
import gradio as gr
|
||||||
import torch
|
|
||||||
|
|
||||||
import modules.chat as chat
|
import modules.chat as chat
|
||||||
import modules.extensions as extensions_module
|
import modules.extensions as extensions_module
|
||||||
@ -17,7 +15,7 @@ import modules.ui as ui
|
|||||||
from modules.html_generator import generate_chat_html
|
from modules.html_generator import generate_chat_html
|
||||||
from modules.LoRA import add_lora_to_model
|
from modules.LoRA import add_lora_to_model
|
||||||
from modules.models import load_model, load_soft_prompt
|
from modules.models import load_model, load_soft_prompt
|
||||||
from modules.text_generation import generate_reply
|
from modules.text_generation import clear_torch_cache, generate_reply
|
||||||
|
|
||||||
# Loading custom settings
|
# Loading custom settings
|
||||||
settings_file = None
|
settings_file = None
|
||||||
@ -56,21 +54,14 @@ def load_model_wrapper(selected_model):
|
|||||||
if selected_model != shared.model_name:
|
if selected_model != shared.model_name:
|
||||||
shared.model_name = selected_model
|
shared.model_name = selected_model
|
||||||
shared.model = shared.tokenizer = None
|
shared.model = shared.tokenizer = None
|
||||||
if not shared.args.cpu:
|
clear_torch_cache()
|
||||||
gc.collect()
|
|
||||||
torch.cuda.empty_cache()
|
|
||||||
shared.model, shared.tokenizer = load_model(shared.model_name)
|
shared.model, shared.tokenizer = load_model(shared.model_name)
|
||||||
|
|
||||||
return selected_model
|
return selected_model
|
||||||
|
|
||||||
def load_lora_wrapper(selected_lora):
|
def load_lora_wrapper(selected_lora):
|
||||||
shared.lora_name = selected_lora
|
|
||||||
default_text = shared.settings['lora_prompts'][next((k for k in shared.settings['lora_prompts'] if re.match(k.lower(), shared.lora_name.lower())), 'default')]
|
|
||||||
|
|
||||||
if not shared.args.cpu:
|
|
||||||
gc.collect()
|
|
||||||
torch.cuda.empty_cache()
|
|
||||||
add_lora_to_model(selected_lora)
|
add_lora_to_model(selected_lora)
|
||||||
|
default_text = shared.settings['lora_prompts'][next((k for k in shared.settings['lora_prompts'] if re.match(k.lower(), shared.lora_name.lower())), 'default')]
|
||||||
|
|
||||||
return selected_lora, default_text
|
return selected_lora, default_text
|
||||||
|
|
||||||
@ -102,7 +93,7 @@ def load_preset_values(preset_menu, return_dict=False):
|
|||||||
if return_dict:
|
if return_dict:
|
||||||
return generate_params
|
return generate_params
|
||||||
else:
|
else:
|
||||||
return generate_params['do_sample'], generate_params['temperature'], generate_params['top_p'], generate_params['typical_p'], generate_params['repetition_penalty'], generate_params['encoder_repetition_penalty'], generate_params['top_k'], generate_params['min_length'], generate_params['no_repeat_ngram_size'], generate_params['num_beams'], generate_params['penalty_alpha'], generate_params['length_penalty'], generate_params['early_stopping']
|
return preset_menu, generate_params['do_sample'], generate_params['temperature'], generate_params['top_p'], generate_params['typical_p'], generate_params['repetition_penalty'], generate_params['encoder_repetition_penalty'], generate_params['top_k'], generate_params['min_length'], generate_params['no_repeat_ngram_size'], generate_params['num_beams'], generate_params['penalty_alpha'], generate_params['length_penalty'], generate_params['early_stopping']
|
||||||
|
|
||||||
def upload_soft_prompt(file):
|
def upload_soft_prompt(file):
|
||||||
with zipfile.ZipFile(io.BytesIO(file)) as zf:
|
with zipfile.ZipFile(io.BytesIO(file)) as zf:
|
||||||
@ -160,6 +151,12 @@ def create_settings_menus(default_preset):
|
|||||||
shared.gradio['length_penalty'] = gr.Slider(-5, 5, value=generate_params['length_penalty'], label='length_penalty')
|
shared.gradio['length_penalty'] = gr.Slider(-5, 5, value=generate_params['length_penalty'], label='length_penalty')
|
||||||
shared.gradio['early_stopping'] = gr.Checkbox(value=generate_params['early_stopping'], label='early_stopping')
|
shared.gradio['early_stopping'] = gr.Checkbox(value=generate_params['early_stopping'], label='early_stopping')
|
||||||
|
|
||||||
|
shared.gradio['seed'] = gr.Number(value=-1, label='Seed (-1 for random)')
|
||||||
|
|
||||||
|
with gr.Row():
|
||||||
|
shared.gradio['preset_menu_mirror'] = gr.Dropdown(choices=available_presets, value=default_preset if not shared.args.flexgen else 'Naive', label='Generation parameters preset')
|
||||||
|
ui.create_refresh_button(shared.gradio['preset_menu_mirror'], lambda : None, lambda : {'choices': get_available_presets()}, 'refresh-button')
|
||||||
|
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
shared.gradio['lora_menu'] = gr.Dropdown(choices=available_loras, value=shared.lora_name, label='LoRA')
|
shared.gradio['lora_menu'] = gr.Dropdown(choices=available_loras, value=shared.lora_name, label='LoRA')
|
||||||
ui.create_refresh_button(shared.gradio['lora_menu'], lambda : None, lambda : {'choices': get_available_loras()}, 'refresh-button')
|
ui.create_refresh_button(shared.gradio['lora_menu'], lambda : None, lambda : {'choices': get_available_loras()}, 'refresh-button')
|
||||||
@ -174,7 +171,8 @@ def create_settings_menus(default_preset):
|
|||||||
shared.gradio['upload_softprompt'] = gr.File(type='binary', file_types=['.zip'])
|
shared.gradio['upload_softprompt'] = gr.File(type='binary', file_types=['.zip'])
|
||||||
|
|
||||||
shared.gradio['model_menu'].change(load_model_wrapper, [shared.gradio['model_menu']], [shared.gradio['model_menu']], show_progress=True)
|
shared.gradio['model_menu'].change(load_model_wrapper, [shared.gradio['model_menu']], [shared.gradio['model_menu']], show_progress=True)
|
||||||
shared.gradio['preset_menu'].change(load_preset_values, [shared.gradio['preset_menu']], [shared.gradio['do_sample'], shared.gradio['temperature'], shared.gradio['top_p'], shared.gradio['typical_p'], shared.gradio['repetition_penalty'], shared.gradio['encoder_repetition_penalty'], shared.gradio['top_k'], shared.gradio['min_length'], shared.gradio['no_repeat_ngram_size'], shared.gradio['num_beams'], shared.gradio['penalty_alpha'], shared.gradio['length_penalty'], shared.gradio['early_stopping']])
|
shared.gradio['preset_menu'].change(load_preset_values, [shared.gradio['preset_menu']], [shared.gradio[k] for k in ['preset_menu_mirror', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']])
|
||||||
|
shared.gradio['preset_menu_mirror'].change(load_preset_values, [shared.gradio['preset_menu_mirror']], [shared.gradio[k] for k in ['preset_menu', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']])
|
||||||
shared.gradio['lora_menu'].change(load_lora_wrapper, [shared.gradio['lora_menu']], [shared.gradio['lora_menu'], shared.gradio['textbox']], show_progress=True)
|
shared.gradio['lora_menu'].change(load_lora_wrapper, [shared.gradio['lora_menu']], [shared.gradio['lora_menu'], shared.gradio['textbox']], show_progress=True)
|
||||||
shared.gradio['softprompts_menu'].change(load_soft_prompt, [shared.gradio['softprompts_menu']], [shared.gradio['softprompts_menu']], show_progress=True)
|
shared.gradio['softprompts_menu'].change(load_soft_prompt, [shared.gradio['softprompts_menu']], [shared.gradio['softprompts_menu']], show_progress=True)
|
||||||
shared.gradio['upload_softprompt'].upload(upload_soft_prompt, [shared.gradio['upload_softprompt']], [shared.gradio['softprompts_menu']])
|
shared.gradio['upload_softprompt'].upload(upload_soft_prompt, [shared.gradio['upload_softprompt']], [shared.gradio['softprompts_menu']])
|
||||||
@ -235,9 +233,7 @@ else:
|
|||||||
shared.model_name = available_models[i]
|
shared.model_name = available_models[i]
|
||||||
shared.model, shared.tokenizer = load_model(shared.model_name)
|
shared.model, shared.tokenizer = load_model(shared.model_name)
|
||||||
if shared.args.lora:
|
if shared.args.lora:
|
||||||
print(shared.args.lora)
|
add_lora_to_model(shared.args.lora)
|
||||||
shared.lora_name = shared.args.lora
|
|
||||||
add_lora_to_model(shared.lora_name)
|
|
||||||
|
|
||||||
# Default UI settings
|
# Default UI settings
|
||||||
default_preset = shared.settings['presets'][next((k for k in shared.settings['presets'] if re.match(k.lower(), shared.model_name.lower())), 'default')]
|
default_preset = shared.settings['presets'][next((k for k in shared.settings['presets'] if re.match(k.lower(), shared.model_name.lower())), 'default')]
|
||||||
@ -325,13 +321,13 @@ def create_interface():
|
|||||||
create_settings_menus(default_preset)
|
create_settings_menus(default_preset)
|
||||||
|
|
||||||
function_call = 'chat.cai_chatbot_wrapper' if shared.args.cai_chat else 'chat.chatbot_wrapper'
|
function_call = 'chat.cai_chatbot_wrapper' if shared.args.cai_chat else 'chat.chatbot_wrapper'
|
||||||
shared.input_params = [shared.gradio[k] for k in ['textbox', 'max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'name1', 'name2', 'context', 'check', 'chat_prompt_size_slider', 'chat_generation_attempts']]
|
shared.input_params = [shared.gradio[k] for k in ['textbox', 'max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'seed', 'name1', 'name2', 'context', 'check', 'chat_prompt_size_slider', 'chat_generation_attempts']]
|
||||||
|
|
||||||
gen_events.append(shared.gradio['Generate'].click(eval(function_call), shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream))
|
gen_events.append(shared.gradio['Generate'].click(eval(function_call), shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream))
|
||||||
gen_events.append(shared.gradio['textbox'].submit(eval(function_call), shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream))
|
gen_events.append(shared.gradio['textbox'].submit(eval(function_call), shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream))
|
||||||
gen_events.append(shared.gradio['Regenerate'].click(chat.regenerate_wrapper, shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream))
|
gen_events.append(shared.gradio['Regenerate'].click(chat.regenerate_wrapper, shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream))
|
||||||
gen_events.append(shared.gradio['Impersonate'].click(chat.impersonate_wrapper, shared.input_params, shared.gradio['textbox'], show_progress=shared.args.no_stream))
|
gen_events.append(shared.gradio['Impersonate'].click(chat.impersonate_wrapper, shared.input_params, shared.gradio['textbox'], show_progress=shared.args.no_stream))
|
||||||
shared.gradio['Stop'].click(chat.stop_everything_event, [], [], cancels=gen_events)
|
shared.gradio['Stop'].click(chat.stop_everything_event, [], [], cancels=gen_events, queue=False)
|
||||||
|
|
||||||
shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, [], shared.gradio['textbox'], show_progress=shared.args.no_stream)
|
shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, [], shared.gradio['textbox'], show_progress=shared.args.no_stream)
|
||||||
shared.gradio['Replace last reply'].click(chat.replace_last_reply, [shared.gradio['textbox'], shared.gradio['name1'], shared.gradio['name2']], shared.gradio['display'], show_progress=shared.args.no_stream)
|
shared.gradio['Replace last reply'].click(chat.replace_last_reply, [shared.gradio['textbox'], shared.gradio['name1'], shared.gradio['name2']], shared.gradio['display'], show_progress=shared.args.no_stream)
|
||||||
@ -388,7 +384,7 @@ def create_interface():
|
|||||||
with gr.Tab("Parameters", elem_id="parameters"):
|
with gr.Tab("Parameters", elem_id="parameters"):
|
||||||
create_settings_menus(default_preset)
|
create_settings_menus(default_preset)
|
||||||
|
|
||||||
shared.input_params = [shared.gradio[k] for k in ['textbox', 'max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']]
|
shared.input_params = [shared.gradio[k] for k in ['textbox', 'max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'seed']]
|
||||||
output_params = [shared.gradio[k] for k in ['textbox', 'markdown', 'html']]
|
output_params = [shared.gradio[k] for k in ['textbox', 'markdown', 'html']]
|
||||||
gen_events.append(shared.gradio['Generate'].click(generate_reply, shared.input_params, output_params, show_progress=shared.args.no_stream, api_name='textgen'))
|
gen_events.append(shared.gradio['Generate'].click(generate_reply, shared.input_params, output_params, show_progress=shared.args.no_stream, api_name='textgen'))
|
||||||
gen_events.append(shared.gradio['textbox'].submit(generate_reply, shared.input_params, output_params, show_progress=shared.args.no_stream))
|
gen_events.append(shared.gradio['textbox'].submit(generate_reply, shared.input_params, output_params, show_progress=shared.args.no_stream))
|
||||||
@ -420,7 +416,7 @@ def create_interface():
|
|||||||
with gr.Tab("Parameters", elem_id="parameters"):
|
with gr.Tab("Parameters", elem_id="parameters"):
|
||||||
create_settings_menus(default_preset)
|
create_settings_menus(default_preset)
|
||||||
|
|
||||||
shared.input_params = [shared.gradio[k] for k in ['textbox', 'max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']]
|
shared.input_params = [shared.gradio[k] for k in ['textbox', 'max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'seed']]
|
||||||
output_params = [shared.gradio[k] for k in ['output_textbox', 'markdown', 'html']]
|
output_params = [shared.gradio[k] for k in ['output_textbox', 'markdown', 'html']]
|
||||||
gen_events.append(shared.gradio['Generate'].click(generate_reply, shared.input_params, output_params, show_progress=shared.args.no_stream, api_name='textgen'))
|
gen_events.append(shared.gradio['Generate'].click(generate_reply, shared.input_params, output_params, show_progress=shared.args.no_stream, api_name='textgen'))
|
||||||
gen_events.append(shared.gradio['textbox'].submit(generate_reply, shared.input_params, output_params, show_progress=shared.args.no_stream))
|
gen_events.append(shared.gradio['textbox'].submit(generate_reply, shared.input_params, output_params, show_progress=shared.args.no_stream))
|
||||||
|
@ -2,9 +2,9 @@
|
|||||||
"max_new_tokens": 200,
|
"max_new_tokens": 200,
|
||||||
"max_new_tokens_min": 1,
|
"max_new_tokens_min": 1,
|
||||||
"max_new_tokens_max": 2000,
|
"max_new_tokens_max": 2000,
|
||||||
"name1": "Person 1",
|
"name1": "You",
|
||||||
"name2": "Person 2",
|
"name2": "Assistant",
|
||||||
"context": "This is a conversation between two people.",
|
"context": "This is a conversation with your Assistant. The Assistant is very helpful and is eager to chat with you and answer your questions.",
|
||||||
"stop_at_newline": false,
|
"stop_at_newline": false,
|
||||||
"chat_prompt_size": 2048,
|
"chat_prompt_size": 2048,
|
||||||
"chat_prompt_size_min": 0,
|
"chat_prompt_size_min": 0,
|
||||||
|
Loading…
Reference in New Issue
Block a user