Merge branch 'main' into fix/api-reload

2024-11-22 08:07:56 +01:00 · 2023-03-24 16:54:41 -03:00 · 2023-03-24 16:54:41 -03:00 · bfe960731f
commit bfe960731f
parent 4a724ed22f d8e950d6bd
24 changed files with 429 additions and 196 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,6 +2,7 @@ cache/*
 characters/*
 extensions/silero_tts/outputs/*
 extensions/elevenlabs_tts/outputs/*
 extensions/sd_api_pictures/outputs/*
 logs/*
 loras/*
 models/*
--- a/README.md
+++ b/README.md
@ -84,10 +84,6 @@ pip install -r requirements.txt
 > 
 > For bitsandbytes and `--load-in-8bit` to work on Linux/WSL, this dirty fix is currently necessary: https://github.com/oobabooga/text-generation-webui/issues/400#issuecomment-1474876859
 ### Alternative: native Windows installation
 As an alternative to the recommended WSL method, you can install the web UI natively on Windows using this guide. It will be a lot harder and the performance may be slower: [Installation instructions for human beings](https://github.com/oobabooga/text-generation-webui/wiki/Installation-instructions-for-human-beings).
 ### Alternative: one-click installers
 [oobabooga-windows.zip](https://github.com/oobabooga/one-click-installers/archive/refs/heads/oobabooga-windows.zip)
@ -101,7 +97,13 @@ Just download the zip above, extract it, and double click on "install". The web
 Source codes: https://github.com/oobabooga/one-click-installers
-This method lags behind the newest developments and does not support 8-bit mode on Windows without additional set up: https://github.com/oobabooga/text-generation-webui/issues/147#issuecomment-1456040134, https://github.com/oobabooga/text-generation-webui/issues/20#issuecomment-1411650652
+> **Note**
 > 
 > To get 8-bit and 4-bit models working in your 1-click Windows installation, you can use the [one-click-bandaid](https://github.com/ClayShoaf/oobabooga-one-click-bandaid).
 ### Alternative: native Windows installation
 As an alternative to the recommended WSL method, you can install the web UI natively on Windows using this guide. It will be a lot harder and the performance may be slower: [Installation instructions for human beings](https://github.com/oobabooga/text-generation-webui/wiki/Installation-instructions-for-human-beings).
 ### Alternative: Docker
@ -175,15 +177,17 @@ Optionally, you can use the following command-line flags:
 | `--cpu`          | Use the CPU to generate text.|
 | `--load-in-8bit` | Load the model with 8-bit precision.|
 | `--load-in-4bit` | DEPRECATED: use `--gptq-bits 4` instead. |
-| `--gptq-bits GPTQ_BITS` |  Load a pre-quantized model with specified precision. 2, 3, 4 and 8 (bit) are supported. Currently only works with LLaMA and OPT. |
+| `--gptq-bits GPTQ_BITS` |  GPTQ: Load a pre-quantized model with specified precision. 2, 3, 4 and 8 (bit) are supported. Currently only works with LLaMA and OPT. |
-| `--gptq-model-type MODEL_TYPE` |  Model type of pre-quantized model. Currently only LLaMa and OPT are supported. |
+| `--gptq-model-type MODEL_TYPE` |  GPTQ: Model type of pre-quantized model. Currently only LLaMa and OPT are supported. |
 | `--gptq-pre-layer GPTQ_PRE_LAYER` |  GPTQ: The number of layers to preload. |
 | `--bf16`         | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. |
 | `--auto-devices` | Automatically split the model across the available GPU(s) and CPU.|
 | `--disk`         | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. |
 | `--disk-cache-dir DISK_CACHE_DIR` | Directory to save the disk cache to. Defaults to `cache/`. |
-|  `--gpu-memory GPU_MEMORY [GPU_MEMORY ...]` |  Maxmimum GPU memory in GiB to be allocated per GPU. Example: `--gpu-memory 10` for a single GPU, `--gpu-memory 10 5` for two GPUs. |
+|  `--gpu-memory GPU_MEMORY [GPU_MEMORY ...]` |  Maximum GPU memory in GiB to be allocated per GPU. Example: `--gpu-memory 10` for a single GPU, `--gpu-memory 10 5` for two GPUs. You can also set values in MiB like `--gpu-memory 3500MiB`. |
 | `--cpu-memory CPU_MEMORY` | Maximum CPU memory in GiB to allocate for offloaded weights. Must be an integer number. Defaults to 99.|
-| `--flexgen`      |         Enable the use of FlexGen offloading. |
+| `--no-cache`     | Set `use_cache` to False while generating text. This reduces the VRAM usage a bit with a performance cost. |
 | `--flexgen`      | Enable the use of FlexGen offloading. |
 |  `--percent PERCENT [PERCENT ...]` |  FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0). |
 |  `--compress-weight` |  FlexGen: Whether to compress weight (default: False).|
 |  `--pin-weight [PIN_WEIGHT]` |       FlexGen: whether to pin weights (setting this to False reduces CPU memory by 20%). |
@ -201,7 +205,7 @@ Optionally, you can use the following command-line flags:
 | `--auto-launch`  | Open the web UI in the default browser upon launch. |
 | `--verbose`      | Print the prompts to the terminal. |
-Out of memory errors? [Check this guide](https://github.com/oobabooga/text-generation-webui/wiki/Low-VRAM-guide).
+Out of memory errors? [Check the low VRAM guide](https://github.com/oobabooga/text-generation-webui/wiki/Low-VRAM-guide).
 ## Presets
--- a/api-example-stream.py
+++ b/api-example-stream.py
@ -34,6 +34,7 @@ async def run(context):
        'penalty_alpha': 0,
        'length_penalty': 1,
        'early_stopping': False,
        'seed': -1,
    }
    session = random_hash()
@ -44,14 +45,14 @@ async def run(context):
                case "send_hash":
                    await websocket.send(json.dumps({
                        "session_hash": session,
-                        "fn_index": 7
+                        "fn_index": 12
                    }))
                case "estimation":
                    pass
                case "send_data":
                    await websocket.send(json.dumps({
                        "session_hash": session,
-                        "fn_index": 7,
+                        "fn_index": 12,
                        "data": [
                            context,
                            params['max_new_tokens'],
@ -68,6 +69,7 @@ async def run(context):
                            params['penalty_alpha'],
                            params['length_penalty'],
                            params['early_stopping'],
                            params['seed'],
                        ]
                    }))
                case "process_starts":
--- a/api-example.py
+++ b/api-example.py
@ -32,6 +32,7 @@ params = {
    'penalty_alpha': 0,
    'length_penalty': 1,
    'early_stopping': False,
    'seed': -1,
 }
 # Input prompt
@ -54,6 +55,7 @@ response = requests.post(f"http://{server}:7860/run/textgen", json={
        params['penalty_alpha'],
        params['length_penalty'],
        params['early_stopping'],
        params['seed'],
    ]
 }).json()
--- a/css/main.css
+++ b/css/main.css
@ -50,3 +50,7 @@ ol li p, ul li p {
 #main, #parameters, #chat-settings, #interface-mode, #lora {
  border: 0;
 }
 .gradio-container-3-18-0 .prose * h1, h2, h3, h4 {
  color: white;
 }
--- a/download-model.py
+++ b/download-model.py
@ -116,10 +116,11 @@ def get_download_links_from_huggingface(model, branch):
            is_pytorch = re.match("(pytorch|adapter)_model.*\.bin", fname)
            is_safetensors = re.match("model.*\.safetensors", fname)
            is_pt = re.match(".*\.pt", fname)
            is_tokenizer = re.match("tokenizer.*\.model", fname)
-            is_text = re.match(".*\.(txt|json)", fname) or is_tokenizer
+            is_text = re.match(".*\.(txt|json|py)", fname) or is_tokenizer
-            if any((is_pytorch, is_safetensors, is_text, is_tokenizer)):
+            if any((is_pytorch, is_safetensors, is_pt, is_tokenizer, is_text)):
                if is_text:
                    links.append(f"https://huggingface.co/{model}/resolve/{branch}/{fname}")
                    classifications.append('text')
@ -132,7 +133,8 @@ def get_download_links_from_huggingface(model, branch):
                    elif is_pytorch:
                        has_pytorch = True
                        classifications.append('pytorch')
-
+                    elif is_pt:
                        classifications.append('pt')
        cursor = base64.b64encode(f'{{"file_name":"{dict[-1]["path"]}"}}'.encode()) + b':50'
        cursor = base64.b64encode(cursor)
--- a/extensions/api/script.py
+++ b/extensions/api/script.py
@ -57,6 +57,7 @@ class Handler(BaseHTTPRequestHandler):
                penalty_alpha=0, 
                length_penalty=1,
                early_stopping=False,
                seed=-1,
            )
            answer = ''
--- a/extensions/elevenlabs_tts/script.py
+++ b/extensions/elevenlabs_tts/script.py
@ -1,6 +1,8 @@
 import re
 from pathlib import Path
 import gradio as gr
 import modules.shared as shared
 from elevenlabslib import ElevenLabsUser
 from elevenlabslib.helpers import save_bytes_to_path
@ -15,7 +17,10 @@ wav_idx = 0
 user = ElevenLabsUser(params['api_key'])
 user_info = None
-
+if not shared.args.no_stream:
    print("Please add --no-stream. This extension is not meant to be used with streaming.")
    raise ValueError
 # Check if the API is valid and refresh the UI accordingly.
 def check_valid_api():
@ -47,14 +52,9 @@ def refresh_voices():
        return
 def remove_surrounded_chars(string):
-    new_string = ""
+    # this expression matches to 'as few symbols as possible (0 upwards) between any asterisks' OR
-    in_star = False
+    # 'as few symbols as possible (0 upwards) between an asterisk and the end of the string'
-    for char in string:
+    return re.sub('\*[^\*]*?(\*|$)','',string)
        if char == '*':
            in_star = not in_star
        elif not in_star:
            new_string += char
    return new_string
 def input_modifier(string):
    """
@ -110,4 +110,4 @@ def ui():
    voice.change(lambda x: params.update({'selected_voice': x}), voice, None)
    api_key.change(lambda x: params.update({'api_key': x}), api_key, None)
    connect.click(check_valid_api, [], connection_status)
-    connect.click(refresh_voices, [], voice)
+    connect.click(refresh_voices, [], voice)
--- a/extensions/sd_api_pictures/script.py
+++ b/extensions/sd_api_pictures/script.py
@ -0,0 +1,179 @@
 import base64
 import io
 import re
 from pathlib import Path
 import gradio as gr
 import modules.chat as chat
 import modules.shared as shared
 import requests
 import torch
 from PIL import Image
 torch._C._jit_set_profiling_mode(False)
 # Extension parameters. These are the defaults; they can be customized in the
 # webui's settings.json and are also updated live by the handlers wired up in ui().
 params = {
    'enable_SD_api': False,
    'address': 'http://127.0.0.1:7860',
    'save_img': False,
    'SD_model': 'NeverEndingDream', # not really used right now
    'prompt_prefix': '(Masterpiece:1.1), (solo:1.3), detailed, intricate, colorful',
    'negative_prompt': '(worst quality, low quality:1.3)',
    'side_length': 512,
    'restore_faces': False
 }
 SD_models = ['NeverEndingDream'] # TODO: get with http://{address}/sdapi/v1/sd-models and allow user to select
 streaming_state = shared.args.no_stream # snapshot of the --no-stream flag at import time; output_modifier restores it after a picture reply
 picture_response = False # specifies if the next model response should appear as a picture
 pic_id = 0 # running counter used to name images saved to the outputs directory
 def remove_surrounded_chars(string):
    """
    Strip every asterisk-delimited span from a string.

    The pattern matches 'as few symbols as possible (0 upwards) between
    two asterisks' OR 'as few symbols as possible (0 upwards) between an
    asterisk and the end of the string', so an unclosed '*' removes the
    rest of the text. The asterisks themselves are removed as well.
    """
    # Raw string literal: '\*' inside a normal string is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r'\*[^\*]*?(\*|$)', '', string)
 # I don't even need input_hijack for this as visible text will be committed to history as the unmodified string
 def input_modifier(string):
    """
    Rewrite the user's text before it is fed into the model.

    When the SD integration is enabled and the lower-cased input contains
    both a command keyword and a medium keyword (plain substring checks),
    the next reply is flagged as a picture, streaming is turned off and the
    input is replaced with a request for a description. Otherwise the
    input is returned untouched.
    """
    global picture_response

    if not params['enable_SD_api']:
        return string

    lowered = string.lower()

    def mentions_any(keywords):
        # Plain substring search, exactly like the keyword lists it replaces
        return any(word in lowered for word in keywords)

    # TODO: refactor out to separate handler and also replace detection with a regexp
    # Trigger the generation only if a command signature and a medium signature are both found
    wants_picture = mentions_any(('send', 'mail', 'me')) and mentions_any(('image', 'pic', 'picture', 'photo'))
    if not wants_picture:
        return string

    picture_response = True
    # Disable streaming cause otherwise the SD-generated picture would return as a dud
    shared.args.no_stream = True
    shared.processing_message = "*Is sending a picture...*"

    # The focus of the image should be on the sending character
    if mentions_any(('yourself', 'own')):
        return "Please provide a detailed and vivid description of how you look and what you are wearing"
    return "Please provide a detailed description of your surroundings, how you look and the situation you're in and what you are doing right now"
 # Get and save the Stable Diffusion-generated picture
 def get_SD_pictures(description):
    """
    Request images from the Stable Diffusion API and return them as HTML.

    Posts a txt2img request to params['address'] with the prompt prefix plus
    `description`, optionally saves each full-size image to the extension's
    outputs directory, and returns a string of <img> tags holding
    JPEG thumbnails (at most 300x300) encoded as base64 data URIs.

    Raises requests.HTTPError when the API answers with an error status.
    """
    import html  # local import so the block does not depend on a file-level import

    global pic_id

    payload = {
        "prompt": params['prompt_prefix'] + description,
        "seed": -1,
        "sampler_name": "DPM++ 2M Karras",
        "steps": 32,
        "cfg_scale": 7,
        "width": params['side_length'],
        "height": params['side_length'],
        "restore_faces": params['restore_faces'],
        "negative_prompt": params['negative_prompt']
    }
    response = requests.post(url=f'{params["address"]}/sdapi/v1/txt2img', json=payload)
    # Fail with a clear HTTP error instead of a confusing KeyError on r['images'] below
    response.raise_for_status()
    r = response.json()

    # The description ends up inside a double-quoted HTML attribute, so escape it
    alt_text = html.escape(description, quote=True)

    visible_result = ""
    for img_str in r['images']:
        image = Image.open(io.BytesIO(base64.b64decode(img_str.split(",",1)[0])))
        if params['save_img']:
            output_file = Path(f'extensions/sd_api_pictures/outputs/{pic_id:06d}.png')
            # The outputs directory is git-ignored and may not exist yet
            output_file.parent.mkdir(parents=True, exist_ok=True)
            image.save(output_file.as_posix())
            pic_id += 1
        # lower the resolution of received images for the chat, otherwise the log size gets out of control quickly with all the base64 values in visible history
        image.thumbnail((300, 300))
        buffered = io.BytesIO()
        image.save(buffered, format="JPEG")
        # getvalue() returns the whole buffer regardless of the stream position
        image_bytes = buffered.getvalue()
        img_str = "data:image/jpeg;base64," + base64.b64encode(image_bytes).decode()
        visible_result = visible_result + f'<img src="{img_str}" alt="{alt_text}">\n'
    return visible_result
 # TODO: how do I make the UI history ignore the resulting pictures (I don't want HTML to appear in history)
 # and replace it with 'text' for the purposes of logging?
 def output_modifier(string):
    """
    Turn the model output into a picture reply when one was requested.

    If no picture is pending, the output is returned unchanged. Otherwise
    the text is cleaned up (asterisk spans, double quotes and newlines
    removed), sent to Stable Diffusion as the image description, and the
    resulting <img> HTML is returned followed by the description text.
    """
    global picture_response

    if not picture_response:
        return string

    string = remove_surrounded_chars(string)
    string = string.replace('"', '')
    string = string.replace('“', '')
    string = string.replace('\n', ' ')
    string = string.strip()
    if string == '':
        string = 'no viable description in reply, try regenerating'

    # I can't for the love of all that's holy get the name from shared.gradio['name1'], so for now it will be like this
    text = f'*Description: "{string}"*'

    try:
        image = get_SD_pictures(string)
    finally:
        # Always leave picture mode, even when the SD request fails; otherwise
        # streaming stays disabled and the "sending a picture" message sticks.
        picture_response = False
        shared.processing_message = "*Is typing...*"
        shared.args.no_stream = streaming_state

    return image + "\n" + text
 def bot_prefix_modifier(string):
    """
    Hook applied to the Bot's prefix text in chat mode.

    It could be used to bias the bot's behavior; this extension does not
    need that, so the prefix is handed back exactly as received.
    """
    unchanged_prefix = string
    return unchanged_prefix
 def force_pic():
    """Flag the next model response to be delivered as a picture."""
    global picture_response
    picture_response = True
    return None
 def ui():
    """
    Build the extension's Gradio controls and wire them to `params`.

    Creates the enable/save checkboxes, the host address box, the two
    picture buttons and the generation parameter inputs, then registers
    change/click handlers that update `params` or trigger a picture reply.
    """
    # Gradio elements
    with gr.Accordion("Stable Diffusion api integration", open=True):
        with gr.Row():
            with gr.Column():
                enable = gr.Checkbox(value=params['enable_SD_api'], label='Activate SD Api integration')
                save_img = gr.Checkbox(value=params['save_img'], label='Keep original received images in the outputs subdir')
            with gr.Column():
                address = gr.Textbox(placeholder=params['address'], value=params['address'], label='Stable Diffusion host address')
        with gr.Row():
            force_btn = gr.Button("Force the next response to be a picture")
            generate_now_btn = gr.Button("Generate an image response to the input")
        with gr.Accordion("Generation parameters", open=False):
            prompt_prefix = gr.Textbox(placeholder=params['prompt_prefix'], value=params['prompt_prefix'], label='Prompt Prefix (best used to describe the look of the character)')
            with gr.Row():
                negative_prompt = gr.Textbox(placeholder=params['negative_prompt'], value=params['negative_prompt'], label='Negative Prompt')
                # NOTE(review): 702 is not on the 64-step grid starting at 256 (next step is 704) - confirm the intended maximum
                dimensions = gr.Slider(256,702,value=params['side_length'],step=64,label='Image dimensions')
                # model = gr.Dropdown(value=SD_models[0], choices=SD_models, label='Model')

    # Event functions to update the parameters in the backend
    enable.change(lambda x: params.update({"enable_SD_api": x}), enable, None)
    save_img.change(lambda x: params.update({"save_img": x}), save_img, None)
    address.change(lambda x: params.update({"address": x}), address, None)
    prompt_prefix.change(lambda x: params.update({"prompt_prefix": x}), prompt_prefix, None)
    negative_prompt.change(lambda x: params.update({"negative_prompt": x}), negative_prompt, None)
    dimensions.change(lambda x: params.update({"side_length": x}), dimensions, None)
    # model.change(lambda x: params.update({"SD_model": x}), model, None)

    force_btn.click(force_pic)
    # Two handlers on the same button: flag the picture reply, then run the chat generation
    generate_now_btn.click(force_pic)
    # Reference the wrapper directly; evaluating a constant string with eval() adds nothing
    generate_now_btn.click(chat.cai_chatbot_wrapper, shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream)
--- a/extensions/send_pictures/script.py
+++ b/extensions/send_pictures/script.py
@ -2,11 +2,11 @@ import base64
 from io import BytesIO
 import gradio as gr
 import torch
 from transformers import BlipForConditionalGeneration, BlipProcessor
 import modules.chat as chat
 import modules.shared as shared
 import torch
 from PIL import Image
 from transformers import BlipForConditionalGeneration, BlipProcessor
 # If 'state' is True, will hijack the next chat generation with
 # custom input text given by 'value' in the format [text, visible_text]
@ -25,10 +25,12 @@ def caption_image(raw_image):
 def generate_chat_picture(picture, name1, name2):
    text = f'*{name1} sends {name2} a picture that contains the following: "{caption_image(picture)}"*'
    # lower the resolution of sent images for the chat, otherwise the log size gets out of control quickly with all the base64 values in visible history
    picture.thumbnail((300, 300))
    buffer = BytesIO()
    picture.save(buffer, format="JPEG")
    img_str = base64.b64encode(buffer.getvalue()).decode('utf-8')
-    visible_text = f'<img src="data:image/jpeg;base64,{img_str}">'
+    visible_text = f'<img src="data:image/jpeg;base64,{img_str}" alt="{text}">'
    return text, visible_text
 def ui():
--- a/extensions/silero_tts/script.py
+++ b/extensions/silero_tts/script.py
@ -1,11 +1,11 @@
 import re
 import time
 from pathlib import Path
 import gradio as gr
 import torch
 import modules.chat as chat
 import modules.shared as shared
 import torch
 torch._C._jit_set_profiling_mode(False)
@ -46,14 +46,9 @@ def load_model():
 model = load_model()
 def remove_surrounded_chars(string):
-    new_string = ""
+    # this expression matches to 'as few symbols as possible (0 upwards) between any asterisks' OR
-    in_star = False
+    # 'as few symbols as possible (0 upwards) between an asterisk and the end of the string'
-    for char in string:
+    return re.sub('\*[^\*]*?(\*|$)','',string)
        if char == '*':
            in_star = not in_star
        elif not in_star:
            new_string += char
    return new_string
 def remove_tts_from_history(name1, name2):
    for i, entry in enumerate(shared.history['internal']):
@ -166,4 +161,4 @@ def ui():
    autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None)
    voice.change(lambda x: params.update({"speaker": x}), voice, None)
    v_pitch.change(lambda x: params.update({"voice_pitch": x}), v_pitch, None)
-    v_speed.change(lambda x: params.update({"voice_speed": x}), v_speed, None)
+    v_speed.change(lambda x: params.update({"voice_speed": x}), v_speed, None)
--- a/modules/GPTQ_loader.py
+++ b/modules/GPTQ_loader.py
@ -1,3 +1,4 @@
 import re
 import sys
 from pathlib import Path
@ -8,6 +9,7 @@ import modules.shared as shared
 sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
 import llama
 import llama_inference_offload
 import opt
@ -23,7 +25,10 @@ def load_quantized(model_name):
        model_type = shared.args.gptq_model_type.lower()
    if model_type == 'llama':
-        load_quant = llama.load_quant
+        if not shared.args.gptq_pre_layer:
            load_quant = llama.load_quant
        else:
            load_quant = llama_inference_offload.load_quant
    elif model_type == 'opt':
        load_quant = opt.load_quant
    else:
@ -52,20 +57,28 @@ def load_quantized(model_name):
        print(f"Could not find {pt_model}, exiting...")
        exit()
-    model = load_quant(str(path_to_model), str(pt_path), shared.args.gptq_bits)
+    # qwopqwop200's offload
-
+    if shared.args.gptq_pre_layer:
-    # Multiple GPUs or GPU+CPU
+        model = load_quant(str(path_to_model), str(pt_path), shared.args.gptq_bits, shared.args.gptq_pre_layer)
    if shared.args.gpu_memory:
        max_memory = {}
        for i in range(len(shared.args.gpu_memory)):
            max_memory[i] = f"{shared.args.gpu_memory[i]}GiB"
        max_memory['cpu'] = f"{shared.args.cpu_memory or '99'}GiB"
        device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LlamaDecoderLayer"])
        model = accelerate.dispatch_model(model, device_map=device_map)
    # Single GPU
    else:
-        model = model.to(torch.device('cuda:0'))
+        model = load_quant(str(path_to_model), str(pt_path), shared.args.gptq_bits)
        # accelerate offload (doesn't work properly)
        if shared.args.gpu_memory:
            memory_map = list(map(lambda x : x.strip(), shared.args.gpu_memory))
            max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
            max_memory = {}
            for i in range(len(memory_map)):
                max_memory[i] = f'{memory_map[i]}GiB' if not re.match('.*ib$', memory_map[i].lower()) else memory_map[i]
            max_memory['cpu'] = max_cpu_memory
            device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LlamaDecoderLayer"])
            print("Using the following device map for the 4-bit model:", device_map)
            # https://huggingface.co/docs/accelerate/package_reference/big_modeling#accelerate.dispatch_model
            model = accelerate.dispatch_model(model, device_map=device_map, offload_buffers=True)
        # No offload
        elif not shared.args.cpu:
            model = model.to(torch.device('cuda:0'))
    return model
--- a/modules/LoRA.py
+++ b/modules/LoRA.py
@ -2,21 +2,36 @@ from pathlib import Path
 import modules.shared as shared
 from modules.models import load_model
 from modules.text_generation import clear_torch_cache
 def reload_model():
    """
    Discard the currently loaded model and tokenizer and load them again.

    The references are cleared and the torch cache is emptied before
    load_model is called for shared.model_name.
    """
    # Drop both references first so the cache clear happens with nothing held here
    shared.model = None
    shared.tokenizer = None
    clear_torch_cache()

    model, tokenizer = load_model(shared.model_name)
    shared.model = model
    shared.tokenizer = tokenizer
 def add_lora_to_model(lora_name):
    from peft import PeftModel
-    # Is there a more efficient way of returning to the base model?
+    # If a LoRA had been previously loaded, or if we want
-    if lora_name == "None":
+    # to unload a LoRA, reload the model
-        print("Reloading the model to remove the LoRA...")
+    if shared.lora_name != "None" or lora_name == "None":
-        shared.model, shared.tokenizer = load_model(shared.model_name)
+        reload_model()
-    else:
+    shared.lora_name = lora_name
        # Why doesn't this work in 16-bit mode?
        print(f"Adding the LoRA {lora_name} to the model...")
    if lora_name != "None":
        print(f"Adding the LoRA {lora_name} to the model...")
        params = {}
-        #params['device_map'] = {'': 0}
+        if not shared.args.cpu:
-        #params['dtype'] = shared.model.dtype
+            params['dtype'] = shared.model.dtype
            if hasattr(shared.model, "hf_device_map"):
                params['device_map'] = {"base_model.model."+k: v for k, v in shared.model.hf_device_map.items()}
            elif shared.args.load_in_8bit:
                params['device_map'] = {'': 0}
        shared.model = PeftModel.from_pretrained(shared.model, Path(f"loras/{lora_name}"), **params)
        if not shared.args.load_in_8bit and not shared.args.cpu:
            shared.model.half()
            if not hasattr(shared.model, "hf_device_map"):
                shared.model.cuda()
--- a/modules/RWKV.py
+++ b/modules/RWKV.py
@ -45,11 +45,11 @@ class RWKVModel:
            token_stop = token_stop
        )
-        return context+self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
+        return self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
    def generate_with_streaming(self, **kwargs):
        with Iteratorize(self.generate, kwargs, callback=None) as generator:
-            reply = kwargs['context']
+            reply = ''
            for token in generator:
                reply += token
                yield reply
--- a/modules/callbacks.py
+++ b/modules/callbacks.py
@ -11,24 +11,22 @@ import modules.shared as shared
 # Copied from https://github.com/PygmalionAI/gradio-ui/
 class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria):
-    def __init__(self, sentinel_token_ids: torch.LongTensor,
+    def __init__(self, sentinel_token_ids: list[torch.LongTensor], starting_idx: int):
                 starting_idx: int):
        transformers.StoppingCriteria.__init__(self)
        self.sentinel_token_ids = sentinel_token_ids
        self.starting_idx = starting_idx
-    def __call__(self, input_ids: torch.LongTensor,
+    def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool:
                 _scores: torch.FloatTensor) -> bool:
        for sample in input_ids:
            trimmed_sample = sample[self.starting_idx:]
            # Can't unfold, output is still too tiny. Skip.
            if trimmed_sample.shape[-1] < self.sentinel_token_ids.shape[-1]:
                continue
-            for window in trimmed_sample.unfold(
+            for i in range(len(self.sentinel_token_ids)):
-                    0, self.sentinel_token_ids.shape[-1], 1):
+                # Can't unfold, output is still too tiny. Skip.
-                if torch.all(torch.eq(self.sentinel_token_ids, window)):
+                if trimmed_sample.shape[-1] < self.sentinel_token_ids[i].shape[-1]:
-                    return True
+                    continue
                for window in trimmed_sample.unfold(0, self.sentinel_token_ids[i].shape[-1], 1):
                    if torch.all(torch.eq(self.sentinel_token_ids[i], window)):
                        return True
        return False
 class Stream(transformers.StoppingCriteria):
--- a/modules/chat.py
+++ b/modules/chat.py
@ -51,47 +51,37 @@ def generate_chat_prompt(user_input, max_new_tokens, name1, name2, context, chat
    prompt = ''.join(rows)
    return prompt
-def extract_message_from_reply(question, reply, name1, name2, check, impersonate=False):
+def extract_message_from_reply(reply, name1, name2, check):
    next_character_found = False
    asker = name1 if not impersonate else name2
    replier = name2 if not impersonate else name1
    previous_idx = [m.start() for m in re.finditer(f"(^|\n){re.escape(replier)}:", question)]
    idx = [m.start() for m in re.finditer(f"(^|\n){re.escape(replier)}:", reply)]
    idx = idx[max(len(previous_idx)-1, 0)]
    if not impersonate:
        reply = reply[idx + 1 + len(apply_extensions(f"{replier}:", "bot_prefix")):]
    else:
        reply = reply[idx + 1 + len(f"{replier}:"):]
    if check:
        lines = reply.split('\n')
        reply = lines[0].strip()
        if len(lines) > 1:
            next_character_found = True
    else:
-        idx = reply.find(f"\n{asker}:")
+        for string in [f"\n{name1}:", f"\n{name2}:"]:
-        if idx != -1:
+            idx = reply.find(string)
-            reply = reply[:idx]
+            if idx != -1:
-            next_character_found = True
+                reply = reply[:idx]
-        reply = fix_newlines(reply)
+                next_character_found = True
        # If something like "\nYo" is generated just before "\nYou:"
        # is completed, trim it
-        next_turn = f"\n{asker}:"
+        if not next_character_found:
-        for j in range(len(next_turn)-1, 0, -1):
+            for string in [f"\n{name1}:", f"\n{name2}:"]:
-            if reply[-j:] == next_turn[:j]:
+                for j in range(len(string)-1, 0, -1):
-                reply = reply[:-j]
+                    if reply[-j:] == string[:j]:
-                break
+                        reply = reply[:-j]
                        break
    reply = fix_newlines(reply)
    return reply, next_character_found
 def stop_everything_event():
    shared.stop_everything = True
-def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1, regenerate=False):
+def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1, regenerate=False):
    shared.stop_everything = False
    just_started = True
    eos_token = '\n' if check else None
@ -125,12 +115,13 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical
        yield shared.history['visible']+[[visible_text, shared.processing_message]]
    # Generate
-    reply = ''
+    cumulative_reply = ''
    for i in range(chat_generation_attempts):
-        for reply in generate_reply(f"{prompt}{' ' if len(reply) > 0 else ''}{reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name1}:"):
+        for reply in generate_reply(f"{prompt}{' ' if len(cumulative_reply) > 0 else ''}{cumulative_reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_strings=[f"\n{name1}:", f"\n{name2}:"]):
            reply = cumulative_reply + reply
            # Extracting the reply
-            reply, next_character_found = extract_message_from_reply(prompt, reply, name1, name2, check)
+            reply, next_character_found = extract_message_from_reply(reply, name1, name2, check)
            visible_reply = re.sub("(<USER>|<user>|{{user}})", name1_original, reply)
            visible_reply = apply_extensions(visible_reply, "output")
            if shared.args.chat:
@ -152,9 +143,11 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical
            if next_character_found:
                break
        cumulative_reply = reply
    yield shared.history['visible']
-def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1):
+def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1):
    eos_token = '\n' if check else None
    if 'pygmalion' in shared.model_name.lower():
@ -162,22 +155,27 @@ def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typ
    prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size, impersonate=True)
    reply = ''
    # Yield *Is typing...*
    yield shared.processing_message
    cumulative_reply = ''
    for i in range(chat_generation_attempts):
-        for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name2}:"):
+        for reply in generate_reply(f"{prompt}{' ' if len(cumulative_reply) > 0 else ''}{cumulative_reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_strings=[f"\n{name1}:", f"\n{name2}:"]):
-            reply, next_character_found = extract_message_from_reply(prompt, reply, name1, name2, check, impersonate=True)
+            reply = cumulative_reply + reply
            reply, next_character_found = extract_message_from_reply(reply, name1, name2, check)
            yield reply
            if next_character_found:
                break
        yield reply
-def cai_chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1):
+        cumulative_reply = reply
-    for _history in chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts):
+
    yield reply
 def cai_chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1):
    for _history in chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, name1, name2, context, check, chat_prompt_size, chat_generation_attempts):
        yield generate_chat_html(_history, name1, name2, shared.character)
-def regenerate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1):
+def regenerate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1):
    if (shared.character != 'None' and len(shared.history['visible']) == 1) or len(shared.history['internal']) == 0:
        yield generate_chat_output(shared.history['visible'], name1, name2, shared.character)
    else:
@ -185,7 +183,7 @@ def regenerate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typi
        last_internal = shared.history['internal'].pop()
        # Yield '*Is typing...*'
        yield generate_chat_output(shared.history['visible']+[[last_visible[0], shared.processing_message]], name1, name2, shared.character)
-        for _history in chatbot_wrapper(last_internal[0], max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts, regenerate=True):
+        for _history in chatbot_wrapper(last_internal[0], max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, name1, name2, context, check, chat_prompt_size, chat_generation_attempts, regenerate=True):
            if shared.args.cai_chat:
                shared.history['visible'][-1] = [last_visible[0], _history[-1][1]]
            else:
--- a/modules/models.py
+++ b/modules/models.py
@ -1,5 +1,6 @@
 import json
 import os
 import re
 import time
 import zipfile
 from pathlib import Path
@ -120,11 +121,12 @@ def load_model(model_name):
                params["torch_dtype"] = torch.float16
            if shared.args.gpu_memory:
-                memory_map = shared.args.gpu_memory
+                memory_map = list(map(lambda x : x.strip(), shared.args.gpu_memory))
                max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
                max_memory = {}
                for i in range(len(memory_map)):
-                    max_memory[i] = f'{memory_map[i]}GiB'
+                    max_memory[i] = f'{memory_map[i]}GiB' if not re.match('.*ib$', memory_map[i].lower()) else memory_map[i]
-                max_memory['cpu'] = f'{shared.args.cpu_memory or 99}GiB'
+                max_memory['cpu'] = max_cpu_memory
                params['max_memory'] = max_memory
            elif shared.args.auto_devices:
                total_mem = (torch.cuda.get_device_properties(0).total_memory / (1024*1024))
--- a/modules/shared.py
+++ b/modules/shared.py
@ -27,9 +27,9 @@ settings = {
    'max_new_tokens': 200,
    'max_new_tokens_min': 1,
    'max_new_tokens_max': 2000,
-    'name1': 'Person 1',
+    'name1': 'You',
-    'name2': 'Person 2',
+    'name2': 'Assistant',
-    'context': 'This is a conversation between two people.',
+    'context': 'This is a conversation with your Assistant. The Assistant is very helpful and is eager to chat with you and answer your questions.',
    'stop_at_newline': False,
    'chat_prompt_size': 2048,
    'chat_prompt_size_min': 0,
@ -56,7 +56,7 @@ settings = {
    },
    'lora_prompts': {
        'default': 'Common sense questions and answers\n\nQuestion: \nFactual answer:',
-        'alpaca-lora-7b': "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n### Instruction:\nWrite a poem about the transformers Python library. \nMention the word \"large language models\" in that poem.\n### Response:\n"
+        '(alpaca-lora-7b|alpaca-lora-13b|alpaca-lora-30b)': "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n### Instruction:\nWrite a poem about the transformers Python library. \nMention the word \"large language models\" in that poem.\n### Response:\n"
    }
 }
@ -79,14 +79,16 @@ parser.add_argument('--cai-chat', action='store_true', help='Launch the web UI i
 parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text.')
 parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.')
 parser.add_argument('--load-in-4bit', action='store_true', help='DEPRECATED: use --gptq-bits 4 instead.')
-parser.add_argument('--gptq-bits', type=int, default=0, help='Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA and OPT.')
+parser.add_argument('--gptq-bits', type=int, default=0, help='GPTQ: Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA and OPT.')
-parser.add_argument('--gptq-model-type', type=str, help='Model type of pre-quantized model. Currently only LLaMa and OPT are supported.')
+parser.add_argument('--gptq-model-type', type=str, help='GPTQ: Model type of pre-quantized model. Currently only LLaMa and OPT are supported.')
 parser.add_argument('--gptq-pre-layer', type=int, default=0, help='GPTQ: The number of layers to preload.')
 parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.')
 parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
 parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
 parser.add_argument('--disk-cache-dir', type=str, default="cache", help='Directory to save the disk cache to. Defaults to "cache".')
-parser.add_argument('--gpu-memory', type=int, nargs="+", help='Maxmimum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs.')
+parser.add_argument('--gpu-memory', type=str, nargs="+", help='Maxmimum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs.')
-parser.add_argument('--cpu-memory', type=int, help='Maximum CPU memory in GiB to allocate for offloaded weights. Must be an integer number. Defaults to 99.')
+parser.add_argument('--cpu-memory', type=str, help='Maximum CPU memory in GiB to allocate for offloaded weights. Must be an integer number. Defaults to 99.')
 parser.add_argument('--no-cache', action='store_true', help='Set use_cache to False while generating text. This reduces the VRAM usage a bit at a performance cost.')
 parser.add_argument('--flexgen', action='store_true', help='Enable the use of FlexGen offloading.')
 parser.add_argument('--percent', type=int, nargs="+", default=[0, 100, 100, 0, 100, 0], help='FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0).')
 parser.add_argument("--compress-weight", action="store_true", help="FlexGen: activate weight compression.")
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@ -1,6 +1,7 @@
 import gc
 import re
 import time
 import traceback
 import numpy as np
 import torch
@ -92,30 +93,16 @@ def clear_torch_cache():
    if not shared.args.cpu:
        torch.cuda.empty_cache()
-def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=None, stopping_string=None):
+def set_manual_seed(seed):
-    clear_torch_cache()
+    if seed != -1:
-    t0 = time.time()
+        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)
-    # These models are not part of Hugging Face, so we handle them
+def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=None, stopping_strings=[]):
-    # separately and terminate the function call earlier
+    clear_torch_cache()
-    if shared.is_RWKV:
+    set_manual_seed(seed)
-        try:
+    t0 = time.time()
            if shared.args.no_stream:
                reply = shared.model.generate(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k)
                yield formatted_outputs(reply, shared.model_name)
            else:
                if not (shared.args.chat or shared.args.cai_chat):
                    yield formatted_outputs(question, shared.model_name)
                # RWKV has proper streaming, which is very nice.
                # No need to generate 8 tokens at a time.
                for reply in shared.model.generate_with_streaming(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k):
                    yield formatted_outputs(reply, shared.model_name)
        finally:
            t1 = time.time()
            output = encode(reply)[0]
            input_ids = encode(question)
            print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)")
            return
    original_question = question
    if not (shared.args.chat or shared.args.cai_chat):
@ -123,17 +110,46 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
    if shared.args.verbose:
        print(f"\n\n{question}\n--------------------\n")
    # These models are not part of Hugging Face, so we handle them
    # separately and terminate the function call earlier
    if shared.is_RWKV:
        try:
            if shared.args.no_stream:
                reply = shared.model.generate(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k)
                if not (shared.args.chat or shared.args.cai_chat):
                    reply = original_question + apply_extensions(reply, "output")
                yield formatted_outputs(reply, shared.model_name)
            else:
                if not (shared.args.chat or shared.args.cai_chat):
                    yield formatted_outputs(question, shared.model_name)
                # RWKV has proper streaming, which is very nice.
                # No need to generate 8 tokens at a time.
                for reply in shared.model.generate_with_streaming(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k):
                    if not (shared.args.chat or shared.args.cai_chat):
                        reply = original_question + apply_extensions(reply, "output")
                    yield formatted_outputs(reply, shared.model_name)
        except Exception:
            traceback.print_exc()
        finally:
            t1 = time.time()
            output = encode(reply)[0]
            input_ids = encode(question)
            print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)")
            return
    input_ids = encode(question, max_new_tokens)
    original_input_ids = input_ids
    output = input_ids[0]
    cuda = not any((shared.args.cpu, shared.args.deepspeed, shared.args.flexgen))
    eos_token_ids = [shared.tokenizer.eos_token_id] if shared.tokenizer.eos_token_id is not None else []
    if eos_token is not None:
        eos_token_ids.append(int(encode(eos_token)[0][-1]))
    stopping_criteria_list = transformers.StoppingCriteriaList()
-    if stopping_string is not None:
+    if type(stopping_strings) is list and len(stopping_strings) > 0:
-        # Copied from https://github.com/PygmalionAI/gradio-ui/blob/master/src/model.py
+        t = [encode(string, 0, add_special_tokens=False) for string in stopping_strings]
        t = encode(stopping_string, 0, add_special_tokens=False)
        stopping_criteria_list.append(_SentinelTokenStoppingCriteria(sentinel_token_ids=t, starting_idx=len(input_ids[0])))
    generate_params = {}
@ -163,6 +179,8 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
            "temperature": temperature,
            "stop": eos_token_ids[-1],
        })
    if shared.args.no_cache:
        generate_params.update({"use_cache": False})
    if shared.args.deepspeed:
        generate_params.update({"synced_gpus": True})
    if shared.soft_prompt:
@ -182,9 +200,10 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
            if shared.soft_prompt:
                output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
-            reply = decode(output)
+            new_tokens = len(output) - len(input_ids[0])
            reply = decode(output[-new_tokens:])
            if not (shared.args.chat or shared.args.cai_chat):
-                reply = original_question + apply_extensions(reply[len(question):], "output")
+                reply = original_question + apply_extensions(reply, "output")
            yield formatted_outputs(reply, shared.model_name)
@ -207,10 +226,11 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
                for output in generator:
                    if shared.soft_prompt:
                        output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
                    reply = decode(output)
                    new_tokens = len(output) - len(input_ids[0])
                    reply = decode(output[-new_tokens:])
                    if not (shared.args.chat or shared.args.cai_chat):
-                        reply = original_question + apply_extensions(reply[len(question):], "output")
+                        reply = original_question + apply_extensions(reply, "output")
                    if output[-1] in eos_token_ids:
                        break
@ -226,10 +246,11 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
                    output = shared.model.generate(**generate_params)[0]
                if shared.soft_prompt:
                    output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
                reply = decode(output)
                new_tokens = len(output) - len(original_input_ids[0])
                reply = decode(output[-new_tokens:])
                if not (shared.args.chat or shared.args.cai_chat):
-                    reply = original_question + apply_extensions(reply[len(question):], "output")
+                    reply = original_question + apply_extensions(reply, "output")
                if np.count_nonzero(np.isin(input_ids[0], eos_token_ids)) < np.count_nonzero(np.isin(output, eos_token_ids)):
                    break
@ -238,9 +259,15 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
                input_ids = np.reshape(output, (1, output.shape[0]))
                if shared.soft_prompt:
                    inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids)
                    generate_params.update({"inputs_embeds": inputs_embeds})
                    generate_params.update({"inputs": filler_input_ids})
                else:
                    generate_params.update({"inputs": input_ids})
            yield formatted_outputs(reply, shared.model_name)
    except Exception:
        traceback.print_exc()
    finally:
        t1 = time.time()
        print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(original_input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(original_input_ids[0])} tokens)")
--- a/presets/Default.txt
+++ b/presets/Default.txt
@ -1,12 +1,7 @@
 do_sample=True
-temperature=1
+top_p=0.5
-top_p=1
+top_k=40
-typical_p=1
+temperature=0.7
-repetition_penalty=1
+repetition_penalty=1.2
-top_k=50
+typical_p=1.0
 num_beams=1
 penalty_alpha=0
 min_length=0
 length_penalty=1
 no_repeat_ngram_size=0
 early_stopping=False
--- a/presets/Individual
+++ b/presets/Individual
@ -1,6 +0,0 @@
 do_sample=True
 top_p=0.9
 top_k=50
 temperature=1.39
 repetition_penalty=1.08
 typical_p=0.2
--- a/requirements.txt
+++ b/requirements.txt
@ -6,6 +6,7 @@ markdown
 numpy
 peft==0.2.0
 requests
 rwkv==0.7.0
 safetensors==0.3.0
 sentencepiece
 tqdm
--- a/server.py
+++ b/server.py
@ -1,4 +1,3 @@
 import gc
 import io
 import json
 import re
@ -8,7 +7,6 @@ import zipfile
 from pathlib import Path
 import gradio as gr
 import torch
 import modules.chat as chat
 import modules.extensions as extensions_module
@ -17,7 +15,7 @@ import modules.ui as ui
 from modules.html_generator import generate_chat_html
 from modules.LoRA import add_lora_to_model
 from modules.models import load_model, load_soft_prompt
-from modules.text_generation import generate_reply
+from modules.text_generation import clear_torch_cache, generate_reply
 # Loading custom settings
 settings_file = None
@ -56,21 +54,14 @@ def load_model_wrapper(selected_model):
    if selected_model != shared.model_name:
        shared.model_name = selected_model
        shared.model = shared.tokenizer = None
-        if not shared.args.cpu:
+        clear_torch_cache()
            gc.collect()
            torch.cuda.empty_cache()
        shared.model, shared.tokenizer = load_model(shared.model_name)
    return selected_model
 def load_lora_wrapper(selected_lora):
    shared.lora_name = selected_lora
    default_text = shared.settings['lora_prompts'][next((k for k in shared.settings['lora_prompts'] if re.match(k.lower(), shared.lora_name.lower())), 'default')]
    if not shared.args.cpu:
        gc.collect()
        torch.cuda.empty_cache()
    add_lora_to_model(selected_lora)
    default_text = shared.settings['lora_prompts'][next((k for k in shared.settings['lora_prompts'] if re.match(k.lower(), shared.lora_name.lower())), 'default')]
    return selected_lora, default_text
@ -102,7 +93,7 @@ def load_preset_values(preset_menu, return_dict=False):
    if return_dict:
        return generate_params
    else:
-        return generate_params['do_sample'], generate_params['temperature'], generate_params['top_p'], generate_params['typical_p'], generate_params['repetition_penalty'], generate_params['encoder_repetition_penalty'], generate_params['top_k'], generate_params['min_length'], generate_params['no_repeat_ngram_size'], generate_params['num_beams'], generate_params['penalty_alpha'], generate_params['length_penalty'], generate_params['early_stopping']
+        return preset_menu, generate_params['do_sample'], generate_params['temperature'], generate_params['top_p'], generate_params['typical_p'], generate_params['repetition_penalty'], generate_params['encoder_repetition_penalty'], generate_params['top_k'], generate_params['min_length'], generate_params['no_repeat_ngram_size'], generate_params['num_beams'], generate_params['penalty_alpha'], generate_params['length_penalty'], generate_params['early_stopping']
 def upload_soft_prompt(file):
    with zipfile.ZipFile(io.BytesIO(file)) as zf:
@ -160,6 +151,12 @@ def create_settings_menus(default_preset):
                        shared.gradio['length_penalty'] = gr.Slider(-5, 5, value=generate_params['length_penalty'], label='length_penalty')
                shared.gradio['early_stopping'] = gr.Checkbox(value=generate_params['early_stopping'], label='early_stopping')
            shared.gradio['seed'] = gr.Number(value=-1, label='Seed (-1 for random)')
    with gr.Row():
        shared.gradio['preset_menu_mirror'] = gr.Dropdown(choices=available_presets, value=default_preset if not shared.args.flexgen else 'Naive', label='Generation parameters preset')
        ui.create_refresh_button(shared.gradio['preset_menu_mirror'], lambda : None, lambda : {'choices': get_available_presets()}, 'refresh-button')
    with gr.Row():
        shared.gradio['lora_menu'] = gr.Dropdown(choices=available_loras, value=shared.lora_name, label='LoRA')
        ui.create_refresh_button(shared.gradio['lora_menu'], lambda : None, lambda : {'choices': get_available_loras()}, 'refresh-button')
@ -174,7 +171,8 @@ def create_settings_menus(default_preset):
            shared.gradio['upload_softprompt'] = gr.File(type='binary', file_types=['.zip'])
    shared.gradio['model_menu'].change(load_model_wrapper, [shared.gradio['model_menu']], [shared.gradio['model_menu']], show_progress=True)
-    shared.gradio['preset_menu'].change(load_preset_values, [shared.gradio['preset_menu']], [shared.gradio['do_sample'], shared.gradio['temperature'], shared.gradio['top_p'], shared.gradio['typical_p'], shared.gradio['repetition_penalty'], shared.gradio['encoder_repetition_penalty'], shared.gradio['top_k'], shared.gradio['min_length'], shared.gradio['no_repeat_ngram_size'], shared.gradio['num_beams'], shared.gradio['penalty_alpha'], shared.gradio['length_penalty'], shared.gradio['early_stopping']])
+    shared.gradio['preset_menu'].change(load_preset_values, [shared.gradio['preset_menu']], [shared.gradio[k] for k in ['preset_menu_mirror', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']])
    shared.gradio['preset_menu_mirror'].change(load_preset_values, [shared.gradio['preset_menu_mirror']], [shared.gradio[k] for k in ['preset_menu', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']])
    shared.gradio['lora_menu'].change(load_lora_wrapper, [shared.gradio['lora_menu']], [shared.gradio['lora_menu'], shared.gradio['textbox']], show_progress=True)
    shared.gradio['softprompts_menu'].change(load_soft_prompt, [shared.gradio['softprompts_menu']], [shared.gradio['softprompts_menu']], show_progress=True)
    shared.gradio['upload_softprompt'].upload(upload_soft_prompt, [shared.gradio['upload_softprompt']], [shared.gradio['softprompts_menu']])
@ -235,9 +233,7 @@ else:
    shared.model_name = available_models[i]
 shared.model, shared.tokenizer = load_model(shared.model_name)
 if shared.args.lora:
-    print(shared.args.lora)
+    add_lora_to_model(shared.args.lora)
    shared.lora_name = shared.args.lora
    add_lora_to_model(shared.lora_name)
 # Default UI settings
 default_preset = shared.settings['presets'][next((k for k in shared.settings['presets'] if re.match(k.lower(), shared.model_name.lower())), 'default')]
@ -325,13 +321,13 @@ def create_interface():
                create_settings_menus(default_preset)
            function_call = 'chat.cai_chatbot_wrapper' if shared.args.cai_chat else 'chat.chatbot_wrapper'
-            shared.input_params = [shared.gradio[k] for k in ['textbox', 'max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'name1', 'name2', 'context', 'check', 'chat_prompt_size_slider', 'chat_generation_attempts']]
+            shared.input_params = [shared.gradio[k] for k in ['textbox', 'max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'seed', 'name1', 'name2', 'context', 'check', 'chat_prompt_size_slider', 'chat_generation_attempts']]
            gen_events.append(shared.gradio['Generate'].click(eval(function_call), shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream))
            gen_events.append(shared.gradio['textbox'].submit(eval(function_call), shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream))
            gen_events.append(shared.gradio['Regenerate'].click(chat.regenerate_wrapper, shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream))
            gen_events.append(shared.gradio['Impersonate'].click(chat.impersonate_wrapper, shared.input_params, shared.gradio['textbox'], show_progress=shared.args.no_stream))
-            shared.gradio['Stop'].click(chat.stop_everything_event, [], [], cancels=gen_events)
+            shared.gradio['Stop'].click(chat.stop_everything_event, [], [], cancels=gen_events, queue=False)
            shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, [], shared.gradio['textbox'], show_progress=shared.args.no_stream)
            shared.gradio['Replace last reply'].click(chat.replace_last_reply, [shared.gradio['textbox'], shared.gradio['name1'], shared.gradio['name2']], shared.gradio['display'], show_progress=shared.args.no_stream)
@ -388,7 +384,7 @@ def create_interface():
            with gr.Tab("Parameters", elem_id="parameters"):
                create_settings_menus(default_preset)
-            shared.input_params = [shared.gradio[k] for k in ['textbox', 'max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']]
+            shared.input_params = [shared.gradio[k] for k in ['textbox', 'max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'seed']]
            output_params = [shared.gradio[k] for k in ['textbox', 'markdown', 'html']]
            gen_events.append(shared.gradio['Generate'].click(generate_reply, shared.input_params, output_params, show_progress=shared.args.no_stream, api_name='textgen'))
            gen_events.append(shared.gradio['textbox'].submit(generate_reply, shared.input_params, output_params, show_progress=shared.args.no_stream))
@ -420,7 +416,7 @@ def create_interface():
            with gr.Tab("Parameters", elem_id="parameters"):
                create_settings_menus(default_preset)
-            shared.input_params = [shared.gradio[k] for k in ['textbox', 'max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']]
+            shared.input_params = [shared.gradio[k] for k in ['textbox', 'max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'seed']]
            output_params = [shared.gradio[k] for k in ['output_textbox', 'markdown', 'html']]
            gen_events.append(shared.gradio['Generate'].click(generate_reply, shared.input_params, output_params, show_progress=shared.args.no_stream, api_name='textgen'))
            gen_events.append(shared.gradio['textbox'].submit(generate_reply, shared.input_params, output_params, show_progress=shared.args.no_stream))
--- a/settings-template.json
+++ b/settings-template.json
@ -2,9 +2,9 @@
    "max_new_tokens": 200,
    "max_new_tokens_min": 1,
    "max_new_tokens_max": 2000,
-    "name1": "Person 1",
+    "name1": "You",
-    "name2": "Person 2",
+    "name2": "Assistant",
-    "context": "This is a conversation between two people.",
+    "context": "This is a conversation with your Assistant. The Assistant is very helpful and is eager to chat with you and answer your questions.",
    "stop_at_newline": false,
    "chat_prompt_size": 2048,
    "chat_prompt_size_min": 0,