From 0d9932815c593fd425e25c4c9dff69005d1df95e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 2 Aug 2023 08:45:14 -0700 Subject: [PATCH 001/169] Improve TheEncrypted777 on mobile devices --- css/chat.css | 10 +++++++ css/chat_style-TheEncrypted777.css | 45 ++++++++++++++++++++++++------ css/main.css | 6 ++++ server.py | 2 +- 4 files changed, 54 insertions(+), 9 deletions(-) diff --git a/css/chat.css b/css/chat.css index 17b8d142..ad76f5cc 100644 --- a/css/chat.css +++ b/css/chat.css @@ -46,6 +46,16 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { min-width: 0 !important; } +@media screen and (max-width: 688px) { + #main { + padding: 0px; + } + + .chat { + height: calc(100vh - 274px) !important; + } +} + /*****************************************************/ /*************** Chat box declarations ***************/ /*****************************************************/ diff --git a/css/chat_style-TheEncrypted777.css b/css/chat_style-TheEncrypted777.css index 7682011d..d92e982d 100644 --- a/css/chat_style-TheEncrypted777.css +++ b/css/chat_style-TheEncrypted777.css @@ -10,17 +10,10 @@ line-height: 1.428571429; } -.circle-you { - background-color: gray; - border-radius: 1rem; - /*Change color to any you like to be the border of your image*/ - border: 2px solid white; -} - +.circle-you, .circle-bot { background-color: gray; border-radius: 1rem; - /*Change color to any you like to be the border of the bot's image*/ border: 2px solid white; } @@ -105,3 +98,39 @@ .message-body p em { color: rgb(110, 110, 110) !important; } + +@media screen and (max-width: 688px) { + .message { + display: grid; + grid-template-columns: 60px minmax(0, 1fr); + padding-bottom: 25px; + font-size: 15px; + font-family: Helvetica, Arial, sans-serif; + line-height: 1.428571429; + } + + .circle-you, .circle-bot { + width: 50px; + height: 73px; + border-radius: 0.5rem; + } + + .circle-bot img, + .circle-you img { + width: 100%; + height: 100%; + object-fit: cover; + } + + .text { + padding-left: 0px; + } + + .message-body p { + font-size: 16px !important; + } + + .username { + font-size: 20px; + } +} diff --git a/css/main.css b/css/main.css index 5c17a179..b4066c91 100644 --- a/css/main.css +++ b/css/main.css @@ -26,6 +26,10 @@ max-width: 2.2em; } +.button_nowrap { + white-space: nowrap; +} + #slim-column { flex: none !important; min-width: 0 !important; @@ -90,6 +94,8 @@ div.svelte-15lo0d8 > *, div.svelte-15lo0d8 > .form > * { .header_bar { background-color: #f7f7f7; margin-bottom: 20px; + display: inline !important; + overflow-x: scroll; } .dark .header_bar { diff --git a/server.py b/server.py index ecb8ddc9..0f1b9332 100644 --- a/server.py +++ b/server.py @@ -643,7 +643,7 @@ def create_interface(): with gr.Row(): shared.gradio['Impersonate'] = gr.Button('Impersonate') shared.gradio['Regenerate'] = gr.Button('Regenerate') - shared.gradio['Remove last'] = gr.Button('Remove last') + shared.gradio['Remove last'] = gr.Button('Remove last', elem_classes=['button_nowrap']) with gr.Row(): shared.gradio['Copy last reply'] = gr.Button('Copy last reply') From e931844fe25620e8ccc2e7c4ed9ab06fc6644471 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 2 Aug 2023 14:52:20 -0300 Subject: [PATCH 002/169] Add auto_max_new_tokens parameter (#3419) --- api-examples/api-example-chat-stream.py | 1 + api-examples/api-example-chat.py | 1 + api-examples/api-example-stream.py | 1 + api-examples/api-example.py | 1 + extensions/api/util.py | 1 + 
extensions/openai/defaults.py | 1 + modules/loaders.py | 5 +++++ modules/shared.py | 1 + modules/text_generation.py | 2 ++ modules/ui.py | 1 + server.py | 1 + settings-template.yaml | 1 + 12 files changed, 17 insertions(+) diff --git a/api-examples/api-example-chat-stream.py b/api-examples/api-example-chat-stream.py index 14f6f9d6..493661c2 100644 --- a/api-examples/api-example-chat-stream.py +++ b/api-examples/api-example-chat-stream.py @@ -20,6 +20,7 @@ async def run(user_input, history): request = { 'user_input': user_input, 'max_new_tokens': 250, + 'auto_max_new_tokens': False, 'history': history, 'mode': 'instruct', # Valid options: 'chat', 'chat-instruct', 'instruct' 'character': 'Example', diff --git a/api-examples/api-example-chat.py b/api-examples/api-example-chat.py index 0e155c63..31641815 100644 --- a/api-examples/api-example-chat.py +++ b/api-examples/api-example-chat.py @@ -14,6 +14,7 @@ def run(user_input, history): request = { 'user_input': user_input, 'max_new_tokens': 250, + 'auto_max_new_tokens': False, 'history': history, 'mode': 'instruct', # Valid options: 'chat', 'chat-instruct', 'instruct' 'character': 'Example', diff --git a/api-examples/api-example-stream.py b/api-examples/api-example-stream.py index 1ae5a91c..175275f9 100644 --- a/api-examples/api-example-stream.py +++ b/api-examples/api-example-stream.py @@ -20,6 +20,7 @@ async def run(context): request = { 'prompt': context, 'max_new_tokens': 250, + 'auto_max_new_tokens': False, # Generation params. If 'preset' is set to different than 'None', the values # in presets/preset-name.yaml are used instead of the individual numbers. diff --git a/api-examples/api-example.py b/api-examples/api-example.py index 4e45de9e..7f8bc1d2 100644 --- a/api-examples/api-example.py +++ b/api-examples/api-example.py @@ -12,6 +12,7 @@ def run(prompt): request = { 'prompt': prompt, 'max_new_tokens': 250, + 'auto_max_new_tokens': False, # Generation params. If 'preset' is set to different than 'None', the values # in presets/preset-name.yaml are used instead of the individual numbers. diff --git a/extensions/api/util.py b/extensions/api/util.py index 2358b7d2..5cc259db 100644 --- a/extensions/api/util.py +++ b/extensions/api/util.py @@ -21,6 +21,7 @@ def build_parameters(body, chat=False): generate_params = { 'max_new_tokens': int(body.get('max_new_tokens', body.get('max_length', 200))), + 'auto_max_new_tokens': bool(body.get('auto_max_new_tokens', False)), 'do_sample': bool(body.get('do_sample', True)), 'temperature': float(body.get('temperature', 0.5)), 'top_p': float(body.get('top_p', 1)), diff --git a/extensions/openai/defaults.py b/extensions/openai/defaults.py index 52f0d641..cb8308e7 100644 --- a/extensions/openai/defaults.py +++ b/extensions/openai/defaults.py @@ -4,6 +4,7 @@ import copy # Data type is important, Ex. 
use 0.0 for a float 0 default_req_params = { 'max_new_tokens': 16, # 'Inf' for chat + 'auto_max_new_tokens': False, 'temperature': 1.0, 'top_p': 1.0, 'top_k': 1, # choose 20 for chat in absence of another default diff --git a/modules/loaders.py b/modules/loaders.py index 6d0291bf..838ecc86 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -116,6 +116,7 @@ loaders_samplers = { 'ban_eos_token', 'add_bos_token', 'skip_special_tokens', + 'auto_max_new_tokens', }, 'ExLlama_HF': { 'temperature', @@ -139,6 +140,7 @@ loaders_samplers = { 'ban_eos_token', 'add_bos_token', 'skip_special_tokens', + 'auto_max_new_tokens', }, 'ExLlama': { 'temperature', @@ -176,6 +178,7 @@ loaders_samplers = { 'ban_eos_token', 'add_bos_token', 'skip_special_tokens', + 'auto_max_new_tokens', }, 'GPTQ-for-LLaMa': { 'temperature', @@ -203,6 +206,7 @@ loaders_samplers = { 'ban_eos_token', 'add_bos_token', 'skip_special_tokens', + 'auto_max_new_tokens', }, 'llama.cpp': { 'temperature', @@ -237,6 +241,7 @@ loaders_samplers = { 'ban_eos_token', 'add_bos_token', 'skip_special_tokens', + 'auto_max_new_tokens', }, } diff --git a/modules/shared.py b/modules/shared.py index 59d49ab6..a2782e65 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -36,6 +36,7 @@ settings = { 'max_new_tokens': 200, 'max_new_tokens_min': 1, 'max_new_tokens_max': 4096, + 'auto_max_new_tokens': False, 'seed': -1, 'character': 'None', 'name1': 'You', diff --git a/modules/text_generation.py b/modules/text_generation.py index e1be6aa3..f6f71990 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -247,6 +247,8 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings input_ids = encode(question, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state)) output = input_ids[0] cuda = not any((shared.args.cpu, shared.args.deepspeed)) + if state['auto_max_new_tokens']: + generate_params['max_new_tokens'] = state['truncation_length'] - input_ids.shape[-1] # Add the encoded tokens to generate_params question, input_ids, inputs_embeds = apply_extensions('tokenizer', state, question, input_ids, None) diff --git a/modules/ui.py b/modules/ui.py index d9b3a131..fe3482d2 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -79,6 +79,7 @@ def list_model_elements(): def list_interface_input_elements(): elements = [ 'max_new_tokens', + 'auto_max_new_tokens', 'seed', 'temperature', 'top_p', diff --git a/server.py b/server.py index 0f1b9332..d622cdbe 100644 --- a/server.py +++ b/server.py @@ -425,6 +425,7 @@ def create_settings_menus(default_preset): shared.gradio['truncation_length'] = gr.Slider(value=shared.settings['truncation_length'], minimum=shared.settings['truncation_length_min'], maximum=shared.settings['truncation_length_max'], step=256, label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. Most models require this to be at most 2048.') shared.gradio['custom_stopping_strings'] = gr.Textbox(lines=1, value=shared.settings["custom_stopping_strings"] or None, label='Custom stopping strings', info='In addition to the defaults. Written between "" and separated by commas. 
For instance: "\\nYour Assistant:", "\\nThe assistant:"') with gr.Column(): + shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.') shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.') shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.') diff --git a/settings-template.yaml b/settings-template.yaml index 3d6585d3..62e86371 100644 --- a/settings-template.yaml +++ b/settings-template.yaml @@ -3,6 +3,7 @@ autoload_model: false max_new_tokens: 200 max_new_tokens_min: 1 max_new_tokens_max: 4096 +auto_max_new_tokens: false seed: -1 character: None name1: You From 32a2bbee4ae9e9bcf26c6b10d0386168a42d9f14 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 2 Aug 2023 11:01:29 -0700 Subject: [PATCH 003/169] Implement auto_max_new_tokens for ExLlama --- modules/exllama.py | 6 +++++- modules/loaders.py | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/modules/exllama.py b/modules/exllama.py index ecfb10a4..00b37b9c 100644 --- a/modules/exllama.py +++ b/modules/exllama.py @@ -94,11 +94,15 @@ class ExllamaModel: # Tokenizing the input ids = self.generator.tokenizer.encode(prompt) ids = ids[:, -get_max_prompt_length(state):] + if state['auto_max_new_tokens']: + max_new_tokens = state['truncation_length'] - ids.shape[-1] + else: + max_new_tokens = state['max_new_tokens'] self.generator.gen_begin_reuse(ids) initial_len = self.generator.sequence[0].shape[0] has_leading_space = False - for i in range(state['max_new_tokens']): + for i in range(max_new_tokens): token = self.generator.gen_single_token() if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'): has_leading_space = True diff --git a/modules/loaders.py b/modules/loaders.py index 838ecc86..68b48204 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -151,6 +151,7 @@ loaders_samplers = { 'repetition_penalty_range', 'seed', 'ban_eos_token', + 'auto_max_new_tokens', }, 'AutoGPTQ': { 'temperature', From 0e8f9354b5c841f90db4c7f74a84a2582d3cfa66 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 2 Aug 2023 18:50:13 -0700 Subject: [PATCH 004/169] Add direct download for session/chat history JSONs --- css/main.js | 1 + css/save_files.js | 40 ++++++++++++++++++++++++++++ modules/chat.py | 17 ------------ modules/ui.py | 5 ++-- server.py | 66 +++++++++++++++++------------------------------ 5 files changed, 66 insertions(+), 63 deletions(-) create mode 100644 css/save_files.js diff --git a/css/main.js b/css/main.js index f3b3c05f..9663d464 100644 --- a/css/main.js +++ b/css/main.js @@ -17,6 +17,7 @@ main_parent.addEventListener('click', function(e) { } }); +// Add some scrollbars const textareaElements = document.querySelectorAll('.add_scrollbar textarea'); for(i = 0; i < textareaElements.length; i++) { textareaElements[i].classList.remove('scroll-hide'); diff --git a/css/save_files.js b/css/save_files.js new file mode 100644 index 00000000..7dfbcfda --- /dev/null +++ b/css/save_files.js @@ -0,0 +1,40 @@ +// Functions for downloading JSON files +function getCurrentTimestamp() { + const now = new Date(); + const 
timezoneOffset = now.getTimezoneOffset() * 60000; // Convert to milliseconds + const localTime = new Date(now.getTime() - timezoneOffset); + const formattedTimestamp = localTime.toISOString().replace(/[-:]/g, '').slice(0, 15); + return formattedTimestamp; +} + +function saveFile(contents, filename) { + const element = document.createElement('a'); + element.setAttribute('href', 'data:text/plain;charset=utf-8,' + encodeURIComponent(contents)); + element.setAttribute('download', filename); + element.style.display = 'none'; + document.body.appendChild(element); + element.click(); + document.body.removeChild(element); +} + +function saveHistory(history, character, mode) { + let path = null; + + if (['chat', 'chat-instruct'].includes(mode) && character && character.trim() !== '') { + path = `history_${character}_${getCurrentTimestamp()}.json`; + } else { + try { + path = `history_${mode}_${getCurrentTimestamp()}.json`; + } catch (error) { + path = `history_${getCurrentTimestamp()}.json`; + } + } + saveFile(history, path); +} + +function saveSession(session, mode) { + let path = null; + + path = `session_${mode}_${getCurrentTimestamp()}.json`; + saveFile(session, path); +} diff --git a/modules/chat.py b/modules/chat.py index 070f45a4..57a04606 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -412,23 +412,6 @@ def load_history(file, history): return history -def save_history_at_user_request(history, character, mode): - def make_timestamp_path(character=None): - return f"logs/{character or ''}{'_' if character else ''}{datetime.now().strftime('%Y%m%d-%H%M%S')}.json" - - path = None - if mode in ['chat', 'chat-instruct'] and character not in ['', 'None', None]: - path = make_timestamp_path(character) - else: - # Try to use mode as the file name, otherwise just use the timestamp - try: - path = make_timestamp_path(mode.capitalize()) - except: - path = make_timestamp_path() - - return save_history(history, path) - - def save_persistent_history(history, character, mode): if mode in ['chat', 'chat-instruct'] and character not in ['', 'None', None] and not shared.args.multi_user: save_history(history, path=Path(f'logs/{character}_persistent.json')) diff --git a/modules/ui.py b/modules/ui.py index fe3482d2..df36a331 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -15,6 +15,8 @@ with open(Path(__file__).resolve().parent / '../css/main.js', 'r') as f: main_js = f.read() with open(Path(__file__).resolve().parent / '../css/chat.js', 'r') as f: chat_js = f.read() +with open(Path(__file__).resolve().parent / '../css/save_files.js', 'r') as f: + save_files_js = f.read() refresh_symbol = '🔄' delete_symbol = '🗑️' @@ -145,9 +147,6 @@ def gather_interface_values(*args): if not shared.args.multi_user: shared.persistent_interface_state = output - Path('logs').mkdir(exist_ok=True) - with open(Path(f'logs/session_{shared.get_mode()}_autosave.json'), 'w') as f: - f.write(json.dumps(output, indent=4)) return output diff --git a/server.py b/server.py index d622cdbe..b53d6a12 100644 --- a/server.py +++ b/server.py @@ -508,44 +508,24 @@ def create_file_saving_event_handlers(): if not shared.args.multi_user: - def load_session(session, state): - with open(Path(f'logs/{session}.json'), 'r') as f: - state.update(json.loads(f.read())) + def load_session(file, state): + decoded_file = file if type(file) == str else file.decode('utf-8') + data = json.loads(decoded_file) + state.update(data) if shared.is_chat(): chat.save_persistent_history(state['history'], state['character_menu'], state['mode']) return state - if 
shared.is_chat(): - shared.gradio['save_session'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: json.dumps(x, indent=4), gradio('interface_state'), gradio('save_contents')).then( - lambda: 'logs/', None, gradio('save_root')).then( - lambda x: f'session_{shared.get_mode()}_{x + "_" if x not in ["None", None, ""] else ""}{utils.current_time()}.json', gradio('character_menu'), gradio('save_filename')).then( - lambda: gr.update(visible=True), None, gradio('file_saver')) + shared.gradio['save_session'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda x: json.dumps(x, indent=4), gradio('interface_state'), gradio('temporary_text')).then( + None, gradio('temporary_text'), None, _js=f"(contents) => {{{ui.save_files_js}; saveSession(contents, \"{shared.get_mode()}\")}}") - shared.gradio['session_menu'].change( - load_session, gradio('session_menu', 'interface_state'), gradio('interface_state')).then( - ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then( - chat.redraw_html, shared.reload_inputs, gradio('display')) - - else: - shared.gradio['save_session'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: json.dumps(x, indent=4), gradio('interface_state'), gradio('save_contents')).then( - lambda: 'logs/', None, gradio('save_root')).then( - lambda: f'session_{shared.get_mode()}_{utils.current_time()}.json', None, gradio('save_filename')).then( - lambda: gr.update(visible=True), None, gradio('file_saver')) - - shared.gradio['session_menu'].change( - load_session, gradio('session_menu', 'interface_state'), gradio('interface_state')).then( - ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False) - - shared.gradio['delete_session'].click( - lambda x: f'{x}.json', gradio('session_menu'), gradio('delete_filename')).then( - lambda: 'logs/', None, gradio('delete_root')).then( - lambda: gr.update(visible=True), None, gradio('file_deleter')) + shared.gradio['load_session'].upload( + load_session, gradio('load_session', 'interface_state'), gradio('interface_state')).then( + ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False) def set_interface_arguments(interface_mode, extensions, bool_active): @@ -558,7 +538,6 @@ def set_interface_arguments(interface_mode, extensions, bool_active): setattr(shared.args, k, False) if interface_mode != "default": setattr(shared.args, interface_mode, True) - for k in bool_list: setattr(shared.args, k, False) for k in bool_active: @@ -622,6 +601,9 @@ def create_interface(): # Floating menus for saving/deleting files create_file_saving_menus() + # Used for saving files using javascript + shared.gradio['temporary_text'] = gr.Textbox(visible=False) + # Create chat mode interface if shared.is_chat(): shared.input_elements = ui.list_interface_input_elements() @@ -702,11 +684,10 @@ def create_interface(): with gr.Tab('Chat history'): with gr.Row(): with gr.Column(): - shared.gradio['download'] = gr.File(label="Download") - shared.gradio['download_button'] = gr.Button(value='Refresh') + shared.gradio['save_chat_history'] = gr.Button(value='Save history') with gr.Column(): - shared.gradio['upload_chat_history'] = gr.File(type='binary', file_types=['.json', '.txt'], label="Upload") + 
shared.gradio['load_chat_history'] = gr.File(type='binary', file_types=['.json', '.txt'], label="Upload History JSON") with gr.Tab('Upload character'): with gr.Tab('YAML or JSON'): @@ -845,11 +826,8 @@ def create_interface(): with gr.Column(): if not shared.args.multi_user: - with gr.Row(): - shared.gradio['session_menu'] = gr.Dropdown(choices=utils.get_available_sessions(), value='None', label='Session', elem_classes='slim-dropdown', info='When saving a session, make sure to keep the initial part of the filename (session_chat, session_notebook, or session_default), otherwise it will not appear on this list afterwards.') - ui.create_refresh_button(shared.gradio['session_menu'], lambda: None, lambda: {'choices': utils.get_available_sessions()}, ['refresh-button']) - shared.gradio['save_session'] = gr.Button('💾', elem_classes=['refresh-button']) - shared.gradio['delete_session'] = gr.Button('🗑️', elem_classes=['refresh-button']) + shared.gradio['save_session'] = gr.Button('Save session') + shared.gradio['load_session'] = gr.File(type='binary', file_types=['.json'], label="Upload Session JSON") extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.') extension_status = gr.Markdown() @@ -967,8 +945,8 @@ def create_interface(): shared.gradio['instruction_template'].change( partial(chat.load_character, instruct=True), gradio('instruction_template', 'name1_instruct', 'name2_instruct'), gradio('name1_instruct', 'name2_instruct', 'dummy', 'dummy', 'context_instruct', 'turn_template')) - shared.gradio['upload_chat_history'].upload( - chat.load_history, gradio('upload_chat_history', 'history'), gradio('history')).then( + shared.gradio['load_chat_history'].upload( + chat.load_history, gradio('load_chat_history', 'history'), gradio('history')).then( chat.redraw_html, shared.reload_inputs, gradio('display')) shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, gradio('history'), gradio('textbox'), show_progress=False) @@ -991,7 +969,9 @@ def create_interface(): lambda: 'characters/instruction-following/', None, gradio('delete_root')).then( lambda: gr.update(visible=True), None, gradio('file_deleter')) - shared.gradio['download_button'].click(chat.save_history_at_user_request, gradio('history', 'character_menu', 'mode'), gradio('download')) + shared.gradio['save_chat_history'].click(lambda x: json.dumps(x, indent=4), gradio('history'), gradio('temporary_text')).then( + None, gradio('temporary_text', 'character_menu', 'mode'), None, _js=f"(hist, char, mode) => {{{ui.save_files_js}; saveHistory(hist, char, mode)}}") + shared.gradio['Submit character'].click(chat.upload_character, gradio('upload_json', 'upload_img_bot'), gradio('character_menu')) shared.gradio['upload_json'].upload(lambda: gr.update(interactive=True), None, gradio('Submit character')) shared.gradio['upload_json'].clear(lambda: gr.update(interactive=False), None, gradio('Submit character')) From 4b6c1d3f080bb36b96ffb25bbc8e843bfe3bf945 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 2 Aug 2023 20:20:23 -0700 Subject: [PATCH 005/169] CSS change --- css/main.css | 4 ++++ server.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index b4066c91..d37e3f63 
100644 --- a/css/main.css +++ b/css/main.css @@ -45,6 +45,10 @@ min-height: 0 } +#save_session { + margin-top: 32px; +} + #accordion { } diff --git a/server.py b/server.py index b53d6a12..a0229995 100644 --- a/server.py +++ b/server.py @@ -826,7 +826,7 @@ def create_interface(): with gr.Column(): if not shared.args.multi_user: - shared.gradio['save_session'] = gr.Button('Save session') + shared.gradio['save_session'] = gr.Button('Save session', elem_id="save_session") shared.gradio['load_session'] = gr.File(type='binary', file_types=['.json'], label="Upload Session JSON") extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.') From 32c564509ed615a9627c7dc71fc55d5246fcfd04 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 2 Aug 2023 21:13:16 -0700 Subject: [PATCH 006/169] Fix loading session in chat mode --- modules/chat.py | 4 ++++ modules/shared.py | 4 ++++ server.py | 22 ++++++++++++++-------- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 57a04606..5e4eb245 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -418,6 +418,10 @@ def save_persistent_history(history, character, mode): def load_persistent_history(state): + if shared.session_is_loading: + shared.session_is_loading = False + return state['history'] + if state['mode'] == 'instruct': return state['history'] diff --git a/modules/shared.py b/modules/shared.py index a2782e65..bac3fa8c 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -30,6 +30,10 @@ reload_inputs = [] # Parameters for reloading the chat interface # For restarting the interface need_restart = False +# To prevent the persistent chat history from being loaded when +# a session JSON file is being loaded in chat mode +session_is_loading = False + settings = { 'dark_theme': True, 'autoload_model': False, diff --git a/server.py b/server.py index a0229995..6715cc4d 100644 --- a/server.py +++ b/server.py @@ -511,21 +511,27 @@ def create_file_saving_event_handlers(): def load_session(file, state): decoded_file = file if type(file) == str else file.decode('utf-8') data = json.loads(decoded_file) + + if shared.is_chat() and 'character_menu' in data and state.get('character_menu') != data.get('character_menu'): + shared.session_is_loading = True + state.update(data) - - if shared.is_chat(): - chat.save_persistent_history(state['history'], state['character_menu'], state['mode']) - return state shared.gradio['save_session'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: json.dumps(x, indent=4), gradio('interface_state'), gradio('temporary_text')).then( - None, gradio('temporary_text'), None, _js=f"(contents) => {{{ui.save_files_js}; saveSession(contents, \"{shared.get_mode()}\")}}") + None, gradio('temporary_text'), None, _js=f"(contents) => {{{ui.save_files_js}; saveSession(contents, \"{shared.get_mode()}\")}}") - shared.gradio['load_session'].upload( - load_session, gradio('load_session', 'interface_state'), gradio('interface_state')).then( - ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False) + if shared.is_chat(): + shared.gradio['load_session'].upload( + 
load_session, gradio('load_session', 'interface_state'), gradio('interface_state')).then( + ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then( + chat.redraw_html, shared.reload_inputs, gradio('display')) + else: + shared.gradio['load_session'].upload( + load_session, gradio('load_session', 'interface_state'), gradio('interface_state')).then( + ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False) def set_interface_arguments(interface_mode, extensions, bool_active): From 6bf9e855f85854b6585e518f33c1420e0f718524 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 2 Aug 2023 21:39:56 -0700 Subject: [PATCH 007/169] Minor change --- server.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/server.py b/server.py index 6715cc4d..6f0c5b67 100644 --- a/server.py +++ b/server.py @@ -525,11 +525,13 @@ def create_file_saving_event_handlers(): if shared.is_chat(): shared.gradio['load_session'].upload( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( load_session, gradio('load_session', 'interface_state'), gradio('interface_state')).then( ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then( chat.redraw_html, shared.reload_inputs, gradio('display')) else: shared.gradio['load_session'].upload( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( load_session, gradio('load_session', 'interface_state'), gradio('interface_state')).then( ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False) From e074538b5886c78ef546fb85f1104e5e61295088 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 2 Aug 2023 21:45:10 -0700 Subject: [PATCH 008/169] Revert "Make long_replies ban the eos token as well" This reverts commit 6c521ce96787552a9604c344b9949945ef359a59. 
--- extensions/long_replies/script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extensions/long_replies/script.py b/extensions/long_replies/script.py index a30b05a7..035e8c9e 100644 --- a/extensions/long_replies/script.py +++ b/extensions/long_replies/script.py @@ -28,7 +28,7 @@ class MyLogits(LogitsProcessor): def __call__(self, input_ids, scores): if input_ids.shape[-1] - initial_size < params["min_length"]: scores[...,self.newline_id] = -1000 - scores[...,shared.tokenizer.eos_token_id] = -1000 + # scores[...,shared.tokenizer.eos_token_id] = -1000 # probs = torch.softmax(scores, dim=-1, dtype=torch.float) # probs[0] /= probs[0].sum() From 3390196a1421fe66dd946d848b33936b8a4a42e9 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 2 Aug 2023 22:13:57 -0700 Subject: [PATCH 009/169] Add some javascript alerts for confirmations --- server.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/server.py b/server.py index 6f0c5b67..679c9e93 100644 --- a/server.py +++ b/server.py @@ -528,12 +528,14 @@ def create_file_saving_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( load_session, gradio('load_session', 'interface_state'), gradio('interface_state')).then( ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then( - chat.redraw_html, shared.reload_inputs, gradio('display')) + chat.redraw_html, shared.reload_inputs, gradio('display')).then( + None, None, None, _js='() => {alert("The session has been loaded.")}') else: shared.gradio['load_session'].upload( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( load_session, gradio('load_session', 'interface_state'), gradio('interface_state')).then( - ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False) + ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then( + None, None, None, _js='() => {alert("The session has been loaded.")}') def set_interface_arguments(interface_mode, extensions, bool_active): @@ -955,7 +957,8 @@ def create_interface(): shared.gradio['load_chat_history'].upload( chat.load_history, gradio('load_chat_history', 'history'), gradio('history')).then( - chat.redraw_html, shared.reload_inputs, gradio('display')) + chat.redraw_html, shared.reload_inputs, gradio('display')).then( + None, None, None, _js='() => {alert("The history has been loaded.")}') shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, gradio('history'), gradio('textbox'), show_progress=False) @@ -977,14 +980,20 @@ def create_interface(): lambda: 'characters/instruction-following/', None, gradio('delete_root')).then( lambda: gr.update(visible=True), None, gradio('file_deleter')) - shared.gradio['save_chat_history'].click(lambda x: json.dumps(x, indent=4), gradio('history'), gradio('temporary_text')).then( + shared.gradio['save_chat_history'].click( + lambda x: json.dumps(x, indent=4), gradio('history'), gradio('temporary_text')).then( None, gradio('temporary_text', 'character_menu', 'mode'), None, _js=f"(hist, char, mode) => {{{ui.save_files_js}; saveHistory(hist, char, mode)}}") - shared.gradio['Submit character'].click(chat.upload_character, gradio('upload_json', 'upload_img_bot'), gradio('character_menu')) + shared.gradio['Submit 
character'].click( + chat.upload_character, gradio('upload_json', 'upload_img_bot'), gradio('character_menu')).then( + None, None, None, _js='() => {alert("The character has been loaded.")}') + + shared.gradio['Submit tavern character'].click( + chat.upload_tavern_character, gradio('upload_img_tavern', 'tavern_json'), gradio('character_menu')).then( + None, None, None, _js='() => {alert("The character has been loaded.")}') + shared.gradio['upload_json'].upload(lambda: gr.update(interactive=True), None, gradio('Submit character')) shared.gradio['upload_json'].clear(lambda: gr.update(interactive=False), None, gradio('Submit character')) - - shared.gradio['Submit tavern character'].click(chat.upload_tavern_character, gradio('upload_img_tavern', 'tavern_json'), gradio('character_menu')) shared.gradio['upload_img_tavern'].upload(chat.check_tavern_character, gradio('upload_img_tavern'), gradio('tavern_name', 'tavern_desc', 'tavern_json', 'Submit tavern character'), show_progress=False) shared.gradio['upload_img_tavern'].clear(lambda: (None, None, None, gr.update(interactive=False)), None, gradio('tavern_name', 'tavern_desc', 'tavern_json', 'Submit tavern character'), show_progress=False) shared.gradio['your_picture'].change( From 3e70bce576926b6c9e1a9b2fcefeab79749af1a1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 3 Aug 2023 06:57:21 -0700 Subject: [PATCH 010/169] Properly format exceptions in the UI --- modules/training.py | 4 ++-- server.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/training.py b/modules/training.py index c98fded2..ef833679 100644 --- a/modules/training.py +++ b/modules/training.py @@ -483,7 +483,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch exc = traceback.format_exc() logger.error('Failed to reload the model.') print(exc) - return exc + return exc.replace('\n', '\n\n') # == Start prepping the model itself == if not hasattr(shared.model, 'lm_head') or hasattr(shared.model.lm_head, 'weight'): @@ -518,7 +518,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch state_dict_peft = torch.load(f"{lora_file_path}/adapter_model.bin") set_peft_model_state_dict(lora_model, state_dict_peft) except: - yield traceback.format_exc() + yield traceback.format_exc().replace('\n', '\n\n') return if shared.args.monkey_patch: diff --git a/server.py b/server.py index 679c9e93..601ae33f 100644 --- a/server.py +++ b/server.py @@ -75,7 +75,7 @@ def load_model_wrapper(selected_model, loader, autoload=False): exc = traceback.format_exc() logger.error('Failed to load the model.') print(exc) - yield exc + yield exc.replace('\n', '\n\n') def load_lora_wrapper(selected_loras): @@ -159,7 +159,7 @@ def download_model_wrapper(repo_id, progress=gr.Progress()): yield ("Done!") except: progress(1.0) - yield traceback.format_exc() + yield traceback.format_exc().replace('\n', '\n\n') def create_model_menus(): From 87dab03dc02eb48b8fd7c8b9a2acb8281678798e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 3 Aug 2023 11:00:36 -0300 Subject: [PATCH 011/169] Add the --cpu option for llama.cpp to prevent CUDA from being used (#3432) --- README.md | 5 +++-- modules/llamacpp_hf.py | 17 ++++++++++++++--- modules/llamacpp_model.py | 22 +++++++++++++++++++--- modules/loaders.py | 2 ++ modules/shared.py | 4 ++-- 5 files changed, 40 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 
073a841d..6ec84ba2 100644 --- a/README.md +++ b/README.md @@ -249,8 +249,9 @@ Optionally, you can use the following command-line flags: | `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with BLAS. Set this to 1000000000 to offload all layers to the GPU. | | `--n_ctx N_CTX` | Size of the prompt context. | | `--llama_cpp_seed SEED` | Seed for llama-cpp models. Default 0 (random). | -| `--n_gqa N_GQA` | grouped-query attention. Must be 8 for llama2 70b. | -| `--rms_norm_eps RMS_NORM_EPS` | Must be 1e-5 for llama2 70b. | +| `--n_gqa N_GQA` | grouped-query attention. Must be 8 for llama-2 70b. | +| `--rms_norm_eps RMS_NORM_EPS` | 5e-6 is a good value for llama-2 models. | +| `--cpu` | Use the CPU version of llama-cpp-python instead of the GPU-accelerated version. | #### AutoGPTQ diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py index 349a5782..e9f4ade6 100644 --- a/modules/llamacpp_hf.py +++ b/modules/llamacpp_hf.py @@ -10,13 +10,22 @@ from transformers.modeling_outputs import CausalLMOutputWithPast from modules import shared from modules.logging_colors import logger +import llama_cpp + if torch.cuda.is_available() and not torch.version.hip: try: - from llama_cpp_cuda import Llama + import llama_cpp_cuda except: - from llama_cpp import Llama + llama_cpp_cuda = None else: - from llama_cpp import Llama + llama_cpp_cuda = None + + +def llama_cpp_lib(): + if shared.args.cpu or llama_cpp_cuda is None: + return llama_cpp + else: + return llama_cpp_cuda class LlamacppHF(PreTrainedModel): @@ -111,5 +120,7 @@ class LlamacppHF(PreTrainedModel): 'logits_all': True, } + Llama = llama_cpp_lib().Llama model = Llama(**params) + return LlamacppHF(model) diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py index 0f9c3470..53177f4f 100644 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -7,13 +7,22 @@ from modules import shared from modules.callbacks import Iteratorize from modules.logging_colors import logger +import llama_cpp + if torch.cuda.is_available() and not torch.version.hip: try: - from llama_cpp_cuda import Llama, LlamaCache, LogitsProcessorList + import llama_cpp_cuda except: - from llama_cpp import Llama, LlamaCache, LogitsProcessorList + llama_cpp_cuda = None else: - from llama_cpp import Llama, LlamaCache, LogitsProcessorList + llama_cpp_cuda = None + + +def llama_cpp_lib(): + if shared.args.cpu or llama_cpp_cuda is None: + return llama_cpp + else: + return llama_cpp_cuda def ban_eos_logits_processor(eos_token, input_ids, logits): @@ -30,6 +39,10 @@ class LlamaCppModel: @classmethod def from_pretrained(self, path): + + Llama = llama_cpp_lib().Llama + LlamaCache = llama_cpp_lib().LlamaCache + result = self() cache_capacity = 0 if shared.args.cache_capacity is not None: @@ -74,6 +87,9 @@ class LlamaCppModel: return self.model.detokenize(tokens) def generate(self, prompt, state, callback=None): + + LogitsProcessorList = llama_cpp_lib().LogitsProcessorList + prompt = prompt if type(prompt) is str else prompt.decode() completion_chunks = self.model.create_completion( prompt=prompt, diff --git a/modules/loaders.py b/modules/loaders.py index 68b48204..aa1afcb8 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -41,6 +41,7 @@ loaders_and_params = { 'llama_cpp_seed', 'compress_pos_emb', 'alpha_value', + 'cpu', ], 'llamacpp_HF': [ 'n_ctx', @@ -55,6 +56,7 @@ loaders_and_params = { 'llama_cpp_seed', 'compress_pos_emb', 'alpha_value', + 'cpu', 'llamacpp_HF_info', ], 'Transformers': [ diff 
--git a/modules/shared.py b/modules/shared.py index bac3fa8c..fc9ba3cf 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -132,8 +132,8 @@ parser.add_argument('--cache-capacity', type=str, help='Maximum cache capacity. parser.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.') parser.add_argument('--n_ctx', type=int, default=2048, help='Size of the prompt context.') parser.add_argument('--llama_cpp_seed', type=int, default=0, help='Seed for llama-cpp models. Default 0 (random)') -parser.add_argument('--n_gqa', type=int, default=0, help='grouped-query attention. Must be 8 for llama2 70b.') -parser.add_argument('--rms_norm_eps', type=float, default=0, help='Must be 1e-5 for llama2 70b.') +parser.add_argument('--n_gqa', type=int, default=0, help='grouped-query attention. Must be 8 for llama-2 70b.') +parser.add_argument('--rms_norm_eps', type=float, default=0, help='5e-6 is a good value for llama-2 models.') # GPTQ parser.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.') From 1839dff7639ff03ebfb6a5d8984070f7fac9d4e0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 3 Aug 2023 08:13:17 -0700 Subject: [PATCH 012/169] Use Esc to Stop the generation --- css/main.js | 11 +++++++++++ server.py | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/css/main.js b/css/main.js index 9663d464..7a2368fe 100644 --- a/css/main.js +++ b/css/main.js @@ -24,3 +24,14 @@ for(i = 0; i < textareaElements.length; i++) { textareaElements[i].classList.add('pretty_scrollbar'); textareaElements[i].style.resize = "none"; } + +// Stop generation on Esc pressed +document.addEventListener("keydown", function(event) { + if (event.key === "Escape") { + // Find the element with id 'stop' and click it + var stopButton = document.getElementById("stop"); + if (stopButton) { + stopButton.click(); + } + } +}); diff --git a/server.py b/server.py index 601ae33f..10ea4ece 100644 --- a/server.py +++ b/server.py @@ -741,7 +741,7 @@ def create_interface(): with gr.Row(): shared.gradio['Generate'] = gr.Button('Generate', variant='primary', elem_classes="small-button") - shared.gradio['Stop'] = gr.Button('Stop', elem_classes="small-button") + shared.gradio['Stop'] = gr.Button('Stop', elem_classes="small-button", elem_id='stop') shared.gradio['Undo'] = gr.Button('Undo', elem_classes="small-button") shared.gradio['Regenerate'] = gr.Button('Regenerate', elem_classes="small-button") @@ -772,7 +772,7 @@ def create_interface(): shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) with gr.Row(): shared.gradio['Generate'] = gr.Button('Generate', variant='primary') - shared.gradio['Stop'] = gr.Button('Stop') + shared.gradio['Stop'] = gr.Button('Stop', elem_id='stop') shared.gradio['Continue'] = gr.Button('Continue') shared.gradio['count_tokens'] = gr.Button('Count tokens') From d578baeb2c2b4363fbbdb9446b332d5d062af0c6 Mon Sep 17 00:00:00 2001 From: rafa-9 <92696534+rafa-9@users.noreply.github.com> Date: Thu, 3 Aug 2023 14:56:40 -0400 Subject: [PATCH 013/169] Use character settings from API properties if present (#3428) --- api-examples/api-example-chat-stream.py | 10 ++++++++-- api-examples/api-example-chat.py | 10 ++++++++-- extensions/api/util.py | 16 ++++++++-------- 3 files 
changed, 24 insertions(+), 12 deletions(-) diff --git a/api-examples/api-example-chat-stream.py b/api-examples/api-example-chat-stream.py index 493661c2..2914d451 100644 --- a/api-examples/api-example-chat-stream.py +++ b/api-examples/api-example-chat-stream.py @@ -25,9 +25,15 @@ async def run(user_input, history): 'mode': 'instruct', # Valid options: 'chat', 'chat-instruct', 'instruct' 'character': 'Example', 'instruction_template': 'Vicuna-v1.1', # Will get autodetected if unset - # 'context_instruct': '', # Optional 'your_name': 'You', - + # 'name1': 'name of user', # Optional + # 'name2': 'name of character', # Optional + # 'context': 'character context', # Optional + # 'greeting': 'greeting', # Optional + # 'name1_instruct': 'You', # Optional + # 'name2_instruct': 'Assistant', # Optional + # 'context_instruct': 'context_instruct', # Optional + # 'turn_template': 'turn_template', # Optional 'regenerate': False, '_continue': False, 'stop_at_newline': False, diff --git a/api-examples/api-example-chat.py b/api-examples/api-example-chat.py index 31641815..e2797f1e 100644 --- a/api-examples/api-example-chat.py +++ b/api-examples/api-example-chat.py @@ -19,9 +19,15 @@ def run(user_input, history): 'mode': 'instruct', # Valid options: 'chat', 'chat-instruct', 'instruct' 'character': 'Example', 'instruction_template': 'Vicuna-v1.1', # Will get autodetected if unset - # 'context_instruct': '', # Optional 'your_name': 'You', - + # 'name1': 'name of user', # Optional + # 'name2': 'name of character', # Optional + # 'context': 'character context', # Optional + # 'greeting': 'greeting', # Optional + # 'name1_instruct': 'You', # Optional + # 'name2_instruct': 'Assistant', # Optional + # 'context_instruct': 'context_instruct', # Optional + # 'turn_template': 'turn_template', # Optional 'regenerate': False, '_continue': False, 'stop_at_newline': False, diff --git a/extensions/api/util.py b/extensions/api/util.py index 5cc259db..ef58a70f 100644 --- a/extensions/api/util.py +++ b/extensions/api/util.py @@ -69,14 +69,14 @@ def build_parameters(body, chat=False): 'stop_at_newline': bool(body.get('stop_at_newline', shared.settings['stop_at_newline'])), 'chat_generation_attempts': int(body.get('chat_generation_attempts', shared.settings['chat_generation_attempts'])), 'mode': str(body.get('mode', 'chat')), - 'name1': name1, - 'name2': name2, - 'context': context, - 'greeting': greeting, - 'name1_instruct': name1_instruct, - 'name2_instruct': name2_instruct, - 'context_instruct': body.get('context_instruct', context_instruct), - 'turn_template': turn_template, + 'name1': str(body.get('name1', name1)), + 'name2': str(body.get('name2', name2)), + 'context': str(body.get('context', context)), + 'greeting': str(body.get('greeting', greeting)), + 'name1_instruct': str(body.get('name1_instruct', name1_instruct)), + 'name2_instruct': str(body.get('name2_instruct', name2_instruct)), + 'context_instruct': str(body.get('context_instruct', context_instruct)), + 'turn_template': str(body.get('turn_template', turn_template)), 'chat-instruct_command': str(body.get('chat-instruct_command', shared.settings['chat-instruct_command'])), 'history': body.get('history', {'internal': [], 'visible': []}) }) From f61573bbde6531b5ac574e5111960d8a9b843f66 Mon Sep 17 00:00:00 2001 From: Paul DeCarlo Date: Thu, 3 Aug 2023 21:57:33 +0300 Subject: [PATCH 014/169] Add standalone Dockerfile for NVIDIA Jetson (#3336) --- docker/Dockerfile.jetson | 51 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 
docker/Dockerfile.jetson diff --git a/docker/Dockerfile.jetson b/docker/Dockerfile.jetson new file mode 100644 index 00000000..cefbc3c2 --- /dev/null +++ b/docker/Dockerfile.jetson @@ -0,0 +1,51 @@ +#Standalone Dockerfile for text-generation-webui on NVIDIA Jetson Embedded devices + +FROM nvcr.io/nvidia/l4t-pytorch:r35.2.1-pth2.0-py3 as builder +ENV TORCH_CUDA_ARCH_LIST Turing +RUN apt-get update && \ + apt-get install -y python3 python3-pip git build-essential python3-dev + +RUN pip3 install --upgrade pip setuptools +RUN git clone https://github.com/g588928812/bitsandbytes_jetsonX.git /build +WORKDIR /build +RUN CUDA_VERSION=118 make cuda11x +RUN mkdir /wheels +RUN python3 setup.py bdist_wheel -d /wheels +RUN rm -rf /build +RUN git clone https://github.com/oobabooga/GPTQ-for-LLaMa /build +WORKDIR /build +RUN pip3 install -r requirements.txt +RUN python3 setup_cuda.py bdist_wheel -d /wheels + +FROM nvcr.io/nvidia/l4t-pytorch:r35.2.1-pth2.0-py3 +COPY --from=builder /wheels /wheels +COPY --from=builder /build /build +RUN apt-get update && \ + apt-get install --no-install-recommends -y git python3-dev python3 python3-pip make g++ && \ + rm -rf /var/lib/apt/lists/* +RUN pip3 install /wheels/*.whl +RUN rm -rf /wheels +WORKDIR /build +RUN pip3 install -r requirements.txt +RUN git clone https://github.com/oobabooga/text-generation-webui /app + +WORKDIR /app +#ENV WEBUI_VERSION="2908a515877ffde2b1684b2353f6d72e6cb4d31b" +#RUN git reset --hard ${WEBUI_VERSION} +RUN pip3 install --upgrade pip setuptools +RUN pip3 install protobuf>=3.3.0 +RUN pip3 install -r requirements.txt +#Force to use bitsandbytes_jetsonX +RUN pip3 uninstall -y bitsandbytes +RUN mkdir /app/repositories +RUN mv /build /app/repositories/GPTQ-for-LLaMa + +#Remove Python 3.10 specific macros +RUN sed -i 's/@functools.cache/@functools.lru_cache(maxsize=None)/g' /app/modules/chat.py +RUN sed -i 's/@functools.cache/@functools.lru_cache(maxsize=None)/g' /app/modules/loaders.py +RUN sed -i 's/@functools.cache/@functools.lru_cache(maxsize=None)/g' /app/modules/presets.py + +EXPOSE 7860 + +ENV CLI_ARGS="--listen" +CMD python3 server.py ${CLI_ARGS} \ No newline at end of file From 32e7cbb635612be3aeff6e2598434e466429a26e Mon Sep 17 00:00:00 2001 From: matatonic <73265741+matatonic@users.noreply.github.com> Date: Thu, 3 Aug 2023 15:02:54 -0400 Subject: [PATCH 015/169] More models: +StableBeluga2 (#3415) --- characters/instruction-following/StableBeluga2.yaml | 4 ++++ models/config.yaml | 5 +++++ 2 files changed, 9 insertions(+) create mode 100644 characters/instruction-following/StableBeluga2.yaml diff --git a/characters/instruction-following/StableBeluga2.yaml b/characters/instruction-following/StableBeluga2.yaml new file mode 100644 index 00000000..cd5675f8 --- /dev/null +++ b/characters/instruction-following/StableBeluga2.yaml @@ -0,0 +1,4 @@ +user: "### User:" +bot: "### Assistant:" +turn_template: "<|user|>\n<|user-message|>\n\n<|bot|>\n<|bot-message|>\n\n" +context: "### System:\nThis is a system prompt, please behave and help the user.\n\n" diff --git a/models/config.yaml b/models/config.yaml index 0c1027c0..4d618de2 100644 --- a/models/config.yaml +++ b/models/config.yaml @@ -283,3 +283,8 @@ TheBloke_WizardLM-30B-GPTQ: .*newhope: mode: 'instruct' instruction_template: 'NewHope' +.*stablebeluga2: + mode: 'instruct' + instruction_template: 'StableBeluga2' + truncation_length: 4096 + rms_norm_eps: 5.0e-6 From 8f98268252ab6fe4b54609e7abac752eff268ea5 Mon Sep 17 00:00:00 2001 From: matatonic <73265741+matatonic@users.noreply.github.com> 
Date: Thu, 3 Aug 2023 15:10:49 -0400 Subject: [PATCH 016/169] extensions/openai: include content-length for json replies (#3416) --- extensions/openai/script.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/extensions/openai/script.py b/extensions/openai/script.py index f95205a5..d1faa019 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -67,10 +67,13 @@ class Handler(BaseHTTPRequestHandler): self.send_response(code) self.send_access_control_headers() self.send_header('Content-Type', 'application/json') - self.end_headers() response = json.dumps(ret) r_utf8 = response.encode('utf-8') + + self.send_header('Content-Length', str(len(r_utf8))) + self.end_headers() + self.wfile.write(r_utf8) if not no_debug: debug_msg(r_utf8) From 4e6dc6d99d4b8288d632a966750b2b42e3c8d47e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 3 Aug 2023 14:36:35 -0700 Subject: [PATCH 017/169] Add Contributing guidelines --- .github/pull_request_template.md | 3 +++ README.md | 5 +---- 2 files changed, 4 insertions(+), 4 deletions(-) create mode 100644 .github/pull_request_template.md diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 00000000..51e26b13 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,3 @@ +## Checklist: + +- [ ] I have read the [Contributing guidelines](https://github.com/oobabooga/text-generation-webui/wiki/Contributing-guidelines). diff --git a/README.md b/README.md index 6ec84ba2..4756672e 100644 --- a/README.md +++ b/README.md @@ -341,10 +341,7 @@ The presets that are included by default are the result of a contest that receiv ## Contributing -* Pull requests, suggestions, and issue reports are welcome. -* Make sure to carefully [search](https://github.com/oobabooga/text-generation-webui/issues) existing issues before starting a new one. -* If you have some experience with git, testing an open pull request and leaving a comment on whether it works as expected or not is immensely helpful. -* A simple way to contribute, even if you are not a programmer, is to leave a 👍 on an issue or pull request that you find relevant. +If you would like to contribute to the project, check out the [Contributing guidelines](https://github.com/oobabooga/text-generation-webui/wiki/Contributing-guidelines). 
## Community From f4005164f4318ce8ba728d0ed7de7b7d40315bf3 Mon Sep 17 00:00:00 2001 From: Pete <33569918+jparmstr@users.noreply.github.com> Date: Thu, 3 Aug 2023 19:01:15 -0400 Subject: [PATCH 018/169] Fix llama.cpp truncation (#3400) --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> --- modules/llamacpp_model.py | 7 +++++++ modules/text_generation.py | 1 - 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py index 53177f4f..e5401378 100644 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -6,6 +6,7 @@ import torch from modules import shared from modules.callbacks import Iteratorize from modules.logging_colors import logger +from modules.text_generation import get_max_prompt_length import llama_cpp @@ -91,6 +92,12 @@ class LlamaCppModel: LogitsProcessorList = llama_cpp_lib().LogitsProcessorList prompt = prompt if type(prompt) is str else prompt.decode() + + # Handle truncation + prompt = self.encode(prompt) + prompt = prompt[-get_max_prompt_length(state):] + prompt = self.decode(prompt).decode('utf-8') + completion_chunks = self.model.create_completion( prompt=prompt, max_tokens=state['max_new_tokens'], diff --git a/modules/text_generation.py b/modules/text_generation.py index f6f71990..7507a731 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -39,7 +39,6 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel']: input_ids = shared.tokenizer.encode(str(prompt)) input_ids = np.array(input_ids).reshape(1, len(input_ids)) - return input_ids else: input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens) From 4b3384e353b9630bdc16efe946c02daa78b33f48 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 3 Aug 2023 17:10:57 -0700 Subject: [PATCH 019/169] Handle unfinished lists during markdown streaming --- download-model.py | 12 ++++++------ modules/html_generator.py | 22 ++++++++++++++++++++-- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/download-model.py b/download-model.py index 0f650516..e1afa9ef 100644 --- a/download-model.py +++ b/download-model.py @@ -75,12 +75,12 @@ class ModelDownloader: if not is_lora and fname.endswith(('adapter_config.json', 'adapter_model.bin')): is_lora = True - is_pytorch = re.match("(pytorch|adapter|gptq)_model.*\.bin", fname) - is_safetensors = re.match(".*\.safetensors", fname) - is_pt = re.match(".*\.pt", fname) - is_ggml = re.match(".*ggml.*\.bin", fname) - is_tokenizer = re.match("(tokenizer|ice|spiece).*\.model", fname) - is_text = re.match(".*\.(txt|json|py|md)", fname) or is_tokenizer + is_pytorch = re.match(r"(pytorch|adapter|gptq)_model.*\.bin", fname) + is_safetensors = re.match(r".*\.safetensors", fname) + is_pt = re.match(r".*\.pt", fname) + is_ggml = re.match(r".*ggml.*\.bin", fname) + is_tokenizer = re.match(r"(tokenizer|ice|spiece).*\.model", fname) + is_text = re.match(r".*\.(txt|json|py|md)", fname) or is_tokenizer if any((is_pytorch, is_safetensors, is_pt, is_ggml, is_tokenizer, is_text)): if 'lfs' in dict[i]: sha256.append([fname, dict[i]['lfs']['oid']]) diff --git a/modules/html_generator.py b/modules/html_generator.py index ab0aeab0..c6ca13b6 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -61,8 +61,26 @@ def convert_to_markdown(string): if is_code: result = result + '```' 
# Unfinished code block
-    string = result.strip()
-    return markdown.markdown(string, extensions=['fenced_code', 'tables'])
+    result = result.strip()
+
+    # Unfinished list, like "\n1.". A |delete| string is added and then
+    # removed to force a <ul> or <ol> to be generated instead of a <p>.

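A standalone illustration of the trick described in that comment: while a numbered list is still streaming, a throwaway |delete| item is appended before rendering so the markdown converter emits a list, and the placeholder is then cut back out of the resulting HTML. This is only a sketch; the real function also handles code blocks and tables:

```python
import re

import markdown


def render_partial_list(text: str) -> str:
    delete_str = '|delete|'
    if re.search(r'(\d+\.?)$', text):        # text currently ends in "2" or "2."
        if not text.endswith('.'):
            text += '.'
        text = re.sub(r'(\d+\.)$', r'\g<1> ' + delete_str, text)
        html = markdown.markdown(text, extensions=['fenced_code', 'tables'])
        pos = html.rfind(delete_str)
        if pos > -1:
            html = html[:pos] + html[pos + len(delete_str):]
        return html

    return markdown.markdown(text, extensions=['fenced_code', 'tables'])


print(render_partial_list("Steps:\n\n1. Boil water\n2."))  # renders an ordered list, not a bare paragraph
```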
    . + if re.search(r'(\d+\.?)$', result): + delete_str = '|delete|' + + if not result.endswith('.'): + result += '.' + + result = re.sub(r'(\d+\.)$', r'\g<1> ' + delete_str, result) + + html = markdown.markdown(result, extensions=['fenced_code', 'tables']) + pos = html.rfind(delete_str) + if pos > -1: + html = html[:pos] + html[pos + len(delete_str):] + else: + html = markdown.markdown(result, extensions=['fenced_code', 'tables']) + + return html def generate_basic_html(string): From 2336b75d925a94ff121073f6287bcdbba81f0e7b Mon Sep 17 00:00:00 2001 From: missionfloyd Date: Thu, 3 Aug 2023 22:58:37 -0600 Subject: [PATCH 020/169] Remove unnecessary chat.js (#3445) --- css/chat.css | 7 ++++++- css/chat.js | 4 ---- modules/ui.py | 2 -- server.py | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) delete mode 100644 css/chat.js diff --git a/css/chat.css b/css/chat.css index ad76f5cc..67bbe512 100644 --- a/css/chat.css +++ b/css/chat.css @@ -26,7 +26,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { #extensions { padding: 0; - padding: 0; } #gradio-chatbot { @@ -46,6 +45,12 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { min-width: 0 !important; } +#main > :first-child, #extensions { + max-width: 800px; + margin-left: auto; + margin-right: auto; +} + @media screen and (max-width: 688px) { #main { padding: 0px; diff --git a/css/chat.js b/css/chat.js deleted file mode 100644 index e304f125..00000000 --- a/css/chat.js +++ /dev/null @@ -1,4 +0,0 @@ -document.getElementById("main").childNodes[0].style = "max-width: 800px; margin-left: auto; margin-right: auto"; -document.getElementById("extensions").style.setProperty("max-width", "800px"); -document.getElementById("extensions").style.setProperty("margin-left", "auto"); -document.getElementById("extensions").style.setProperty("margin-right", "auto"); diff --git a/modules/ui.py b/modules/ui.py index df36a331..eed2ef66 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -13,8 +13,6 @@ with open(Path(__file__).resolve().parent / '../css/chat.css', 'r') as f: chat_css = f.read() with open(Path(__file__).resolve().parent / '../css/main.js', 'r') as f: main_js = f.read() -with open(Path(__file__).resolve().parent / '../css/chat.js', 'r') as f: - chat_js = f.read() with open(Path(__file__).resolve().parent / '../css/save_files.js', 'r') as f: save_files_js = f.read() diff --git a/server.py b/server.py index 10ea4ece..d1a23bbe 100644 --- a/server.py +++ b/server.py @@ -597,7 +597,7 @@ def create_interface(): # css/js strings css = ui.css if not shared.is_chat() else ui.css + ui.chat_css - js = ui.main_js if not shared.is_chat() else ui.main_js + ui.chat_js + js = ui.main_js css += apply_extensions('css') js += apply_extensions('js') From ed57a79c6e44c1eafd4667ebd19e72a2143b7a4d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 4 Aug 2023 02:29:14 -0300 Subject: [PATCH 021/169] Add back silero preview by @missionfloyd (#3446) --- extensions/silero_tts/harvard_sentences.txt | 720 ++++++++++++++++++++ extensions/silero_tts/script.py | 42 +- extensions/silero_tts/style.css | 8 + 3 files changed, 768 insertions(+), 2 deletions(-) create mode 100644 extensions/silero_tts/harvard_sentences.txt create mode 100644 extensions/silero_tts/style.css diff --git a/extensions/silero_tts/harvard_sentences.txt b/extensions/silero_tts/harvard_sentences.txt new file mode 100644 index 00000000..958d7f3c --- /dev/null +++ b/extensions/silero_tts/harvard_sentences.txt @@ -0,0 +1,720 @@ +The birch canoe slid on the smooth 
planks. +Glue the sheet to the dark blue background. +It's easy to tell the depth of a well. +These days a chicken leg is a rare dish. +Rice is often served in round bowls. +The juice of lemons makes fine punch. +The box was thrown beside the parked truck. +The hogs were fed chopped corn and garbage. +Four hours of steady work faced us. +A large size in stockings is hard to sell. +The boy was there when the sun rose. +A rod is used to catch pink salmon. +The source of the huge river is the clear spring. +Kick the ball straight and follow through. +Help the woman get back to her feet. +A pot of tea helps to pass the evening. +Smoky fires lack flame and heat. +The soft cushion broke the man's fall. +The salt breeze came across from the sea. +The girl at the booth sold fifty bonds. +The small pup gnawed a hole in the sock. +The fish twisted and turned on the bent hook. +Press the pants and sew a button on the vest. +The swan dive was far short of perfect. +The beauty of the view stunned the young boy. +Two blue fish swam in the tank. +Her purse was full of useless trash. +The colt reared and threw the tall rider. +It snowed, rained, and hailed the same morning. +Read verse out loud for pleasure. +Hoist the load to your left shoulder. +Take the winding path to reach the lake. +Note closely the size of the gas tank. +Wipe the grease off his dirty face. +Mend the coat before you go out. +The wrist was badly strained and hung limp. +The stray cat gave birth to kittens. +The young girl gave no clear response. +The meal was cooked before the bell rang. +What joy there is in living. +A king ruled the state in the early days. +The ship was torn apart on the sharp reef. +Sickness kept him home the third week. +The wide road shimmered in the hot sun. +The lazy cow lay in the cool grass. +Lift the square stone over the fence. +The rope will bind the seven books at once. +Hop over the fence and plunge in. +The friendly gang left the drug store. +Mesh wire keeps chicks inside. +The frosty air passed through the coat. +The crooked maze failed to fool the mouse. +Adding fast leads to wrong sums. +The show was a flop from the very start. +A saw is a tool used for making boards. +The wagon moved on well oiled wheels. +March the soldiers past the next hill. +A cup of sugar makes sweet fudge. +Place a rosebush near the porch steps. +Both lost their lives in the raging storm. +We talked of the side show in the circus. +Use a pencil to write the first draft. +He ran half way to the hardware store. +The clock struck to mark the third period. +A small creek cut across the field. +Cars and busses stalled in snow drifts. +The set of china hit the floor with a crash. +This is a grand season for hikes on the road. +The dune rose from the edge of the water. +Those words were the cue for the actor to leave. +A yacht slid around the point into the bay. +The two met while playing on the sand. +The ink stain dried on the finished page. +The walled town was seized without a fight. +The lease ran out in sixteen weeks. +A tame squirrel makes a nice pet. +The horn of the car woke the sleeping cop. +The heart beat strongly and with firm strokes. +The pearl was worn in a thin silver ring. +The fruit peel was cut in thick slices. +The Navy attacked the big task force. +See the cat glaring at the scared mouse. +There are more than two factors here. +The hat brim was wide and too droopy. +The lawyer tried to lose his case. +The grass curled around the fence post. +Cut the pie into large parts. +Men strive but seldom get rich. 
+Always close the barn door tight. +He lay prone and hardly moved a limb. +The slush lay deep along the street. +A wisp of cloud hung in the blue air. +A pound of sugar costs more than eggs. +The fin was sharp and cut the clear water. +The play seems dull and quite stupid. +Bail the boat to stop it from sinking. +The term ended in late June that year. +A tusk is used to make costly gifts. +Ten pins were set in order. +The bill was paid every third week. +Oak is strong and also gives shade. +Cats and dogs each hate the other. +The pipe began to rust while new. +Open the crate but don't break the glass. +Add the sum to the product of these three. +Thieves who rob friends deserve jail. +The ripe taste of cheese improves with age. +Act on these orders with great speed. +The hog crawled under the high fence. +Move the vat over the hot fire. +The bark of the pine tree was shiny and dark. +Leaves turn brown and yellow in the fall. +The pennant waved when the wind blew. +Split the log with a quick, sharp blow. +Burn peat after the logs give out. +He ordered peach pie with ice cream. +Weave the carpet on the right hand side. +Hemp is a weed found in parts of the tropics. +A lame back kept his score low. +We find joy in the simplest things. +Type out three lists of orders. +The harder he tried the less he got done. +The boss ran the show with a watchful eye. +The cup cracked and spilled its contents. +Paste can cleanse the most dirty brass. +The slang word for raw whiskey is booze. +It caught its hind paw in a rusty trap. +The wharf could be seen at the farther shore. +Feel the heat of the weak dying flame. +The tiny girl took off her hat. +A cramp is no small danger on a swim. +He said the same phrase thirty times. +Pluck the bright rose without leaves. +Two plus seven is less than ten. +The glow deepened in the eyes of the sweet girl. +Bring your problems to the wise chief. +Write a fond note to the friend you cherish. +Clothes and lodging are free to new men. +We frown when events take a bad turn. +Port is a strong wine with a smoky taste. +The young kid jumped the rusty gate. +Guess the results from the first scores. +A salt pickle tastes fine with ham. +The just claim got the right verdict. +These thistles bend in a high wind. +Pure bred poodles have curls. +The tree top waved in a graceful way. +The spot on the blotter was made by green ink. +Mud was spattered on the front of his white shirt. +The cigar burned a hole in the desk top. +The empty flask stood on the tin tray. +A speedy man can beat this track mark. +He broke a new shoelace that day. +The coffee stand is too high for the couch. +The urge to write short stories is rare. +The pencils have all been used. +The pirates seized the crew of the lost ship. +We tried to replace the coin but failed. +She sewed the torn coat quite neatly. +The sofa cushion is red and of light weight. +The jacket hung on the back of the wide chair. +At that high level the air is pure. +Drop the two when you add the figures. +A filing case is now hard to buy. +An abrupt start does not win the prize. +Wood is best for making toys and blocks. +The office paint was a dull, sad tan. +He knew the skill of the great young actress. +A rag will soak up spilled water. +A shower of dirt fell from the hot pipes. +Steam hissed from the broken valve. +The child almost hurt the small dog. +There was a sound of dry leaves outside. +The sky that morning was clear and bright blue. +Torn scraps littered the stone floor. +Sunday is the best part of the week. 
+The doctor cured him with these pills. +The new girl was fired today at noon. +They felt gay when the ship arrived in port. +Add the store's account to the last cent. +Acid burns holes in wool cloth. +Fairy tales should be fun to write. +Eight miles of woodland burned to waste. +The third act was dull and tired the players. +A young child should not suffer fright. +Add the column and put the sum here. +We admire and love a good cook. +There the flood mark is ten inches. +He carved a head from the round block of marble. +She has a smart way of wearing clothes. +The fruit of a fig tree is apple-shaped. +Corn cobs can be used to kindle a fire. +Where were they when the noise started. +The paper box is full of thumb tacks. +Sell your gift to a buyer at a good gain. +The tongs lay beside the ice pail. +The petals fall with the next puff of wind. +Bring your best compass to the third class. +They could laugh although they were sad. +Farmers came in to thresh the oat crop. +The brown house was on fire to the attic. +The lure is used to catch trout and flounder. +Float the soap on top of the bath water. +A blue crane is a tall wading bird. +A fresh start will work such wonders. +The club rented the rink for the fifth night. +After the dance, they went straight home. +The hostess taught the new maid to serve. +He wrote his last novel there at the inn. +Even the worst will beat his low score. +The cement had dried when he moved it. +The loss of the second ship was hard to take. +The fly made its way along the wall. +Do that with a wooden stick. +Live wires should be kept covered. +The large house had hot water taps. +It is hard to erase blue or red ink. +Write at once or you may forget it. +The doorknob was made of bright clean brass. +The wreck occurred by the bank on Main Street. +A pencil with black lead writes best. +Coax a young calf to drink from a bucket. +Schools for ladies teach charm and grace. +The lamp shone with a steady green flame. +They took the axe and the saw to the forest. +The ancient coin was quite dull and worn. +The shaky barn fell with a loud crash. +Jazz and swing fans like fast music. +Rake the rubbish up and then burn it. +Slash the gold cloth into fine ribbons. +Try to have the court decide the case. +They are pushed back each time they attack. +He broke his ties with groups of former friends. +They floated on the raft to sun their white backs. +The map had an X that meant nothing. +Whitings are small fish caught in nets. +Some ads serve to cheat buyers. +Jerk the rope and the bell rings weakly. +A waxed floor makes us lose balance. +Madam, this is the best brand of corn. +On the islands the sea breeze is soft and mild. +The play began as soon as we sat down. +This will lead the world to more sound and fury. +Add salt before you fry the egg. +The rush for funds reached its peak Tuesday. +The birch looked stark white and lonesome. +The box is held by a bright red snapper. +To make pure ice, you freeze water. +The first worm gets snapped early. +Jump the fence and hurry up the bank. +Yell and clap as the curtain slides back. +They are men who walk the middle of the road. +Both brothers wear the same size. +In some form or other we need fun. +The prince ordered his head chopped off. +The houses are built of red clay bricks. +Ducks fly north but lack a compass. +Fruit flavors are used in fizz drinks. +These pills do less good than others. +Canned pears lack full flavor. +The dark pot hung in the front closet. +Carry the pail to the wall and spill it there. 
+The train brought our hero to the big town. +We are sure that one war is enough. +Gray paint stretched for miles around. +The rude laugh filled the empty room. +High seats are best for football fans. +Tea served from the brown jug is tasty. +A dash of pepper spoils beef stew. +A zestful food is the hot-cross bun. +The horse trotted around the field at a brisk pace. +Find the twin who stole the pearl necklace. +Cut the cord that binds the box tightly. +The red tape bound the smuggled food. +Look in the corner to find the tan shirt. +The cold drizzle will halt the bond drive. +Nine men were hired to dig the ruins. +The junk yard had a mouldy smell. +The flint sputtered and lit a pine torch. +Soak the cloth and drown the sharp odor. +The shelves were bare of both jam or crackers. +A joy to every child is the swan boat. +All sat frozen and watched the screen. +A cloud of dust stung his tender eyes. +To reach the end he needs much courage. +Shape the clay gently into block form. +A ridge on a smooth surface is a bump or flaw. +Hedge apples may stain your hands green. +Quench your thirst, then eat the crackers. +Tight curls get limp on rainy days. +The mute muffled the high tones of the horn. +The gold ring fits only a pierced ear. +The old pan was covered with hard fudge. +Watch the log float in the wide river. +The node on the stalk of wheat grew daily. +The heap of fallen leaves was set on fire. +Write fast if you want to finish early. +His shirt was clean but one button was gone. +The barrel of beer was a brew of malt and hops. +Tin cans are absent from store shelves. +Slide the box into that empty space. +The plant grew large and green in the window. +The beam dropped down on the workmen's head. +Pink clouds floated with the breeze. +She danced like a swan, tall and graceful. +The tube was blown and the tire flat and useless. +It is late morning on the old wall clock. +Let's all join as we sing the last chorus. +The last switch cannot be turned off. +The fight will end in just six minutes. +The store walls were lined with colored frocks. +The peace league met to discuss their plans. +The rise to fame of a person takes luck. +Paper is scarce, so write with much care. +The quick fox jumped on the sleeping cat. +The nozzle of the fire hose was bright brass. +Screw the round cap on as tight as needed. +Time brings us many changes. +The purple tie was ten years old. +Men think and plan and sometimes act. +Fill the ink jar with sticky glue. +He smoke a big pipe with strong contents. +We need grain to keep our mules healthy. +Pack the records in a neat thin case. +The crunch of feet in the snow was the only sound. +The copper bowl shone in the sun's rays. +Boards will warp unless kept dry. +The plush chair leaned against the wall. +Glass will clink when struck by metal. +Bathe and relax in the cool green grass. +Nine rows of soldiers stood in line. +The beach is dry and shallow at low tide. +The idea is to sew both edges straight. +The kitten chased the dog down the street. +Pages bound in cloth make a book. +Try to trace the fine lines of the painting. +Women form less than half of the group. +The zones merge in the central part of town. +A gem in the rough needs work to polish. +Code is used when secrets are sent. +Most of the news is easy for us to hear. +He used the lathe to make brass objects. +The vane on top of the pole revolved in the wind. +Mince pie is a dish served to children. +The clan gathered on each dull night. +Let it burn, it gives us warmth and comfort. 
+A castle built from sand fails to endure. +A child's wit saved the day for us. +Tack the strip of carpet to the worn floor. +Next Tuesday we must vote. +Pour the stew from the pot into the plate. +Each penny shone like new. +The man went to the woods to gather sticks. +The dirt piles were lines along the road. +The logs fell and tumbled into the clear stream. +Just hoist it up and take it away. +A ripe plum is fit for a king's palate. +Our plans right now are hazy. +Brass rings are sold by these natives. +It takes a good trap to capture a bear. +Feed the white mouse some flower seeds. +The thaw came early and freed the stream. +He took the lead and kept it the whole distance. +The key you designed will fit the lock. +Plead to the council to free the poor thief. +Better hash is made of rare beef. +This plank was made for walking on. +The lake sparkled in the red hot sun. +He crawled with care along the ledge. +Tend the sheep while the dog wanders. +It takes a lot of help to finish these. +Mark the spot with a sign painted red. +Take two shares as a fair profit. +The fur of cats goes by many names. +North winds bring colds and fevers. +He asks no person to vouch for him. +Go now and come here later. +A sash of gold silk will trim her dress. +Soap can wash most dirt away. +That move means the game is over. +He wrote down a long list of items. +A siege will crack the strong defense. +Grape juice and water mix well. +Roads are paved with sticky tar. +Fake stones shine but cost little. +The drip of the rain made a pleasant sound. +Smoke poured out of every crack. +Serve the hot rum to the tired heroes. +Much of the story makes good sense. +The sun came up to light the eastern sky. +Heave the line over the port side. +A lathe cuts and trims any wood. +It's a dense crowd in two distinct ways. +His hip struck the knee of the next player. +The stale smell of old beer lingers. +The desk was firm on the shaky floor. +It takes heat to bring out the odor. +Beef is scarcer than some lamb. +Raise the sail and steer the ship northward. +A cone costs five cents on Mondays. +A pod is what peas always grow in. +Jerk the dart from the cork target. +No cement will hold hard wood. +We now have a new base for shipping. +A list of names is carved around the base. +The sheep were led home by a dog. +Three for a dime, the young peddler cried. +The sense of smell is better than that of touch. +No hardship seemed to keep him sad. +Grace makes up for lack of beauty. +Nudge gently but wake her now. +The news struck doubt into restless minds. +Once we stood beside the shore. +A chink in the wall allowed a draft to blow. +Fasten two pins on each side. +A cold dip restores health and zest. +He takes the oath of office each March. +The sand drifts over the sill of the old house. +The point of the steel pen was bent and twisted. +There is a lag between thought and act. +Seed is needed to plant the spring corn. +Draw the chart with heavy black lines. +The boy owed his pal thirty cents. +The chap slipped into the crowd and was lost. +Hats are worn to tea and not to dinner. +The ramp led up to the wide highway. +Beat the dust from the rug onto the lawn. +Say it slowly but make it ring clear. +The straw nest housed five robins. +Screen the porch with woven straw mats. +This horse will nose his way to the finish. +The dry wax protects the deep scratch. +He picked up the dice for a second roll. +These coins will be needed to pay his debt. +The nag pulled the frail cart along. +Twist the valve and release hot steam. 
+The vamp of the shoe had a gold buckle. +The smell of burned rags itches my nose. +New pants lack cuffs and pockets. +The marsh will freeze when cold enough. +They slice the sausage thin with a knife. +The bloom of the rose lasts a few days. +A gray mare walked before the colt. +Breakfast buns are fine with a hot drink. +Bottles hold four kinds of rum. +The man wore a feather in his felt hat. +He wheeled the bike past the winding road. +Drop the ashes on the worn old rug. +The desk and both chairs were painted tan. +Throw out the used paper cup and plate. +A clean neck means a neat collar. +The couch cover and hall drapes were blue. +The stems of the tall glasses cracked and broke. +The wall phone rang loud and often. +The clothes dried on a thin wooden rack. +Turn on the lantern which gives us light. +The cleat sank deeply into the soft turf. +The bills were mailed promptly on the tenth of the month. +To have is better than to wait and hope. +The price is fair for a good antique clock. +The music played on while they talked. +Dispense with a vest on a day like this. +The bunch of grapes was pressed into wine. +He sent the figs, but kept the ripe cherries. +The hinge on the door creaked with old age. +The screen before the fire kept in the sparks. +Fly by night, and you waste little time. +Thick glasses helped him read the print. +Birth and death mark the limits of life. +The chair looked strong but had no bottom. +The kite flew wildly in the high wind. +A fur muff is stylish once more. +The tin box held priceless stones. +We need an end of all such matter. +The case was puzzling to the old and wise. +The bright lanterns were gay on the dark lawn. +We don't get much money but we have fun. +The youth drove with zest, but little skill. +Five years he lived with a shaggy dog. +A fence cuts through the corner lot. +The way to save money is not to spend much. +Shut the hatch before the waves push it in. +The odor of spring makes young hearts jump. +Crack the walnut with your sharp side teeth. +He offered proof in the form of a large chart. +Send the stuff in a thick paper bag. +A quart of milk is water for the most part. +They told wild tales to frighten him. +The three story house was built of stone. +In the rear of the ground floor was a large passage. +A man in a blue sweater sat at the desk. +Oats are a food eaten by horse and man. +Their eyelids droop for want of sleep. +A sip of tea revives his tired friend. +There are many ways to do these things. +Tuck the sheet under the edge of the mat. +A force equal to that would move the earth. +We like to see clear weather. +The work of the tailor is seen on each side. +Take a chance and win a china doll. +Shake the dust from your shoes, stranger. +She was kind to sick old people. +The square wooden crate was packed to be shipped. +The dusty bench stood by the stone wall. +We dress to suit the weather of most days. +Smile when you say nasty words. +A bowl of rice is free with chicken stew. +The water in this well is a source of good health. +Take shelter in this tent, but keep still. +That guy is the writer of a few banned books. +The little tales they tell are false. +The door was barred, locked, and bolted as well. +Ripe pears are fit for a queen's table. +A big wet stain was on the round carpet. +The kite dipped and swayed, but stayed aloft. +The pleasant hours fly by much too soon. +The room was crowded with a wild mob. +This strong arm shall shield your honor. +She blushed when he gave her a white orchid. 
+The beetle droned in the hot June sun. +Press the pedal with your left foot. +Neat plans fail without luck. +The black trunk fell from the landing. +The bank pressed for payment of the debt. +The theft of the pearl pin was kept secret. +Shake hands with this friendly child. +The vast space stretched into the far distance. +A rich farm is rare in this sandy waste. +His wide grin earned many friends. +Flax makes a fine brand of paper. +Hurdle the pit with the aid of a long pole. +A strong bid may scare your partner stiff. +Even a just cause needs power to win. +Peep under the tent and see the clowns. +The leaf drifts along with a slow spin. +Cheap clothes are flashy but don't last. +A thing of small note can cause despair. +Flood the mails with requests for this book. +A thick coat of black paint covered all. +The pencil was cut to be sharp at both ends. +Those last words were a strong statement. +He wrote his name boldly at the top of the sheet. +Dill pickles are sour but taste fine. +Down that road is the way to the grain farmer. +Either mud or dust are found at all times. +The best method is to fix it in place with clips. +If you mumble your speech will be lost. +At night the alarm roused him from a deep sleep. +Read just what the meter says. +Fill your pack with bright trinkets for the poor. +The small red neon lamp went out. +Clams are small, round, soft, and tasty. +The fan whirled its round blades softly. +The line where the edges join was clean. +Breathe deep and smell the piny air. +It matters not if he reads these words or those. +A brown leather bag hung from its strap. +A toad and a frog are hard to tell apart. +A white silk jacket goes with any shoes. +A break in the dam almost caused a flood. +Paint the sockets in the wall dull green. +The child crawled into the dense grass. +Bribes fail where honest men work. +Trample the spark, else the flames will spread. +The hilt of the sword was carved with fine designs. +A round hole was drilled through the thin board. +Footprints showed the path he took up the beach. +She was waiting at my front lawn. +A vent near the edge brought in fresh air. +Prod the old mule with a crooked stick. +It is a band of steel three inches wide. +The pipe ran almost the length of the ditch. +It was hidden from sight by a mass of leaves and shrubs. +The weight of the package was seen on the high scale. +Wake and rise, and step into the green outdoors. +The green light in the brown box flickered. +The brass tube circled the high wall. +The lobes of her ears were pierced to hold rings. +Hold the hammer near the end to drive the nail. +Next Sunday is the twelfth of the month. +Every word and phrase he speaks is true. +He put his last cartridge into the gun and fired. +They took their kids from the public school. +Drive the screw straight into the wood. +Keep the hatch tight and the watch constant. +Sever the twine with a quick snip of the knife. +Paper will dry out when wet. +Slide the catch back and open the desk. +Help the weak to preserve their strength. +A sullen smile gets few friends. +Stop whistling and watch the boys march. +Jerk the cord, and out tumbles the gold. +Slide the tray across the glass top. +The cloud moved in a stately way and was gone. +Light maple makes for a swell room. +Set the piece here and say nothing. +Dull stories make her laugh. +A stiff cord will do to fasten your shoe. +Get the trust fund to the bank early. +Choose between the high road and the low. +A plea for funds seems to come again. 
+He lent his coat to the tall gaunt stranger. +There is a strong chance it will happen once more. +The duke left the park in a silver coach. +Greet the new guests and leave quickly. +When the frost has come it is time for turkey. +Sweet words work better than fierce. +A thin stripe runs down the middle. +A six comes up more often than a ten. +Lush fern grow on the lofty rocks. +The ram scared the school children off. +The team with the best timing looks good. +The farmer swapped his horse for a brown ox. +Sit on the perch and tell the others what to do. +A steep trail is painful for our feet. +The early phase of life moves fast. +Green moss grows on the northern side. +Tea in thin china has a sweet taste. +Pitch the straw through the door of the stable. +The latch on the back gate needed a nail. +The goose was brought straight from the old market. +The sink is the thing in which we pile dishes. +A whiff of it will cure the most stubborn cold. +The facts don't always show who is right. +She flaps her cape as she parades the street. +The loss of the cruiser was a blow to the fleet. +Loop the braid to the left and then over. +Plead with the lawyer to drop the lost cause. +Calves thrive on tender spring grass. +Post no bills on this office wall. +Tear a thin sheet from the yellow pad. +A cruise in warm waters in a sleek yacht is fun. +A streak of color ran down the left edge. +It was done before the boy could see it. +Crouch before you jump or miss the mark. +Pack the kits and don't forget the salt. +The square peg will settle in the round hole. +Fine soap saves tender skin. +Poached eggs and tea must suffice. +Bad nerves are jangled by a door slam. +Ship maps are different from those for planes. +Dimes showered down from all sides. +They sang the same tunes at each party. +The sky in the west is tinged with orange red. +The pods of peas ferment in bare fields. +The horse balked and threw the tall rider. +The hitch between the horse and cart broke. +Pile the coal high in the shed corner. +A gold vase is both rare and costly. +The knife was hung inside its bright sheath. +The rarest spice comes from the far East. +The roof should be tilted at a sharp slant. +A smatter of French is worse than none. +The mule trod the treadmill day and night. +The aim of the contest is to raise a great fund. +To send it now in large amounts is bad. +There is a fine hard tang in salty air. +Cod is the main business of the north shore. +The slab was hewn from heavy blocks of slate. +Dunk the stale biscuits into strong drink. +Hang tinsel from both branches. +Cap the jar with a tight brass cover. +The poor boy missed the boat again. +Be sure to set the lamp firmly in the hole. +Pick a card and slip it under the pack. +A round mat will cover the dull spot. +The first part of the plan needs changing. +A good book informs of what we ought to know. +The mail comes in three batches per day. +You cannot brew tea in a cold pot. +Dots of light betrayed the black cat. +Put the chart on the mantel and tack it down. +The night shift men rate extra pay. +The red paper brightened the dim stage. +See the player scoot to third base. +Slide the bill between the two leaves. +Many hands help get the job done. +We don't like to admit our small faults. +No doubt about the way the wind blows. +Dig deep in the earth for pirate's gold. +The steady drip is worse than a drenching rain. +A flat pack takes less luggage space. +Green ice frosted the punch bowl. +A stuffed chair slipped from the moving van. 
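These sentences are the pool that the extension's new random_sentence() helper draws from when the preview textbox is left empty. A minimal self-contained sketch of that selection step, using the file path from this patch (the strip() is only for display):

```python
import random
from pathlib import Path


def random_sentence(path: str = "extensions/silero_tts/harvard_sentences.txt") -> str:
    # Pick one phonetically balanced sentence to use as TTS preview text.
    with open(Path(path), encoding="utf-8") as f:
        return random.choice(f.read().splitlines()).strip()


if __name__ == "__main__":
    print(random_sentence())
```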
+The stitch will serve but needs to be shortened. +A thin book fits in the side pocket. +The gloss on top made it unfit to read. +The hail pattered on the burnt brown grass. +Seven seals were stamped on great sheets. +Our troops are set to strike heavy blows. +The store was jammed before the sale could start. +It was a bad error on the part of the new judge. +One step more and the board will collapse. +Take the match and strike it against your shoe. +The pot boiled, but the contents failed to jell. +The baby puts his right foot in his mouth. +The bombs left most of the town in ruins. +Stop and stare at the hard working man. +The streets are narrow and full of sharp turns. +The pup jerked the leash as he saw a feline shape. +Open your book to the first page. +Fish evade the net and swim off. +Dip the pail once and let it settle. +Will you please answer that phone. +The big red apple fell to the ground. +The curtain rose and the show was on. +The young prince became heir to the throne. +He sent the boy on a short errand. +Leave now and you will arrive on time. +The corner store was robbed last night. +A gold ring will please most any girl. +The long journey home took a year. +She saw a cat in the neighbor's house. +A pink shell was found on the sandy beach. +Small children came to see him. +The grass and bushes were wet with dew. +The blind man counted his old coins. +A severe storm tore down the barn. +She called his name many times. +When you hear the bell, come quickly. \ No newline at end of file diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py index 3ecd5bd9..b96a47fd 100644 --- a/extensions/silero_tts/script.py +++ b/extensions/silero_tts/script.py @@ -1,3 +1,4 @@ +import random import time from pathlib import Path @@ -106,6 +107,7 @@ def history_modifier(history): def output_modifier(string, state): global model, current_params, streaming_state + for i in params: if params[i] != current_params[i]: model = load_model() @@ -140,6 +142,35 @@ def setup(): model = load_model() +def random_sentence(): + with open(Path("extensions/silero_tts/harvard_sentences.txt")) as f: + return random.choice(list(f)) + + +def voice_preview(preview_text): + global model, current_params, streaming_state + + for i in params: + if params[i] != current_params[i]: + model = load_model() + current_params = params.copy() + break + + string = tts_preprocessor.preprocess(preview_text or random_sentence()) + + output_file = Path('extensions/silero_tts/outputs/voice_preview.wav') + prosody = f"" + silero_input = f'{prosody}{xmlesc(string)}' + model.save_wav(ssml_text=silero_input, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file)) + + return f'' + + +def custom_css(): + path_to_css = Path(__file__).parent.resolve() / 'style.css' + return open(path_to_css, 'r').read() + + def ui(): # Gradio elements with gr.Accordion("Silero TTS"): @@ -153,13 +184,16 @@ def ui(): v_pitch = gr.Dropdown(value=params['voice_pitch'], choices=voice_pitches, label='Voice pitch') v_speed = gr.Dropdown(value=params['voice_speed'], choices=voice_speeds, label='Voice speed') + with gr.Row(): + preview_text = gr.Text(show_label=False, placeholder="Preview text", elem_id="silero_preview_text") + preview_play = gr.Button("Preview") + preview_audio = gr.HTML(visible=False) + with gr.Row(): convert = gr.Button('Permanently replace audios with the message texts') convert_cancel = gr.Button('Cancel', visible=False) convert_confirm = gr.Button('Confirm (cannot be undone)', 
variant="stop", visible=False) - gr.Markdown('[Click here for Silero audio samples](https://oobabooga.github.io/silero-samples/index.html)') - if shared.is_chat(): # Convert history with confirmation convert_arr = [convert_confirm, convert, convert_cancel] @@ -185,3 +219,7 @@ def ui(): voice.change(lambda x: params.update({"speaker": x}), voice, None) v_pitch.change(lambda x: params.update({"voice_pitch": x}), v_pitch, None) v_speed.change(lambda x: params.update({"voice_speed": x}), v_speed, None) + + # Play preview + preview_text.submit(voice_preview, preview_text, preview_audio) + preview_play.click(voice_preview, preview_text, preview_audio) diff --git a/extensions/silero_tts/style.css b/extensions/silero_tts/style.css new file mode 100644 index 00000000..2ab7aefb --- /dev/null +++ b/extensions/silero_tts/style.css @@ -0,0 +1,8 @@ +.SDAP .hires_opts input[type="number"] { + width: 6em !important; +} + +/* silero_tts preview */ +.form:has(> #silero_preview_text) { + min-width: 75% +} From 8df3cdfd511f3857c5e09038559063ca442e59fc Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 4 Aug 2023 13:57:31 -0300 Subject: [PATCH 022/169] Add SSL certificate support (#3453) --- README.md | 2 ++ modules/shared.py | 2 ++ server.py | 16 +++++++++++----- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 4756672e..218fa765 100644 --- a/README.md +++ b/README.md @@ -317,6 +317,8 @@ Optionally, you can use the following command-line flags: | `--auto-launch` | Open the web UI in the default browser upon launch. | | `--gradio-auth USER:PWD` | set gradio authentication like "username:password"; or comma-delimit multiple like "u1:p1,u2:p2,u3:p3" | | `--gradio-auth-path GRADIO_AUTH_PATH` | Set the gradio authentication file path. The file should contain one or more user:password pairs in this format: "u1:p1,u2:p2,u3:p3" | +| `--ssl-keyfile SSL_KEYFILE` | The path to the SSL certificate key file. | +| `--ssl-certfile SSL_CERTFILE` | The path to the SSL certificate cert file. | #### API diff --git a/modules/shared.py b/modules/shared.py index fc9ba3cf..51017a1b 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -180,6 +180,8 @@ parser.add_argument('--share', action='store_true', help='Create a public URL. T parser.add_argument('--auto-launch', action='store_true', default=False, help='Open the web UI in the default browser upon launch.') parser.add_argument("--gradio-auth", type=str, help='set gradio authentication like "username:password"; or comma-delimit multiple like "u1:p1,u2:p2,u3:p3"', default=None) parser.add_argument("--gradio-auth-path", type=str, help='Set the gradio authentication file path. 
The file should contain one or more user:password pairs in this format: "u1:p1,u2:p2,u3:p3"', default=None) +parser.add_argument("--ssl-keyfile", type=str, help='The path to the SSL certificate key file.', default=None) +parser.add_argument("--ssl-certfile", type=str, help='The path to the SSL certificate cert file.', default=None) # API parser.add_argument('--api', action='store_true', help='Enable the API extension.') diff --git a/server.py b/server.py index d1a23bbe..0e1d199d 100644 --- a/server.py +++ b/server.py @@ -1081,11 +1081,17 @@ def create_interface(): # Launch the interface shared.gradio['interface'].queue() with OpenMonkeyPatch(): - if shared.args.listen: - shared.gradio['interface'].launch(prevent_thread_lock=True, share=shared.args.share, server_name=shared.args.listen_host or '0.0.0.0', server_port=shared.args.listen_port, inbrowser=shared.args.auto_launch, auth=auth) - else: - shared.gradio['interface'].launch(prevent_thread_lock=True, share=shared.args.share, server_port=shared.args.listen_port, inbrowser=shared.args.auto_launch, auth=auth) - + shared.gradio['interface'].launch( + prevent_thread_lock=True, + share=shared.args.share, + server_name = None if not shared.args.listen else (shared.args.listen_host or '0.0.0.0'), + server_port=shared.args.listen_port, + inbrowser=shared.args.auto_launch, + auth=auth, + ssl_verify=False if (shared.args.ssl_keyfile or shared.args.ssl_certfile) else True, + ssl_keyfile=shared.args.ssl_keyfile, + ssl_certfile=shared.args.ssl_certfile + ) if __name__ == "__main__": # Loading custom settings From 6e30f76ba555defce214e47839c8f4f303317931 Mon Sep 17 00:00:00 2001 From: jllllll <3887729+jllllll@users.noreply.github.com> Date: Fri, 4 Aug 2023 17:28:59 -0500 Subject: [PATCH 023/169] Bump bitsandbytes to 0.41.1 (#3457) --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9486f808..fd3b42c4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,8 +19,8 @@ transformers==4.31.* tqdm wandb git+https://github.com/huggingface/peft@96c0277a1b9a381b10ab34dbf84917f9b3b992e6 -bitsandbytes==0.41.0; platform_system != "Windows" -https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.0-py3-none-win_amd64.whl; platform_system == "Windows" +bitsandbytes==0.41.1; platform_system != "Windows" +https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/jllllll/exllama/releases/download/0.0.9/exllama-0.0.9+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" From 23055b21ee59eeea48a8b5696ad04c6fb8bc38f8 Mon Sep 17 00:00:00 2001 From: SodaPrettyCold <139355831+SodaPrettyCold@users.noreply.github.com> Date: Sat, 5 Aug 2023 07:20:28 +0800 Subject: [PATCH 024/169] [Bug fix] Remove html tags form the Prompt sent to Stable Diffusion (#3151) --- extensions/sd_api_pictures/script.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/extensions/sd_api_pictures/script.py b/extensions/sd_api_pictures/script.py index 88a0d940..e33367d4 100644 --- a/extensions/sd_api_pictures/script.py +++ 
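The two new --ssl-keyfile/--ssl-certfile flags are handed straight through to Gradio's launcher, as the server.py hunk above shows. A minimal standalone Gradio sketch of the same call, with placeholder file names; ssl_verify=False is what allows self-signed certificates:

```python
import gradio as gr


def echo(text):
    return text


demo = gr.Interface(echo, "text", "text")
# Serve over HTTPS when a key/cert pair is supplied (paths are placeholders).
demo.launch(
    ssl_keyfile="ssl/key.pem",
    ssl_certfile="ssl/cert.pem",
    ssl_verify=False,
)
```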
b/extensions/sd_api_pictures/script.py @@ -133,6 +133,9 @@ def get_SD_pictures(description, character): if params['manage_VRAM']: give_VRAM_priority('SD') + description = re.sub('', ' ', description) + description = f"({description}:1)" + payload = { "prompt": params['prompt_prefix'] + description, "seed": params['seed'], From 9dcb37e8d4fafb5c1b59f7a56e25fcb9c21e1398 Mon Sep 17 00:00:00 2001 From: Forkoz <59298527+Ph0rk0z@users.noreply.github.com> Date: Sat, 5 Aug 2023 16:45:47 +0000 Subject: [PATCH 025/169] Fix: Mirostat fails on models split across multiple GPUs --- modules/sampler_hijack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py index 0a86b4fd..d5ebbb76 100644 --- a/modules/sampler_hijack.py +++ b/modules/sampler_hijack.py @@ -104,7 +104,7 @@ class MirostatLogitsWarper(LogitsWarper): break # Normalize the probabilities of the remaining words - prob_topk = torch.softmax(sorted_logits, dim=0) + prob_topk = torch.softmax(sorted_logits, dim=0).to('cuda') prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True).to('cuda') From 5ee95d126cac801fd773e4350a1be5b06979b799 Mon Sep 17 00:00:00 2001 From: jllllll <3887729+jllllll@users.noreply.github.com> Date: Sat, 5 Aug 2023 11:46:14 -0500 Subject: [PATCH 026/169] Bump exllama wheels to 0.0.10 (#3467) --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index fd3b42c4..5a46addd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,8 +23,8 @@ bitsandbytes==0.41.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/jllllll/exllama/releases/download/0.0.9/exllama-0.0.9+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" -https://github.com/jllllll/exllama/releases/download/0.0.9/exllama-0.0.9+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" +https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" # llama-cpp-python without GPU support llama-cpp-python==0.1.77; platform_system != "Windows" https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.77/llama_cpp_python-0.1.77-cp310-cp310-win_amd64.whl; platform_system == "Windows" From 44f31731af75eae977cb098de070a62c0e362156 Mon Sep 17 00:00:00 2001 From: jllllll <3887729+jllllll@users.noreply.github.com> Date: Sat, 5 Aug 2023 11:47:16 -0500 Subject: [PATCH 027/169] Create logs dir if missing when saving history (#3462) --- modules/chat.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/chat.py b/modules/chat.py index 5e4eb245..8e562b98 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -394,6 +394,8 @@ def redraw_html(history, name1, name2, mode, style, reset_cache=False): def save_history(history, path=None): p = path or 
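The one-line Mirostat fix above is about device placement: when a model is split across several GPUs, the sorted logits can arrive on a device the sampler does not expect, so the softmax output is moved explicitly before torch.multinomial draws from it. A self-contained sketch of the pattern, with a CPU fallback so it runs anywhere:

```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

sorted_logits = torch.tensor([2.0, 1.0, 0.5])               # may come from any GPU in a split model
prob_topk = torch.softmax(sorted_logits, dim=0).to(device)  # normalize, then pin to one device
prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True).to(device)
print(prev_i.item())
```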
Path('logs/exported_history.json') + if not p.parent.is_dir(): + p.parent.mkdir(parents=True) with open(p, 'w', encoding='utf-8') as f: f.write(json.dumps(history, indent=4)) From 5134878344ced8cfb42bd56ff6dbd357935370f3 Mon Sep 17 00:00:00 2001 From: missionfloyd Date: Sat, 5 Aug 2023 10:53:54 -0600 Subject: [PATCH 028/169] Fix chat message order (#3461) --- css/chat.css | 5 ++ modules/html_generator.py | 104 ++++++++++++++++++-------------------- 2 files changed, 54 insertions(+), 55 deletions(-) diff --git a/css/chat.css b/css/chat.css index 67bbe512..677d86db 100644 --- a/css/chat.css +++ b/css/chat.css @@ -79,6 +79,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { padding-top: 1px; } +.chat > .messages { + display: flex; + flex-direction: column; +} + .message-body li { margin-top: 0.5em !important; margin-bottom: 0.5em !important; diff --git a/modules/html_generator.py b/modules/html_generator.py index c6ca13b6..15c731c3 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -168,10 +168,21 @@ def get_image_cache(path): def generate_instruct_html(history): - output = f'
<style>{instruct_css}</style><div class="chat" id="chat">'
-    for i, _row in enumerate(history[::-1]):
+    output = f'<style>{instruct_css}</style><div class="chat" id="chat"><div class="messages">'
+    for i, _row in enumerate(history):
         row = [convert_to_markdown(entry) for entry in _row]
 
+        if row[0]:  # don't display empty user messages
+            output += f"""
+                  <div class="user-message">
+                    <div class="text">
+                      <div class="message-body">
+                        {row[0]}
+                      </div>
+                    </div>
+                  </div>
+                """
+
         output += f"""
               <div class="assistant-message">
@@ -182,34 +193,38 @@ def generate_instruct_html(history):
             """
 
-        if len(row[0]) == 0:  # don't display empty user messages
-            continue
-
-        output += f"""
-              <div class="user-message">
-                <div class="text">
-                  <div class="message-body">
-                    {row[0]}
-                  </div>
-                </div>
-              </div>
-            """
-
-    output += "</div>"
+    output += "</div></div>"
     return output
 
 
 def generate_cai_chat_html(history, name1, name2, style, reset_cache=False):
-    output = f'<style>{chat_styles[style]}</style><div class="chat" id="chat">'
+    output = f'<style>{chat_styles[style]}</style><div class="chat" id="chat"><div class="messages">'
 
     # We use ?name2 and ?time.time() to force the browser to reset caches
     img_bot = f'<img src="file/cache/pfp_character.png?{name2}">' if Path("cache/pfp_character.png").exists() else ''
     img_me = f'<img src="file/cache/pfp_me.png?{time.time() if reset_cache else ""}">' if Path("cache/pfp_me.png").exists() else ''
 
-    for i, _row in enumerate(history[::-1]):
+    for i, _row in enumerate(history):
         row = [convert_to_markdown(entry) for entry in _row]
 
+        if row[0]:  # don't display empty user messages
+            output += f"""
+                  <div class="message">
+                    <div class="circle-you">
+                      {img_me}
+                    </div>
+                    <div class="text">
+                      <div class="username">
+                        {name1}
+                      </div>
+                      <div class="message-body">
+                        {row[0]}
+                      </div>
+                    </div>
+                  </div>
+                """
+
         output += f"""
               <div class="message">
@@ -226,49 +241,18 @@ def generate_cai_chat_html(history, name1, name2, style, reset_cache=False):
             """
 
-        if len(row[0]) == 0:  # don't display empty user messages
-            continue
-
-        output += f"""
-              <div class="message">
-                <div class="circle-you">
-                  {img_me}
-                </div>
-                <div class="text">
-                  <div class="username">
-                    {name1}
-                  </div>
-                  <div class="message-body">
-                    {row[0]}
-                  </div>
-                </div>
-              </div>
-            """
-
-    output += "</div>"
+    output += "</div></div>"
     return output
 
 
 def generate_chat_html(history, name1, name2, reset_cache=False):
-    output = f'<style>{chat_styles["wpp"]}</style><div class="chat" id="chat">'
+    output = f'<style>{chat_styles["wpp"]}</style><div class="chat" id="chat"><div class="messages">'
 
-    for i, _row in enumerate(history[::-1]):
+    for i, _row in enumerate(history):
         row = [convert_to_markdown(entry) for entry in _row]
 
-        output += f"""
-              <div class="message">
-                <div class="text-bot">
-                  <div class="message-body">
-                    {row[1]}
-                  </div>
-                </div>
-              </div>
-            """
-
-        if len(row[0]) == 0:  # don't display empty user messages
-            continue
-
-        output += f"""
+        if row[0]:  # don't display empty user messages
+            output += f"""
               <div class="message">
@@ -278,7 +262,17 @@ def generate_chat_html(history, name1, name2, reset_cache=False):
             """
 
-    output += "</div>"
+        output += f"""
+              <div class="message">
+                <div class="text-bot">
+                  <div class="message-body">
+                    {row[1]}
+                  </div>
+                </div>
+              </div>
+            """
+
+    output += "</div></div>
    " return output From 0af10ab49bfc1cab80d0126707321a58bd9e3485 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 6 Aug 2023 17:22:48 -0300 Subject: [PATCH 029/169] Add Classifier Free Guidance (CFG) for Transformers/ExLlama (#3325) --- api-examples/api-example-chat-stream.py | 2 + api-examples/api-example-chat.py | 2 + api-examples/api-example-stream.py | 2 + api-examples/api-example.py | 2 + extensions/api/util.py | 2 + extensions/openai/defaults.py | 2 + modules/exllama.py | 97 ++++++++++++++++++++----- modules/exllama_hf.py | 9 +-- modules/llamacpp_hf.py | 9 +-- modules/loaders.py | 8 ++ modules/presets.py | 21 ++++-- modules/shared.py | 1 + modules/text_generation.py | 5 +- modules/ui.py | 2 + requirements.txt | 2 +- server.py | 6 +- settings-template.yaml | 1 + 17 files changed, 131 insertions(+), 42 deletions(-) diff --git a/api-examples/api-example-chat-stream.py b/api-examples/api-example-chat-stream.py index 2914d451..a774f907 100644 --- a/api-examples/api-example-chat-stream.py +++ b/api-examples/api-example-chat-stream.py @@ -63,6 +63,8 @@ async def run(user_input, history): 'mirostat_mode': 0, 'mirostat_tau': 5, 'mirostat_eta': 0.1, + 'guidance_scale': 1, + 'negative_prompt': '', 'seed': -1, 'add_bos_token': True, diff --git a/api-examples/api-example-chat.py b/api-examples/api-example-chat.py index e2797f1e..824bf3a0 100644 --- a/api-examples/api-example-chat.py +++ b/api-examples/api-example-chat.py @@ -57,6 +57,8 @@ def run(user_input, history): 'mirostat_mode': 0, 'mirostat_tau': 5, 'mirostat_eta': 0.1, + 'guidance_scale': 1, + 'negative_prompt': '', 'seed': -1, 'add_bos_token': True, diff --git a/api-examples/api-example-stream.py b/api-examples/api-example-stream.py index 175275f9..bf5eabac 100644 --- a/api-examples/api-example-stream.py +++ b/api-examples/api-example-stream.py @@ -45,6 +45,8 @@ async def run(context): 'mirostat_mode': 0, 'mirostat_tau': 5, 'mirostat_eta': 0.1, + 'guidance_scale': 1, + 'negative_prompt': '', 'seed': -1, 'add_bos_token': True, diff --git a/api-examples/api-example.py b/api-examples/api-example.py index 7f8bc1d2..16029807 100644 --- a/api-examples/api-example.py +++ b/api-examples/api-example.py @@ -37,6 +37,8 @@ def run(prompt): 'mirostat_mode': 0, 'mirostat_tau': 5, 'mirostat_eta': 0.1, + 'guidance_scale': 1, + 'negative_prompt': '', 'seed': -1, 'add_bos_token': True, diff --git a/extensions/api/util.py b/extensions/api/util.py index ef58a70f..2654d046 100644 --- a/extensions/api/util.py +++ b/extensions/api/util.py @@ -43,6 +43,8 @@ def build_parameters(body, chat=False): 'mirostat_mode': int(body.get('mirostat_mode', 0)), 'mirostat_tau': float(body.get('mirostat_tau', 5)), 'mirostat_eta': float(body.get('mirostat_eta', 0.1)), + 'guidance_scale': float(body.get('guidance_scale', 1)), + 'negative_prompt': str(body.get('negative_prompt', '')), 'seed': int(body.get('seed', -1)), 'add_bos_token': bool(body.get('add_bos_token', True)), 'truncation_length': int(body.get('truncation_length', body.get('max_context_length', 2048))), diff --git a/extensions/openai/defaults.py b/extensions/openai/defaults.py index cb8308e7..ffef12d0 100644 --- a/extensions/openai/defaults.py +++ b/extensions/openai/defaults.py @@ -33,6 +33,8 @@ default_req_params = { 'mirostat_mode': 0, 'mirostat_tau': 5.0, 'mirostat_eta': 0.1, + 'guidance_scale': 1, + 'negative_prompt': '', 'ban_eos_token': False, 'skip_special_tokens': True, 'custom_stopping_strings': '', diff --git a/modules/exllama.py b/modules/exllama.py 
index 00b37b9c..dc632a25 100644 --- a/modules/exllama.py +++ b/modules/exllama.py @@ -1,9 +1,11 @@ from pathlib import Path +import torch.nn.functional as F from torch import version as torch_version from modules import shared from modules.logging_colors import logger +from modules.models import clear_torch_cache from modules.text_generation import get_max_prompt_length try: @@ -78,6 +80,21 @@ class ExllamaModel: return result, result def generate_with_streaming(self, prompt, state): + + # The cache batch size must be 2 for CFG and 1 otherwise + if state['guidance_scale'] == 1: + if self.cache.batch_size == 2: + del self.cache + clear_torch_cache() + self.cache = ExLlamaCache(self.model) + self.generator = ExLlamaGenerator(self.model, self.tokenizer, self.cache) + else: + if self.cache.batch_size == 1: + del self.cache + clear_torch_cache() + self.cache = ExLlamaCache(self.model, batch_size=2) + self.generator = ExLlamaGenerator(self.model, self.tokenizer, self.cache) + self.generator.settings.temperature = state['temperature'] self.generator.settings.top_p = state['top_p'] self.generator.settings.top_k = state['top_k'] @@ -89,31 +106,71 @@ class ExllamaModel: else: self.generator.disallow_tokens(None) - self.generator.end_beam_search() + # Case 1: no CFG + if state['guidance_scale'] == 1: + self.generator.end_beam_search() - # Tokenizing the input - ids = self.generator.tokenizer.encode(prompt) - ids = ids[:, -get_max_prompt_length(state):] - if state['auto_max_new_tokens']: - max_new_tokens = state['truncation_length'] - ids.shape[-1] + # Tokenizing the input + ids = self.generator.tokenizer.encode(prompt) + ids = ids[:, -get_max_prompt_length(state):] + if state['auto_max_new_tokens']: + max_new_tokens = state['truncation_length'] - ids.shape[-1] + else: + max_new_tokens = state['max_new_tokens'] + + self.generator.gen_begin_reuse(ids) + initial_len = self.generator.sequence[0].shape[0] + has_leading_space = False + + for i in range(max_new_tokens): + token = self.generator.gen_single_token() + if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'): + has_leading_space = True + + decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:]) + if has_leading_space: + decoded_text = ' ' + decoded_text + + yield decoded_text + if token.item() == self.generator.tokenizer.eos_token_id or shared.stop_everything: + break + + # Case 2: CFG else: - max_new_tokens = state['max_new_tokens'] + alpha = state['guidance_scale'] + prompts = [prompt, state['negative_prompt'] or ''] - self.generator.gen_begin_reuse(ids) - initial_len = self.generator.sequence[0].shape[0] - has_leading_space = False - for i in range(max_new_tokens): - token = self.generator.gen_single_token() - if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'): - has_leading_space = True + ids, mask = self.tokenizer.encode(prompts, return_mask=True) + if state['auto_max_new_tokens']: + max_new_tokens = state['truncation_length'] - ids[0].shape[-1] + else: + max_new_tokens = state['max_new_tokens'] - decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:]) - if has_leading_space: - decoded_text = ' ' + decoded_text + self.generator.gen_begin(ids, mask=mask) + initial_len = self.generator.sequence[0].shape[0] + has_leading_space = False - yield decoded_text - if token.item() == self.generator.tokenizer.eos_token_id or shared.stop_everything: - break + for i in range(max_new_tokens): + logits = 
self.model.forward(self.generator.sequence[:, -1:], self.cache, input_mask=mask) + self.generator.apply_rep_penalty(logits) + + logits = F.log_softmax(logits, dim=-1) + logits_mixed = alpha * logits[0] + (1 - alpha) * logits[1] + + token, _ = self.generator.sample_current(logits_mixed) + if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'): + has_leading_space = True + + decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:]) + if has_leading_space: + decoded_text = ' ' + decoded_text + + yield decoded_text + if token.item() == self.tokenizer.eos_token_id or shared.stop_everything: + break + + batch_token = token.repeat(2, 1) + self.generator.gen_accept_token(batch_token) def generate(self, prompt, state): output = '' diff --git a/modules/exllama_hf.py b/modules/exllama_hf.py index fd775b4a..ebafb4f7 100644 --- a/modules/exllama_hf.py +++ b/modules/exllama_hf.py @@ -47,12 +47,11 @@ class ExllamaHF(PreTrainedModel): return torch.device(0) def __call__(self, *args, **kwargs): - # TODO: Some decoding methods (such as Contrastive Search) may not work at this time - assert len(args) == 0, 'no *args should be passed to forward' + input_ids = args[0] if len(args) > 0 else kwargs['input_ids'] use_cache = kwargs.get('use_cache', True) labels = kwargs.get('labels', None) - seq = kwargs['input_ids'][0].tolist() - cache = kwargs['past_key_values'] if 'past_key_values' in kwargs else None + cache = kwargs.get('past_key_values', None) + seq = input_ids[0].tolist() if labels is None: if cache is None: @@ -60,7 +59,7 @@ class ExllamaHF(PreTrainedModel): cache = self.ex_cache self.ex_model.forward(torch.tensor([seq[:-1]], dtype=torch.long), cache, preprocess_only=True, lora=self.lora) - logits = self.ex_model.forward(torch.tensor([seq[-1:]], dtype=torch.long), cache, lora=self.lora).to(kwargs['input_ids'].device) + logits = self.ex_model.forward(torch.tensor([seq[-1:]], dtype=torch.long), cache, lora=self.lora).to(input_ids.device) else: if cache is None: self.ex_cache.current_seq_len = 0 diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py index e9f4ade6..df9e0b2e 100644 --- a/modules/llamacpp_hf.py +++ b/modules/llamacpp_hf.py @@ -49,12 +49,11 @@ class LlamacppHF(PreTrainedModel): return torch.device(0) def __call__(self, *args, **kwargs): - # TODO: Some decoding methods (such as Contrastive Search) may not work at this time - assert len(args) == 0, 'no *args should be passed to forward' + input_ids = args[0] if len(args) > 0 else kwargs['input_ids'] use_cache = kwargs.get('use_cache', True) labels = kwargs.get('labels', None) - seq = kwargs['input_ids'][0].tolist() - cache = kwargs['past_key_values'] if 'past_key_values' in kwargs else None + cache = kwargs.get('past_key_values', None) + seq = input_ids[0].tolist() # Make the forward call seq_tensor = torch.tensor(seq) @@ -70,7 +69,7 @@ class LlamacppHF(PreTrainedModel): self.model.reset() self.model.eval(seq) logits = torch.tensor(self.model.eval_logits) - logits = logits.view(1, logits.shape[0], logits.shape[1]).to(kwargs['input_ids'].device) + logits = logits.view(1, logits.shape[0], logits.shape[1]).to(input_ids.device) self.cache = seq_tensor diff --git a/modules/loaders.py b/modules/loaders.py index aa1afcb8..519e47a7 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -115,6 +115,8 @@ loaders_samplers = { 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', + 'guidance_scale', + 'negative_prompt', 'ban_eos_token', 'add_bos_token', 'skip_special_tokens', @@ -152,6 
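The heart of the ExLlama CFG path above is a log-space interpolation between the logits produced by the guided prompt and by the negative prompt, with alpha playing the role of guidance_scale. A small standalone sketch of that mixing step:

```python
import torch
import torch.nn.functional as F


def cfg_mix(positive_logits: torch.Tensor, negative_logits: torch.Tensor, alpha: float) -> torch.Tensor:
    # alpha == 1 reproduces the positive distribution; alpha > 1 extrapolates
    # away from the negative one, mirroring alpha * logits[0] + (1 - alpha) * logits[1].
    logp_pos = F.log_softmax(positive_logits, dim=-1)
    logp_neg = F.log_softmax(negative_logits, dim=-1)
    return alpha * logp_pos + (1 - alpha) * logp_neg


pos = torch.tensor([1.0, 2.0, 0.5])
neg = torch.tensor([2.0, 0.5, 0.5])
print(cfg_mix(pos, neg, alpha=1.5))
```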
+154,8 @@ loaders_samplers = { 'repetition_penalty', 'repetition_penalty_range', 'seed', + 'guidance_scale', + 'negative_prompt', 'ban_eos_token', 'auto_max_new_tokens', }, @@ -178,6 +182,8 @@ loaders_samplers = { 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', + 'guidance_scale', + 'negative_prompt', 'ban_eos_token', 'add_bos_token', 'skip_special_tokens', @@ -206,6 +212,8 @@ loaders_samplers = { 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', + 'guidance_scale', + 'negative_prompt', 'ban_eos_token', 'add_bos_token', 'skip_special_tokens', diff --git a/modules/presets.py b/modules/presets.py index 072b15fd..32b7f71c 100644 --- a/modules/presets.py +++ b/modules/presets.py @@ -9,6 +9,7 @@ def default_preset(): 'do_sample': True, 'temperature': 1, 'top_p': 1, + 'top_k': 0, 'typical_p': 1, 'epsilon_cutoff': 0, 'eta_cutoff': 0, @@ -17,19 +18,23 @@ def default_preset(): 'repetition_penalty': 1, 'repetition_penalty_range': 0, 'encoder_repetition_penalty': 1, - 'top_k': 0, - 'num_beams': 1, - 'penalty_alpha': 0, - 'min_length': 0, - 'length_penalty': 1, 'no_repeat_ngram_size': 0, - 'early_stopping': False, + 'min_length': 0, + 'guidance_scale': 1, 'mirostat_mode': 0, 'mirostat_tau': 5.0, 'mirostat_eta': 0.1, + 'penalty_alpha': 0, + 'num_beams': 1, + 'length_penalty': 1, + 'early_stopping': False, } +def presets_params(): + return [k for k in default_preset()] + + def load_preset(name): generate_params = default_preset() if name not in ['None', None, '']: @@ -51,12 +56,12 @@ def load_preset_memoized(name): def load_preset_for_ui(name, state): generate_params = load_preset(name) state.update(generate_params) - return state, *[generate_params[k] for k in ['do_sample', 'temperature', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'repetition_penalty_range', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'tfs', 'top_a']] + return state, *[generate_params[k] for k in presets_params()] def generate_preset_yaml(state): defaults = default_preset() - data = {k: state[k] for k in ['do_sample', 'temperature', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'repetition_penalty_range', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'tfs', 'top_a']} + data = {k: state[k] for k in presets_params()} # Remove entries that are identical to the defaults for k in list(data.keys()): diff --git a/modules/shared.py b/modules/shared.py index 51017a1b..be5be109 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -42,6 +42,7 @@ settings = { 'max_new_tokens_max': 4096, 'auto_max_new_tokens': False, 'seed': -1, + 'negative_prompt': '', 'character': 'None', 'name1': 'You', 'name2': 'Assistant', diff --git a/modules/text_generation.py b/modules/text_generation.py index 7507a731..df9d708b 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -226,9 +226,12 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False): def generate_reply_HF(question, original_question, seed, state, stopping_strings=None, is_chat=False): generate_params = {} - for k in ['max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'repetition_penalty_range', 'encoder_repetition_penalty', 'top_k', 'min_length', 
'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'tfs', 'top_a', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta']: + for k in ['max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'repetition_penalty_range', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'tfs', 'top_a', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'guidance_scale']: generate_params[k] = state[k] + if state['negative_prompt'] != '': + generate_params['negative_prompt_ids'] = encode(state['negative_prompt']) + for k in ['epsilon_cutoff', 'eta_cutoff']: if state[k] > 0: generate_params[k] = state[k] * 1e-4 diff --git a/modules/ui.py b/modules/ui.py index eed2ef66..8a7f9f47 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -100,6 +100,8 @@ def list_interface_input_elements(): 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', + 'negative_prompt', + 'guidance_scale', 'add_bos_token', 'ban_eos_token', 'truncation_length', diff --git a/requirements.txt b/requirements.txt index 5a46addd..9deadd48 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,10 +15,10 @@ safetensors==0.3.1 scipy sentencepiece tensorboard -transformers==4.31.* tqdm wandb git+https://github.com/huggingface/peft@96c0277a1b9a381b10ab34dbf84917f9b3b992e6 +git+https://github.com/huggingface/transformers@d533465150532b0c5de167b574e59f64c68b1154 bitsandbytes==0.41.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" diff --git a/server.py b/server.py index 0e1d199d..adff9669 100644 --- a/server.py +++ b/server.py @@ -229,7 +229,7 @@ def create_model_menus(): shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0) shared.gradio['autogptq_info'] = gr.Markdown('* ExLlama_HF is recommended over AutoGPTQ for models derived from LLaMA.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') - shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=2048, maximum=16384, step=256, info='Maximum sequence length.', value=shared.args.max_seq_len) + shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=16384, step=256, info='Maximum sequence length.', value=shared.args.max_seq_len) shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.', value=shared.args.compress_pos_emb) shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=32, step=1, info='Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value) @@ -408,6 +408,8 @@ def create_settings_menus(default_preset): with gr.Box(): with gr.Row(): with gr.Column(): + shared.gradio['guidance_scale'] = gr.Slider(-0.5, 2.5, step=0.05, value=generate_params['guidance_scale'], label='guidance_scale', info='For CFG. 
1.5 is a good value.') + shared.gradio['negative_prompt'] = gr.Textbox(value=shared.settings['negative_prompt'], label='Negative prompt') shared.gradio['mirostat_mode'] = gr.Slider(0, 2, step=1, value=generate_params['mirostat_mode'], label='mirostat_mode', info='mode=1 is for llama.cpp only.') shared.gradio['mirostat_tau'] = gr.Slider(0, 10, step=0.01, value=generate_params['mirostat_tau'], label='mirostat_tau') shared.gradio['mirostat_eta'] = gr.Slider(0, 1, step=0.01, value=generate_params['mirostat_eta'], label='mirostat_eta') @@ -433,7 +435,7 @@ def create_settings_menus(default_preset): shared.gradio['stream'] = gr.Checkbox(value=not shared.args.no_stream, label='Activate text streaming') filter_by_loader.change(loaders.blacklist_samplers, filter_by_loader, gradio(loaders.list_all_samplers()), show_progress=False) - shared.gradio['preset_menu'].change(presets.load_preset_for_ui, gradio('preset_menu', 'interface_state'), gradio('interface_state', 'do_sample', 'temperature', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'repetition_penalty_range', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'tfs', 'top_a')) + shared.gradio['preset_menu'].change(presets.load_preset_for_ui, gradio('preset_menu', 'interface_state'), gradio('interface_state') + gradio(presets.presets_params())) def create_file_saving_menus(): diff --git a/settings-template.yaml b/settings-template.yaml index 62e86371..a0c53b33 100644 --- a/settings-template.yaml +++ b/settings-template.yaml @@ -5,6 +5,7 @@ max_new_tokens_min: 1 max_new_tokens_max: 4096 auto_max_new_tokens: false seed: -1 +negative_prompt: '' character: None name1: You name2: Assistant From d4b851bdc835669072df1b243cf2d6739df5d2b0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 6 Aug 2023 13:42:43 -0700 Subject: [PATCH 030/169] Credit turboderp --- modules/exllama.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/exllama.py b/modules/exllama.py index dc632a25..30c37634 100644 --- a/modules/exllama.py +++ b/modules/exllama.py @@ -136,6 +136,7 @@ class ExllamaModel: break # Case 2: CFG + # Copied from https://github.com/turboderp/exllama/blob/master/example_cfg.py else: alpha = state['guidance_scale'] prompts = [prompt, state['negative_prompt'] or ''] From 65aa11890ff947adf4e1d38ff174c3a17a512c2e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 6 Aug 2023 21:49:27 -0300 Subject: [PATCH 031/169] Refactor everything (#3481) --- {css => js}/main.js | 0 {css => js}/save_files.js | 0 modules/chat.py | 1 - modules/llamacpp_hf.py | 4 +- modules/llamacpp_model.py | 2 +- modules/models.py | 4 +- modules/models_settings.py | 4 +- modules/prompts.py | 51 ++ modules/text_generation.py | 173 +++--- modules/training.py | 163 +++--- modules/ui.py | 10 +- modules/ui_chat.py | 262 +++++++++ modules/ui_default.py | 94 ++++ modules/ui_file_saving.py | 108 ++++ modules/ui_model_menu.py | 229 ++++++++ modules/ui_notebook.py | 98 ++++ modules/ui_parameters.py | 143 +++++ modules/ui_session.py | 71 +++ server.py | 1067 +++--------------------------------- 19 files changed, 1306 insertions(+), 1178 deletions(-) rename {css => js}/main.js (100%) rename {css => js}/save_files.js (100%) create mode 100644 modules/prompts.py create mode 100644 modules/ui_chat.py create mode 100644 modules/ui_default.py 
create mode 100644 modules/ui_file_saving.py create mode 100644 modules/ui_model_menu.py create mode 100644 modules/ui_notebook.py create mode 100644 modules/ui_parameters.py create mode 100644 modules/ui_session.py diff --git a/css/main.js b/js/main.js similarity index 100% rename from css/main.js rename to js/main.js diff --git a/css/save_files.js b/js/save_files.js similarity index 100% rename from css/save_files.js rename to js/save_files.js diff --git a/modules/chat.py b/modules/chat.py index 8e562b98..8a86523c 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -3,7 +3,6 @@ import copy import functools import json import re -from datetime import datetime from pathlib import Path import gradio as gr diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py index df9e0b2e..fa0554cd 100644 --- a/modules/llamacpp_hf.py +++ b/modules/llamacpp_hf.py @@ -64,7 +64,7 @@ class LlamacppHF(PreTrainedModel): else: self.model.eval([seq[-1]]) - logits = torch.tensor(self.model.scores[self.model.n_tokens-1, :]).view(1, 1, -1).to(kwargs['input_ids'].device) + logits = torch.tensor(self.model.scores[self.model.n_tokens - 1, :]).view(1, 1, -1).to(kwargs['input_ids'].device) else: self.model.reset() self.model.eval(seq) @@ -112,7 +112,7 @@ class LlamacppHF(PreTrainedModel): 'use_mlock': shared.args.mlock, 'low_vram': shared.args.low_vram, 'n_gpu_layers': shared.args.n_gpu_layers, - 'rope_freq_base': 10000 * shared.args.alpha_value ** (64/63.), + 'rope_freq_base': 10000 * shared.args.alpha_value ** (64 / 63.), 'rope_freq_scale': 1.0 / shared.args.compress_pos_emb, 'n_gqa': shared.args.n_gqa or None, 'rms_norm_eps': shared.args.rms_norm_eps or None, diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py index e5401378..f7f4cc9b 100644 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -65,7 +65,7 @@ class LlamaCppModel: 'use_mlock': shared.args.mlock, 'low_vram': shared.args.low_vram, 'n_gpu_layers': shared.args.n_gpu_layers, - 'rope_freq_base': 10000 * shared.args.alpha_value ** (64/63.), + 'rope_freq_base': 10000 * shared.args.alpha_value ** (64 / 63.), 'rope_freq_scale': 1.0 / shared.args.compress_pos_emb, 'n_gqa': shared.args.n_gqa or None, 'rms_norm_eps': shared.args.rms_norm_eps or None, diff --git a/modules/models.py b/modules/models.py index 4866893a..4f6a44c1 100644 --- a/modules/models.py +++ b/modules/models.py @@ -1,9 +1,9 @@ import gc +import hashlib import os import re import time from pathlib import Path -import hashlib import torch import transformers @@ -14,7 +14,7 @@ from transformers import ( AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, - BitsAndBytesConfig, + BitsAndBytesConfig ) import modules.shared as shared diff --git a/modules/models_settings.py b/modules/models_settings.py index 00a6b90f..06a41da4 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -26,9 +26,9 @@ def infer_loader(model_name): loader = 'AutoGPTQ' elif len(list(path_to_model.glob('*ggml*.bin'))) > 0: loader = 'llama.cpp' - elif re.match('.*ggml.*\.bin', model_name.lower()): + elif re.match(r'.*ggml.*\.bin', model_name.lower()): loader = 'llama.cpp' - elif re.match('.*rwkv.*\.pth', model_name.lower()): + elif re.match(r'.*rwkv.*\.pth', model_name.lower()): loader = 'RWKV' else: loader = 'Transformers' diff --git a/modules/prompts.py b/modules/prompts.py new file mode 100644 index 00000000..f68c83c4 --- /dev/null +++ b/modules/prompts.py @@ -0,0 +1,51 @@ +import re +from pathlib import Path + +import yaml + +from modules import utils +from 
modules.text_generation import get_encoded_length + + +def load_prompt(fname): + if fname in ['None', '']: + return '' + elif fname.startswith('Instruct-'): + fname = re.sub('^Instruct-', '', fname) + file_path = Path(f'characters/instruction-following/{fname}.yaml') + if not file_path.exists(): + return '' + + with open(file_path, 'r', encoding='utf-8') as f: + data = yaml.safe_load(f) + output = '' + if 'context' in data: + output += data['context'] + + replacements = { + '<|user|>': data['user'], + '<|bot|>': data['bot'], + '<|user-message|>': 'Input', + } + + output += utils.replace_all(data['turn_template'].split('<|bot-message|>')[0], replacements) + return output.rstrip(' ') + else: + file_path = Path(f'prompts/{fname}.txt') + if not file_path.exists(): + return '' + + with open(file_path, 'r', encoding='utf-8') as f: + text = f.read() + if text[-1] == '\n': + text = text[:-1] + + return text + + +def count_tokens(text): + try: + tokens = get_encoded_length(text) + return f'{tokens} tokens in the input.' + except: + return 'Couldn\'t count the number of tokens. Is a tokenizer loaded?' diff --git a/modules/text_generation.py b/modules/text_generation.py index df9d708b..6e95414b 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -31,8 +31,62 @@ def generate_reply(*args, **kwargs): shared.generation_lock.release() -def get_max_prompt_length(state): - return state['truncation_length'] - state['max_new_tokens'] +def _generate_reply(question, state, stopping_strings=None, is_chat=False): + + # Find the appropriate generation function + generate_func = apply_extensions('custom_generate_reply') + if generate_func is None: + if shared.model_name == 'None' or shared.model is None: + logger.error("No model is loaded! Select one in the Model tab.") + yield '' + return + + if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel']: + generate_func = generate_reply_custom + else: + generate_func = generate_reply_HF + + # Prepare the input + original_question = question + if not is_chat: + state = apply_extensions('state', state) + question = apply_extensions('input', question, state) + + # Find the stopping strings + all_stop_strings = [] + for st in (stopping_strings, ast.literal_eval(f"[{state['custom_stopping_strings']}]")): + if type(st) is list and len(st) > 0: + all_stop_strings += st + + if shared.args.verbose: + print(f'\n\n{question}\n--------------------\n') + + shared.stop_everything = False + clear_torch_cache() + seed = set_manual_seed(state['seed']) + last_update = -1 + reply = '' + is_stream = state['stream'] + if len(all_stop_strings) > 0 and not state['stream']: + state = copy.deepcopy(state) + state['stream'] = True + + # Generate + for reply in generate_func(question, original_question, seed, state, stopping_strings, is_chat=is_chat): + reply, stop_found = apply_stopping_strings(reply, all_stop_strings) + if is_stream: + cur_time = time.time() + if cur_time - last_update > 0.041666666666666664: # Limit streaming to 24 fps + last_update = cur_time + yield reply + + if stop_found: + break + + if not is_chat: + reply = apply_extensions('output', reply, state) + + yield reply def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None): @@ -61,6 +115,10 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt return input_ids.cuda() +def decode(output_ids, skip_special_tokens=True): + return shared.tokenizer.decode(output_ids, skip_special_tokens) + + def 
get_encoded_length(prompt): length_after_extensions = apply_extensions('tokenized_length', prompt) if length_after_extensions is not None: @@ -69,12 +127,36 @@ def get_encoded_length(prompt): return len(encode(prompt)[0]) -def decode(output_ids, skip_special_tokens=True): - return shared.tokenizer.decode(output_ids, skip_special_tokens) +def get_max_prompt_length(state): + return state['truncation_length'] - state['max_new_tokens'] + + +def generate_reply_wrapper(question, state, stopping_strings=None): + """ + Returns formatted outputs for the UI + """ + reply = question if not shared.is_seq2seq else '' + yield formatted_outputs(reply, shared.model_name) + + for reply in generate_reply(question, state, stopping_strings, is_chat=False): + if not shared.is_seq2seq: + reply = question + reply + + yield formatted_outputs(reply, shared.model_name) + + +def formatted_outputs(reply, model_name): + if any(s in model_name for s in ['gpt-4chan', 'gpt4chan']): + reply = fix_gpt4chan(reply) + return reply, generate_4chan_html(reply) + else: + return reply, generate_basic_html(reply) -# Removes empty replies from gpt4chan outputs def fix_gpt4chan(s): + """ + Removes empty replies from gpt4chan outputs + """ for i in range(10): s = re.sub("--- [0-9]*\n>>[0-9]*\n---", "---", s) s = re.sub("--- [0-9]*\n *\n---", "---", s) @@ -83,8 +165,10 @@ def fix_gpt4chan(s): return s -# Fix the LaTeX equations in galactica def fix_galactica(s): + """ + Fix the LaTeX equations in GALACTICA + """ s = s.replace(r'\[', r'$') s = s.replace(r'\]', r'$') s = s.replace(r'\(', r'$') @@ -109,14 +193,6 @@ def get_reply_from_output_ids(output_ids, input_ids, original_question, state, i return reply -def formatted_outputs(reply, model_name): - if any(s in model_name for s in ['gpt-4chan', 'gpt4chan']): - reply = fix_gpt4chan(reply) - return reply, generate_4chan_html(reply) - else: - return reply, generate_basic_html(reply) - - def set_manual_seed(seed): seed = int(seed) if seed == -1: @@ -133,17 +209,6 @@ def stop_everything_event(): shared.stop_everything = True -def generate_reply_wrapper(question, state, stopping_strings=None): - reply = question if not shared.is_seq2seq else '' - yield formatted_outputs(reply, shared.model_name) - - for reply in generate_reply(question, state, stopping_strings, is_chat=False): - if not shared.is_seq2seq: - reply = question + reply - - yield formatted_outputs(reply, shared.model_name) - - def apply_stopping_strings(reply, all_stop_strings): stop_found = False for string in all_stop_strings: @@ -169,61 +234,6 @@ def apply_stopping_strings(reply, all_stop_strings): return reply, stop_found -def _generate_reply(question, state, stopping_strings=None, is_chat=False): - generate_func = apply_extensions('custom_generate_reply') - if generate_func is None: - if shared.model_name == 'None' or shared.model is None: - logger.error("No model is loaded! 
Select one in the Model tab.") - yield '' - return - - if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel']: - generate_func = generate_reply_custom - else: - generate_func = generate_reply_HF - - # Preparing the input - original_question = question - if not is_chat: - state = apply_extensions('state', state) - question = apply_extensions('input', question, state) - - # Finding the stopping strings - all_stop_strings = [] - for st in (stopping_strings, ast.literal_eval(f"[{state['custom_stopping_strings']}]")): - if type(st) is list and len(st) > 0: - all_stop_strings += st - - if shared.args.verbose: - print(f'\n\n{question}\n--------------------\n') - - shared.stop_everything = False - clear_torch_cache() - seed = set_manual_seed(state['seed']) - last_update = -1 - reply = '' - is_stream = state['stream'] - if len(all_stop_strings) > 0 and not state['stream']: - state = copy.deepcopy(state) - state['stream'] = True - - for reply in generate_func(question, original_question, seed, state, stopping_strings, is_chat=is_chat): - reply, stop_found = apply_stopping_strings(reply, all_stop_strings) - if is_stream: - cur_time = time.time() - if cur_time - last_update > 0.041666666666666664: # Limit streaming to 24 fps - last_update = cur_time - yield reply - - if stop_found: - break - - if not is_chat: - reply = apply_extensions('output', reply, state) - - yield reply - - def generate_reply_HF(question, original_question, seed, state, stopping_strings=None, is_chat=False): generate_params = {} for k in ['max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'repetition_penalty_range', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'tfs', 'top_a', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'guidance_scale']: @@ -316,6 +326,9 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings def generate_reply_custom(question, original_question, seed, state, stopping_strings=None, is_chat=False): + """ + For models that do not use the transformers library for sampling + """ seed = set_manual_seed(state['seed']) t0 = time.time() diff --git a/modules/training.py b/modules/training.py index ef833679..7558cd5d 100644 --- a/modules/training.py +++ b/modules/training.py @@ -17,8 +17,6 @@ from pathlib import Path import gradio as gr import torch import transformers -from modules.models import load_model, unload_model - from datasets import Dataset, load_dataset from peft import ( LoraConfig, @@ -34,6 +32,7 @@ from modules.evaluate import ( save_past_evaluations ) from modules.logging_colors import logger +from modules.models import load_model, unload_model from modules.utils import natural_keys # This mapping is from a very recent commit, not yet released. @@ -65,100 +64,101 @@ WANT_INTERRUPT = False PARAMETERS = ["lora_name", "always_override", "save_steps", "micro_batch_size", "batch_size", "epochs", "learning_rate", "lr_scheduler_type", "lora_rank", "lora_alpha", "lora_dropout", "cutoff_len", "dataset", "eval_dataset", "format", "eval_steps", "raw_text_file", "overlap_len", "newline_favor_len", "higher_rank_limit", "warmup_steps", "optimizer", "hard_cut_string", "train_only_after", "stop_at_loss", "add_eos_token", "min_chars", "report_to"] -def create_train_interface(): - with gr.Tab('Train LoRA', elem_id='lora-train-tab'): - gr.Markdown("Confused? 
[[Click here for a guide]](https://github.com/oobabooga/text-generation-webui/blob/main/docs/Training-LoRAs.md)") - - with gr.Row(): - lora_name = gr.Textbox(label='Name', info='The name of your new LoRA file') - always_override = gr.Checkbox(label='Override Existing Files', value=False, info='If the name given is the same as an existing file, checking this will replace that file. Leaving unchecked will load that file and continue from it (must use the same rank value as the original had).') - save_steps = gr.Number(label='Save every n steps', value=0, info='If above 0, a checkpoint of the LoRA will be saved every time this many steps pass.') - - with gr.Row(): - copy_from = gr.Dropdown(label='Copy parameters from', value='None', choices=utils.get_available_loras()) - ui.create_refresh_button(copy_from, lambda: None, lambda: {'choices': utils.get_available_loras()}, 'refresh-button') - - with gr.Row(): - # TODO: Implement multi-device support. - micro_batch_size = gr.Slider(label='Micro Batch Size', value=4, minimum=1, maximum=128, step=1, info='Per-device batch size (NOTE: multiple devices not yet implemented). Increasing this will increase VRAM usage.') - batch_size = gr.Slider(label='Batch Size', value=128, minimum=0, maximum=1024, step=4, info='Global batch size. The two batch sizes together determine gradient accumulation (gradientAccum = batch / microBatch). Higher gradient accum values lead to better quality training.') - - with gr.Row(): - epochs = gr.Number(label='Epochs', value=3, info='Number of times every entry in the dataset should be fed into training. So 1 means feed each item in once, 5 means feed it in five times, etc.') - learning_rate = gr.Textbox(label='Learning Rate', value='3e-4', info='Learning rate, in scientific notation. 3e-4 is a good starting base point. 1e-2 is extremely high, 1e-6 is extremely low.') - lr_scheduler_type = gr.Dropdown(label='LR Scheduler', value='linear', choices=['linear', 'constant', 'constant_with_warmup', 'cosine', 'cosine_with_restarts', 'polynomial', 'inverse_sqrt'], info='Learning rate scheduler - defines how the learning rate changes over time. "Constant" means never change, "linear" means to go in a straight line from the learning rate down to 0, cosine follows a curve, etc.') - - # TODO: What is the actual maximum rank? Likely distinct per model. This might be better to somehow be on a log scale. - lora_rank = gr.Slider(label='LoRA Rank', value=32, minimum=0, maximum=1024, step=4, info='LoRA Rank, or dimension count. Higher values produce a larger file with better control over the model\'s content. Smaller values produce a smaller file with less overall control. Small values like 4 or 8 are great for stylistic guidance, higher values like 128 or 256 are good for teaching content upgrades, extremely high values (1024+) are difficult to train but may improve fine-detail learning for large datasets. Higher ranks also require higher VRAM.') - lora_alpha = gr.Slider(label='LoRA Alpha', value=64, minimum=0, maximum=2048, step=4, info='LoRA Alpha. This divided by the rank becomes the scaling of the LoRA. Higher means stronger. A good standard value is twice your Rank.') - - cutoff_len = gr.Slider(label='Cutoff Length', minimum=0, maximum=2048, value=256, step=32, info='Cutoff length for text input. Essentially, how long of a line of text to feed in at a time. 
Higher values require drastically more VRAM.') - - with gr.Tab(label='Formatted Dataset'): - with gr.Row(): - dataset = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'json'), value='None', label='Dataset', info='The dataset file to use for training.') - ui.create_refresh_button(dataset, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'json')}, 'refresh-button') - eval_dataset = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'json'), value='None', label='Evaluation Dataset', info='The (optional) dataset file used to evaluate the model after training.') - ui.create_refresh_button(eval_dataset, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'json')}, 'refresh-button') - format = gr.Dropdown(choices=utils.get_datasets('training/formats', 'json'), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.') - ui.create_refresh_button(format, lambda: None, lambda: {'choices': utils.get_datasets('training/formats', 'json')}, 'refresh-button') - - eval_steps = gr.Number(label='Evaluate every n steps', value=100, info='If an evaluation dataset is given, test it every time this many steps pass.') - - with gr.Tab(label="Raw text file"): - with gr.Row(): - raw_text_file = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'txt'), value='None', label='Text file', info='The raw text file to use for training.') - ui.create_refresh_button(raw_text_file, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'txt')}, 'refresh-button') - hard_cut_string = gr.Textbox(label='Hard Cut String', value='\\n\\n\\n', info='String that indicates a hard cut between text parts. Helps prevent unwanted overlap.') - min_chars = gr.Number(label='Ignore small blocks', value=0, info='Ignore Hard Cut blocks that have less or equal characters than this number') +def create_ui(): + with gr.Tab("Training", elem_id="training-tab"): + tmp = gr.State('') + with gr.Tab('Train LoRA', elem_id='lora-train-tab'): + gr.Markdown("Confused? [[Click here for a guide]](https://github.com/oobabooga/text-generation-webui/blob/main/docs/Training-LoRAs.md)") with gr.Row(): - overlap_len = gr.Slider(label='Overlap Length', minimum=0, maximum=512, value=128, step=16, info='Overlap length - ie how many tokens from the prior chunk of text to include into the next chunk. (The chunks themselves will be of a size determined by Cutoff Length below). Setting overlap to exactly half the cutoff length may be ideal.') - newline_favor_len = gr.Slider(label='Prefer Newline Cut Length', minimum=0, maximum=512, value=128, step=16, info='Length (in characters, not tokens) of the maximum distance to shift an overlap cut by to ensure chunks cut at newlines. If too low, cuts may occur in the middle of lines.') - - with gr.Accordion(label='Advanced Options', open=False): - lora_dropout = gr.Slider(label='LoRA Dropout', minimum=0.0, maximum=1.0, step=0.025, value=0.05, info='Percentage probability for dropout of LoRA layers. This can help reduce overfitting. Most users should leave at default.') - warmup_steps = gr.Number(label='Warmup Steps', value=100, info='For this many steps at the start, the learning rate will be lower than normal. 
This helps the trainer prepare the model and precompute statistics to improve the quality of training after the start.') - optimizer = gr.Dropdown(label='Optimizer', value='adamw_torch', choices=['adamw_hf', 'adamw_torch', 'adamw_torch_fused', 'adamw_torch_xla', 'adamw_apex_fused', 'adafactor', 'adamw_bnb_8bit', 'adamw_anyprecision', 'sgd', 'adagrad'], info='Different optimizer implementation options, for advanced users. Effects of different options are not well documented yet.') - train_only_after = gr.Textbox(label='Train Only After', value='', info='Only consider text *after* this string in any given chunk for training. For Alpaca datasets, use "### Response:" to only train the response and ignore the input.') - stop_at_loss = gr.Slider(label='Stop at loss', minimum=0.0, maximum=3.0, step=0.1, value=0.00, info='The process will automatically stop once the desired loss value is reached. (reasonable numbers are 1.5-1.8)') - add_eos_token = gr.Checkbox(label='Add EOS token', value=False, info="Adds EOS token for each dataset item. In case of raw text, the EOS will be added at the Hard Cut") + lora_name = gr.Textbox(label='Name', info='The name of your new LoRA file') + always_override = gr.Checkbox(label='Override Existing Files', value=False, info='If the name given is the same as an existing file, checking this will replace that file. Leaving unchecked will load that file and continue from it (must use the same rank value as the original had).') + save_steps = gr.Number(label='Save every n steps', value=0, info='If above 0, a checkpoint of the LoRA will be saved every time this many steps pass.') with gr.Row(): - higher_rank_limit = gr.Checkbox(label='Enable higher ranks', value=False, info='If checked, changes Rank/Alpha slider above to go much higher. This will not work without a datacenter-class GPU.') + copy_from = gr.Dropdown(label='Copy parameters from', value='None', choices=utils.get_available_loras()) + ui.create_refresh_button(copy_from, lambda: None, lambda: {'choices': utils.get_available_loras()}, 'refresh-button') + with gr.Row(): - report_to = gr.Radio(label="Save detailed logs with", value="None", choices=["None", "wandb", "tensorboard"], interactive=True) + # TODO: Implement multi-device support. + micro_batch_size = gr.Slider(label='Micro Batch Size', value=4, minimum=1, maximum=128, step=1, info='Per-device batch size (NOTE: multiple devices not yet implemented). Increasing this will increase VRAM usage.') + batch_size = gr.Slider(label='Batch Size', value=128, minimum=0, maximum=1024, step=4, info='Global batch size. The two batch sizes together determine gradient accumulation (gradientAccum = batch / microBatch). Higher gradient accum values lead to better quality training.') - with gr.Row(): - start_button = gr.Button("Start LoRA Training") - stop_button = gr.Button("Interrupt") + with gr.Row(): + epochs = gr.Number(label='Epochs', value=3, info='Number of times every entry in the dataset should be fed into training. So 1 means feed each item in once, 5 means feed it in five times, etc.') + learning_rate = gr.Textbox(label='Learning Rate', value='3e-4', info='Learning rate, in scientific notation. 3e-4 is a good starting base point. 1e-2 is extremely high, 1e-6 is extremely low.') + lr_scheduler_type = gr.Dropdown(label='LR Scheduler', value='linear', choices=['linear', 'constant', 'constant_with_warmup', 'cosine', 'cosine_with_restarts', 'polynomial', 'inverse_sqrt'], info='Learning rate scheduler - defines how the learning rate changes over time. 
"Constant" means never change, "linear" means to go in a straight line from the learning rate down to 0, cosine follows a curve, etc.') - output = gr.Markdown(value="Ready") + # TODO: What is the actual maximum rank? Likely distinct per model. This might be better to somehow be on a log scale. + lora_rank = gr.Slider(label='LoRA Rank', value=32, minimum=0, maximum=1024, step=4, info='LoRA Rank, or dimension count. Higher values produce a larger file with better control over the model\'s content. Smaller values produce a smaller file with less overall control. Small values like 4 or 8 are great for stylistic guidance, higher values like 128 or 256 are good for teaching content upgrades, extremely high values (1024+) are difficult to train but may improve fine-detail learning for large datasets. Higher ranks also require higher VRAM.') + lora_alpha = gr.Slider(label='LoRA Alpha', value=64, minimum=0, maximum=2048, step=4, info='LoRA Alpha. This divided by the rank becomes the scaling of the LoRA. Higher means stronger. A good standard value is twice your Rank.') - with gr.Tab('Perplexity evaluation', elem_id='evaluate-tab'): - with gr.Row(): - with gr.Column(): - models = gr.Dropdown(utils.get_available_models(), label='Models', multiselect=True) - evaluate_text_file = gr.Dropdown(choices=['wikitext', 'ptb', 'ptb_new'] + utils.get_datasets('training/datasets', 'txt')[1:], value='wikitext', label='Input dataset', info='The raw text file on which the model will be evaluated. The first options are automatically downloaded: wikitext, ptb, and ptb_new. The next options are your local text files under training/datasets.') + cutoff_len = gr.Slider(label='Cutoff Length', minimum=0, maximum=2048, value=256, step=32, info='Cutoff length for text input. Essentially, how long of a line of text to feed in at a time. Higher values require drastically more VRAM.') + + with gr.Tab(label='Formatted Dataset'): with gr.Row(): - stride_length = gr.Slider(label='Stride', minimum=1, maximum=2048, value=512, step=1, info='Used to make the evaluation faster at the cost of accuracy. 1 = slowest but most accurate. 512 is a common value.') - max_length = gr.Slider(label='max_length', minimum=0, maximum=8096, value=0, step=1, info='The context for each evaluation. 
If set to 0, the maximum context length for the model will be used.') + dataset = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'json'), value='None', label='Dataset', info='The dataset file to use for training.') + ui.create_refresh_button(dataset, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'json')}, 'refresh-button') + eval_dataset = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'json'), value='None', label='Evaluation Dataset', info='The (optional) dataset file used to evaluate the model after training.') + ui.create_refresh_button(eval_dataset, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'json')}, 'refresh-button') + format = gr.Dropdown(choices=utils.get_datasets('training/formats', 'json'), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.') + ui.create_refresh_button(format, lambda: None, lambda: {'choices': utils.get_datasets('training/formats', 'json')}, 'refresh-button') + + eval_steps = gr.Number(label='Evaluate every n steps', value=100, info='If an evaluation dataset is given, test it every time this many steps pass.') + + with gr.Tab(label="Raw text file"): + with gr.Row(): + raw_text_file = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'txt'), value='None', label='Text file', info='The raw text file to use for training.') + ui.create_refresh_button(raw_text_file, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'txt')}, 'refresh-button') + hard_cut_string = gr.Textbox(label='Hard Cut String', value='\\n\\n\\n', info='String that indicates a hard cut between text parts. Helps prevent unwanted overlap.') + min_chars = gr.Number(label='Ignore small blocks', value=0, info='Ignore Hard Cut blocks that have less or equal characters than this number') with gr.Row(): - start_current_evaluation = gr.Button("Evaluate loaded model") - start_evaluation = gr.Button("Evaluate selected models") - stop_evaluation = gr.Button("Interrupt") + overlap_len = gr.Slider(label='Overlap Length', minimum=0, maximum=512, value=128, step=16, info='Overlap length - ie how many tokens from the prior chunk of text to include into the next chunk. (The chunks themselves will be of a size determined by Cutoff Length below). Setting overlap to exactly half the cutoff length may be ideal.') + newline_favor_len = gr.Slider(label='Prefer Newline Cut Length', minimum=0, maximum=512, value=128, step=16, info='Length (in characters, not tokens) of the maximum distance to shift an overlap cut by to ensure chunks cut at newlines. If too low, cuts may occur in the middle of lines.') - with gr.Column(): - evaluation_log = gr.Markdown(value='') + with gr.Accordion(label='Advanced Options', open=False): + lora_dropout = gr.Slider(label='LoRA Dropout', minimum=0.0, maximum=1.0, step=0.025, value=0.05, info='Percentage probability for dropout of LoRA layers. This can help reduce overfitting. Most users should leave at default.') + warmup_steps = gr.Number(label='Warmup Steps', value=100, info='For this many steps at the start, the learning rate will be lower than normal. 
This helps the trainer prepare the model and precompute statistics to improve the quality of training after the start.') + optimizer = gr.Dropdown(label='Optimizer', value='adamw_torch', choices=['adamw_hf', 'adamw_torch', 'adamw_torch_fused', 'adamw_torch_xla', 'adamw_apex_fused', 'adafactor', 'adamw_bnb_8bit', 'adamw_anyprecision', 'sgd', 'adagrad'], info='Different optimizer implementation options, for advanced users. Effects of different options are not well documented yet.') + train_only_after = gr.Textbox(label='Train Only After', value='', info='Only consider text *after* this string in any given chunk for training. For Alpaca datasets, use "### Response:" to only train the response and ignore the input.') + stop_at_loss = gr.Slider(label='Stop at loss', minimum=0.0, maximum=3.0, step=0.1, value=0.00, info='The process will automatically stop once the desired loss value is reached. (reasonable numbers are 1.5-1.8)') + add_eos_token = gr.Checkbox(label='Add EOS token', value=False, info="Adds EOS token for each dataset item. In case of raw text, the EOS will be added at the Hard Cut") - evaluation_table = gr.Dataframe(value=generate_markdown_table(), interactive=True) - with gr.Row(): - save_comments = gr.Button('Save comments', elem_classes="small-button") - refresh_table = gr.Button('Refresh the table', elem_classes="small-button") + with gr.Row(): + higher_rank_limit = gr.Checkbox(label='Enable higher ranks', value=False, info='If checked, changes Rank/Alpha slider above to go much higher. This will not work without a datacenter-class GPU.') + with gr.Row(): + report_to = gr.Radio(label="Save detailed logs with", value="None", choices=["None", "wandb", "tensorboard"], interactive=True) + + with gr.Row(): + start_button = gr.Button("Start LoRA Training") + stop_button = gr.Button("Interrupt") + + output = gr.Markdown(value="Ready") + + with gr.Tab('Perplexity evaluation', elem_id='evaluate-tab'): + with gr.Row(): + with gr.Column(): + models = gr.Dropdown(utils.get_available_models(), label='Models', multiselect=True) + evaluate_text_file = gr.Dropdown(choices=['wikitext', 'ptb', 'ptb_new'] + utils.get_datasets('training/datasets', 'txt')[1:], value='wikitext', label='Input dataset', info='The raw text file on which the model will be evaluated. The first options are automatically downloaded: wikitext, ptb, and ptb_new. The next options are your local text files under training/datasets.') + with gr.Row(): + stride_length = gr.Slider(label='Stride', minimum=1, maximum=2048, value=512, step=1, info='Used to make the evaluation faster at the cost of accuracy. 1 = slowest but most accurate. 512 is a common value.') + max_length = gr.Slider(label='max_length', minimum=0, maximum=8096, value=0, step=1, info='The context for each evaluation. 
If set to 0, the maximum context length for the model will be used.') + + with gr.Row(): + start_current_evaluation = gr.Button("Evaluate loaded model") + start_evaluation = gr.Button("Evaluate selected models") + stop_evaluation = gr.Button("Interrupt") + + with gr.Column(): + evaluation_log = gr.Markdown(value='') + + evaluation_table = gr.Dataframe(value=generate_markdown_table(), interactive=True) + with gr.Row(): + save_comments = gr.Button('Save comments', elem_classes="small-button") + refresh_table = gr.Button('Refresh the table', elem_classes="small-button") # Training events - all_params = [lora_name, always_override, save_steps, micro_batch_size, batch_size, epochs, learning_rate, lr_scheduler_type, lora_rank, lora_alpha, lora_dropout, cutoff_len, dataset, eval_dataset, format, eval_steps, raw_text_file, overlap_len, newline_favor_len, higher_rank_limit, warmup_steps, optimizer, hard_cut_string, train_only_after, stop_at_loss, add_eos_token, min_chars, report_to] copy_from.change(do_copy_params, [copy_from] + all_params, all_params) @@ -172,7 +172,6 @@ def create_train_interface(): ev = start_evaluation.click(calculate_perplexity, [models, evaluate_text_file, stride_length, max_length], evaluation_log, show_progress=False) start_evaluation.click(generate_markdown_table, None, evaluation_table, show_progress=False) - tmp = gr.State('') start_current_evaluation.click(lambda: ['current model'], None, tmp) ev_cur = start_current_evaluation.click(calculate_perplexity, [tmp, evaluate_text_file, stride_length, max_length], evaluation_log, show_progress=False) start_current_evaluation.click(generate_markdown_table, None, evaluation_table, show_progress=False) diff --git a/modules/ui.py b/modules/ui.py index 8a7f9f47..b58b7dd6 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -1,4 +1,3 @@ -import json from pathlib import Path import gradio as gr @@ -11,9 +10,9 @@ with open(Path(__file__).resolve().parent / '../css/main.css', 'r') as f: css = f.read() with open(Path(__file__).resolve().parent / '../css/chat.css', 'r') as f: chat_css = f.read() -with open(Path(__file__).resolve().parent / '../css/main.js', 'r') as f: +with open(Path(__file__).resolve().parent / '../js/main.js', 'r') as f: main_js = f.read() -with open(Path(__file__).resolve().parent / '../css/save_files.js', 'r') as f: +with open(Path(__file__).resolve().parent / '../js/save_files.js', 'r') as f: save_files_js = f.read() refresh_symbol = '🔄' @@ -30,6 +29,11 @@ theme = gr.themes.Default( background_fill_secondary='#eaeaea' ) +if Path("notification.mp3").exists(): + audio_notification_js = "document.querySelector('#audio_notification audio')?.play();" +else: + audio_notification_js = "" + def list_model_elements(): elements = [ diff --git a/modules/ui_chat.py b/modules/ui_chat.py new file mode 100644 index 00000000..8a0c103b --- /dev/null +++ b/modules/ui_chat.py @@ -0,0 +1,262 @@ +import json +from functools import partial +from pathlib import Path + +import gradio as gr +from PIL import Image + +from modules import chat, shared, ui, utils +from modules.html_generator import chat_html_wrapper +from modules.text_generation import stop_everything_event +from modules.utils import gradio + + +def create_ui(): + + shared.gradio.update({ + 'interface_state': gr.State({k: None for k in shared.input_elements}), + 'Chat input': gr.State(), + 'dummy': gr.State(), + 'history': gr.State({'internal': [], 'visible': []}), + }) + + with gr.Tab('Text generation', elem_id='main'): + shared.gradio['display'] = 
gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, shared.settings['name1'], shared.settings['name2'], 'chat', 'cai-chat')) + shared.gradio['textbox'] = gr.Textbox(label='Input') + with gr.Row(): + shared.gradio['Stop'] = gr.Button('Stop', elem_id='stop') + shared.gradio['Generate'] = gr.Button('Generate', elem_id='Generate', variant='primary') + shared.gradio['Continue'] = gr.Button('Continue') + + with gr.Row(): + shared.gradio['Impersonate'] = gr.Button('Impersonate') + shared.gradio['Regenerate'] = gr.Button('Regenerate') + shared.gradio['Remove last'] = gr.Button('Remove last', elem_classes=['button_nowrap']) + + with gr.Row(): + shared.gradio['Copy last reply'] = gr.Button('Copy last reply') + shared.gradio['Replace last reply'] = gr.Button('Replace last reply') + shared.gradio['Send dummy message'] = gr.Button('Send dummy message') + shared.gradio['Send dummy reply'] = gr.Button('Send dummy reply') + + with gr.Row(): + shared.gradio['Clear history'] = gr.Button('Clear history') + shared.gradio['Clear history-confirm'] = gr.Button('Confirm', variant='stop', visible=False) + shared.gradio['Clear history-cancel'] = gr.Button('Cancel', visible=False) + + with gr.Row(): + shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with']) + + with gr.Row(): + shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'instruct', 'chat-instruct'] else 'chat', label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template selected under "Chat settings" must match the current model.') + shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct') + + with gr.Tab('Chat settings', elem_id='chat-settings'): + with gr.Tab("Character"): + with gr.Row(): + with gr.Column(scale=8): + with gr.Row(): + shared.gradio['character_menu'] = gr.Dropdown(value='None', choices=utils.get_available_characters(), label='Character', elem_id='character-menu', info='Used in chat and chat-instruct modes.', elem_classes='slim-dropdown') + ui.create_refresh_button(shared.gradio['character_menu'], lambda: None, lambda: {'choices': utils.get_available_characters()}, 'refresh-button') + shared.gradio['save_character'] = gr.Button('💾', elem_classes='refresh-button') + shared.gradio['delete_character'] = gr.Button('🗑️', elem_classes='refresh-button') + + shared.gradio['name1'] = gr.Textbox(value=shared.settings['name1'], lines=1, label='Your name') + shared.gradio['name2'] = gr.Textbox(value=shared.settings['name2'], lines=1, label='Character\'s name') + shared.gradio['context'] = gr.Textbox(value=shared.settings['context'], lines=4, label='Context', elem_classes=['add_scrollbar']) + shared.gradio['greeting'] = gr.Textbox(value=shared.settings['greeting'], lines=4, label='Greeting', elem_classes=['add_scrollbar']) + + with gr.Column(scale=1): + shared.gradio['character_picture'] = gr.Image(label='Character picture', type='pil') + shared.gradio['your_picture'] = gr.Image(label='Your picture', type='pil', value=Image.open(Path('cache/pfp_me.png')) if Path('cache/pfp_me.png').exists() else None) + + with gr.Tab("Instruction template"): + with gr.Row(): + with gr.Row(): + shared.gradio['instruction_template'] = gr.Dropdown(choices=utils.get_available_instruction_templates(), 
label='Instruction template', value='None', info='Change this according to the model/LoRA that you are using. Used in instruct and chat-instruct modes.', elem_classes='slim-dropdown') + ui.create_refresh_button(shared.gradio['instruction_template'], lambda: None, lambda: {'choices': utils.get_available_instruction_templates()}, 'refresh-button') + shared.gradio['save_template'] = gr.Button('💾', elem_classes='refresh-button') + shared.gradio['delete_template'] = gr.Button('🗑️ ', elem_classes='refresh-button') + + shared.gradio['name1_instruct'] = gr.Textbox(value='', lines=2, label='User string') + shared.gradio['name2_instruct'] = gr.Textbox(value='', lines=1, label='Bot string') + shared.gradio['context_instruct'] = gr.Textbox(value='', lines=4, label='Context') + shared.gradio['turn_template'] = gr.Textbox(value=shared.settings['turn_template'], lines=1, label='Turn template', info='Used to precisely define the placement of spaces and new line characters in instruction prompts.') + with gr.Row(): + shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=4, label='Command for chat-instruct mode', info='<|character|> gets replaced by the bot name, and <|prompt|> gets replaced by the regular chat prompt.', elem_classes=['add_scrollbar']) + + with gr.Tab('Chat history'): + with gr.Row(): + with gr.Column(): + shared.gradio['save_chat_history'] = gr.Button(value='Save history') + + with gr.Column(): + shared.gradio['load_chat_history'] = gr.File(type='binary', file_types=['.json', '.txt'], label="Upload History JSON") + + with gr.Tab('Upload character'): + with gr.Tab('YAML or JSON'): + with gr.Row(): + shared.gradio['upload_json'] = gr.File(type='binary', file_types=['.json', '.yaml'], label='JSON or YAML File') + shared.gradio['upload_img_bot'] = gr.Image(type='pil', label='Profile Picture (optional)') + + shared.gradio['Submit character'] = gr.Button(value='Submit', interactive=False) + + with gr.Tab('TavernAI PNG'): + with gr.Row(): + with gr.Column(): + shared.gradio['upload_img_tavern'] = gr.Image(type='pil', label='TavernAI PNG File', elem_id="upload_img_tavern") + shared.gradio['tavern_json'] = gr.State() + with gr.Column(): + shared.gradio['tavern_name'] = gr.Textbox(value='', lines=1, label='Name', interactive=False) + shared.gradio['tavern_desc'] = gr.Textbox(value='', lines=4, max_lines=4, label='Description', interactive=False) + + shared.gradio['Submit tavern character'] = gr.Button(value='Submit', interactive=False) + + +def create_event_handlers(): + gen_events = [] + + shared.input_params = gradio('Chat input', 'start_with', 'interface_state') + clear_arr = gradio('Clear history-confirm', 'Clear history', 'Clear history-cancel') + shared.reload_inputs = gradio('history', 'name1', 'name2', 'mode', 'chat_style') + + gen_events.append(shared.gradio['Generate'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( + chat.generate_chat_reply_wrapper, shared.input_params, gradio('display', 'history'), show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( + lambda: None, None, None, _js=f"() => {{{ui.audio_notification_js}}}") + ) + + gen_events.append(shared.gradio['textbox'].submit( + ui.gather_interface_values, 
gradio(shared.input_elements), gradio('interface_state')).then( + lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( + chat.generate_chat_reply_wrapper, shared.input_params, gradio('display', 'history'), show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( + lambda: None, None, None, _js=f"() => {{{ui.audio_notification_js}}}") + ) + + gen_events.append(shared.gradio['Regenerate'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + partial(chat.generate_chat_reply_wrapper, regenerate=True), shared.input_params, gradio('display', 'history'), show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( + lambda: None, None, None, _js=f"() => {{{ui.audio_notification_js}}}") + ) + + gen_events.append(shared.gradio['Continue'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + partial(chat.generate_chat_reply_wrapper, _continue=True), shared.input_params, gradio('display', 'history'), show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( + lambda: None, None, None, _js=f"() => {{{ui.audio_notification_js}}}") + ) + + gen_events.append(shared.gradio['Impersonate'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda x: x, gradio('textbox'), gradio('Chat input'), show_progress=False).then( + chat.impersonate_wrapper, shared.input_params, gradio('textbox'), show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: None, None, None, _js=f"() => {{{ui.audio_notification_js}}}") + ) + + shared.gradio['Replace last reply'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.replace_last_reply, gradio('textbox', 'interface_state'), gradio('history')).then( + lambda: '', None, gradio('textbox'), show_progress=False).then( + chat.redraw_html, shared.reload_inputs, gradio('display')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) + + shared.gradio['Send dummy message'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.send_dummy_message, gradio('textbox', 'interface_state'), gradio('history')).then( + lambda: '', None, gradio('textbox'), show_progress=False).then( + chat.redraw_html, shared.reload_inputs, gradio('display')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) + + shared.gradio['Send dummy reply'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.send_dummy_reply, gradio('textbox', 'interface_state'), gradio('history')).then( + lambda: '', None, gradio('textbox'), show_progress=False).then( + chat.redraw_html, shared.reload_inputs, gradio('display')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) + + shared.gradio['Clear history'].click(lambda: [gr.update(visible=True), 
gr.update(visible=False), gr.update(visible=True)], None, clear_arr) + shared.gradio['Clear history-cancel'].click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, clear_arr) + shared.gradio['Clear history-confirm'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, clear_arr).then( + chat.clear_chat_log, gradio('interface_state'), gradio('history')).then( + chat.redraw_html, shared.reload_inputs, gradio('display')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) + + shared.gradio['Remove last'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.remove_last_message, gradio('history'), gradio('textbox', 'history'), show_progress=False).then( + chat.redraw_html, shared.reload_inputs, gradio('display')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) + + shared.gradio['character_menu'].change( + partial(chat.load_character, instruct=False), gradio('character_menu', 'name1', 'name2'), gradio('name1', 'name2', 'character_picture', 'greeting', 'context', 'dummy')).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.load_persistent_history, gradio('interface_state'), gradio('history')).then( + chat.redraw_html, shared.reload_inputs, gradio('display')) + + shared.gradio['Stop'].click( + stop_everything_event, None, None, queue=False, cancels=gen_events if shared.args.no_stream else None).then( + chat.redraw_html, shared.reload_inputs, gradio('display')) + + shared.gradio['mode'].change( + lambda x: gr.update(visible=x != 'instruct'), gradio('mode'), gradio('chat_style'), show_progress=False).then( + chat.redraw_html, shared.reload_inputs, gradio('display')) + + shared.gradio['chat_style'].change(chat.redraw_html, shared.reload_inputs, gradio('display')) + shared.gradio['instruction_template'].change( + partial(chat.load_character, instruct=True), gradio('instruction_template', 'name1_instruct', 'name2_instruct'), gradio('name1_instruct', 'name2_instruct', 'dummy', 'dummy', 'context_instruct', 'turn_template')) + + shared.gradio['load_chat_history'].upload( + chat.load_history, gradio('load_chat_history', 'history'), gradio('history')).then( + chat.redraw_html, shared.reload_inputs, gradio('display')).then( + None, None, None, _js='() => {alert("The history has been loaded.")}') + + shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, gradio('history'), gradio('textbox'), show_progress=False) + + # Save/delete a character + shared.gradio['save_character'].click( + lambda x: x, gradio('name2'), gradio('save_character_filename')).then( + lambda: gr.update(visible=True), None, gradio('character_saver')) + + shared.gradio['delete_character'].click(lambda: gr.update(visible=True), None, gradio('character_deleter')) + + shared.gradio['save_template'].click( + lambda: 'My Template.yaml', None, gradio('save_filename')).then( + lambda: 'characters/instruction-following/', None, gradio('save_root')).then( + chat.generate_instruction_template_yaml, gradio('name1_instruct', 'name2_instruct', 'context_instruct', 'turn_template'), gradio('save_contents')).then( + lambda: gr.update(visible=True), None, gradio('file_saver')) + + shared.gradio['delete_template'].click( + lambda x: f'{x}.yaml', 
gradio('instruction_template'), gradio('delete_filename')).then( + lambda: 'characters/instruction-following/', None, gradio('delete_root')).then( + lambda: gr.update(visible=True), None, gradio('file_deleter')) + + shared.gradio['save_chat_history'].click( + lambda x: json.dumps(x, indent=4), gradio('history'), gradio('temporary_text')).then( + None, gradio('temporary_text', 'character_menu', 'mode'), None, _js=f"(hist, char, mode) => {{{ui.save_files_js}; saveHistory(hist, char, mode)}}") + + shared.gradio['Submit character'].click( + chat.upload_character, gradio('upload_json', 'upload_img_bot'), gradio('character_menu')).then( + None, None, None, _js='() => {alert("The character has been loaded.")}') + + shared.gradio['Submit tavern character'].click( + chat.upload_tavern_character, gradio('upload_img_tavern', 'tavern_json'), gradio('character_menu')).then( + None, None, None, _js='() => {alert("The character has been loaded.")}') + + shared.gradio['upload_json'].upload(lambda: gr.update(interactive=True), None, gradio('Submit character')) + shared.gradio['upload_json'].clear(lambda: gr.update(interactive=False), None, gradio('Submit character')) + shared.gradio['upload_img_tavern'].upload(chat.check_tavern_character, gradio('upload_img_tavern'), gradio('tavern_name', 'tavern_desc', 'tavern_json', 'Submit tavern character'), show_progress=False) + shared.gradio['upload_img_tavern'].clear(lambda: (None, None, None, gr.update(interactive=False)), None, gradio('tavern_name', 'tavern_desc', 'tavern_json', 'Submit tavern character'), show_progress=False) + shared.gradio['your_picture'].change( + chat.upload_your_profile_picture, gradio('your_picture'), None).then( + partial(chat.redraw_html, reset_cache=True), shared.reload_inputs, gradio('display')) diff --git a/modules/ui_default.py b/modules/ui_default.py new file mode 100644 index 00000000..f0ab74ef --- /dev/null +++ b/modules/ui_default.py @@ -0,0 +1,94 @@ +import gradio as gr + +from modules import shared, ui, utils +from modules.prompts import count_tokens, load_prompt +from modules.text_generation import ( + generate_reply_wrapper, + stop_everything_event +) +from modules.utils import gradio + + +def create_ui(): + default_text = load_prompt(shared.settings['prompt']) + + shared.gradio['interface_state'] = gr.State({k: None for k in shared.input_elements}) + shared.gradio['last_input'] = gr.State('') + + with gr.Tab("Text generation", elem_id="main"): + with gr.Row(): + with gr.Column(): + shared.gradio['textbox'] = gr.Textbox(value=default_text, elem_classes=['textbox_default', 'add_scrollbar'], lines=27, label='Input') + shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) + with gr.Row(): + shared.gradio['Generate'] = gr.Button('Generate', variant='primary') + shared.gradio['Stop'] = gr.Button('Stop', elem_id='stop') + shared.gradio['Continue'] = gr.Button('Continue') + shared.gradio['count_tokens'] = gr.Button('Count tokens') + + with gr.Row(): + shared.gradio['prompt_menu'] = gr.Dropdown(choices=utils.get_available_prompts(), value='None', label='Prompt', elem_classes='slim-dropdown') + ui.create_refresh_button(shared.gradio['prompt_menu'], lambda: None, lambda: {'choices': utils.get_available_prompts()}, 'refresh-button') + shared.gradio['save_prompt'] = gr.Button('💾', elem_classes='refresh-button') + shared.gradio['delete_prompt'] = gr.Button('🗑️', 
elem_classes='refresh-button') + + shared.gradio['status'] = gr.Markdown('') + + with gr.Column(): + with gr.Tab('Raw'): + shared.gradio['output_textbox'] = gr.Textbox(lines=27, label='Output', elem_classes=['textbox_default_output', 'add_scrollbar']) + + with gr.Tab('Markdown'): + shared.gradio['markdown_render'] = gr.Button('Render') + shared.gradio['markdown'] = gr.Markdown() + + with gr.Tab('HTML'): + shared.gradio['html'] = gr.HTML() + + +def create_event_handlers(): + gen_events = [] + shared.input_params = gradio('textbox', 'interface_state') + output_params = gradio('output_textbox', 'html') + + gen_events.append(shared.gradio['Generate'].click( + lambda x: x, gradio('textbox'), gradio('last_input')).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + generate_reply_wrapper, shared.input_params, output_params, show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: None, None, None, _js=f"() => {{{ui.audio_notification_js}}}") + # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") + ) + + gen_events.append(shared.gradio['textbox'].submit( + lambda x: x, gradio('textbox'), gradio('last_input')).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + generate_reply_wrapper, shared.input_params, output_params, show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: None, None, None, _js=f"() => {{{ui.audio_notification_js}}}") + # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") + ) + + shared.gradio['markdown_render'].click(lambda x: x, gradio('output_textbox'), gradio('markdown'), queue=False) + gen_events.append(shared.gradio['Continue'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + generate_reply_wrapper, [shared.gradio['output_textbox']] + shared.input_params[1:], output_params, show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: None, None, None, _js=f"() => {{{ui.audio_notification_js}}}") + # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[1]; element.scrollTop = element.scrollHeight}") + ) + + shared.gradio['Stop'].click(stop_everything_event, None, None, queue=False, cancels=gen_events if shared.args.no_stream else None) + shared.gradio['prompt_menu'].change(load_prompt, gradio('prompt_menu'), gradio('textbox'), show_progress=False) + shared.gradio['save_prompt'].click( + lambda x: x, gradio('textbox'), gradio('save_contents')).then( + lambda: 'prompts/', None, gradio('save_root')).then( + lambda: utils.current_time() + '.txt', None, gradio('save_filename')).then( + lambda: gr.update(visible=True), None, gradio('file_saver')) + + shared.gradio['delete_prompt'].click( + lambda: 'prompts/', None, gradio('delete_root')).then( + lambda x: x + '.txt', gradio('prompt_menu'), gradio('delete_filename')).then( + lambda: gr.update(visible=True), None, gradio('file_deleter')) + + shared.gradio['count_tokens'].click(count_tokens, gradio('textbox'), gradio('status'), show_progress=False) diff --git a/modules/ui_file_saving.py b/modules/ui_file_saving.py new file mode 100644 index 00000000..952d66c9 
--- /dev/null +++ b/modules/ui_file_saving.py @@ -0,0 +1,108 @@ +import json + +import gradio as gr + +from modules import chat, presets, shared, ui, utils +from modules.utils import gradio + + +def create_ui(): + + # Text file saver + with gr.Box(visible=False, elem_classes='file-saver') as shared.gradio['file_saver']: + shared.gradio['save_filename'] = gr.Textbox(lines=1, label='File name') + shared.gradio['save_root'] = gr.Textbox(lines=1, label='File folder', info='For reference. Unchangeable.', interactive=False) + shared.gradio['save_contents'] = gr.Textbox(lines=10, label='File contents') + with gr.Row(): + shared.gradio['save_confirm'] = gr.Button('Save', elem_classes="small-button") + shared.gradio['save_cancel'] = gr.Button('Cancel', elem_classes="small-button") + + # Text file deleter + with gr.Box(visible=False, elem_classes='file-saver') as shared.gradio['file_deleter']: + shared.gradio['delete_filename'] = gr.Textbox(lines=1, label='File name') + shared.gradio['delete_root'] = gr.Textbox(lines=1, label='File folder', info='For reference. Unchangeable.', interactive=False) + with gr.Row(): + shared.gradio['delete_confirm'] = gr.Button('Delete', elem_classes="small-button", variant='stop') + shared.gradio['delete_cancel'] = gr.Button('Cancel', elem_classes="small-button") + + # Character saver/deleter + if shared.is_chat(): + with gr.Box(visible=False, elem_classes='file-saver') as shared.gradio['character_saver']: + shared.gradio['save_character_filename'] = gr.Textbox(lines=1, label='File name', info='The character will be saved to your characters/ folder with this base filename.') + with gr.Row(): + shared.gradio['save_character_confirm'] = gr.Button('Save', elem_classes="small-button") + shared.gradio['save_character_cancel'] = gr.Button('Cancel', elem_classes="small-button") + + with gr.Box(visible=False, elem_classes='file-saver') as shared.gradio['character_deleter']: + gr.Markdown('Confirm the character deletion?') + with gr.Row(): + shared.gradio['delete_character_confirm'] = gr.Button('Delete', elem_classes="small-button", variant='stop') + shared.gradio['delete_character_cancel'] = gr.Button('Cancel', elem_classes="small-button") + + +def create_event_handlers(): + shared.gradio['save_confirm'].click( + lambda x, y, z: utils.save_file(x + y, z), gradio('save_root', 'save_filename', 'save_contents'), None).then( + lambda: gr.update(visible=False), None, gradio('file_saver')) + + shared.gradio['delete_confirm'].click( + lambda x, y: utils.delete_file(x + y), gradio('delete_root', 'delete_filename'), None).then( + lambda: gr.update(visible=False), None, gradio('file_deleter')) + + shared.gradio['delete_cancel'].click(lambda: gr.update(visible=False), None, gradio('file_deleter')) + shared.gradio['save_cancel'].click(lambda: gr.update(visible=False), None, gradio('file_saver')) + if shared.is_chat(): + shared.gradio['save_character_confirm'].click( + chat.save_character, gradio('name2', 'greeting', 'context', 'character_picture', 'save_character_filename'), None).then( + lambda: gr.update(visible=False), None, gradio('character_saver')) + + shared.gradio['delete_character_confirm'].click( + chat.delete_character, gradio('character_menu'), None).then( + lambda: gr.update(visible=False), None, gradio('character_deleter')).then( + lambda: gr.update(choices=utils.get_available_characters()), None, gradio('character_menu')) + + shared.gradio['save_character_cancel'].click(lambda: gr.update(visible=False), None, gradio('character_saver')) + 
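+        # As with 'save_character_cancel' just above, the cancel handlers here only hide
+        # their confirmation boxes; no character file is written or removed until the
+        # matching 'confirm' button is clicked.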
shared.gradio['delete_character_cancel'].click(lambda: gr.update(visible=False), None, gradio('character_deleter')) + + shared.gradio['save_preset'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + presets.generate_preset_yaml, gradio('interface_state'), gradio('save_contents')).then( + lambda: 'presets/', None, gradio('save_root')).then( + lambda: 'My Preset.yaml', None, gradio('save_filename')).then( + lambda: gr.update(visible=True), None, gradio('file_saver')) + + shared.gradio['delete_preset'].click( + lambda x: f'{x}.yaml', gradio('preset_menu'), gradio('delete_filename')).then( + lambda: 'presets/', None, gradio('delete_root')).then( + lambda: gr.update(visible=True), None, gradio('file_deleter')) + + if not shared.args.multi_user: + shared.gradio['save_session'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda x: json.dumps(x, indent=4), gradio('interface_state'), gradio('temporary_text')).then( + None, gradio('temporary_text'), None, _js=f"(contents) => {{{ui.save_files_js}; saveSession(contents, \"{shared.get_mode()}\")}}") + + if shared.is_chat(): + shared.gradio['load_session'].upload( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + load_session, gradio('load_session', 'interface_state'), gradio('interface_state')).then( + ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then( + chat.redraw_html, shared.reload_inputs, gradio('display')).then( + None, None, None, _js='() => {alert("The session has been loaded.")}') + else: + shared.gradio['load_session'].upload( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + load_session, gradio('load_session', 'interface_state'), gradio('interface_state')).then( + ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then( + None, None, None, _js='() => {alert("The session has been loaded.")}') + + +def load_session(file, state): + decoded_file = file if type(file) == str else file.decode('utf-8') + data = json.loads(decoded_file) + + if shared.is_chat() and 'character_menu' in data and state.get('character_menu') != data.get('character_menu'): + shared.session_is_loading = True + + state.update(data) + return state diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py new file mode 100644 index 00000000..c9d772b8 --- /dev/null +++ b/modules/ui_model_menu.py @@ -0,0 +1,229 @@ +import importlib +import math +import re +import traceback +from functools import partial + +import gradio as gr +import psutil +import torch + +from modules import loaders, shared, ui, utils +from modules.logging_colors import logger +from modules.LoRA import add_lora_to_model +from modules.models import load_model, unload_model +from modules.models_settings import ( + apply_model_settings_to_state, + save_model_settings, + update_model_parameters +) +from modules.utils import gradio + + +def create_ui(): + # Finding the default values for the GPU and CPU memories + total_mem = [] + for i in range(torch.cuda.device_count()): + total_mem.append(math.floor(torch.cuda.get_device_properties(i).total_memory / (1024 * 1024))) + + default_gpu_mem = [] + if shared.args.gpu_memory is not None and len(shared.args.gpu_memory) > 0: + for i in shared.args.gpu_memory: + if 'mib' in i.lower(): + 
default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i))) + else: + default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)) * 1000) + while len(default_gpu_mem) < len(total_mem): + default_gpu_mem.append(0) + + total_cpu_mem = math.floor(psutil.virtual_memory().total / (1024 * 1024)) + if shared.args.cpu_memory is not None: + default_cpu_mem = re.sub('[a-zA-Z ]', '', shared.args.cpu_memory) + else: + default_cpu_mem = 0 + + with gr.Tab("Model", elem_id="model-tab"): + with gr.Row(): + with gr.Column(): + with gr.Row(): + with gr.Column(): + with gr.Row(): + shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=shared.model_name, label='Model', elem_classes='slim-dropdown') + ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button') + shared.gradio['load_model'] = gr.Button("Load", visible=not shared.settings['autoload_model'], elem_classes='refresh-button') + shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button') + shared.gradio['reload_model'] = gr.Button("Reload", elem_classes='refresh-button') + shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button') + + with gr.Column(): + with gr.Row(): + shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown') + ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button') + shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button') + + with gr.Row(): + with gr.Column(): + shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=["Transformers", "ExLlama_HF", "ExLlama", "AutoGPTQ", "GPTQ-for-LLaMa", "llama.cpp", "llamacpp_HF"], value=None) + with gr.Box(): + with gr.Row(): + with gr.Column(): + for i in range(len(total_mem)): + shared.gradio[f'gpu_memory_{i}'] = gr.Slider(label=f"gpu-memory in MiB for device :{i}", maximum=total_mem[i], value=default_gpu_mem[i]) + + shared.gradio['cpu_memory'] = gr.Slider(label="cpu-memory in MiB", maximum=total_cpu_mem, value=default_cpu_mem) + shared.gradio['transformers_info'] = gr.Markdown('load-in-4bit params:') + shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype) + shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type) + + shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=128, value=shared.args.n_gpu_layers) + shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=16384, step=256, label="n_ctx", value=shared.args.n_ctx) + shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads) + shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, value=shared.args.n_batch) + shared.gradio['n_gqa'] = gr.Slider(minimum=0, maximum=16, step=1, label="n_gqa", value=shared.args.n_gqa, info='grouped-query attention. 
Must be 8 for llama-2 70b.') + shared.gradio['rms_norm_eps'] = gr.Slider(minimum=0, maximum=1e-5, step=1e-6, label="rms_norm_eps", value=shared.args.rms_norm_eps, info='5e-6 is a good value for llama-2 models.') + + shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=str(shared.args.wbits) if shared.args.wbits > 0 else "None") + shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=str(shared.args.groupsize) if shared.args.groupsize > 0 else "None") + shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None", "llama", "opt", "gptj"], value=shared.args.model_type or "None") + shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0) + shared.gradio['autogptq_info'] = gr.Markdown('* ExLlama_HF is recommended over AutoGPTQ for models derived from LLaMA.') + shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') + shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=16384, step=256, info='Maximum sequence length.', value=shared.args.max_seq_len) + shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.', value=shared.args.compress_pos_emb) + shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=32, step=1, info='Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value) + + with gr.Column(): + shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton) + shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Disable if running low on VRAM.') + shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM.
Disable if running low on VRAM.') + shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.') + shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.') + shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu) + shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) + shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) + shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices) + shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) + shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) + shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant) + shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap) + shared.gradio['low_vram'] = gr.Checkbox(label="low-vram", value=shared.args.low_vram) + shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) + shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed) + shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.') + shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa is currently 2x faster than AutoGPTQ on some systems. It is installed by default with the one-click installers. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).') + shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).') + shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.') + shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF is a wrapper that lets you use llama.cpp like a Transformers model, which means it can use the Transformers samplers. To use it, make sure to first download oobabooga/llama-tokenizer under "Download custom model or LoRA".') + + with gr.Column(): + with gr.Row(): + shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.') + + shared.gradio['custom_model_menu'] = gr.Textbox(label="Download custom model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. 
To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main") + shared.gradio['download_model_button'] = gr.Button("Download") + + with gr.Row(): + shared.gradio['model_status'] = gr.Markdown('No model is loaded' if shared.model_name == 'None' else 'Ready') + + +def create_event_handlers(): + shared.gradio['loader'].change(loaders.make_loader_params_visible, gradio('loader'), gradio(loaders.get_all_params())) + + # In this event handler, the interface state is read and updated + # with the model defaults (if any), and then the model is loaded + # unless "autoload_model" is unchecked + shared.gradio['model_menu'].change( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + apply_model_settings_to_state, gradio('model_menu', 'interface_state'), gradio('interface_state')).then( + ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then( + update_model_parameters, gradio('interface_state'), None).then( + load_model_wrapper, gradio('model_menu', 'loader', 'autoload_model'), gradio('model_status'), show_progress=False) + + shared.gradio['load_model'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + update_model_parameters, gradio('interface_state'), None).then( + partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False) + + shared.gradio['unload_model'].click( + unload_model, None, None).then( + lambda: "Model unloaded", None, gradio('model_status')) + + shared.gradio['reload_model'].click( + unload_model, None, None).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + update_model_parameters, gradio('interface_state'), None).then( + partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False) + + shared.gradio['save_model_settings'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False) + + shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False) + shared.gradio['download_model_button'].click(download_model_wrapper, gradio('custom_model_menu'), gradio('model_status'), show_progress=True) + shared.gradio['autoload_model'].change(lambda x: gr.update(visible=not x), gradio('autoload_model'), gradio('load_model')) + + +def load_model_wrapper(selected_model, loader, autoload=False): + if not autoload: + yield f"The settings for {selected_model} have been updated.\nClick on \"Load\" to load it." + return + + if selected_model == 'None': + yield "No model selected" + else: + try: + yield f"Loading {selected_model}..." + shared.model_name = selected_model + unload_model() + if selected_model != '': + shared.model, shared.tokenizer = load_model(shared.model_name, loader) + + if shared.model is not None: + yield f"Successfully loaded {selected_model}" + else: + yield f"Failed to load {selected_model}." 
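+            # Any exception raised while loading falls through to the handler below, which
+            # logs it and streams the full traceback to the 'model_status' Markdown element
+            # instead of crashing the UI.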
+ except: + exc = traceback.format_exc() + logger.error('Failed to load the model.') + print(exc) + yield exc.replace('\n', '\n\n') + + +def load_lora_wrapper(selected_loras): + yield ("Applying the following LoRAs to {}:\n\n{}".format(shared.model_name, '\n'.join(selected_loras))) + add_lora_to_model(selected_loras) + yield ("Successfuly applied the LoRAs") + + +def download_model_wrapper(repo_id, progress=gr.Progress()): + try: + downloader_module = importlib.import_module("download-model") + downloader = downloader_module.ModelDownloader() + repo_id_parts = repo_id.split(":") + model = repo_id_parts[0] if len(repo_id_parts) > 0 else repo_id + branch = repo_id_parts[1] if len(repo_id_parts) > 1 else "main" + check = False + + progress(0.0) + yield ("Cleaning up the model/branch names") + model, branch = downloader.sanitize_model_and_branch_names(model, branch) + + yield ("Getting the download links from Hugging Face") + links, sha256, is_lora = downloader.get_download_links_from_huggingface(model, branch, text_only=False) + + yield ("Getting the output folder") + base_folder = shared.args.lora_dir if is_lora else shared.args.model_dir + output_folder = downloader.get_output_folder(model, branch, is_lora, base_folder=base_folder) + + if check: + progress(0.5) + yield ("Checking previously downloaded files") + downloader.check_model_files(model, branch, links, sha256, output_folder) + progress(1.0) + else: + yield (f"Downloading files to {output_folder}") + downloader.download_model_files(model, branch, links, sha256, output_folder, progress_bar=progress, threads=1) + yield ("Done!") + except: + progress(1.0) + yield traceback.format_exc().replace('\n', '\n\n') diff --git a/modules/ui_notebook.py b/modules/ui_notebook.py new file mode 100644 index 00000000..998a7cf7 --- /dev/null +++ b/modules/ui_notebook.py @@ -0,0 +1,98 @@ +import gradio as gr + +from modules import shared, ui, utils +from modules.prompts import count_tokens, load_prompt +from modules.text_generation import ( + generate_reply_wrapper, + stop_everything_event +) +from modules.utils import gradio + + +def create_ui(): + default_text = load_prompt(shared.settings['prompt']) + + shared.gradio['interface_state'] = gr.State({k: None for k in shared.input_elements}) + shared.gradio['last_input'] = gr.State('') + + with gr.Tab("Text generation", elem_id="main"): + with gr.Row(): + with gr.Column(scale=4): + with gr.Tab('Raw'): + shared.gradio['textbox'] = gr.Textbox(value=default_text, elem_classes=['textbox', 'add_scrollbar'], lines=27) + + with gr.Tab('Markdown'): + shared.gradio['markdown_render'] = gr.Button('Render') + shared.gradio['markdown'] = gr.Markdown() + + with gr.Tab('HTML'): + shared.gradio['html'] = gr.HTML() + + with gr.Row(): + shared.gradio['Generate'] = gr.Button('Generate', variant='primary', elem_classes="small-button") + shared.gradio['Stop'] = gr.Button('Stop', elem_classes="small-button", elem_id='stop') + shared.gradio['Undo'] = gr.Button('Undo', elem_classes="small-button") + shared.gradio['Regenerate'] = gr.Button('Regenerate', elem_classes="small-button") + + with gr.Column(scale=1): + gr.HTML('
    ') + shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) + with gr.Row(): + shared.gradio['prompt_menu'] = gr.Dropdown(choices=utils.get_available_prompts(), value='None', label='Prompt', elem_classes='slim-dropdown') + ui.create_refresh_button(shared.gradio['prompt_menu'], lambda: None, lambda: {'choices': utils.get_available_prompts()}, ['refresh-button', 'refresh-button-small']) + shared.gradio['save_prompt'] = gr.Button('💾', elem_classes=['refresh-button', 'refresh-button-small']) + shared.gradio['delete_prompt'] = gr.Button('🗑️', elem_classes=['refresh-button', 'refresh-button-small']) + + shared.gradio['count_tokens'] = gr.Button('Count tokens') + shared.gradio['status'] = gr.Markdown('') + + +def create_event_handlers(): + gen_events = [] + + shared.input_params = gradio('textbox', 'interface_state') + output_params = gradio('textbox', 'html') + + gen_events.append(shared.gradio['Generate'].click( + lambda x: x, gradio('textbox'), gradio('last_input')).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + generate_reply_wrapper, shared.input_params, output_params, show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: None, None, None, _js=f"() => {{{ui.audio_notification_js}}}") + # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") + ) + + gen_events.append(shared.gradio['textbox'].submit( + lambda x: x, gradio('textbox'), gradio('last_input')).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + generate_reply_wrapper, shared.input_params, output_params, show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: None, None, None, _js=f"() => {{{ui.audio_notification_js}}}") + # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") + ) + + shared.gradio['Undo'].click(lambda x: x, gradio('last_input'), gradio('textbox'), show_progress=False) + shared.gradio['markdown_render'].click(lambda x: x, gradio('textbox'), gradio('markdown'), queue=False) + gen_events.append(shared.gradio['Regenerate'].click( + lambda x: x, gradio('last_input'), gradio('textbox'), show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + generate_reply_wrapper, shared.input_params, output_params, show_progress=False).then( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: None, None, None, _js=f"() => {{{ui.audio_notification_js}}}") + # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") + ) + + shared.gradio['Stop'].click(stop_everything_event, None, None, queue=False, cancels=gen_events if shared.args.no_stream else None) + shared.gradio['prompt_menu'].change(load_prompt, gradio('prompt_menu'), gradio('textbox'), show_progress=False) + shared.gradio['save_prompt'].click( + lambda x: x, gradio('textbox'), gradio('save_contents')).then( + lambda: 'prompts/', None, gradio('save_root')).then( + lambda: utils.current_time() + '.txt', None, 
gradio('save_filename')).then( + lambda: gr.update(visible=True), None, gradio('file_saver')) + + shared.gradio['delete_prompt'].click( + lambda: 'prompts/', None, gradio('delete_root')).then( + lambda x: x + '.txt', gradio('prompt_menu'), gradio('delete_filename')).then( + lambda: gr.update(visible=True), None, gradio('file_deleter')) + + shared.gradio['count_tokens'].click(count_tokens, gradio('textbox'), gradio('status'), show_progress=False) diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py new file mode 100644 index 00000000..75bce9b1 --- /dev/null +++ b/modules/ui_parameters.py @@ -0,0 +1,143 @@ +import gradio as gr + +from modules import loaders, presets, shared, ui, utils +from modules.utils import gradio + + +def create_ui(default_preset): + generate_params = presets.load_preset(default_preset) + with gr.Tab("Parameters", elem_id="parameters"): + with gr.Row(): + with gr.Column(): + with gr.Row(): + shared.gradio['preset_menu'] = gr.Dropdown(choices=utils.get_available_presets(), value=default_preset, label='Generation parameters preset', elem_classes='slim-dropdown') + ui.create_refresh_button(shared.gradio['preset_menu'], lambda: None, lambda: {'choices': utils.get_available_presets()}, 'refresh-button') + shared.gradio['save_preset'] = gr.Button('💾', elem_classes='refresh-button') + shared.gradio['delete_preset'] = gr.Button('🗑️', elem_classes='refresh-button') + + with gr.Column(): + shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All", "Transformers", "ExLlama_HF", "ExLlama", "AutoGPTQ", "GPTQ-for-LLaMa", "llama.cpp", "llamacpp_HF"], value="All", elem_classes='slim-dropdown') + + with gr.Row(): + with gr.Column(): + with gr.Box(): + with gr.Row(): + with gr.Column(): + shared.gradio['temperature'] = gr.Slider(0.01, 1.99, value=generate_params['temperature'], step=0.01, label='temperature') + shared.gradio['top_p'] = gr.Slider(0.0, 1.0, value=generate_params['top_p'], step=0.01, label='top_p') + shared.gradio['top_k'] = gr.Slider(0, 200, value=generate_params['top_k'], step=1, label='top_k') + shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, value=generate_params['typical_p'], step=0.01, label='typical_p') + shared.gradio['epsilon_cutoff'] = gr.Slider(0, 9, value=generate_params['epsilon_cutoff'], step=0.01, label='epsilon_cutoff') + shared.gradio['eta_cutoff'] = gr.Slider(0, 20, value=generate_params['eta_cutoff'], step=0.01, label='eta_cutoff') + shared.gradio['tfs'] = gr.Slider(0.0, 1.0, value=generate_params['tfs'], step=0.01, label='tfs') + shared.gradio['top_a'] = gr.Slider(0.0, 1.0, value=generate_params['top_a'], step=0.01, label='top_a') + + with gr.Column(): + shared.gradio['repetition_penalty'] = gr.Slider(1.0, 1.5, value=generate_params['repetition_penalty'], step=0.01, label='repetition_penalty') + shared.gradio['repetition_penalty_range'] = gr.Slider(0, 4096, step=64, value=generate_params['repetition_penalty_range'], label='repetition_penalty_range') + shared.gradio['encoder_repetition_penalty'] = gr.Slider(0.8, 1.5, value=generate_params['encoder_repetition_penalty'], step=0.01, label='encoder_repetition_penalty') + shared.gradio['no_repeat_ngram_size'] = gr.Slider(0, 20, step=1, value=generate_params['no_repeat_ngram_size'], label='no_repeat_ngram_size') + shared.gradio['min_length'] = gr.Slider(0, 2000, step=1, value=generate_params['min_length'], label='min_length') + shared.gradio['seed'] = gr.Number(value=shared.settings['seed'], label='Seed (-1 for random)') + shared.gradio['do_sample'] = 
gr.Checkbox(value=generate_params['do_sample'], label='do_sample') + + with gr.Accordion("Learn more", open=False): + gr.Markdown(""" + + For a technical description of the parameters, the [transformers documentation](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig) is a good reference. + + The best presets, according to the [Preset Arena](https://github.com/oobabooga/oobabooga.github.io/blob/main/arena/results.md) experiment, are: + + * Instruction following: + 1) Divine Intellect + 2) Big O + 3) simple-1 + 4) Space Alien + 5) StarChat + 6) Titanic + 7) tfs-with-top-a + 8) Asterism + 9) Contrastive Search + + * Chat: + 1) Midnight Enigma + 2) Yara + 3) Shortwave + + ### Temperature + Primary factor to control randomness of outputs. 0 = deterministic (only the most likely token is used). Higher value = more randomness. + ### top_p + If not set to 1, select tokens with probabilities adding up to less than this number. Higher value = higher range of possible random results. + ### top_k + Similar to top_p, but select instead only the top_k most likely tokens. Higher value = higher range of possible random results. + ### typical_p + If not set to 1, select only tokens that are at least this much more likely to appear than random tokens, given the prior text. + ### epsilon_cutoff + In units of 1e-4; a reasonable value is 3. This sets a probability floor below which tokens are excluded from being sampled. Should be used with top_p, top_k, and eta_cutoff set to 0. + ### eta_cutoff + In units of 1e-4; a reasonable value is 3. Should be used with top_p, top_k, and epsilon_cutoff set to 0. + ### repetition_penalty + Exponential penalty factor for repeating prior tokens. 1 means no penalty, higher value = less repetition, lower value = more repetition. + ### repetition_penalty_range + The number of most recent tokens to consider for repetition penalty. 0 makes all tokens be used. + ### encoder_repetition_penalty + Also known as the "Hallucinations filter". Used to penalize tokens that are *not* in the prior text. Higher value = more likely to stay in context, lower value = more likely to diverge. + ### no_repeat_ngram_size + If not set to 0, specifies the length of token sets that are completely blocked from repeating at all. Higher values = blocks larger phrases, lower values = blocks words or letters from repeating. Only 0 or high values are a good idea in most cases. + ### min_length + Minimum generation length in tokens. + ### penalty_alpha + Contrastive Search is enabled by setting this to greater than zero and unchecking "do_sample". It should be used with a low value of top_k, for instance, top_k = 4. + + """, elem_classes="markdown") + + with gr.Column(): + create_chat_settings_menus() + with gr.Box(): + with gr.Row(): + with gr.Column(): + shared.gradio['guidance_scale'] = gr.Slider(-0.5, 2.5, step=0.05, value=generate_params['guidance_scale'], label='guidance_scale', info='For CFG. 
1.5 is a good value.') + shared.gradio['negative_prompt'] = gr.Textbox(value=shared.settings['negative_prompt'], label='Negative prompt') + shared.gradio['mirostat_mode'] = gr.Slider(0, 2, step=1, value=generate_params['mirostat_mode'], label='mirostat_mode', info='mode=1 is for llama.cpp only.') + shared.gradio['mirostat_tau'] = gr.Slider(0, 10, step=0.01, value=generate_params['mirostat_tau'], label='mirostat_tau') + shared.gradio['mirostat_eta'] = gr.Slider(0, 1, step=0.01, value=generate_params['mirostat_eta'], label='mirostat_eta') + + with gr.Column(): + shared.gradio['penalty_alpha'] = gr.Slider(0, 5, value=generate_params['penalty_alpha'], label='penalty_alpha', info='For Contrastive Search. do_sample must be unchecked.') + + shared.gradio['num_beams'] = gr.Slider(1, 20, step=1, value=generate_params['num_beams'], label='num_beams', info='For Beam Search, along with length_penalty and early_stopping.') + shared.gradio['length_penalty'] = gr.Slider(-5, 5, value=generate_params['length_penalty'], label='length_penalty') + shared.gradio['early_stopping'] = gr.Checkbox(value=generate_params['early_stopping'], label='early_stopping') + + with gr.Box(): + with gr.Row(): + with gr.Column(): + shared.gradio['truncation_length'] = gr.Slider(value=shared.settings['truncation_length'], minimum=shared.settings['truncation_length_min'], maximum=shared.settings['truncation_length_max'], step=256, label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. Most models require this to be at most 2048.') + shared.gradio['custom_stopping_strings'] = gr.Textbox(lines=1, value=shared.settings["custom_stopping_strings"] or None, label='Custom stopping strings', info='In addition to the defaults. Written between "" and separated by commas. 
For instance: "\\nYour Assistant:", "\\nThe assistant:"') + with gr.Column(): + shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.') + shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.') + shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.') + + shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.') + shared.gradio['stream'] = gr.Checkbox(value=not shared.args.no_stream, label='Activate text streaming') + + +def create_event_handlers(): + shared.gradio['filter_by_loader'].change(loaders.blacklist_samplers, gradio('filter_by_loader'), gradio(loaders.list_all_samplers()), show_progress=False) + shared.gradio['preset_menu'].change(presets.load_preset_for_ui, gradio('preset_menu', 'interface_state'), gradio('interface_state') + gradio(presets.presets_params())) + + +def create_chat_settings_menus(): + if not shared.is_chat(): + return + + with gr.Box(): + gr.Markdown("Chat parameters") + with gr.Row(): + with gr.Column(): + shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) + shared.gradio['chat_generation_attempts'] = gr.Slider(minimum=shared.settings['chat_generation_attempts_min'], maximum=shared.settings['chat_generation_attempts_max'], value=shared.settings['chat_generation_attempts'], step=1, label='Generation attempts (for longer replies)', info='New generations will be called until either this number is reached or no new content is generated between two iterations.') + + with gr.Column(): + shared.gradio['stop_at_newline'] = gr.Checkbox(value=shared.settings['stop_at_newline'], label='Stop generating at new line character') diff --git a/modules/ui_session.py b/modules/ui_session.py new file mode 100644 index 00000000..7a1a32b0 --- /dev/null +++ b/modules/ui_session.py @@ -0,0 +1,71 @@ +import gradio as gr + +from modules import shared, ui, utils +from modules.github import clone_or_pull_repository +from modules.utils import gradio + + +def create_ui(): + with gr.Tab("Session", elem_id="session-tab"): + modes = ["default", "notebook", "chat"] + current_mode = "default" + for mode in modes[1:]: + if getattr(shared.args, mode): + current_mode = mode + break + + cmd_list = vars(shared.args) + bool_list = sorted([k for k in cmd_list if type(cmd_list[k]) is bool and k not in modes + ui.list_model_elements()]) + bool_active = [k for k in bool_list if vars(shared.args)[k]] + + with gr.Row(): + + with gr.Column(): + with gr.Row(): + shared.gradio['interface_modes_menu'] = gr.Dropdown(choices=modes, value=current_mode, label="Mode", elem_classes='slim-dropdown') + shared.gradio['reset_interface'] = gr.Button("Apply and restart", elem_classes="small-button", variant="primary") + shared.gradio['toggle_dark_mode'] = gr.Button('Toggle 💡', elem_classes="small-button") + + with gr.Row(): + with gr.Column(): + shared.gradio['extensions_menu'] = gr.CheckboxGroup(choices=utils.get_available_extensions(), 
value=shared.args.extensions, label="Available extensions", info='Note that some of these extensions may require manually installing Python requirements through the command: pip install -r extensions/extension_name/requirements.txt', elem_classes='checkboxgroup-table') + + with gr.Column(): + shared.gradio['bool_menu'] = gr.CheckboxGroup(choices=bool_list, value=bool_active, label="Boolean command-line flags", elem_classes='checkboxgroup-table') + + with gr.Column(): + if not shared.args.multi_user: + shared.gradio['save_session'] = gr.Button('Save session', elem_id="save_session") + shared.gradio['load_session'] = gr.File(type='binary', file_types=['.json'], label="Upload Session JSON") + + extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.') + extension_status = gr.Markdown() + + extension_name.submit( + clone_or_pull_repository, extension_name, extension_status, show_progress=False).then( + lambda: gr.update(choices=utils.get_available_extensions(), value=shared.args.extensions), None, gradio('extensions_menu')) + + # Reset interface event + shared.gradio['reset_interface'].click( + set_interface_arguments, gradio('interface_modes_menu', 'extensions_menu', 'bool_menu'), None).then( + lambda: None, None, None, _js='() => {document.body.innerHTML=\'

    Reloading...

    \'; setTimeout(function(){location.reload()},2500); return []}') + + shared.gradio['toggle_dark_mode'].click(lambda: None, None, None, _js='() => {document.getElementsByTagName("body")[0].classList.toggle("dark")}') + + +def set_interface_arguments(interface_mode, extensions, bool_active): + modes = ["default", "notebook", "chat", "cai_chat"] + cmd_list = vars(shared.args) + bool_list = [k for k in cmd_list if type(cmd_list[k]) is bool and k not in modes] + + shared.args.extensions = extensions + for k in modes[1:]: + setattr(shared.args, k, False) + if interface_mode != "default": + setattr(shared.args, interface_mode, True) + for k in bool_list: + setattr(shared.args, k, False) + for k in bool_active: + setattr(shared.args, k, True) + + shared.need_restart = True diff --git a/server.py b/server.py index adff9669..0be1f4c4 100644 --- a/server.py +++ b/server.py @@ -14,579 +14,62 @@ with RequestBlocker(): import matplotlib matplotlib.use('Agg') # This fixes LaTeX rendering on some systems -import importlib import json -import math import os -import re import sys import time -import traceback from functools import partial from pathlib import Path from threading import Lock -import psutil -import torch import yaml -from PIL import Image import modules.extensions as extensions_module -from modules import chat, loaders, presets, shared, training, ui, utils -from modules.extensions import apply_extensions -from modules.github import clone_or_pull_repository -from modules.html_generator import chat_html_wrapper -from modules.LoRA import add_lora_to_model -from modules.models import load_model, unload_model -from modules.models_settings import ( - apply_model_settings_to_state, - get_model_settings_from_yamls, - save_model_settings, - update_model_parameters +from modules import ( + chat, + shared, + training, + ui, + ui_chat, + ui_default, + ui_file_saving, + ui_model_menu, + ui_notebook, + ui_parameters, + ui_session, + utils, ) -from modules.text_generation import ( - generate_reply_wrapper, - get_encoded_length, - stop_everything_event +from modules.extensions import apply_extensions +from modules.LoRA import add_lora_to_model +from modules.models import load_model +from modules.models_settings import ( + get_model_settings_from_yamls, + update_model_parameters ) from modules.utils import gradio -def load_model_wrapper(selected_model, loader, autoload=False): - if not autoload: - yield f"The settings for {selected_model} have been updated.\nClick on \"Load\" to load it." - return - - if selected_model == 'None': - yield "No model selected" - else: - try: - yield f"Loading {selected_model}..." - shared.model_name = selected_model - unload_model() - if selected_model != '': - shared.model, shared.tokenizer = load_model(shared.model_name, loader) - - if shared.model is not None: - yield f"Successfully loaded {selected_model}" - else: - yield f"Failed to load {selected_model}." 
- except: - exc = traceback.format_exc() - logger.error('Failed to load the model.') - print(exc) - yield exc.replace('\n', '\n\n') - - -def load_lora_wrapper(selected_loras): - yield ("Applying the following LoRAs to {}:\n\n{}".format(shared.model_name, '\n'.join(selected_loras))) - add_lora_to_model(selected_loras) - yield ("Successfuly applied the LoRAs") - - -def load_prompt(fname): - if fname in ['None', '']: - return '' - elif fname.startswith('Instruct-'): - fname = re.sub('^Instruct-', '', fname) - file_path = Path(f'characters/instruction-following/{fname}.yaml') - if not file_path.exists(): - return '' - - with open(file_path, 'r', encoding='utf-8') as f: - data = yaml.safe_load(f) - output = '' - if 'context' in data: - output += data['context'] - - replacements = { - '<|user|>': data['user'], - '<|bot|>': data['bot'], - '<|user-message|>': 'Input', - } - - output += utils.replace_all(data['turn_template'].split('<|bot-message|>')[0], replacements) - return output.rstrip(' ') - else: - file_path = Path(f'prompts/{fname}.txt') - if not file_path.exists(): - return '' - - with open(file_path, 'r', encoding='utf-8') as f: - text = f.read() - if text[-1] == '\n': - text = text[:-1] - - return text - - -def count_tokens(text): - try: - tokens = get_encoded_length(text) - return f'{tokens} tokens in the input.' - except: - return 'Couldn\'t count the number of tokens. Is a tokenizer loaded?' - - -def download_model_wrapper(repo_id, progress=gr.Progress()): - try: - downloader_module = importlib.import_module("download-model") - downloader = downloader_module.ModelDownloader() - repo_id_parts = repo_id.split(":") - model = repo_id_parts[0] if len(repo_id_parts) > 0 else repo_id - branch = repo_id_parts[1] if len(repo_id_parts) > 1 else "main" - check = False - - progress(0.0) - yield ("Cleaning up the model/branch names") - model, branch = downloader.sanitize_model_and_branch_names(model, branch) - - yield ("Getting the download links from Hugging Face") - links, sha256, is_lora = downloader.get_download_links_from_huggingface(model, branch, text_only=False) - - yield ("Getting the output folder") - base_folder = shared.args.lora_dir if is_lora else shared.args.model_dir - output_folder = downloader.get_output_folder(model, branch, is_lora, base_folder=base_folder) - - if check: - progress(0.5) - yield ("Checking previously downloaded files") - downloader.check_model_files(model, branch, links, sha256, output_folder) - progress(1.0) - else: - yield (f"Downloading files to {output_folder}") - downloader.download_model_files(model, branch, links, sha256, output_folder, progress_bar=progress, threads=1) - yield ("Done!") - except: - progress(1.0) - yield traceback.format_exc().replace('\n', '\n\n') - - -def create_model_menus(): - # Finding the default values for the GPU and CPU memories - total_mem = [] - for i in range(torch.cuda.device_count()): - total_mem.append(math.floor(torch.cuda.get_device_properties(i).total_memory / (1024 * 1024))) - - default_gpu_mem = [] - if shared.args.gpu_memory is not None and len(shared.args.gpu_memory) > 0: - for i in shared.args.gpu_memory: - if 'mib' in i.lower(): - default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i))) - else: - default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)) * 1000) - while len(default_gpu_mem) < len(total_mem): - default_gpu_mem.append(0) - - total_cpu_mem = math.floor(psutil.virtual_memory().total / (1024 * 1024)) - if shared.args.cpu_memory is not None: - default_cpu_mem = re.sub('[a-zA-Z ]', '', shared.args.cpu_memory) 
- else: - default_cpu_mem = 0 - - with gr.Row(): - with gr.Column(): - with gr.Row(): - with gr.Column(): - with gr.Row(): - shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=shared.model_name, label='Model', elem_classes='slim-dropdown') - ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button') - load = gr.Button("Load", visible=not shared.settings['autoload_model'], elem_classes='refresh-button') - unload = gr.Button("Unload", elem_classes='refresh-button') - reload = gr.Button("Reload", elem_classes='refresh-button') - save_settings = gr.Button("Save settings", elem_classes='refresh-button') - - with gr.Column(): - with gr.Row(): - shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown') - ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button') - shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button') - - with gr.Row(): - with gr.Column(): - shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=["Transformers", "ExLlama_HF", "ExLlama", "AutoGPTQ", "GPTQ-for-LLaMa", "llama.cpp", "llamacpp_HF"], value=None) - with gr.Box(): - with gr.Row(): - with gr.Column(): - for i in range(len(total_mem)): - shared.gradio[f'gpu_memory_{i}'] = gr.Slider(label=f"gpu-memory in MiB for device :{i}", maximum=total_mem[i], value=default_gpu_mem[i]) - - shared.gradio['cpu_memory'] = gr.Slider(label="cpu-memory in MiB", maximum=total_cpu_mem, value=default_cpu_mem) - shared.gradio['transformers_info'] = gr.Markdown('load-in-4bit params:') - shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype) - shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type) - - shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=128, value=shared.args.n_gpu_layers) - shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=16384, step=256, label="n_ctx", value=shared.args.n_ctx) - shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads) - shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, value=shared.args.n_batch) - shared.gradio['n_gqa'] = gr.Slider(minimum=0, maximum=16, step=1, label="n_gqa", value=shared.args.n_gqa, info='grouped-query attention. 
Must be 8 for llama-2 70b.') - shared.gradio['rms_norm_eps'] = gr.Slider(minimum=0, maximum=1e-5, step=1e-6, label="rms_norm_eps", value=shared.args.n_gqa, info='5e-6 is a good value for llama-2 models.') - - shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=str(shared.args.wbits) if shared.args.wbits > 0 else "None") - shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=str(shared.args.groupsize) if shared.args.groupsize > 0 else "None") - shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None", "llama", "opt", "gptj"], value=shared.args.model_type or "None") - shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0) - shared.gradio['autogptq_info'] = gr.Markdown('* ExLlama_HF is recommended over AutoGPTQ for models derived from LLaMA.') - shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') - shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=16384, step=256, info='Maximum sequence length.', value=shared.args.max_seq_len) - shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.', value=shared.args.compress_pos_emb) - shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=32, step=1, info='Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value) - - with gr.Column(): - shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton) - shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Disable if running low on VRAM.') - shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. 
Disable if running low on VRAM.') - shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.') - shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.') - shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu) - shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) - shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) - shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices) - shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) - shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) - shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant) - shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap) - shared.gradio['low_vram'] = gr.Checkbox(label="low-vram", value=shared.args.low_vram) - shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) - shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed) - shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.') - shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa is currently 2x faster than AutoGPTQ on some systems. It is installed by default with the one-click installers. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).') - shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).') - shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.') - shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF is a wrapper that lets you use llama.cpp like a Transformers model, which means it can use the Transformers samplers. To use it, make sure to first download oobabooga/llama-tokenizer under "Download custom model or LoRA".') - - with gr.Column(): - with gr.Row(): - shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.') - - shared.gradio['custom_model_menu'] = gr.Textbox(label="Download custom model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. 
To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main") - shared.gradio['download_model_button'] = gr.Button("Download") - - with gr.Row(): - shared.gradio['model_status'] = gr.Markdown('No model is loaded' if shared.model_name == 'None' else 'Ready') - - shared.gradio['loader'].change(loaders.make_loader_params_visible, gradio('loader'), gradio(loaders.get_all_params())) - - # In this event handler, the interface state is read and updated - # with the model defaults (if any), and then the model is loaded - # unless "autoload_model" is unchecked - shared.gradio['model_menu'].change( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - apply_model_settings_to_state, gradio('model_menu', 'interface_state'), gradio('interface_state')).then( - ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then( - update_model_parameters, gradio('interface_state'), None).then( - load_model_wrapper, gradio('model_menu', 'loader', 'autoload_model'), gradio('model_status'), show_progress=False) - - load.click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - update_model_parameters, gradio('interface_state'), None).then( - partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False) - - unload.click( - unload_model, None, None).then( - lambda: "Model unloaded", None, gradio('model_status')) - - reload.click( - unload_model, None, None).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - update_model_parameters, gradio('interface_state'), None).then( - partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False) - - save_settings.click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False) - - shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False) - shared.gradio['download_model_button'].click(download_model_wrapper, gradio('custom_model_menu'), gradio('model_status'), show_progress=True) - shared.gradio['autoload_model'].change(lambda x: gr.update(visible=not x), gradio('autoload_model'), load) - - -def create_chat_settings_menus(): - if not shared.is_chat(): - return - - with gr.Box(): - gr.Markdown("Chat parameters") - with gr.Row(): - with gr.Column(): - shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) - shared.gradio['chat_generation_attempts'] = gr.Slider(minimum=shared.settings['chat_generation_attempts_min'], maximum=shared.settings['chat_generation_attempts_max'], value=shared.settings['chat_generation_attempts'], step=1, label='Generation attempts (for longer replies)', info='New generations will be called until either this number is reached or no new content is generated between two iterations.') - - with gr.Column(): - shared.gradio['stop_at_newline'] = gr.Checkbox(value=shared.settings['stop_at_newline'], label='Stop generating at new line character') - - -def create_settings_menus(default_preset): - generate_params = presets.load_preset(default_preset) - 
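The removed create_settings_menus() seeds every sampler widget from the dict returned by presets.load_preset(). A rough sketch of that mechanism is shown below; the helper is a hypothetical stand-in rather than the repo's modules/presets.py, and it only assumes that presets live as flat YAML files under presets/ with keys such as temperature and top_p, which is how the save/delete preset handlers further down treat them:

```python
# Hypothetical stand-in for presets.load_preset(); for illustration only.
from pathlib import Path

import yaml


def load_preset(name):
    # Defaults used when the preset omits a key or when name == 'None'.
    generate_params = {
        'do_sample': True,
        'temperature': 1.0,
        'top_p': 1.0,
        'top_k': 0,
        'typical_p': 1.0,
        'repetition_penalty': 1.0,
    }

    path = Path('presets') / f'{name}.yaml'
    if name not in (None, 'None') and path.exists():
        with open(path, 'r', encoding='utf-8') as f:
            generate_params.update(yaml.safe_load(f) or {})

    return generate_params

# The sliders defined below are then initialized from this dict, e.g.
# gr.Slider(0.01, 1.99, value=generate_params['temperature'], step=0.01, label='temperature')
```

Saving a preset goes the other way: the save_preset handler serializes the current interface state with presets.generate_preset_yaml() before the file saver writes it back under presets/.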
with gr.Row(): - with gr.Column(): - with gr.Row(): - shared.gradio['preset_menu'] = gr.Dropdown(choices=utils.get_available_presets(), value=default_preset, label='Generation parameters preset', elem_classes='slim-dropdown') - ui.create_refresh_button(shared.gradio['preset_menu'], lambda: None, lambda: {'choices': utils.get_available_presets()}, 'refresh-button') - shared.gradio['save_preset'] = gr.Button('💾', elem_classes='refresh-button') - shared.gradio['delete_preset'] = gr.Button('🗑️', elem_classes='refresh-button') - - with gr.Column(): - filter_by_loader = gr.Dropdown(label="Filter by loader", choices=["All", "Transformers", "ExLlama_HF", "ExLlama", "AutoGPTQ", "GPTQ-for-LLaMa", "llama.cpp", "llamacpp_HF"], value="All", elem_classes='slim-dropdown') - - with gr.Row(): - with gr.Column(): - with gr.Box(): - with gr.Row(): - with gr.Column(): - shared.gradio['temperature'] = gr.Slider(0.01, 1.99, value=generate_params['temperature'], step=0.01, label='temperature') - shared.gradio['top_p'] = gr.Slider(0.0, 1.0, value=generate_params['top_p'], step=0.01, label='top_p') - shared.gradio['top_k'] = gr.Slider(0, 200, value=generate_params['top_k'], step=1, label='top_k') - shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, value=generate_params['typical_p'], step=0.01, label='typical_p') - shared.gradio['epsilon_cutoff'] = gr.Slider(0, 9, value=generate_params['epsilon_cutoff'], step=0.01, label='epsilon_cutoff') - shared.gradio['eta_cutoff'] = gr.Slider(0, 20, value=generate_params['eta_cutoff'], step=0.01, label='eta_cutoff') - shared.gradio['tfs'] = gr.Slider(0.0, 1.0, value=generate_params['tfs'], step=0.01, label='tfs') - shared.gradio['top_a'] = gr.Slider(0.0, 1.0, value=generate_params['top_a'], step=0.01, label='top_a') - - with gr.Column(): - shared.gradio['repetition_penalty'] = gr.Slider(1.0, 1.5, value=generate_params['repetition_penalty'], step=0.01, label='repetition_penalty') - shared.gradio['repetition_penalty_range'] = gr.Slider(0, 4096, step=64, value=generate_params['repetition_penalty_range'], label='repetition_penalty_range') - shared.gradio['encoder_repetition_penalty'] = gr.Slider(0.8, 1.5, value=generate_params['encoder_repetition_penalty'], step=0.01, label='encoder_repetition_penalty') - shared.gradio['no_repeat_ngram_size'] = gr.Slider(0, 20, step=1, value=generate_params['no_repeat_ngram_size'], label='no_repeat_ngram_size') - shared.gradio['min_length'] = gr.Slider(0, 2000, step=1, value=generate_params['min_length'], label='min_length') - shared.gradio['seed'] = gr.Number(value=shared.settings['seed'], label='Seed (-1 for random)') - shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample') - - with gr.Accordion("Learn more", open=False): - gr.Markdown(""" - - For a technical description of the parameters, the [transformers documentation](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig) is a good reference. - - The best presets, according to the [Preset Arena](https://github.com/oobabooga/oobabooga.github.io/blob/main/arena/results.md) experiment, are: - - * Instruction following: - 1) Divine Intellect - 2) Big O - 3) simple-1 - 4) Space Alien - 5) StarChat - 6) Titanic - 7) tfs-with-top-a - 8) Asterism - 9) Contrastive Search - - * Chat: - 1) Midnight Enigma - 2) Yara - 3) Shortwave - - ### Temperature - Primary factor to control randomness of outputs. 0 = deterministic (only the most likely token is used). Higher value = more randomness. 
- ### top_p - If not set to 1, select tokens with probabilities adding up to less than this number. Higher value = higher range of possible random results. - ### top_k - Similar to top_p, but select instead only the top_k most likely tokens. Higher value = higher range of possible random results. - ### typical_p - If not set to 1, select only tokens that are at least this much more likely to appear than random tokens, given the prior text. - ### epsilon_cutoff - In units of 1e-4; a reasonable value is 3. This sets a probability floor below which tokens are excluded from being sampled. Should be used with top_p, top_k, and eta_cutoff set to 0. - ### eta_cutoff - In units of 1e-4; a reasonable value is 3. Should be used with top_p, top_k, and epsilon_cutoff set to 0. - ### repetition_penalty - Exponential penalty factor for repeating prior tokens. 1 means no penalty, higher value = less repetition, lower value = more repetition. - ### repetition_penalty_range - The number of most recent tokens to consider for repetition penalty. 0 makes all tokens be used. - ### encoder_repetition_penalty - Also known as the "Hallucinations filter". Used to penalize tokens that are *not* in the prior text. Higher value = more likely to stay in context, lower value = more likely to diverge. - ### no_repeat_ngram_size - If not set to 0, specifies the length of token sets that are completely blocked from repeating at all. Higher values = blocks larger phrases, lower values = blocks words or letters from repeating. Only 0 or high values are a good idea in most cases. - ### min_length - Minimum generation length in tokens. - ### penalty_alpha - Contrastive Search is enabled by setting this to greater than zero and unchecking "do_sample". It should be used with a low value of top_k, for instance, top_k = 4. - - """, elem_classes="markdown") - - with gr.Column(): - create_chat_settings_menus() - with gr.Box(): - with gr.Row(): - with gr.Column(): - shared.gradio['guidance_scale'] = gr.Slider(-0.5, 2.5, step=0.05, value=generate_params['guidance_scale'], label='guidance_scale', info='For CFG. 1.5 is a good value.') - shared.gradio['negative_prompt'] = gr.Textbox(value=shared.settings['negative_prompt'], label='Negative prompt') - shared.gradio['mirostat_mode'] = gr.Slider(0, 2, step=1, value=generate_params['mirostat_mode'], label='mirostat_mode', info='mode=1 is for llama.cpp only.') - shared.gradio['mirostat_tau'] = gr.Slider(0, 10, step=0.01, value=generate_params['mirostat_tau'], label='mirostat_tau') - shared.gradio['mirostat_eta'] = gr.Slider(0, 1, step=0.01, value=generate_params['mirostat_eta'], label='mirostat_eta') - - with gr.Column(): - shared.gradio['penalty_alpha'] = gr.Slider(0, 5, value=generate_params['penalty_alpha'], label='penalty_alpha', info='For Contrastive Search. 
do_sample must be unchecked.') - - shared.gradio['num_beams'] = gr.Slider(1, 20, step=1, value=generate_params['num_beams'], label='num_beams', info='For Beam Search, along with length_penalty and early_stopping.') - shared.gradio['length_penalty'] = gr.Slider(-5, 5, value=generate_params['length_penalty'], label='length_penalty') - shared.gradio['early_stopping'] = gr.Checkbox(value=generate_params['early_stopping'], label='early_stopping') - - with gr.Box(): - with gr.Row(): - with gr.Column(): - shared.gradio['truncation_length'] = gr.Slider(value=shared.settings['truncation_length'], minimum=shared.settings['truncation_length_min'], maximum=shared.settings['truncation_length_max'], step=256, label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. Most models require this to be at most 2048.') - shared.gradio['custom_stopping_strings'] = gr.Textbox(lines=1, value=shared.settings["custom_stopping_strings"] or None, label='Custom stopping strings', info='In addition to the defaults. Written between "" and separated by commas. For instance: "\\nYour Assistant:", "\\nThe assistant:"') - with gr.Column(): - shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.') - shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.') - shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.') - - shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.') - shared.gradio['stream'] = gr.Checkbox(value=not shared.args.no_stream, label='Activate text streaming') - - filter_by_loader.change(loaders.blacklist_samplers, filter_by_loader, gradio(loaders.list_all_samplers()), show_progress=False) - shared.gradio['preset_menu'].change(presets.load_preset_for_ui, gradio('preset_menu', 'interface_state'), gradio('interface_state') + gradio(presets.presets_params())) - - -def create_file_saving_menus(): - - # Text file saver - with gr.Box(visible=False, elem_classes='file-saver') as shared.gradio['file_saver']: - shared.gradio['save_filename'] = gr.Textbox(lines=1, label='File name') - shared.gradio['save_root'] = gr.Textbox(lines=1, label='File folder', info='For reference. Unchangeable.', interactive=False) - shared.gradio['save_contents'] = gr.Textbox(lines=10, label='File contents') - with gr.Row(): - shared.gradio['save_confirm'] = gr.Button('Save', elem_classes="small-button") - shared.gradio['save_cancel'] = gr.Button('Cancel', elem_classes="small-button") - - # Text file deleter - with gr.Box(visible=False, elem_classes='file-saver') as shared.gradio['file_deleter']: - shared.gradio['delete_filename'] = gr.Textbox(lines=1, label='File name') - shared.gradio['delete_root'] = gr.Textbox(lines=1, label='File folder', info='For reference. 
Unchangeable.', interactive=False) - with gr.Row(): - shared.gradio['delete_confirm'] = gr.Button('Delete', elem_classes="small-button", variant='stop') - shared.gradio['delete_cancel'] = gr.Button('Cancel', elem_classes="small-button") - - # Character saver/deleter - if shared.is_chat(): - with gr.Box(visible=False, elem_classes='file-saver') as shared.gradio['character_saver']: - shared.gradio['save_character_filename'] = gr.Textbox(lines=1, label='File name', info='The character will be saved to your characters/ folder with this base filename.') - with gr.Row(): - shared.gradio['save_character_confirm'] = gr.Button('Save', elem_classes="small-button") - shared.gradio['save_character_cancel'] = gr.Button('Cancel', elem_classes="small-button") - - with gr.Box(visible=False, elem_classes='file-saver') as shared.gradio['character_deleter']: - gr.Markdown('Confirm the character deletion?') - with gr.Row(): - shared.gradio['delete_character_confirm'] = gr.Button('Delete', elem_classes="small-button", variant='stop') - shared.gradio['delete_character_cancel'] = gr.Button('Cancel', elem_classes="small-button") - - -def create_file_saving_event_handlers(): - shared.gradio['save_confirm'].click( - lambda x, y, z: utils.save_file(x + y, z), gradio('save_root', 'save_filename', 'save_contents'), None).then( - lambda: gr.update(visible=False), None, gradio('file_saver')) - - shared.gradio['delete_confirm'].click( - lambda x, y: utils.delete_file(x + y), gradio('delete_root', 'delete_filename'), None).then( - lambda: gr.update(visible=False), None, gradio('file_deleter')) - - shared.gradio['delete_cancel'].click(lambda: gr.update(visible=False), None, gradio('file_deleter')) - shared.gradio['save_cancel'].click(lambda: gr.update(visible=False), None, gradio('file_saver')) - if shared.is_chat(): - shared.gradio['save_character_confirm'].click( - chat.save_character, gradio('name2', 'greeting', 'context', 'character_picture', 'save_character_filename'), None).then( - lambda: gr.update(visible=False), None, gradio('character_saver')) - - shared.gradio['delete_character_confirm'].click( - chat.delete_character, gradio('character_menu'), None).then( - lambda: gr.update(visible=False), None, gradio('character_deleter')).then( - lambda: gr.update(choices=utils.get_available_characters()), None, gradio('character_menu')) - - shared.gradio['save_character_cancel'].click(lambda: gr.update(visible=False), None, gradio('character_saver')) - shared.gradio['delete_character_cancel'].click(lambda: gr.update(visible=False), None, gradio('character_deleter')) - - shared.gradio['save_preset'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - presets.generate_preset_yaml, gradio('interface_state'), gradio('save_contents')).then( - lambda: 'presets/', None, gradio('save_root')).then( - lambda: 'My Preset.yaml', None, gradio('save_filename')).then( - lambda: gr.update(visible=True), None, gradio('file_saver')) - - shared.gradio['delete_preset'].click( - lambda x: f'{x}.yaml', gradio('preset_menu'), gradio('delete_filename')).then( - lambda: 'presets/', None, gradio('delete_root')).then( - lambda: gr.update(visible=True), None, gradio('file_deleter')) - - if not shared.args.multi_user: - - def load_session(file, state): - decoded_file = file if type(file) == str else file.decode('utf-8') - data = json.loads(decoded_file) - - if shared.is_chat() and 'character_menu' in data and state.get('character_menu') != data.get('character_menu'): - shared.session_is_loading = True 
- - state.update(data) - return state - - shared.gradio['save_session'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: json.dumps(x, indent=4), gradio('interface_state'), gradio('temporary_text')).then( - None, gradio('temporary_text'), None, _js=f"(contents) => {{{ui.save_files_js}; saveSession(contents, \"{shared.get_mode()}\")}}") - - if shared.is_chat(): - shared.gradio['load_session'].upload( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - load_session, gradio('load_session', 'interface_state'), gradio('interface_state')).then( - ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then( - chat.redraw_html, shared.reload_inputs, gradio('display')).then( - None, None, None, _js='() => {alert("The session has been loaded.")}') - else: - shared.gradio['load_session'].upload( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - load_session, gradio('load_session', 'interface_state'), gradio('interface_state')).then( - ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then( - None, None, None, _js='() => {alert("The session has been loaded.")}') - - -def set_interface_arguments(interface_mode, extensions, bool_active): - modes = ["default", "notebook", "chat", "cai_chat"] - cmd_list = vars(shared.args) - bool_list = [k for k in cmd_list if type(cmd_list[k]) is bool and k not in modes] - - shared.args.extensions = extensions - for k in modes[1:]: - setattr(shared.args, k, False) - if interface_mode != "default": - setattr(shared.args, interface_mode, True) - for k in bool_list: - setattr(shared.args, k, False) - for k in bool_active: - setattr(shared.args, k, True) - - shared.need_restart = True - - def create_interface(): - # Defining some variables - gen_events = [] - default_preset = shared.settings['preset'] - default_text = load_prompt(shared.settings['prompt']) title = 'Text generation web UI' - # Authentication variables - auth = None - gradio_auth_creds = [] + # Password authentication + auth = [] if shared.args.gradio_auth: - gradio_auth_creds += [x.strip() for x in shared.args.gradio_auth.strip('"').replace('\n', '').split(',') if x.strip()] - if shared.args.gradio_auth_path is not None: + auth.extend(x.strip() for x in shared.args.gradio_auth.strip('"').replace('\n', '').split(',') if x.strip()) + if shared.args.gradio_auth_path: with open(shared.args.gradio_auth_path, 'r', encoding="utf8") as file: - for line in file.readlines(): - gradio_auth_creds += [x.strip() for x in line.split(',') if x.strip()] - if gradio_auth_creds: - auth = [tuple(cred.split(':')) for cred in gradio_auth_creds] + auth.extend(x.strip() for line in file for x in line.split(',') if x.strip()) + auth = [tuple(cred.split(':')) for cred in auth] - # Importing the extension files and executing their setup() functions + # Import the extensions and execute their setup() functions if shared.args.extensions is not None and len(shared.args.extensions) > 0: extensions_module.load_extensions() - # Forcing some events to be triggered on page load + # Force some events to be triggered on page load shared.persistent_interface_state.update({ 'loader': shared.args.loader or 'Transformers', }) - if shared.is_chat(): shared.persistent_interface_state.update({ 'mode': shared.settings['mode'], @@ -603,482 +86,55 @@ def create_interface(): 
css += apply_extensions('css') js += apply_extensions('js') + # The input elements for the generation functions + shared.input_elements = ui.list_interface_input_elements() + with gr.Blocks(css=css, analytics_enabled=False, title=title, theme=ui.theme) as shared.gradio['interface']: + + # Audio notification if Path("notification.mp3").exists(): shared.gradio['audio_notification'] = gr.Audio(interactive=False, value="notification.mp3", elem_id="audio_notification", visible=False) - audio_notification_js = "document.querySelector('#audio_notification audio')?.play();" - else: - audio_notification_js = "" # Floating menus for saving/deleting files - create_file_saving_menus() + ui_file_saving.create_ui() - # Used for saving files using javascript + # Temporary clipboard for saving files shared.gradio['temporary_text'] = gr.Textbox(visible=False) - # Create chat mode interface + # Text Generation tab if shared.is_chat(): - shared.input_elements = ui.list_interface_input_elements() - - shared.gradio.update({ - 'interface_state': gr.State({k: None for k in shared.input_elements}), - 'Chat input': gr.State(), - 'dummy': gr.State(), - 'history': gr.State({'internal': [], 'visible': []}), - }) - - with gr.Tab('Text generation', elem_id='main'): - shared.gradio['display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, shared.settings['name1'], shared.settings['name2'], 'chat', 'cai-chat')) - shared.gradio['textbox'] = gr.Textbox(label='Input') - with gr.Row(): - shared.gradio['Stop'] = gr.Button('Stop', elem_id='stop') - shared.gradio['Generate'] = gr.Button('Generate', elem_id='Generate', variant='primary') - shared.gradio['Continue'] = gr.Button('Continue') - - with gr.Row(): - shared.gradio['Impersonate'] = gr.Button('Impersonate') - shared.gradio['Regenerate'] = gr.Button('Regenerate') - shared.gradio['Remove last'] = gr.Button('Remove last', elem_classes=['button_nowrap']) - - with gr.Row(): - shared.gradio['Copy last reply'] = gr.Button('Copy last reply') - shared.gradio['Replace last reply'] = gr.Button('Replace last reply') - shared.gradio['Send dummy message'] = gr.Button('Send dummy message') - shared.gradio['Send dummy reply'] = gr.Button('Send dummy reply') - - with gr.Row(): - shared.gradio['Clear history'] = gr.Button('Clear history') - shared.gradio['Clear history-confirm'] = gr.Button('Confirm', variant='stop', visible=False) - shared.gradio['Clear history-cancel'] = gr.Button('Cancel', visible=False) - - with gr.Row(): - shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with']) - - with gr.Row(): - shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'instruct', 'chat-instruct'] else 'chat', label='Mode', info='Defines how the chat prompt is generated. 
In instruct and chat-instruct modes, the instruction template selected under "Chat settings" must match the current model.') - shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct') - - with gr.Tab('Chat settings', elem_id='chat-settings'): - - with gr.Tab("Character"): - with gr.Row(): - with gr.Column(scale=8): - with gr.Row(): - shared.gradio['character_menu'] = gr.Dropdown(value='None', choices=utils.get_available_characters(), label='Character', elem_id='character-menu', info='Used in chat and chat-instruct modes.', elem_classes='slim-dropdown') - ui.create_refresh_button(shared.gradio['character_menu'], lambda: None, lambda: {'choices': utils.get_available_characters()}, 'refresh-button') - shared.gradio['save_character'] = gr.Button('💾', elem_classes='refresh-button') - shared.gradio['delete_character'] = gr.Button('🗑️', elem_classes='refresh-button') - - shared.gradio['name1'] = gr.Textbox(value=shared.settings['name1'], lines=1, label='Your name') - shared.gradio['name2'] = gr.Textbox(value=shared.settings['name2'], lines=1, label='Character\'s name') - shared.gradio['context'] = gr.Textbox(value=shared.settings['context'], lines=4, label='Context', elem_classes=['add_scrollbar']) - shared.gradio['greeting'] = gr.Textbox(value=shared.settings['greeting'], lines=4, label='Greeting', elem_classes=['add_scrollbar']) - - with gr.Column(scale=1): - shared.gradio['character_picture'] = gr.Image(label='Character picture', type='pil') - shared.gradio['your_picture'] = gr.Image(label='Your picture', type='pil', value=Image.open(Path('cache/pfp_me.png')) if Path('cache/pfp_me.png').exists() else None) - - with gr.Tab("Instruction template"): - with gr.Row(): - with gr.Row(): - shared.gradio['instruction_template'] = gr.Dropdown(choices=utils.get_available_instruction_templates(), label='Instruction template', value='None', info='Change this according to the model/LoRA that you are using. 
Used in instruct and chat-instruct modes.', elem_classes='slim-dropdown') - ui.create_refresh_button(shared.gradio['instruction_template'], lambda: None, lambda: {'choices': utils.get_available_instruction_templates()}, 'refresh-button') - shared.gradio['save_template'] = gr.Button('💾', elem_classes='refresh-button') - shared.gradio['delete_template'] = gr.Button('🗑️ ', elem_classes='refresh-button') - - shared.gradio['name1_instruct'] = gr.Textbox(value='', lines=2, label='User string') - shared.gradio['name2_instruct'] = gr.Textbox(value='', lines=1, label='Bot string') - shared.gradio['context_instruct'] = gr.Textbox(value='', lines=4, label='Context') - shared.gradio['turn_template'] = gr.Textbox(value=shared.settings['turn_template'], lines=1, label='Turn template', info='Used to precisely define the placement of spaces and new line characters in instruction prompts.') - with gr.Row(): - shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=4, label='Command for chat-instruct mode', info='<|character|> gets replaced by the bot name, and <|prompt|> gets replaced by the regular chat prompt.', elem_classes=['add_scrollbar']) - - with gr.Tab('Chat history'): - with gr.Row(): - with gr.Column(): - shared.gradio['save_chat_history'] = gr.Button(value='Save history') - - with gr.Column(): - shared.gradio['load_chat_history'] = gr.File(type='binary', file_types=['.json', '.txt'], label="Upload History JSON") - - with gr.Tab('Upload character'): - with gr.Tab('YAML or JSON'): - with gr.Row(): - shared.gradio['upload_json'] = gr.File(type='binary', file_types=['.json', '.yaml'], label='JSON or YAML File') - shared.gradio['upload_img_bot'] = gr.Image(type='pil', label='Profile Picture (optional)') - - shared.gradio['Submit character'] = gr.Button(value='Submit', interactive=False) - - with gr.Tab('TavernAI PNG'): - with gr.Row(): - with gr.Column(): - shared.gradio['upload_img_tavern'] = gr.Image(type='pil', label='TavernAI PNG File', elem_id="upload_img_tavern") - shared.gradio['tavern_json'] = gr.State() - with gr.Column(): - shared.gradio['tavern_name'] = gr.Textbox(value='', lines=1, label='Name', interactive=False) - shared.gradio['tavern_desc'] = gr.Textbox(value='', lines=4, max_lines=4, label='Description', interactive=False) - - shared.gradio['Submit tavern character'] = gr.Button(value='Submit', interactive=False) - - with gr.Tab("Parameters", elem_id="parameters"): - create_settings_menus(default_preset) - - # Create notebook mode interface + ui_chat.create_ui() elif shared.args.notebook: - shared.input_elements = ui.list_interface_input_elements() - shared.gradio['interface_state'] = gr.State({k: None for k in shared.input_elements}) - shared.gradio['last_input'] = gr.State('') - with gr.Tab("Text generation", elem_id="main"): - with gr.Row(): - with gr.Column(scale=4): - with gr.Tab('Raw'): - shared.gradio['textbox'] = gr.Textbox(value=default_text, elem_classes=['textbox', 'add_scrollbar'], lines=27) - - with gr.Tab('Markdown'): - shared.gradio['markdown_render'] = gr.Button('Render') - shared.gradio['markdown'] = gr.Markdown() - - with gr.Tab('HTML'): - shared.gradio['html'] = gr.HTML() - - with gr.Row(): - shared.gradio['Generate'] = gr.Button('Generate', variant='primary', elem_classes="small-button") - shared.gradio['Stop'] = gr.Button('Stop', elem_classes="small-button", elem_id='stop') - shared.gradio['Undo'] = gr.Button('Undo', elem_classes="small-button") - shared.gradio['Regenerate'] = gr.Button('Regenerate', 
elem_classes="small-button") - - with gr.Column(scale=1): - gr.HTML('
    ') - shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) - with gr.Row(): - shared.gradio['prompt_menu'] = gr.Dropdown(choices=utils.get_available_prompts(), value='None', label='Prompt', elem_classes='slim-dropdown') - ui.create_refresh_button(shared.gradio['prompt_menu'], lambda: None, lambda: {'choices': utils.get_available_prompts()}, ['refresh-button', 'refresh-button-small']) - shared.gradio['save_prompt'] = gr.Button('💾', elem_classes=['refresh-button', 'refresh-button-small']) - shared.gradio['delete_prompt'] = gr.Button('🗑️', elem_classes=['refresh-button', 'refresh-button-small']) - - shared.gradio['count_tokens'] = gr.Button('Count tokens') - shared.gradio['status'] = gr.Markdown('') - - with gr.Tab("Parameters", elem_id="parameters"): - create_settings_menus(default_preset) - - # Create default mode interface + ui_notebook.create_ui() else: - shared.input_elements = ui.list_interface_input_elements() - shared.gradio['interface_state'] = gr.State({k: None for k in shared.input_elements}) - shared.gradio['last_input'] = gr.State('') - with gr.Tab("Text generation", elem_id="main"): - with gr.Row(): - with gr.Column(): - shared.gradio['textbox'] = gr.Textbox(value=default_text, elem_classes=['textbox_default', 'add_scrollbar'], lines=27, label='Input') - shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) - with gr.Row(): - shared.gradio['Generate'] = gr.Button('Generate', variant='primary') - shared.gradio['Stop'] = gr.Button('Stop', elem_id='stop') - shared.gradio['Continue'] = gr.Button('Continue') - shared.gradio['count_tokens'] = gr.Button('Count tokens') + ui_default.create_ui() - with gr.Row(): - shared.gradio['prompt_menu'] = gr.Dropdown(choices=utils.get_available_prompts(), value='None', label='Prompt', elem_classes='slim-dropdown') - ui.create_refresh_button(shared.gradio['prompt_menu'], lambda: None, lambda: {'choices': utils.get_available_prompts()}, 'refresh-button') - shared.gradio['save_prompt'] = gr.Button('💾', elem_classes='refresh-button') - shared.gradio['delete_prompt'] = gr.Button('🗑️', elem_classes='refresh-button') + ui_parameters.create_ui(shared.settings['preset']) # Parameters tab + ui_model_menu.create_ui() # Model tab + training.create_ui() # Training tab + ui_session.create_ui() # Session tab - shared.gradio['status'] = gr.Markdown('') - - with gr.Column(): - with gr.Tab('Raw'): - shared.gradio['output_textbox'] = gr.Textbox(lines=27, label='Output', elem_classes=['textbox_default_output', 'add_scrollbar']) - - with gr.Tab('Markdown'): - shared.gradio['markdown_render'] = gr.Button('Render') - shared.gradio['markdown'] = gr.Markdown() - - with gr.Tab('HTML'): - shared.gradio['html'] = gr.HTML() - - with gr.Tab("Parameters", elem_id="parameters"): - create_settings_menus(default_preset) - - # Model tab - with gr.Tab("Model", elem_id="model-tab"): - create_model_menus() - - # Training tab - with gr.Tab("Training", elem_id="training-tab"): - training.create_train_interface() - - # Session tab - with gr.Tab("Session", elem_id="session-tab"): - modes = ["default", "notebook", "chat"] - current_mode = "default" - for mode in modes[1:]: - if getattr(shared.args, mode): - current_mode = mode - break - - cmd_list = vars(shared.args) - bool_list 
= sorted([k for k in cmd_list if type(cmd_list[k]) is bool and k not in modes + ui.list_model_elements()]) - bool_active = [k for k in bool_list if vars(shared.args)[k]] - - with gr.Row(): - - with gr.Column(): - with gr.Row(): - shared.gradio['interface_modes_menu'] = gr.Dropdown(choices=modes, value=current_mode, label="Mode", elem_classes='slim-dropdown') - shared.gradio['reset_interface'] = gr.Button("Apply and restart", elem_classes="small-button", variant="primary") - shared.gradio['toggle_dark_mode'] = gr.Button('Toggle 💡', elem_classes="small-button") - - with gr.Row(): - with gr.Column(): - shared.gradio['extensions_menu'] = gr.CheckboxGroup(choices=utils.get_available_extensions(), value=shared.args.extensions, label="Available extensions", info='Note that some of these extensions may require manually installing Python requirements through the command: pip install -r extensions/extension_name/requirements.txt', elem_classes='checkboxgroup-table') - - with gr.Column(): - shared.gradio['bool_menu'] = gr.CheckboxGroup(choices=bool_list, value=bool_active, label="Boolean command-line flags", elem_classes='checkboxgroup-table') - - with gr.Column(): - if not shared.args.multi_user: - shared.gradio['save_session'] = gr.Button('Save session', elem_id="save_session") - shared.gradio['load_session'] = gr.File(type='binary', file_types=['.json'], label="Upload Session JSON") - - extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.') - extension_status = gr.Markdown() - - extension_name.submit( - clone_or_pull_repository, extension_name, extension_status, show_progress=False).then( - lambda: gr.update(choices=utils.get_available_extensions(), value=shared.args.extensions), None, gradio('extensions_menu')) - - # Reset interface event - shared.gradio['reset_interface'].click( - set_interface_arguments, gradio('interface_modes_menu', 'extensions_menu', 'bool_menu'), None).then( - lambda: None, None, None, _js='() => {document.body.innerHTML=\'
    Reloading...
    \'; setTimeout(function(){location.reload()},2500); return []}') - - shared.gradio['toggle_dark_mode'].click(lambda: None, None, None, _js='() => {document.getElementsByTagName("body")[0].classList.toggle("dark")}') - - # chat mode event handlers + # Generation events if shared.is_chat(): - shared.input_params = gradio('Chat input', 'start_with', 'interface_state') - clear_arr = gradio('Clear history-confirm', 'Clear history', 'Clear history-cancel') - shared.reload_inputs = gradio('history', 'name1', 'name2', 'mode', 'chat_style') - - gen_events.append(shared.gradio['Generate'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( - chat.generate_chat_reply_wrapper, shared.input_params, gradio('display', 'history'), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( - lambda: None, None, None, _js=f"() => {{{audio_notification_js}}}") - ) - - gen_events.append(shared.gradio['textbox'].submit( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( - chat.generate_chat_reply_wrapper, shared.input_params, gradio('display', 'history'), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( - lambda: None, None, None, _js=f"() => {{{audio_notification_js}}}") - ) - - gen_events.append(shared.gradio['Regenerate'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - partial(chat.generate_chat_reply_wrapper, regenerate=True), shared.input_params, gradio('display', 'history'), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( - lambda: None, None, None, _js=f"() => {{{audio_notification_js}}}") - ) - - gen_events.append(shared.gradio['Continue'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - partial(chat.generate_chat_reply_wrapper, _continue=True), shared.input_params, gradio('display', 'history'), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( - lambda: None, None, None, _js=f"() => {{{audio_notification_js}}}") - ) - - gen_events.append(shared.gradio['Impersonate'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: x, gradio('textbox'), gradio('Chat input'), show_progress=False).then( - chat.impersonate_wrapper, shared.input_params, gradio('textbox'), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, _js=f"() => {{{audio_notification_js}}}") - ) - - shared.gradio['Replace last reply'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.replace_last_reply, 
gradio('textbox', 'interface_state'), gradio('history')).then( - lambda: '', None, gradio('textbox'), show_progress=False).then( - chat.redraw_html, shared.reload_inputs, gradio('display')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) - - shared.gradio['Send dummy message'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.send_dummy_message, gradio('textbox', 'interface_state'), gradio('history')).then( - lambda: '', None, gradio('textbox'), show_progress=False).then( - chat.redraw_html, shared.reload_inputs, gradio('display')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) - - shared.gradio['Send dummy reply'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.send_dummy_reply, gradio('textbox', 'interface_state'), gradio('history')).then( - lambda: '', None, gradio('textbox'), show_progress=False).then( - chat.redraw_html, shared.reload_inputs, gradio('display')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) - - shared.gradio['Clear history'].click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, clear_arr) - shared.gradio['Clear history-cancel'].click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, clear_arr) - shared.gradio['Clear history-confirm'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, clear_arr).then( - chat.clear_chat_log, gradio('interface_state'), gradio('history')).then( - chat.redraw_html, shared.reload_inputs, gradio('display')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) - - shared.gradio['Remove last'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.remove_last_message, gradio('history'), gradio('textbox', 'history'), show_progress=False).then( - chat.redraw_html, shared.reload_inputs, gradio('display')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) - - shared.gradio['character_menu'].change( - partial(chat.load_character, instruct=False), gradio('character_menu', 'name1', 'name2'), gradio('name1', 'name2', 'character_picture', 'greeting', 'context', 'dummy')).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.load_persistent_history, gradio('interface_state'), gradio('history')).then( - chat.redraw_html, shared.reload_inputs, gradio('display')) - - shared.gradio['Stop'].click( - stop_everything_event, None, None, queue=False, cancels=gen_events if shared.args.no_stream else None).then( - chat.redraw_html, shared.reload_inputs, gradio('display')) - - shared.gradio['mode'].change( - lambda x: gr.update(visible=x != 'instruct'), gradio('mode'), gradio('chat_style'), show_progress=False).then( - chat.redraw_html, shared.reload_inputs, gradio('display')) - - shared.gradio['chat_style'].change(chat.redraw_html, shared.reload_inputs, gradio('display')) - shared.gradio['instruction_template'].change( - partial(chat.load_character, instruct=True), gradio('instruction_template', 'name1_instruct', 'name2_instruct'), gradio('name1_instruct', 'name2_instruct', 'dummy', 'dummy', 'context_instruct', 
'turn_template')) - - shared.gradio['load_chat_history'].upload( - chat.load_history, gradio('load_chat_history', 'history'), gradio('history')).then( - chat.redraw_html, shared.reload_inputs, gradio('display')).then( - None, None, None, _js='() => {alert("The history has been loaded.")}') - - shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, gradio('history'), gradio('textbox'), show_progress=False) - - # Save/delete a character - shared.gradio['save_character'].click( - lambda x: x, gradio('name2'), gradio('save_character_filename')).then( - lambda: gr.update(visible=True), None, gradio('character_saver')) - - shared.gradio['delete_character'].click(lambda: gr.update(visible=True), None, gradio('character_deleter')) - - shared.gradio['save_template'].click( - lambda: 'My Template.yaml', None, gradio('save_filename')).then( - lambda: 'characters/instruction-following/', None, gradio('save_root')).then( - chat.generate_instruction_template_yaml, gradio('name1_instruct', 'name2_instruct', 'context_instruct', 'turn_template'), gradio('save_contents')).then( - lambda: gr.update(visible=True), None, gradio('file_saver')) - - shared.gradio['delete_template'].click( - lambda x: f'{x}.yaml', gradio('instruction_template'), gradio('delete_filename')).then( - lambda: 'characters/instruction-following/', None, gradio('delete_root')).then( - lambda: gr.update(visible=True), None, gradio('file_deleter')) - - shared.gradio['save_chat_history'].click( - lambda x: json.dumps(x, indent=4), gradio('history'), gradio('temporary_text')).then( - None, gradio('temporary_text', 'character_menu', 'mode'), None, _js=f"(hist, char, mode) => {{{ui.save_files_js}; saveHistory(hist, char, mode)}}") - - shared.gradio['Submit character'].click( - chat.upload_character, gradio('upload_json', 'upload_img_bot'), gradio('character_menu')).then( - None, None, None, _js='() => {alert("The character has been loaded.")}') - - shared.gradio['Submit tavern character'].click( - chat.upload_tavern_character, gradio('upload_img_tavern', 'tavern_json'), gradio('character_menu')).then( - None, None, None, _js='() => {alert("The character has been loaded.")}') - - shared.gradio['upload_json'].upload(lambda: gr.update(interactive=True), None, gradio('Submit character')) - shared.gradio['upload_json'].clear(lambda: gr.update(interactive=False), None, gradio('Submit character')) - shared.gradio['upload_img_tavern'].upload(chat.check_tavern_character, gradio('upload_img_tavern'), gradio('tavern_name', 'tavern_desc', 'tavern_json', 'Submit tavern character'), show_progress=False) - shared.gradio['upload_img_tavern'].clear(lambda: (None, None, None, gr.update(interactive=False)), None, gradio('tavern_name', 'tavern_desc', 'tavern_json', 'Submit tavern character'), show_progress=False) - shared.gradio['your_picture'].change( - chat.upload_your_profile_picture, gradio('your_picture'), None).then( - partial(chat.redraw_html, reset_cache=True), shared.reload_inputs, gradio('display')) - - # notebook/default modes event handlers + ui_chat.create_event_handlers() + elif shared.args.notebook: + ui_notebook.create_event_handlers() else: - shared.input_params = gradio('textbox', 'interface_state') - if shared.args.notebook: - output_params = gradio('textbox', 'html') - else: - output_params = gradio('output_textbox', 'html') + ui_default.create_event_handlers() - gen_events.append(shared.gradio['Generate'].click( - lambda x: x, gradio('textbox'), gradio('last_input')).then( - ui.gather_interface_values, 
gradio(shared.input_elements), gradio('interface_state')).then( - generate_reply_wrapper, shared.input_params, output_params, show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, _js=f"() => {{{audio_notification_js}}}") - # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") - ) - - gen_events.append(shared.gradio['textbox'].submit( - lambda x: x, gradio('textbox'), gradio('last_input')).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - generate_reply_wrapper, shared.input_params, output_params, show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, _js=f"() => {{{audio_notification_js}}}") - # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") - ) - - if shared.args.notebook: - shared.gradio['Undo'].click(lambda x: x, gradio('last_input'), gradio('textbox'), show_progress=False) - shared.gradio['markdown_render'].click(lambda x: x, gradio('textbox'), gradio('markdown'), queue=False) - gen_events.append(shared.gradio['Regenerate'].click( - lambda x: x, gradio('last_input'), gradio('textbox'), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - generate_reply_wrapper, shared.input_params, output_params, show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, _js=f"() => {{{audio_notification_js}}}") - # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") - ) - else: - shared.gradio['markdown_render'].click(lambda x: x, gradio('output_textbox'), gradio('markdown'), queue=False) - gen_events.append(shared.gradio['Continue'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - generate_reply_wrapper, [shared.gradio['output_textbox']] + shared.input_params[1:], output_params, show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, _js=f"() => {{{audio_notification_js}}}") - # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[1]; element.scrollTop = element.scrollHeight}") - ) - - shared.gradio['Stop'].click(stop_everything_event, None, None, queue=False, cancels=gen_events if shared.args.no_stream else None) - shared.gradio['prompt_menu'].change(load_prompt, gradio('prompt_menu'), gradio('textbox'), show_progress=False) - shared.gradio['save_prompt'].click( - lambda x: x, gradio('textbox'), gradio('save_contents')).then( - lambda: 'prompts/', None, gradio('save_root')).then( - lambda: utils.current_time() + '.txt', None, gradio('save_filename')).then( - lambda: gr.update(visible=True), None, gradio('file_saver')) - - shared.gradio['delete_prompt'].click( - lambda: 'prompts/', None, gradio('delete_root')).then( - lambda x: x + '.txt', gradio('prompt_menu'), gradio('delete_filename')).then( - lambda: gr.update(visible=True), None, gradio('file_deleter')) - - shared.gradio['count_tokens'].click(count_tokens, gradio('textbox'), gradio('status'), show_progress=False) 
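All of the generation handlers above rely on the same Gradio idiom: an event such as .click() or .submit() returns a dependency whose .then() calls queue follow-up steps (gather the interface state, run the generation wrapper, fire the audio notification) in order. A stripped-down sketch of that chaining pattern, using illustrative component names rather than the web UI's real ones:

```python
import gradio as gr


def remember(text):
    # Snapshot the prompt so a later "Undo"/"Regenerate" step can restore it.
    return text


def generate(text):
    # Placeholder for a generation function such as generate_reply_wrapper().
    return f"Generated reply for: {text}"


with gr.Blocks() as demo:
    last_input = gr.State('')
    textbox = gr.Textbox(label='Input')
    output = gr.Textbox(label='Output')
    generate_btn = gr.Button('Generate')

    # Each .then() step runs only after the previous one finishes,
    # mirroring the .click(...).then(...).then(...) chains used above.
    generate_btn.click(remember, textbox, last_input).then(
        generate, textbox, output)

if __name__ == '__main__':
    demo.launch()
```

Because each .then() step starts only after the previous one completes, the state passed to the generation step already reflects the snapshot taken in the first step.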
- - create_file_saving_event_handlers() - - if shared.settings['dark_theme']: - shared.gradio['interface'].load(lambda: None, None, None, _js="() => document.getElementsByTagName('body')[0].classList.add('dark')") + # Other events + ui_file_saving.create_event_handlers() + ui_parameters.create_event_handlers() + ui_model_menu.create_event_handlers() + # Interface launch events shared.gradio['interface'].load(lambda: None, None, None, _js=f"() => {{{js}}}") shared.gradio['interface'].load(partial(ui.apply_interface_values, {}, use_persistent=True), None, gradio(ui.list_interface_input_elements()), show_progress=False) if shared.is_chat(): shared.gradio['interface'].load(chat.redraw_html, shared.reload_inputs, gradio('display')) - # Extensions tabs - extensions_module.create_extensions_tabs() - - # Extensions block - extensions_module.create_extensions_block() + extensions_module.create_extensions_tabs() # Extensions tabs + extensions_module.create_extensions_block() # Extensions block # Launch the interface shared.gradio['interface'].queue() @@ -1086,17 +142,19 @@ def create_interface(): shared.gradio['interface'].launch( prevent_thread_lock=True, share=shared.args.share, - server_name = None if not shared.args.listen else (shared.args.listen_host or '0.0.0.0'), + server_name=None if not shared.args.listen else (shared.args.listen_host or '0.0.0.0'), server_port=shared.args.listen_port, inbrowser=shared.args.auto_launch, - auth=auth, + auth=auth or None, ssl_verify=False if (shared.args.ssl_keyfile or shared.args.ssl_certfile) else True, ssl_keyfile=shared.args.ssl_keyfile, ssl_certfile=shared.args.ssl_certfile ) + if __name__ == "__main__": - # Loading custom settings + + # Load custom settings settings_file = None if shared.args.settings is not None and Path(shared.args.settings).exists(): settings_file = Path(shared.args.settings) @@ -1109,10 +167,9 @@ if __name__ == "__main__": logger.info(f"Loading settings from {settings_file}...") file_contents = open(settings_file, 'r', encoding='utf-8').read() new_settings = json.loads(file_contents) if settings_file.suffix == "json" else yaml.safe_load(file_contents) - for item in new_settings: - shared.settings[item] = new_settings[item] + shared.settings.update(new_settings) - # Set default model settings based on settings file + # Fallback settings for models shared.model_config['.*'] = { 'wbits': 'None', 'model_type': 'None', @@ -1128,7 +185,7 @@ if __name__ == "__main__": shared.model_config.move_to_end('.*', last=False) # Move to the beginning - # Default extensions + # Activate the extensions listed on settings.yaml extensions_module.available_extensions = utils.get_available_extensions() if shared.is_chat(): for extension in shared.settings['chat_default_extensions']: From c237ce607e75b536f4807575aff846a6cad0da7b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 6 Aug 2023 17:50:07 -0700 Subject: [PATCH 032/169] Move characters/instruction-following to instruction-templates --- extensions/openai/completions.py | 4 ++-- extensions/openai/edits.py | 4 ++-- .../Airoboros-v1.2.yaml | 0 .../Alpaca.yaml | 0 .../Bactrian.yaml | 0 .../Baichuan Chat.yaml | 0 .../Baize.yaml | 0 .../Bluemoon.yaml | 0 .../ChatGLM.yaml | 0 .../Chinese-Vicuna-Chat.yaml | 0 .../Galactica Cite.yaml | 0 .../Galactica Finetuned.yaml | 0 .../Galactica Q.yaml | 0 .../Galactica Summary.yaml | 0 .../Galactica Work.yaml | 0 .../Galactica v2.yaml | 0 .../Galactica.yaml | 0 .../Gorilla.yaml | 0 .../Guanaco non-chat.yaml | 0 
.../Guanaco-QLoRA.yaml | 0 .../Guanaco.yaml | 0 .../H2O-human_bot.yaml | 0 .../H2O-prompt_answer.yaml | 0 .../Hippogriff.yaml | 0 .../INCITE-Chat.yaml | 0 .../INCITE-Instruct.yaml | 0 .../KoAlpaca.yaml | 0 .../Koala.yaml | 0 .../LLaVA.yaml | 0 .../Llama-v2.yaml | 0 .../instruction-following => instruction-templates}/MOSS.yaml | 0 .../MPT-Chat.yaml | 0 .../Manticore Chat.yaml | 0 .../Metharme.yaml | 0 .../Minotaur.yaml | 0 .../NewHope.yaml | 0 .../Open Assistant.yaml | 0 .../OpenBuddy.yaml | 0 .../Orca Mini.yaml | 0 .../RWKV-Raven.yaml | 0 .../Samantha.yaml | 0 .../StableBeluga2.yaml | 0 .../StableLM.yaml | 0 .../StableVicuna.yaml | 0 .../Starchat-Beta.yaml | 0 .../instruction-following => instruction-templates}/Tulu.yaml | 0 .../Vicuna-v0.yaml | 0 .../Vicuna-v1.1.yaml | 0 .../Vigogne-Chat.yaml | 0 .../Vigogne-Instruct.yaml | 0 .../Wizard-Mega ShareGPT.yaml | 0 .../Wizard-Mega WizardLM.yaml | 0 .../Wizard-Mega.yaml | 0 .../WizardLM.yaml | 0 .../instruction-following => instruction-templates}/Ziya.yaml | 0 modules/chat.py | 2 +- modules/prompts.py | 2 +- modules/ui_chat.py | 4 ++-- modules/utils.py | 4 ++-- 59 files changed, 10 insertions(+), 10 deletions(-) rename {characters/instruction-following => instruction-templates}/Airoboros-v1.2.yaml (100%) rename {characters/instruction-following => instruction-templates}/Alpaca.yaml (100%) rename {characters/instruction-following => instruction-templates}/Bactrian.yaml (100%) rename {characters/instruction-following => instruction-templates}/Baichuan Chat.yaml (100%) rename {characters/instruction-following => instruction-templates}/Baize.yaml (100%) rename {characters/instruction-following => instruction-templates}/Bluemoon.yaml (100%) rename {characters/instruction-following => instruction-templates}/ChatGLM.yaml (100%) rename {characters/instruction-following => instruction-templates}/Chinese-Vicuna-Chat.yaml (100%) rename {characters/instruction-following => instruction-templates}/Galactica Cite.yaml (100%) rename {characters/instruction-following => instruction-templates}/Galactica Finetuned.yaml (100%) rename {characters/instruction-following => instruction-templates}/Galactica Q.yaml (100%) rename {characters/instruction-following => instruction-templates}/Galactica Summary.yaml (100%) rename {characters/instruction-following => instruction-templates}/Galactica Work.yaml (100%) rename {characters/instruction-following => instruction-templates}/Galactica v2.yaml (100%) rename {characters/instruction-following => instruction-templates}/Galactica.yaml (100%) rename {characters/instruction-following => instruction-templates}/Gorilla.yaml (100%) rename {characters/instruction-following => instruction-templates}/Guanaco non-chat.yaml (100%) rename {characters/instruction-following => instruction-templates}/Guanaco-QLoRA.yaml (100%) rename {characters/instruction-following => instruction-templates}/Guanaco.yaml (100%) rename {characters/instruction-following => instruction-templates}/H2O-human_bot.yaml (100%) rename {characters/instruction-following => instruction-templates}/H2O-prompt_answer.yaml (100%) rename {characters/instruction-following => instruction-templates}/Hippogriff.yaml (100%) rename {characters/instruction-following => instruction-templates}/INCITE-Chat.yaml (100%) rename {characters/instruction-following => instruction-templates}/INCITE-Instruct.yaml (100%) rename {characters/instruction-following => instruction-templates}/KoAlpaca.yaml (100%) rename {characters/instruction-following => instruction-templates}/Koala.yaml (100%) 
rename {characters/instruction-following => instruction-templates}/LLaVA.yaml (100%) rename {characters/instruction-following => instruction-templates}/Llama-v2.yaml (100%) rename {characters/instruction-following => instruction-templates}/MOSS.yaml (100%) rename {characters/instruction-following => instruction-templates}/MPT-Chat.yaml (100%) rename {characters/instruction-following => instruction-templates}/Manticore Chat.yaml (100%) rename {characters/instruction-following => instruction-templates}/Metharme.yaml (100%) rename {characters/instruction-following => instruction-templates}/Minotaur.yaml (100%) rename {characters/instruction-following => instruction-templates}/NewHope.yaml (100%) rename {characters/instruction-following => instruction-templates}/Open Assistant.yaml (100%) rename {characters/instruction-following => instruction-templates}/OpenBuddy.yaml (100%) rename {characters/instruction-following => instruction-templates}/Orca Mini.yaml (100%) rename {characters/instruction-following => instruction-templates}/RWKV-Raven.yaml (100%) rename {characters/instruction-following => instruction-templates}/Samantha.yaml (100%) rename {characters/instruction-following => instruction-templates}/StableBeluga2.yaml (100%) rename {characters/instruction-following => instruction-templates}/StableLM.yaml (100%) rename {characters/instruction-following => instruction-templates}/StableVicuna.yaml (100%) rename {characters/instruction-following => instruction-templates}/Starchat-Beta.yaml (100%) rename {characters/instruction-following => instruction-templates}/Tulu.yaml (100%) rename {characters/instruction-following => instruction-templates}/Vicuna-v0.yaml (100%) rename {characters/instruction-following => instruction-templates}/Vicuna-v1.1.yaml (100%) rename {characters/instruction-following => instruction-templates}/Vigogne-Chat.yaml (100%) rename {characters/instruction-following => instruction-templates}/Vigogne-Instruct.yaml (100%) rename {characters/instruction-following => instruction-templates}/Wizard-Mega ShareGPT.yaml (100%) rename {characters/instruction-following => instruction-templates}/Wizard-Mega WizardLM.yaml (100%) rename {characters/instruction-following => instruction-templates}/Wizard-Mega.yaml (100%) rename {characters/instruction-following => instruction-templates}/WizardLM.yaml (100%) rename {characters/instruction-following => instruction-templates}/Ziya.yaml (100%) diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index 646da958..3e277710 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -165,7 +165,7 @@ def messages_to_prompt(body: dict, req_params: dict, max_tokens): # Instruct models can be much better if shared.settings['instruction_template']: try: - instruct = yaml.safe_load(open(f"characters/instruction-following/{shared.settings['instruction_template']}.yaml", 'r')) + instruct = yaml.safe_load(open(f"instruction-templates/{shared.settings['instruction_template']}.yaml", 'r')) template = instruct['turn_template'] system_message_template = "{message}" @@ -193,7 +193,7 @@ def messages_to_prompt(body: dict, req_params: dict, max_tokens): except Exception as e: req_params['stopping_strings'].extend(['\nUser:', 'User:']) # XXX User: prompt here also - print(f"Exception: When loading characters/instruction-following/{shared.settings['instruction_template']}.yaml: {repr(e)}") + print(f"Exception: When loading instruction-templates/{shared.settings['instruction_template']}.yaml: {repr(e)}") 
print("Warning: Loaded default instruction-following template for model.") else: diff --git a/extensions/openai/edits.py b/extensions/openai/edits.py index f10f5779..2b527dc0 100644 --- a/extensions/openai/edits.py +++ b/extensions/openai/edits.py @@ -31,7 +31,7 @@ def edits(instruction: str, input: str, temperature=1.0, top_p=1.0) -> dict: stopping_strings.extend(['\n###']) else: try: - instruct = yaml.safe_load(open(f"characters/instruction-following/{shared.settings['instruction_template']}.yaml", 'r')) + instruct = yaml.safe_load(open(f"instruction-templates/{shared.settings['instruction_template']}.yaml", 'r')) template = instruct['turn_template'] template = template\ @@ -45,7 +45,7 @@ def edits(instruction: str, input: str, temperature=1.0, top_p=1.0) -> dict: except Exception as e: instruction_template = default_template - print(f"Exception: When loading characters/instruction-following/{shared.settings['instruction_template']}.yaml: {repr(e)}") + print(f"Exception: When loading instruction-templates/{shared.settings['instruction_template']}.yaml: {repr(e)}") print("Warning: Loaded default instruction-following template (Alpaca) for model.") else: stopping_strings.extend(['\n###']) diff --git a/characters/instruction-following/Airoboros-v1.2.yaml b/instruction-templates/Airoboros-v1.2.yaml similarity index 100% rename from characters/instruction-following/Airoboros-v1.2.yaml rename to instruction-templates/Airoboros-v1.2.yaml diff --git a/characters/instruction-following/Alpaca.yaml b/instruction-templates/Alpaca.yaml similarity index 100% rename from characters/instruction-following/Alpaca.yaml rename to instruction-templates/Alpaca.yaml diff --git a/characters/instruction-following/Bactrian.yaml b/instruction-templates/Bactrian.yaml similarity index 100% rename from characters/instruction-following/Bactrian.yaml rename to instruction-templates/Bactrian.yaml diff --git a/characters/instruction-following/Baichuan Chat.yaml b/instruction-templates/Baichuan Chat.yaml similarity index 100% rename from characters/instruction-following/Baichuan Chat.yaml rename to instruction-templates/Baichuan Chat.yaml diff --git a/characters/instruction-following/Baize.yaml b/instruction-templates/Baize.yaml similarity index 100% rename from characters/instruction-following/Baize.yaml rename to instruction-templates/Baize.yaml diff --git a/characters/instruction-following/Bluemoon.yaml b/instruction-templates/Bluemoon.yaml similarity index 100% rename from characters/instruction-following/Bluemoon.yaml rename to instruction-templates/Bluemoon.yaml diff --git a/characters/instruction-following/ChatGLM.yaml b/instruction-templates/ChatGLM.yaml similarity index 100% rename from characters/instruction-following/ChatGLM.yaml rename to instruction-templates/ChatGLM.yaml diff --git a/characters/instruction-following/Chinese-Vicuna-Chat.yaml b/instruction-templates/Chinese-Vicuna-Chat.yaml similarity index 100% rename from characters/instruction-following/Chinese-Vicuna-Chat.yaml rename to instruction-templates/Chinese-Vicuna-Chat.yaml diff --git a/characters/instruction-following/Galactica Cite.yaml b/instruction-templates/Galactica Cite.yaml similarity index 100% rename from characters/instruction-following/Galactica Cite.yaml rename to instruction-templates/Galactica Cite.yaml diff --git a/characters/instruction-following/Galactica Finetuned.yaml b/instruction-templates/Galactica Finetuned.yaml similarity index 100% rename from characters/instruction-following/Galactica Finetuned.yaml rename to 
instruction-templates/Galactica Finetuned.yaml
diff --git a/characters/instruction-following/Galactica Q.yaml b/instruction-templates/Galactica Q.yaml
similarity index 100%
rename from characters/instruction-following/Galactica Q.yaml
rename to instruction-templates/Galactica Q.yaml
diff --git a/characters/instruction-following/Galactica Summary.yaml b/instruction-templates/Galactica Summary.yaml
similarity index 100%
rename from characters/instruction-following/Galactica Summary.yaml
rename to instruction-templates/Galactica Summary.yaml
diff --git a/characters/instruction-following/Galactica Work.yaml b/instruction-templates/Galactica Work.yaml
similarity index 100%
rename from characters/instruction-following/Galactica Work.yaml
rename to instruction-templates/Galactica Work.yaml
diff --git a/characters/instruction-following/Galactica v2.yaml b/instruction-templates/Galactica v2.yaml
similarity index 100%
rename from characters/instruction-following/Galactica v2.yaml
rename to instruction-templates/Galactica v2.yaml
diff --git a/characters/instruction-following/Galactica.yaml b/instruction-templates/Galactica.yaml
similarity index 100%
rename from characters/instruction-following/Galactica.yaml
rename to instruction-templates/Galactica.yaml
diff --git a/characters/instruction-following/Gorilla.yaml b/instruction-templates/Gorilla.yaml
similarity index 100%
rename from characters/instruction-following/Gorilla.yaml
rename to instruction-templates/Gorilla.yaml
diff --git a/characters/instruction-following/Guanaco non-chat.yaml b/instruction-templates/Guanaco non-chat.yaml
similarity index 100%
rename from characters/instruction-following/Guanaco non-chat.yaml
rename to instruction-templates/Guanaco non-chat.yaml
diff --git a/characters/instruction-following/Guanaco-QLoRA.yaml b/instruction-templates/Guanaco-QLoRA.yaml
similarity index 100%
rename from characters/instruction-following/Guanaco-QLoRA.yaml
rename to instruction-templates/Guanaco-QLoRA.yaml
diff --git a/characters/instruction-following/Guanaco.yaml b/instruction-templates/Guanaco.yaml
similarity index 100%
rename from characters/instruction-following/Guanaco.yaml
rename to instruction-templates/Guanaco.yaml
diff --git a/characters/instruction-following/H2O-human_bot.yaml b/instruction-templates/H2O-human_bot.yaml
similarity index 100%
rename from characters/instruction-following/H2O-human_bot.yaml
rename to instruction-templates/H2O-human_bot.yaml
diff --git a/characters/instruction-following/H2O-prompt_answer.yaml b/instruction-templates/H2O-prompt_answer.yaml
similarity index 100%
rename from characters/instruction-following/H2O-prompt_answer.yaml
rename to instruction-templates/H2O-prompt_answer.yaml
diff --git a/characters/instruction-following/Hippogriff.yaml b/instruction-templates/Hippogriff.yaml
similarity index 100%
rename from characters/instruction-following/Hippogriff.yaml
rename to instruction-templates/Hippogriff.yaml
diff --git a/characters/instruction-following/INCITE-Chat.yaml b/instruction-templates/INCITE-Chat.yaml
similarity index 100%
rename from characters/instruction-following/INCITE-Chat.yaml
rename to instruction-templates/INCITE-Chat.yaml
diff --git a/characters/instruction-following/INCITE-Instruct.yaml b/instruction-templates/INCITE-Instruct.yaml
similarity index 100%
rename from characters/instruction-following/INCITE-Instruct.yaml
rename to instruction-templates/INCITE-Instruct.yaml
diff --git a/characters/instruction-following/KoAlpaca.yaml b/instruction-templates/KoAlpaca.yaml
similarity index 100%
rename from characters/instruction-following/KoAlpaca.yaml
rename to instruction-templates/KoAlpaca.yaml
diff --git a/characters/instruction-following/Koala.yaml b/instruction-templates/Koala.yaml
similarity index 100%
rename from characters/instruction-following/Koala.yaml
rename to instruction-templates/Koala.yaml
diff --git a/characters/instruction-following/LLaVA.yaml b/instruction-templates/LLaVA.yaml
similarity index 100%
rename from characters/instruction-following/LLaVA.yaml
rename to instruction-templates/LLaVA.yaml
diff --git a/characters/instruction-following/Llama-v2.yaml b/instruction-templates/Llama-v2.yaml
similarity index 100%
rename from characters/instruction-following/Llama-v2.yaml
rename to instruction-templates/Llama-v2.yaml
diff --git a/characters/instruction-following/MOSS.yaml b/instruction-templates/MOSS.yaml
similarity index 100%
rename from characters/instruction-following/MOSS.yaml
rename to instruction-templates/MOSS.yaml
diff --git a/characters/instruction-following/MPT-Chat.yaml b/instruction-templates/MPT-Chat.yaml
similarity index 100%
rename from characters/instruction-following/MPT-Chat.yaml
rename to instruction-templates/MPT-Chat.yaml
diff --git a/characters/instruction-following/Manticore Chat.yaml b/instruction-templates/Manticore Chat.yaml
similarity index 100%
rename from characters/instruction-following/Manticore Chat.yaml
rename to instruction-templates/Manticore Chat.yaml
diff --git a/characters/instruction-following/Metharme.yaml b/instruction-templates/Metharme.yaml
similarity index 100%
rename from characters/instruction-following/Metharme.yaml
rename to instruction-templates/Metharme.yaml
diff --git a/characters/instruction-following/Minotaur.yaml b/instruction-templates/Minotaur.yaml
similarity index 100%
rename from characters/instruction-following/Minotaur.yaml
rename to instruction-templates/Minotaur.yaml
diff --git a/characters/instruction-following/NewHope.yaml b/instruction-templates/NewHope.yaml
similarity index 100%
rename from characters/instruction-following/NewHope.yaml
rename to instruction-templates/NewHope.yaml
diff --git a/characters/instruction-following/Open Assistant.yaml b/instruction-templates/Open Assistant.yaml
similarity index 100%
rename from characters/instruction-following/Open Assistant.yaml
rename to instruction-templates/Open Assistant.yaml
diff --git a/characters/instruction-following/OpenBuddy.yaml b/instruction-templates/OpenBuddy.yaml
similarity index 100%
rename from characters/instruction-following/OpenBuddy.yaml
rename to instruction-templates/OpenBuddy.yaml
diff --git a/characters/instruction-following/Orca Mini.yaml b/instruction-templates/Orca Mini.yaml
similarity index 100%
rename from characters/instruction-following/Orca Mini.yaml
rename to instruction-templates/Orca Mini.yaml
diff --git a/characters/instruction-following/RWKV-Raven.yaml b/instruction-templates/RWKV-Raven.yaml
similarity index 100%
rename from characters/instruction-following/RWKV-Raven.yaml
rename to instruction-templates/RWKV-Raven.yaml
diff --git a/characters/instruction-following/Samantha.yaml b/instruction-templates/Samantha.yaml
similarity index 100%
rename from characters/instruction-following/Samantha.yaml
rename to instruction-templates/Samantha.yaml
diff --git a/characters/instruction-following/StableBeluga2.yaml b/instruction-templates/StableBeluga2.yaml
similarity index 100%
rename from characters/instruction-following/StableBeluga2.yaml
rename to instruction-templates/StableBeluga2.yaml
diff --git
a/characters/instruction-following/StableLM.yaml b/instruction-templates/StableLM.yaml similarity index 100% rename from characters/instruction-following/StableLM.yaml rename to instruction-templates/StableLM.yaml diff --git a/characters/instruction-following/StableVicuna.yaml b/instruction-templates/StableVicuna.yaml similarity index 100% rename from characters/instruction-following/StableVicuna.yaml rename to instruction-templates/StableVicuna.yaml diff --git a/characters/instruction-following/Starchat-Beta.yaml b/instruction-templates/Starchat-Beta.yaml similarity index 100% rename from characters/instruction-following/Starchat-Beta.yaml rename to instruction-templates/Starchat-Beta.yaml diff --git a/characters/instruction-following/Tulu.yaml b/instruction-templates/Tulu.yaml similarity index 100% rename from characters/instruction-following/Tulu.yaml rename to instruction-templates/Tulu.yaml diff --git a/characters/instruction-following/Vicuna-v0.yaml b/instruction-templates/Vicuna-v0.yaml similarity index 100% rename from characters/instruction-following/Vicuna-v0.yaml rename to instruction-templates/Vicuna-v0.yaml diff --git a/characters/instruction-following/Vicuna-v1.1.yaml b/instruction-templates/Vicuna-v1.1.yaml similarity index 100% rename from characters/instruction-following/Vicuna-v1.1.yaml rename to instruction-templates/Vicuna-v1.1.yaml diff --git a/characters/instruction-following/Vigogne-Chat.yaml b/instruction-templates/Vigogne-Chat.yaml similarity index 100% rename from characters/instruction-following/Vigogne-Chat.yaml rename to instruction-templates/Vigogne-Chat.yaml diff --git a/characters/instruction-following/Vigogne-Instruct.yaml b/instruction-templates/Vigogne-Instruct.yaml similarity index 100% rename from characters/instruction-following/Vigogne-Instruct.yaml rename to instruction-templates/Vigogne-Instruct.yaml diff --git a/characters/instruction-following/Wizard-Mega ShareGPT.yaml b/instruction-templates/Wizard-Mega ShareGPT.yaml similarity index 100% rename from characters/instruction-following/Wizard-Mega ShareGPT.yaml rename to instruction-templates/Wizard-Mega ShareGPT.yaml diff --git a/characters/instruction-following/Wizard-Mega WizardLM.yaml b/instruction-templates/Wizard-Mega WizardLM.yaml similarity index 100% rename from characters/instruction-following/Wizard-Mega WizardLM.yaml rename to instruction-templates/Wizard-Mega WizardLM.yaml diff --git a/characters/instruction-following/Wizard-Mega.yaml b/instruction-templates/Wizard-Mega.yaml similarity index 100% rename from characters/instruction-following/Wizard-Mega.yaml rename to instruction-templates/Wizard-Mega.yaml diff --git a/characters/instruction-following/WizardLM.yaml b/instruction-templates/WizardLM.yaml similarity index 100% rename from characters/instruction-following/WizardLM.yaml rename to instruction-templates/WizardLM.yaml diff --git a/characters/instruction-following/Ziya.yaml b/instruction-templates/Ziya.yaml similarity index 100% rename from characters/instruction-following/Ziya.yaml rename to instruction-templates/Ziya.yaml diff --git a/modules/chat.py b/modules/chat.py index 8a86523c..a445c6a8 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -475,7 +475,7 @@ def load_character(character, name1, name2, instruct=False): Path("cache/pfp_character.png").unlink() if character not in ['None', '', None]: - folder = 'characters' if not instruct else 'characters/instruction-following' + folder = 'characters' if not instruct else 'instruction-templates' picture = 
generate_pfp_cache(character) filepath = None for extension in ["yml", "yaml", "json"]: diff --git a/modules/prompts.py b/modules/prompts.py index f68c83c4..8a3cf3e3 100644 --- a/modules/prompts.py +++ b/modules/prompts.py @@ -12,7 +12,7 @@ def load_prompt(fname): return '' elif fname.startswith('Instruct-'): fname = re.sub('^Instruct-', '', fname) - file_path = Path(f'characters/instruction-following/{fname}.yaml') + file_path = Path(f'instruction-templates/{fname}.yaml') if not file_path.exists(): return '' diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 8a0c103b..a858acaf 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -232,13 +232,13 @@ def create_event_handlers(): shared.gradio['save_template'].click( lambda: 'My Template.yaml', None, gradio('save_filename')).then( - lambda: 'characters/instruction-following/', None, gradio('save_root')).then( + lambda: 'instruction-templates/', None, gradio('save_root')).then( chat.generate_instruction_template_yaml, gradio('name1_instruct', 'name2_instruct', 'context_instruct', 'turn_template'), gradio('save_contents')).then( lambda: gr.update(visible=True), None, gradio('file_saver')) shared.gradio['delete_template'].click( lambda x: f'{x}.yaml', gradio('instruction_template'), gradio('delete_filename')).then( - lambda: 'characters/instruction-following/', None, gradio('delete_root')).then( + lambda: 'instruction-templates/', None, gradio('delete_root')).then( lambda: gr.update(visible=True), None, gradio('file_deleter')) shared.gradio['save_chat_history'].click( diff --git a/modules/utils.py b/modules/utils.py index 9ae5dc86..adaa15e8 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -90,11 +90,11 @@ def get_available_prompts(): def get_available_characters(): paths = (x for x in Path('characters').iterdir() if x.suffix in ('.json', '.yaml', '.yml')) - return ['None'] + sorted(set((k.stem for k in paths if k.stem != "instruction-following")), key=natural_keys) + return ['None'] + sorted(set((k.stem for k in paths)), key=natural_keys) def get_available_instruction_templates(): - path = "characters/instruction-following" + path = "instruction-templates" paths = [] if os.path.exists(path): paths = (x for x in Path(path).iterdir() if x.suffix in ('.json', '.yaml', '.yml')) From 3d48933f274ca571c88bb225b22950b6efb75324 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 6 Aug 2023 18:58:59 -0700 Subject: [PATCH 033/169] Remove ancient deprecation warnings --- modules/shared.py | 62 +++++++++++++++++------------------------------ 1 file changed, 22 insertions(+), 40 deletions(-) diff --git a/modules/shared.py b/modules/shared.py index be5be109..c2f5e0f2 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -6,34 +6,29 @@ import yaml from modules.logging_colors import logger -generation_lock = None + +# Model variables model = None tokenizer = None -is_seq2seq = False model_name = "None" -lora_names = [] +is_seq2seq = False model_dirty_from_training = False +lora_names = [] -# Chat variables +# Generation variables stop_everything = False +generation_lock = None processing_message = '*Is typing...*' +input_params = [] +reload_inputs = [] -# UI elements (buttons, sliders, HTML, etc) +# UI variables gradio = {} - -# For keeping the values of UI elements on page reload persistent_interface_state = {} - -input_params = [] # Generation input parameters -reload_inputs = [] # Parameters for reloading the chat interface - -# For restarting the interface need_restart = False - -# To 
prevent the persistent chat history from being loaded when -# a session JSON file is being loaded in chat mode session_is_loading = False +# UI defaults settings = { 'dark_theme': True, 'autoload_model': False, @@ -148,8 +143,6 @@ parser.add_argument('--warmup_autotune', action='store_true', help='(triton) Ena parser.add_argument('--fused_mlp', action='store_true', help='(triton) Enable fused mlp.') # AutoGPTQ -parser.add_argument('--gptq-for-llama', action='store_true', help='DEPRECATED') -parser.add_argument('--autogptq', action='store_true', help='DEPRECATED') parser.add_argument('--triton', action='store_true', help='Use triton.') parser.add_argument('--no_inject_fused_attention', action='store_true', help='Do not use fused attention (lowers VRAM requirements).') parser.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: Do not use fused MLP (lowers VRAM requirements).') @@ -196,14 +189,6 @@ parser.add_argument('--multimodal-pipeline', type=str, default=None, help='The m args = parser.parse_args() args_defaults = parser.parse_args([]) -# Deprecation warnings -if args.autogptq: - logger.warning('--autogptq has been deprecated and will be removed soon. Use --loader autogptq instead.') - args.loader = 'autogptq' -if args.gptq_for_llama: - logger.warning('--gptq-for-llama has been deprecated and will be removed soon. Use --loader gptq-for-llama instead.') - args.loader = 'gptq-for-llama' - # Security warnings if args.trust_remote_code: logger.warning("trust_remote_code is enabled. This is dangerous.") @@ -231,10 +216,6 @@ def fix_loader_name(name): return 'ExLlama_HF' -if args.loader is not None: - args.loader = fix_loader_name(args.loader) - - def add_extension(name): if args.extensions is None: args.extensions = [name] @@ -242,15 +223,6 @@ def add_extension(name): args.extensions.append(name) -# Activating the API extension -if args.api or args.public_api: - add_extension('api') - -# Activating the multimodal extension -if args.multimodal_pipeline is not None: - add_extension('multimodal') - - def is_chat(): return args.chat @@ -264,14 +236,24 @@ def get_mode(): return 'default' -# Loading model-specific settings +args.loader = fix_loader_name(args.loader) + +# Activate the API extension +if args.api or args.public_api: + add_extension('api') + +# Activate the multimodal extension +if args.multimodal_pipeline is not None: + add_extension('multimodal') + +# Load model-specific settings with Path(f'{args.model_dir}/config.yaml') as p: if p.exists(): model_config = yaml.safe_load(open(p, 'r').read()) else: model_config = {} -# Applying user-defined model settings +# Load custom model-specific settings with Path(f'{args.model_dir}/config-user.yaml') as p: if p.exists(): user_config = yaml.safe_load(open(p, 'r').read()) From 2cf64474f2319ec009a95a96ec7d1ea799217104 Mon Sep 17 00:00:00 2001 From: jllllll <3887729+jllllll@users.noreply.github.com> Date: Sun, 6 Aug 2023 21:46:25 -0500 Subject: [PATCH 034/169] Use chat_instruct_command in API (#3482) --- api-examples/api-example-chat-stream.py | 2 +- api-examples/api-example-chat.py | 2 +- extensions/api/util.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/api-examples/api-example-chat-stream.py b/api-examples/api-example-chat-stream.py index a774f907..055900bd 100644 --- a/api-examples/api-example-chat-stream.py +++ b/api-examples/api-example-chat-stream.py @@ -38,7 +38,7 @@ async def run(user_input, history): '_continue': False, 'stop_at_newline': False, 'chat_generation_attempts': 1, - 
'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>', + 'chat_instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>', # Generation params. If 'preset' is set to different than 'None', the values # in presets/preset-name.yaml are used instead of the individual numbers. diff --git a/api-examples/api-example-chat.py b/api-examples/api-example-chat.py index 824bf3a0..c3d0c538 100644 --- a/api-examples/api-example-chat.py +++ b/api-examples/api-example-chat.py @@ -32,7 +32,7 @@ def run(user_input, history): '_continue': False, 'stop_at_newline': False, 'chat_generation_attempts': 1, - 'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>', + 'chat_instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>', # Generation params. If 'preset' is set to different than 'None', the values # in presets/preset-name.yaml are used instead of the individual numbers. diff --git a/extensions/api/util.py b/extensions/api/util.py index 2654d046..f36c070b 100644 --- a/extensions/api/util.py +++ b/extensions/api/util.py @@ -79,7 +79,7 @@ def build_parameters(body, chat=False): 'name2_instruct': str(body.get('name2_instruct', name2_instruct)), 'context_instruct': str(body.get('context_instruct', context_instruct)), 'turn_template': str(body.get('turn_template', turn_template)), - 'chat-instruct_command': str(body.get('chat-instruct_command', shared.settings['chat-instruct_command'])), + 'chat-instruct_command': str(body.get('chat_instruct_command', body.get('chat-instruct_command', shared.settings['chat-instruct_command']))), 'history': body.get('history', {'internal': [], 'visible': []}) }) From a373c96d5931ad73eb41ed3e045d9846fb7533d6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 6 Aug 2023 20:36:35 -0700 Subject: [PATCH 035/169] Fix a bug in modules/shared.py --- modules/shared.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/shared.py b/modules/shared.py index c2f5e0f2..30f6512c 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -199,6 +199,9 @@ if args.multi_user: def fix_loader_name(name): + if not name: + return name + name = name.lower() if name in ['llamacpp', 'llama.cpp', 'llama-cpp', 'llama cpp']: return 'llama.cpp' From 412f6ff9d3bb0bb9a30932ffc87ac1114c606ad5 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 7 Aug 2023 06:08:51 -0700 Subject: [PATCH 036/169] Change alpha_value maximum and step --- modules/ui_model_menu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index c9d772b8..89b82c1f 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -90,7 +90,7 @@ def create_ui(): shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=16384, step=256, info='Maximum sequence length.', value=shared.args.max_seq_len) shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. 
Should typically be set to max_seq_len / 2048.', value=shared.args.compress_pos_emb) - shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=32, step=1, info='Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value) + shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.1, info='Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value) with gr.Column(): shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton) From 3b27404865c20193667cd614ec12f2f354a08798 Mon Sep 17 00:00:00 2001 From: Sam Date: Mon, 7 Aug 2023 23:19:16 +1000 Subject: [PATCH 037/169] Make dockerfile respect specified cuda version (#3474) --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 7cc0ff15..3c5108d8 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -16,7 +16,7 @@ RUN . /build/venv/bin/activate && \ # https://developer.nvidia.com/cuda-gpus # for a rtx 2060: ARG TORCH_CUDA_ARCH_LIST="7.5" -ARG TORCH_CUDA_ARCH_LIST="3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX" +ARG TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX}" RUN . /build/venv/bin/activate && \ python3 setup_cuda.py bdist_wheel -d . From 2d0634cd0764a5d64c39a483e51dd587ea008917 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 7 Aug 2023 08:57:19 -0700 Subject: [PATCH 038/169] Bump transformers commit for positive prompts --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9deadd48..da6a5f20 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,7 +18,7 @@ tensorboard tqdm wandb git+https://github.com/huggingface/peft@96c0277a1b9a381b10ab34dbf84917f9b3b992e6 -git+https://github.com/huggingface/transformers@d533465150532b0c5de167b574e59f64c68b1154 +git+https://github.com/huggingface/transformers@baf1daa58eb2960248fd9f7c3af0ed245b8ce4af bitsandbytes==0.41.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" From bbe4a29a258d028c6369c0eda90b9607b86f0156 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 7 Aug 2023 23:03:09 -0300 Subject: [PATCH 039/169] Add back dark theme code --- server.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/server.py b/server.py index 0be1f4c4..414cd3ba 100644 --- a/server.py +++ b/server.py @@ -128,6 +128,9 @@ def create_interface(): ui_model_menu.create_event_handlers() # Interface launch events + if shared.settings['dark_theme']: + shared.gradio['interface'].load(lambda: None, None, None, _js="() => document.getElementsByTagName('body')[0].classList.add('dark')") + shared.gradio['interface'].load(lambda: None, None, None, _js=f"() => {{{js}}}") shared.gradio['interface'].load(partial(ui.apply_interface_values, {}, use_persistent=True), None, gradio(ui.list_interface_input_elements()), show_progress=False) if shared.is_chat(): From 
584dd334242df56cca4a53664c5d1e3b57094e74 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 7 Aug 2023 23:44:59 -0300 Subject: [PATCH 040/169] Fix missing example_dialogue when uploading characters --- modules/chat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index a445c6a8..5667d433 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -510,9 +510,6 @@ def load_character(character, name1, name2, instruct=False): context = build_pygmalion_style_context(data) greeting_field = 'char_greeting' - if 'example_dialogue' in data: - context += f"{data['example_dialogue'].strip()}\n" - if greeting_field in data: greeting = data[greeting_field] @@ -573,6 +570,9 @@ def build_pygmalion_style_context(data): if 'world_scenario' in data and data['world_scenario'] != '': context += f"Scenario: {data['world_scenario']}\n" + if 'example_dialogue' in data and data['example_dialogue'] != '': + context += f"{data['example_dialogue'].strip()}\n" + context = f"{context.strip()}\n" return context From 6d354bb50b62d6ded5431a9097e6ca0db748678d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 7 Aug 2023 23:57:25 -0300 Subject: [PATCH 041/169] Allow the webui to do multiple tasks simultaneously --- server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server.py b/server.py index 414cd3ba..b477d4c1 100644 --- a/server.py +++ b/server.py @@ -140,7 +140,7 @@ def create_interface(): extensions_module.create_extensions_block() # Extensions block # Launch the interface - shared.gradio['interface'].queue() + shared.gradio['interface'].queue(concurrency_count=64) with OpenMonkeyPatch(): shared.gradio['interface'].launch( prevent_thread_lock=True, From 37fb7194521fe4196f2f15cfa93045fea8c2349a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 8 Aug 2023 00:09:00 -0300 Subject: [PATCH 042/169] Increase the Context/Greeting boxes sizes --- modules/ui_chat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/ui_chat.py b/modules/ui_chat.py index a858acaf..4471d2f4 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -63,8 +63,8 @@ def create_ui(): shared.gradio['name1'] = gr.Textbox(value=shared.settings['name1'], lines=1, label='Your name') shared.gradio['name2'] = gr.Textbox(value=shared.settings['name2'], lines=1, label='Character\'s name') - shared.gradio['context'] = gr.Textbox(value=shared.settings['context'], lines=4, label='Context', elem_classes=['add_scrollbar']) - shared.gradio['greeting'] = gr.Textbox(value=shared.settings['greeting'], lines=4, label='Greeting', elem_classes=['add_scrollbar']) + shared.gradio['context'] = gr.Textbox(value=shared.settings['context'], lines=10, label='Context', elem_classes=['add_scrollbar']) + shared.gradio['greeting'] = gr.Textbox(value=shared.settings['greeting'], lines=5, label='Greeting', elem_classes=['add_scrollbar']) with gr.Column(scale=1): shared.gradio['character_picture'] = gr.Image(label='Character picture', type='pil') From 0e78f3b4d4b188d902f9ccdeaebbf48f161b1d50 Mon Sep 17 00:00:00 2001 From: Gennadij Date: Tue, 8 Aug 2023 06:31:11 +0300 Subject: [PATCH 043/169] Fixed a typo in "rms_norm_eps", incorrectly set as n_gqa (#3494) --- modules/ui_model_menu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 89b82c1f..7961c225 100644 --- 
a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -80,7 +80,7 @@ def create_ui(): shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads) shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, value=shared.args.n_batch) shared.gradio['n_gqa'] = gr.Slider(minimum=0, maximum=16, step=1, label="n_gqa", value=shared.args.n_gqa, info='grouped-query attention. Must be 8 for llama-2 70b.') - shared.gradio['rms_norm_eps'] = gr.Slider(minimum=0, maximum=1e-5, step=1e-6, label="rms_norm_eps", value=shared.args.n_gqa, info='5e-6 is a good value for llama-2 models.') + shared.gradio['rms_norm_eps'] = gr.Slider(minimum=0, maximum=1e-5, step=1e-6, label="rms_norm_eps", value=shared.args.rms_norm_eps, info='5e-6 is a good value for llama-2 models.') shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=str(shared.args.wbits) if shared.args.wbits > 0 else "None") shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=str(shared.args.groupsize) if shared.args.groupsize > 0 else "None") From bf08b16b32847cc813b55a2d93fc15b2cf3a53ea Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 8 Aug 2023 14:09:01 -0700 Subject: [PATCH 044/169] Fix disappearing profile picture bug --- modules/chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index 5667d433..efb7ecb8 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -471,7 +471,7 @@ def load_character(character, name1, name2, instruct=False): picture = None # Deleting the profile picture cache, if any - if Path("cache/pfp_character.png").exists(): + if Path("cache/pfp_character.png").exists() and not instruct: Path("cache/pfp_character.png").unlink() if character not in ['None', '', None]: From 4ba30f676544600c5c9ffdddfc50bfb4682f8a36 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 8 Aug 2023 14:10:04 -0700 Subject: [PATCH 045/169] Add OpenChat template --- instruction-templates/OpenChat.yaml | 4 ++++ models/config.yaml | 3 +++ 2 files changed, 7 insertions(+) create mode 100644 instruction-templates/OpenChat.yaml diff --git a/instruction-templates/OpenChat.yaml b/instruction-templates/OpenChat.yaml new file mode 100644 index 00000000..3b84c226 --- /dev/null +++ b/instruction-templates/OpenChat.yaml @@ -0,0 +1,4 @@ +user: "GPT4 User:" +bot: "GPT4 Assistant:" +turn_template: "<|user|> <|user-message|><|end_of_turn|><|bot|> <|bot-message|><|end_of_turn|>" +context: "" diff --git a/models/config.yaml b/models/config.yaml index 4d618de2..97ec9e92 100644 --- a/models/config.yaml +++ b/models/config.yaml @@ -288,3 +288,6 @@ TheBloke_WizardLM-30B-GPTQ: instruction_template: 'StableBeluga2' truncation_length: 4096 rms_norm_eps: 5.0e-6 +.*openchat: + mode: 'instruct' + instruction_template: 'OpenChat' From 901b028d551c85b4a19d8ceeb497efe2de7b32db Mon Sep 17 00:00:00 2001 From: Friedemann Lipphardt Date: Wed, 9 Aug 2023 03:20:27 +0200 Subject: [PATCH 046/169] Add option for named cloudflare tunnels (#3364) --- README.md | 1 + docker/docker-compose.yml | 1 + extensions/api/blocking_api.py | 8 ++++---- extensions/api/requirements.txt | 2 +- extensions/api/script.py | 4 ++-- extensions/api/streaming_api.py | 8 ++++---- extensions/api/util.py | 8 ++++---- modules/shared.py | 1 + 8 files changed, 18 insertions(+), 15 deletions(-) diff --git a/README.md 
b/README.md index 218fa765..3df9a16f 100644 --- a/README.md +++ b/README.md @@ -326,6 +326,7 @@ Optionally, you can use the following command-line flags: |---------------------------------------|-------------| | `--api` | Enable the API extension. | | `--public-api` | Create a public URL for the API using Cloudfare. | +| `--public-api-id PUBLIC_API_ID` | Tunnel ID for named Cloudflare Tunnel. Use together with public-api option. | | `--api-blocking-port BLOCKING_PORT` | The listening port for the blocking API. | | `--api-streaming-port STREAMING_PORT` | The listening port for the streaming API. | diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 46b27580..ce29f33b 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -23,6 +23,7 @@ services: - ./prompts:/app/prompts - ./softprompts:/app/softprompts - ./training:/app/training + - ./cloudflared:/etc/cloudflared deploy: resources: reservations: diff --git a/extensions/api/blocking_api.py b/extensions/api/blocking_api.py index fbbc5ec1..6b28205a 100644 --- a/extensions/api/blocking_api.py +++ b/extensions/api/blocking_api.py @@ -200,7 +200,7 @@ class Handler(BaseHTTPRequestHandler): super().end_headers() -def _run_server(port: int, share: bool = False): +def _run_server(port: int, share: bool = False, tunnel_id=str): address = '0.0.0.0' if shared.args.listen else '127.0.0.1' server = ThreadingHTTPServer((address, port), Handler) @@ -210,7 +210,7 @@ def _run_server(port: int, share: bool = False): if share: try: - try_start_cloudflared(port, max_attempts=3, on_start=on_start) + try_start_cloudflared(port, tunnel_id, max_attempts=3, on_start=on_start) except Exception: pass else: @@ -220,5 +220,5 @@ def _run_server(port: int, share: bool = False): server.serve_forever() -def start_server(port: int, share: bool = False): - Thread(target=_run_server, args=[port, share], daemon=True).start() +def start_server(port: int, share: bool = False, tunnel_id=str): + Thread(target=_run_server, args=[port, share, tunnel_id], daemon=True).start() diff --git a/extensions/api/requirements.txt b/extensions/api/requirements.txt index 14e29d35..e4f26c3a 100644 --- a/extensions/api/requirements.txt +++ b/extensions/api/requirements.txt @@ -1,2 +1,2 @@ -flask_cloudflared==0.0.12 +flask_cloudflared==0.0.14 websockets==11.0.2 \ No newline at end of file diff --git a/extensions/api/script.py b/extensions/api/script.py index 5d1b1a68..80617b3e 100644 --- a/extensions/api/script.py +++ b/extensions/api/script.py @@ -4,5 +4,5 @@ from modules import shared def setup(): - blocking_api.start_server(shared.args.api_blocking_port, share=shared.args.public_api) - streaming_api.start_server(shared.args.api_streaming_port, share=shared.args.public_api) + blocking_api.start_server(shared.args.api_blocking_port, share=shared.args.public_api, tunnel_id=shared.args.public_api_id) + streaming_api.start_server(shared.args.api_streaming_port, share=shared.args.public_api, tunnel_id=shared.args.public_api_id) diff --git a/extensions/api/streaming_api.py b/extensions/api/streaming_api.py index 6afa827d..9175eeb0 100644 --- a/extensions/api/streaming_api.py +++ b/extensions/api/streaming_api.py @@ -102,7 +102,7 @@ async def _run(host: str, port: int): await asyncio.Future() # run forever -def _run_server(port: int, share: bool = False): +def _run_server(port: int, share: bool = False, tunnel_id=str): address = '0.0.0.0' if shared.args.listen else '127.0.0.1' def on_start(public_url: str): @@ -111,7 +111,7 @@ def _run_server(port: int, share: bool 
= False): if share: try: - try_start_cloudflared(port, max_attempts=3, on_start=on_start) + try_start_cloudflared(port, tunnel_id, max_attempts=3, on_start=on_start) except Exception as e: print(e) else: @@ -120,5 +120,5 @@ def _run_server(port: int, share: bool = False): asyncio.run(_run(host=address, port=port)) -def start_server(port: int, share: bool = False): - Thread(target=_run_server, args=[port, share], daemon=True).start() +def start_server(port: int, share: bool = False, tunnel_id=str): + Thread(target=_run_server, args=[port, share, tunnel_id], daemon=True).start() diff --git a/extensions/api/util.py b/extensions/api/util.py index f36c070b..7ebfaa32 100644 --- a/extensions/api/util.py +++ b/extensions/api/util.py @@ -86,12 +86,12 @@ def build_parameters(body, chat=False): return generate_params -def try_start_cloudflared(port: int, max_attempts: int = 3, on_start: Optional[Callable[[str], None]] = None): +def try_start_cloudflared(port: int, tunnel_id: str, max_attempts: int = 3, on_start: Optional[Callable[[str], None]] = None): Thread(target=_start_cloudflared, args=[ - port, max_attempts, on_start], daemon=True).start() + port, tunnel_id, max_attempts, on_start], daemon=True).start() -def _start_cloudflared(port: int, max_attempts: int = 3, on_start: Optional[Callable[[str], None]] = None): +def _start_cloudflared(port: int, tunnel_id: str, max_attempts: int = 3, on_start: Optional[Callable[[str], None]] = None): try: from flask_cloudflared import _run_cloudflared except ImportError: @@ -101,7 +101,7 @@ def _start_cloudflared(port: int, max_attempts: int = 3, on_start: Optional[Call for _ in range(max_attempts): try: - public_url = _run_cloudflared(port, port + 1) + public_url = _run_cloudflared(port, port + 1, tunnel_id=tunnel_id) if on_start: on_start(public_url) diff --git a/modules/shared.py b/modules/shared.py index 30f6512c..05c402c4 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -182,6 +182,7 @@ parser.add_argument('--api', action='store_true', help='Enable the API extension parser.add_argument('--api-blocking-port', type=int, default=5000, help='The listening port for the blocking API.') parser.add_argument('--api-streaming-port', type=int, default=5005, help='The listening port for the streaming API.') parser.add_argument('--public-api', action='store_true', help='Create a public URL for the API using Cloudfare.') +parser.add_argument('--public-api-id', type=str, help='Tunnel ID for named Cloudflare Tunnel. Use together with public-api option.', default=None) # Multimodal parser.add_argument('--multimodal-pipeline', type=str, default=None, help='The multimodal pipeline to use. Examples: llava-7b, llava-13b.') From f4caaf337afda85236e3963c22042e2581597424 Mon Sep 17 00:00:00 2001 From: Hans Raaf Date: Wed, 9 Aug 2023 04:26:28 +0200 Subject: [PATCH 047/169] Fix superbooga when using regenerate (#3362) --- extensions/superbooga/script.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/extensions/superbooga/script.py b/extensions/superbooga/script.py index 5ef14d9d..475cf1e0 100644 --- a/extensions/superbooga/script.py +++ b/extensions/superbooga/script.py @@ -96,7 +96,8 @@ def apply_settings(chunk_count, chunk_count_initial, time_weight): def custom_generate_chat_prompt(user_input, state, **kwargs): global chat_collector - history = state['history'] + # get history as being modified when using regenerate. 
+ history = kwargs['history'] if state['mode'] == 'instruct': results = collector.get_sorted(user_input, n_results=params['chunk_count']) From d8fb506affda77dcc418fc25810de7254ce125bd Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 8 Aug 2023 21:24:28 -0700 Subject: [PATCH 048/169] Add RoPE scaling support for transformers (including dynamic NTK) https://github.com/huggingface/transformers/pull/24653 --- README.md | 4 ++-- modules/loaders.py | 10 ++++++---- modules/models.py | 7 ++++++- modules/shared.py | 2 +- modules/ui_model_menu.py | 2 +- 5 files changed, 16 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 3df9a16f..f7e18350 100644 --- a/README.md +++ b/README.md @@ -299,12 +299,12 @@ Optionally, you can use the following command-line flags: | `--rwkv-strategy RWKV_STRATEGY` | RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8". | | `--rwkv-cuda-on` | RWKV: Compile the CUDA kernel for better performance. | -#### RoPE (for llama.cpp and ExLlama only) +#### RoPE (for llama.cpp, ExLlama, and transformers) | Flag | Description | |------------------|-------------| |`--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should typically be set to max_seq_len / 2048. | -|`--alpha_value ALPHA_VALUE` | Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. Use either this or compress_pos_emb, not both. | +|`--alpha_value ALPHA_VALUE` | Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both. | #### Gradio diff --git a/modules/loaders.py b/modules/loaders.py index 519e47a7..07bc455c 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -39,8 +39,8 @@ loaders_and_params = { 'low_vram', 'mlock', 'llama_cpp_seed', - 'compress_pos_emb', 'alpha_value', + 'compress_pos_emb', 'cpu', ], 'llamacpp_HF': [ @@ -54,8 +54,8 @@ loaders_and_params = { 'low_vram', 'mlock', 'llama_cpp_seed', - 'compress_pos_emb', 'alpha_value', + 'compress_pos_emb', 'cpu', 'llamacpp_HF_info', ], @@ -73,20 +73,22 @@ loaders_and_params = { 'quant_type', 'compute_dtype', 'trust_remote_code', + 'alpha_value', + 'compress_pos_emb', 'transformers_info' ], 'ExLlama': [ 'gpu_split', 'max_seq_len', - 'compress_pos_emb', 'alpha_value', + 'compress_pos_emb', 'exllama_info', ], 'ExLlama_HF': [ 'gpu_split', 'max_seq_len', - 'compress_pos_emb', 'alpha_value', + 'compress_pos_emb', 'exllama_HF_info', ] } diff --git a/modules/models.py b/modules/models.py index 4f6a44c1..aad142c1 100644 --- a/modules/models.py +++ b/modules/models.py @@ -144,7 +144,7 @@ def huggingface_loader(model_name): LoaderClass = AutoModelForCausalLM # Load the model in simple 16-bit mode by default - if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None]): + if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1]): model = LoaderClass.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}"), low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16, trust_remote_code=shared.args.trust_remote_code) if 
torch.backends.mps.is_available(): device = torch.device('mps') @@ -215,6 +215,11 @@ def huggingface_loader(model_name): no_split_module_classes=model._no_split_modules ) + if shared.args.compress_pos_emb > 1: + params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb} + elif shared.args.alpha_value > 1: + params['rope_scaling'] = {'type': 'dynamic', 'factor': shared.args.alpha_value} + model = LoaderClass.from_pretrained(checkpoint, **params) return model diff --git a/modules/shared.py b/modules/shared.py index 05c402c4..951120c8 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -164,7 +164,7 @@ parser.add_argument('--rwkv-cuda-on', action='store_true', help='RWKV: Compile t # RoPE parser.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.") -parser.add_argument('--alpha_value', type=int, default=1, help="Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. Use either this or compress_pos_emb, not both.") +parser.add_argument('--alpha_value', type=int, default=1, help="Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.") # Gradio parser.add_argument('--listen', action='store_true', help='Make the web UI reachable from your local network.') diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 7961c225..55416a07 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -89,8 +89,8 @@ def create_ui(): shared.gradio['autogptq_info'] = gr.Markdown('* ExLlama_HF is recommended over AutoGPTQ for models derived from LLaMA.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=16384, step=256, info='Maximum sequence length.', value=shared.args.max_seq_len) + shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.1, info='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value) shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.', value=shared.args.compress_pos_emb) - shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.1, info='Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. 
Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value) with gr.Column(): shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton) From 5bfcfcfc5ab7fe003c1f401949c467edbad6376a Mon Sep 17 00:00:00 2001 From: GiganticPrime Date: Wed, 9 Aug 2023 21:26:12 +0900 Subject: [PATCH 049/169] Added the logic for starchat model series (#3185) --- models/config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/models/config.yaml b/models/config.yaml index 97ec9e92..7e3e8ca4 100644 --- a/models/config.yaml +++ b/models/config.yaml @@ -234,6 +234,7 @@ TheBloke_WizardLM-30B-GPTQ: .*starchat-beta: mode: 'instruct' instruction_template: 'Starchat-Beta' + custom_stopping_strings: '"<|end|>"' .*minotaur: mode: 'instruct' instruction_template: 'Minotaur' From 2255349f1904b1bfc1e3c6ef790777ad035363ea Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 9 Aug 2023 05:46:25 -0700 Subject: [PATCH 050/169] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f7e18350..98de6c09 100644 --- a/README.md +++ b/README.md @@ -303,8 +303,8 @@ Optionally, you can use the following command-line flags: | Flag | Description | |------------------|-------------| -|`--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should typically be set to max_seq_len / 2048. | |`--alpha_value ALPHA_VALUE` | Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both. | +|`--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should typically be set to max_seq_len / 2048. | #### Gradio From 6c6a52aaad8e5d3ed28878ce9c51848f00ec422a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 9 Aug 2023 07:47:19 -0700 Subject: [PATCH 051/169] Change the filenames for caches and histories --- modules/chat.py | 14 +++++++++++--- modules/html_generator.py | 13 +++++++++++-- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index efb7ecb8..c9af55db 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -395,6 +395,7 @@ def save_history(history, path=None): p = path or Path('logs/exported_history.json') if not p.parent.is_dir(): p.parent.mkdir(parents=True) + with open(p, 'w', encoding='utf-8') as f: f.write(json.dumps(history, indent=4)) @@ -415,7 +416,7 @@ def load_history(file, history): def save_persistent_history(history, character, mode): if mode in ['chat', 'chat-instruct'] and character not in ['', 'None', None] and not shared.args.multi_user: - save_history(history, path=Path(f'logs/{character}_persistent.json')) + save_history(history, path=Path(f'logs/persistent_{character}.json')) def load_persistent_history(state): @@ -428,8 +429,15 @@ def load_persistent_history(state): character = state['character_menu'] greeting = replace_character_names(state['greeting'], state['name1'], state['name2']) - p = Path(f'logs/{character}_persistent.json') - if not shared.args.multi_user and character not in ['None', '', None] and p.exists(): + + should_load_history = (not shared.args.multi_user and character not in ['None', '', None]) + old_p = Path(f'logs/{character}_persistent.json') + p = Path(f'logs/persistent_{character}.json') + if should_load_history and old_p.exists(): + logger.warning(f"Renaming {old_p} to {p}") + old_p.rename(p) + + if should_load_history and p.exists(): f = json.loads(open(p, 
'rb').read()) if 'internal' in f and 'visible' in f: history = f diff --git a/modules/html_generator.py b/modules/html_generator.py index 15c731c3..422beb30 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -6,6 +6,7 @@ from pathlib import Path import markdown from PIL import Image, ImageOps +from modules.logging_colors import logger from modules.utils import get_available_chat_styles # This is to store the paths to the thumbnails of the profile pictures @@ -120,6 +121,7 @@ def generate_4chan_html(f): post = line else: post += line + if post != '': src = process_post(post, c) posts.append(src) @@ -134,13 +136,14 @@ def generate_4chan_html(f): output += f'
    ' for post in posts: output += post + output += '
    ' output = output.split('\n') for i in range(len(output)): output[i] = re.sub(r'^(>(.*?)(
    |
    ))', r'\1', output[i]) output[i] = re.sub(r'^
    (>(.*?)(
    |
    ))', r'
    \1', output[i]) - output = '\n'.join(output) + output = '\n'.join(output) return output @@ -160,7 +163,13 @@ def get_image_cache(path): mtime = os.stat(path).st_mtime if (path in image_cache and mtime != image_cache[path][0]) or (path not in image_cache): img = make_thumbnail(Image.open(path)) - output_file = Path(f'cache/{path.name}_cache.png') + + old_p = Path(f'cache/{path.name}_cache.png') + p = Path(f'cache/cache_{path.name}.png') + if old_p.exists(): + old_p.rename(p) + + output_file = p img.convert('RGB').save(output_file, format='PNG') image_cache[path] = [mtime, output_file.as_posix()] From 7c1300fab55e39f80c1d6fa8f547b9a7c41fcbcd Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 9 Aug 2023 08:07:55 -0700 Subject: [PATCH 052/169] Pin aiofiles version to fix statvfs issue --- requirements.txt | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index da6a5f20..0cea4fe8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,12 @@ +aiofiles==23.1.0 +fastapi==0.95.2 +gradio_client==0.2.5 +gradio==3.33.1 + accelerate==0.21.0 colorama datasets einops -fastapi==0.95.2 -gradio_client==0.2.5 -gradio==3.33.1 markdown numpy pandas @@ -17,14 +19,17 @@ sentencepiece tensorboard tqdm wandb + git+https://github.com/huggingface/peft@96c0277a1b9a381b10ab34dbf84917f9b3b992e6 git+https://github.com/huggingface/transformers@baf1daa58eb2960248fd9f7c3af0ed245b8ce4af + bitsandbytes==0.41.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" + # llama-cpp-python without GPU support llama-cpp-python==0.1.77; platform_system != "Windows" https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.77/llama_cpp_python-0.1.77-cp310-cp310-win_amd64.whl; platform_system == "Windows" From a4e48cbdb67b50cc174c8e09704e54472b059aa2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 9 Aug 2023 08:31:17 -0700 Subject: [PATCH 053/169] Bump AutoGPTQ --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0cea4fe8..e65bed6b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,8 +25,8 @@ git+https://github.com/huggingface/transformers@baf1daa58eb2960248fd9f7c3af0ed24 bitsandbytes==0.41.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-linux_x86_64.whl; 
platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.0/auto_gptq-0.4.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.0/auto_gptq-0.4.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From a3295dd6667219da2458a1420b746008d71b18b4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 9 Aug 2023 10:38:35 -0700 Subject: [PATCH 054/169] Detect n_gqa and prompt template for wizardlm-70b --- instruction-templates/WizardLM.yaml | 4 ---- models/config.yaml | 20 ++------------------ 2 files changed, 2 insertions(+), 22 deletions(-) delete mode 100644 instruction-templates/WizardLM.yaml diff --git a/instruction-templates/WizardLM.yaml b/instruction-templates/WizardLM.yaml deleted file mode 100644 index c65bb8f4..00000000 --- a/instruction-templates/WizardLM.yaml +++ /dev/null @@ -1,4 +0,0 @@ -user: "" -bot: "### Response:" -turn_template: "<|user-message|>\n\n<|bot|><|bot-message|>\n\n" -context: "" \ No newline at end of file diff --git a/models/config.yaml b/models/config.yaml index 7e3e8ca4..23862770 100644 --- a/models/config.yaml +++ b/models/config.yaml @@ -53,9 +53,6 @@ llama-65b-gptq-3bit: .*vicuna.*(1.1|1_1|1.3|1_3): mode: 'instruct' instruction_template: 'Vicuna-v1.1' -.*wizard.*vicuna: - mode: 'instruct' - instruction_template: 'Vicuna-v1.1' .*stable.*vicuna: mode: 'instruct' instruction_template: 'StableVicuna' @@ -108,10 +105,6 @@ llama-65b-gptq-3bit: truncation_length: 4096 .*stablelm-base: truncation_length: 4096 -.*wizardlm: - mode: 'instruct' - model_type: 'llama' - instruction_template: 'WizardLM' .*galactica.*finetuned: mode: 'instruct' instruction_template: 'Galactica Finetuned' @@ -189,21 +182,12 @@ llama-65b-gptq-3bit: .*airoboros.*1.2: mode: 'instruct' instruction_template: 'Airoboros-v1.2' -.*WizardLM-30B-V1.0: - mode: 'instruct' - instruction_template: 'Vicuna-v1.1' -TheBloke_WizardLM-30B-GPTQ: - mode: 'instruct' - instruction_template: 'Vicuna-v1.1' .*alpa(cino|sta): mode: 'instruct' instruction_template: 'Alpaca' .*hippogriff: mode: 'instruct' instruction_template: 'Hippogriff' -.*gpt4all-.*-snoozy: - mode: 'instruct' - instruction_template: 'WizardLM' .*lazarus: mode: 'instruct' instruction_template: 'Alpaca' @@ -267,7 +251,7 @@ TheBloke_WizardLM-30B-GPTQ: mode: 'instruct' instruction_template: 'Alpaca' truncation_length: 8192 -.*wizardlm-.*-v1.1: +.*wizardlm: mode: 'instruct' instruction_template: 'Vicuna-v1.1' .*godzilla: @@ -279,7 +263,7 @@ TheBloke_WizardLM-30B-GPTQ: .*llama-(2|v2).*chat: mode: 'instruct' instruction_template: 'Llama-v2' -.*llama.*70b.*ggml.*\.bin: +.*70b.*ggml.*\.bin: n_gqa: 8 .*newhope: mode: 'instruct' From bee73cedbd535d8a5392472c402c843b3ed10e27 Mon Sep 17 00:00:00 2001 From: jllllll <3887729+jllllll@users.noreply.github.com> Date: Wed, 9 Aug 2023 23:42:34 -0500 Subject: [PATCH 055/169] Streamline GPTQ-for-LLaMa support --- README.md | 3 -- modules/GPTQ_loader.py | 64 ++++++++++------------------------------ modules/shared.py | 3 -- modules/ui_model_menu.py | 2 +- requirements.txt | 4 +++ 5 files changed, 21 
insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index 98de6c09..ad2ad1ed 100644 --- a/README.md +++ b/README.md @@ -280,9 +280,6 @@ Optionally, you can use the following command-line flags: | `--pre_layer PRE_LAYER [PRE_LAYER ...]` | The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated by spaces, eg `--pre_layer 30 60`. | | `--checkpoint CHECKPOINT` | The path to the quantized checkpoint file. If not specified, it will be automatically detected. | | `--monkey-patch` | Apply the monkey patch for using LoRAs with quantized models. -| `--quant_attn` | (triton) Enable quant attention. | -| `--warmup_autotune` | (triton) Enable warmup autotune. | -| `--fused_mlp` | (triton) Enable fused mlp. | #### DeepSpeed diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py index ddc5f9a5..c0cef476 100644 --- a/modules/GPTQ_loader.py +++ b/modules/GPTQ_loader.py @@ -11,26 +11,9 @@ from transformers import AutoConfig, AutoModelForCausalLM import modules.shared as shared from modules.logging_colors import logger -sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa"))) - -try: - import llama_inference_offload -except ImportError: - logger.error('Failed to load GPTQ-for-LLaMa') - logger.error('See https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md') - sys.exit(-1) - -try: - from modelutils import find_layers -except ImportError: - from utils import find_layers - -try: - from quant import make_quant - is_triton = False -except ImportError: - import quant - is_triton = True +from gptq_for_llama import llama_inference_offload +from gptq_for_llama.modelutils import find_layers +from gptq_for_llama.quant import make_quant # This function is a replacement for the load_quant function in the @@ -59,24 +42,21 @@ def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exc if name in layers: del layers[name] - if not is_triton: - gptq_args = inspect.getfullargspec(make_quant).args + gptq_args = inspect.getfullargspec(make_quant).args - make_quant_kwargs = { - 'module': model, - 'names': layers, - 'bits': wbits, - } - if 'groupsize' in gptq_args: - make_quant_kwargs['groupsize'] = groupsize - if 'faster' in gptq_args: - make_quant_kwargs['faster'] = faster_kernel - if 'kernel_switch_threshold' in gptq_args: - make_quant_kwargs['kernel_switch_threshold'] = kernel_switch_threshold + make_quant_kwargs = { + 'module': model, + 'names': layers, + 'bits': wbits, + } + if 'groupsize' in gptq_args: + make_quant_kwargs['groupsize'] = groupsize + if 'faster' in gptq_args: + make_quant_kwargs['faster'] = faster_kernel + if 'kernel_switch_threshold' in gptq_args: + make_quant_kwargs['kernel_switch_threshold'] = kernel_switch_threshold - make_quant(**make_quant_kwargs) - else: - quant.make_quant_linear(model, layers, wbits, groupsize) + make_quant(**make_quant_kwargs) del layers if checkpoint.endswith('.safetensors'): @@ -85,18 +65,6 @@ def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exc else: model.load_state_dict(torch.load(checkpoint), strict=False) - if is_triton: - if shared.args.quant_attn: - quant.make_quant_attn(model) - - if eval and shared.args.fused_mlp: - quant.make_fused_mlp(model) - - if shared.args.warmup_autotune: - quant.autotune_warmup_linear(model, transpose=not eval) - if eval and shared.args.fused_mlp: - quant.autotune_warmup_fused(model) - model.seqlen = 2048 return model diff --git 
a/modules/shared.py b/modules/shared.py index 951120c8..224fa6aa 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -138,9 +138,6 @@ parser.add_argument('--groupsize', type=int, default=-1, help='Group size.') parser.add_argument('--pre_layer', type=int, nargs="+", help='The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated by spaces, eg --pre_layer 30 60.') parser.add_argument('--checkpoint', type=str, help='The path to the quantized checkpoint file. If not specified, it will be automatically detected.') parser.add_argument('--monkey-patch', action='store_true', help='Apply the monkey patch for using LoRAs with quantized models.') -parser.add_argument('--quant_attn', action='store_true', help='(triton) Enable quant attention.') -parser.add_argument('--warmup_autotune', action='store_true', help='(triton) Enable warmup autotune.') -parser.add_argument('--fused_mlp', action='store_true', help='(triton) Enable fused mlp.') # AutoGPTQ parser.add_argument('--triton', action='store_true', help='Use triton.') diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 55416a07..e98e237c 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -110,7 +110,7 @@ def create_ui(): shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed) shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.') - shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa is currently 2x faster than AutoGPTQ on some systems. It is installed by default with the one-click installers. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).') + shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the one-click installers. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).') shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).') shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.') shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF is a wrapper that lets you use llama.cpp like a Transformers model, which means it can use the Transformers samplers. 
To use it, make sure to first download oobabooga/llama-tokenizer under "Download custom model or LoRA".') diff --git a/requirements.txt b/requirements.txt index e65bed6b..b27e14c5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -36,3 +36,7 @@ https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.77/llama_cpp_ # llama-cpp-python with CUDA support https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.1.77+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.1.77+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" + +# GPTQ-for-LLaMa +https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" +https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From e3d3565b2a538da8769fd0352067647529b2298c Mon Sep 17 00:00:00 2001 From: jllllll <3887729+jllllll@users.noreply.github.com> Date: Wed, 9 Aug 2023 23:59:04 -0500 Subject: [PATCH 056/169] Remove GPTQ-for-LLaMa monkey patch support AutoGPTQ will be the preferred GPTQ LoRa loader in the future. --- README.md | 1 - docs/GPTQ-models-(4-bit-mode).md | 27 ------------------- docs/LoRA.md | 1 - docs/Training-LoRAs.md | 8 ------ modules/monkey_patch_gptq_lora.py | 43 ------------------------------- modules/training.py | 23 ----------------- 6 files changed, 103 deletions(-) delete mode 100644 modules/monkey_patch_gptq_lora.py diff --git a/README.md b/README.md index ad2ad1ed..5739d0ba 100644 --- a/README.md +++ b/README.md @@ -279,7 +279,6 @@ Optionally, you can use the following command-line flags: | `--groupsize GROUPSIZE` | Group size. | | `--pre_layer PRE_LAYER [PRE_LAYER ...]` | The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated by spaces, eg `--pre_layer 30 60`. | | `--checkpoint CHECKPOINT` | The path to the quantized checkpoint file. If not specified, it will be automatically detected. | -| `--monkey-patch` | Apply the monkey patch for using LoRAs with quantized models. #### DeepSpeed diff --git a/docs/GPTQ-models-(4-bit-mode).md b/docs/GPTQ-models-(4-bit-mode).md index 838595ef..d3869bb7 100644 --- a/docs/GPTQ-models-(4-bit-mode).md +++ b/docs/GPTQ-models-(4-bit-mode).md @@ -198,31 +198,4 @@ Output generated in 123.79 seconds (1.61 tokens/s, 199 tokens) You can also use multiple GPUs with `pre_layer` if using the oobabooga fork of GPTQ, eg `--pre_layer 30 60` will load a LLaMA-30B model half onto your first GPU and half onto your second, or `--pre_layer 20 40` will load 20 layers onto GPU-0, 20 layers onto GPU-1, and 20 layers offloaded to CPU. -### Using LoRAs with GPTQ-for-LLaMa - -This requires using a monkey patch that is supported by this web UI: https://github.com/johnsmith0031/alpaca_lora_4bit - -To use it: - -1. Clone `johnsmith0031/alpaca_lora_4bit` into the repositories folder: - -``` -cd text-generation-webui/repositories -git clone https://github.com/johnsmith0031/alpaca_lora_4bit -``` - -⚠️ I have tested it with the following commit specifically: `2f704b93c961bf202937b10aac9322b092afdce0` - -2. 
Install https://github.com/sterlind/GPTQ-for-LLaMa with this command: - -``` -pip install git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit -``` - -3. Start the UI with the `--monkey-patch` flag: - -``` -python server.py --model llama-7b-4bit-128g --listen --lora tloen_alpaca-lora-7b --monkey-patch -``` - diff --git a/docs/LoRA.md b/docs/LoRA.md index f1504d10..02ce55be 100644 --- a/docs/LoRA.md +++ b/docs/LoRA.md @@ -11,7 +11,6 @@ This is the current state of LoRA integration in the web UI: | Transformers | Full support in 16-bit, `--load-in-8bit`, `--load-in-4bit`, and CPU modes. | | ExLlama | Single LoRA support. Fast to remove the LoRA afterwards. | | AutoGPTQ | Single LoRA support. Removing the LoRA requires reloading the entire model.| -| GPTQ-for-LLaMa | Full support with the [monkey patch](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#using-loras-with-gptq-for-llama). | ## Downloading a LoRA diff --git a/docs/Training-LoRAs.md b/docs/Training-LoRAs.md index 83e6d5a7..bdc79992 100644 --- a/docs/Training-LoRAs.md +++ b/docs/Training-LoRAs.md @@ -131,14 +131,6 @@ So, in effect, Loss is a balancing game: you want to get it low enough that it u Note: if you see Loss start at or suddenly jump to exactly `0`, it is likely something has gone wrong in your training process (eg model corruption). -## Note: 4-Bit Monkeypatch - -The [4-bit LoRA monkeypatch](GPTQ-models-(4-bit-mode).md#using-loras-in-4-bit-mode) works for training, but has side effects: -- VRAM usage is higher currently. You can reduce the `Micro Batch Size` to `1` to compensate. -- Models do funky things. LoRAs apply themselves, or refuse to apply, or spontaneously error out, or etc. It can be helpful to reload base model or restart the WebUI between training/usage to minimize chances of anything going haywire. -- Loading or working with multiple LoRAs at the same time doesn't currently work. -- Generally, recognize and treat the monkeypatch as the dirty temporary hack it is - it works, but isn't very stable. It will get better in time when everything is merged upstream for full official support. - ## Legacy notes LoRA training was contributed by [mcmonkey4eva](https://github.com/mcmonkey4eva) in PR [#570](https://github.com/oobabooga/text-generation-webui/pull/570). 
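The loss guidance above is what the `stop_at_loss` argument to `do_train()` in `modules/training.py` enforces (that parameter is visible in the training hunks further down). As a minimal, standalone sketch of the same idea using a `transformers` `TrainerCallback` — the class name and the example threshold are illustrative, not the web UI's actual implementation:

```python
from transformers import TrainerCallback


class LossThresholdCallback(TrainerCallback):
    # Illustrative sketch: stop training once the logged loss reaches a target,
    # and warn if it collapses to exactly 0 (usually a sign something broke).
    def __init__(self, stop_at_loss=1.8):
        self.stop_at_loss = stop_at_loss

    def on_log(self, args, state, control, logs=None, **kwargs):
        loss = (logs or {}).get("loss")
        if loss is None:
            return
        if loss == 0.0:
            print("Warning: loss is exactly 0 - check the model and dataset for corruption.")
        elif loss <= self.stop_at_loss:
            control.should_training_stop = True
            print(f"Loss {loss:.3f} reached the {self.stop_at_loss} target; stopping early.")
```

Attaching it to a `Trainer` is a one-liner, e.g. `trainer.add_callback(LossThresholdCallback(stop_at_loss=1.8))`; the threshold itself is whatever you choose per the balancing-game note above.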
diff --git a/modules/monkey_patch_gptq_lora.py b/modules/monkey_patch_gptq_lora.py deleted file mode 100644 index bf8d478d..00000000 --- a/modules/monkey_patch_gptq_lora.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copied from https://github.com/johnsmith0031/alpaca_lora_4bit - -import sys -from pathlib import Path - -sys.path.insert(0, str(Path("repositories/alpaca_lora_4bit"))) - -import autograd_4bit -from amp_wrapper import AMPWrapper -from autograd_4bit import ( - Autograd4bitQuantLinear, - load_llama_model_4bit_low_ram -) -from monkeypatch.peft_tuners_lora_monkey_patch import ( - Linear4bitLt, - replace_peft_model_with_gptq_lora_model -) - -from modules import shared -from modules.GPTQ_loader import find_quantized_model_file - -replace_peft_model_with_gptq_lora_model() - - -def load_model_llama(model_name): - config_path = str(Path(f'{shared.args.model_dir}/{model_name}')) - model_path = str(find_quantized_model_file(model_name)) - model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path, groupsize=shared.args.groupsize, is_v1_model=False) - for n, m in model.named_modules(): - if isinstance(m, Autograd4bitQuantLinear) or isinstance(m, Linear4bitLt): - if m.is_v1_model: - m.zeros = m.zeros.half() - m.scales = m.scales.half() - m.bias = m.bias.half() - - autograd_4bit.use_new = True - autograd_4bit.auto_switch = True - - model.half() - wrapper = AMPWrapper(model) - wrapper.apply_generate() - - return model, tokenizer diff --git a/modules/training.py b/modules/training.py index 7558cd5d..fa721ff0 100644 --- a/modules/training.py +++ b/modules/training.py @@ -270,12 +270,6 @@ def calc_trainable_parameters(model): def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, raw_text_file: str, overlap_len: int, newline_favor_len: int, higher_rank_limit: bool, warmup_steps: int, optimizer: str, hard_cut_string: str, train_only_after: str, stop_at_loss: float, add_eos_token: bool, min_chars: int, report_to: str): - if shared.args.monkey_patch: - from monkeypatch.peft_tuners_lora_monkey_patch import ( - replace_peft_model_with_gptq_lora_model - ) - replace_peft_model_with_gptq_lora_model() - global WANT_INTERRUPT WANT_INTERRUPT = False @@ -307,15 +301,6 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch time.sleep(5) - if shared.args.wbits > 0 and not shared.args.monkey_patch: - yield "LoRA training with GPTQ models requires loading with `--monkey-patch`" - return - - elif not (shared.args.load_in_8bit or shared.args.load_in_4bit) and shared.args.wbits <= 0: - yield "It is highly recommended you use `--load-in-8bit` for LoRA training. *(Will continue anyway in 2 seconds, press `Interrupt` to stop.)*" - logger.warning("It is highly recommended you use `--load-in-8bit` for LoRA training.") - time.sleep(2) # Give it a moment for the message to show in UI before continuing - if cutoff_len <= 0 or micro_batch_size <= 0 or batch_size <= 0 or actual_lr <= 0 or lora_rank <= 0 or lora_alpha <= 0: yield "Cannot input zeroes." 
return @@ -520,14 +505,6 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch yield traceback.format_exc().replace('\n', '\n\n') return - if shared.args.monkey_patch: - for n, m in lora_model.named_modules(): - if '4bit' in str(type(m)): - if m.is_v1_model: - m.zeros = m.zeros.half() - - m.scales = m.scales.half() - class Tracked(): def __init__(self): self.current_steps = 0 From d7ee4c23862081f6b8dbbaac8f22e7bc519da172 Mon Sep 17 00:00:00 2001 From: jllllll <3887729+jllllll@users.noreply.github.com> Date: Thu, 10 Aug 2023 00:10:14 -0500 Subject: [PATCH 057/169] Remove unused import --- modules/GPTQ_loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py index c0cef476..bc528b18 100644 --- a/modules/GPTQ_loader.py +++ b/modules/GPTQ_loader.py @@ -1,6 +1,5 @@ import inspect import re -import sys from pathlib import Path import accelerate From d6765bebc4920827200ee5779e2441dec65763e1 Mon Sep 17 00:00:00 2001 From: jllllll <3887729+jllllll@users.noreply.github.com> Date: Thu, 10 Aug 2023 00:53:48 -0500 Subject: [PATCH 058/169] Update installation documentation --- docs/GPTQ-models-(4-bit-mode).md | 55 ++++---------------------------- modules/ui_model_menu.py | 2 +- 2 files changed, 8 insertions(+), 49 deletions(-) diff --git a/docs/GPTQ-models-(4-bit-mode).md b/docs/GPTQ-models-(4-bit-mode).md index d3869bb7..e8d983eb 100644 --- a/docs/GPTQ-models-(4-bit-mode).md +++ b/docs/GPTQ-models-(4-bit-mode).md @@ -70,53 +70,13 @@ Not supported yet. GPTQ-for-LLaMa is the original adaptation of GPTQ for the LLaMA model. It was made possible by [@qwopqwop200](https://github.com/qwopqwop200/GPTQ-for-LLaMa): https://github.com/qwopqwop200/GPTQ-for-LLaMa -Different branches of GPTQ-for-LLaMa are currently available, including: - -| Branch | Comment | -|----|----| -| [Old CUDA branch (recommended)](https://github.com/oobabooga/GPTQ-for-LLaMa/) | The fastest branch, works on Windows and Linux. | -| [Up-to-date triton branch](https://github.com/qwopqwop200/GPTQ-for-LLaMa) | Slightly more precise than the old CUDA branch from 13b upwards, significantly more precise for 7b. 2x slower for small context size and only works on Linux. | -| [Up-to-date CUDA branch](https://github.com/qwopqwop200/GPTQ-for-LLaMa/tree/cuda) | As precise as the up-to-date triton branch, 10x slower than the old cuda branch for small context size. | - -Overall, I recommend using the old CUDA branch. It is included by default in the one-click-installer for this web UI. - -### Installation - -Start by cloning GPTQ-for-LLaMa into your `text-generation-webui/repositories` folder: - -``` -mkdir repositories -cd repositories -git clone https://github.com/oobabooga/GPTQ-for-LLaMa.git -b cuda -``` - -If you want to you to use the up-to-date CUDA or triton branches instead of the old CUDA branch, use these commands: - -``` -git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git -b cuda -``` - -``` -git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git -b triton -``` - -Next you need to install the CUDA extensions. You can do that either by installing the precompiled wheels, or by compiling the wheels yourself. 
+A Python package containing both major CUDA versions of GPTQ-for-LLaMa is used to simplify installation and compatibility: https://github.com/jllllll/GPTQ-for-LLaMa-CUDA ### Precompiled wheels -Kindly provided by our friend jllllll: https://github.com/jllllll/GPTQ-for-LLaMa-Wheels +Kindly provided by our friend jllllll: https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases -Windows: - -``` -pip install https://github.com/jllllll/GPTQ-for-LLaMa-Wheels/raw/main/quant_cuda-0.0.0-cp310-cp310-win_amd64.whl -``` - -Linux: - -``` -pip install https://github.com/jllllll/GPTQ-for-LLaMa-Wheels/raw/Linux-x64/quant_cuda-0.0.0-cp310-cp310-linux_x86_64.whl -``` +Wheels are included in requirements.txt and are installed with the webui on supported systems. ### Manual installation @@ -124,20 +84,19 @@ pip install https://github.com/jllllll/GPTQ-for-LLaMa-Wheels/raw/Linux-x64/quant ``` conda activate textgen -conda install -c conda-forge cudatoolkit-dev +conda install cuda -c nvidia/label/cuda-11.7.1 ``` The command above takes some 10 minutes to run and shows no progress bar or updates along the way. -You are also going to need to have a C++ compiler installed. On Linux, `sudo apt install build-essential` or equivalent is enough. +You are also going to need to have a C++ compiler installed. On Linux, `sudo apt install build-essential` or equivalent is enough. On Windows, Visual Studio or Visual Studio Build Tools is required. -If you're using an older version of CUDA toolkit (e.g. 11.7) but the latest version of `gcc` and `g++` (12.0+), you should downgrade with: `conda install -c conda-forge gxx==11.3.0`. Kernel compilation will fail otherwise. +If you're using an older version of CUDA toolkit (e.g. 11.7) but the latest version of `gcc` and `g++` (12.0+) on Linux, you should downgrade with: `conda install -c conda-forge gxx==11.3.0`. Kernel compilation will fail otherwise. #### Step 2: compile the CUDA extensions ``` -cd repositories/GPTQ-for-LLaMa -python setup_cuda.py install +python -m pip install git+https://github.com/jllllll/GPTQ-for-LLaMa-CUDA -v ``` ### Getting pre-converted LLaMA weights diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index e98e237c..0c1042f6 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -110,7 +110,7 @@ def create_ui(): shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed) shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.') - shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the one-click installers. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).') + shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. 
Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).') shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).') shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.') shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF is a wrapper that lets you use llama.cpp like a Transformers model, which means it can use the Transformers samplers. To use it, make sure to first download oobabooga/llama-tokenizer under "Download custom model or LoRA".') From 16e2b117b415074afd2917a72496b776debfcd58 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 10 Aug 2023 08:38:10 -0700 Subject: [PATCH 059/169] Minor doc change --- docs/GPTQ-models-(4-bit-mode).md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/GPTQ-models-(4-bit-mode).md b/docs/GPTQ-models-(4-bit-mode).md index e8d983eb..b42f4224 100644 --- a/docs/GPTQ-models-(4-bit-mode).md +++ b/docs/GPTQ-models-(4-bit-mode).md @@ -64,7 +64,7 @@ python server.py --autogptq --gpu-memory 3000MiB 6000MiB --model model_name ### Using LoRAs with AutoGPTQ -Not supported yet. +Works fine for a single LoRA. ## GPTQ-for-LLaMa From c7f52bbdc106896b8f839c442b0c82937f006fd8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 10 Aug 2023 08:39:41 -0700 Subject: [PATCH 060/169] Revert "Remove GPTQ-for-LLaMa monkey patch support" This reverts commit e3d3565b2a538da8769fd0352067647529b2298c. --- README.md | 1 + docs/GPTQ-models-(4-bit-mode).md | 27 +++++++++++++++++++ docs/LoRA.md | 1 + docs/Training-LoRAs.md | 8 ++++++ modules/monkey_patch_gptq_lora.py | 43 +++++++++++++++++++++++++++++++ modules/training.py | 23 +++++++++++++++++ 6 files changed, 103 insertions(+) create mode 100644 modules/monkey_patch_gptq_lora.py diff --git a/README.md b/README.md index 5739d0ba..ad2ad1ed 100644 --- a/README.md +++ b/README.md @@ -279,6 +279,7 @@ Optionally, you can use the following command-line flags: | `--groupsize GROUPSIZE` | Group size. | | `--pre_layer PRE_LAYER [PRE_LAYER ...]` | The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated by spaces, eg `--pre_layer 30 60`. | | `--checkpoint CHECKPOINT` | The path to the quantized checkpoint file. If not specified, it will be automatically detected. | +| `--monkey-patch` | Apply the monkey patch for using LoRAs with quantized models. #### DeepSpeed diff --git a/docs/GPTQ-models-(4-bit-mode).md b/docs/GPTQ-models-(4-bit-mode).md index b42f4224..428d7560 100644 --- a/docs/GPTQ-models-(4-bit-mode).md +++ b/docs/GPTQ-models-(4-bit-mode).md @@ -157,4 +157,31 @@ Output generated in 123.79 seconds (1.61 tokens/s, 199 tokens) You can also use multiple GPUs with `pre_layer` if using the oobabooga fork of GPTQ, eg `--pre_layer 30 60` will load a LLaMA-30B model half onto your first GPU and half onto your second, or `--pre_layer 20 40` will load 20 layers onto GPU-0, 20 layers onto GPU-1, and 20 layers offloaded to CPU. 
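The `pre_layer` values in that example are cumulative layer boundaries, not per-GPU counts. A minimal sketch of the mapping they describe, assuming a 60-layer model such as LLaMA-30B — this only illustrates the semantics of the paragraph above, not the actual GPTQ-for-LLaMa offloading code:

```python
def pre_layer_device_map(pre_layer, num_layers):
    # Illustrative sketch: pre_layer=[20, 40] with num_layers=60 puts layers
    # 0-19 on cuda:0, 20-39 on cuda:1, and 40-59 on the CPU.
    device_map = {}
    for layer in range(num_layers):
        for gpu_index, boundary in enumerate(pre_layer):
            if layer < boundary:
                device_map[layer] = f"cuda:{gpu_index}"
                break
        else:
            device_map[layer] = "cpu"
    return device_map


print(pre_layer_device_map([20, 40], 60))
```

Running it reproduces the `--pre_layer 20 40` example: 20 layers on GPU-0, 20 on GPU-1, and the remaining 20 offloaded to CPU.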
+### Using LoRAs with GPTQ-for-LLaMa + +This requires using a monkey patch that is supported by this web UI: https://github.com/johnsmith0031/alpaca_lora_4bit + +To use it: + +1. Clone `johnsmith0031/alpaca_lora_4bit` into the repositories folder: + +``` +cd text-generation-webui/repositories +git clone https://github.com/johnsmith0031/alpaca_lora_4bit +``` + +⚠️ I have tested it with the following commit specifically: `2f704b93c961bf202937b10aac9322b092afdce0` + +2. Install https://github.com/sterlind/GPTQ-for-LLaMa with this command: + +``` +pip install git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit +``` + +3. Start the UI with the `--monkey-patch` flag: + +``` +python server.py --model llama-7b-4bit-128g --listen --lora tloen_alpaca-lora-7b --monkey-patch +``` + diff --git a/docs/LoRA.md b/docs/LoRA.md index 02ce55be..f1504d10 100644 --- a/docs/LoRA.md +++ b/docs/LoRA.md @@ -11,6 +11,7 @@ This is the current state of LoRA integration in the web UI: | Transformers | Full support in 16-bit, `--load-in-8bit`, `--load-in-4bit`, and CPU modes. | | ExLlama | Single LoRA support. Fast to remove the LoRA afterwards. | | AutoGPTQ | Single LoRA support. Removing the LoRA requires reloading the entire model.| +| GPTQ-for-LLaMa | Full support with the [monkey patch](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#using-loras-with-gptq-for-llama). | ## Downloading a LoRA diff --git a/docs/Training-LoRAs.md b/docs/Training-LoRAs.md index bdc79992..83e6d5a7 100644 --- a/docs/Training-LoRAs.md +++ b/docs/Training-LoRAs.md @@ -131,6 +131,14 @@ So, in effect, Loss is a balancing game: you want to get it low enough that it u Note: if you see Loss start at or suddenly jump to exactly `0`, it is likely something has gone wrong in your training process (eg model corruption). +## Note: 4-Bit Monkeypatch + +The [4-bit LoRA monkeypatch](GPTQ-models-(4-bit-mode).md#using-loras-in-4-bit-mode) works for training, but has side effects: +- VRAM usage is higher currently. You can reduce the `Micro Batch Size` to `1` to compensate. +- Models do funky things. LoRAs apply themselves, or refuse to apply, or spontaneously error out, or etc. It can be helpful to reload base model or restart the WebUI between training/usage to minimize chances of anything going haywire. +- Loading or working with multiple LoRAs at the same time doesn't currently work. +- Generally, recognize and treat the monkeypatch as the dirty temporary hack it is - it works, but isn't very stable. It will get better in time when everything is merged upstream for full official support. + ## Legacy notes LoRA training was contributed by [mcmonkey4eva](https://github.com/mcmonkey4eva) in PR [#570](https://github.com/oobabooga/text-generation-webui/pull/570). 
diff --git a/modules/monkey_patch_gptq_lora.py b/modules/monkey_patch_gptq_lora.py new file mode 100644 index 00000000..bf8d478d --- /dev/null +++ b/modules/monkey_patch_gptq_lora.py @@ -0,0 +1,43 @@ +# Copied from https://github.com/johnsmith0031/alpaca_lora_4bit + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path("repositories/alpaca_lora_4bit"))) + +import autograd_4bit +from amp_wrapper import AMPWrapper +from autograd_4bit import ( + Autograd4bitQuantLinear, + load_llama_model_4bit_low_ram +) +from monkeypatch.peft_tuners_lora_monkey_patch import ( + Linear4bitLt, + replace_peft_model_with_gptq_lora_model +) + +from modules import shared +from modules.GPTQ_loader import find_quantized_model_file + +replace_peft_model_with_gptq_lora_model() + + +def load_model_llama(model_name): + config_path = str(Path(f'{shared.args.model_dir}/{model_name}')) + model_path = str(find_quantized_model_file(model_name)) + model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path, groupsize=shared.args.groupsize, is_v1_model=False) + for n, m in model.named_modules(): + if isinstance(m, Autograd4bitQuantLinear) or isinstance(m, Linear4bitLt): + if m.is_v1_model: + m.zeros = m.zeros.half() + m.scales = m.scales.half() + m.bias = m.bias.half() + + autograd_4bit.use_new = True + autograd_4bit.auto_switch = True + + model.half() + wrapper = AMPWrapper(model) + wrapper.apply_generate() + + return model, tokenizer diff --git a/modules/training.py b/modules/training.py index fa721ff0..7558cd5d 100644 --- a/modules/training.py +++ b/modules/training.py @@ -270,6 +270,12 @@ def calc_trainable_parameters(model): def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, raw_text_file: str, overlap_len: int, newline_favor_len: int, higher_rank_limit: bool, warmup_steps: int, optimizer: str, hard_cut_string: str, train_only_after: str, stop_at_loss: float, add_eos_token: bool, min_chars: int, report_to: str): + if shared.args.monkey_patch: + from monkeypatch.peft_tuners_lora_monkey_patch import ( + replace_peft_model_with_gptq_lora_model + ) + replace_peft_model_with_gptq_lora_model() + global WANT_INTERRUPT WANT_INTERRUPT = False @@ -301,6 +307,15 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch time.sleep(5) + if shared.args.wbits > 0 and not shared.args.monkey_patch: + yield "LoRA training with GPTQ models requires loading with `--monkey-patch`" + return + + elif not (shared.args.load_in_8bit or shared.args.load_in_4bit) and shared.args.wbits <= 0: + yield "It is highly recommended you use `--load-in-8bit` for LoRA training. *(Will continue anyway in 2 seconds, press `Interrupt` to stop.)*" + logger.warning("It is highly recommended you use `--load-in-8bit` for LoRA training.") + time.sleep(2) # Give it a moment for the message to show in UI before continuing + if cutoff_len <= 0 or micro_batch_size <= 0 or batch_size <= 0 or actual_lr <= 0 or lora_rank <= 0 or lora_alpha <= 0: yield "Cannot input zeroes." 
return @@ -505,6 +520,14 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch yield traceback.format_exc().replace('\n', '\n\n') return + if shared.args.monkey_patch: + for n, m in lora_model.named_modules(): + if '4bit' in str(type(m)): + if m.is_v1_model: + m.zeros = m.zeros.half() + + m.scales = m.scales.half() + class Tracked(): def __init__(self): self.current_steps = 0 From e12a1852d9b617fc851caa5363527ea0e2ce8b4f Mon Sep 17 00:00:00 2001 From: Gennadij Date: Thu, 10 Aug 2023 19:42:24 +0300 Subject: [PATCH 061/169] Add Vicuna-v1.5 detection (#3524) --- models/config.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/models/config.yaml b/models/config.yaml index 23862770..3d5f48ff 100644 --- a/models/config.yaml +++ b/models/config.yaml @@ -53,6 +53,11 @@ llama-65b-gptq-3bit: .*vicuna.*(1.1|1_1|1.3|1_3): mode: 'instruct' instruction_template: 'Vicuna-v1.1' +.*vicuna.*(1.5|1_5): + mode: 'instruct' + instruction_template: 'Vicuna-v1.1' + truncation_length: 4096 + rms_norm_eps: 5.0e-6 .*stable.*vicuna: mode: 'instruct' instruction_template: 'StableVicuna' From 3929971b669e6807cce8bbcaebc0d9a0998464a5 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 10 Aug 2023 10:01:12 -0700 Subject: [PATCH 062/169] Don't show oobabooga_llama-tokenizer in the model dropdown --- modules/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/utils.py b/modules/utils.py index adaa15e8..011c71f1 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -71,7 +71,12 @@ def natural_keys(text): def get_available_models(): - return sorted([re.sub('.pth$', '', item.name) for item in list(Path(f'{shared.args.model_dir}/').glob('*')) if not item.name.endswith(('.txt', '-np', '.pt', '.json', '.yaml'))], key=natural_keys) + model_list = [] + for item in list(Path(f'{shared.args.model_dir}/').glob('*')): + if not item.name.endswith(('.txt', '-np', '.pt', '.json', '.yaml', '.py')) and 'llama-tokenizer' not in item.name: + model_list.append(re.sub('.pth$', '', item.name)) + + return sorted(model_list, key=natural_keys) def get_available_presets(): @@ -120,8 +125,3 @@ def get_datasets(path: str, ext: str): def get_available_chat_styles(): return sorted(set(('-'.join(k.stem.split('-')[1:]) for k in Path('css').glob('chat_style*.css'))), key=natural_keys) - - -def get_available_sessions(): - items = sorted(set(k.stem for k in Path('logs').glob(f'session_{shared.get_mode()}*')), key=natural_keys, reverse=True) - return [item for item in items if 'autosave' in item] + [item for item in items if 'autosave' not in item] From 0789554f65e31a089e4a81e8a47daf2932b762d6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 10 Aug 2023 09:54:28 -0700 Subject: [PATCH 063/169] Allow --lora to use an absolute path --- modules/LoRA.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/modules/LoRA.py b/modules/LoRA.py index 1350783f..10020552 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -17,6 +17,14 @@ def add_lora_to_model(lora_names): add_lora_transformers(lora_names) +def get_lora_path(lora_name): + p = Path(lora_name) + if p.exists(): + lora_name = p.parts[-1] + + return Path(f"{shared.args.lora_dir}/{lora_name}") + + def add_lora_exllama(lora_names): try: @@ -40,7 +48,7 @@ def add_lora_exllama(lora_names): if len(lora_names) > 1: logger.warning('ExLlama can only work with 1 LoRA at the moment. 
Only the first one in the list will be loaded.') - lora_path = Path(f"{shared.args.lora_dir}/{lora_names[0]}") + lora_path = get_lora_path(lora_names[0]) lora_config_path = lora_path / "adapter_config.json" lora_adapter_path = lora_path / "adapter_model.bin" @@ -81,7 +89,7 @@ def add_lora_autogptq(lora_names): inference_mode=True, ) - lora_path = Path(f"{shared.args.lora_dir}/{lora_names[0]}") + lora_path = get_lora_path(lora_names[0]) logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join([lora_names[0]]))) shared.model = get_gptq_peft_model(shared.model, peft_config, lora_path) shared.lora_names = [lora_names[0]] @@ -101,7 +109,7 @@ def add_lora_transformers(lora_names): if len(removed_set) == 0 and len(prior_set) > 0: logger.info(f"Adding the LoRA(s) named {added_set} to the model...") for lora in added_set: - shared.model.load_adapter(Path(f"{shared.args.lora_dir}/{lora}"), lora) + shared.model.load_adapter(get_lora_path(lora), lora) return @@ -123,9 +131,9 @@ def add_lora_transformers(lora_names): params['device_map'] = {"base_model.model." + k: v for k, v in shared.model.hf_device_map.items()} logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join(lora_names))) - shared.model = PeftModel.from_pretrained(shared.model, Path(f"{shared.args.lora_dir}/{lora_names[0]}"), adapter_name=lora_names[0], **params) + shared.model = PeftModel.from_pretrained(shared.model, get_lora_path(lora_names[0]), adapter_name=lora_names[0], **params) for lora in lora_names[1:]: - shared.model.load_adapter(Path(f"{shared.args.lora_dir}/{lora}"), lora) + shared.model.load_adapter(get_lora_path(lora), lora) shared.lora_names = lora_names From 8dbaa20ca8104aa5ead76dec13af3faa25f5d7e8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 10 Aug 2023 13:14:48 -0700 Subject: [PATCH 064/169] Don't replace last reply with an empty message --- modules/chat.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index c9af55db..c2a05d3f 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -347,7 +347,10 @@ def send_last_reply_to_input(history): def replace_last_reply(text, state): history = state['history'] - if len(history['visible']) > 0: + + if len(text.strip()) == 0: + return history + elif len(history['visible']) > 0: history['visible'][-1][1] = text history['internal'][-1][1] = apply_extensions('input', text, state) From 7a4fcee0697b35081e781ab67c70af0c459579eb Mon Sep 17 00:00:00 2001 From: cal066 <60696996+cal066@users.noreply.github.com> Date: Fri, 11 Aug 2023 17:41:33 +0000 Subject: [PATCH 065/169] Add ctransformers support (#3313) --------- Co-authored-by: cal066 Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> Co-authored-by: randoentity <137087500+randoentity@users.noreply.github.com> --- README.md | 2 +- modules/ctransformers_model.py | 76 +++++++++++++++++++++++ modules/loaders.py | 109 +++++++++++++++++++++++---------- modules/models.py | 23 ++++++- modules/shared.py | 2 + modules/text_generation.py | 6 +- modules/ui_model_menu.py | 8 ++- modules/ui_parameters.py | 2 +- requirements.txt | 3 + 9 files changed, 188 insertions(+), 43 deletions(-) create mode 100644 modules/ctransformers_model.py diff --git a/README.md b/README.md index ad2ad1ed..8ceb077c 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. 
## Features * 3 interface modes: default, notebook, and chat -* Multiple model backends: transformers, llama.cpp, ExLlama, AutoGPTQ, GPTQ-for-LLaMa +* Multiple model backends: transformers, llama.cpp, ExLlama, AutoGPTQ, GPTQ-for-LLaMa, ctransformers * Dropdown menu for quickly switching between different models * LoRA: load and unload LoRAs on the fly, train a new LoRA * Precise instruction templates for chat mode, including Llama 2, Alpaca, Vicuna, WizardLM, StableLM, and many others diff --git a/modules/ctransformers_model.py b/modules/ctransformers_model.py new file mode 100644 index 00000000..b3d001d3 --- /dev/null +++ b/modules/ctransformers_model.py @@ -0,0 +1,76 @@ +from ctransformers import AutoConfig, AutoModelForCausalLM + +from modules import shared +from modules.callbacks import Iteratorize +from modules.logging_colors import logger + + +class CtransformersModel: + def __init__(self): + pass + + @classmethod + def from_pretrained(self, path): + result = self() + + # ctransformers uses -1 for random seed + config = AutoConfig.from_pretrained( + str(path), + threads=shared.args.threads, + gpu_layers=shared.args.n_gpu_layers, + batch_size=shared.args.n_batch, + stream=True, + seed=(-1 if shared.args.llama_cpp_seed == 0 else shared.args.llama_cpp_seed) + ) + + self.model = AutoModelForCausalLM.from_pretrained( + str(result.model_dir(path) if result.model_type_is_auto() else path), + model_type=(None if result.model_type_is_auto() else shared.args.model_type), + config=config + ) + + logger.info(f'Using ctransformers model_type: {self.model.model_type} for {self.model.model_path}') + return result, result + + def model_type_is_auto(self): + return shared.args.model_type == "Auto" or shared.args.model_type == "None" + + def model_dir(self, path): + if path.is_file(): + return path.parent + + return path + + def encode(self, string, **kwargs): + return self.model.tokenize(string) + + def decode(self, ids): + return self.model.detokenize(ids) + + def generate(self, prompt, state, callback=None): + prompt = prompt if type(prompt) is str else prompt.decode() + generator = self.model._stream( + prompt=prompt, + max_new_tokens=state['max_new_tokens'], + temperature=state['temperature'], + top_p=state['top_p'], + top_k=state['top_k'], + repetition_penalty=state['repetition_penalty'], + threads=shared.args.threads + ) + + output = "" + for token in generator: + if callback: + callback(token) + + output += token + + return output + + def generate_with_streaming(self, *args, **kwargs): + with Iteratorize(self.generate, args, kwargs, callback=None) as generator: + reply = '' + for token in generator: + reply += token + yield reply diff --git a/modules/loaders.py b/modules/loaders.py index 07bc455c..fa5e03c2 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -1,10 +1,43 @@ import functools +from collections import OrderedDict import gradio as gr from modules import shared -loaders_and_params = { +loaders_and_params = OrderedDict({ + 'Transformers': [ + 'cpu_memory', + 'gpu_memory', + 'trust_remote_code', + 'load_in_8bit', + 'bf16', + 'cpu', + 'disk', + 'auto_devices', + 'load_in_4bit', + 'use_double_quant', + 'quant_type', + 'compute_dtype', + 'trust_remote_code', + 'alpha_value', + 'compress_pos_emb', + 'transformers_info' + ], + 'ExLlama_HF': [ + 'gpu_split', + 'max_seq_len', + 'alpha_value', + 'compress_pos_emb', + 'exllama_HF_info', + ], + 'ExLlama': [ + 'gpu_split', + 'max_seq_len', + 'alpha_value', + 'compress_pos_emb', + 'exllama_info', + ], 'AutoGPTQ': [ 'triton', 
'no_inject_fused_attention', @@ -59,39 +92,15 @@ loaders_and_params = { 'cpu', 'llamacpp_HF_info', ], - 'Transformers': [ - 'cpu_memory', - 'gpu_memory', - 'trust_remote_code', - 'load_in_8bit', - 'bf16', - 'cpu', - 'disk', - 'auto_devices', - 'load_in_4bit', - 'use_double_quant', - 'quant_type', - 'compute_dtype', - 'trust_remote_code', - 'alpha_value', - 'compress_pos_emb', - 'transformers_info' - ], - 'ExLlama': [ - 'gpu_split', - 'max_seq_len', - 'alpha_value', - 'compress_pos_emb', - 'exllama_info', - ], - 'ExLlama_HF': [ - 'gpu_split', - 'max_seq_len', - 'alpha_value', - 'compress_pos_emb', - 'exllama_HF_info', + 'ctransformers': [ + 'n_ctx', + 'n_gpu_layers', + 'n_batch', + 'threads', + 'model_type', + 'llama_cpp_seed', ] -} +}) loaders_samplers = { 'Transformers': { @@ -256,6 +265,33 @@ loaders_samplers = { 'skip_special_tokens', 'auto_max_new_tokens', }, + 'ctransformers': { + 'temperature', + 'top_p', + 'top_k', + 'repetition_penalty', + } +} + +loaders_model_types = { + 'GPTQ-for-LLaMa': [ + "None", + "llama", + "opt", + "gptj" + ], + 'ctransformers': [ + "None", + "gpt2", + "gptj", + "gptneox", + "llama", + "mpt", + "dollyv2" + "replit", + "starcoder", + "falcon" + ], } @@ -277,6 +313,13 @@ def blacklist_samplers(loader): return [gr.update(visible=True) if sampler in loaders_samplers[loader] else gr.update(visible=False) for sampler in all_samplers] +def get_model_types(loader): + if loader in loaders_model_types: + return loaders_model_types[loader] + + return ["None"] + + def get_gpu_memory_keys(): return [k for k in shared.gradio if k.startswith('gpu_memory')] diff --git a/modules/models.py b/modules/models.py index aad142c1..d60aecd0 100644 --- a/modules/models.py +++ b/modules/models.py @@ -58,7 +58,8 @@ def load_model(model_name, loader=None): 'llamacpp_HF': llamacpp_HF_loader, 'RWKV': RWKV_loader, 'ExLlama': ExLlama_loader, - 'ExLlama_HF': ExLlama_HF_loader + 'ExLlama_HF': ExLlama_HF_loader, + 'ctransformers': ctransformers_loader, } p = Path(model_name) @@ -242,7 +243,7 @@ def llamacpp_loader(model_name): else: model_file = list(Path(f'{shared.args.model_dir}/{model_name}').glob('*ggml*.bin'))[0] - logger.info(f"llama.cpp weights detected: {model_file}\n") + logger.info(f"llama.cpp weights detected: {model_file}") model, tokenizer = LlamaCppModel.from_pretrained(model_file) return model, tokenizer @@ -268,6 +269,24 @@ def llamacpp_HF_loader(model_name): return model, tokenizer +def ctransformers_loader(model_name): + from modules.ctransformers_model import CtransformersModel + + path = Path(f'{shared.args.model_dir}/{model_name}') + ctrans = CtransformersModel() + if ctrans.model_type_is_auto(): + model_file = path + else: + if path.is_file(): + model_file = path + else: + model_file = list(Path(f'{shared.args.model_dir}/{model_name}').glob('*.bin'))[0] + + logger.info(f'ctransformers weights detected: {model_file}') + model, tokenizer = ctrans.from_pretrained(model_file) + return model, tokenizer + + def GPTQ_loader(model_name): # Monkey patch diff --git a/modules/shared.py b/modules/shared.py index 224fa6aa..cb6f0ae1 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -215,6 +215,8 @@ def fix_loader_name(name): return 'ExLlama' elif name in ['exllama-hf', 'exllama_hf', 'exllama hf', 'ex-llama-hf', 'ex_llama_hf']: return 'ExLlama_HF' + elif name in ['ctransformers', 'ctranforemrs', 'ctransformer']: + return 'ctransformers' def add_extension(name): diff --git a/modules/text_generation.py b/modules/text_generation.py index 6e95414b..30e81355 100644 --- 
a/modules/text_generation.py +++ b/modules/text_generation.py @@ -41,7 +41,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False): yield '' return - if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel']: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel', 'CtransformersModel']: generate_func = generate_reply_custom else: generate_func = generate_reply_HF @@ -90,7 +90,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False): def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None): - if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel']: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'CtransformersModel']: input_ids = shared.tokenizer.encode(str(prompt)) input_ids = np.array(input_ids).reshape(1, len(input_ids)) else: @@ -104,7 +104,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if truncation_length is not None: input_ids = input_ids[:, -truncation_length:] - if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel'] or shared.args.cpu: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel', 'CtransformersModel'] or shared.args.cpu: return input_ids elif shared.args.deepspeed: return input_ids.to(device=local_rank) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 0c1042f6..7b852a44 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -63,7 +63,7 @@ def create_ui(): with gr.Row(): with gr.Column(): - shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=["Transformers", "ExLlama_HF", "ExLlama", "AutoGPTQ", "GPTQ-for-LLaMa", "llama.cpp", "llamacpp_HF"], value=None) + shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys(), value=None) with gr.Box(): with gr.Row(): with gr.Column(): @@ -84,7 +84,7 @@ def create_ui(): shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=str(shared.args.wbits) if shared.args.wbits > 0 else "None") shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=str(shared.args.groupsize) if shared.args.groupsize > 0 else "None") - shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None", "llama", "opt", "gptj"], value=shared.args.model_type or "None") + shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None"], value=shared.args.model_type or "None") shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0) shared.gradio['autogptq_info'] = gr.Markdown('* ExLlama_HF is recommended over AutoGPTQ for models derived from LLaMA.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. 
Example: 20,7,7') @@ -127,7 +127,9 @@ def create_ui(): def create_event_handlers(): - shared.gradio['loader'].change(loaders.make_loader_params_visible, gradio('loader'), gradio(loaders.get_all_params())) + shared.gradio['loader'].change( + loaders.make_loader_params_visible, gradio('loader'), gradio(loaders.get_all_params())).then( + lambda value: gr.update(choices=loaders.get_model_types(value)), gradio('loader'), gradio('model_type')) # In this event handler, the interface state is read and updated # with the model defaults (if any), and then the model is loaded diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index 75bce9b1..4b9fb918 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -16,7 +16,7 @@ def create_ui(default_preset): shared.gradio['delete_preset'] = gr.Button('🗑️', elem_classes='refresh-button') with gr.Column(): - shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All", "Transformers", "ExLlama_HF", "ExLlama", "AutoGPTQ", "GPTQ-for-LLaMa", "llama.cpp", "llamacpp_HF"], value="All", elem_classes='slim-dropdown') + shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()), value="All", elem_classes='slim-dropdown') with gr.Row(): with gr.Column(): diff --git a/requirements.txt b/requirements.txt index b27e14c5..ec6a7e47 100644 --- a/requirements.txt +++ b/requirements.txt @@ -40,3 +40,6 @@ https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/text # GPTQ-for-LLaMa https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" + +# ctransformers +https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.20+cu117-py3-none-any.whl From 28c8df337bc14a752f66f8ee258b73ee621329b1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 11 Aug 2023 11:02:56 -0700 Subject: [PATCH 066/169] Add repetition_penalty_range to ctransformers --- modules/ctransformers_model.py | 1 + modules/loaders.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/ctransformers_model.py b/modules/ctransformers_model.py index b3d001d3..c5bc701a 100644 --- a/modules/ctransformers_model.py +++ b/modules/ctransformers_model.py @@ -56,6 +56,7 @@ class CtransformersModel: top_p=state['top_p'], top_k=state['top_k'], repetition_penalty=state['repetition_penalty'], + last_n_tokens=state['repetition_penalty_range'], threads=shared.args.threads ) diff --git a/modules/loaders.py b/modules/loaders.py index fa5e03c2..21854de7 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -86,7 +86,6 @@ loaders_and_params = OrderedDict({ 'no_mmap', 'low_vram', 'mlock', - 'llama_cpp_seed', 'alpha_value', 'compress_pos_emb', 'cpu', @@ -270,6 +269,7 @@ loaders_samplers = { 'top_p', 'top_k', 'repetition_penalty', + 'repetition_penalty_range', } } From 2f918ccf7cc0b9b1901714c2f70e8d8f124cb6bf Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 11 Aug 2023 11:15:22 -0700 Subject: [PATCH 067/169] Remove unused parameter --- modules/loaders.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/loaders.py b/modules/loaders.py index 
21854de7..9a222a72 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -92,7 +92,6 @@ loaders_and_params = OrderedDict({ 'llamacpp_HF_info', ], 'ctransformers': [ - 'n_ctx', 'n_gpu_layers', 'n_batch', 'threads', From 4c450e6b7073fb04614adf1f90845107c44174ea Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 11 Aug 2023 15:50:16 -0300 Subject: [PATCH 068/169] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8ceb077c..6694e500 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. ## Features * 3 interface modes: default, notebook, and chat -* Multiple model backends: transformers, llama.cpp, ExLlama, AutoGPTQ, GPTQ-for-LLaMa, ctransformers +* Multiple model backends: [transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), [ExLlama](https://github.com/turboderp/exllama), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [ctransformers](https://github.com/marella/ctransformers) * Dropdown menu for quickly switching between different models * LoRA: load and unload LoRAs on the fly, train a new LoRA * Precise instruction templates for chat mode, including Llama 2, Alpaca, Vicuna, WizardLM, StableLM, and many others From 0e05818266570454c16d7c45b656338c99f22c46 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 11 Aug 2023 16:33:15 -0700 Subject: [PATCH 069/169] Style changes --- modules/ui_chat.py | 31 ++++++++++++++----------------- modules/ui_default.py | 8 ++++---- modules/ui_notebook.py | 17 ++++++++--------- 3 files changed, 26 insertions(+), 30 deletions(-) diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 4471d2f4..1d73adf7 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -12,13 +12,10 @@ from modules.utils import gradio def create_ui(): - - shared.gradio.update({ - 'interface_state': gr.State({k: None for k in shared.input_elements}), - 'Chat input': gr.State(), - 'dummy': gr.State(), - 'history': gr.State({'internal': [], 'visible': []}), - }) + shared.gradio['interface_state'] = gr.State({k: None for k in shared.input_elements}) + shared.gradio['Chat input'] = gr.State() + shared.gradio['dummy'] = gr.State() + shared.gradio['history'] = gr.State({'internal': [], 'visible': []}) with gr.Tab('Text generation', elem_id='main'): shared.gradio['display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, shared.settings['name1'], shared.settings['name2'], 'chat', 'cai-chat')) @@ -52,7 +49,7 @@ def create_ui(): shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct') with gr.Tab('Chat settings', elem_id='chat-settings'): - with gr.Tab("Character"): + with gr.Tab('Character'): with gr.Row(): with gr.Column(scale=8): with gr.Row(): @@ -70,7 +67,7 @@ def create_ui(): shared.gradio['character_picture'] = gr.Image(label='Character picture', type='pil') shared.gradio['your_picture'] = gr.Image(label='Your picture', type='pil', value=Image.open(Path('cache/pfp_me.png')) if Path('cache/pfp_me.png').exists() else None) - with gr.Tab("Instruction template"): + with gr.Tab('Instruction template'): with gr.Row(): with gr.Row(): 
shared.gradio['instruction_template'] = gr.Dropdown(choices=utils.get_available_instruction_templates(), label='Instruction template', value='None', info='Change this according to the model/LoRA that you are using. Used in instruct and chat-instruct modes.', elem_classes='slim-dropdown') @@ -91,7 +88,7 @@ def create_ui(): shared.gradio['save_chat_history'] = gr.Button(value='Save history') with gr.Column(): - shared.gradio['load_chat_history'] = gr.File(type='binary', file_types=['.json', '.txt'], label="Upload History JSON") + shared.gradio['load_chat_history'] = gr.File(type='binary', file_types=['.json', '.txt'], label='Upload History JSON') with gr.Tab('Upload character'): with gr.Tab('YAML or JSON'): @@ -104,7 +101,7 @@ def create_ui(): with gr.Tab('TavernAI PNG'): with gr.Row(): with gr.Column(): - shared.gradio['upload_img_tavern'] = gr.Image(type='pil', label='TavernAI PNG File', elem_id="upload_img_tavern") + shared.gradio['upload_img_tavern'] = gr.Image(type='pil', label='TavernAI PNG File', elem_id='upload_img_tavern') shared.gradio['tavern_json'] = gr.State() with gr.Column(): shared.gradio['tavern_name'] = gr.Textbox(value='', lines=1, label='Name', interactive=False) @@ -126,7 +123,7 @@ def create_event_handlers(): chat.generate_chat_reply_wrapper, shared.input_params, gradio('display', 'history'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( - lambda: None, None, None, _js=f"() => {{{ui.audio_notification_js}}}") + lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') ) gen_events.append(shared.gradio['textbox'].submit( @@ -135,7 +132,7 @@ def create_event_handlers(): chat.generate_chat_reply_wrapper, shared.input_params, gradio('display', 'history'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( - lambda: None, None, None, _js=f"() => {{{ui.audio_notification_js}}}") + lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') ) gen_events.append(shared.gradio['Regenerate'].click( @@ -143,7 +140,7 @@ def create_event_handlers(): partial(chat.generate_chat_reply_wrapper, regenerate=True), shared.input_params, gradio('display', 'history'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( - lambda: None, None, None, _js=f"() => {{{ui.audio_notification_js}}}") + lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') ) gen_events.append(shared.gradio['Continue'].click( @@ -151,7 +148,7 @@ def create_event_handlers(): partial(chat.generate_chat_reply_wrapper, _continue=True), shared.input_params, gradio('display', 'history'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( - lambda: None, None, None, _js=f"() => {{{ui.audio_notification_js}}}") + lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') ) gen_events.append(shared.gradio['Impersonate'].click( @@ -159,7 +156,7 @@ def create_event_handlers(): lambda x: x, gradio('textbox'), gradio('Chat input'), show_progress=False).then( 
chat.impersonate_wrapper, shared.input_params, gradio('textbox'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, _js=f"() => {{{ui.audio_notification_js}}}") + lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') ) shared.gradio['Replace last reply'].click( @@ -243,7 +240,7 @@ def create_event_handlers(): shared.gradio['save_chat_history'].click( lambda x: json.dumps(x, indent=4), gradio('history'), gradio('temporary_text')).then( - None, gradio('temporary_text', 'character_menu', 'mode'), None, _js=f"(hist, char, mode) => {{{ui.save_files_js}; saveHistory(hist, char, mode)}}") + None, gradio('temporary_text', 'character_menu', 'mode'), None, _js=f'(hist, char, mode) => {{{ui.save_files_js}; saveHistory(hist, char, mode)}}') shared.gradio['Submit character'].click( chat.upload_character, gradio('upload_json', 'upload_img_bot'), gradio('character_menu')).then( diff --git a/modules/ui_default.py b/modules/ui_default.py index f0ab74ef..b879e1ef 100644 --- a/modules/ui_default.py +++ b/modules/ui_default.py @@ -15,7 +15,7 @@ def create_ui(): shared.gradio['interface_state'] = gr.State({k: None for k in shared.input_elements}) shared.gradio['last_input'] = gr.State('') - with gr.Tab("Text generation", elem_id="main"): + with gr.Tab('Text generation', elem_id='main'): with gr.Row(): with gr.Column(): shared.gradio['textbox'] = gr.Textbox(value=default_text, elem_classes=['textbox_default', 'add_scrollbar'], lines=27, label='Input') @@ -56,7 +56,7 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, shared.input_params, output_params, show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, _js=f"() => {{{ui.audio_notification_js}}}") + lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") ) @@ -65,7 +65,7 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, shared.input_params, output_params, show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, _js=f"() => {{{ui.audio_notification_js}}}") + lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") ) @@ -74,7 +74,7 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, [shared.gradio['output_textbox']] + shared.input_params[1:], output_params, show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, _js=f"() => {{{ui.audio_notification_js}}}") + lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[1]; element.scrollTop = element.scrollHeight}") ) diff --git a/modules/ui_notebook.py b/modules/ui_notebook.py index 998a7cf7..9e8b3af6 100644 --- a/modules/ui_notebook.py +++ 
b/modules/ui_notebook.py @@ -11,11 +11,10 @@ from modules.utils import gradio def create_ui(): default_text = load_prompt(shared.settings['prompt']) - shared.gradio['interface_state'] = gr.State({k: None for k in shared.input_elements}) shared.gradio['last_input'] = gr.State('') - with gr.Tab("Text generation", elem_id="main"): + with gr.Tab('Text generation', elem_id='main'): with gr.Row(): with gr.Column(scale=4): with gr.Tab('Raw'): @@ -29,10 +28,10 @@ def create_ui(): shared.gradio['html'] = gr.HTML() with gr.Row(): - shared.gradio['Generate'] = gr.Button('Generate', variant='primary', elem_classes="small-button") - shared.gradio['Stop'] = gr.Button('Stop', elem_classes="small-button", elem_id='stop') - shared.gradio['Undo'] = gr.Button('Undo', elem_classes="small-button") - shared.gradio['Regenerate'] = gr.Button('Regenerate', elem_classes="small-button") + shared.gradio['Generate'] = gr.Button('Generate', variant='primary', elem_classes='small-button') + shared.gradio['Stop'] = gr.Button('Stop', elem_classes='small-button', elem_id='stop') + shared.gradio['Undo'] = gr.Button('Undo', elem_classes='small-button') + shared.gradio['Regenerate'] = gr.Button('Regenerate', elem_classes='small-button') with gr.Column(scale=1): gr.HTML('
    ') @@ -58,7 +57,7 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, shared.input_params, output_params, show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, _js=f"() => {{{ui.audio_notification_js}}}") + lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") ) @@ -67,7 +66,7 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, shared.input_params, output_params, show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, _js=f"() => {{{ui.audio_notification_js}}}") + lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") ) @@ -78,7 +77,7 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, shared.input_params, output_params, show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, _js=f"() => {{{ui.audio_notification_js}}}") + lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") ) From 0230fa4e9c91b99e95f8b85385cc385fb1fdcbfe Mon Sep 17 00:00:00 2001 From: Chris Lefever Date: Sat, 12 Aug 2023 02:26:58 -0400 Subject: [PATCH 070/169] Add the --disable_exllama option for AutoGPTQ --- README.md | 1 + modules/AutoGPTQ_loader.py | 1 + modules/loaders.py | 1 + modules/shared.py | 1 + modules/ui.py | 1 + modules/ui_model_menu.py | 1 + 6 files changed, 6 insertions(+) diff --git a/README.md b/README.md index 6694e500..278e5e3a 100644 --- a/README.md +++ b/README.md @@ -262,6 +262,7 @@ Optionally, you can use the following command-line flags: | `--no_inject_fused_mlp` | Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference. | | `--no_use_cuda_fp16` | This can make models faster on some systems. | | `--desc_act` | For models that don't have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig. | +| `--disable_exllama` | Disable ExLlama kernel, which can improve inference speed on some systems. 
| #### ExLlama diff --git a/modules/AutoGPTQ_loader.py b/modules/AutoGPTQ_loader.py index 0d41ac0a..987f5ba7 100644 --- a/modules/AutoGPTQ_loader.py +++ b/modules/AutoGPTQ_loader.py @@ -50,6 +50,7 @@ def load_quantized(model_name): 'max_memory': get_max_memory_dict(), 'quantize_config': quantize_config, 'use_cuda_fp16': not shared.args.no_use_cuda_fp16, + 'disable_exllama': shared.args.disable_exllama, } logger.info(f"The AutoGPTQ params are: {params}") diff --git a/modules/loaders.py b/modules/loaders.py index 9a222a72..a96c43ea 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -46,6 +46,7 @@ loaders_and_params = OrderedDict({ 'wbits', 'groupsize', 'desc_act', + 'disable_exllama', 'gpu_memory', 'cpu_memory', 'cpu', diff --git a/modules/shared.py b/modules/shared.py index cb6f0ae1..ba89fb52 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -145,6 +145,7 @@ parser.add_argument('--no_inject_fused_attention', action='store_true', help='Do parser.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: Do not use fused MLP (lowers VRAM requirements).') parser.add_argument('--no_use_cuda_fp16', action='store_true', help='This can make models faster on some systems.') parser.add_argument('--desc_act', action='store_true', help='For models that don\'t have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.') +parser.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel, which can improve inference speed on some systems.') # ExLlama parser.add_argument('--gpu-split', type=str, help="Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. 20,7,7") diff --git a/modules/ui.py b/modules/ui.py index b58b7dd6..37284d25 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -58,6 +58,7 @@ def list_model_elements(): 'no_inject_fused_attention', 'no_inject_fused_mlp', 'no_use_cuda_fp16', + 'disable_exllama', 'threads', 'n_batch', 'no_mmap', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 7b852a44..3059f616 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -98,6 +98,7 @@ def create_ui(): shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. 
Disable if running low on VRAM.') shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.') shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.') + shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel, which can improve inference speed on some systems.') shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu) shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) From 73421b1fedc46e013d35ceaf8f2adba01c5c20a8 Mon Sep 17 00:00:00 2001 From: jllllll <3887729+jllllll@users.noreply.github.com> Date: Sat, 12 Aug 2023 21:02:47 -0500 Subject: [PATCH 071/169] Bump ctransformers wheel version (#3558) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ec6a7e47..c4595d48 100644 --- a/requirements.txt +++ b/requirements.txt @@ -42,4 +42,4 @@ https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_ https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" # ctransformers -https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.20+cu117-py3-none-any.whl +https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.22+cu117-py3-none-any.whl From bf70c19603627a5bbf6b5a2370bddc9ac45d81db Mon Sep 17 00:00:00 2001 From: cal066 <60696996+cal066@users.noreply.github.com> Date: Sun, 13 Aug 2023 03:04:03 +0000 Subject: [PATCH 072/169] ctransformers: move thread and seed parameters (#3543) --- modules/ctransformers_model.py | 7 +++---- modules/loaders.py | 3 +-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/modules/ctransformers_model.py b/modules/ctransformers_model.py index c5bc701a..f5641616 100644 --- a/modules/ctransformers_model.py +++ b/modules/ctransformers_model.py @@ -13,14 +13,12 @@ class CtransformersModel: def from_pretrained(self, path): result = self() - # ctransformers uses -1 for random seed config = AutoConfig.from_pretrained( str(path), threads=shared.args.threads, gpu_layers=shared.args.n_gpu_layers, batch_size=shared.args.n_batch, - stream=True, - seed=(-1 if shared.args.llama_cpp_seed == 0 else shared.args.llama_cpp_seed) + stream=True ) self.model = AutoModelForCausalLM.from_pretrained( @@ -49,6 +47,7 @@ class CtransformersModel: def generate(self, prompt, state, callback=None): prompt = prompt if type(prompt) is str else prompt.decode() + # ctransformers uses -1 for random seed generator = self.model._stream( prompt=prompt, max_new_tokens=state['max_new_tokens'], @@ -57,7 +56,7 @@ class CtransformersModel: top_k=state['top_k'], repetition_penalty=state['repetition_penalty'], last_n_tokens=state['repetition_penalty_range'], - threads=shared.args.threads + seed=state['seed'] ) output = "" diff --git a/modules/loaders.py b/modules/loaders.py index 9a222a72..2b3a50b3 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -95,8 +95,7 @@ loaders_and_params = OrderedDict({ 'n_gpu_layers', 'n_batch', 'threads', - 'model_type', 
- 'llama_cpp_seed', + 'model_type' ] }) From a1a9ec895d96b27f2c03cf5df4bce679f3abaf91 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 13 Aug 2023 01:12:15 -0300 Subject: [PATCH 073/169] Unify the 3 interface modes (#3554) --- README.md | 2 - api-examples/api-example-chat-stream.py | 2 - api-examples/api-example-chat.py | 2 - css/chat.css | 146 ---------------- css/main.css | 156 +++++++++++++++-- docs/Extensions.md | 8 +- extensions/api/util.py | 2 - extensions/elevenlabs_tts/script.py | 35 ++-- extensions/example/script.py | 4 +- extensions/gallery/script.js | 14 ++ extensions/gallery/script.py | 7 +- extensions/send_pictures/script.py | 4 +- extensions/silero_tts/script.py | 33 ++-- extensions/superbooga/script.py | 6 +- js/main.js | 31 +++- js/save_files.js | 4 +- modules/chat.py | 99 ++++------- modules/extensions.py | 30 +++- modules/shared.py | 29 +--- modules/ui.py | 54 +++--- modules/ui_chat.py | 155 ++++++++--------- modules/ui_default.py | 70 ++++---- modules/ui_file_saving.py | 66 ++++--- modules/ui_notebook.py | 77 ++++----- modules/ui_parameters.py | 220 +++++++++++------------- modules/ui_session.py | 50 +++--- modules/utils.py | 2 +- server.py | 59 +++---- settings-template.yaml | 7 +- 29 files changed, 660 insertions(+), 714 deletions(-) delete mode 100644 css/chat.css create mode 100644 extensions/gallery/script.js diff --git a/README.md b/README.md index 6694e500..73ae33bd 100644 --- a/README.md +++ b/README.md @@ -189,8 +189,6 @@ Optionally, you can use the following command-line flags: | Flag | Description | |--------------------------------------------|-------------| | `-h`, `--help` | Show this help message and exit. | -| `--notebook` | Launch the web UI in notebook mode, where the output is written to the same text box as the input. | -| `--chat` | Launch the web UI in chat mode. | | `--multi-user` | Multi-user mode. Chat histories are not saved or automatically loaded. WARNING: this is highly experimental. | | `--character CHARACTER` | The name of the character to load in chat mode by default. | | `--model MODEL` | Name of the model to load by default. | diff --git a/api-examples/api-example-chat-stream.py b/api-examples/api-example-chat-stream.py index 055900bd..cccd5b26 100644 --- a/api-examples/api-example-chat-stream.py +++ b/api-examples/api-example-chat-stream.py @@ -36,8 +36,6 @@ async def run(user_input, history): # 'turn_template': 'turn_template', # Optional 'regenerate': False, '_continue': False, - 'stop_at_newline': False, - 'chat_generation_attempts': 1, 'chat_instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>', # Generation params. If 'preset' is set to different than 'None', the values diff --git a/api-examples/api-example-chat.py b/api-examples/api-example-chat.py index c3d0c538..c197a584 100644 --- a/api-examples/api-example-chat.py +++ b/api-examples/api-example-chat.py @@ -30,8 +30,6 @@ def run(user_input, history): # 'turn_template': 'turn_template', # Optional 'regenerate': False, '_continue': False, - 'stop_at_newline': False, - 'chat_generation_attempts': 1, 'chat_instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>', # Generation params. 
If 'preset' is set to different than 'None', the values diff --git a/css/chat.css b/css/chat.css deleted file mode 100644 index 677d86db..00000000 --- a/css/chat.css +++ /dev/null @@ -1,146 +0,0 @@ -.h-\[40vh\], .wrap.svelte-byatnx.svelte-byatnx.svelte-byatnx { - height: 66.67vh -} - -.gradio-container { - margin-left: auto !important; - margin-right: auto !important; -} - -.w-screen { - width: unset -} - -div.svelte-362y77>*, div.svelte-362y77>.form>* { - flex-wrap: nowrap -} - -/* fixes the API documentation in chat mode */ -.api-docs.svelte-1iguv9h.svelte-1iguv9h.svelte-1iguv9h { - display: grid; -} - -.pending.svelte-1ed2p3z { - opacity: 1; -} - -#extensions { - padding: 0; -} - -#gradio-chatbot { - height: 66.67vh; -} - -.wrap.svelte-6roggh.svelte-6roggh { - max-height: 92.5%; -} - -/* This is for the microphone button in the whisper extension */ -.sm.svelte-1ipelgc { - width: 100%; -} - -#main button { - min-width: 0 !important; -} - -#main > :first-child, #extensions { - max-width: 800px; - margin-left: auto; - margin-right: auto; -} - -@media screen and (max-width: 688px) { - #main { - padding: 0px; - } - - .chat { - height: calc(100vh - 274px) !important; - } -} - -/*****************************************************/ -/*************** Chat box declarations ***************/ -/*****************************************************/ - -.chat { - margin-left: auto; - margin-right: auto; - max-width: 800px; - height: calc(100vh - 286px); - overflow-y: auto; - padding-right: 20px; - display: flex; - flex-direction: column-reverse; - word-break: break-word; - overflow-wrap: anywhere; - padding-top: 1px; -} - -.chat > .messages { - display: flex; - flex-direction: column; -} - -.message-body li { - margin-top: 0.5em !important; - margin-bottom: 0.5em !important; -} - -.message-body li > p { - display: inline !important; -} - -.message-body ul, .message-body ol { - font-size: 15px !important; -} - -.message-body ul { - list-style-type: disc !important; -} - -.message-body pre { - margin-bottom: 1.25em !important; -} - -.message-body code { - white-space: pre-wrap !important; - word-wrap: break-word !important; -} - -.message-body :not(pre) > code { - white-space: normal !important; -} - -@media print { - body { - visibility: hidden; - } - - .chat { - visibility: visible; - position: absolute; - left: 0; - top: 0; - max-width: none; - max-height: none; - width: 100%; - height: fit-content; - display: flex; - flex-direction: column-reverse; - } - - .message { - break-inside: avoid; - } - - .gradio-container { - overflow: visible; - } - - .tab-nav { - display: none !important; - } -} diff --git a/css/main.css b/css/main.css index d37e3f63..5f293921 100644 --- a/css/main.css +++ b/css/main.css @@ -45,13 +45,6 @@ min-height: 0 } -#save_session { - margin-top: 32px; -} - -#accordion { -} - .dark svg { fill: white; } @@ -64,7 +57,7 @@ ol li p, ul li p { display: inline-block; } -#main, #parameters, #chat-settings, #lora, #training-tab, #model-tab, #session-tab { +#chat-tab, #default-tab, #notebook-tab, #parameters, #chat-settings, #lora, #training-tab, #model-tab, #session-tab { border: 0; } @@ -78,7 +71,6 @@ ol li p, ul li p { } #extensions { - padding: 15px; margin-bottom: 35px; } @@ -108,7 +100,7 @@ div.svelte-15lo0d8 > *, div.svelte-15lo0d8 > .form > * { } .textbox_default textarea { - height: calc(100vh - 380px); + height: calc(100vh - 310px); } .textbox_default_output textarea { @@ -128,6 +120,12 @@ div.svelte-15lo0d8 > *, div.svelte-15lo0d8 > .form > * { color: #efefef !important; } 
+@media screen and (max-width: 711px) { + .textbox_default textarea { + height: calc(100vh - 275px); + } +} + /* Hide the gradio footer*/ footer { display: none !important; @@ -193,3 +191,141 @@ button { .dark .pretty_scrollbar::-webkit-resizer { background: #374151; } + +/*****************************************************/ +/*************** Chat UI declarations ****************/ +/*****************************************************/ + +.h-\[40vh\], .wrap.svelte-byatnx.svelte-byatnx.svelte-byatnx { + height: 66.67vh +} + +.gradio-container { + margin-left: auto !important; + margin-right: auto !important; +} + +.w-screen { + width: unset +} + +div.svelte-362y77>*, div.svelte-362y77>.form>* { + flex-wrap: nowrap +} + +.pending.svelte-1ed2p3z { + opacity: 1; +} + +#gradio-chatbot { + height: 66.67vh; +} + +.wrap.svelte-6roggh.svelte-6roggh { + max-height: 92.5%; +} + +/* This is for the microphone button in the whisper extension */ +.sm.svelte-1ipelgc { + width: 100%; +} + +#chat-tab button, #notebook-tab button, #default-tab button { + min-width: 0 !important; +} + +#chat-tab > :first-child, #extensions { + max-width: 800px; + margin-left: auto; + margin-right: auto; +} + +@media screen and (max-width: 688px) { + #chat-tab { + padding: 0px; + } + + .chat { + height: calc(100vh - 274px) !important; + } +} + +.chat { + margin-left: auto; + margin-right: auto; + max-width: 800px; + height: calc(100vh - 286px); + overflow-y: auto; + padding-right: 20px; + display: flex; + flex-direction: column-reverse; + word-break: break-word; + overflow-wrap: anywhere; + padding-top: 1px; +} + +.chat > .messages { + display: flex; + flex-direction: column; +} + +.message-body li { + margin-top: 0.5em !important; + margin-bottom: 0.5em !important; +} + +.message-body li > p { + display: inline !important; +} + +.message-body ul, .message-body ol { + font-size: 15px !important; +} + +.message-body ul { + list-style-type: disc !important; +} + +.message-body pre { + margin-bottom: 1.25em !important; +} + +.message-body code { + white-space: pre-wrap !important; + word-wrap: break-word !important; +} + +.message-body :not(pre) > code { + white-space: normal !important; +} + +@media print { + body { + visibility: hidden; + } + + .chat { + visibility: visible; + position: absolute; + left: 0; + top: 0; + max-width: none; + max-height: none; + width: 100%; + height: fit-content; + display: flex; + flex-direction: column-reverse; + } + + .message { + break-inside: avoid; + } + + .gradio-container { + overflow: visible; + } + + .tab-nav { + display: none !important; + } +} diff --git a/docs/Extensions.md b/docs/Extensions.md index 4e59e855..53acce59 100644 --- a/docs/Extensions.md +++ b/docs/Extensions.md @@ -39,8 +39,8 @@ The extensions framework is based on special functions and variables that you ca | `def ui()` | Creates custom gradio elements when the UI is launched. | | `def custom_css()` | Returns custom CSS as a string. It is applied whenever the web UI is loaded. | | `def custom_js()` | Same as above but for javascript. | -| `def input_modifier(string, state)` | Modifies the input string before it enters the model. In chat mode, it is applied to the user message. Otherwise, it is applied to the entire prompt. | -| `def output_modifier(string, state)` | Modifies the output string before it is presented in the UI. In chat mode, it is applied to the bot's reply. Otherwise, it is applied to the entire output. 
| +| `def input_modifier(string, state, is_chat=False)` | Modifies the input string before it enters the model. In chat mode, it is applied to the user message. Otherwise, it is applied to the entire prompt. | +| `def output_modifier(string, state, is_chat=False)` | Modifies the output string before it is presented in the UI. In chat mode, it is applied to the bot's reply. Otherwise, it is applied to the entire output. | | `def chat_input_modifier(text, visible_text, state)` | Modifies both the visible and internal inputs in chat mode. Can be used to hijack the chat input with custom content. | | `def bot_prefix_modifier(string, state)` | Applied in chat mode to the prefix for the bot's reply. | | `def state_modifier(state)` | Modifies the dictionary containing the UI input parameters before it is used by the text generation functions. | @@ -163,7 +163,7 @@ def chat_input_modifier(text, visible_text, state): """ return text, visible_text -def input_modifier(string, state): +def input_modifier(string, state, is_chat=False): """ In default/notebook modes, modifies the whole prompt. @@ -196,7 +196,7 @@ def logits_processor_modifier(processor_list, input_ids): processor_list.append(MyLogits()) return processor_list -def output_modifier(string, state): +def output_modifier(string, state, is_chat=False): """ Modifies the LLM output before it gets presented. diff --git a/extensions/api/util.py b/extensions/api/util.py index 7ebfaa32..0db1c46c 100644 --- a/extensions/api/util.py +++ b/extensions/api/util.py @@ -68,8 +68,6 @@ def build_parameters(body, chat=False): name1, name2, _, greeting, context, _ = load_character_memoized(character, str(body.get('your_name', shared.settings['name1'])), shared.settings['name2'], instruct=False) name1_instruct, name2_instruct, _, _, context_instruct, turn_template = load_character_memoized(instruction_template, '', '', instruct=True) generate_params.update({ - 'stop_at_newline': bool(body.get('stop_at_newline', shared.settings['stop_at_newline'])), - 'chat_generation_attempts': int(body.get('chat_generation_attempts', shared.settings['chat_generation_attempts'])), 'mode': str(body.get('mode', 'chat')), 'name1': str(body.get('name1', name1)), 'name2': str(body.get('name2', name2)), diff --git a/extensions/elevenlabs_tts/script.py b/extensions/elevenlabs_tts/script.py index f74e1047..2324d782 100644 --- a/extensions/elevenlabs_tts/script.py +++ b/extensions/elevenlabs_tts/script.py @@ -4,9 +4,9 @@ from pathlib import Path import elevenlabs import gradio as gr -from modules import chat, shared -from modules.utils import gradio +from modules import chat, shared, ui_chat from modules.logging_colors import logger +from modules.utils import gradio params = { 'activate': True, @@ -167,24 +167,23 @@ def ui(): convert_cancel = gr.Button('Cancel', visible=False) convert_confirm = gr.Button('Confirm (cannot be undone)', variant="stop", visible=False) - if shared.is_chat(): - # Convert history with confirmation - convert_arr = [convert_confirm, convert, convert_cancel] - convert.click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, convert_arr) - convert_confirm.click( - lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr).then( - remove_tts_from_history, gradio('history'), gradio('history')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( - chat.redraw_html, shared.reload_inputs, gradio('display')) + # Convert history with 
confirmation + convert_arr = [convert_confirm, convert, convert_cancel] + convert.click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, convert_arr) + convert_confirm.click( + lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr).then( + remove_tts_from_history, gradio('history'), gradio('history')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( + chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display')) - convert_cancel.click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr) + convert_cancel.click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr) - # Toggle message text in history - show_text.change( - lambda x: params.update({"show_text": x}), show_text, None).then( - toggle_text_in_history, gradio('history'), gradio('history')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( - chat.redraw_html, shared.reload_inputs, gradio('display')) + # Toggle message text in history + show_text.change( + lambda x: params.update({"show_text": x}), show_text, None).then( + toggle_text_in_history, gradio('history'), gradio('history')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( + chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display')) # Event functions to update the parameters in the backend activate.change(lambda x: params.update({'activate': x}), activate, None) diff --git a/extensions/example/script.py b/extensions/example/script.py index b4db7102..44f0cb3c 100644 --- a/extensions/example/script.py +++ b/extensions/example/script.py @@ -59,7 +59,7 @@ def chat_input_modifier(text, visible_text, state): """ return text, visible_text -def input_modifier(string, state): +def input_modifier(string, state, is_chat=False): """ In default/notebook modes, modifies the whole prompt. @@ -92,7 +92,7 @@ def logits_processor_modifier(processor_list, input_ids): processor_list.append(MyLogits()) return processor_list -def output_modifier(string, state): +def output_modifier(string, state, is_chat=False): """ Modifies the LLM output before it gets presented. 
diff --git a/extensions/gallery/script.js b/extensions/gallery/script.js new file mode 100644 index 00000000..878401ec --- /dev/null +++ b/extensions/gallery/script.js @@ -0,0 +1,14 @@ +let gallery_element = document.getElementById('gallery-extension'); + +main_parent.addEventListener('click', function(e) { + let chat_visible = (chat_tab.offsetHeight > 0 && chat_tab.offsetWidth > 0); + let notebook_visible = (notebook_tab.offsetHeight > 0 && notebook_tab.offsetWidth > 0); + let default_visible = (default_tab.offsetHeight > 0 && default_tab.offsetWidth > 0); + + // Only show this extension in the Chat tab + if (chat_visible) { + gallery_element.style.display = 'flex'; + } else { + gallery_element.style.display = 'none'; + } +}); diff --git a/extensions/gallery/script.py b/extensions/gallery/script.py index 993ef273..611a11f4 100644 --- a/extensions/gallery/script.py +++ b/extensions/gallery/script.py @@ -82,8 +82,13 @@ def select_character(evt: gr.SelectData): return (evt.value[1]) +def custom_js(): + path_to_js = Path(__file__).parent.resolve() / 'script.js' + return open(path_to_js, 'r').read() + + def ui(): - with gr.Accordion("Character gallery", open=False): + with gr.Accordion("Character gallery", open=False, elem_id='gallery-extension'): update = gr.Button("Refresh") gr.HTML(value="") gallery = gr.Dataset(components=[gr.HTML(visible=False)], diff --git a/extensions/send_pictures/script.py b/extensions/send_pictures/script.py index 39c9362a..f8e6c969 100644 --- a/extensions/send_pictures/script.py +++ b/extensions/send_pictures/script.py @@ -5,7 +5,7 @@ import gradio as gr import torch from transformers import BlipForConditionalGeneration, BlipProcessor -from modules import chat, shared +from modules import chat, shared, ui_chat from modules.ui import gather_interface_values from modules.utils import gradio @@ -54,5 +54,5 @@ def ui(): "value": generate_chat_picture(picture, name1, name2) }), [picture_select, shared.gradio['name1'], shared.gradio['name2']], None).then( gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.generate_chat_reply_wrapper, shared.input_params, gradio('display', 'history'), show_progress=False).then( + chat.generate_chat_reply_wrapper, gradio(ui_chat.inputs), gradio('display', 'history'), show_progress=False).then( lambda: None, None, picture_select, show_progress=False) diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py index b96a47fd..707d919b 100644 --- a/extensions/silero_tts/script.py +++ b/extensions/silero_tts/script.py @@ -6,7 +6,7 @@ import gradio as gr import torch from extensions.silero_tts import tts_preprocessor -from modules import chat, shared +from modules import chat, shared, ui_chat from modules.utils import gradio torch._C._jit_set_profiling_mode(False) @@ -194,24 +194,23 @@ def ui(): convert_cancel = gr.Button('Cancel', visible=False) convert_confirm = gr.Button('Confirm (cannot be undone)', variant="stop", visible=False) - if shared.is_chat(): - # Convert history with confirmation - convert_arr = [convert_confirm, convert, convert_cancel] - convert.click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, convert_arr) - convert_confirm.click( - lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr).then( - remove_tts_from_history, gradio('history'), gradio('history')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( - chat.redraw_html, 
shared.reload_inputs, gradio('display')) + # Convert history with confirmation + convert_arr = [convert_confirm, convert, convert_cancel] + convert.click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, convert_arr) + convert_confirm.click( + lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr).then( + remove_tts_from_history, gradio('history'), gradio('history')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( + chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display')) - convert_cancel.click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr) + convert_cancel.click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr) - # Toggle message text in history - show_text.change( - lambda x: params.update({"show_text": x}), show_text, None).then( - toggle_text_in_history, gradio('history'), gradio('history')).then( - chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( - chat.redraw_html, shared.reload_inputs, gradio('display')) + # Toggle message text in history + show_text.change( + lambda x: params.update({"show_text": x}), show_text, None).then( + toggle_text_in_history, gradio('history'), gradio('history')).then( + chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( + chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display')) # Event functions to update the parameters in the backend activate.change(lambda x: params.update({"activate": x}), activate, None) diff --git a/extensions/superbooga/script.py b/extensions/superbooga/script.py index 475cf1e0..06fe8ad3 100644 --- a/extensions/superbooga/script.py +++ b/extensions/superbooga/script.py @@ -4,7 +4,7 @@ import textwrap import gradio as gr from bs4 import BeautifulSoup -from modules import chat, shared +from modules import chat from modules.logging_colors import logger from .chromadb import add_chunks_to_collector, make_collector @@ -143,8 +143,8 @@ def remove_special_tokens(string): return re.sub(pattern, '', string) -def input_modifier(string): - if shared.is_chat(): +def input_modifier(string, state, is_chat=False): + if is_chat: return string # Find the user input diff --git a/js/main.js b/js/main.js index 7a2368fe..40197869 100644 --- a/js/main.js +++ b/js/main.js @@ -1,17 +1,30 @@ -document.getElementById("main").parentNode.childNodes[0].classList.add("header_bar"); -document.getElementById("main").parentNode.style = "padding: 0; margin: 0"; -document.getElementById("main").parentNode.parentNode.parentNode.style = "padding: 0"; +let chat_tab = document.getElementById('chat-tab'); +let notebook_tab = document.getElementById('notebook-tab'); +let default_tab = document.getElementById('default-tab'); -// Get references to the elements -let main = document.getElementById('main'); -let main_parent = main.parentNode; +let main_parent = chat_tab.parentNode; let extensions = document.getElementById('extensions'); -// Add an event listener to the main element +main_parent.childNodes[0].classList.add("header_bar"); +main_parent.style = "padding: 0; margin: 0"; +main_parent.parentNode.parentNode.style = "padding: 0"; + +// Add an event listener to the generation tabs main_parent.addEventListener('click', function(e) { - // Check if the main element is visible - if (main.offsetHeight > 0 && main.offsetWidth > 0) { + let 
chat_visible = (chat_tab.offsetHeight > 0 && chat_tab.offsetWidth > 0); + let notebook_visible = (notebook_tab.offsetHeight > 0 && notebook_tab.offsetWidth > 0); + let default_visible = (default_tab.offsetHeight > 0 && default_tab.offsetWidth > 0); + + // Check if one of the generation tabs is visible + if (chat_visible || notebook_visible || default_visible) { extensions.style.display = 'flex'; + if (chat_visible) { + extensions.style.maxWidth = "800px"; + extensions.style.padding = "0px"; + } else { + extensions.style.maxWidth = "none"; + extensions.style.padding = "15px"; + } } else { extensions.style.display = 'none'; } diff --git a/js/save_files.js b/js/save_files.js index 7dfbcfda..d5b22c4b 100644 --- a/js/save_files.js +++ b/js/save_files.js @@ -32,9 +32,9 @@ function saveHistory(history, character, mode) { saveFile(history, path); } -function saveSession(session, mode) { +function saveSession(session) { let path = null; - path = `session_${mode}_${getCurrentTimestamp()}.json`; + path = `session_${getCurrentTimestamp()}.json`; saveFile(session, path); } diff --git a/modules/chat.py b/modules/chat.py index c2a05d3f..e2bba18f 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -175,9 +175,6 @@ def get_stopping_strings(state): f"\n{state['name2']}:" ] - if state['stop_at_newline']: - stopping_strings.append("\n") - return stopping_strings @@ -201,7 +198,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if not any((regenerate, _continue)): visible_text = text text, visible_text = apply_extensions('chat_input', text, visible_text, state) - text = apply_extensions('input', text, state) + text = apply_extensions('input', text, state, is_chat=True) # *Is typing...* if loading_message: @@ -230,45 +227,37 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess prompt = generate_chat_prompt(text, state, **kwargs) # Generate - cumulative_reply = '' - for i in range(state['chat_generation_attempts']): - reply = None - for j, reply in enumerate(generate_reply(prompt + cumulative_reply, state, stopping_strings=stopping_strings, is_chat=True)): - reply = cumulative_reply + reply + reply = None + for j, reply in enumerate(generate_reply(prompt, state, stopping_strings=stopping_strings, is_chat=True)): - # Extract the reply - visible_reply = re.sub("(||{{user}})", state['name1'], reply) + # Extract the reply + visible_reply = re.sub("(||{{user}})", state['name1'], reply) - # We need this global variable to handle the Stop event, - # otherwise gradio gets confused - if shared.stop_everything: - output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state) + # We need this global variable to handle the Stop event, + # otherwise gradio gets confused + if shared.stop_everything: + output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) + yield output + return + + if just_started: + just_started = False + if not _continue: + output['internal'].append(['', '']) + output['visible'].append(['', '']) + + if _continue: + output['internal'][-1] = [text, last_reply[0] + reply] + output['visible'][-1] = [visible_text, last_reply[1] + visible_reply] + if is_stream: + yield output + elif not (j == 0 and visible_reply.strip() == ''): + output['internal'][-1] = [text, reply.lstrip(' ')] + output['visible'][-1] = [visible_text, visible_reply.lstrip(' ')] + if is_stream: yield output - return - if just_started: - just_started = False - if not _continue: - output['internal'].append(['', 
'']) - output['visible'].append(['', '']) - - if _continue: - output['internal'][-1] = [text, last_reply[0] + reply] - output['visible'][-1] = [visible_text, last_reply[1] + visible_reply] - if is_stream: - yield output - elif not (j == 0 and visible_reply.strip() == ''): - output['internal'][-1] = [text, reply.lstrip(' ')] - output['visible'][-1] = [visible_text, visible_reply.lstrip(' ')] - if is_stream: - yield output - - if reply in [None, cumulative_reply]: - break - else: - cumulative_reply = reply - - output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state) + output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) yield output @@ -278,27 +267,15 @@ def impersonate_wrapper(text, start_with, state): yield '' return - # Defining some variables - cumulative_reply = '' prompt = generate_chat_prompt('', state, impersonate=True) stopping_strings = get_stopping_strings(state) yield text + '...' - cumulative_reply = text - for i in range(state['chat_generation_attempts']): - reply = None - for reply in generate_reply(prompt + cumulative_reply, state, stopping_strings=stopping_strings, is_chat=True): - reply = cumulative_reply + reply - yield reply.lstrip(' ') - if shared.stop_everything: - return - - if reply in [None, cumulative_reply]: - break - else: - cumulative_reply = reply - - yield cumulative_reply.lstrip(' ') + reply = None + for reply in generate_reply(prompt, state, stopping_strings=stopping_strings, is_chat=True): + yield reply.lstrip(' ') + if shared.stop_everything: + return def generate_chat_reply(text, state, regenerate=False, _continue=False, loading_message=True): @@ -352,7 +329,7 @@ def replace_last_reply(text, state): return history elif len(history['visible']) > 0: history['visible'][-1][1] = text - history['internal'][-1][1] = apply_extensions('input', text, state) + history['internal'][-1][1] = apply_extensions('input', text, state, is_chat=True) return history @@ -360,7 +337,7 @@ def replace_last_reply(text, state): def send_dummy_message(text, state): history = state['history'] history['visible'].append([text, '']) - history['internal'].append([apply_extensions('input', text, state), '']) + history['internal'].append([apply_extensions('input', text, state, is_chat=True), '']) return history @@ -371,7 +348,7 @@ def send_dummy_reply(text, state): history['internal'].append(['', '']) history['visible'][-1][1] = text - history['internal'][-1][1] = apply_extensions('input', text, state) + history['internal'][-1][1] = apply_extensions('input', text, state, is_chat=True) return history @@ -385,7 +362,7 @@ def clear_chat_log(state): if mode != 'instruct': if greeting != '': history['internal'] += [['<|BEGIN-VISIBLE-CHAT|>', greeting]] - history['visible'] += [['', apply_extensions('output', greeting, state)]] + history['visible'] += [['', apply_extensions('output', greeting, state, is_chat=True)]] return history @@ -452,7 +429,7 @@ def load_persistent_history(state): history = {'internal': [], 'visible': []} if greeting != "": history['internal'] += [['<|BEGIN-VISIBLE-CHAT|>', greeting]] - history['visible'] += [['', apply_extensions('output', greeting, state)]] + history['visible'] += [['', apply_extensions('output', greeting, state, is_chat=True)]] return history diff --git a/modules/extensions.py b/modules/extensions.py index 76b6be8b..796ff072 100644 --- a/modules/extensions.py +++ b/modules/extensions.py @@ -53,14 +53,32 @@ def iterator(): # Extension functions that map string -> string -def 
_apply_string_extensions(function_name, text, state): +def _apply_string_extensions(function_name, text, state, is_chat=False): for extension, _ in iterator(): if hasattr(extension, function_name): func = getattr(extension, function_name) - if len(signature(func).parameters) == 2: - text = func(text, state) + + # Handle old extensions without the 'state' arg or + # the 'is_chat' kwarg + count = 0 + has_chat = False + for k in signature(func).parameters: + if k == 'is_chat': + has_chat = True + else: + count += 1 + + if count == 2: + args = [text, state] else: - text = func(text) + args = [text] + + if has_chat: + kwargs = {'is_chat': is_chat} + else: + kwargs = {} + + text = func(*args, **kwargs) return text @@ -169,9 +187,7 @@ def create_extensions_block(): if len(to_display) > 0: with gr.Column(elem_id="extensions"): for row in to_display: - extension, name = row - display_name = getattr(extension, 'params', {}).get('display_name', name) - gr.Markdown(f"\n### {display_name}") + extension, _ = row extension.ui() diff --git a/modules/shared.py b/modules/shared.py index cb6f0ae1..89b5f0cb 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -19,8 +19,6 @@ lora_names = [] stop_everything = False generation_lock = None processing_message = '*Is typing...*' -input_params = [] -reload_inputs = [] # UI variables gradio = {} @@ -45,7 +43,6 @@ settings = { 'greeting': '', 'turn_template': '', 'custom_stopping_strings': '', - 'stop_at_newline': False, 'add_bos_token': True, 'ban_eos_token': False, 'skip_special_tokens': True, @@ -57,11 +54,7 @@ settings = { 'chat_style': 'TheEncrypted777', 'instruction_template': 'None', 'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>', - 'chat_generation_attempts': 1, - 'chat_generation_attempts_min': 1, - 'chat_generation_attempts_max': 10, - 'default_extensions': [], - 'chat_default_extensions': ['gallery'], + 'default_extensions': ['gallery'], 'preset': 'simple-1', 'prompt': 'QA', } @@ -81,8 +74,8 @@ def str2bool(v): parser = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=54)) # Basic settings -parser.add_argument('--notebook', action='store_true', help='Launch the web UI in notebook mode, where the output is written to the same text box as the input.') -parser.add_argument('--chat', action='store_true', help='Launch the web UI in chat mode with a style similar to the Character.AI website.') +parser.add_argument('--notebook', action='store_true', help='DEPRECATED') +parser.add_argument('--chat', action='store_true', help='DEPRECATED') parser.add_argument('--multi-user', action='store_true', help='Multi-user mode. Chat histories are not saved or automatically loaded. WARNING: this is highly experimental.') parser.add_argument('--character', type=str, help='The name of the character to load in chat mode by default.') parser.add_argument('--model', type=str, help='Name of the model to load by default.') @@ -187,6 +180,11 @@ parser.add_argument('--multimodal-pipeline', type=str, default=None, help='The m args = parser.parse_args() args_defaults = parser.parse_args([]) +# Deprecation warnings +for k in ['chat', 'notebook']: + if getattr(args, k): + logger.warning(f'--{k} has been deprecated and will be removed soon. Please remove that flag.') + # Security warnings if args.trust_remote_code: logger.warning("trust_remote_code is enabled. 
This is dangerous.") @@ -227,16 +225,7 @@ def add_extension(name): def is_chat(): - return args.chat - - -def get_mode(): - if args.chat: - return 'chat' - elif args.notebook: - return 'notebook' - else: - return 'default' + return True args.loader = fix_loader_name(args.loader) diff --git a/modules/ui.py b/modules/ui.py index b58b7dd6..e7817f73 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -8,10 +8,8 @@ from modules import shared with open(Path(__file__).resolve().parent / '../css/main.css', 'r') as f: css = f.read() -with open(Path(__file__).resolve().parent / '../css/chat.css', 'r') as f: - chat_css = f.read() with open(Path(__file__).resolve().parent / '../js/main.js', 'r') as f: - main_js = f.read() + js = f.read() with open(Path(__file__).resolve().parent / '../js/save_files.js', 'r') as f: save_files_js = f.read() @@ -116,31 +114,35 @@ def list_interface_input_elements(): 'top_a', ] - if shared.args.chat: - elements += [ - 'character_menu', - 'history', - 'name1', - 'name2', - 'greeting', - 'context', - 'chat_generation_attempts', - 'stop_at_newline', - 'mode', - 'instruction_template', - 'name1_instruct', - 'name2_instruct', - 'context_instruct', - 'turn_template', - 'chat_style', - 'chat-instruct_command', - ] - else: - elements.append('textbox') - if not shared.args.notebook: - elements.append('output_textbox') + # Chat elements + elements += [ + 'textbox', + 'character_menu', + 'history', + 'name1', + 'name2', + 'greeting', + 'context', + 'mode', + 'instruction_template', + 'name1_instruct', + 'name2_instruct', + 'context_instruct', + 'turn_template', + 'chat_style', + 'chat-instruct_command', + ] + # Notebook/default elements + elements += [ + 'textbox-notebook', + 'textbox-default', + 'output_textbox' + ] + + # Model elements elements += list_model_elements() + return elements diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 1d73adf7..76e70ed0 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -10,14 +10,17 @@ from modules.html_generator import chat_html_wrapper from modules.text_generation import stop_everything_event from modules.utils import gradio +inputs = ('Chat input', 'start_with', 'interface_state') +reload_arr = ('history', 'name1', 'name2', 'mode', 'chat_style') +clear_arr = ('Clear history-confirm', 'Clear history', 'Clear history-cancel') + def create_ui(): - shared.gradio['interface_state'] = gr.State({k: None for k in shared.input_elements}) shared.gradio['Chat input'] = gr.State() shared.gradio['dummy'] = gr.State() shared.gradio['history'] = gr.State({'internal': [], 'visible': []}) - with gr.Tab('Text generation', elem_id='main'): + with gr.Tab('Chat', elem_id='chat-tab'): shared.gradio['display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, shared.settings['name1'], shared.settings['name2'], 'chat', 'cai-chat')) shared.gradio['textbox'] = gr.Textbox(label='Input') with gr.Row(): @@ -45,82 +48,80 @@ def create_ui(): shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with']) with gr.Row(): - shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'instruct', 'chat-instruct'] else 'chat', label='Mode', info='Defines how the chat prompt is generated. 
In instruct and chat-instruct modes, the instruction template selected under "Chat settings" must match the current model.') + shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'instruct', 'chat-instruct'] else 'chat', label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template selected under Parameters > Instruction template must match the current model.') shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct') - with gr.Tab('Chat settings', elem_id='chat-settings'): - with gr.Tab('Character'): - with gr.Row(): - with gr.Column(scale=8): - with gr.Row(): - shared.gradio['character_menu'] = gr.Dropdown(value='None', choices=utils.get_available_characters(), label='Character', elem_id='character-menu', info='Used in chat and chat-instruct modes.', elem_classes='slim-dropdown') - ui.create_refresh_button(shared.gradio['character_menu'], lambda: None, lambda: {'choices': utils.get_available_characters()}, 'refresh-button') - shared.gradio['save_character'] = gr.Button('💾', elem_classes='refresh-button') - shared.gradio['delete_character'] = gr.Button('🗑️', elem_classes='refresh-button') - shared.gradio['name1'] = gr.Textbox(value=shared.settings['name1'], lines=1, label='Your name') - shared.gradio['name2'] = gr.Textbox(value=shared.settings['name2'], lines=1, label='Character\'s name') - shared.gradio['context'] = gr.Textbox(value=shared.settings['context'], lines=10, label='Context', elem_classes=['add_scrollbar']) - shared.gradio['greeting'] = gr.Textbox(value=shared.settings['greeting'], lines=5, label='Greeting', elem_classes=['add_scrollbar']) - - with gr.Column(scale=1): - shared.gradio['character_picture'] = gr.Image(label='Character picture', type='pil') - shared.gradio['your_picture'] = gr.Image(label='Your picture', type='pil', value=Image.open(Path('cache/pfp_me.png')) if Path('cache/pfp_me.png').exists() else None) - - with gr.Tab('Instruction template'): - with gr.Row(): +def create_chat_settings_ui(): + with gr.Tab('Character'): + with gr.Row(): + with gr.Column(scale=8): with gr.Row(): - shared.gradio['instruction_template'] = gr.Dropdown(choices=utils.get_available_instruction_templates(), label='Instruction template', value='None', info='Change this according to the model/LoRA that you are using. 
Used in instruct and chat-instruct modes.', elem_classes='slim-dropdown') - ui.create_refresh_button(shared.gradio['instruction_template'], lambda: None, lambda: {'choices': utils.get_available_instruction_templates()}, 'refresh-button') - shared.gradio['save_template'] = gr.Button('💾', elem_classes='refresh-button') - shared.gradio['delete_template'] = gr.Button('🗑️ ', elem_classes='refresh-button') + shared.gradio['character_menu'] = gr.Dropdown(value='None', choices=utils.get_available_characters(), label='Character', elem_id='character-menu', info='Used in chat and chat-instruct modes.', elem_classes='slim-dropdown') + ui.create_refresh_button(shared.gradio['character_menu'], lambda: None, lambda: {'choices': utils.get_available_characters()}, 'refresh-button') + shared.gradio['save_character'] = gr.Button('💾', elem_classes='refresh-button') + shared.gradio['delete_character'] = gr.Button('🗑️', elem_classes='refresh-button') - shared.gradio['name1_instruct'] = gr.Textbox(value='', lines=2, label='User string') - shared.gradio['name2_instruct'] = gr.Textbox(value='', lines=1, label='Bot string') - shared.gradio['context_instruct'] = gr.Textbox(value='', lines=4, label='Context') - shared.gradio['turn_template'] = gr.Textbox(value=shared.settings['turn_template'], lines=1, label='Turn template', info='Used to precisely define the placement of spaces and new line characters in instruction prompts.') + shared.gradio['name1'] = gr.Textbox(value=shared.settings['name1'], lines=1, label='Your name') + shared.gradio['name2'] = gr.Textbox(value=shared.settings['name2'], lines=1, label='Character\'s name') + shared.gradio['context'] = gr.Textbox(value=shared.settings['context'], lines=10, label='Context', elem_classes=['add_scrollbar']) + shared.gradio['greeting'] = gr.Textbox(value=shared.settings['greeting'], lines=5, label='Greeting', elem_classes=['add_scrollbar']) + + with gr.Column(scale=1): + shared.gradio['character_picture'] = gr.Image(label='Character picture', type='pil') + shared.gradio['your_picture'] = gr.Image(label='Your picture', type='pil', value=Image.open(Path('cache/pfp_me.png')) if Path('cache/pfp_me.png').exists() else None) + + with gr.Tab('Instruction template'): + with gr.Row(): with gr.Row(): - shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=4, label='Command for chat-instruct mode', info='<|character|> gets replaced by the bot name, and <|prompt|> gets replaced by the regular chat prompt.', elem_classes=['add_scrollbar']) + shared.gradio['instruction_template'] = gr.Dropdown(choices=utils.get_available_instruction_templates(), label='Instruction template', value='None', info='Change this according to the model/LoRA that you are using. 
Used in instruct and chat-instruct modes.', elem_classes='slim-dropdown') + ui.create_refresh_button(shared.gradio['instruction_template'], lambda: None, lambda: {'choices': utils.get_available_instruction_templates()}, 'refresh-button') + shared.gradio['save_template'] = gr.Button('💾', elem_classes='refresh-button') + shared.gradio['delete_template'] = gr.Button('🗑️ ', elem_classes='refresh-button') - with gr.Tab('Chat history'): + shared.gradio['name1_instruct'] = gr.Textbox(value='', lines=2, label='User string') + shared.gradio['name2_instruct'] = gr.Textbox(value='', lines=1, label='Bot string') + shared.gradio['context_instruct'] = gr.Textbox(value='', lines=4, label='Context') + shared.gradio['turn_template'] = gr.Textbox(value=shared.settings['turn_template'], lines=1, label='Turn template', info='Used to precisely define the placement of spaces and new line characters in instruction prompts.') + with gr.Row(): + shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=4, label='Command for chat-instruct mode', info='<|character|> gets replaced by the bot name, and <|prompt|> gets replaced by the regular chat prompt.', elem_classes=['add_scrollbar']) + + with gr.Tab('Chat history'): + with gr.Row(): + with gr.Column(): + shared.gradio['save_chat_history'] = gr.Button(value='Save history') + + with gr.Column(): + shared.gradio['load_chat_history'] = gr.File(type='binary', file_types=['.json', '.txt'], label='Upload History JSON') + + with gr.Tab('Upload character'): + with gr.Tab('YAML or JSON'): + with gr.Row(): + shared.gradio['upload_json'] = gr.File(type='binary', file_types=['.json', '.yaml'], label='JSON or YAML File') + shared.gradio['upload_img_bot'] = gr.Image(type='pil', label='Profile Picture (optional)') + + shared.gradio['Submit character'] = gr.Button(value='Submit', interactive=False) + + with gr.Tab('TavernAI PNG'): with gr.Row(): with gr.Column(): - shared.gradio['save_chat_history'] = gr.Button(value='Save history') - + shared.gradio['upload_img_tavern'] = gr.Image(type='pil', label='TavernAI PNG File', elem_id='upload_img_tavern') + shared.gradio['tavern_json'] = gr.State() with gr.Column(): - shared.gradio['load_chat_history'] = gr.File(type='binary', file_types=['.json', '.txt'], label='Upload History JSON') + shared.gradio['tavern_name'] = gr.Textbox(value='', lines=1, label='Name', interactive=False) + shared.gradio['tavern_desc'] = gr.Textbox(value='', lines=4, max_lines=4, label='Description', interactive=False) - with gr.Tab('Upload character'): - with gr.Tab('YAML or JSON'): - with gr.Row(): - shared.gradio['upload_json'] = gr.File(type='binary', file_types=['.json', '.yaml'], label='JSON or YAML File') - shared.gradio['upload_img_bot'] = gr.Image(type='pil', label='Profile Picture (optional)') - - shared.gradio['Submit character'] = gr.Button(value='Submit', interactive=False) - - with gr.Tab('TavernAI PNG'): - with gr.Row(): - with gr.Column(): - shared.gradio['upload_img_tavern'] = gr.Image(type='pil', label='TavernAI PNG File', elem_id='upload_img_tavern') - shared.gradio['tavern_json'] = gr.State() - with gr.Column(): - shared.gradio['tavern_name'] = gr.Textbox(value='', lines=1, label='Name', interactive=False) - shared.gradio['tavern_desc'] = gr.Textbox(value='', lines=4, max_lines=4, label='Description', interactive=False) - - shared.gradio['Submit tavern character'] = gr.Button(value='Submit', interactive=False) + shared.gradio['Submit tavern character'] = gr.Button(value='Submit', 
interactive=False) def create_event_handlers(): gen_events = [] - - shared.input_params = gradio('Chat input', 'start_with', 'interface_state') - clear_arr = gradio('Clear history-confirm', 'Clear history', 'Clear history-cancel') - shared.reload_inputs = gradio('history', 'name1', 'name2', 'mode', 'chat_style') + shared.input_params = gradio(inputs) # Obsolete, kept for compatibility with old extensions gen_events.append(shared.gradio['Generate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( - chat.generate_chat_reply_wrapper, shared.input_params, gradio('display', 'history'), show_progress=False).then( + chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') @@ -129,7 +130,7 @@ def create_event_handlers(): gen_events.append(shared.gradio['textbox'].submit( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( - chat.generate_chat_reply_wrapper, shared.input_params, gradio('display', 'history'), show_progress=False).then( + chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') @@ -137,7 +138,7 @@ def create_event_handlers(): gen_events.append(shared.gradio['Regenerate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - partial(chat.generate_chat_reply_wrapper, regenerate=True), shared.input_params, gradio('display', 'history'), show_progress=False).then( + partial(chat.generate_chat_reply_wrapper, regenerate=True), gradio(inputs), gradio('display', 'history'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') @@ -145,7 +146,7 @@ def create_event_handlers(): gen_events.append(shared.gradio['Continue'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - partial(chat.generate_chat_reply_wrapper, _continue=True), shared.input_params, gradio('display', 'history'), show_progress=False).then( + partial(chat.generate_chat_reply_wrapper, _continue=True), gradio(inputs), gradio('display', 'history'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') @@ -154,7 +155,7 @@ def create_event_handlers(): gen_events.append(shared.gradio['Impersonate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: x, gradio('textbox'), 
gradio('Chat input'), show_progress=False).then( - chat.impersonate_wrapper, shared.input_params, gradio('textbox'), show_progress=False).then( + chat.impersonate_wrapper, gradio(inputs), gradio('textbox'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') ) @@ -163,59 +164,59 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.replace_last_reply, gradio('textbox', 'interface_state'), gradio('history')).then( lambda: '', None, gradio('textbox'), show_progress=False).then( - chat.redraw_html, shared.reload_inputs, gradio('display')).then( + chat.redraw_html, gradio(reload_arr), gradio('display')).then( chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) shared.gradio['Send dummy message'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.send_dummy_message, gradio('textbox', 'interface_state'), gradio('history')).then( lambda: '', None, gradio('textbox'), show_progress=False).then( - chat.redraw_html, shared.reload_inputs, gradio('display')).then( + chat.redraw_html, gradio(reload_arr), gradio('display')).then( chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) shared.gradio['Send dummy reply'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.send_dummy_reply, gradio('textbox', 'interface_state'), gradio('history')).then( lambda: '', None, gradio('textbox'), show_progress=False).then( - chat.redraw_html, shared.reload_inputs, gradio('display')).then( + chat.redraw_html, gradio(reload_arr), gradio('display')).then( chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) - shared.gradio['Clear history'].click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, clear_arr) - shared.gradio['Clear history-cancel'].click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, clear_arr) + shared.gradio['Clear history'].click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, gradio(clear_arr)) + shared.gradio['Clear history-cancel'].click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, gradio(clear_arr)) shared.gradio['Clear history-confirm'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, clear_arr).then( + lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, gradio(clear_arr)).then( chat.clear_chat_log, gradio('interface_state'), gradio('history')).then( - chat.redraw_html, shared.reload_inputs, gradio('display')).then( + chat.redraw_html, gradio(reload_arr), gradio('display')).then( chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None) shared.gradio['Remove last'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.remove_last_message, gradio('history'), gradio('textbox', 'history'), show_progress=False).then( - chat.redraw_html, shared.reload_inputs, gradio('display')).then( + chat.redraw_html, gradio(reload_arr), gradio('display')).then( chat.save_persistent_history, 
gradio('history', 'character_menu', 'mode'), None) shared.gradio['character_menu'].change( partial(chat.load_character, instruct=False), gradio('character_menu', 'name1', 'name2'), gradio('name1', 'name2', 'character_picture', 'greeting', 'context', 'dummy')).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.load_persistent_history, gradio('interface_state'), gradio('history')).then( - chat.redraw_html, shared.reload_inputs, gradio('display')) + chat.redraw_html, gradio(reload_arr), gradio('display')) shared.gradio['Stop'].click( stop_everything_event, None, None, queue=False, cancels=gen_events if shared.args.no_stream else None).then( - chat.redraw_html, shared.reload_inputs, gradio('display')) + chat.redraw_html, gradio(reload_arr), gradio('display')) shared.gradio['mode'].change( lambda x: gr.update(visible=x != 'instruct'), gradio('mode'), gradio('chat_style'), show_progress=False).then( - chat.redraw_html, shared.reload_inputs, gradio('display')) + chat.redraw_html, gradio(reload_arr), gradio('display')) - shared.gradio['chat_style'].change(chat.redraw_html, shared.reload_inputs, gradio('display')) + shared.gradio['chat_style'].change(chat.redraw_html, gradio(reload_arr), gradio('display')) shared.gradio['instruction_template'].change( partial(chat.load_character, instruct=True), gradio('instruction_template', 'name1_instruct', 'name2_instruct'), gradio('name1_instruct', 'name2_instruct', 'dummy', 'dummy', 'context_instruct', 'turn_template')) shared.gradio['load_chat_history'].upload( chat.load_history, gradio('load_chat_history', 'history'), gradio('history')).then( - chat.redraw_html, shared.reload_inputs, gradio('display')).then( + chat.redraw_html, gradio(reload_arr), gradio('display')).then( None, None, None, _js='() => {alert("The history has been loaded.")}') shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, gradio('history'), gradio('textbox'), show_progress=False) @@ -256,4 +257,4 @@ def create_event_handlers(): shared.gradio['upload_img_tavern'].clear(lambda: (None, None, None, gr.update(interactive=False)), None, gradio('tavern_name', 'tavern_desc', 'tavern_json', 'Submit tavern character'), show_progress=False) shared.gradio['your_picture'].change( chat.upload_your_profile_picture, gradio('your_picture'), None).then( - partial(chat.redraw_html, reset_cache=True), shared.reload_inputs, gradio('display')) + partial(chat.redraw_html, reset_cache=True), gradio(reload_arr), gradio('display')) diff --git a/modules/ui_default.py b/modules/ui_default.py index b879e1ef..d26863bc 100644 --- a/modules/ui_default.py +++ b/modules/ui_default.py @@ -8,87 +8,85 @@ from modules.text_generation import ( ) from modules.utils import gradio +inputs = ('textbox-default', 'interface_state') +outputs = ('output_textbox', 'html-default') + def create_ui(): default_text = load_prompt(shared.settings['prompt']) - shared.gradio['interface_state'] = gr.State({k: None for k in shared.input_elements}) - shared.gradio['last_input'] = gr.State('') - - with gr.Tab('Text generation', elem_id='main'): + with gr.Tab('Default', elem_id='default-tab'): + shared.gradio['last_input-default'] = gr.State('') with gr.Row(): with gr.Column(): - shared.gradio['textbox'] = gr.Textbox(value=default_text, elem_classes=['textbox_default', 'add_scrollbar'], lines=27, label='Input') - shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, 
label='max_new_tokens', value=shared.settings['max_new_tokens']) + shared.gradio['textbox-default'] = gr.Textbox(value=default_text, elem_classes=['textbox_default', 'add_scrollbar'], lines=27, label='Input') with gr.Row(): - shared.gradio['Generate'] = gr.Button('Generate', variant='primary') - shared.gradio['Stop'] = gr.Button('Stop', elem_id='stop') - shared.gradio['Continue'] = gr.Button('Continue') - shared.gradio['count_tokens'] = gr.Button('Count tokens') + shared.gradio['Generate-default'] = gr.Button('Generate', variant='primary') + shared.gradio['Stop-default'] = gr.Button('Stop', elem_id='stop') + shared.gradio['Continue-default'] = gr.Button('Continue') + shared.gradio['count_tokens-default'] = gr.Button('Count tokens') with gr.Row(): - shared.gradio['prompt_menu'] = gr.Dropdown(choices=utils.get_available_prompts(), value='None', label='Prompt', elem_classes='slim-dropdown') - ui.create_refresh_button(shared.gradio['prompt_menu'], lambda: None, lambda: {'choices': utils.get_available_prompts()}, 'refresh-button') - shared.gradio['save_prompt'] = gr.Button('💾', elem_classes='refresh-button') - shared.gradio['delete_prompt'] = gr.Button('🗑️', elem_classes='refresh-button') + shared.gradio['prompt_menu-default'] = gr.Dropdown(choices=utils.get_available_prompts(), value='None', label='Prompt', elem_classes='slim-dropdown') + ui.create_refresh_button(shared.gradio['prompt_menu-default'], lambda: None, lambda: {'choices': utils.get_available_prompts()}, 'refresh-button') + shared.gradio['save_prompt-default'] = gr.Button('💾', elem_classes='refresh-button') + shared.gradio['delete_prompt-default'] = gr.Button('🗑️', elem_classes='refresh-button') - shared.gradio['status'] = gr.Markdown('') + shared.gradio['status-default'] = gr.Markdown('') with gr.Column(): with gr.Tab('Raw'): shared.gradio['output_textbox'] = gr.Textbox(lines=27, label='Output', elem_classes=['textbox_default_output', 'add_scrollbar']) with gr.Tab('Markdown'): - shared.gradio['markdown_render'] = gr.Button('Render') - shared.gradio['markdown'] = gr.Markdown() + shared.gradio['markdown_render-default'] = gr.Button('Render') + shared.gradio['markdown-default'] = gr.Markdown() with gr.Tab('HTML'): - shared.gradio['html'] = gr.HTML() + shared.gradio['html-default'] = gr.HTML() def create_event_handlers(): gen_events = [] - shared.input_params = gradio('textbox', 'interface_state') - output_params = gradio('output_textbox', 'html') - gen_events.append(shared.gradio['Generate'].click( - lambda x: x, gradio('textbox'), gradio('last_input')).then( + gen_events.append(shared.gradio['Generate-default'].click( + lambda x: x, gradio('textbox-default'), gradio('last_input-default')).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - generate_reply_wrapper, shared.input_params, output_params, show_progress=False).then( + generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") ) - gen_events.append(shared.gradio['textbox'].submit( - lambda x: x, gradio('textbox'), gradio('last_input')).then( + gen_events.append(shared.gradio['textbox-default'].submit( + lambda x: x, gradio('textbox-default'), gradio('last_input-default')).then( ui.gather_interface_values, 
gradio(shared.input_elements), gradio('interface_state')).then( - generate_reply_wrapper, shared.input_params, output_params, show_progress=False).then( + generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") ) - shared.gradio['markdown_render'].click(lambda x: x, gradio('output_textbox'), gradio('markdown'), queue=False) - gen_events.append(shared.gradio['Continue'].click( + shared.gradio['markdown_render-default'].click(lambda x: x, gradio('output_textbox'), gradio('markdown-default'), queue=False) + gen_events.append(shared.gradio['Continue-default'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - generate_reply_wrapper, [shared.gradio['output_textbox']] + shared.input_params[1:], output_params, show_progress=False).then( + generate_reply_wrapper, [shared.gradio['output_textbox']] + gradio(inputs)[1:], gradio(outputs), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[1]; element.scrollTop = element.scrollHeight}") ) - shared.gradio['Stop'].click(stop_everything_event, None, None, queue=False, cancels=gen_events if shared.args.no_stream else None) - shared.gradio['prompt_menu'].change(load_prompt, gradio('prompt_menu'), gradio('textbox'), show_progress=False) - shared.gradio['save_prompt'].click( - lambda x: x, gradio('textbox'), gradio('save_contents')).then( + shared.gradio['Stop-default'].click(stop_everything_event, None, None, queue=False, cancels=gen_events if shared.args.no_stream else None) + shared.gradio['prompt_menu-default'].change(load_prompt, gradio('prompt_menu-default'), gradio('textbox-default'), show_progress=False) + shared.gradio['save_prompt-default'].click( + lambda x: x, gradio('textbox-default'), gradio('save_contents')).then( lambda: 'prompts/', None, gradio('save_root')).then( lambda: utils.current_time() + '.txt', None, gradio('save_filename')).then( lambda: gr.update(visible=True), None, gradio('file_saver')) - shared.gradio['delete_prompt'].click( + shared.gradio['delete_prompt-default'].click( lambda: 'prompts/', None, gradio('delete_root')).then( - lambda x: x + '.txt', gradio('prompt_menu'), gradio('delete_filename')).then( + lambda x: x + '.txt', gradio('prompt_menu-default'), gradio('delete_filename')).then( lambda: gr.update(visible=True), None, gradio('file_deleter')) - shared.gradio['count_tokens'].click(count_tokens, gradio('textbox'), gradio('status'), show_progress=False) + shared.gradio['count_tokens-default'].click(count_tokens, gradio('textbox-default'), gradio('status-default'), show_progress=False) diff --git a/modules/ui_file_saving.py b/modules/ui_file_saving.py index 952d66c9..98165d67 100644 --- a/modules/ui_file_saving.py +++ b/modules/ui_file_saving.py @@ -2,7 +2,7 @@ import json import gradio as gr -from modules import chat, presets, shared, ui, utils +from modules import chat, presets, shared, ui, ui_chat, utils from modules.utils import gradio @@ -26,18 +26,17 @@ def create_ui(): shared.gradio['delete_cancel'] = 
gr.Button('Cancel', elem_classes="small-button") # Character saver/deleter - if shared.is_chat(): - with gr.Box(visible=False, elem_classes='file-saver') as shared.gradio['character_saver']: - shared.gradio['save_character_filename'] = gr.Textbox(lines=1, label='File name', info='The character will be saved to your characters/ folder with this base filename.') - with gr.Row(): - shared.gradio['save_character_confirm'] = gr.Button('Save', elem_classes="small-button") - shared.gradio['save_character_cancel'] = gr.Button('Cancel', elem_classes="small-button") + with gr.Box(visible=False, elem_classes='file-saver') as shared.gradio['character_saver']: + shared.gradio['save_character_filename'] = gr.Textbox(lines=1, label='File name', info='The character will be saved to your characters/ folder with this base filename.') + with gr.Row(): + shared.gradio['save_character_confirm'] = gr.Button('Save', elem_classes="small-button") + shared.gradio['save_character_cancel'] = gr.Button('Cancel', elem_classes="small-button") - with gr.Box(visible=False, elem_classes='file-saver') as shared.gradio['character_deleter']: - gr.Markdown('Confirm the character deletion?') - with gr.Row(): - shared.gradio['delete_character_confirm'] = gr.Button('Delete', elem_classes="small-button", variant='stop') - shared.gradio['delete_character_cancel'] = gr.Button('Cancel', elem_classes="small-button") + with gr.Box(visible=False, elem_classes='file-saver') as shared.gradio['character_deleter']: + gr.Markdown('Confirm the character deletion?') + with gr.Row(): + shared.gradio['delete_character_confirm'] = gr.Button('Delete', elem_classes="small-button", variant='stop') + shared.gradio['delete_character_cancel'] = gr.Button('Cancel', elem_classes="small-button") def create_event_handlers(): @@ -51,18 +50,18 @@ def create_event_handlers(): shared.gradio['delete_cancel'].click(lambda: gr.update(visible=False), None, gradio('file_deleter')) shared.gradio['save_cancel'].click(lambda: gr.update(visible=False), None, gradio('file_saver')) - if shared.is_chat(): - shared.gradio['save_character_confirm'].click( - chat.save_character, gradio('name2', 'greeting', 'context', 'character_picture', 'save_character_filename'), None).then( - lambda: gr.update(visible=False), None, gradio('character_saver')) - shared.gradio['delete_character_confirm'].click( - chat.delete_character, gradio('character_menu'), None).then( - lambda: gr.update(visible=False), None, gradio('character_deleter')).then( - lambda: gr.update(choices=utils.get_available_characters()), None, gradio('character_menu')) + shared.gradio['save_character_confirm'].click( + chat.save_character, gradio('name2', 'greeting', 'context', 'character_picture', 'save_character_filename'), None).then( + lambda: gr.update(visible=False), None, gradio('character_saver')) - shared.gradio['save_character_cancel'].click(lambda: gr.update(visible=False), None, gradio('character_saver')) - shared.gradio['delete_character_cancel'].click(lambda: gr.update(visible=False), None, gradio('character_deleter')) + shared.gradio['delete_character_confirm'].click( + chat.delete_character, gradio('character_menu'), None).then( + lambda: gr.update(visible=False), None, gradio('character_deleter')).then( + lambda: gr.update(choices=utils.get_available_characters()), None, gradio('character_menu')) + + shared.gradio['save_character_cancel'].click(lambda: gr.update(visible=False), None, gradio('character_saver')) + shared.gradio['delete_character_cancel'].click(lambda: gr.update(visible=False), None, 
gradio('character_deleter')) shared.gradio['save_preset'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( @@ -80,28 +79,21 @@ def create_event_handlers(): shared.gradio['save_session'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: json.dumps(x, indent=4), gradio('interface_state'), gradio('temporary_text')).then( - None, gradio('temporary_text'), None, _js=f"(contents) => {{{ui.save_files_js}; saveSession(contents, \"{shared.get_mode()}\")}}") + None, gradio('temporary_text'), None, _js=f"(contents) => {{{ui.save_files_js}; saveSession(contents)}}") - if shared.is_chat(): - shared.gradio['load_session'].upload( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - load_session, gradio('load_session', 'interface_state'), gradio('interface_state')).then( - ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then( - chat.redraw_html, shared.reload_inputs, gradio('display')).then( - None, None, None, _js='() => {alert("The session has been loaded.")}') - else: - shared.gradio['load_session'].upload( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - load_session, gradio('load_session', 'interface_state'), gradio('interface_state')).then( - ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then( - None, None, None, _js='() => {alert("The session has been loaded.")}') + shared.gradio['load_session'].upload( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + load_session, gradio('load_session', 'interface_state'), gradio('interface_state')).then( + ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then( + chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display')).then( + None, None, None, _js='() => {alert("The session has been loaded.")}') def load_session(file, state): decoded_file = file if type(file) == str else file.decode('utf-8') data = json.loads(decoded_file) - if shared.is_chat() and 'character_menu' in data and state.get('character_menu') != data.get('character_menu'): + if 'character_menu' in data and state.get('character_menu') != data.get('character_menu'): shared.session_is_loading = True state.update(data) diff --git a/modules/ui_notebook.py b/modules/ui_notebook.py index 9e8b3af6..7d6648d2 100644 --- a/modules/ui_notebook.py +++ b/modules/ui_notebook.py @@ -8,90 +8,85 @@ from modules.text_generation import ( ) from modules.utils import gradio +inputs = ('textbox-notebook', 'interface_state') +outputs = ('textbox-notebook', 'html-notebook') + def create_ui(): default_text = load_prompt(shared.settings['prompt']) - shared.gradio['interface_state'] = gr.State({k: None for k in shared.input_elements}) - shared.gradio['last_input'] = gr.State('') - with gr.Tab('Text generation', elem_id='main'): + with gr.Tab('Notebook', elem_id='notebook-tab'): + shared.gradio['last_input-notebook'] = gr.State('') with gr.Row(): with gr.Column(scale=4): with gr.Tab('Raw'): - shared.gradio['textbox'] = gr.Textbox(value=default_text, elem_classes=['textbox', 'add_scrollbar'], lines=27) + shared.gradio['textbox-notebook'] = gr.Textbox(value=default_text, elem_classes=['textbox', 'add_scrollbar'], lines=27) with gr.Tab('Markdown'): - 
shared.gradio['markdown_render'] = gr.Button('Render') - shared.gradio['markdown'] = gr.Markdown() + shared.gradio['markdown_render-notebook'] = gr.Button('Render') + shared.gradio['markdown-notebook'] = gr.Markdown() with gr.Tab('HTML'): - shared.gradio['html'] = gr.HTML() + shared.gradio['html-notebook'] = gr.HTML() with gr.Row(): - shared.gradio['Generate'] = gr.Button('Generate', variant='primary', elem_classes='small-button') - shared.gradio['Stop'] = gr.Button('Stop', elem_classes='small-button', elem_id='stop') + shared.gradio['Generate-notebook'] = gr.Button('Generate', variant='primary', elem_classes='small-button') + shared.gradio['Stop-notebook'] = gr.Button('Stop', elem_classes='small-button', elem_id='stop') shared.gradio['Undo'] = gr.Button('Undo', elem_classes='small-button') - shared.gradio['Regenerate'] = gr.Button('Regenerate', elem_classes='small-button') + shared.gradio['Regenerate-notebook'] = gr.Button('Regenerate', elem_classes='small-button') with gr.Column(scale=1): gr.HTML('
    ') - shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) with gr.Row(): - shared.gradio['prompt_menu'] = gr.Dropdown(choices=utils.get_available_prompts(), value='None', label='Prompt', elem_classes='slim-dropdown') - ui.create_refresh_button(shared.gradio['prompt_menu'], lambda: None, lambda: {'choices': utils.get_available_prompts()}, ['refresh-button', 'refresh-button-small']) - shared.gradio['save_prompt'] = gr.Button('💾', elem_classes=['refresh-button', 'refresh-button-small']) - shared.gradio['delete_prompt'] = gr.Button('🗑️', elem_classes=['refresh-button', 'refresh-button-small']) + shared.gradio['prompt_menu-notebook'] = gr.Dropdown(choices=utils.get_available_prompts(), value='None', label='Prompt', elem_classes='slim-dropdown') + ui.create_refresh_button(shared.gradio['prompt_menu-notebook'], lambda: None, lambda: {'choices': utils.get_available_prompts()}, ['refresh-button', 'refresh-button-small']) + shared.gradio['save_prompt-notebook'] = gr.Button('💾', elem_classes=['refresh-button', 'refresh-button-small']) + shared.gradio['delete_prompt-notebook'] = gr.Button('🗑️', elem_classes=['refresh-button', 'refresh-button-small']) - shared.gradio['count_tokens'] = gr.Button('Count tokens') - shared.gradio['status'] = gr.Markdown('') + shared.gradio['count_tokens-notebook'] = gr.Button('Count tokens') + shared.gradio['status-notebook'] = gr.Markdown('') def create_event_handlers(): gen_events = [] - shared.input_params = gradio('textbox', 'interface_state') - output_params = gradio('textbox', 'html') - - gen_events.append(shared.gradio['Generate'].click( - lambda x: x, gradio('textbox'), gradio('last_input')).then( + gen_events.append(shared.gradio['Generate-notebook'].click( + lambda x: x, gradio('textbox-notebook'), gradio('last_input-notebook')).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - generate_reply_wrapper, shared.input_params, output_params, show_progress=False).then( + generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') - # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") ) - gen_events.append(shared.gradio['textbox'].submit( - lambda x: x, gradio('textbox'), gradio('last_input')).then( + gen_events.append(shared.gradio['textbox-notebook'].submit( + lambda x: x, gradio('textbox-notebook'), gradio('last_input-notebook')).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - generate_reply_wrapper, shared.input_params, output_params, show_progress=False).then( + generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') - # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") ) - shared.gradio['Undo'].click(lambda x: x, gradio('last_input'), gradio('textbox'), show_progress=False) - shared.gradio['markdown_render'].click(lambda x: x, gradio('textbox'), 
gradio('markdown'), queue=False) - gen_events.append(shared.gradio['Regenerate'].click( - lambda x: x, gradio('last_input'), gradio('textbox'), show_progress=False).then( + shared.gradio['Undo'].click(lambda x: x, gradio('last_input-notebook'), gradio('textbox-notebook'), show_progress=False) + shared.gradio['markdown_render-notebook'].click(lambda x: x, gradio('textbox-notebook'), gradio('markdown-notebook'), queue=False) + gen_events.append(shared.gradio['Regenerate-notebook'].click( + lambda x: x, gradio('last_input-notebook'), gradio('textbox-notebook'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - generate_reply_wrapper, shared.input_params, output_params, show_progress=False).then( + generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') - # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") ) - shared.gradio['Stop'].click(stop_everything_event, None, None, queue=False, cancels=gen_events if shared.args.no_stream else None) - shared.gradio['prompt_menu'].change(load_prompt, gradio('prompt_menu'), gradio('textbox'), show_progress=False) - shared.gradio['save_prompt'].click( - lambda x: x, gradio('textbox'), gradio('save_contents')).then( + shared.gradio['Stop-notebook'].click(stop_everything_event, None, None, queue=False, cancels=gen_events if shared.args.no_stream else None) + shared.gradio['prompt_menu-notebook'].change(load_prompt, gradio('prompt_menu-notebook'), gradio('textbox-notebook'), show_progress=False) + shared.gradio['save_prompt-notebook'].click( + lambda x: x, gradio('textbox-notebook'), gradio('save_contents')).then( lambda: 'prompts/', None, gradio('save_root')).then( lambda: utils.current_time() + '.txt', None, gradio('save_filename')).then( lambda: gr.update(visible=True), None, gradio('file_saver')) - shared.gradio['delete_prompt'].click( + shared.gradio['delete_prompt-notebook'].click( lambda: 'prompts/', None, gradio('delete_root')).then( - lambda x: x + '.txt', gradio('prompt_menu'), gradio('delete_filename')).then( + lambda x: x + '.txt', gradio('prompt_menu-notebook'), gradio('delete_filename')).then( lambda: gr.update(visible=True), None, gradio('file_deleter')) - shared.gradio['count_tokens'].click(count_tokens, gradio('textbox'), gradio('status'), show_progress=False) + shared.gradio['count_tokens-notebook'].click(count_tokens, gradio('textbox-notebook'), gradio('status-notebook'), show_progress=False) diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index 4b9fb918..2f0c2efd 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -1,143 +1,131 @@ import gradio as gr -from modules import loaders, presets, shared, ui, utils +from modules import loaders, presets, shared, ui, ui_chat, utils from modules.utils import gradio def create_ui(default_preset): generate_params = presets.load_preset(default_preset) with gr.Tab("Parameters", elem_id="parameters"): - with gr.Row(): - with gr.Column(): - with gr.Row(): - shared.gradio['preset_menu'] = gr.Dropdown(choices=utils.get_available_presets(), value=default_preset, label='Generation parameters preset', elem_classes='slim-dropdown') - ui.create_refresh_button(shared.gradio['preset_menu'], lambda: None, lambda: {'choices': 
utils.get_available_presets()}, 'refresh-button') - shared.gradio['save_preset'] = gr.Button('💾', elem_classes='refresh-button') - shared.gradio['delete_preset'] = gr.Button('🗑️', elem_classes='refresh-button') - - with gr.Column(): - shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()), value="All", elem_classes='slim-dropdown') - - with gr.Row(): - with gr.Column(): - with gr.Box(): + with gr.Tab("Generation"): + with gr.Row(): + with gr.Column(): with gr.Row(): - with gr.Column(): - shared.gradio['temperature'] = gr.Slider(0.01, 1.99, value=generate_params['temperature'], step=0.01, label='temperature') - shared.gradio['top_p'] = gr.Slider(0.0, 1.0, value=generate_params['top_p'], step=0.01, label='top_p') - shared.gradio['top_k'] = gr.Slider(0, 200, value=generate_params['top_k'], step=1, label='top_k') - shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, value=generate_params['typical_p'], step=0.01, label='typical_p') - shared.gradio['epsilon_cutoff'] = gr.Slider(0, 9, value=generate_params['epsilon_cutoff'], step=0.01, label='epsilon_cutoff') - shared.gradio['eta_cutoff'] = gr.Slider(0, 20, value=generate_params['eta_cutoff'], step=0.01, label='eta_cutoff') - shared.gradio['tfs'] = gr.Slider(0.0, 1.0, value=generate_params['tfs'], step=0.01, label='tfs') - shared.gradio['top_a'] = gr.Slider(0.0, 1.0, value=generate_params['top_a'], step=0.01, label='top_a') + shared.gradio['preset_menu'] = gr.Dropdown(choices=utils.get_available_presets(), value=default_preset, label='Preset', elem_classes='slim-dropdown') + ui.create_refresh_button(shared.gradio['preset_menu'], lambda: None, lambda: {'choices': utils.get_available_presets()}, 'refresh-button') + shared.gradio['save_preset'] = gr.Button('💾', elem_classes='refresh-button') + shared.gradio['delete_preset'] = gr.Button('🗑️', elem_classes='refresh-button') - with gr.Column(): - shared.gradio['repetition_penalty'] = gr.Slider(1.0, 1.5, value=generate_params['repetition_penalty'], step=0.01, label='repetition_penalty') - shared.gradio['repetition_penalty_range'] = gr.Slider(0, 4096, step=64, value=generate_params['repetition_penalty_range'], label='repetition_penalty_range') - shared.gradio['encoder_repetition_penalty'] = gr.Slider(0.8, 1.5, value=generate_params['encoder_repetition_penalty'], step=0.01, label='encoder_repetition_penalty') - shared.gradio['no_repeat_ngram_size'] = gr.Slider(0, 20, step=1, value=generate_params['no_repeat_ngram_size'], label='no_repeat_ngram_size') - shared.gradio['min_length'] = gr.Slider(0, 2000, step=1, value=generate_params['min_length'], label='min_length') - shared.gradio['seed'] = gr.Number(value=shared.settings['seed'], label='Seed (-1 for random)') - shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample') + with gr.Column(): + shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()), value="All", elem_classes='slim-dropdown') - with gr.Accordion("Learn more", open=False): - gr.Markdown(""" + with gr.Row(): + with gr.Column(): + with gr.Box(): + with gr.Row(): + with gr.Column(): + shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) + shared.gradio['temperature'] = gr.Slider(0.01, 1.99, value=generate_params['temperature'], step=0.01, 
label='temperature') + shared.gradio['top_p'] = gr.Slider(0.0, 1.0, value=generate_params['top_p'], step=0.01, label='top_p') + shared.gradio['top_k'] = gr.Slider(0, 200, value=generate_params['top_k'], step=1, label='top_k') + shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, value=generate_params['typical_p'], step=0.01, label='typical_p') + shared.gradio['epsilon_cutoff'] = gr.Slider(0, 9, value=generate_params['epsilon_cutoff'], step=0.01, label='epsilon_cutoff') + shared.gradio['eta_cutoff'] = gr.Slider(0, 20, value=generate_params['eta_cutoff'], step=0.01, label='eta_cutoff') + shared.gradio['tfs'] = gr.Slider(0.0, 1.0, value=generate_params['tfs'], step=0.01, label='tfs') + shared.gradio['top_a'] = gr.Slider(0.0, 1.0, value=generate_params['top_a'], step=0.01, label='top_a') - For a technical description of the parameters, the [transformers documentation](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig) is a good reference. + with gr.Column(): + shared.gradio['repetition_penalty'] = gr.Slider(1.0, 1.5, value=generate_params['repetition_penalty'], step=0.01, label='repetition_penalty') + shared.gradio['repetition_penalty_range'] = gr.Slider(0, 4096, step=64, value=generate_params['repetition_penalty_range'], label='repetition_penalty_range') + shared.gradio['encoder_repetition_penalty'] = gr.Slider(0.8, 1.5, value=generate_params['encoder_repetition_penalty'], step=0.01, label='encoder_repetition_penalty') + shared.gradio['no_repeat_ngram_size'] = gr.Slider(0, 20, step=1, value=generate_params['no_repeat_ngram_size'], label='no_repeat_ngram_size') + shared.gradio['min_length'] = gr.Slider(0, 2000, step=1, value=generate_params['min_length'], label='min_length') + shared.gradio['seed'] = gr.Number(value=shared.settings['seed'], label='Seed (-1 for random)') + shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample') - The best presets, according to the [Preset Arena](https://github.com/oobabooga/oobabooga.github.io/blob/main/arena/results.md) experiment, are: + with gr.Accordion("Learn more", open=False): + gr.Markdown(""" - * Instruction following: - 1) Divine Intellect - 2) Big O - 3) simple-1 - 4) Space Alien - 5) StarChat - 6) Titanic - 7) tfs-with-top-a - 8) Asterism - 9) Contrastive Search + For a technical description of the parameters, the [transformers documentation](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig) is a good reference. - * Chat: - 1) Midnight Enigma - 2) Yara - 3) Shortwave + The best presets, according to the [Preset Arena](https://github.com/oobabooga/oobabooga.github.io/blob/main/arena/results.md) experiment, are: - ### Temperature - Primary factor to control randomness of outputs. 0 = deterministic (only the most likely token is used). Higher value = more randomness. - ### top_p - If not set to 1, select tokens with probabilities adding up to less than this number. Higher value = higher range of possible random results. - ### top_k - Similar to top_p, but select instead only the top_k most likely tokens. Higher value = higher range of possible random results. - ### typical_p - If not set to 1, select only tokens that are at least this much more likely to appear than random tokens, given the prior text. - ### epsilon_cutoff - In units of 1e-4; a reasonable value is 3. This sets a probability floor below which tokens are excluded from being sampled. Should be used with top_p, top_k, and eta_cutoff set to 0. 
- ### eta_cutoff - In units of 1e-4; a reasonable value is 3. Should be used with top_p, top_k, and epsilon_cutoff set to 0. - ### repetition_penalty - Exponential penalty factor for repeating prior tokens. 1 means no penalty, higher value = less repetition, lower value = more repetition. - ### repetition_penalty_range - The number of most recent tokens to consider for repetition penalty. 0 makes all tokens be used. - ### encoder_repetition_penalty - Also known as the "Hallucinations filter". Used to penalize tokens that are *not* in the prior text. Higher value = more likely to stay in context, lower value = more likely to diverge. - ### no_repeat_ngram_size - If not set to 0, specifies the length of token sets that are completely blocked from repeating at all. Higher values = blocks larger phrases, lower values = blocks words or letters from repeating. Only 0 or high values are a good idea in most cases. - ### min_length - Minimum generation length in tokens. - ### penalty_alpha - Contrastive Search is enabled by setting this to greater than zero and unchecking "do_sample". It should be used with a low value of top_k, for instance, top_k = 4. + * Instruction following: + 1) Divine Intellect + 2) Big O + 3) simple-1 + 4) Space Alien + 5) StarChat + 6) Titanic + 7) tfs-with-top-a + 8) Asterism + 9) Contrastive Search - """, elem_classes="markdown") + * Chat: + 1) Midnight Enigma + 2) Yara + 3) Shortwave - with gr.Column(): - create_chat_settings_menus() - with gr.Box(): - with gr.Row(): - with gr.Column(): - shared.gradio['guidance_scale'] = gr.Slider(-0.5, 2.5, step=0.05, value=generate_params['guidance_scale'], label='guidance_scale', info='For CFG. 1.5 is a good value.') - shared.gradio['negative_prompt'] = gr.Textbox(value=shared.settings['negative_prompt'], label='Negative prompt') - shared.gradio['mirostat_mode'] = gr.Slider(0, 2, step=1, value=generate_params['mirostat_mode'], label='mirostat_mode', info='mode=1 is for llama.cpp only.') - shared.gradio['mirostat_tau'] = gr.Slider(0, 10, step=0.01, value=generate_params['mirostat_tau'], label='mirostat_tau') - shared.gradio['mirostat_eta'] = gr.Slider(0, 1, step=0.01, value=generate_params['mirostat_eta'], label='mirostat_eta') + ### Temperature + Primary factor to control randomness of outputs. 0 = deterministic (only the most likely token is used). Higher value = more randomness. + ### top_p + If not set to 1, select tokens with probabilities adding up to less than this number. Higher value = higher range of possible random results. + ### top_k + Similar to top_p, but select instead only the top_k most likely tokens. Higher value = higher range of possible random results. + ### typical_p + If not set to 1, select only tokens that are at least this much more likely to appear than random tokens, given the prior text. + ### epsilon_cutoff + In units of 1e-4; a reasonable value is 3. This sets a probability floor below which tokens are excluded from being sampled. Should be used with top_p, top_k, and eta_cutoff set to 0. + ### eta_cutoff + In units of 1e-4; a reasonable value is 3. Should be used with top_p, top_k, and epsilon_cutoff set to 0. + ### repetition_penalty + Exponential penalty factor for repeating prior tokens. 1 means no penalty, higher value = less repetition, lower value = more repetition. + ### repetition_penalty_range + The number of most recent tokens to consider for repetition penalty. 0 makes all tokens be used. + ### encoder_repetition_penalty + Also known as the "Hallucinations filter". 
Used to penalize tokens that are *not* in the prior text. Higher value = more likely to stay in context, lower value = more likely to diverge. + ### no_repeat_ngram_size + If not set to 0, specifies the length of token sets that are completely blocked from repeating at all. Higher values = blocks larger phrases, lower values = blocks words or letters from repeating. Only 0 or high values are a good idea in most cases. + ### min_length + Minimum generation length in tokens. + ### penalty_alpha + Contrastive Search is enabled by setting this to greater than zero and unchecking "do_sample". It should be used with a low value of top_k, for instance, top_k = 4. - with gr.Column(): - shared.gradio['penalty_alpha'] = gr.Slider(0, 5, value=generate_params['penalty_alpha'], label='penalty_alpha', info='For Contrastive Search. do_sample must be unchecked.') + """, elem_classes="markdown") - shared.gradio['num_beams'] = gr.Slider(1, 20, step=1, value=generate_params['num_beams'], label='num_beams', info='For Beam Search, along with length_penalty and early_stopping.') - shared.gradio['length_penalty'] = gr.Slider(-5, 5, value=generate_params['length_penalty'], label='length_penalty') - shared.gradio['early_stopping'] = gr.Checkbox(value=generate_params['early_stopping'], label='early_stopping') + with gr.Column(): + with gr.Box(): + with gr.Row(): + with gr.Column(): + shared.gradio['guidance_scale'] = gr.Slider(-0.5, 2.5, step=0.05, value=generate_params['guidance_scale'], label='guidance_scale', info='For CFG. 1.5 is a good value.') + shared.gradio['negative_prompt'] = gr.Textbox(value=shared.settings['negative_prompt'], label='Negative prompt') + shared.gradio['mirostat_mode'] = gr.Slider(0, 2, step=1, value=generate_params['mirostat_mode'], label='mirostat_mode', info='mode=1 is for llama.cpp only.') + shared.gradio['mirostat_tau'] = gr.Slider(0, 10, step=0.01, value=generate_params['mirostat_tau'], label='mirostat_tau') + shared.gradio['mirostat_eta'] = gr.Slider(0, 1, step=0.01, value=generate_params['mirostat_eta'], label='mirostat_eta') - with gr.Box(): - with gr.Row(): - with gr.Column(): - shared.gradio['truncation_length'] = gr.Slider(value=shared.settings['truncation_length'], minimum=shared.settings['truncation_length_min'], maximum=shared.settings['truncation_length_max'], step=256, label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. Most models require this to be at most 2048.') - shared.gradio['custom_stopping_strings'] = gr.Textbox(lines=1, value=shared.settings["custom_stopping_strings"] or None, label='Custom stopping strings', info='In addition to the defaults. Written between "" and separated by commas. For instance: "\\nYour Assistant:", "\\nThe assistant:"') - with gr.Column(): - shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.') - shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.') - shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.') + with gr.Column(): + shared.gradio['penalty_alpha'] = gr.Slider(0, 5, value=generate_params['penalty_alpha'], label='penalty_alpha', info='For Contrastive Search. 
do_sample must be unchecked.') - shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.') - shared.gradio['stream'] = gr.Checkbox(value=not shared.args.no_stream, label='Activate text streaming') + shared.gradio['num_beams'] = gr.Slider(1, 20, step=1, value=generate_params['num_beams'], label='num_beams', info='For Beam Search, along with length_penalty and early_stopping.') + shared.gradio['length_penalty'] = gr.Slider(-5, 5, value=generate_params['length_penalty'], label='length_penalty') + shared.gradio['early_stopping'] = gr.Checkbox(value=generate_params['early_stopping'], label='early_stopping') + + with gr.Box(): + with gr.Row(): + with gr.Column(): + shared.gradio['truncation_length'] = gr.Slider(value=shared.settings['truncation_length'], minimum=shared.settings['truncation_length_min'], maximum=shared.settings['truncation_length_max'], step=256, label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. Most models require this to be at most 2048.') + shared.gradio['custom_stopping_strings'] = gr.Textbox(lines=1, value=shared.settings["custom_stopping_strings"] or None, label='Custom stopping strings', info='In addition to the defaults. Written between "" and separated by commas. For instance: "\\nYour Assistant:", "\\nThe assistant:"') + with gr.Column(): + shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.') + shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.') + shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.') + + shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.') + shared.gradio['stream'] = gr.Checkbox(value=not shared.args.no_stream, label='Activate text streaming') + + ui_chat.create_chat_settings_ui() def create_event_handlers(): shared.gradio['filter_by_loader'].change(loaders.blacklist_samplers, gradio('filter_by_loader'), gradio(loaders.list_all_samplers()), show_progress=False) shared.gradio['preset_menu'].change(presets.load_preset_for_ui, gradio('preset_menu', 'interface_state'), gradio('interface_state') + gradio(presets.presets_params())) - - -def create_chat_settings_menus(): - if not shared.is_chat(): - return - - with gr.Box(): - gr.Markdown("Chat parameters") - with gr.Row(): - with gr.Column(): - shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) - shared.gradio['chat_generation_attempts'] = gr.Slider(minimum=shared.settings['chat_generation_attempts_min'], maximum=shared.settings['chat_generation_attempts_max'], value=shared.settings['chat_generation_attempts'], step=1, label='Generation attempts (for longer replies)', info='New generations will be called until either this number is reached or no new content is generated between two iterations.') - - with gr.Column(): - shared.gradio['stop_at_newline'] = 
gr.Checkbox(value=shared.settings['stop_at_newline'], label='Stop generating at new line character') diff --git a/modules/ui_session.py b/modules/ui_session.py index 7a1a32b0..3d0fdac6 100644 --- a/modules/ui_session.py +++ b/modules/ui_session.py @@ -7,35 +7,21 @@ from modules.utils import gradio def create_ui(): with gr.Tab("Session", elem_id="session-tab"): - modes = ["default", "notebook", "chat"] - current_mode = "default" - for mode in modes[1:]: - if getattr(shared.args, mode): - current_mode = mode - break - - cmd_list = vars(shared.args) - bool_list = sorted([k for k in cmd_list if type(cmd_list[k]) is bool and k not in modes + ui.list_model_elements()]) - bool_active = [k for k in bool_list if vars(shared.args)[k]] - with gr.Row(): - with gr.Column(): - with gr.Row(): - shared.gradio['interface_modes_menu'] = gr.Dropdown(choices=modes, value=current_mode, label="Mode", elem_classes='slim-dropdown') - shared.gradio['reset_interface'] = gr.Button("Apply and restart", elem_classes="small-button", variant="primary") - shared.gradio['toggle_dark_mode'] = gr.Button('Toggle 💡', elem_classes="small-button") + shared.gradio['reset_interface'] = gr.Button("Apply and restart") + shared.gradio['toggle_dark_mode'] = gr.Button('Toggle 💡') with gr.Row(): with gr.Column(): shared.gradio['extensions_menu'] = gr.CheckboxGroup(choices=utils.get_available_extensions(), value=shared.args.extensions, label="Available extensions", info='Note that some of these extensions may require manually installing Python requirements through the command: pip install -r extensions/extension_name/requirements.txt', elem_classes='checkboxgroup-table') with gr.Column(): - shared.gradio['bool_menu'] = gr.CheckboxGroup(choices=bool_list, value=bool_active, label="Boolean command-line flags", elem_classes='checkboxgroup-table') + shared.gradio['bool_menu'] = gr.CheckboxGroup(choices=get_boolean_arguments(), value=get_boolean_arguments(active=True), label="Boolean command-line flags", elem_classes='checkboxgroup-table') with gr.Column(): if not shared.args.multi_user: - shared.gradio['save_session'] = gr.Button('Save session', elem_id="save_session") + shared.gradio['save_session'] = gr.Button('Save session') shared.gradio['load_session'] = gr.File(type='binary', file_types=['.json'], label="Upload Session JSON") extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.') @@ -47,25 +33,33 @@ def create_ui(): # Reset interface event shared.gradio['reset_interface'].click( - set_interface_arguments, gradio('interface_modes_menu', 'extensions_menu', 'bool_menu'), None).then( + set_interface_arguments, gradio('extensions_menu', 'bool_menu'), None).then( lambda: None, None, None, _js='() => {document.body.innerHTML=\'
<h1 style="font-family:monospace">Reloading...</h1>
    \'; setTimeout(function(){location.reload()},2500); return []}') shared.gradio['toggle_dark_mode'].click(lambda: None, None, None, _js='() => {document.getElementsByTagName("body")[0].classList.toggle("dark")}') -def set_interface_arguments(interface_mode, extensions, bool_active): - modes = ["default", "notebook", "chat", "cai_chat"] - cmd_list = vars(shared.args) - bool_list = [k for k in cmd_list if type(cmd_list[k]) is bool and k not in modes] - +def set_interface_arguments(extensions, bool_active): shared.args.extensions = extensions - for k in modes[1:]: - setattr(shared.args, k, False) - if interface_mode != "default": - setattr(shared.args, interface_mode, True) + + bool_list = get_boolean_arguments() + for k in bool_list: setattr(shared.args, k, False) for k in bool_active: setattr(shared.args, k, True) shared.need_restart = True + + +def get_boolean_arguments(active=False): + exclude = ["default", "notebook", "chat"] + + cmd_list = vars(shared.args) + bool_list = sorted([k for k in cmd_list if type(cmd_list[k]) is bool and k not in exclude + ui.list_model_elements()]) + bool_active = [k for k in bool_list if vars(shared.args)[k]] + + if active: + return bool_active + else: + return bool_list diff --git a/modules/utils.py b/modules/utils.py index 011c71f1..6fa94730 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -9,7 +9,7 @@ from modules.logging_colors import logger # Helper function to get multiple values from shared.gradio def gradio(*keys): - if len(keys) == 1 and type(keys[0]) is list: + if len(keys) == 1 and type(keys[0]) in [list, tuple]: keys = keys[0] return [shared.gradio[k] for k in keys] diff --git a/server.py b/server.py index b477d4c1..e86e3338 100644 --- a/server.py +++ b/server.py @@ -69,28 +69,28 @@ def create_interface(): # Force some events to be triggered on page load shared.persistent_interface_state.update({ 'loader': shared.args.loader or 'Transformers', + 'mode': shared.settings['mode'], + 'character_menu': shared.args.character or shared.settings['character'], + 'instruction_template': shared.settings['instruction_template'] }) - if shared.is_chat(): - shared.persistent_interface_state.update({ - 'mode': shared.settings['mode'], - 'character_menu': shared.args.character or shared.settings['character'], - 'instruction_template': shared.settings['instruction_template'] - }) - if Path("cache/pfp_character.png").exists(): - Path("cache/pfp_character.png").unlink() + if Path("cache/pfp_character.png").exists(): + Path("cache/pfp_character.png").unlink() # css/js strings - css = ui.css if not shared.is_chat() else ui.css + ui.chat_css - js = ui.main_js + css = ui.css + js = ui.js css += apply_extensions('css') js += apply_extensions('js') - # The input elements for the generation functions + # Interface state elements shared.input_elements = ui.list_interface_input_elements() with gr.Blocks(css=css, analytics_enabled=False, title=title, theme=ui.theme) as shared.gradio['interface']: + # Interface state + shared.gradio['interface_state'] = gr.State({k: None for k in shared.input_elements}) + # Audio notification if Path("notification.mp3").exists(): shared.gradio['audio_notification'] = gr.Audio(interactive=False, value="notification.mp3", elem_id="audio_notification", visible=False) @@ -102,12 +102,9 @@ def create_interface(): shared.gradio['temporary_text'] = gr.Textbox(visible=False) # Text Generation tab - if shared.is_chat(): - ui_chat.create_ui() - elif shared.args.notebook: - ui_notebook.create_ui() - else: - ui_default.create_ui() + 
ui_chat.create_ui() + ui_default.create_ui() + ui_notebook.create_ui() ui_parameters.create_ui(shared.settings['preset']) # Parameters tab ui_model_menu.create_ui() # Model tab @@ -115,12 +112,9 @@ def create_interface(): ui_session.create_ui() # Session tab # Generation events - if shared.is_chat(): - ui_chat.create_event_handlers() - elif shared.args.notebook: - ui_notebook.create_event_handlers() - else: - ui_default.create_event_handlers() + ui_chat.create_event_handlers() + ui_default.create_event_handlers() + ui_notebook.create_event_handlers() # Other events ui_file_saving.create_event_handlers() @@ -130,11 +124,10 @@ def create_interface(): # Interface launch events if shared.settings['dark_theme']: shared.gradio['interface'].load(lambda: None, None, None, _js="() => document.getElementsByTagName('body')[0].classList.add('dark')") - + shared.gradio['interface'].load(lambda: None, None, None, _js=f"() => {{{js}}}") shared.gradio['interface'].load(partial(ui.apply_interface_values, {}, use_persistent=True), None, gradio(ui.list_interface_input_elements()), show_progress=False) - if shared.is_chat(): - shared.gradio['interface'].load(chat.redraw_html, shared.reload_inputs, gradio('display')) + shared.gradio['interface'].load(chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display')) extensions_module.create_extensions_tabs() # Extensions tabs extensions_module.create_extensions_block() # Extensions block @@ -190,16 +183,10 @@ if __name__ == "__main__": # Activate the extensions listed on settings.yaml extensions_module.available_extensions = utils.get_available_extensions() - if shared.is_chat(): - for extension in shared.settings['chat_default_extensions']: - shared.args.extensions = shared.args.extensions or [] - if extension not in shared.args.extensions: - shared.args.extensions.append(extension) - else: - for extension in shared.settings['default_extensions']: - shared.args.extensions = shared.args.extensions or [] - if extension not in shared.args.extensions: - shared.args.extensions.append(extension) + for extension in shared.settings['default_extensions']: + shared.args.extensions = shared.args.extensions or [] + if extension not in shared.args.extensions: + shared.args.extensions.append(extension) available_models = utils.get_available_models() diff --git a/settings-template.yaml b/settings-template.yaml index a0c53b33..b1d63c71 100644 --- a/settings-template.yaml +++ b/settings-template.yaml @@ -13,7 +13,6 @@ context: This is a conversation with your Assistant. It is a computer program de greeting: '' turn_template: '' custom_stopping_strings: '' -stop_at_newline: false add_bos_token: true ban_eos_token: false skip_special_tokens: true @@ -28,11 +27,7 @@ chat-instruct_command: |- Continue the chat dialogue below. Write a single reply for the character "<|character|>". 
<|prompt|> -chat_generation_attempts: 1 -chat_generation_attempts_min: 1 -chat_generation_attempts_max: 10 -default_extensions: [] -chat_default_extensions: +default_extensions: - gallery preset: simple-1 prompt: QA From 919a3cf9d093fe0abd0657a4375ddc3c0b37376e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 13 Aug 2023 05:43:09 -0700 Subject: [PATCH 074/169] Fix the gallery --- extensions/gallery/script.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extensions/gallery/script.js b/extensions/gallery/script.js index 878401ec..0f25b4ea 100644 --- a/extensions/gallery/script.js +++ b/extensions/gallery/script.js @@ -7,7 +7,7 @@ main_parent.addEventListener('click', function(e) { // Only show this extension in the Chat tab if (chat_visible) { - gallery_element.style.display = 'flex'; + gallery_element.style.display = 'block'; } else { gallery_element.style.display = 'none'; } From f6db2c78d11704ffe04ec21f91259e28edc0cd56 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 13 Aug 2023 05:48:53 -0700 Subject: [PATCH 075/169] Fix ctransformers seed --- modules/ctransformers_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ctransformers_model.py b/modules/ctransformers_model.py index f5641616..74c4018a 100644 --- a/modules/ctransformers_model.py +++ b/modules/ctransformers_model.py @@ -56,7 +56,7 @@ class CtransformersModel: top_k=state['top_k'], repetition_penalty=state['repetition_penalty'], last_n_tokens=state['repetition_penalty_range'], - seed=state['seed'] + seed=int(state['seed']) ) output = "" From 3ae2cee446b075a20937a60d4e49d42bebf4a744 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 13 Aug 2023 06:09:27 -0700 Subject: [PATCH 076/169] Fix empty space when the gallery is hidden --- extensions/gallery/script.js | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/extensions/gallery/script.js b/extensions/gallery/script.js index 0f25b4ea..23acfbf6 100644 --- a/extensions/gallery/script.js +++ b/extensions/gallery/script.js @@ -1,4 +1,7 @@ let gallery_element = document.getElementById('gallery-extension'); +let extensions_block = gallery_element.parentElement; +let extensions_block_size = extensions_block.childNodes.length; +let gallery_only = (extensions_block_size == 5); main_parent.addEventListener('click', function(e) { let chat_visible = (chat_tab.offsetHeight > 0 && chat_tab.offsetWidth > 0); @@ -8,7 +11,13 @@ main_parent.addEventListener('click', function(e) { // Only show this extension in the Chat tab if (chat_visible) { gallery_element.style.display = 'block'; + if (gallery_only) { + extensions_block.style.display = ''; + } } else { gallery_element.style.display = 'none'; + if (gallery_only) { + extensions_block.style.display = 'none'; + } } }); From 4a05aa92cb60d82623a5484fef4328e8d77fc1b6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 13 Aug 2023 18:14:09 -0700 Subject: [PATCH 077/169] Add "send to" buttons for instruction templates - Remove instruction templates from prompt dropdowns (default/notebook) - Add 3 buttons to Parameters > Instruction template as a replacement - Increase the number of lines of 'negative prompt' field to 3, and add a scrollbar - When uploading a character, switch to the Character tab - When uploading chat history, switch to the Chat tab --- js/switch_tabs.js | 31 +++++++++++++++++++++++++++++ 
modules/prompts.py | 42 ++++++++++++++++++++-------------------- modules/ui.py | 2 ++ modules/ui_chat.py | 25 ++++++++++++++++++++---- modules/ui_parameters.py | 2 +- modules/utils.py | 1 - 6 files changed, 76 insertions(+), 27 deletions(-) create mode 100644 js/switch_tabs.js diff --git a/js/switch_tabs.js b/js/switch_tabs.js new file mode 100644 index 00000000..ed6c653d --- /dev/null +++ b/js/switch_tabs.js @@ -0,0 +1,31 @@ +let chat_tab = document.getElementById('chat-tab'); +let main_parent = chat_tab.parentNode; + +function switch_to_chat() { + let chat_tab_button = main_parent.childNodes[0].childNodes[1]; + chat_tab_button.click(); +} + +function switch_to_default() { + let default_tab_button = main_parent.childNodes[0].childNodes[4]; + default_tab_button.click(); +} + +function switch_to_notebook() { + let notebook_tab_button = main_parent.childNodes[0].childNodes[7]; + notebook_tab_button.click(); +} + +function switch_to_generation_parameters() { + let parameters_tab_button = main_parent.childNodes[0].childNodes[10]; + let generation_tab_button = document.getElementById('character-menu').parentNode.parentNode.parentNode.parentNode.parentNode.parentNode.childNodes[0].childNodes[1]; + parameters_tab_button.click(); + generation_tab_button.click(); +} + +function switch_to_character() { + let parameters_tab_button = main_parent.childNodes[0].childNodes[10]; + let character_tab_button = document.getElementById('character-menu').parentNode.parentNode.parentNode.parentNode.parentNode.parentNode.childNodes[0].childNodes[4]; + parameters_tab_button.click(); + character_tab_button.click(); +} diff --git a/modules/prompts.py b/modules/prompts.py index 8a3cf3e3..e7654fbf 100644 --- a/modules/prompts.py +++ b/modules/prompts.py @@ -1,4 +1,3 @@ -import re from pathlib import Path import yaml @@ -10,26 +9,6 @@ from modules.text_generation import get_encoded_length def load_prompt(fname): if fname in ['None', '']: return '' - elif fname.startswith('Instruct-'): - fname = re.sub('^Instruct-', '', fname) - file_path = Path(f'instruction-templates/{fname}.yaml') - if not file_path.exists(): - return '' - - with open(file_path, 'r', encoding='utf-8') as f: - data = yaml.safe_load(f) - output = '' - if 'context' in data: - output += data['context'] - - replacements = { - '<|user|>': data['user'], - '<|bot|>': data['bot'], - '<|user-message|>': 'Input', - } - - output += utils.replace_all(data['turn_template'].split('<|bot-message|>')[0], replacements) - return output.rstrip(' ') else: file_path = Path(f'prompts/{fname}.txt') if not file_path.exists(): @@ -43,6 +22,27 @@ def load_prompt(fname): return text +def load_instruction_prompt_simple(fname): + file_path = Path(f'instruction-templates/{fname}.yaml') + if not file_path.exists(): + return '' + + with open(file_path, 'r', encoding='utf-8') as f: + data = yaml.safe_load(f) + output = '' + if 'context' in data: + output += data['context'] + + replacements = { + '<|user|>': data['user'], + '<|bot|>': data['bot'], + '<|user-message|>': 'Input', + } + + output += utils.replace_all(data['turn_template'].split('<|bot-message|>')[0], replacements) + return output.rstrip(' ') + + def count_tokens(text): try: tokens = get_encoded_length(text) diff --git a/modules/ui.py b/modules/ui.py index e7817f73..a7d7811e 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -12,6 +12,8 @@ with open(Path(__file__).resolve().parent / '../js/main.js', 'r') as f: js = f.read() with open(Path(__file__).resolve().parent / '../js/save_files.js', 'r') as f: save_files_js = 
f.read() +with open(Path(__file__).resolve().parent / '../js/switch_tabs.js', 'r') as f: + switch_tabs_js = f.read() refresh_symbol = '🔄' delete_symbol = '🗑️' diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 76e70ed0..461cf811 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -5,7 +5,7 @@ from pathlib import Path import gradio as gr from PIL import Image -from modules import chat, shared, ui, utils +from modules import chat, prompts, shared, ui, utils from modules.html_generator import chat_html_wrapper from modules.text_generation import stop_everything_event from modules.utils import gradio @@ -83,6 +83,11 @@ def create_chat_settings_ui(): shared.gradio['name2_instruct'] = gr.Textbox(value='', lines=1, label='Bot string') shared.gradio['context_instruct'] = gr.Textbox(value='', lines=4, label='Context') shared.gradio['turn_template'] = gr.Textbox(value=shared.settings['turn_template'], lines=1, label='Turn template', info='Used to precisely define the placement of spaces and new line characters in instruction prompts.') + with gr.Row(): + shared.gradio['send_instruction_to_default'] = gr.Button('Send to default', elem_classes=['small-button']) + shared.gradio['send_instruction_to_notebook'] = gr.Button('Send to notebook', elem_classes=['small-button']) + shared.gradio['send_instruction_to_negative_prompt'] = gr.Button('Send to negative prompt', elem_classes=['small-button']) + with gr.Row(): shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=4, label='Command for chat-instruct mode', info='<|character|> gets replaced by the bot name, and <|prompt|> gets replaced by the regular chat prompt.', elem_classes=['add_scrollbar']) @@ -217,7 +222,7 @@ def create_event_handlers(): shared.gradio['load_chat_history'].upload( chat.load_history, gradio('load_chat_history', 'history'), gradio('history')).then( chat.redraw_html, gradio(reload_arr), gradio('display')).then( - None, None, None, _js='() => {alert("The history has been loaded.")}') + lambda: None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_chat()}}') shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, gradio('history'), gradio('textbox'), show_progress=False) @@ -245,11 +250,11 @@ def create_event_handlers(): shared.gradio['Submit character'].click( chat.upload_character, gradio('upload_json', 'upload_img_bot'), gradio('character_menu')).then( - None, None, None, _js='() => {alert("The character has been loaded.")}') + lambda: None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_character()}}') shared.gradio['Submit tavern character'].click( chat.upload_tavern_character, gradio('upload_img_tavern', 'tavern_json'), gradio('character_menu')).then( - None, None, None, _js='() => {alert("The character has been loaded.")}') + lambda: None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_character()}}') shared.gradio['upload_json'].upload(lambda: gr.update(interactive=True), None, gradio('Submit character')) shared.gradio['upload_json'].clear(lambda: gr.update(interactive=False), None, gradio('Submit character')) @@ -258,3 +263,15 @@ def create_event_handlers(): shared.gradio['your_picture'].change( chat.upload_your_profile_picture, gradio('your_picture'), None).then( partial(chat.redraw_html, reset_cache=True), gradio(reload_arr), gradio('display')) + + shared.gradio['send_instruction_to_default'].click( + prompts.load_instruction_prompt_simple, gradio('instruction_template'), gradio('textbox-default')).then( + lambda: 
None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_default()}}') + + shared.gradio['send_instruction_to_notebook'].click( + prompts.load_instruction_prompt_simple, gradio('instruction_template'), gradio('textbox-notebook')).then( + lambda: None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}') + + shared.gradio['send_instruction_to_negative_prompt'].click( + prompts.load_instruction_prompt_simple, gradio('instruction_template'), gradio('negative_prompt')).then( + lambda: None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_generation_parameters()}}') diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index 2f0c2efd..c6d38804 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -98,7 +98,7 @@ def create_ui(default_preset): with gr.Row(): with gr.Column(): shared.gradio['guidance_scale'] = gr.Slider(-0.5, 2.5, step=0.05, value=generate_params['guidance_scale'], label='guidance_scale', info='For CFG. 1.5 is a good value.') - shared.gradio['negative_prompt'] = gr.Textbox(value=shared.settings['negative_prompt'], label='Negative prompt') + shared.gradio['negative_prompt'] = gr.Textbox(value=shared.settings['negative_prompt'], label='Negative prompt', lines=3, elem_classes=['add_scrollbar']) shared.gradio['mirostat_mode'] = gr.Slider(0, 2, step=1, value=generate_params['mirostat_mode'], label='mirostat_mode', info='mode=1 is for llama.cpp only.') shared.gradio['mirostat_tau'] = gr.Slider(0, 10, step=0.01, value=generate_params['mirostat_tau'], label='mirostat_tau') shared.gradio['mirostat_eta'] = gr.Slider(0, 1, step=0.01, value=generate_params['mirostat_eta'], label='mirostat_eta') diff --git a/modules/utils.py b/modules/utils.py index 6fa94730..0a7edffa 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -88,7 +88,6 @@ def get_available_prompts(): files = set((k.stem for k in Path('prompts').glob('*.txt'))) prompts += sorted([k for k in files if re.match('^[0-9]', k)], key=natural_keys, reverse=True) prompts += sorted([k for k in files if re.match('^[^0-9]', k)], key=natural_keys) - prompts += ['Instruct-' + k for k in get_available_instruction_templates() if k != 'None'] prompts += ['None'] return prompts From c2692142199e7a15324db46e1ecec7a24543964a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 13 Aug 2023 18:45:13 -0700 Subject: [PATCH 078/169] CSS change to make buttons smaller --- css/main.css | 1 + 1 file changed, 1 insertion(+) diff --git a/css/main.css b/css/main.css index 5f293921..5432a9db 100644 --- a/css/main.css +++ b/css/main.css @@ -7,6 +7,7 @@ } .small-button { + min-width: 0 !important; max-width: 171px; height: 39.594px; align-self: end; From b8df4a436eebe741a0ab7852e4df317f862e947b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 13 Aug 2023 18:48:15 -0700 Subject: [PATCH 079/169] Scroll up when switching tabs --- js/switch_tabs.js | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/js/switch_tabs.js b/js/switch_tabs.js index ed6c653d..56279193 100644 --- a/js/switch_tabs.js +++ b/js/switch_tabs.js @@ -1,19 +1,29 @@ let chat_tab = document.getElementById('chat-tab'); let main_parent = chat_tab.parentNode; +function scrollToTop() { + window.scrollTo({ + top: 0, + // behavior: 'smooth' + }); +} + function switch_to_chat() { let chat_tab_button = main_parent.childNodes[0].childNodes[1]; chat_tab_button.click(); + scrollToTop(); } function switch_to_default() { let default_tab_button = 
main_parent.childNodes[0].childNodes[4]; default_tab_button.click(); + scrollToTop(); } function switch_to_notebook() { let notebook_tab_button = main_parent.childNodes[0].childNodes[7]; notebook_tab_button.click(); + scrollToTop(); } function switch_to_generation_parameters() { @@ -21,6 +31,7 @@ function switch_to_generation_parameters() { let generation_tab_button = document.getElementById('character-menu').parentNode.parentNode.parentNode.parentNode.parentNode.parentNode.childNodes[0].childNodes[1]; parameters_tab_button.click(); generation_tab_button.click(); + scrollToTop(); } function switch_to_character() { @@ -28,4 +39,5 @@ function switch_to_character() { let character_tab_button = document.getElementById('character-menu').parentNode.parentNode.parentNode.parentNode.parentNode.parentNode.childNodes[0].childNodes[4]; parameters_tab_button.click(); character_tab_button.click(); + scrollToTop(); } From 66c04c304deb89ecb8286e3dbcfda5d0c31b6a32 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Sun, 13 Aug 2023 22:09:03 -0400 Subject: [PATCH 080/169] Various ctransformers fixes (#3556) --------- Co-authored-by: cal066 --- README.md | 19 +++++++++++++++---- models/config.yaml | 14 ++++++++++++++ modules/ctransformers_model.py | 5 +++-- modules/loaders.py | 1 + 4 files changed, 33 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 73ae33bd..9201df13 100644 --- a/README.md +++ b/README.md @@ -205,7 +205,7 @@ Optionally, you can use the following command-line flags: | Flag | Description | |--------------------------------------------|-------------| -| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv | +| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv, ctransformers | #### Accelerate/transformers @@ -235,22 +235,33 @@ Optionally, you can use the following command-line flags: | `--quant_type QUANT_TYPE` | quant_type for 4-bit. Valid options: nf4, fp4. | | `--use_double_quant` | use_double_quant for 4-bit. | -#### llama.cpp +#### GGML (for llama.cpp and ctransformers) | Flag | Description | |-------------|-------------| | `--threads` | Number of threads to use. | | `--n_batch` | Maximum number of prompt tokens to batch together when calling llama_eval. | +| `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with BLAS. Set this to 1000000000 to offload all layers to the GPU. | +| `--n_ctx N_CTX` | Size of the prompt context. | + +#### llama.cpp + +| Flag | Description | +|-------------|-------------| | `--no-mmap` | Prevent mmap from being used. | | `--mlock` | Force the system to keep the model in RAM. | | `--cache-capacity CACHE_CAPACITY` | Maximum cache capacity. Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. | -| `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with BLAS. Set this to 1000000000 to offload all layers to the GPU. | -| `--n_ctx N_CTX` | Size of the prompt context. | | `--llama_cpp_seed SEED` | Seed for llama-cpp models. Default 0 (random). | | `--n_gqa N_GQA` | grouped-query attention. Must be 8 for llama-2 70b. | | `--rms_norm_eps RMS_NORM_EPS` | 5e-6 is a good value for llama-2 models. 
| | `--cpu` | Use the CPU version of llama-cpp-python instead of the GPU-accelerated version. | +#### ctransformers + +| Flag | Description | +|-------------|-------------| +| `--model_type MODEL_TYPE` | Model type of pre-quantized model. Currently gpt2, gptj, gpt_neox, falcon, llama, mpt, gpt_bigcode, dolly-v2, and replit are supported. | + #### AutoGPTQ | Flag | Description | diff --git a/models/config.yaml b/models/config.yaml index 3d5f48ff..ba12e8bc 100644 --- a/models/config.yaml +++ b/models/config.yaml @@ -10,6 +10,18 @@ model_type: 'llama' .*bloom: model_type: 'bloom' +.*gpt2: + model_type: 'gpt2' +.*falcon: + model_type: 'falcon' +.*mpt: + model_type: 'mpt' +.*(starcoder|starchat): + model_type: 'gpt_bigcode' +.*dolly-v2: + model_type: 'dolly-v2' +.*replit: + model_type: 'replit' llama-65b-gptq-3bit: groupsize: 'None' .*(4bit|int4): @@ -281,3 +293,5 @@ llama-65b-gptq-3bit: .*openchat: mode: 'instruct' instruction_template: 'OpenChat' +.*falcon.*-instruct: + mode: 'instruct' diff --git a/modules/ctransformers_model.py b/modules/ctransformers_model.py index 74c4018a..5e0f347c 100644 --- a/modules/ctransformers_model.py +++ b/modules/ctransformers_model.py @@ -18,6 +18,7 @@ class CtransformersModel: threads=shared.args.threads, gpu_layers=shared.args.n_gpu_layers, batch_size=shared.args.n_batch, + context_length=shared.args.n_ctx, stream=True ) @@ -31,7 +32,7 @@ class CtransformersModel: return result, result def model_type_is_auto(self): - return shared.args.model_type == "Auto" or shared.args.model_type == "None" + return shared.args.model_type is None or shared.args.model_type == "Auto" or shared.args.model_type == "None" def model_dir(self, path): if path.is_file(): @@ -48,7 +49,7 @@ class CtransformersModel: def generate(self, prompt, state, callback=None): prompt = prompt if type(prompt) is str else prompt.decode() # ctransformers uses -1 for random seed - generator = self.model._stream( + generator = self.model( prompt=prompt, max_new_tokens=state['max_new_tokens'], temperature=state['temperature'], diff --git a/modules/loaders.py b/modules/loaders.py index 2b3a50b3..f7288f90 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -92,6 +92,7 @@ loaders_and_params = OrderedDict({ 'llamacpp_HF_info', ], 'ctransformers': [ + 'n_ctx', 'n_gpu_layers', 'n_batch', 'threads', From cc7e6ef645186219b92865c6cc98f6eb59dd3abf Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 13 Aug 2023 19:24:09 -0700 Subject: [PATCH 081/169] Fix a CSS conflict --- css/html_4chan_style.css | 2 +- modules/html_generator.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/css/html_4chan_style.css b/css/html_4chan_style.css index 99ac6845..cef9f6eb 100644 --- a/css/html_4chan_style.css +++ b/css/html_4chan_style.css @@ -98,7 +98,7 @@ margin-right: 40px !important; } -#parent #container .message { +#parent #container .message_4chan { color: black; border: none; } \ No newline at end of file diff --git a/modules/html_generator.py b/modules/html_generator.py index 422beb30..eb1da374 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -100,7 +100,7 @@ def process_post(post, c): src = re.sub('>', '>', src) src = re.sub('(>>[0-9]*)', '\\1', src) src = re.sub('\n', '
<br>\n', src) - src = f'<blockquote class="message">{src}\n</blockquote>' + src = f'<blockquote class="message_4chan">{src}\n</blockquote>' src = f'<span class="name">Anonymous </span> <span class="number">No.{number}</span>\n{src}' return src @@ -141,7 +141,7 @@ def generate_4chan_html(f): output = output.split('\n') for i in range(len(output)): output[i] = re.sub(r'^(&gt;(.*?)(<br>|</div>))', r'<span class="greentext">\1</span>', output[i]) - output[i] = re.sub(r'^<blockquote class="message">(&gt;(.*?)(<br>|</div>))', r'<blockquote class="message"><span class="greentext">\1</span>', output[i]) + output[i] = re.sub(r'^<blockquote class="message_4chan">(&gt;(.*?)(<br>|</div>))', r'<blockquote class="message_4chan"><span class="greentext">
    \1', output[i]) output = '\n'.join(output) return output From ff9b5861c8c7595d5a2dab03610b13cbc2c760e4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 13 Aug 2023 21:10:47 -0700 Subject: [PATCH 082/169] Fix impersonate when some text is present (closes #3564) --- modules/chat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index e2bba18f..ffddb700 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -272,8 +272,8 @@ def impersonate_wrapper(text, start_with, state): yield text + '...' reply = None - for reply in generate_reply(prompt, state, stopping_strings=stopping_strings, is_chat=True): - yield reply.lstrip(' ') + for reply in generate_reply(prompt + text, state, stopping_strings=stopping_strings, is_chat=True): + yield (text + reply).lstrip(' ') if shared.stop_everything: return From a95e6f02cbcc88756777c2affa8b621c7fbbb525 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 13 Aug 2023 21:17:20 -0700 Subject: [PATCH 083/169] Add a placeholder for custom stopping strings --- modules/ui_parameters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index c6d38804..235fd0bf 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -114,7 +114,7 @@ def create_ui(default_preset): with gr.Row(): with gr.Column(): shared.gradio['truncation_length'] = gr.Slider(value=shared.settings['truncation_length'], minimum=shared.settings['truncation_length_min'], maximum=shared.settings['truncation_length_max'], step=256, label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. Most models require this to be at most 2048.') - shared.gradio['custom_stopping_strings'] = gr.Textbox(lines=1, value=shared.settings["custom_stopping_strings"] or None, label='Custom stopping strings', info='In addition to the defaults. Written between "" and separated by commas. For instance: "\\nYour Assistant:", "\\nThe assistant:"') + shared.gradio['custom_stopping_strings'] = gr.Textbox(lines=1, value=shared.settings["custom_stopping_strings"] or None, label='Custom stopping strings', info='In addition to the defaults. 
Written between "" and separated by commas.', placeholder='"\\n", "\\nYou:"') with gr.Column(): shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.') shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.') From 619cb4e78b1c5db39cbd65d2d5c631ec50f6ab42 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 14 Aug 2023 11:46:07 -0300 Subject: [PATCH 084/169] Add "save defaults to settings.yaml" button (#3574) --- README.md | 1 - modules/chat.py | 8 ++++---- modules/shared.py | 37 +++++++++++++++++++------------------ modules/ui.py | 25 ++++++++++++++++++++++++- modules/ui_chat.py | 4 ++-- modules/ui_default.py | 4 +--- modules/ui_notebook.py | 4 +--- modules/ui_parameters.py | 2 +- modules/ui_session.py | 12 ++++++++++-- server.py | 4 +++- settings-template.yaml | 33 +++++++++++++++++---------------- 11 files changed, 82 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index 9201df13..3c9996aa 100644 --- a/README.md +++ b/README.md @@ -196,7 +196,6 @@ Optionally, you can use the following command-line flags: | `--model-dir MODEL_DIR` | Path to directory with all the models. | | `--lora-dir LORA_DIR` | Path to directory with all the loras. | | `--model-menu` | Show a model menu in the terminal when the web UI is first launched. | -| `--no-stream` | Don't stream the text output in real time. | | `--settings SETTINGS_FILE` | Load the default interface settings from this yaml file. See `settings-template.yaml` for an example. If you create a file called `settings.yaml`, this file will be loaded by default without the need to use the `--settings` flag. | | `--extensions EXTENSIONS [EXTENSIONS ...]` | The list of extensions to load. If you want to load more than one extension, write the names separated by spaces. | | `--verbose` | Print the prompts to the terminal. | diff --git a/modules/chat.py b/modules/chat.py index ffddb700..dad3d8b3 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -261,7 +261,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess yield output -def impersonate_wrapper(text, start_with, state): +def impersonate_wrapper(text, state): if shared.model_name == 'None' or shared.model is None: logger.error("No model is loaded! 
Select one in the Model tab.") yield '' @@ -291,15 +291,15 @@ def generate_chat_reply(text, state, regenerate=False, _continue=False, loading_ # Same as above but returns HTML for the UI -def generate_chat_reply_wrapper(text, start_with, state, regenerate=False, _continue=False): - if start_with != '' and not _continue: +def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False): + if state['start_with'] != '' and not _continue: if regenerate: text, state['history'] = remove_last_message(state['history']) regenerate = False _continue = True send_dummy_message(text, state) - send_dummy_reply(start_with, state) + send_dummy_reply(state['start_with'], state) for i, history in enumerate(generate_chat_reply(text, state, regenerate, _continue, loading_message=True)): yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style']), history diff --git a/modules/shared.py b/modules/shared.py index 89b5f0cb..e36e2437 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -29,34 +29,35 @@ session_is_loading = False # UI defaults settings = { 'dark_theme': True, - 'autoload_model': False, + 'start_with': '', + 'mode': 'chat', + 'chat_style': 'TheEncrypted777', + 'character': 'None', + 'prompt-default': 'QA', + 'prompt-notebook': 'QA', + 'preset': 'simple-1', 'max_new_tokens': 200, 'max_new_tokens_min': 1, 'max_new_tokens_max': 4096, - 'auto_max_new_tokens': False, 'seed': -1, 'negative_prompt': '', - 'character': 'None', + 'truncation_length': 2048, + 'truncation_length_min': 0, + 'truncation_length_max': 16384, + 'custom_stopping_strings': '', + 'auto_max_new_tokens': False, + 'ban_eos_token': False, + 'add_bos_token': True, + 'skip_special_tokens': True, + 'stream': True, 'name1': 'You', 'name2': 'Assistant', 'context': 'This is a conversation with your Assistant. It is a computer program designed to help you with various tasks such as answering questions, providing recommendations, and helping with decision making. You can ask it anything you want and it will do its best to give you accurate and relevant information.', 'greeting': '', - 'turn_template': '', - 'custom_stopping_strings': '', - 'add_bos_token': True, - 'ban_eos_token': False, - 'skip_special_tokens': True, - 'truncation_length': 2048, - 'truncation_length_min': 0, - 'truncation_length_max': 16384, - 'mode': 'chat', - 'start_with': '', - 'chat_style': 'TheEncrypted777', 'instruction_template': 'None', 'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>', + 'autoload_model': False, 'default_extensions': ['gallery'], - 'preset': 'simple-1', - 'prompt': 'QA', } @@ -83,7 +84,7 @@ parser.add_argument('--lora', type=str, nargs="+", help='The list of LoRAs to lo parser.add_argument("--model-dir", type=str, default='models/', help="Path to directory with all the models") parser.add_argument("--lora-dir", type=str, default='loras/', help="Path to directory with all the loras") parser.add_argument('--model-menu', action='store_true', help='Show a model menu in the terminal when the web UI is first launched.') -parser.add_argument('--no-stream', action='store_true', help='Don\'t stream the text output in real time.') +parser.add_argument('--no-stream', action='store_true', help='DEPRECATED') parser.add_argument('--settings', type=str, help='Load the default interface settings from this yaml file. See settings-template.yaml for an example. 
If you create a file called settings.yaml, this file will be loaded by default without the need to use the --settings flag.') parser.add_argument('--extensions', type=str, nargs="+", help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.') parser.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.') @@ -181,7 +182,7 @@ args = parser.parse_args() args_defaults = parser.parse_args([]) # Deprecation warnings -for k in ['chat', 'notebook']: +for k in ['chat', 'notebook', 'no_stream']: if getattr(args, k): logger.warning(f'--{k} has been deprecated and will be removed soon. Please remove that flag.') diff --git a/modules/ui.py b/modules/ui.py index a7d7811e..19b9997f 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -1,7 +1,9 @@ +import copy from pathlib import Path import gradio as gr import torch +import yaml from modules import shared @@ -119,6 +121,7 @@ def list_interface_input_elements(): # Chat elements elements += [ 'textbox', + 'start_with', 'character_menu', 'history', 'name1', @@ -139,7 +142,9 @@ def list_interface_input_elements(): elements += [ 'textbox-notebook', 'textbox-default', - 'output_textbox' + 'output_textbox', + 'prompt_menu-default', + 'prompt_menu-notebook', ] # Model elements @@ -170,6 +175,24 @@ def apply_interface_values(state, use_persistent=False): return [state[k] if k in state else gr.update() for k in elements] +def save_settings(state, preset, instruction_template, extensions): + output = copy.deepcopy(shared.settings) + exclude = ['name1', 'name2', 'greeting', 'context', 'turn_template'] + for k in state: + if k in shared.settings and k not in exclude: + output[k] = state[k] + + output['preset'] = preset + output['prompt-default'] = state['prompt_menu-default'] + output['prompt-notebook'] = state['prompt_menu-notebook'] + output['character'] = state['character_menu'] + output['instruction_template'] = instruction_template + output['default_extensions'] = extensions + output['seed'] = int(output['seed']) + + return yaml.dump(output, sort_keys=False, width=float("inf")) + + class ToolButton(gr.Button, gr.components.IOComponent): """ Small button with single emoji as text, fits inside gradio forms diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 461cf811..d8179867 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -10,7 +10,7 @@ from modules.html_generator import chat_html_wrapper from modules.text_generation import stop_everything_event from modules.utils import gradio -inputs = ('Chat input', 'start_with', 'interface_state') +inputs = ('Chat input', 'interface_state') reload_arr = ('history', 'name1', 'name2', 'mode', 'chat_style') clear_arr = ('Clear history-confirm', 'Clear history', 'Clear history-cancel') @@ -82,7 +82,7 @@ def create_chat_settings_ui(): shared.gradio['name1_instruct'] = gr.Textbox(value='', lines=2, label='User string') shared.gradio['name2_instruct'] = gr.Textbox(value='', lines=1, label='Bot string') shared.gradio['context_instruct'] = gr.Textbox(value='', lines=4, label='Context') - shared.gradio['turn_template'] = gr.Textbox(value=shared.settings['turn_template'], lines=1, label='Turn template', info='Used to precisely define the placement of spaces and new line characters in instruction prompts.') + shared.gradio['turn_template'] = gr.Textbox(value='', lines=1, label='Turn template', info='Used to precisely define the placement of spaces and new line characters in instruction prompts.') with gr.Row(): 
shared.gradio['send_instruction_to_default'] = gr.Button('Send to default', elem_classes=['small-button']) shared.gradio['send_instruction_to_notebook'] = gr.Button('Send to notebook', elem_classes=['small-button']) diff --git a/modules/ui_default.py b/modules/ui_default.py index d26863bc..e4771fb8 100644 --- a/modules/ui_default.py +++ b/modules/ui_default.py @@ -13,13 +13,11 @@ outputs = ('output_textbox', 'html-default') def create_ui(): - default_text = load_prompt(shared.settings['prompt']) - with gr.Tab('Default', elem_id='default-tab'): shared.gradio['last_input-default'] = gr.State('') with gr.Row(): with gr.Column(): - shared.gradio['textbox-default'] = gr.Textbox(value=default_text, elem_classes=['textbox_default', 'add_scrollbar'], lines=27, label='Input') + shared.gradio['textbox-default'] = gr.Textbox(value='', elem_classes=['textbox_default', 'add_scrollbar'], lines=27, label='Input') with gr.Row(): shared.gradio['Generate-default'] = gr.Button('Generate', variant='primary') shared.gradio['Stop-default'] = gr.Button('Stop', elem_id='stop') diff --git a/modules/ui_notebook.py b/modules/ui_notebook.py index 7d6648d2..dba9039a 100644 --- a/modules/ui_notebook.py +++ b/modules/ui_notebook.py @@ -13,14 +13,12 @@ outputs = ('textbox-notebook', 'html-notebook') def create_ui(): - default_text = load_prompt(shared.settings['prompt']) - with gr.Tab('Notebook', elem_id='notebook-tab'): shared.gradio['last_input-notebook'] = gr.State('') with gr.Row(): with gr.Column(scale=4): with gr.Tab('Raw'): - shared.gradio['textbox-notebook'] = gr.Textbox(value=default_text, elem_classes=['textbox', 'add_scrollbar'], lines=27) + shared.gradio['textbox-notebook'] = gr.Textbox(value='', elem_classes=['textbox', 'add_scrollbar'], lines=27) with gr.Tab('Markdown'): shared.gradio['markdown_render-notebook'] = gr.Button('Render') diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index 235fd0bf..a0f95158 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -121,7 +121,7 @@ def create_ui(default_preset): shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.') shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.') - shared.gradio['stream'] = gr.Checkbox(value=not shared.args.no_stream, label='Activate text streaming') + shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming') ui_chat.create_chat_settings_ui() diff --git a/modules/ui_session.py b/modules/ui_session.py index 3d0fdac6..b774a207 100644 --- a/modules/ui_session.py +++ b/modules/ui_session.py @@ -9,8 +9,10 @@ def create_ui(): with gr.Tab("Session", elem_id="session-tab"): with gr.Row(): with gr.Column(): - shared.gradio['reset_interface'] = gr.Button("Apply and restart") - shared.gradio['toggle_dark_mode'] = gr.Button('Toggle 💡') + shared.gradio['reset_interface'] = gr.Button("Apply flags/extensions and restart") + with gr.Row(): + shared.gradio['toggle_dark_mode'] = gr.Button('Toggle 💡') + shared.gradio['save_settings'] = gr.Button('Save UI defaults to settings.yaml') with gr.Row(): with gr.Column(): @@ -37,6 +39,12 @@ def create_ui(): lambda: None, None, None, _js='() => {document.body.innerHTML=\'
<h1 style="font-family:monospace">Reloading...</h1>
    \'; setTimeout(function(){location.reload()},2500); return []}') shared.gradio['toggle_dark_mode'].click(lambda: None, None, None, _js='() => {document.getElementsByTagName("body")[0].classList.toggle("dark")}') + shared.gradio['save_settings'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + ui.save_settings, gradio('interface_state', 'preset_menu', 'instruction_template', 'extensions_menu'), gradio('save_contents')).then( + lambda: './', None, gradio('save_root')).then( + lambda: 'settings.yaml', None, gradio('save_filename')).then( + lambda: gr.update(visible=True), None, gradio('file_saver')) def set_interface_arguments(extensions, bool_active): diff --git a/server.py b/server.py index e86e3338..d90453a0 100644 --- a/server.py +++ b/server.py @@ -71,7 +71,9 @@ def create_interface(): 'loader': shared.args.loader or 'Transformers', 'mode': shared.settings['mode'], 'character_menu': shared.args.character or shared.settings['character'], - 'instruction_template': shared.settings['instruction_template'] + 'instruction_template': shared.settings['instruction_template'], + 'prompt_menu-default': shared.settings['prompt-default'], + 'prompt_menu-notebook': shared.settings['prompt-notebook'], }) if Path("cache/pfp_character.png").exists(): diff --git a/settings-template.yaml b/settings-template.yaml index b1d63c71..11cd1185 100644 --- a/settings-template.yaml +++ b/settings-template.yaml @@ -1,33 +1,34 @@ dark_theme: true -autoload_model: false +start_with: '' +mode: chat +chat_style: TheEncrypted777 +character: None +prompt-default: QA +prompt-notebook: QA +preset: simple-1 max_new_tokens: 200 max_new_tokens_min: 1 max_new_tokens_max: 4096 -auto_max_new_tokens: false seed: -1 negative_prompt: '' -character: None +truncation_length: 2048 +truncation_length_min: 0 +truncation_length_max: 16384 +custom_stopping_strings: '' +auto_max_new_tokens: false +ban_eos_token: false +add_bos_token: true +skip_special_tokens: true +stream: true name1: You name2: Assistant context: This is a conversation with your Assistant. It is a computer program designed to help you with various tasks such as answering questions, providing recommendations, and helping with decision making. You can ask it anything you want and it will do its best to give you accurate and relevant information. greeting: '' -turn_template: '' -custom_stopping_strings: '' -add_bos_token: true -ban_eos_token: false -skip_special_tokens: true -truncation_length: 2048 -truncation_length_min: 0 -truncation_length_max: 16384 -mode: chat -start_with: '' -chat_style: TheEncrypted777 instruction_template: None chat-instruct_command: |- Continue the chat dialogue below. Write a single reply for the character "<|character|>". 
<|prompt|> +autoload_model: false default_extensions: - gallery -preset: simple-1 -prompt: QA From 890b4abdad2a49279d645c0c5d0a6b891b12a9a0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 14 Aug 2023 07:55:52 -0700 Subject: [PATCH 085/169] Fix session saving --- modules/ui_file_saving.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/modules/ui_file_saving.py b/modules/ui_file_saving.py index 98165d67..4ccc3126 100644 --- a/modules/ui_file_saving.py +++ b/modules/ui_file_saving.py @@ -1,3 +1,4 @@ +import copy import json import gradio as gr @@ -78,7 +79,7 @@ def create_event_handlers(): if not shared.args.multi_user: shared.gradio['save_session'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: json.dumps(x, indent=4), gradio('interface_state'), gradio('temporary_text')).then( + save_session, gradio('interface_state'), gradio('temporary_text')).then( None, gradio('temporary_text'), None, _js=f"(contents) => {{{ui.save_files_js}; saveSession(contents)}}") shared.gradio['load_session'].upload( @@ -98,3 +99,11 @@ def load_session(file, state): state.update(data) return state + + +def save_session(state): + output = copy.deepcopy(state) + for key in ['prompt_menu-default', 'prompt_menu-notebook']: + del output[key] + + return json.dumps(output, indent=4) From 3e0a9f9cdb3b372c5f8bec314ec1a1a66aa0c43e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 14 Aug 2023 08:18:21 -0700 Subject: [PATCH 086/169] Refresh the character dropdown when saving/deleting a character --- modules/chat.py | 1 - modules/ui_file_saving.py | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index dad3d8b3..d83e9490 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -508,7 +508,6 @@ def load_character(character, name1, name2, instruct=False): context = shared.settings['context'] name2 = shared.settings['name2'] greeting = shared.settings['greeting'] - turn_template = shared.settings['turn_template'] return name1, name2, picture, greeting, context, turn_template.replace("\n", r"\n") diff --git a/modules/ui_file_saving.py b/modules/ui_file_saving.py index 4ccc3126..b4674426 100644 --- a/modules/ui_file_saving.py +++ b/modules/ui_file_saving.py @@ -54,12 +54,13 @@ def create_event_handlers(): shared.gradio['save_character_confirm'].click( chat.save_character, gradio('name2', 'greeting', 'context', 'character_picture', 'save_character_filename'), None).then( - lambda: gr.update(visible=False), None, gradio('character_saver')) + lambda: gr.update(visible=False), None, gradio('character_saver')).then( + lambda x: gr.update(choices=utils.get_available_characters(), value=x), gradio('save_character_filename'), gradio('character_menu')) shared.gradio['delete_character_confirm'].click( chat.delete_character, gradio('character_menu'), None).then( lambda: gr.update(visible=False), None, gradio('character_deleter')).then( - lambda: gr.update(choices=utils.get_available_characters()), None, gradio('character_menu')) + lambda: gr.update(choices=utils.get_available_characters(), value="None"), None, gradio('character_menu')) shared.gradio['save_character_cancel'].click(lambda: gr.update(visible=False), None, gradio('character_saver')) shared.gradio['delete_character_cancel'].click(lambda: gr.update(visible=False), None, gradio('character_deleter')) From 
d8a82d34ed1cf63f4190500bbbf478f8dd74d8f7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 14 Aug 2023 08:45:58 -0700 Subject: [PATCH 087/169] Improve a warning --- modules/shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/shared.py b/modules/shared.py index e36e2437..a2ee0b91 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -184,7 +184,7 @@ args_defaults = parser.parse_args([]) # Deprecation warnings for k in ['chat', 'notebook', 'no_stream']: if getattr(args, k): - logger.warning(f'--{k} has been deprecated and will be removed soon. Please remove that flag.') + logger.warning(f'The --{k} flag has been deprecated and will be removed soon. Please remove that flag.') # Security warnings if args.trust_remote_code: From 4d067e9b5213657e350f835056ef13028326de27 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 14 Aug 2023 09:39:06 -0700 Subject: [PATCH 088/169] Add back a variable to keep old extensions working --- modules/ui_chat.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/ui_chat.py b/modules/ui_chat.py index d8179867..fc850ea3 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -121,7 +121,10 @@ def create_chat_settings_ui(): def create_event_handlers(): gen_events = [] - shared.input_params = gradio(inputs) # Obsolete, kept for compatibility with old extensions + + # Obsolete variables, kept for compatibility with old extensions + shared.input_params = gradio(inputs) + shared.reload_inputs = gradio(reload_arr) gen_events.append(shared.gradio['Generate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( From 7e57b35b5e7c9a2cd20504633d2bcb5b4eaf600a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 14 Aug 2023 10:10:39 -0700 Subject: [PATCH 089/169] Clean up old code --- modules/ui_chat.py | 18 ++++++------------ modules/ui_default.py | 16 ++++------------ modules/ui_notebook.py | 13 ++++--------- 3 files changed, 14 insertions(+), 33 deletions(-) diff --git a/modules/ui_chat.py b/modules/ui_chat.py index fc850ea3..a3a4ccf0 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -120,53 +120,47 @@ def create_chat_settings_ui(): def create_event_handlers(): - gen_events = [] # Obsolete variables, kept for compatibility with old extensions shared.input_params = gradio(inputs) shared.reload_inputs = gradio(reload_arr) - gen_events.append(shared.gradio['Generate'].click( + shared.gradio['Generate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') - ) - gen_events.append(shared.gradio['textbox'].submit( + shared.gradio['textbox'].submit( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), 
show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') - ) - gen_events.append(shared.gradio['Regenerate'].click( + shared.gradio['Regenerate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( partial(chat.generate_chat_reply_wrapper, regenerate=True), gradio(inputs), gradio('display', 'history'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') - ) - gen_events.append(shared.gradio['Continue'].click( + shared.gradio['Continue'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( partial(chat.generate_chat_reply_wrapper, _continue=True), gradio(inputs), gradio('display', 'history'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.save_persistent_history, gradio('history', 'character_menu', 'mode'), None).then( lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') - ) - gen_events.append(shared.gradio['Impersonate'].click( + shared.gradio['Impersonate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: x, gradio('textbox'), gradio('Chat input'), show_progress=False).then( chat.impersonate_wrapper, gradio(inputs), gradio('textbox'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') - ) shared.gradio['Replace last reply'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( @@ -211,7 +205,7 @@ def create_event_handlers(): chat.redraw_html, gradio(reload_arr), gradio('display')) shared.gradio['Stop'].click( - stop_everything_event, None, None, queue=False, cancels=gen_events if shared.args.no_stream else None).then( + stop_everything_event, None, None, queue=False).then( chat.redraw_html, gradio(reload_arr), gradio('display')) shared.gradio['mode'].change( diff --git a/modules/ui_default.py b/modules/ui_default.py index e4771fb8..99657227 100644 --- a/modules/ui_default.py +++ b/modules/ui_default.py @@ -45,36 +45,28 @@ def create_ui(): def create_event_handlers(): - gen_events = [] - - gen_events.append(shared.gradio['Generate-default'].click( + shared.gradio['Generate-default'].click( lambda x: x, gradio('textbox-default'), gradio('last_input-default')).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') - # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") - ) - gen_events.append(shared.gradio['textbox-default'].submit( + shared.gradio['textbox-default'].submit( lambda x: x, gradio('textbox-default'), gradio('last_input-default')).then( 
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') - # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[0]; element.scrollTop = element.scrollHeight}") - ) shared.gradio['markdown_render-default'].click(lambda x: x, gradio('output_textbox'), gradio('markdown-default'), queue=False) - gen_events.append(shared.gradio['Continue-default'].click( + shared.gradio['Continue-default'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, [shared.gradio['output_textbox']] + gradio(inputs)[1:], gradio(outputs), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') - # lambda: None, None, None, _js="() => {element = document.getElementsByTagName('textarea')[1]; element.scrollTop = element.scrollHeight}") - ) - shared.gradio['Stop-default'].click(stop_everything_event, None, None, queue=False, cancels=gen_events if shared.args.no_stream else None) + shared.gradio['Stop-default'].click(stop_everything_event, None, None, queue=False) shared.gradio['prompt_menu-default'].change(load_prompt, gradio('prompt_menu-default'), gradio('textbox-default'), show_progress=False) shared.gradio['save_prompt-default'].click( lambda x: x, gradio('textbox-default'), gradio('save_contents')).then( diff --git a/modules/ui_notebook.py b/modules/ui_notebook.py index dba9039a..6949ed78 100644 --- a/modules/ui_notebook.py +++ b/modules/ui_notebook.py @@ -46,35 +46,30 @@ def create_ui(): def create_event_handlers(): - gen_events = [] - - gen_events.append(shared.gradio['Generate-notebook'].click( + shared.gradio['Generate-notebook'].click( lambda x: x, gradio('textbox-notebook'), gradio('last_input-notebook')).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') - ) - gen_events.append(shared.gradio['textbox-notebook'].submit( + shared.gradio['textbox-notebook'].submit( lambda x: x, gradio('textbox-notebook'), gradio('last_input-notebook')).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') - ) shared.gradio['Undo'].click(lambda x: x, gradio('last_input-notebook'), gradio('textbox-notebook'), show_progress=False) shared.gradio['markdown_render-notebook'].click(lambda x: x, gradio('textbox-notebook'), gradio('markdown-notebook'), queue=False) - gen_events.append(shared.gradio['Regenerate-notebook'].click( + shared.gradio['Regenerate-notebook'].click( lambda x: x, gradio('last_input-notebook'), gradio('textbox-notebook'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), 
gradio('interface_state')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}') - ) - shared.gradio['Stop-notebook'].click(stop_everything_event, None, None, queue=False, cancels=gen_events if shared.args.no_stream else None) + shared.gradio['Stop-notebook'].click(stop_everything_event, None, None, queue=False) shared.gradio['prompt_menu-notebook'].change(load_prompt, gradio('prompt_menu-notebook'), gradio('textbox-notebook'), show_progress=False) shared.gradio['save_prompt-notebook'].click( lambda x: x, gradio('textbox-notebook'), gradio('save_contents')).then( From 8294eadd384e8d9543b169abd751406885e3f3c6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 14 Aug 2023 11:13:46 -0700 Subject: [PATCH 090/169] Bump AutoGPTQ wheel --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index ec6a7e47..5e750903 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,8 +25,8 @@ git+https://github.com/huggingface/transformers@baf1daa58eb2960248fd9f7c3af0ed24 bitsandbytes==0.41.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.0/auto_gptq-0.4.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.0/auto_gptq-0.4.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 991bb57e439ccfbcd5a0f154957c98d2e3d66c35 Mon Sep 17 00:00:00 2001 From: cal066 <60696996+cal066@users.noreply.github.com> Date: Mon, 14 Aug 2023 18:17:24 +0000 Subject: [PATCH 091/169] ctransformers: Fix up model_type name consistency (#3567) --- README.md | 2 +- models/config.yaml | 6 +++--- modules/loaders.py | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0cbe24cc..3a7e7eab 100644 --- a/README.md +++ b/README.md @@ -259,7 +259,7 @@ Optionally, you can use the following command-line flags: | Flag | Description | |-------------|-------------| -| `--model_type MODEL_TYPE` | Model type of pre-quantized model. Currently gpt2, gptj, gpt_neox, falcon, llama, mpt, gpt_bigcode, dolly-v2, and replit are supported. | +| `--model_type MODEL_TYPE` | Model type of pre-quantized model. Currently gpt2, gptj, gptneox, falcon, llama, mpt, starcoder (gptbigcode), dollyv2, and replit are supported. 
| #### AutoGPTQ diff --git a/models/config.yaml b/models/config.yaml index ba12e8bc..624840df 100644 --- a/models/config.yaml +++ b/models/config.yaml @@ -5,7 +5,7 @@ .*(gpt-j|gptj|gpt4all-j|malion-6b|pygway|pygmalion-6b|dolly-v1): model_type: 'gptj' .*(gpt-neox|koalpaca-polyglot|polyglot.*koalpaca|polyglot-ko|polyglot_ko|pythia|stablelm|incite|dolly-v2|polycoder|h2ogpt-oig|h2ogpt-oasst1|h2ogpt-gm): - model_type: 'gpt_neox' + model_type: 'gptneox' .*llama: model_type: 'llama' .*bloom: @@ -17,9 +17,9 @@ .*mpt: model_type: 'mpt' .*(starcoder|starchat): - model_type: 'gpt_bigcode' + model_type: 'starcoder' .*dolly-v2: - model_type: 'dolly-v2' + model_type: 'dollyv2' .*replit: model_type: 'replit' llama-65b-gptq-3bit: diff --git a/modules/loaders.py b/modules/loaders.py index d7bd8d48..08a11ac0 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -290,6 +290,7 @@ loaders_model_types = { "dollyv2" "replit", "starcoder", + "gptbigcode", "falcon" ], } From 155862a4a0938a47e9792cd6a7e8dcebe1a969f1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 15 Aug 2023 11:40:37 -0700 Subject: [PATCH 092/169] Update README --- README.md | 79 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 45 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 3a7e7eab..f1af6519 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Text generation web UI -A gradio web UI for running Large Language Models like LLaMA, llama.cpp, GPT-J, OPT, and GALACTICA. +A Gradio web UI for Large Language Models. Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) of text generation. @@ -10,20 +10,18 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. 
## Features -* 3 interface modes: default, notebook, and chat +* 3 interface modes: default (two columns), notebook, and chat * Multiple model backends: [transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), [ExLlama](https://github.com/turboderp/exllama), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [ctransformers](https://github.com/marella/ctransformers) * Dropdown menu for quickly switching between different models -* LoRA: load and unload LoRAs on the fly, train a new LoRA -* Precise instruction templates for chat mode, including Llama 2, Alpaca, Vicuna, WizardLM, StableLM, and many others +* LoRA: load and unload LoRAs on the fly, train a new LoRA using QLoRA +* Precise instruction templates for chat mode, including Llama-2-chat, Alpaca, Vicuna, WizardLM, StableLM, and many others +* 4-bit, 8-bit, and CPU inference through the transformers library +* Use llama.cpp models with transformers samplers (`llamacpp_HF` loader) * [Multimodal pipelines, including LLaVA and MiniGPT-4](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal) -* 8-bit and 4-bit inference through bitsandbytes -* CPU mode for transformers models -* [DeepSpeed ZeRO-3 inference](docs/DeepSpeed.md) -* [Extensions](docs/Extensions.md) +* [Extensions framework](docs/Extensions.md) * [Custom chat characters](docs/Chat-mode.md) * Very efficient text streaming * Markdown output with LaTeX rendering, to use for instance with [GALACTICA](https://github.com/paperswithcode/galai) -* Nice HTML output for GPT-4chan * API, including endpoints for websocket streaming ([see the examples](https://github.com/oobabooga/text-generation-webui/blob/main/api-examples)) To learn how to use the various features, check out the Documentation: https://github.com/oobabooga/text-generation-webui/tree/main/docs @@ -38,26 +36,24 @@ To learn how to use the various features, check out the Documentation: https://g Just download the zip above, extract it, and double-click on "start". The web UI and all its dependencies will be installed in the same folder. -* The source codes are here: https://github.com/oobabooga/one-click-installers +* The source codes and more information can be found here: https://github.com/oobabooga/one-click-installers * There is no need to run the installers as admin. -* AMD doesn't work on Windows. * Huge thanks to [@jllllll](https://github.com/jllllll), [@ClayShoaf](https://github.com/ClayShoaf), and [@xNul](https://github.com/xNul) for their contributions to these installers. ### Manual installation using Conda -Recommended if you have some experience with the command line. +Recommended if you have some experience with the command-line. #### 0. Install Conda https://docs.conda.io/en/latest/miniconda.html -On Linux or WSL, it can be automatically installed with these two commands: +On Linux or WSL, it can be automatically installed with these two commands ([source](https://educe-ubc.github.io/conda.html)): ``` curl -sL "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" > "Miniconda3.sh" bash Miniconda3.sh ``` -Source: https://educe-ubc.github.io/conda.html #### 1. Create a new conda environment @@ -92,9 +88,9 @@ cd text-generation-webui pip install -r requirements.txt ``` -#### bitsandbytes +#### Note about older NVIDIA GPUs -bitsandbytes >= 0.39 may not work on older NVIDIA GPUs. 
In that case, to use `--load-in-8bit`, you may have to downgrade like this: +bitsandbytes >= 0.39 may not work. In that case, to use `--load-in-8bit`, you may have to downgrade like this: * Linux: `pip install bitsandbytes==0.38.1` * Windows: `pip install https://github.com/jllllll/bitsandbytes-windows-webui/raw/main/bitsandbytes-0.38.1-py3-none-any.whl` @@ -113,37 +109,52 @@ docker compose up --build ### Updating the requirements -From time to time, the `requirements.txt` changes. To update, use this command: +From time to time, the `requirements.txt` changes. To update, use these commands: ``` conda activate textgen cd text-generation-webui pip install -r requirements.txt --upgrade ``` + ## Downloading models -Models should be placed inside the `models/` folder. +Models should be placed in the `text-generation-webui/models` folder. They are usually downloaded from [Hugging Face](https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads). -[Hugging Face](https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads) is the main place to download models. These are some examples: +* Transformers or GPTQ models are made of several files and must be placed in a subfolder. Example: -* [Pythia](https://huggingface.co/models?sort=downloads&search=eleutherai%2Fpythia+deduped) -* [OPT](https://huggingface.co/models?search=facebook/opt) -* [GALACTICA](https://huggingface.co/models?search=facebook/galactica) -* [GPT-J 6B](https://huggingface.co/EleutherAI/gpt-j-6B/tree/main) +``` +text-generation-webui/ +├── models +│   ├── lmsys_vicuna-33b-v1.3 +│   │   ├── config.json +│   │   ├── generation_config.json +│   │   ├── huggingface-metadata.txt +│   │   ├── pytorch_model-00001-of-00007.bin +│   │   ├── pytorch_model-00002-of-00007.bin +│   │   ├── pytorch_model-00003-of-00007.bin +│   │   ├── pytorch_model-00004-of-00007.bin +│   │   ├── pytorch_model-00005-of-00007.bin +│   │   ├── pytorch_model-00006-of-00007.bin +│   │   ├── pytorch_model-00007-of-00007.bin +│   │   ├── pytorch_model.bin.index.json +│   │   ├── README.md +│   │   ├── special_tokens_map.json +│   │   ├── tokenizer_config.json +│   │   └── tokenizer.model +``` -You can automatically download a model from HF using the script `download-model.py`: +In the "Model" tab of the UI, those models can be downloaded from Hugging Face. You can also download them from the command-line with `python download-model.py organization/model`. - python download-model.py organization/model +* GGML models are a single file and can be placed directly into `models`. Example: -For example: +``` +text-generation-webui/ +├── models +│   ├── llama-13b.ggmlv3.q4_K_M.bin +``` - python download-model.py facebook/opt-1.3b - -To download a protected model, set env vars `HF_USER` and `HF_PASS` to your Hugging Face username and password (or [User Access Token](https://huggingface.co/settings/tokens)). The model's terms must first be accepted on the HF website. - -#### GGML models - -You can drop these directly into the `models/` folder, making sure that the file name contains `ggml` somewhere and ends in `.bin`. +Those models have to be downloaded manually and placed into that folder. 
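A minimal sketch of the downloader workflow described above, assuming the `organization_model` folder naming shown in the directory tree (illustrative only, not part of the patch itself):

```
# Hypothetical example: download a Transformers model from Hugging Face
# into text-generation-webui/models/ using the script referenced above.
python download-model.py facebook/opt-1.3b

# Assumed result: a models/facebook_opt-1.3b/ subfolder containing config.json,
# the tokenizer files, and the pytorch_model-*.bin shards.
ls models/facebook_opt-1.3b
```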
#### GPT-4chan @@ -354,5 +365,5 @@ If you would like to contribute to the project, check out the [Contributing guid ## Community -* Subreddit: https://www.reddit.com/r/oobaboogazz/ +* Subreddit: https://www.reddit.com/r/oobabooga/ * Discord: https://discord.gg/jwZCF2dPQN From 7089b2a48f98266556b85bb756ffcefab233e97e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 15 Aug 2023 12:16:21 -0700 Subject: [PATCH 093/169] Update README --- README.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index f1af6519..8b6bd281 100644 --- a/README.md +++ b/README.md @@ -124,12 +124,11 @@ Models should be placed in the `text-generation-webui/models` folder. They are u * Transformers or GPTQ models are made of several files and must be placed in a subfolder. Example: ``` -text-generation-webui/ +text-generation-webui ├── models │   ├── lmsys_vicuna-33b-v1.3 │   │   ├── config.json │   │   ├── generation_config.json -│   │   ├── huggingface-metadata.txt │   │   ├── pytorch_model-00001-of-00007.bin │   │   ├── pytorch_model-00002-of-00007.bin │   │   ├── pytorch_model-00003-of-00007.bin @@ -138,15 +137,14 @@ text-generation-webui/ │   │   ├── pytorch_model-00006-of-00007.bin │   │   ├── pytorch_model-00007-of-00007.bin │   │   ├── pytorch_model.bin.index.json -│   │   ├── README.md │   │   ├── special_tokens_map.json │   │   ├── tokenizer_config.json │   │   └── tokenizer.model ``` -In the "Model" tab of the UI, those models can be downloaded from Hugging Face. You can also download them from the command-line with `python download-model.py organization/model`. +In the "Model" tab of the UI, those models can be directly downloaded from Hugging Face. You can also download them from the command-line with `python download-model.py organization/model`. -* GGML models are a single file and can be placed directly into `models`. Example: +* GGML models are a single file and should be placed directly into `models`. Example: ``` text-generation-webui/ @@ -154,7 +152,7 @@ text-generation-webui/ │   ├── llama-13b.ggmlv3.q4_K_M.bin ``` -Those models have to be downloaded manually and placed into that folder. +These models have to be downloaded manually and are not supported by the automated downloaders yet. #### GPT-4chan From a03a70bed63758ee77fbbd10e4ccefaa247b8c88 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 15 Aug 2023 12:20:59 -0700 Subject: [PATCH 094/169] Update README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8b6bd281..fcb30e95 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,7 @@ text-generation-webui │   │   └── tokenizer.model ``` -In the "Model" tab of the UI, those models can be directly downloaded from Hugging Face. You can also download them from the command-line with `python download-model.py organization/model`. +In the "Model" tab of the UI, those models can be automatically downloaded from Hugging Face. You can also download them via the command-line with `python download-model.py organization/model`. * GGML models are a single file and should be placed directly into `models`. Example: @@ -152,7 +152,7 @@ text-generation-webui/ │   ├── llama-13b.ggmlv3.q4_K_M.bin ``` -These models have to be downloaded manually and are not supported by the automated downloaders yet. +Those models must be downloaded manually, as they are not currently supported by the automated downloader. 
#### GPT-4chan From 87dd85b7190cc26730bae3a86de5f5faaed6be7d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 15 Aug 2023 12:21:50 -0700 Subject: [PATCH 095/169] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fcb30e95..1b862a7d 100644 --- a/README.md +++ b/README.md @@ -147,7 +147,7 @@ In the "Model" tab of the UI, those models can be automatically downloaded from * GGML models are a single file and should be placed directly into `models`. Example: ``` -text-generation-webui/ +text-generation-webui ├── models │   ├── llama-13b.ggmlv3.q4_K_M.bin ``` From 32ff3da941d3e9d7c603cff88db645cf76647a32 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 15 Aug 2023 17:16:24 -0300 Subject: [PATCH 096/169] Update ancient screenshots --- README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 1b862a7d..18f6d73f 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,9 @@ A Gradio web UI for Large Language Models. Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) of text generation. -|![Image1](https://github.com/oobabooga/screenshots/raw/main/qa.png) | ![Image2](https://github.com/oobabooga/screenshots/raw/main/cai3.png) | +|![Image1](https://github.com/oobabooga/screenshots/raw/main/print_instruct.png) | ![Image2](https://github.com/oobabooga/screenshots/raw/main/print_chat.png) | |:---:|:---:| -|![Image3](https://github.com/oobabooga/screenshots/raw/main/gpt4chan.png) | ![Image4](https://github.com/oobabooga/screenshots/raw/main/galactica.png) | +|![Image1](https://github.com/oobabooga/screenshots/raw/main/print_default.png) | ![Image2](https://github.com/oobabooga/screenshots/raw/main/print_parameters.png) | ## Features @@ -178,7 +178,10 @@ After downloading the model, follow these steps: python download-model.py EleutherAI/gpt-j-6B --text-only ``` -When you load this model in default or notebook modes, the "HTML" tab will show the generated text in 4chan format. 
+When you load this model in default or notebook modes, the "HTML" tab will show the generated text in 4chan format: + +![Image3](https://github.com/oobabooga/screenshots/raw/main/gpt4chan.png) + ## Starting the web UI From 2a292082240527ba5c065e941811d82082b48b11 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 16 Aug 2023 02:39:58 -0300 Subject: [PATCH 097/169] Add a "Show controls" button to chat UI (#3590) --- css/main.css | 37 +++++++++++++++++++++++++----------- extensions/gallery/script.js | 17 ++++++++++++++--- js/show_controls.js | 18 ++++++++++++++++++ modules/ui.py | 2 ++ modules/ui_chat.py | 8 ++++++-- 5 files changed, 66 insertions(+), 16 deletions(-) create mode 100644 js/show_controls.js diff --git a/css/main.css b/css/main.css index 5432a9db..e82a8c61 100644 --- a/css/main.css +++ b/css/main.css @@ -101,15 +101,15 @@ div.svelte-15lo0d8 > *, div.svelte-15lo0d8 > .form > * { } .textbox_default textarea { - height: calc(100vh - 310px); + height: calc(100dvh - 310px); } .textbox_default_output textarea { - height: calc(100vh - 190px); + height: calc(100dvh - 190px); } .textbox textarea { - height: calc(100vh - 241px); + height: calc(100dvh - 241px); } .textbox_default textarea, .textbox_default_output textarea, .textbox textarea { @@ -123,7 +123,7 @@ div.svelte-15lo0d8 > *, div.svelte-15lo0d8 > .form > * { @media screen and (max-width: 711px) { .textbox_default textarea { - height: calc(100vh - 275px); + height: calc(100dvh - 295px); } } @@ -218,10 +218,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { opacity: 1; } -#gradio-chatbot { - height: 66.67vh; -} - .wrap.svelte-6roggh.svelte-6roggh { max-height: 92.5%; } @@ -246,8 +242,12 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { padding: 0px; } - .chat { - height: calc(100vh - 274px) !important; + #chat { + height: calc(100dvh - 272px) !important; + } + + .bigchat #chat { + height: calc(100dvh - 180px) !important; } } @@ -255,7 +255,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { margin-left: auto; margin-right: auto; max-width: 800px; - height: calc(100vh - 286px); + height: 100%; overflow-y: auto; padding-right: 20px; display: flex; @@ -265,6 +265,21 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { padding-top: 1px; } +#chat { + height: calc(100dvh - 286px); +} + +.bigchat #chat { + height: calc(100dvh - 200px); +} + + +#show-controls { + position: absolute; + background-color: transparent; + left: calc(100% - 140px); +} + .chat > .messages { display: flex; flex-direction: column; diff --git a/extensions/gallery/script.js b/extensions/gallery/script.js index 23acfbf6..4203f0dd 100644 --- a/extensions/gallery/script.js +++ b/extensions/gallery/script.js @@ -1,23 +1,34 @@ let gallery_element = document.getElementById('gallery-extension'); +let chat_mode_element = document.getElementById('chat-mode'); + let extensions_block = gallery_element.parentElement; let extensions_block_size = extensions_block.childNodes.length; let gallery_only = (extensions_block_size == 5); main_parent.addEventListener('click', function(e) { let chat_visible = (chat_tab.offsetHeight > 0 && chat_tab.offsetWidth > 0); + let chat_mode_visible = (chat_mode_element.offsetHeight > 0 && chat_mode_element.offsetWidth > 0); let notebook_visible = (notebook_tab.offsetHeight > 0 && notebook_tab.offsetWidth > 0); let default_visible = (default_tab.offsetHeight > 0 && default_tab.offsetWidth > 0); // Only show this extension in the Chat tab if (chat_visible) { - gallery_element.style.display = 
'block'; - if (gallery_only) { - extensions_block.style.display = ''; + if (chat_mode_visible) { + gallery_element.style.display = 'block'; + if (gallery_only) { + extensions_block.style.display = ''; + } + } else { + gallery_element.style.display = 'none'; + extensions_block.style.display = 'none'; } } else { gallery_element.style.display = 'none'; if (gallery_only) { extensions_block.style.display = 'none'; } + else { + extensions_block.style.display = ''; + } } }); diff --git a/js/show_controls.js b/js/show_controls.js new file mode 100644 index 00000000..83bb6c02 --- /dev/null +++ b/js/show_controls.js @@ -0,0 +1,18 @@ +const belowChatInput = document.querySelectorAll("#chat-tab > div > :nth-child(n+3), #extensions"); +const chatParent = document.getElementById("chat").parentNode; + +function toggle_controls(value) { + if (value) { + belowChatInput.forEach(element => { + element.style.display = "inherit"; + }); + + chatParent.classList.remove("bigchat"); + } else { + belowChatInput.forEach(element => { + element.style.display = "none"; + }); + + chatParent.classList.add("bigchat"); + } +} diff --git a/modules/ui.py b/modules/ui.py index 682d74f9..94bfe4a6 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -16,6 +16,8 @@ with open(Path(__file__).resolve().parent / '../js/save_files.js', 'r') as f: save_files_js = f.read() with open(Path(__file__).resolve().parent / '../js/switch_tabs.js', 'r') as f: switch_tabs_js = f.read() +with open(Path(__file__).resolve().parent / '../js/show_controls.js', 'r') as f: + show_controls_js = f.read() refresh_symbol = '🔄' delete_symbol = '🗑️' diff --git a/modules/ui_chat.py b/modules/ui_chat.py index a3a4ccf0..aec7a2ad 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -22,7 +22,9 @@ def create_ui(): with gr.Tab('Chat', elem_id='chat-tab'): shared.gradio['display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, shared.settings['name1'], shared.settings['name2'], 'chat', 'cai-chat')) - shared.gradio['textbox'] = gr.Textbox(label='Input') + + shared.gradio['textbox'] = gr.Textbox(label='Input', elem_id='chat-input') + shared.gradio['show-controls'] = gr.Checkbox(value=True, label='Show controls', elem_id='show-controls') with gr.Row(): shared.gradio['Stop'] = gr.Button('Stop', elem_id='stop') shared.gradio['Generate'] = gr.Button('Generate', elem_id='Generate', variant='primary') @@ -48,7 +50,7 @@ def create_ui(): shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with']) with gr.Row(): - shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'instruct', 'chat-instruct'] else 'chat', label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template selected under Parameters > Instruction template must match the current model.') + shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'instruct', 'chat-instruct'] else 'chat', label='Mode', info='Defines how the chat prompt is generated. 
In instruct and chat-instruct modes, the instruction template selected under Parameters > Instruction template must match the current model.', elem_id='chat-mode') shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct') @@ -125,6 +127,8 @@ def create_event_handlers(): shared.input_params = gradio(inputs) shared.reload_inputs = gradio(reload_arr) + shared.gradio['show-controls'].change(None, gradio('show-controls'), None, _js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}') + shared.gradio['Generate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( From 73d9befb65b26a016bc142794abbaf823b7e59fa Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 16 Aug 2023 07:03:53 -0700 Subject: [PATCH 098/169] Make "Show controls" customizable through settings.yaml --- css/main.css | 1 - modules/shared.py | 1 + modules/ui.py | 3 ++- modules/ui_chat.py | 8 ++++---- modules/ui_session.py | 2 +- server.py | 1 + settings-template.yaml | 1 + 7 files changed, 10 insertions(+), 7 deletions(-) diff --git a/css/main.css b/css/main.css index e82a8c61..0305efb4 100644 --- a/css/main.css +++ b/css/main.css @@ -273,7 +273,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { height: calc(100dvh - 200px); } - #show-controls { position: absolute; background-color: transparent; diff --git a/modules/shared.py b/modules/shared.py index fa1a0a3b..88aa8cf2 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -29,6 +29,7 @@ session_is_loading = False # UI defaults settings = { 'dark_theme': True, + 'show_controls': True, 'start_with': '', 'mode': 'chat', 'chat_style': 'TheEncrypted777', diff --git a/modules/ui.py b/modules/ui.py index 94bfe4a6..a99af375 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -178,7 +178,7 @@ def apply_interface_values(state, use_persistent=False): return [state[k] if k in state else gr.update() for k in elements] -def save_settings(state, preset, instruction_template, extensions): +def save_settings(state, preset, instruction_template, extensions, show_controls): output = copy.deepcopy(shared.settings) exclude = ['name1', 'name2', 'greeting', 'context', 'turn_template'] for k in state: @@ -192,6 +192,7 @@ def save_settings(state, preset, instruction_template, extensions): output['instruction_template'] = instruction_template output['default_extensions'] = extensions output['seed'] = int(output['seed']) + output['show_controls'] = show_controls return yaml.dump(output, sort_keys=False, width=float("inf")) diff --git a/modules/ui_chat.py b/modules/ui_chat.py index aec7a2ad..b356f8c6 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -22,9 +22,9 @@ def create_ui(): with gr.Tab('Chat', elem_id='chat-tab'): shared.gradio['display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, shared.settings['name1'], shared.settings['name2'], 'chat', 'cai-chat')) - shared.gradio['textbox'] = gr.Textbox(label='Input', elem_id='chat-input') - shared.gradio['show-controls'] = gr.Checkbox(value=True, label='Show controls', elem_id='show-controls') + shared.gradio['show_controls'] = gr.Checkbox(value=shared.settings['show_controls'], label='Show controls', elem_id='show-controls') + with gr.Row(): shared.gradio['Stop'] = gr.Button('Stop', elem_id='stop') 
shared.gradio['Generate'] = gr.Button('Generate', elem_id='Generate', variant='primary') @@ -127,8 +127,6 @@ def create_event_handlers(): shared.input_params = gradio(inputs) shared.reload_inputs = gradio(reload_arr) - shared.gradio['show-controls'].change(None, gradio('show-controls'), None, _js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}') - shared.gradio['Generate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( @@ -276,3 +274,5 @@ def create_event_handlers(): shared.gradio['send_instruction_to_negative_prompt'].click( prompts.load_instruction_prompt_simple, gradio('instruction_template'), gradio('negative_prompt')).then( lambda: None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_generation_parameters()}}') + + shared.gradio['show_controls'].change(None, gradio('show_controls'), None, _js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}') diff --git a/modules/ui_session.py b/modules/ui_session.py index b774a207..537a31f2 100644 --- a/modules/ui_session.py +++ b/modules/ui_session.py @@ -41,7 +41,7 @@ def create_ui(): shared.gradio['toggle_dark_mode'].click(lambda: None, None, None, _js='() => {document.getElementsByTagName("body")[0].classList.toggle("dark")}') shared.gradio['save_settings'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - ui.save_settings, gradio('interface_state', 'preset_menu', 'instruction_template', 'extensions_menu'), gradio('save_contents')).then( + ui.save_settings, gradio('interface_state', 'preset_menu', 'instruction_template', 'extensions_menu', 'show_controls'), gradio('save_contents')).then( lambda: './', None, gradio('save_root')).then( lambda: 'settings.yaml', None, gradio('save_filename')).then( lambda: gr.update(visible=True), None, gradio('file_saver')) diff --git a/server.py b/server.py index d90453a0..ef213a87 100644 --- a/server.py +++ b/server.py @@ -128,6 +128,7 @@ def create_interface(): shared.gradio['interface'].load(lambda: None, None, None, _js="() => document.getElementsByTagName('body')[0].classList.add('dark')") shared.gradio['interface'].load(lambda: None, None, None, _js=f"() => {{{js}}}") + shared.gradio['interface'].load(None, gradio('show_controls'), None, _js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}') shared.gradio['interface'].load(partial(ui.apply_interface_values, {}, use_persistent=True), None, gradio(ui.list_interface_input_elements()), show_progress=False) shared.gradio['interface'].load(chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display')) diff --git a/settings-template.yaml b/settings-template.yaml index 11cd1185..b2526df1 100644 --- a/settings-template.yaml +++ b/settings-template.yaml @@ -1,4 +1,5 @@ dark_theme: true +show_controls: true start_with: '' mode: chat chat_style: TheEncrypted777 From 7966989667b4ad88cae3e760fdc759a634ce5478 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 16 Aug 2023 07:25:59 -0700 Subject: [PATCH 099/169] Minor CSS fix --- css/main.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index 0305efb4..d301e10e 100644 --- a/css/main.css +++ b/css/main.css @@ -266,7 +266,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat { - height: calc(100dvh - 286px); + height: calc(100dvh - 284px); } .bigchat #chat { From 
a4e903e932c6b3b43b2ccb88f9e75049b2ac4b2e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 16 Aug 2023 09:23:29 -0700 Subject: [PATCH 100/169] Escape HTML in chat messages --- modules/chat.py | 25 +++++++++++++------------ modules/html_generator.py | 2 ++ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index d83e9490..d81d254f 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -1,6 +1,7 @@ import base64 import copy import functools +import html import json import re from pathlib import Path @@ -188,15 +189,16 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess yield output return - # Defining some variables just_started = True visible_text = None stopping_strings = get_stopping_strings(state) is_stream = state['stream'] - # Preparing the input + # Prepare the input if not any((regenerate, _continue)): - visible_text = text + visible_text = html.escape(text) + + # Apply extensions text, visible_text = apply_extensions('chat_input', text, visible_text, state) text = apply_extensions('input', text, state, is_chat=True) @@ -208,6 +210,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if regenerate: output['visible'].pop() output['internal'].pop() + # *Is typing...* if loading_message: yield {'visible': output['visible'] + [[visible_text, shared.processing_message]], 'internal': output['internal']} @@ -216,12 +219,11 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if loading_message: yield {'visible': output['visible'][:-1] + [[visible_text, last_reply[1] + '...']], 'internal': output['internal']} - # Generating the prompt + # Generate the prompt kwargs = { '_continue': _continue, 'history': output, } - prompt = apply_extensions('custom_generate_chat_prompt', text, state, **kwargs) if prompt is None: prompt = generate_chat_prompt(text, state, **kwargs) @@ -232,9 +234,8 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess # Extract the reply visible_reply = re.sub("(||{{user}})", state['name1'], reply) + visible_reply = html.escape(visible_reply) - # We need this global variable to handle the Stop event, - # otherwise gradio gets confused if shared.stop_everything: output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) yield output @@ -307,8 +308,8 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False): def remove_last_message(history): if len(history['visible']) > 0 and history['internal'][-1][0] != '<|BEGIN-VISIBLE-CHAT|>': - last = history['visible'].pop() - history['internal'].pop() + last = history['internal'].pop() + history['visible'].pop() else: last = ['', ''] @@ -328,7 +329,7 @@ def replace_last_reply(text, state): if len(text.strip()) == 0: return history elif len(history['visible']) > 0: - history['visible'][-1][1] = text + history['visible'][-1][1] = html.escape(text) history['internal'][-1][1] = apply_extensions('input', text, state, is_chat=True) return history @@ -336,7 +337,7 @@ def replace_last_reply(text, state): def send_dummy_message(text, state): history = state['history'] - history['visible'].append([text, '']) + history['visible'].append([html.escape(text), '']) history['internal'].append([apply_extensions('input', text, state, is_chat=True), '']) return history @@ -347,7 +348,7 @@ def send_dummy_reply(text, state): history['visible'].append(['', '']) 
history['internal'].append(['', '']) - history['visible'][-1][1] = text + history['visible'][-1][1] = html.escape(text) history['internal'][-1][1] = apply_extensions('input', text, state, is_chat=True) return history diff --git a/modules/html_generator.py b/modules/html_generator.py index eb1da374..3d9f758b 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -1,3 +1,4 @@ +import html import os import re import time @@ -85,6 +86,7 @@ def convert_to_markdown(string): def generate_basic_html(string): + string = html.escape(string) string = convert_to_markdown(string) string = f'
    {string}
    ' return string From 300219b0816a86eafedda0ae285e30390a28b629 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 16 Aug 2023 09:35:10 -0700 Subject: [PATCH 101/169] Fix