From ab50f80542788dd7fa21b20ac91fddc8c9766c23 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 8 Mar 2023 02:46:35 -0300
Subject: [PATCH 01/69] New text streaming method (much faster)
---
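Notes: transformers calls every StoppingCriteria once per generated token, so the new Stream criterion doubles as a per-token callback, and Iteratorize turns that callback into a generator by running generate() in a background thread and pushing each callback value through a queue. Below is a minimal, self-contained sketch of the same trick; fake_generate is a stand-in for shared.model.generate() with a Stream criterion attached and is not part of this patch.

    from queue import Queue
    from threading import Thread

    class CallbackIterator:
        # Wrap a callback-taking function into a lazy iterator (same idea as Iteratorize below).
        def __init__(self, func, kwargs=None):
            self.q = Queue(maxsize=1)
            self.sentinel = object()

            def task():
                func(callback=self.q.put, **(kwargs or {}))
                self.q.put(self.sentinel)

            Thread(target=task, daemon=True).start()

        def __iter__(self):
            return self

        def __next__(self):
            obj = self.q.get()
            if obj is self.sentinel:
                raise StopIteration
            return obj

    def fake_generate(callback=None, n_tokens=3):
        # Stand-in for model.generate(): the Stream criterion fires once per new token.
        for i in range(n_tokens):
            callback(f"token_{i}")

    for token in CallbackIterator(fake_generate, {"n_tokens": 3}):
        print(token)  # token_0, token_1, token_2
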
modules/callbacks.py | 75 ++++++++++++++++++++++++++++++++++++
modules/stopping_criteria.py | 32 ---------------
modules/text_generation.py | 66 +++++++++++++++++++++++--------
server.py | 3 --
4 files changed, 124 insertions(+), 52 deletions(-)
create mode 100644 modules/callbacks.py
delete mode 100644 modules/stopping_criteria.py
diff --git a/modules/callbacks.py b/modules/callbacks.py
new file mode 100644
index 00000000..15674b8a
--- /dev/null
+++ b/modules/callbacks.py
@@ -0,0 +1,75 @@
+from queue import Queue
+from threading import Thread
+
+import torch
+import transformers
+
+import modules.shared as shared
+
+
+# Copied from https://github.com/PygmalionAI/gradio-ui/
+class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria):
+
+ def __init__(self, sentinel_token_ids: torch.LongTensor,
+ starting_idx: int):
+ transformers.StoppingCriteria.__init__(self)
+ self.sentinel_token_ids = sentinel_token_ids
+ self.starting_idx = starting_idx
+
+ def __call__(self, input_ids: torch.LongTensor,
+ _scores: torch.FloatTensor) -> bool:
+ for sample in input_ids:
+ trimmed_sample = sample[self.starting_idx:]
+ # Can't unfold, output is still too tiny. Skip.
+ if trimmed_sample.shape[-1] < self.sentinel_token_ids.shape[-1]:
+ continue
+
+ for window in trimmed_sample.unfold(
+ 0, self.sentinel_token_ids.shape[-1], 1):
+ if torch.all(torch.eq(self.sentinel_token_ids, window)):
+ return True
+ return False
+
+class Stream(transformers.StoppingCriteria):
+ def __init__(self, callback_func=None):
+ self.callback_func = callback_func
+
+ def __call__(self, input_ids, scores) -> bool:
+ if self.callback_func is not None:
+ self.callback_func(input_ids[0])
+ return False
+
+class Iteratorize:
+
+ """
+ Transforms a function that takes a callback
+ into a lazy iterator (generator).
+ """
+
+ def __init__(self, func, kwargs={}, callback=None):
+ self.mfunc=func
+ self.c_callback=callback
+ self.q = Queue(maxsize=1)
+ self.sentinel = object()
+ self.kwargs = kwargs
+
+ def _callback(val):
+ self.q.put(val)
+
+ def gentask():
+ ret = self.mfunc(callback=_callback, **self.kwargs)
+ self.q.put(self.sentinel)
+ if self.c_callback:
+ self.c_callback(ret)
+
+ Thread(target=gentask).start()
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ obj = self.q.get(True,None)
+ if obj is self.sentinel:
+ raise StopIteration
+ else:
+ return obj
diff --git a/modules/stopping_criteria.py b/modules/stopping_criteria.py
deleted file mode 100644
index 44a631b3..00000000
--- a/modules/stopping_criteria.py
+++ /dev/null
@@ -1,32 +0,0 @@
-'''
-This code was copied from
-
-https://github.com/PygmalionAI/gradio-ui/
-
-'''
-
-import torch
-import transformers
-
-
-class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria):
-
- def __init__(self, sentinel_token_ids: torch.LongTensor,
- starting_idx: int):
- transformers.StoppingCriteria.__init__(self)
- self.sentinel_token_ids = sentinel_token_ids
- self.starting_idx = starting_idx
-
- def __call__(self, input_ids: torch.LongTensor,
- _scores: torch.FloatTensor) -> bool:
- for sample in input_ids:
- trimmed_sample = sample[self.starting_idx:]
- # Can't unfold, output is still too tiny. Skip.
- if trimmed_sample.shape[-1] < self.sentinel_token_ids.shape[-1]:
- continue
-
- for window in trimmed_sample.unfold(
- 0, self.sentinel_token_ids.shape[-1], 1):
- if torch.all(torch.eq(self.sentinel_token_ids, window)):
- return True
- return False
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 4af53273..436afbeb 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -5,13 +5,13 @@ import time
import numpy as np
import torch
import transformers
-from tqdm import tqdm
import modules.shared as shared
+from modules.callbacks import (Iteratorize, Stream,
+ _SentinelTokenStoppingCriteria)
from modules.extensions import apply_extensions
from modules.html_generator import generate_4chan_html, generate_basic_html
from modules.models import local_rank
-from modules.stopping_criteria import _SentinelTokenStoppingCriteria
def get_max_prompt_length(tokens):
@@ -103,7 +103,9 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
yield formatted_outputs(reply, shared.model_name)
t1 = time.time()
- print(f"Output generated in {(t1-t0):.2f} seconds.")
+ output = encode(reply)[0]
+ input_ids = encode(question)
+ print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)")
return
original_question = question
@@ -113,6 +115,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
print(f"\n\n{question}\n--------------------\n")
input_ids = encode(question, max_new_tokens)
+ original_input_ids = input_ids
cuda = "" if (shared.args.cpu or shared.args.deepspeed or shared.args.flexgen) else ".cuda()"
n = shared.tokenizer.eos_token_id if eos_token is None else int(encode(eos_token)[0][-1])
if stopping_string is not None:
@@ -126,10 +129,11 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
)
])
else:
- stopping_criteria_list = None
+ stopping_criteria_list = []
if not shared.args.flexgen:
generate_params = [
+ f"max_new_tokens=max_new_tokens",
f"eos_token_id={n}",
f"stopping_criteria=stopping_criteria_list",
f"do_sample={do_sample}",
@@ -147,24 +151,21 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
]
else:
generate_params = [
+ f"max_new_tokens={max_new_tokens if shared.args.no_stream else 8}",
f"do_sample={do_sample}",
f"temperature={temperature}",
f"stop={n}",
]
if shared.args.deepspeed:
generate_params.append("synced_gpus=True")
- if shared.args.no_stream:
- generate_params.append("max_new_tokens=max_new_tokens")
- else:
- generate_params.append("max_new_tokens=8")
if shared.soft_prompt:
inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids)
generate_params.insert(0, "inputs_embeds=inputs_embeds")
- generate_params.insert(0, "filler_input_ids")
+ generate_params.insert(0, "inputs=filler_input_ids")
else:
- generate_params.insert(0, "input_ids")
+ generate_params.insert(0, "inputs=input_ids")
- # Generate the entire reply at once
+ # Generate the entire reply at once.
if shared.args.no_stream:
with torch.no_grad():
output = eval(f"shared.model.generate({', '.join(generate_params)}){cuda}")[0]
@@ -175,18 +176,45 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
if not (shared.args.chat or shared.args.cai_chat):
reply = original_question + apply_extensions(reply[len(question):], "output")
- t1 = time.time()
- print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0)/8:.2f} it/s, {len(output)-len(input_ids[0])} tokens)")
yield formatted_outputs(reply, shared.model_name)
- # Generate the reply 8 tokens at a time
- else:
+ # Stream the reply 1 token at a time.
+ # This is based on the trick of using 'stopping_criteria' to create an iterator.
+ elif not shared.args.flexgen:
+
+ def generate_with_callback(callback=None, **kwargs):
+ if 'stopping_criteria' not in kwargs:
+ kwargs['stopping_criteria'] = []
+ kwargs['stopping_criteria'].append(Stream(callback_func=callback))
+ shared.model.generate(**kwargs)[0]
+
+ def generate_with_streaming(**kwargs):
+ return Iteratorize(generate_with_callback, kwargs, callback=None)
+
yield formatted_outputs(original_question, shared.model_name)
- for i in tqdm(range(max_new_tokens//8+1)):
+ for output in eval(f"generate_with_streaming({', '.join(generate_params)})"):
+ if shared.soft_prompt:
+ output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
+
+ reply = decode(output)
+ if not (shared.args.chat or shared.args.cai_chat):
+ reply = original_question + apply_extensions(reply[len(question):], "output")
+ yield formatted_outputs(reply, shared.model_name)
+
+ if not shared.args.flexgen:
+ if output[-1] == n:
+ break
+ else:
+ if np.count_nonzero(input_ids[0] == n) < np.count_nonzero(output == n):
+ break
+
+ # Stream the output naively for FlexGen since it doesn't support 'stopping_criteria'
+ else:
+ for i in range(max_new_tokens//8+1):
clear_torch_cache()
with torch.no_grad():
- output = eval(f"shared.model.generate({', '.join(generate_params)}){cuda}")[0]
+ output = eval(f"shared.model.generate({', '.join(generate_params)})")[0]
if shared.soft_prompt:
output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
@@ -206,3 +234,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
if shared.soft_prompt:
inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids)
+
+ t1 = time.time()
+ print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(original_input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(original_input_ids[0])} tokens)")
+ return
diff --git a/server.py b/server.py
index 9f584ba3..42897b0b 100644
--- a/server.py
+++ b/server.py
@@ -18,9 +18,6 @@ from modules.html_generator import generate_chat_html
from modules.models import load_model, load_soft_prompt
from modules.text_generation import generate_reply
-if (shared.args.chat or shared.args.cai_chat) and not shared.args.no_stream:
- print('Warning: chat mode currently becomes somewhat slower with text streaming on.\nConsider starting the web UI with the --no-stream option.\n')
-
# Loading custom settings
settings_file = None
if shared.args.settings is not None and Path(shared.args.settings).exists():
From 0e16c0bacb88ad0f5420fd2aa2c6cfadf38e2579 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 8 Mar 2023 02:50:49 -0300
Subject: [PATCH 02/69] Remove redeclaration of a function
---
modules/RWKV.py | 36 +-----------------------------------
1 file changed, 1 insertion(+), 35 deletions(-)
diff --git a/modules/RWKV.py b/modules/RWKV.py
index b226a195..70deab28 100644
--- a/modules/RWKV.py
+++ b/modules/RWKV.py
@@ -7,6 +7,7 @@ import numpy as np
from tokenizers import Tokenizer
import modules.shared as shared
+from modules.callbacks import Iteratorize
np.set_printoptions(precision=4, suppress=True, linewidth=200)
@@ -73,38 +74,3 @@ class RWKVTokenizer:
def decode(self, ids):
return self.tokenizer.decode(ids)
-
-class Iteratorize:
-
- """
- Transforms a function that takes a callback
- into a lazy iterator (generator).
- """
-
- def __init__(self, func, kwargs={}, callback=None):
- self.mfunc=func
- self.c_callback=callback
- self.q = Queue(maxsize=1)
- self.sentinel = object()
- self.kwargs = kwargs
-
- def _callback(val):
- self.q.put(val)
-
- def gentask():
- ret = self.mfunc(callback=_callback, **self.kwargs)
- self.q.put(self.sentinel)
- if self.c_callback:
- self.c_callback(ret)
-
- Thread(target=gentask).start()
-
- def __iter__(self):
- return self
-
- def __next__(self):
- obj = self.q.get(True,None)
- if obj is self.sentinel:
- raise StopIteration
- else:
- return obj
From 72d539dbff6f946fbbd1d8806361dccbc241f8ec Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 8 Mar 2023 02:54:47 -0300
Subject: [PATCH 03/69] Better separate the FlexGen case
---
modules/text_generation.py | 19 +++++--------------
1 file changed, 5 insertions(+), 14 deletions(-)
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 436afbeb..a8157a76 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -201,12 +201,8 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
reply = original_question + apply_extensions(reply[len(question):], "output")
yield formatted_outputs(reply, shared.model_name)
- if not shared.args.flexgen:
- if output[-1] == n:
- break
- else:
- if np.count_nonzero(input_ids[0] == n) < np.count_nonzero(output == n):
- break
+ if output[-1] == n:
+ break
# Stream the output naively for FlexGen since it doesn't support 'stopping_criteria'
else:
@@ -223,14 +219,9 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
reply = original_question + apply_extensions(reply[len(question):], "output")
yield formatted_outputs(reply, shared.model_name)
- if not shared.args.flexgen:
- if output[-1] == n:
- break
- input_ids = torch.reshape(output, (1, output.shape[0]))
- else:
- if np.count_nonzero(input_ids[0] == n) < np.count_nonzero(output == n):
- break
- input_ids = np.reshape(output, (1, output.shape[0]))
+ if np.count_nonzero(input_ids[0] == n) < np.count_nonzero(output == n):
+ break
+ input_ids = np.reshape(output, (1, output.shape[0]))
if shared.soft_prompt:
inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids)
From ad2970374adeb58aec1d7748b02a8c82cc524c0a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 8 Mar 2023 03:00:06 -0300
Subject: [PATCH 04/69] Readability improvements
---
modules/text_generation.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/modules/text_generation.py b/modules/text_generation.py
index a8157a76..9477fe41 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -195,8 +195,8 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
for output in eval(f"generate_with_streaming({', '.join(generate_params)})"):
if shared.soft_prompt:
output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
-
reply = decode(output)
+
if not (shared.args.chat or shared.args.cai_chat):
reply = original_question + apply_extensions(reply[len(question):], "output")
yield formatted_outputs(reply, shared.model_name)
@@ -213,16 +213,16 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
output = eval(f"shared.model.generate({', '.join(generate_params)})")[0]
if shared.soft_prompt:
output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
-
reply = decode(output)
+
if not (shared.args.chat or shared.args.cai_chat):
reply = original_question + apply_extensions(reply[len(question):], "output")
yield formatted_outputs(reply, shared.model_name)
if np.count_nonzero(input_ids[0] == n) < np.count_nonzero(output == n):
break
- input_ids = np.reshape(output, (1, output.shape[0]))
+ input_ids = np.reshape(output, (1, output.shape[0]))
if shared.soft_prompt:
inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids)
From 33fb6aed74ebfd50f12373fcbe2f7c0d285022d3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 8 Mar 2023 03:08:16 -0300
Subject: [PATCH 05/69] Minor bug fix
---
modules/text_generation.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 9477fe41..35617314 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -115,7 +115,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
print(f"\n\n{question}\n--------------------\n")
input_ids = encode(question, max_new_tokens)
- original_input_ids = input_ids
+ original_input_ids = output = input_ids
cuda = "" if (shared.args.cpu or shared.args.deepspeed or shared.args.flexgen) else ".cuda()"
n = shared.tokenizer.eos_token_id if eos_token is None else int(encode(eos_token)[0][-1])
if stopping_string is not None:
From ad6b699503eeabcad141efb6172ff43dc1976522 Mon Sep 17 00:00:00 2001
From: Xan <70198941+xanthousm@users.noreply.github.com>
Date: Wed, 8 Mar 2023 22:02:17 +1100
Subject: [PATCH 06/69] Better TTS with autoplay
- Adds "still_streaming" to shared module for extensions to know if generation is complete
- Changed TTS extension with new options:
- Show text under the audio widget
- Automatically play the audio once text generation finishes
- manage the generated wav files (only keep files for finished generations, optional max file limit)
- [wip] ability to change voice pitch and speed
- added 'tensorboard' to requirements, since python sent "tensorboard not found" errors after a fresh installation.
---
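Notes: the TTS changes escape the reply so it is valid XML and then wrap it in SSML before calling model.save_wav(ssml_text=...). A rough, self-contained sketch of that preparation; the to_ssml helper and its rate/pitch defaults are illustrative names, not code from this patch, which builds the string inline.

    table = str.maketrans({"<": "&lt;", ">": "&gt;", "&": "&amp;",
                           "'": "&apos;", '"': "&quot;"})

    def xmlesc(txt):
        return txt.translate(table)

    def to_ssml(text, rate="medium", pitch="medium"):
        # rate: x-slow, slow, medium, fast, x-fast; pitch: x-low, low, medium, high, x-high
        prosody = f'<prosody rate="{rate}" pitch="{pitch}">'
        return '<speak>' + prosody + xmlesc(text) + '</prosody></speak>'

    print(to_ssml('Hello, "world" & <friends>'))
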
extensions/silero_tts/requirements.txt | 1 +
extensions/silero_tts/script.py | 60 +++++++++++++++++++++++---
modules/shared.py | 1 +
modules/text_generation.py | 11 ++++-
requirements.txt | 1 +
5 files changed, 67 insertions(+), 7 deletions(-)
diff --git a/extensions/silero_tts/requirements.txt b/extensions/silero_tts/requirements.txt
index f2f0bff5..b4444306 100644
--- a/extensions/silero_tts/requirements.txt
+++ b/extensions/silero_tts/requirements.txt
@@ -4,3 +4,4 @@ pydub
PyYAML
torch
torchaudio
+simpleaudio
diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py
index f697d0e2..03319dbf 100644
--- a/extensions/silero_tts/script.py
+++ b/extensions/silero_tts/script.py
@@ -4,20 +4,36 @@ from pathlib import Path
import gradio as gr
import torch
+import modules.shared as shared
+import simpleaudio as sa
+
torch._C._jit_set_profiling_mode(False)
params = {
'activate': True,
- 'speaker': 'en_56',
+ 'speaker': 'en_5',
'language': 'en',
'model_id': 'v3_en',
'sample_rate': 48000,
'device': 'cpu',
+ 'max_wavs': 20,
+ 'play_audio': True,
+ 'show_text': True,
}
current_params = params.copy()
voices_by_gender = ['en_99', 'en_45', 'en_18', 'en_117', 'en_49', 'en_51', 'en_68', 'en_0', 'en_26', 'en_56', 'en_74', 'en_5', 'en_38', 'en_53', 'en_21', 'en_37', 'en_107', 'en_10', 'en_82', 'en_16', 'en_41', 'en_12', 'en_67', 'en_61', 'en_14', 'en_11', 'en_39', 'en_52', 'en_24', 'en_97', 'en_28', 'en_72', 'en_94', 'en_36', 'en_4', 'en_43', 'en_88', 'en_25', 'en_65', 'en_6', 'en_44', 'en_75', 'en_91', 'en_60', 'en_109', 'en_85', 'en_101', 'en_108', 'en_50', 'en_96', 'en_64', 'en_92', 'en_76', 'en_33', 'en_116', 'en_48', 'en_98', 'en_86', 'en_62', 'en_54', 'en_95', 'en_55', 'en_111', 'en_3', 'en_83', 'en_8', 'en_47', 'en_59', 'en_1', 'en_2', 'en_7', 'en_9', 'en_13', 'en_15', 'en_17', 'en_19', 'en_20', 'en_22', 'en_23', 'en_27', 'en_29', 'en_30', 'en_31', 'en_32', 'en_34', 'en_35', 'en_40', 'en_42', 'en_46', 'en_57', 'en_58', 'en_63', 'en_66', 'en_69', 'en_70', 'en_71', 'en_73', 'en_77', 'en_78', 'en_79', 'en_80', 'en_81', 'en_84', 'en_87', 'en_89', 'en_90', 'en_93', 'en_100', 'en_102', 'en_103', 'en_104', 'en_105', 'en_106', 'en_110', 'en_112', 'en_113', 'en_114', 'en_115']
wav_idx = 0
+table = str.maketrans({
+ "<": "<",
+ ">": ">",
+ "&": "&",
+ "'": "'",
+ '"': """,
+})
+def xmlesc(txt):
+ return txt.translate(table)
+
def load_model():
model, example_text = torch.hub.load(repo_or_dir='snakers4/silero-models', model='silero_tts', language=params['language'], speaker=params['model_id'])
model.to(params['device'])
@@ -58,20 +74,45 @@ def output_modifier(string):
if params['activate'] == False:
return string
+ orig_string = string
string = remove_surrounded_chars(string)
string = string.replace('"', '')
string = string.replace('“', '')
string = string.replace('\n', ' ')
string = string.strip()
+ auto_playable=True
if string == '':
- string = 'empty reply, try regenerating'
+ string = 'empty reply, try regenerating'
+ auto_playable=False
+
+ #x-slow, slow, medium, fast, x-fast
+ #x-low, low, medium, high, x-high
+ #prosody=''
+ prosody=''
+    string ='<speak>'+prosody+xmlesc(string)+'</speak>'
+
output_file = Path(f'extensions/silero_tts/outputs/{wav_idx:06d}.wav')
- audio = model.save_wav(text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
-
+ audio = model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
     string = f'<audio src="file/{output_file.as_posix()}" controls></audio>'
- wav_idx += 1
+
+ #reset if too many wavs. set max to -1 for unlimited.
+ if wav_idx < params['max_wavs'] and params['max_wavs'] > 0:
+ #only increment if starting a new stream, else replace during streaming. Does not update duration on webui sometimes?
+ if not shared.still_streaming:
+ wav_idx += 1
+ else:
+ wav_idx = 0
+
+ if params['show_text']:
+ string+='\n\n'+orig_string
+
+ #if params['play_audio'] == True and auto_playable and shared.stop_everything:
+ if params['play_audio'] == True and auto_playable and not shared.still_streaming:
+ stop_autoplay()
+ wave_obj = sa.WaveObject.from_wave_file(output_file.as_posix())
+ wave_obj.play()
return string
@@ -84,11 +125,20 @@ def bot_prefix_modifier(string):
return string
+def stop_autoplay():
+ sa.stop_all()
+
def ui():
# Gradio elements
activate = gr.Checkbox(value=params['activate'], label='Activate TTS')
+ show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player')
+ play_audio = gr.Checkbox(value=params['play_audio'], label='Play TTS automatically')
+ stop_audio = gr.Button("Stop Auto-Play")
voice = gr.Dropdown(value=params['speaker'], choices=voices_by_gender, label='TTS voice')
# Event functions to update the parameters in the backend
activate.change(lambda x: params.update({"activate": x}), activate, None)
+ play_audio.change(lambda x: params.update({"play_audio": x}), play_audio, None)
+ show_text.change(lambda x: params.update({"show_text": x}), show_text, None)
+ stop_audio.click(stop_autoplay)
voice.change(lambda x: params.update({"speaker": x}), voice, None)
diff --git a/modules/shared.py b/modules/shared.py
index e9dfdaa2..90adb320 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -12,6 +12,7 @@ is_LLaMA = False
history = {'internal': [], 'visible': []}
character = 'None'
stop_everything = False
+still_streaming = False
# UI elements (buttons, sliders, HTML, etc)
gradio = {}
diff --git a/modules/text_generation.py b/modules/text_generation.py
index f9082a31..c9f4fc6a 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -182,6 +182,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
# Generate the reply 8 tokens at a time
else:
yield formatted_outputs(original_question, shared.model_name)
+ shared.still_streaming = True
for i in tqdm(range(max_new_tokens//8+1)):
with torch.no_grad():
output = eval(f"shared.model.generate({', '.join(generate_params)}){cuda}")[0]
@@ -191,8 +192,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
reply = decode(output)
if not (shared.args.chat or shared.args.cai_chat):
reply = original_question + apply_extensions(reply[len(question):], "output")
- yield formatted_outputs(reply, shared.model_name)
-
+
if not shared.args.flexgen:
if output[-1] == n:
break
@@ -201,6 +201,13 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
if np.count_nonzero(input_ids[0] == n) < np.count_nonzero(output == n):
break
input_ids = np.reshape(output, (1, output.shape[0]))
+
+            #Mid-stream yield, runs if no break occurred
+ yield formatted_outputs(reply, shared.model_name)
if shared.soft_prompt:
inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids)
+
+ #Stream finished from max tokens or break. Do final yield.
+ shared.still_streaming = False
+ yield formatted_outputs(reply, shared.model_name)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 55aeb8fd..48ca1e4e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ numpy
rwkv==0.0.6
safetensors==0.2.8
git+https://github.com/huggingface/transformers
+tensorboard
From 738be6dd59a6f9c2ee215093675f2d55111d89ca Mon Sep 17 00:00:00 2001
From: Xan <70198941+xanthousm@users.noreply.github.com>
Date: Wed, 8 Mar 2023 22:25:55 +1100
Subject: [PATCH 07/69] Fix merge errors and unlimited wav bug
---
extensions/silero_tts/script.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py
index 53bd554c..eaf56159 100644
--- a/extensions/silero_tts/script.py
+++ b/extensions/silero_tts/script.py
@@ -93,11 +93,11 @@ def output_modifier(string):
     string ='<speak>'+prosody+xmlesc(string)+'</speak>'
output_file = Path(f'extensions/silero_tts/outputs/{wav_idx:06d}.wav')
- model.save_wav(text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
+ model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
     string = f'<audio src="file/{output_file.as_posix()}" controls></audio>'
#reset if too many wavs. set max to -1 for unlimited.
- if wav_idx < params['max_wavs'] and params['max_wavs'] > 0:
+ if wav_idx < params['max_wavs'] or params['max_wavs'] < 0:
#only increment if starting a new stream, else replace during streaming. Does not update duration on webui sometimes?
if not shared.still_streaming:
wav_idx += 1
From add9330e5e90e33f3f8bbe0ea42290475deb9998 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 8 Mar 2023 11:26:29 -0300
Subject: [PATCH 08/69] Bug fixes
---
modules/text_generation.py | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 35617314..8f5ea798 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -115,7 +115,8 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
print(f"\n\n{question}\n--------------------\n")
input_ids = encode(question, max_new_tokens)
- original_input_ids = output = input_ids
+ original_input_ids = input_ids
+ output = input_ids[0]
cuda = "" if (shared.args.cpu or shared.args.deepspeed or shared.args.flexgen) else ".cuda()"
n = shared.tokenizer.eos_token_id if eos_token is None else int(encode(eos_token)[0][-1])
if stopping_string is not None:
@@ -186,7 +187,8 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
if 'stopping_criteria' not in kwargs:
kwargs['stopping_criteria'] = []
kwargs['stopping_criteria'].append(Stream(callback_func=callback))
- shared.model.generate(**kwargs)[0]
+ clear_torch_cache()
+ shared.model.generate(**kwargs)
def generate_with_streaming(**kwargs):
return Iteratorize(generate_with_callback, kwargs, callback=None)
@@ -208,7 +210,6 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
else:
for i in range(max_new_tokens//8+1):
clear_torch_cache()
-
with torch.no_grad():
output = eval(f"shared.model.generate({', '.join(generate_params)})")[0]
if shared.soft_prompt:
From 59b5f7a4b731c528f0fa53d70eb3318d3a1727df Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 8 Mar 2023 12:13:40 -0300
Subject: [PATCH 09/69] Improve usage of stopping_criteria
---
modules/text_generation.py | 19 ++++++-------------
1 file changed, 6 insertions(+), 13 deletions(-)
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 8f5ea798..6a59f9a7 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -119,18 +119,11 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
output = input_ids[0]
cuda = "" if (shared.args.cpu or shared.args.deepspeed or shared.args.flexgen) else ".cuda()"
n = shared.tokenizer.eos_token_id if eos_token is None else int(encode(eos_token)[0][-1])
+ stopping_criteria_list = transformers.StoppingCriteriaList()
if stopping_string is not None:
- # The stopping_criteria code below was copied from
- # https://github.com/PygmalionAI/gradio-ui/blob/master/src/model.py
+ # Copied from https://github.com/PygmalionAI/gradio-ui/blob/master/src/model.py
t = encode(stopping_string, 0, add_special_tokens=False)
- stopping_criteria_list = transformers.StoppingCriteriaList([
- _SentinelTokenStoppingCriteria(
- sentinel_token_ids=t,
- starting_idx=len(input_ids[0])
- )
- ])
- else:
- stopping_criteria_list = []
+ stopping_criteria_list.append(_SentinelTokenStoppingCriteria(sentinel_token_ids=t, starting_idx=len(input_ids[0])))
if not shared.args.flexgen:
generate_params = [
@@ -184,17 +177,17 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
elif not shared.args.flexgen:
def generate_with_callback(callback=None, **kwargs):
- if 'stopping_criteria' not in kwargs:
- kwargs['stopping_criteria'] = []
kwargs['stopping_criteria'].append(Stream(callback_func=callback))
clear_torch_cache()
- shared.model.generate(**kwargs)
+ with torch.no_grad():
+ shared.model.generate(**kwargs)
def generate_with_streaming(**kwargs):
return Iteratorize(generate_with_callback, kwargs, callback=None)
yield formatted_outputs(original_question, shared.model_name)
for output in eval(f"generate_with_streaming({', '.join(generate_params)})"):
+            print('Used VRAM in GiB:', torch.cuda.memory_allocated() / 1024**3)
if shared.soft_prompt:
output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
reply = decode(output)
From a2b5383398adc6da5c46811179bfadaefa5e23f7 Mon Sep 17 00:00:00 2001
From: Xan <70198941+xanthousm@users.noreply.github.com>
Date: Thu, 9 Mar 2023 10:48:44 +1100
Subject: [PATCH 10/69] Merge in audio generation only on text stream finish,
 postpone audio block autoplay
- Keep simpleaudio until the audio block's "autoplay" no longer plays previous messages
- Only generate audio for finished messages
- Better name for autoplay, clean up comments
- Set default to unlimited wav files. Still a few bugs when the wav id resets
Co-Authored-By: Christoph Hess <9931495+ChristophHess@users.noreply.github.com>
---
extensions/silero_tts/script.py | 34 +++++++++++++++++++--------------
1 file changed, 20 insertions(+), 14 deletions(-)
diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py
index eaf56159..334b02b9 100644
--- a/extensions/silero_tts/script.py
+++ b/extensions/silero_tts/script.py
@@ -15,14 +15,15 @@ params = {
'model_id': 'v3_en',
'sample_rate': 48000,
'device': 'cpu',
- 'max_wavs': 20,
- 'play_audio': True,
+ 'max_wavs': -1,
+ 'autoplay': True,
'show_text': True,
}
current_params = params.copy()
voices_by_gender = ['en_99', 'en_45', 'en_18', 'en_117', 'en_49', 'en_51', 'en_68', 'en_0', 'en_26', 'en_56', 'en_74', 'en_5', 'en_38', 'en_53', 'en_21', 'en_37', 'en_107', 'en_10', 'en_82', 'en_16', 'en_41', 'en_12', 'en_67', 'en_61', 'en_14', 'en_11', 'en_39', 'en_52', 'en_24', 'en_97', 'en_28', 'en_72', 'en_94', 'en_36', 'en_4', 'en_43', 'en_88', 'en_25', 'en_65', 'en_6', 'en_44', 'en_75', 'en_91', 'en_60', 'en_109', 'en_85', 'en_101', 'en_108', 'en_50', 'en_96', 'en_64', 'en_92', 'en_76', 'en_33', 'en_116', 'en_48', 'en_98', 'en_86', 'en_62', 'en_54', 'en_95', 'en_55', 'en_111', 'en_3', 'en_83', 'en_8', 'en_47', 'en_59', 'en_1', 'en_2', 'en_7', 'en_9', 'en_13', 'en_15', 'en_17', 'en_19', 'en_20', 'en_22', 'en_23', 'en_27', 'en_29', 'en_30', 'en_31', 'en_32', 'en_34', 'en_35', 'en_40', 'en_42', 'en_46', 'en_57', 'en_58', 'en_63', 'en_66', 'en_69', 'en_70', 'en_71', 'en_73', 'en_77', 'en_78', 'en_79', 'en_80', 'en_81', 'en_84', 'en_87', 'en_89', 'en_90', 'en_93', 'en_100', 'en_102', 'en_103', 'en_104', 'en_105', 'en_106', 'en_110', 'en_112', 'en_113', 'en_114', 'en_115']
wav_idx = 0
+#Used for making text xml compatible, needed for voice pitch and speed control
table = str.maketrans({
"<": "<",
">": ">",
@@ -88,27 +89,32 @@ def output_modifier(string):
#x-slow, slow, medium, fast, x-fast
#x-low, low, medium, high, x-high
- #prosody=''
- prosody=''
+ prosody=''
     string ='<speak>'+prosody+xmlesc(string)+'</speak>'
output_file = Path(f'extensions/silero_tts/outputs/{wav_idx:06d}.wav')
- model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
-    string = f'<audio src="file/{output_file.as_posix()}" controls></audio>'
+ autoplay_str = ''
+ if not shared.still_streaming:
+ model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
+        #disabled until autoplay doesn't run on previous messages
+ #autoplay = 'autoplay' if (params['autoplay'] and auto_playable) else ''
+        string = f'<audio src="file/{output_file.as_posix()}" controls {autoplay_str}></audio>\n\n'
+ else:
+        #placeholder so the text doesn't shift around so much
+        string = f'<audio controls></audio>\n\n'
#reset if too many wavs. set max to -1 for unlimited.
if wav_idx < params['max_wavs'] or params['max_wavs'] < 0:
- #only increment if starting a new stream, else replace during streaming. Does not update duration on webui sometimes?
+ #only increment if starting a new stream, else replace during streaming.
if not shared.still_streaming:
wav_idx += 1
else:
wav_idx = 0
-
+
if params['show_text']:
- string+='\n\n'+orig_string
-
- #if params['play_audio'] == True and auto_playable and shared.stop_everything:
- if params['play_audio'] == True and auto_playable and not shared.still_streaming:
+ string+=orig_string
+
+ if params['autoplay'] == True and auto_playable and not shared.still_streaming:
stop_autoplay()
wave_obj = sa.WaveObject.from_wave_file(output_file.as_posix())
wave_obj.play()
@@ -131,13 +137,13 @@ def ui():
# Gradio elements
activate = gr.Checkbox(value=params['activate'], label='Activate TTS')
show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player')
- play_audio = gr.Checkbox(value=params['play_audio'], label='Play TTS automatically')
+ autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically')
stop_audio = gr.Button("Stop Auto-Play")
voice = gr.Dropdown(value=params['speaker'], choices=voices_by_gender, label='TTS voice')
# Event functions to update the parameters in the backend
activate.change(lambda x: params.update({"activate": x}), activate, None)
- play_audio.change(lambda x: params.update({"play_audio": x}), play_audio, None)
+ autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None)
show_text.change(lambda x: params.update({"show_text": x}), show_text, None)
stop_audio.click(stop_autoplay)
voice.change(lambda x: params.update({"speaker": x}), voice, None)
From 4dd14dcab4778b2d4e031db9cdfa94a2e1fe13e6 Mon Sep 17 00:00:00 2001
From: Chimdumebi Nebolisa <78305519+MichealC0@users.noreply.github.com>
Date: Thu, 9 Mar 2023 10:22:09 +0100
Subject: [PATCH 11/69] Update README.md
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 9efacb7c..23d53604 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ The third line assumes that you have an NVIDIA GPU.
pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.2
```
-* If you are running in CPU mode, replace the third command with this one:
+* If you are running it in CPU mode, replace the third command with this one:
```
conda install pytorch torchvision torchaudio git -c pytorch
From 828a524f9a957f56c1985d71f941715727fd1db4 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 9 Mar 2023 15:50:26 -0300
Subject: [PATCH 12/69] Add LLaMA 4-bit support
---
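Notes: the 4-bit path maps the model folder name to a pre-quantized .pt checkpoint and then loads it with GPTQ-for-LLaMa. A small sketch of the name lookup; the folder name in the example is hypothetical.

    from pathlib import Path

    def find_4bit_checkpoint(model_name):
        name = Path(f'models/{model_name}').name.lower()
        for size in ('7b', '13b', '30b'):
            if name.startswith(f'llama-{size}'):
                return Path(f'models/llama-{size}-4bit.pt')
        return None

    print(find_4bit_checkpoint('llama-7b-hf'))  # models/llama-7b-4bit.pt

The quantized weights are then loaded with load_quant(path_to_model, Path(f"models/{pt_model}"), 4) and the model is moved to cuda:0, as in the diff below.
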
modules/models.py | 22 +++++++++++++++++++++-
modules/shared.py | 1 +
requirements.txt | 2 +-
3 files changed, 23 insertions(+), 2 deletions(-)
diff --git a/modules/models.py b/modules/models.py
index 16ce6eb1..04235b52 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -1,5 +1,6 @@
import json
import os
+import sys
import time
import zipfile
from pathlib import Path
@@ -41,7 +42,7 @@ def load_model(model_name):
shared.is_RWKV = model_name.lower().startswith('rwkv-')
# Default settings
- if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen or shared.is_RWKV):
+ if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.load_in_4bit or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen or shared.is_RWKV):
if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')):
model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True)
else:
@@ -86,6 +87,24 @@ def load_model(model_name):
return model, tokenizer
+ # 4-bit LLaMA
+ elif shared.args.load_in_4bit:
+ sys.path.append(os.path.abspath(Path("repositories/GPTQ-for-LLaMa")))
+
+ from llama import load_quant
+
+ path_to_model = Path(f'models/{model_name}')
+ pt_model = ''
+ if path_to_model.name.lower().startswith('llama-7b'):
+ pt_model = 'llama-7b-4bit.pt'
+ if path_to_model.name.lower().startswith('llama-13b'):
+ pt_model = 'llama-13b-4bit.pt'
+ if path_to_model.name.lower().startswith('llama-30b'):
+ pt_model = 'llama-30b-4bit.pt'
+
+ model = load_quant(path_to_model, Path(f"models/{pt_model}"), 4)
+ model = model.to(torch.device('cuda:0'))
+
# Custom
else:
command = "AutoModelForCausalLM.from_pretrained"
@@ -159,3 +178,4 @@ def load_soft_prompt(name):
shared.soft_prompt_tensor = tensor
return name
+
diff --git a/modules/shared.py b/modules/shared.py
index b609045c..4c062fe9 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -68,6 +68,7 @@ parser.add_argument('--chat', action='store_true', help='Launch the web UI in ch
parser.add_argument('--cai-chat', action='store_true', help='Launch the web UI in chat mode with a style similar to Character.AI\'s. If the file img_bot.png or img_bot.jpg exists in the same folder as server.py, this image will be used as the bot\'s profile picture. Similarly, img_me.png or img_me.jpg will be used as your profile picture.')
parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text.')
parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.')
+parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision. Currently only works with LLaMA.')
parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.')
parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
diff --git a/requirements.txt b/requirements.txt
index 47c56a45..6133f394 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,4 +6,4 @@ numpy
rwkv==0.1.0
safetensors==0.2.8
sentencepiece
-git+https://github.com/oobabooga/transformers@llama_push
+git+https://github.com/zphang/transformers@llama_push
From fd540b89309a138a17147955ecf8ea2049af4ca2 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 9 Mar 2023 17:59:15 -0300
Subject: [PATCH 13/69] Use new LLaMA implementation (this will break stuff. I
am sorry)
https://github.com/oobabooga/text-generation-webui/wiki/LLaMA-model
---
requirements.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/requirements.txt b/requirements.txt
index 47c56a45..6133f394 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,4 +6,4 @@ numpy
rwkv==0.1.0
safetensors==0.2.8
sentencepiece
-git+https://github.com/oobabooga/transformers@llama_push
+git+https://github.com/zphang/transformers@llama_push
From d41e3c233b4b4bccf6b0b36ff3f1db8701e52d5c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 9 Mar 2023 18:02:44 -0300
Subject: [PATCH 14/69] Update README.md
---
README.md | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 65596321..9fe454c2 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# Text generation web UI
-A gradio web UI for running Large Language Models like GPT-J 6B, OPT, GALACTICA, GPT-Neo, and Pygmalion.
+A gradio web UI for running Large Language Models like GPT-J 6B, OPT, GALACTICA, LLaMA, and Pygmalion.
Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) of text generation.
@@ -27,6 +27,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
* [FlexGen offload](https://github.com/oobabooga/text-generation-webui/wiki/FlexGen).
* [DeepSpeed ZeRO-3 offload](https://github.com/oobabooga/text-generation-webui/wiki/DeepSpeed).
* Get responses via API, [with](https://github.com/oobabooga/text-generation-webui/blob/main/api-example-streaming.py) or [without](https://github.com/oobabooga/text-generation-webui/blob/main/api-example.py) streaming.
+* [Supports the LLaMA model](https://github.com/oobabooga/text-generation-webui/wiki/LLaMA-model).
* [Supports the RWKV model](https://github.com/oobabooga/text-generation-webui/wiki/RWKV-model).
* Supports softprompts.
* [Supports extensions](https://github.com/oobabooga/text-generation-webui/wiki/Extensions).
From 2965aa1625a1186fcf36a559235881d1382f2366 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 9 Mar 2023 20:48:51 -0300
Subject: [PATCH 15/69] Check if the .pt file exists
---
modules/models.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/modules/models.py b/modules/models.py
index 04235b52..e10668cf 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -102,6 +102,10 @@ def load_model(model_name):
if path_to_model.name.lower().startswith('llama-30b'):
pt_model = 'llama-30b-4bit.pt'
+ if not Path(f"models/{pt_model}").exists():
+ print(f"Could not find models/{pt_model}, exiting...")
+ exit()
+
model = load_quant(path_to_model, Path(f"models/{pt_model}"), 4)
model = model.to(torch.device('cuda:0'))
@@ -178,4 +182,3 @@ def load_soft_prompt(name):
shared.soft_prompt_tensor = tensor
return name
-
From 74102d5ee48fcf68939ff4fc3ca7e34e6623bcb7 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 9 Mar 2023 20:51:22 -0300
Subject: [PATCH 16/69] Insert to the path instead of appending
---
modules/models.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/models.py b/modules/models.py
index e10668cf..0ad4c198 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -89,7 +89,7 @@ def load_model(model_name):
# 4-bit LLaMA
elif shared.args.load_in_4bit:
- sys.path.append(os.path.abspath(Path("repositories/GPTQ-for-LLaMa")))
+ sys.path.insert(0, os.path.abspath(Path("repositories/GPTQ-for-LLaMa")))
from llama import load_quant
From eb0cb9b6df58c397bda377deefeb14a2c0b0e0f9 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 9 Mar 2023 20:53:52 -0300
Subject: [PATCH 17/69] Update README
---
README.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/README.md b/README.md
index 65596321..de498fb6 100644
--- a/README.md
+++ b/README.md
@@ -137,6 +137,7 @@ Optionally, you can use the following command-line flags:
| `--cai-chat` | Launch the web UI in chat mode with a style similar to Character.AI's. If the file `img_bot.png` or `img_bot.jpg` exists in the same folder as server.py, this image will be used as the bot's profile picture. Similarly, `img_me.png` or `img_me.jpg` will be used as your profile picture. |
| `--cpu` | Use the CPU to generate text.|
| `--load-in-8bit` | Load the model with 8-bit precision.|
+| `--load-in-4bit` | Load the model with 4-bit precision. Currently only works with LLaMA. |
| `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. |
| `--auto-devices` | Automatically split the model across the available GPU(s) and CPU.|
| `--disk` | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. |
From 9849aac0f1284c5fa02509f1e197cc248e2c4700 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 9 Mar 2023 21:54:50 -0300
Subject: [PATCH 18/69] Don't show .pt models in the list
---
modules/models.py | 3 +++
server.py | 2 +-
2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/modules/models.py b/modules/models.py
index 0ad4c198..3e6cea18 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -105,6 +105,9 @@ def load_model(model_name):
if not Path(f"models/{pt_model}").exists():
print(f"Could not find models/{pt_model}, exiting...")
exit()
+ elif pt_model == '':
+ print(f"Could not find the .pt model for {model_name}, exiting...")
+ exit()
model = load_quant(path_to_model, Path(f"models/{pt_model}"), 4)
model = model.to(torch.device('cuda:0'))
diff --git a/server.py b/server.py
index 7d8792b7..c2977f41 100644
--- a/server.py
+++ b/server.py
@@ -37,7 +37,7 @@ def get_available_models():
if shared.args.flexgen:
return sorted([re.sub('-np$', '', item.name) for item in list(Path('models/').glob('*')) if item.name.endswith('-np')], key=str.lower)
else:
- return sorted([item.name for item in list(Path('models/').glob('*')) if not item.name.endswith(('.txt', '-np'))], key=str.lower)
+ return sorted([item.name for item in list(Path('models/').glob('*')) if not item.name.endswith(('.txt', '-np', '.pt'))], key=str.lower)
def get_available_presets():
return sorted(set(map(lambda x : '.'.join(str(x.name).split('.')[:-1]), Path('presets').glob('*.txt'))), key=str.lower)
From 826e297b0ec40299318f1002f9165e7ac9c9c257 Mon Sep 17 00:00:00 2001
From: rohvani <3782201+rohvani@users.noreply.github.com>
Date: Thu, 9 Mar 2023 18:31:32 -0800
Subject: [PATCH 19/69] add llama-65b-4bit support & multiple pt paths
---
modules/models.py | 24 ++++++++++++++++--------
1 file changed, 16 insertions(+), 8 deletions(-)
diff --git a/modules/models.py b/modules/models.py
index 3e6cea18..062ccb1f 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -97,19 +97,27 @@ def load_model(model_name):
pt_model = ''
if path_to_model.name.lower().startswith('llama-7b'):
pt_model = 'llama-7b-4bit.pt'
- if path_to_model.name.lower().startswith('llama-13b'):
+ elif path_to_model.name.lower().startswith('llama-13b'):
pt_model = 'llama-13b-4bit.pt'
- if path_to_model.name.lower().startswith('llama-30b'):
+ elif path_to_model.name.lower().startswith('llama-30b'):
pt_model = 'llama-30b-4bit.pt'
-
- if not Path(f"models/{pt_model}").exists():
- print(f"Could not find models/{pt_model}, exiting...")
- exit()
- elif pt_model == '':
+ elif path_to_model.name.lower().startswith('llama-65b'):
+ pt_model = 'llama-65b-4bit.pt'
+ else:
print(f"Could not find the .pt model for {model_name}, exiting...")
exit()
- model = load_quant(path_to_model, Path(f"models/{pt_model}"), 4)
+ # check root of models folder, and model path root
+ paths = [ f"{path_to_model}/{pt_model}", f"models/{pt_model}" ]
+ for path in [ Path(p) for p in paths ]:
+ if path.exists():
+ pt_path = path
+
+ if not pt_path:
+ print(f"Could not find {pt_model}, exiting...")
+ exit()
+
+ model = load_quant(path_to_model, pt_path, 4)
model = model.to(torch.device('cuda:0'))
# Custom
From 5ee376c580e8c2cf2e3b34e1822c43e6754b2649 Mon Sep 17 00:00:00 2001
From: rohvani <3782201+rohvani@users.noreply.github.com>
Date: Thu, 9 Mar 2023 18:31:41 -0800
Subject: [PATCH 20/69] add LLaMA preset
---
presets/LLaMA-Default.txt | 12 ++++++++++++
1 file changed, 12 insertions(+)
create mode 100644 presets/LLaMA-Default.txt
diff --git a/presets/LLaMA-Default.txt b/presets/LLaMA-Default.txt
new file mode 100644
index 00000000..3df8209a
--- /dev/null
+++ b/presets/LLaMA-Default.txt
@@ -0,0 +1,12 @@
+do_sample=False
+temperature=0.7
+top_p=0
+typical_p=1
+repetition_penalty=1.15
+top_k=40
+num_beams=1
+penalty_alpha=0
+min_length=0
+length_penalty=1
+no_repeat_ngram_size=0
+early_stopping=True
From ec3de0495c52a6d81495ac0553f4a7a886e4e0c8 Mon Sep 17 00:00:00 2001
From: Ber Zoidberg
Date: Thu, 9 Mar 2023 19:08:09 -0800
Subject: [PATCH 21/69] download tokenizer when present
---
download-model.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/download-model.py b/download-model.py
index 599418fc..27fbffda 100644
--- a/download-model.py
+++ b/download-model.py
@@ -107,9 +107,10 @@ def get_download_links_from_huggingface(model, branch):
is_pytorch = re.match("pytorch_model.*\.bin", fname)
is_safetensors = re.match("model.*\.safetensors", fname)
+ is_tokenizer = re.match("tokenizer.*\.model", fname)
is_text = re.match(".*\.(txt|json)", fname)
- if is_text or is_safetensors or is_pytorch:
+ if any((is_pytorch, is_safetensors, is_text, is_tokenizer)):
if is_text:
links.append(f"https://huggingface.co/{model}/resolve/{branch}/{fname}")
classifications.append('text')
From 249c268176114e72da3e82d7e2c652481060f44f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 10 Mar 2023 00:41:10 -0300
Subject: [PATCH 22/69] Fix the download script for long lists of files on HF
---
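Notes: the HF tree API returns one page of files at a time, so the script now follows a cursor: base64-encode {"file_name": "<last file on the page>"}, append ":50" (the page size), base64-encode the whole thing again, and URL-escape the padding. A standalone sketch of the cursor construction; the file name is just an example.

    import base64

    def next_cursor(last_file_name):
        cursor = base64.b64encode(f'{{"file_name":"{last_file_name}"}}'.encode()) + b':50'
        cursor = base64.b64encode(cursor)
        return cursor.replace(b'=', b'%3D')

    print(next_cursor("pytorch_model-00001-of-00002.bin"))

The download loop then requests f"{base}{page}{cursor.decode()}" until the API returns an empty list.
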
download-model.py | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/download-model.py b/download-model.py
index 599418fc..98b57bb0 100644
--- a/download-model.py
+++ b/download-model.py
@@ -5,7 +5,9 @@ Example:
python download-model.py facebook/opt-1.3b
'''
+
import argparse
+import base64
import json
import multiprocessing
import re
@@ -93,14 +95,18 @@ facebook/opt-1.3b
def get_download_links_from_huggingface(model, branch):
base = "https://huggingface.co"
page = f"/api/models/{model}/tree/{branch}?cursor="
+ cursor = b""
links = []
classifications = []
has_pytorch = False
has_safetensors = False
- while page is not None:
- content = requests.get(f"{base}{page}").content
+ while True:
+ content = requests.get(f"{base}{page}{cursor.decode()}").content
+
dict = json.loads(content)
+ if len(dict) == 0:
+ break
for i in range(len(dict)):
fname = dict[i]['path']
@@ -123,8 +129,9 @@ def get_download_links_from_huggingface(model, branch):
has_pytorch = True
classifications.append('pytorch')
- #page = dict['nextUrl']
- page = None
+ cursor = base64.b64encode(f'{{"file_name":"{dict[-1]["path"]}"}}'.encode()) + b':50'
+ cursor = base64.b64encode(cursor)
+ cursor = cursor.replace(b'=', b'%3D')
# If both pytorch and safetensors are available, download safetensors only
if has_pytorch and has_safetensors:
From 875847bf88c52166c4e9a0cc35f7e6c535b88d97 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 10 Mar 2023 00:45:28 -0300
Subject: [PATCH 23/69] Consider tokenizer a type of text
---
download-model.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/download-model.py b/download-model.py
index 27fbffda..bf94be7c 100644
--- a/download-model.py
+++ b/download-model.py
@@ -108,7 +108,7 @@ def get_download_links_from_huggingface(model, branch):
is_pytorch = re.match("pytorch_model.*\.bin", fname)
is_safetensors = re.match("model.*\.safetensors", fname)
is_tokenizer = re.match("tokenizer.*\.model", fname)
- is_text = re.match(".*\.(txt|json)", fname)
+ is_text = re.match(".*\.(txt|json)", fname) or is_tokenizer
if any((is_pytorch, is_safetensors, is_text, is_tokenizer)):
if is_text:
From 2ac29137470396733e95e7efa77e091d5e8a5ef5 Mon Sep 17 00:00:00 2001
From: rohvani <3782201+rohvani@users.noreply.github.com>
Date: Thu, 9 Mar 2023 20:13:23 -0800
Subject: [PATCH 24/69] fix reference issue
---
modules/models.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/modules/models.py b/modules/models.py
index 062ccb1f..a2256b98 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -109,6 +109,7 @@ def load_model(model_name):
# check root of models folder, and model path root
paths = [ f"{path_to_model}/{pt_model}", f"models/{pt_model}" ]
+ pt_path = None
for path in [ Path(p) for p in paths ]:
if path.exists():
pt_path = path
From ab470444591e425290db72db9ebc3127f5520449 Mon Sep 17 00:00:00 2001
From: deepdiffuser
Date: Fri, 10 Mar 2023 04:29:09 -0800
Subject: [PATCH 25/69] add multi-gpu support for 4bit gptq LLaMA
---
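Notes: the split works by capping per-device memory and letting accelerate place the already-loaded modules. A minimal sketch of the API shape, assuming two GPUs and example memory caps; the toy model below stands in for the quantized LLaMA.

    import accelerate
    import torch.nn as nn

    model = nn.Sequential(nn.Linear(512, 512), nn.ReLU(), nn.Linear(512, 512))

    # In the patch, max_memory is built from --gpu-memory and --cpu-memory.
    max_memory = {0: "3GiB", 1: "3GiB", "cpu": "99GiB"}
    device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory)
    model = accelerate.dispatch_model(model, device_map=device_map)

For the real model, a follow-up commit in this series also passes no_split_module_classes=["LLaMADecoderLayer"] so that a decoder layer is never split across devices.
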
modules/models.py | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/modules/models.py b/modules/models.py
index 3e6cea18..14443c89 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -110,7 +110,18 @@ def load_model(model_name):
exit()
model = load_quant(path_to_model, Path(f"models/{pt_model}"), 4)
- model = model.to(torch.device('cuda:0'))
+
+ if shared.args.gpu_memory:
+ max_memory = {}
+ for i in range(len(shared.args.gpu_memory)):
+ max_memory[i] = f"{shared.args.gpu_memory[i]}GiB"
+ max_memory['cpu'] = f"{shared.args.cpu_memory or '99'}GiB"
+
+ import accelerate
+ device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory)
+ model = accelerate.dispatch_model(model, device_map=device_map)
+ else:
+ model = model.to(torch.device('cuda:0'))
# Custom
else:
From 9fbd60bf22c6a2e9cef0cade23a4933547df9114 Mon Sep 17 00:00:00 2001
From: deepdiffuser
Date: Fri, 10 Mar 2023 05:30:47 -0800
Subject: [PATCH 26/69] add no_split_module_classes to prevent tensor split
error
---
modules/models.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/models.py b/modules/models.py
index 14443c89..986cd73a 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -118,7 +118,7 @@ def load_model(model_name):
max_memory['cpu'] = f"{shared.args.cpu_memory or '99'}GiB"
import accelerate
- device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory)
+ device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LLaMADecoderLayer"])
model = accelerate.dispatch_model(model, device_map=device_map)
else:
model = model.to(torch.device('cuda:0'))
From e461c0b7a0769c4df3aa96505803b004a1071c2e Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 10 Mar 2023 10:51:12 -0300
Subject: [PATCH 27/69] Move the import to the top
---
modules/models.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/modules/models.py b/modules/models.py
index 986cd73a..f4c1071d 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -112,12 +112,13 @@ def load_model(model_name):
model = load_quant(path_to_model, Path(f"models/{pt_model}"), 4)
if shared.args.gpu_memory:
+ import accelerate
+
max_memory = {}
for i in range(len(shared.args.gpu_memory)):
max_memory[i] = f"{shared.args.gpu_memory[i]}GiB"
max_memory['cpu'] = f"{shared.args.cpu_memory or '99'}GiB"
- import accelerate
device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LLaMADecoderLayer"])
model = accelerate.dispatch_model(model, device_map=device_map)
else:
From de7dd8b6aa3aa00ba629c9ba6ce1bc32bd213d2f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 10 Mar 2023 10:54:08 -0300
Subject: [PATCH 28/69] Add comments
---
modules/models.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/modules/models.py b/modules/models.py
index f4c1071d..a5ec59d1 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -111,6 +111,7 @@ def load_model(model_name):
model = load_quant(path_to_model, Path(f"models/{pt_model}"), 4)
+ # Multi-GPU setup
if shared.args.gpu_memory:
import accelerate
@@ -121,6 +122,8 @@ def load_model(model_name):
device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LLaMADecoderLayer"])
model = accelerate.dispatch_model(model, device_map=device_map)
+
+ # Single GPU
else:
model = model.to(torch.device('cuda:0'))
From 706a03b2cb5bf3c0667d8c13b3a47f1a6e33cc81 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 10 Mar 2023 11:02:25 -0300
Subject: [PATCH 29/69] Minor changes
---
modules/models.py | 8 +++-----
presets/LLaMA-Default.txt | 12 ------------
2 files changed, 3 insertions(+), 17 deletions(-)
delete mode 100644 presets/LLaMA-Default.txt
diff --git a/modules/models.py b/modules/models.py
index a2256b98..a23f1fa9 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -104,13 +104,11 @@ def load_model(model_name):
elif path_to_model.name.lower().startswith('llama-65b'):
pt_model = 'llama-65b-4bit.pt'
else:
- print(f"Could not find the .pt model for {model_name}, exiting...")
- exit()
+ pt_model = f'{model_name}-4bit.pt'
- # check root of models folder, and model path root
- paths = [ f"{path_to_model}/{pt_model}", f"models/{pt_model}" ]
+ # Try to find the .pt both in models/ and in the subfolder
pt_path = None
- for path in [ Path(p) for p in paths ]:
+ for path in [Path(p) for p in [f"models/{pt_model}", f"{path_to_model}/{pt_model}"]]:
if path.exists():
pt_path = path
diff --git a/presets/LLaMA-Default.txt b/presets/LLaMA-Default.txt
deleted file mode 100644
index 3df8209a..00000000
--- a/presets/LLaMA-Default.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-do_sample=False
-temperature=0.7
-top_p=0
-typical_p=1
-repetition_penalty=1.15
-top_k=40
-num_beams=1
-penalty_alpha=0
-min_length=0
-length_penalty=1
-no_repeat_ngram_size=0
-early_stopping=True
From 026d60bd3424b5426c5ef80632aa6b71fe12d4c5 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 10 Mar 2023 14:01:02 -0300
Subject: [PATCH 30/69] Remove default preset that didn't do anything
---
modules/shared.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/modules/shared.py b/modules/shared.py
index 4c062fe9..2acb047f 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -42,7 +42,6 @@ settings = {
'default': 'NovelAI-Sphinx Moth',
'pygmalion-*': 'Pygmalion',
'RWKV-*': 'Naive',
- '(rosey|chip|joi)_.*_instruct.*': 'Instruct Joi (Contrastive Search)'
},
'prompts': {
'default': 'Common sense questions and answers\n\nQuestion: \nFactual answer:',
From e6c631aea4dd4596606b0f058173de223909d372 Mon Sep 17 00:00:00 2001
From: draff
Date: Fri, 10 Mar 2023 21:36:45 +0000
Subject: [PATCH 31/69] Replace --load-in-4bit with --llama-bits
Replaces --load-in-4bit with a more flexible --llama-bits arg to allow for 2-bit and 3-bit models as well. This commit also fixes a loading issue with .pt files that are not in the root of the models folder.
---
README.md | 2 +-
modules/models.py | 17 +++++++++--------
modules/shared.py | 2 +-
3 files changed, 11 insertions(+), 10 deletions(-)
diff --git a/README.md b/README.md
index c329913d..5c560172 100644
--- a/README.md
+++ b/README.md
@@ -138,7 +138,7 @@ Optionally, you can use the following command-line flags:
| `--cai-chat` | Launch the web UI in chat mode with a style similar to Character.AI's. If the file `img_bot.png` or `img_bot.jpg` exists in the same folder as server.py, this image will be used as the bot's profile picture. Similarly, `img_me.png` or `img_me.jpg` will be used as your profile picture. |
| `--cpu` | Use the CPU to generate text.|
| `--load-in-8bit` | Load the model with 8-bit precision.|
-| `--load-in-4bit` | Load the model with 4-bit precision. Currently only works with LLaMA. |
+| `--llama-bits` | Load LLaMA models with specified precision. 2, 3 and 4 bit are supported, use standard `--load-in-8bit` for 8bit precision. |
| `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. |
| `--auto-devices` | Automatically split the model across the available GPU(s) and CPU.|
| `--disk` | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. |
diff --git a/modules/models.py b/modules/models.py
index f31d8b0d..467ffbee 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -42,7 +42,7 @@ def load_model(model_name):
shared.is_RWKV = model_name.lower().startswith('rwkv-')
# Default settings
- if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.load_in_4bit or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen or shared.is_RWKV):
+ if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.llama_bits>0 or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen or shared.is_RWKV):
if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')):
model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True)
else:
@@ -88,23 +88,24 @@ def load_model(model_name):
return model, tokenizer
# 4-bit LLaMA
- elif shared.args.load_in_4bit:
+ elif shared.args.llama_bits>0:
sys.path.insert(0, os.path.abspath(Path("repositories/GPTQ-for-LLaMa")))
+ bits = shared.args.llama_bits
from llama import load_quant
path_to_model = Path(f'models/{model_name}')
pt_model = ''
if path_to_model.name.lower().startswith('llama-7b'):
- pt_model = 'llama-7b-4bit.pt'
+ pt_model = f'llama-7b-{bits}bit.pt'
elif path_to_model.name.lower().startswith('llama-13b'):
- pt_model = 'llama-13b-4bit.pt'
+ pt_model = f'llama-13b-{bits}bit.pt'
elif path_to_model.name.lower().startswith('llama-30b'):
- pt_model = 'llama-30b-4bit.pt'
+ pt_model = f'llama-30b-{bits}bit.pt'
elif path_to_model.name.lower().startswith('llama-65b'):
- pt_model = 'llama-65b-4bit.pt'
+ pt_model = f'llama-65b-{bits}bit.pt'
else:
- pt_model = f'{model_name}-4bit.pt'
+ pt_model = f'{model_name}-{bits}bit.pt'
# Try to find the .pt both in models/ and in the subfolder
pt_path = None
@@ -116,7 +117,7 @@ def load_model(model_name):
print(f"Could not find {pt_model}, exiting...")
exit()
- model = load_quant(path_to_model, Path(f"models/{pt_model}"), 4)
+ model = load_quant(path_to_model, Path(f"{pt_path}"), bits)
# Multi-GPU setup
if shared.args.gpu_memory:
diff --git a/modules/shared.py b/modules/shared.py
index 2acb047f..61d5a768 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -67,7 +67,7 @@ parser.add_argument('--chat', action='store_true', help='Launch the web UI in ch
parser.add_argument('--cai-chat', action='store_true', help='Launch the web UI in chat mode with a style similar to Character.AI\'s. If the file img_bot.png or img_bot.jpg exists in the same folder as server.py, this image will be used as the bot\'s profile picture. Similarly, img_me.png or img_me.jpg will be used as your profile picture.')
parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text.')
parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.')
-parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision. Currently only works with LLaMA.')
+parser.add_argument('--llama-bits', type=int, default=0, help='Load LLaMA models with specified precision. 2, 3 and 4 bit are supported, use standard `--load-in-8bit` for 8bit precision.')
parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.')
parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
From 9ba8156a70b7d8d2cd79cac939aba22e080d8730 Mon Sep 17 00:00:00 2001
From: ItsLogic <38233332+ItsLogic@users.noreply.github.com>
Date: Fri, 10 Mar 2023 22:33:58 +0000
Subject: [PATCH 32/69] remove unnecessary Path()
---
modules/models.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/models.py b/modules/models.py
index 467ffbee..3ec68f17 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -117,7 +117,7 @@ def load_model(model_name):
print(f"Could not find {pt_model}, exiting...")
exit()
- model = load_quant(path_to_model, Path(f"{pt_path}"), bits)
+ model = load_quant(path_to_model, pt_path, bits)
# Multi-GPU setup
if shared.args.gpu_memory:
From 804486214b5a1b07fc4c57255053593bb980d349 Mon Sep 17 00:00:00 2001
From: draff
Date: Fri, 10 Mar 2023 23:21:01 +0000
Subject: [PATCH 33/69] Re-implement --load-in-4bit and update --llama-bits arg
description
---
README.md | 3 ++-
modules/models.py | 8 ++++++--
modules/shared.py | 3 ++-
3 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/README.md b/README.md
index 5c560172..76774c0b 100644
--- a/README.md
+++ b/README.md
@@ -138,7 +138,8 @@ Optionally, you can use the following command-line flags:
| `--cai-chat` | Launch the web UI in chat mode with a style similar to Character.AI's. If the file `img_bot.png` or `img_bot.jpg` exists in the same folder as server.py, this image will be used as the bot's profile picture. Similarly, `img_me.png` or `img_me.jpg` will be used as your profile picture. |
| `--cpu` | Use the CPU to generate text.|
| `--load-in-8bit` | Load the model with 8-bit precision.|
-| `--llama-bits` | Load LLaMA models with specified precision. 2, 3 and 4 bit are supported, use standard `--load-in-8bit` for 8bit precision. |
+| `--load-in-4bit` | Load the model with 4-bit precision. Currently only works with LLaMA.|
+| `--llama-bits` | Load pre-quantized models with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA. |
| `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. |
| `--auto-devices` | Automatically split the model across the available GPU(s) and CPU.|
| `--disk` | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. |
diff --git a/modules/models.py b/modules/models.py
index 3ec68f17..6c423a25 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -88,9 +88,13 @@ def load_model(model_name):
return model, tokenizer
# 4-bit LLaMA
- elif shared.args.llama_bits>0:
+ elif shared.args.llama_bits>0 or shared.args.load_in_4bit:
sys.path.insert(0, os.path.abspath(Path("repositories/GPTQ-for-LLaMa")))
- bits = shared.args.llama_bits
+ if shared.args.load_in_4bit:
+ bits = 4
+ else:
+ bits = shared.args.llama_bits
+
from llama import load_quant
diff --git a/modules/shared.py b/modules/shared.py
index 61d5a768..f3f46329 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -67,7 +67,8 @@ parser.add_argument('--chat', action='store_true', help='Launch the web UI in ch
parser.add_argument('--cai-chat', action='store_true', help='Launch the web UI in chat mode with a style similar to Character.AI\'s. If the file img_bot.png or img_bot.jpg exists in the same folder as server.py, this image will be used as the bot\'s profile picture. Similarly, img_me.png or img_me.jpg will be used as your profile picture.')
parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text.')
parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.')
-parser.add_argument('--llama-bits', type=int, default=0, help='Load LLaMA models with specified precision. 2, 3 and 4 bit are supported, use standard `--load-in-8bit` for 8bit precision.')
+parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision. Currently only works with LLaMA.')
+parser.add_argument('--llama-bits', type=int, default=0, help='Load pre-quantized models with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA.')
parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.')
parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
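
A minimal sketch of how the two flags combine and which filename is then expected, assuming the argparse attributes `load_in_4bit` and `llama_bits` from the diffs above; the helper name is illustrative:

    def resolve_quant_bits(load_in_4bit: bool, llama_bits: int):
        # --load-in-4bit stays as a simple switch; --llama-bits covers 2/3/4/8-bit .pt files
        if load_in_4bit:
            return 4
        return llama_bits if llama_bits > 0 else None

    # With --llama-bits 3, a model named llama-13b is expected to ship as
    # 'llama-13b-3bit.pt', found either in models/ or inside models/llama-13b/.
    bits = resolve_quant_bits(load_in_4bit=False, llama_bits=3)   # -> 3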
From 001e638b47331f24ac967dd982f8ce4781775f7d Mon Sep 17 00:00:00 2001
From: draff
Date: Fri, 10 Mar 2023 23:28:19 +0000
Subject: [PATCH 34/69] Make it actually work
---
modules/models.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/models.py b/modules/models.py
index 6c423a25..8e7caa8d 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -42,7 +42,7 @@ def load_model(model_name):
shared.is_RWKV = model_name.lower().startswith('rwkv-')
# Default settings
- if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.llama_bits>0 or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen or shared.is_RWKV):
+ if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.llama_bits>0 or shared.args.load_in_4bit or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen or shared.is_RWKV):
if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')):
model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True)
else:
From 28fd4fc9702c9fa3a52e2ca60ca034f01cbe3be9 Mon Sep 17 00:00:00 2001
From: draff
Date: Fri, 10 Mar 2023 23:34:13 +0000
Subject: [PATCH 35/69] Change wording to be consistent with other args
---
README.md | 2 +-
modules/shared.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 76774c0b..50d07cd6 100644
--- a/README.md
+++ b/README.md
@@ -139,7 +139,7 @@ Optionally, you can use the following command-line flags:
| `--cpu` | Use the CPU to generate text.|
| `--load-in-8bit` | Load the model with 8-bit precision.|
| `--load-in-4bit` | Load the model with 4-bit precision. Currently only works with LLaMA.|
-| `--llama-bits` | Load pre-quantized models with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA. |
+| `--llama-bits` | Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA. |
| `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. |
| `--auto-devices` | Automatically split the model across the available GPU(s) and CPU.|
| `--disk` | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. |
diff --git a/modules/shared.py b/modules/shared.py
index f3f46329..3ea4ef41 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -68,7 +68,7 @@ parser.add_argument('--cai-chat', action='store_true', help='Launch the web UI i
parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text.')
parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.')
parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision. Currently only works with LLaMA.')
-parser.add_argument('--llama-bits', type=int, default=0, help='Load pre-quantized models with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA.')
+parser.add_argument('--llama-bits', type=int, default=0, help='Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA.')
parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.')
parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
From 0dfac4b777009d415d848c2f0bc718ec1bbac7e5 Mon Sep 17 00:00:00 2001
From: Xan <70198941+xanthousm@users.noreply.github.com>
Date: Sat, 11 Mar 2023 16:34:59 +1100
Subject: [PATCH 36/69] Working HTML autoplay, clean up, improve wav naming
- New autoplay using an HTML tag; autoplay is removed from the old message when new input is provided
- Add voice pitch and speed controls
- Group settings together
- Use the character name + conversation history to match wavs to messages, minimizing problems when changing characters
Current minor bugs:
- Gradio seems to cache the audio files, so using "clear history" and generating new messages will play the old audio (the new messages are saved correctly). Gradio will clear the cache and use the correct audio after a few messages or after a page refresh.
- Switching characters does not immediately update the message ID used for the audio. The ID is updated after the first new message, but that message will use the wrong ID.
---
extensions/silero_tts/requirements.txt | 1 -
extensions/silero_tts/script.py | 79 +++++++++++++-------------
2 files changed, 38 insertions(+), 42 deletions(-)
diff --git a/extensions/silero_tts/requirements.txt b/extensions/silero_tts/requirements.txt
index b4444306..f2f0bff5 100644
--- a/extensions/silero_tts/requirements.txt
+++ b/extensions/silero_tts/requirements.txt
@@ -4,4 +4,3 @@ pydub
PyYAML
torch
torchaudio
-simpleaudio
diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py
index 334b02b9..b66963e2 100644
--- a/extensions/silero_tts/script.py
+++ b/extensions/silero_tts/script.py
@@ -4,7 +4,6 @@ import gradio as gr
import torch
import modules.shared as shared
-import simpleaudio as sa
torch._C._jit_set_profiling_mode(False)
@@ -15,13 +14,16 @@ params = {
'model_id': 'v3_en',
'sample_rate': 48000,
'device': 'cpu',
- 'max_wavs': -1,
- 'autoplay': True,
'show_text': True,
+ 'autoplay': True,
+ 'voice_pitch': 'medium',
+ 'voice_speed': 'medium',
}
current_params = params.copy()
voices_by_gender = ['en_99', 'en_45', 'en_18', 'en_117', 'en_49', 'en_51', 'en_68', 'en_0', 'en_26', 'en_56', 'en_74', 'en_5', 'en_38', 'en_53', 'en_21', 'en_37', 'en_107', 'en_10', 'en_82', 'en_16', 'en_41', 'en_12', 'en_67', 'en_61', 'en_14', 'en_11', 'en_39', 'en_52', 'en_24', 'en_97', 'en_28', 'en_72', 'en_94', 'en_36', 'en_4', 'en_43', 'en_88', 'en_25', 'en_65', 'en_6', 'en_44', 'en_75', 'en_91', 'en_60', 'en_109', 'en_85', 'en_101', 'en_108', 'en_50', 'en_96', 'en_64', 'en_92', 'en_76', 'en_33', 'en_116', 'en_48', 'en_98', 'en_86', 'en_62', 'en_54', 'en_95', 'en_55', 'en_111', 'en_3', 'en_83', 'en_8', 'en_47', 'en_59', 'en_1', 'en_2', 'en_7', 'en_9', 'en_13', 'en_15', 'en_17', 'en_19', 'en_20', 'en_22', 'en_23', 'en_27', 'en_29', 'en_30', 'en_31', 'en_32', 'en_34', 'en_35', 'en_40', 'en_42', 'en_46', 'en_57', 'en_58', 'en_63', 'en_66', 'en_69', 'en_70', 'en_71', 'en_73', 'en_77', 'en_78', 'en_79', 'en_80', 'en_81', 'en_84', 'en_87', 'en_89', 'en_90', 'en_93', 'en_100', 'en_102', 'en_103', 'en_104', 'en_105', 'en_106', 'en_110', 'en_112', 'en_113', 'en_114', 'en_115']
-wav_idx = 0
+voice_pitches = ['x-low', 'low', 'medium', 'high', 'x-high']
+voice_speeds = ['x-slow', 'slow', 'medium', 'fast', 'x-fast']
+last_msg_id = 0
#Used for making text xml compatible, needed for voice pitch and speed control
table = str.maketrans({
@@ -55,6 +57,14 @@ def input_modifier(string):
This function is applied to your text inputs before
they are fed into the model.
"""
+ #remove autoplay from previous
+ if len(shared.history['internal'])>0:
+ [text, reply] = shared.history['internal'][-1]
+ [visible_text, visible_reply] = shared.history['visible'][-1]
+ rep_clean = reply.replace('controls autoplay>','controls>')
+ vis_rep_clean = visible_reply.replace('controls autoplay>','controls>')
+ shared.history['internal'][-1] = [text, rep_clean]
+ shared.history['visible'][-1] = [visible_text, vis_rep_clean]
return string
@@ -63,7 +73,7 @@ def output_modifier(string):
This function is applied to the model outputs.
"""
- global wav_idx, model, current_params
+ global model, current_params
for i in params:
if params[i] != current_params[i]:
@@ -81,44 +91,31 @@ def output_modifier(string):
string = string.replace('\n', ' ')
string = string.strip()
- auto_playable=True
+ silent_string = False #Used to prevent unnecessary audio file generation
if string == '':
string = 'empty reply, try regenerating'
- auto_playable=False
-
+ silent_string = True
#x-slow, slow, medium, fast, x-fast
#x-low, low, medium, high, x-high
- prosody=''
+ pitch = params['voice_pitch']
+ speed = params['voice_speed']
+ prosody=f'<prosody rate="{speed}" pitch="{pitch}">'
string ='<speak>'+prosody+xmlesc(string)+'</speak>'
-
- output_file = Path(f'extensions/silero_tts/outputs/{wav_idx:06d}.wav')
- autoplay_str = ''
- if not shared.still_streaming:
+
+ current_msg_id=len(shared.history['visible'])#check length here, since output_modifier can run many times on the same message
+ output_file = Path(f'extensions/silero_tts/outputs/{shared.character}_{current_msg_id:06d}.wav')
+ if not shared.still_streaming and not silent_string:
model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
- #diabled until autoplay doesn't run on previous messages
- #autoplay = 'autoplay' if (params['autoplay'] and auto_playable) else ''
- string = f'\n\n'
+ string = f'\n\n'
else:
- #placeholder so text doesnt shift around so much
- string =f'\n\n'
-
- #reset if too many wavs. set max to -1 for unlimited.
- if wav_idx < params['max_wavs'] or params['max_wavs'] < 0:
- #only increment if starting a new stream, else replace during streaming.
- if not shared.still_streaming:
- wav_idx += 1
- else:
- wav_idx = 0
+ #placeholder so text doesn't shift around so much
+ string ='\n\n'
if params['show_text']:
+ #string+=f'*[{current_msg_id}]:*'+orig_string #Debug, looks like there is a delay in "current_msg_id" being updated when switching characters (updates after new message sent). Can't find the source. "shared.character" is updating properly.
string+=orig_string
- if params['autoplay'] == True and auto_playable and not shared.still_streaming:
- stop_autoplay()
- wave_obj = sa.WaveObject.from_wave_file(output_file.as_posix())
- wave_obj.play()
-
return string
def bot_prefix_modifier(string):
@@ -130,20 +127,20 @@ def bot_prefix_modifier(string):
return string
-def stop_autoplay():
- sa.stop_all()
-
def ui():
# Gradio elements
- activate = gr.Checkbox(value=params['activate'], label='Activate TTS')
- show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player')
- autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically')
- stop_audio = gr.Button("Stop Auto-Play")
- voice = gr.Dropdown(value=params['speaker'], choices=voices_by_gender, label='TTS voice')
+ with gr.Accordion("Silero TTS"):
+ activate = gr.Checkbox(value=params['activate'], label='Activate TTS')
+ show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player')
+ autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically')
+ voice = gr.Dropdown(value=params['speaker'], choices=voices_by_gender, label='TTS voice')
+ v_pitch = gr.Dropdown(value=params['voice_pitch'], choices=voice_pitches, label='Voice pitch')
+ v_speed = gr.Dropdown(value=params['voice_speed'], choices=voice_speeds, label='Voice speed')
# Event functions to update the parameters in the backend
activate.change(lambda x: params.update({"activate": x}), activate, None)
- autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None)
show_text.change(lambda x: params.update({"show_text": x}), show_text, None)
- stop_audio.click(stop_autoplay)
+ autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None)
voice.change(lambda x: params.update({"speaker": x}), voice, None)
+ v_pitch.change(lambda x: params.update({"voice_pitch": x}), v_pitch, None)
+ v_speed.change(lambda x: params.update({"voice_speed": x}), v_speed, None)
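
A rough standalone sketch of the two ideas in this patch, wav files keyed by character name and message index, and removal of autoplay from the previous reply; the function names are illustrative, not part of the extension:

    from pathlib import Path

    def wav_path(character: str, visible_history: list) -> Path:
        # One file per (character, message index); output_modifier can run several
        # times per message while streaming, so the same file is overwritten in place.
        current_msg_id = len(visible_history)
        return Path(f"extensions/silero_tts/outputs/{character}_{current_msg_id:06d}.wav")

    def strip_autoplay(previous_reply: str) -> str:
        # Only the newest audio element should keep the autoplay attribute
        return previous_reply.replace('controls autoplay>', 'controls>')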
From b8f7d34c1df5b12e60491e4c8a6494d5e6aec20e Mon Sep 17 00:00:00 2001
From: Xan <70198941+xanthousm@users.noreply.github.com>
Date: Sat, 11 Mar 2023 17:05:09 +1100
Subject: [PATCH 37/69] Undo changes to requirements
Needing to manually install tensorboard might be a Windows-only problem. It can easily be solved manually.
---
requirements.txt | 1 -
1 file changed, 1 deletion(-)
diff --git a/requirements.txt b/requirements.txt
index a8a6eada..47c56a45 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,6 +5,5 @@ gradio==3.18.0
numpy
rwkv==0.1.0
safetensors==0.2.8
-tensorboard
sentencepiece
git+https://github.com/oobabooga/transformers@llama_push
From 96c51973f9e551055dac2e135e9c4229cbf40ad0 Mon Sep 17 00:00:00 2001
From: Xan <70198941+xanthousm@users.noreply.github.com>
Date: Sat, 11 Mar 2023 22:50:59 +1100
Subject: [PATCH 38/69] --auto-launch and "Is typing..."
- Added the `--auto-launch` arg to open the web UI in the default browser when ready.
- Changed chat.py to display the user input immediately and show "*Is typing...*" as a temporary reply while the text is being generated. Most noticeable when using `--no-stream`.
---
modules/chat.py | 3 +++
modules/shared.py | 1 +
server.py | 4 ++--
3 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/modules/chat.py b/modules/chat.py
index f40f8299..0f029fe2 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -120,6 +120,9 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical
else:
prompt = custom_generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size)
+ #display user input and "*is typing...*" imediately
+ yield shared.history['visible']+[[visible_text, '*Is typing...*']]
+
# Generate
reply = ''
for i in range(chat_generation_attempts):
diff --git a/modules/shared.py b/modules/shared.py
index 2acb047f..c42ba7ed 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -90,4 +90,5 @@ parser.add_argument('--listen', action='store_true', help='Make the web UI reach
parser.add_argument('--listen-port', type=int, help='The listening port that the server will use.')
parser.add_argument('--share', action='store_true', help='Create a public URL. This is useful for running the web UI on Google Colab or similar.')
parser.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.')
+parser.add_argument('--auto-launch', action='store_true', default=False, help='Open the web UI in the default browser upon launch')
args = parser.parse_args()
diff --git a/server.py b/server.py
index c2977f41..ad483eb5 100644
--- a/server.py
+++ b/server.py
@@ -372,9 +372,9 @@ else:
shared.gradio['interface'].queue()
if shared.args.listen:
- shared.gradio['interface'].launch(prevent_thread_lock=True, share=shared.args.share, server_name='0.0.0.0', server_port=shared.args.listen_port)
+ shared.gradio['interface'].launch(prevent_thread_lock=True, share=shared.args.share, server_name='0.0.0.0', server_port=shared.args.listen_port, inbrowser=shared.args.auto_launch)
else:
- shared.gradio['interface'].launch(prevent_thread_lock=True, share=shared.args.share, server_port=shared.args.listen_port)
+ shared.gradio['interface'].launch(prevent_thread_lock=True, share=shared.args.share, server_port=shared.args.listen_port, inbrowser=shared.args.auto_launch)
# I think that I will need this later
while True:
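
A minimal sketch of the "*Is typing...*" pattern, assuming a generator-based Gradio handler; `chat_handler` and `generate_stream` are hypothetical stand-ins for the real wrappers:

    def chat_handler(user_text, history, generate_stream):
        # Show the user's message and a temporary placeholder right away...
        yield history + [[user_text, '*Is typing...*']]
        # ...then overwrite the placeholder as the real reply streams in.
        for partial_reply in generate_stream(user_text):
            yield history + [[user_text, partial_reply]]

    # Toy driver with a fake streamer standing in for the real generator:
    fake_stream = lambda text: iter(['He', 'Hello!'])
    for update in chat_handler('Hi', [], fake_stream):
        print(update[-1])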
From 2743dd736a431e54a7220ef7e29ad5d31797611c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 11 Mar 2023 10:50:18 -0300
Subject: [PATCH 39/69] Add *Is typing...* to impersonate as well
---
modules/chat.py | 5 ++++-
server.py | 9 +++++----
2 files changed, 9 insertions(+), 5 deletions(-)
diff --git a/modules/chat.py b/modules/chat.py
index 0f029fe2..5bf96b1a 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -120,7 +120,7 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical
else:
prompt = custom_generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size)
- #display user input and "*is typing...*" imediately
+ # Display user input and "*is typing...*" imediately
yield shared.history['visible']+[[visible_text, '*Is typing...*']]
# Generate
@@ -161,6 +161,9 @@ def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typ
prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size, impersonate=True)
+ # Display "*is typing...*" imediately
+ yield '*Is typing...*'
+
reply = ''
for i in range(chat_generation_attempts):
for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name2}:"):
diff --git a/server.py b/server.py
index ad483eb5..c65443ec 100644
--- a/server.py
+++ b/server.py
@@ -272,10 +272,10 @@ if shared.args.chat or shared.args.cai_chat:
function_call = 'chat.cai_chatbot_wrapper' if shared.args.cai_chat else 'chat.chatbot_wrapper'
- gen_events.append(shared.gradio['Generate'].click(eval(function_call), shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream, api_name='textgen'))
- gen_events.append(shared.gradio['textbox'].submit(eval(function_call), shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream))
- gen_events.append(shared.gradio['Regenerate'].click(chat.regenerate_wrapper, shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream))
- gen_events.append(shared.gradio['Impersonate'].click(chat.impersonate_wrapper, shared.input_params, shared.gradio['textbox'], show_progress=shared.args.no_stream))
+ gen_events.append(shared.gradio['Generate'].click(eval(function_call), shared.input_params, shared.gradio['display'], show_progress=False, api_name='textgen'))
+ gen_events.append(shared.gradio['textbox'].submit(eval(function_call), shared.input_params, shared.gradio['display'], show_progress=False))
+ gen_events.append(shared.gradio['Regenerate'].click(chat.regenerate_wrapper, shared.input_params, shared.gradio['display'], show_progress=False))
+ gen_events.append(shared.gradio['Impersonate'].click(chat.impersonate_wrapper, shared.input_params, shared.gradio['textbox'], show_progress=False))
shared.gradio['Stop'].click(chat.stop_everything_event, [], [], cancels=gen_events)
shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, [], shared.gradio['textbox'], show_progress=shared.args.no_stream)
@@ -309,6 +309,7 @@ if shared.args.chat or shared.args.cai_chat:
reload_inputs = [shared.gradio['name1'], shared.gradio['name2']] if shared.args.cai_chat else []
shared.gradio['upload_chat_history'].upload(reload_func, reload_inputs, [shared.gradio['display']])
shared.gradio['upload_img_me'].upload(reload_func, reload_inputs, [shared.gradio['display']])
+ shared.gradio['Stop'].click(reload_func, reload_inputs, [shared.gradio['display']])
shared.gradio['interface'].load(lambda : chat.load_default_history(shared.settings[f'name1{suffix}'], shared.settings[f'name2{suffix}']), None, None)
shared.gradio['interface'].load(reload_func, reload_inputs, [shared.gradio['display']], show_progress=True)
From 8f8da6707d7e71c2eef01c2d33ca6623cebf080c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 11 Mar 2023 11:17:13 -0300
Subject: [PATCH 40/69] Minor style changes to silero_tts
---
extensions/silero_tts/script.py | 31 +++++++++++++++++--------------
1 file changed, 17 insertions(+), 14 deletions(-)
diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py
index b66963e2..7e63d8b7 100644
--- a/extensions/silero_tts/script.py
+++ b/extensions/silero_tts/script.py
@@ -14,18 +14,19 @@ params = {
'model_id': 'v3_en',
'sample_rate': 48000,
'device': 'cpu',
- 'show_text': True,
+ 'show_text': False,
'autoplay': True,
'voice_pitch': 'medium',
'voice_speed': 'medium',
}
+
current_params = params.copy()
voices_by_gender = ['en_99', 'en_45', 'en_18', 'en_117', 'en_49', 'en_51', 'en_68', 'en_0', 'en_26', 'en_56', 'en_74', 'en_5', 'en_38', 'en_53', 'en_21', 'en_37', 'en_107', 'en_10', 'en_82', 'en_16', 'en_41', 'en_12', 'en_67', 'en_61', 'en_14', 'en_11', 'en_39', 'en_52', 'en_24', 'en_97', 'en_28', 'en_72', 'en_94', 'en_36', 'en_4', 'en_43', 'en_88', 'en_25', 'en_65', 'en_6', 'en_44', 'en_75', 'en_91', 'en_60', 'en_109', 'en_85', 'en_101', 'en_108', 'en_50', 'en_96', 'en_64', 'en_92', 'en_76', 'en_33', 'en_116', 'en_48', 'en_98', 'en_86', 'en_62', 'en_54', 'en_95', 'en_55', 'en_111', 'en_3', 'en_83', 'en_8', 'en_47', 'en_59', 'en_1', 'en_2', 'en_7', 'en_9', 'en_13', 'en_15', 'en_17', 'en_19', 'en_20', 'en_22', 'en_23', 'en_27', 'en_29', 'en_30', 'en_31', 'en_32', 'en_34', 'en_35', 'en_40', 'en_42', 'en_46', 'en_57', 'en_58', 'en_63', 'en_66', 'en_69', 'en_70', 'en_71', 'en_73', 'en_77', 'en_78', 'en_79', 'en_80', 'en_81', 'en_84', 'en_87', 'en_89', 'en_90', 'en_93', 'en_100', 'en_102', 'en_103', 'en_104', 'en_105', 'en_106', 'en_110', 'en_112', 'en_113', 'en_114', 'en_115']
voice_pitches = ['x-low', 'low', 'medium', 'high', 'x-high']
voice_speeds = ['x-slow', 'slow', 'medium', 'fast', 'x-fast']
last_msg_id = 0
-#Used for making text xml compatible, needed for voice pitch and speed control
+# Used for making text xml compatible, needed for voice pitch and speed control
table = str.maketrans({
"<": "<",
">": ">",
@@ -33,6 +34,7 @@ table = str.maketrans({
"'": "'",
'"': """,
})
+
def xmlesc(txt):
return txt.translate(table)
@@ -57,7 +59,8 @@ def input_modifier(string):
This function is applied to your text inputs before
they are fed into the model.
"""
- #remove autoplay from previous
+
+ # Remove autoplay from previous
if len(shared.history['internal'])>0:
[text, reply] = shared.history['internal'][-1]
[visible_text, visible_reply] = shared.history['visible'][-1]
@@ -91,30 +94,30 @@ def output_modifier(string):
string = string.replace('\n', ' ')
string = string.strip()
- silent_string = False #Used to prevent unnecessary audio file generation
+ silent_string = False # Used to prevent unnecessary audio file generation
if string == '':
- string = 'empty reply, try regenerating'
- silent_string = True
+ string = 'empty reply, try regenerating'
+ silent_string = True
- #x-slow, slow, medium, fast, x-fast
- #x-low, low, medium, high, x-high
+ # x-slow, slow, medium, fast, x-fast
+ # x-low, low, medium, high, x-high
pitch = params['voice_pitch']
speed = params['voice_speed']
prosody=f'<prosody rate="{speed}" pitch="{pitch}">'
- string ='<speak>'+prosody+xmlesc(string)+'</speak>'
+ string = '<speak>'+prosody+xmlesc(string)+'</speak>'
- current_msg_id=len(shared.history['visible'])#check length here, since output_modifier can run many times on the same message
+ current_msg_id = len(shared.history['visible']) # Check length here, since output_modifier can run many times on the same message
output_file = Path(f'extensions/silero_tts/outputs/{shared.character}_{current_msg_id:06d}.wav')
if not shared.still_streaming and not silent_string:
model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
string = f'\n\n'
else:
- #placeholder so text doesn't shift around so much
- string ='\n\n'
+ # Placeholder so text doesn't shift around so much
+ string = '\n\n'
if params['show_text']:
- #string+=f'*[{current_msg_id}]:*'+orig_string #Debug, looks like there is a delay in "current_msg_id" being updated when switching characters (updates after new message sent). Can't find the source. "shared.character" is updating properly.
- string+=orig_string
+ #string += f'*[{current_msg_id}]:*'+orig_string #Debug, looks like there is a delay in "current_msg_id" being updated when switching characters (updates after new message sent). Can't find the source. "shared.character" is updating properly.
+ string += orig_string
return string
From 501afbc23408df04d2d545b2100bde55b6611598 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 11 Mar 2023 14:47:30 -0300
Subject: [PATCH 41/69] Add requests to requirements.txt
---
requirements.txt | 1 +
1 file changed, 1 insertion(+)
diff --git a/requirements.txt b/requirements.txt
index 6133f394..a7df93bb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,6 +3,7 @@ bitsandbytes==0.37.0
flexgen==0.1.7
gradio==3.18.0
numpy
+requests
rwkv==0.1.0
safetensors==0.2.8
sentencepiece
From 195e99d0b6d116105c0adc0978a4ec4dbb0d847c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 11 Mar 2023 16:11:15 -0300
Subject: [PATCH 42/69] Add llama_prompts extension
---
extensions/llama_prompts/script.py | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
create mode 100644 extensions/llama_prompts/script.py
diff --git a/extensions/llama_prompts/script.py b/extensions/llama_prompts/script.py
new file mode 100644
index 00000000..e45cd445
--- /dev/null
+++ b/extensions/llama_prompts/script.py
@@ -0,0 +1,18 @@
+import gradio as gr
+import modules.shared as shared
+import pandas as pd
+
+df = pd.read_csv("https://raw.githubusercontent.com/devbrones/llama-prompts/main/prompts/prompts.csv")
+
+def get_prompt_by_name(name):
+ if name == 'None':
+ return ''
+ else:
+ return df[df['Prompt name'] == name].iloc[0]['Prompt'].replace('\\n', '\n')
+
+def ui():
+ if not shared.args.chat or shared.args.cai_chat:
+ choices = ['None'] + list(df['Prompt name'])
+
+ prompts_menu = gr.Dropdown(value=choices[0], choices=choices, label='Prompt')
+ prompts_menu.change(get_prompt_by_name, prompts_menu, shared.gradio['textbox'])
From def97f658c016d4c50fe9d682265841154eb5336 Mon Sep 17 00:00:00 2001
From: HideLord
Date: Sun, 12 Mar 2023 02:54:22 +0200
Subject: [PATCH 43/69] Small patch to fix loading of character JSONs. Now it
 correctly reads non-ASCII characters on Windows.
---
modules/chat.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/chat.py b/modules/chat.py
index f40f8299..4a7fb873 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -332,7 +332,7 @@ def load_character(_character, name1, name2):
shared.history['visible'] = []
if _character != 'None':
shared.character = _character
- data = json.loads(open(Path(f'characters/{_character}.json'), 'r').read())
+ data = json.loads(open(Path(f'characters/{_character}.json'), 'r', encoding='utf-8').read())
name2 = data['char_name']
if 'char_persona' in data and data['char_persona'] != '':
context += f"{data['char_name']}'s Persona: {data['char_persona']}\n"
From 37f0166b2d6b0f2938a5a4c1762479829de1c5be Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 11 Mar 2023 23:14:49 -0300
Subject: [PATCH 44/69] Fix memory leak in new streaming (second attempt)
---
modules/callbacks.py | 5 ++++-
modules/text_generation.py | 1 -
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/modules/callbacks.py b/modules/callbacks.py
index 15674b8a..05e8fafa 100644
--- a/modules/callbacks.py
+++ b/modules/callbacks.py
@@ -49,7 +49,7 @@ class Iteratorize:
def __init__(self, func, kwargs={}, callback=None):
self.mfunc=func
self.c_callback=callback
- self.q = Queue(maxsize=1)
+ self.q = Queue()
self.sentinel = object()
self.kwargs = kwargs
@@ -73,3 +73,6 @@ class Iteratorize:
raise StopIteration
else:
return obj
+
+ def __del__(self):
+ pass
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 6a59f9a7..5d01c8cb 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -187,7 +187,6 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
yield formatted_outputs(original_question, shared.model_name)
for output in eval(f"generate_with_streaming({', '.join(generate_params)})"):
- print(print('Used vram in gib:', torch.cuda.memory_allocated() / 1024**3))
if shared.soft_prompt:
output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
reply = decode(output)
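
As a toy illustration, not project code, of why the bounded queue could leak the generation thread: with `maxsize=1` the producer blocks on `put()` once the consumer stops draining, which is what switching to an unbounded `Queue()` avoids:

    from queue import Queue
    from threading import Thread

    q = Queue(maxsize=1)

    def producer():
        for i in range(10):
            q.put(i)          # blocks once the queue is full and nobody is reading
        q.put(None)

    Thread(target=producer, daemon=True).start()
    print(q.get())            # consume a single item, then stop reading;
                              # the producer thread stays stuck on put() from then on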
From 0bd54309887f6e7adc7e59d4f8675ed6f322bb81 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 12 Mar 2023 02:04:28 -0300
Subject: [PATCH 45/69] Use 'with' statement to better handle streaming memory
---
modules/RWKV.py | 10 +++++-----
modules/callbacks.py | 27 +++++++++++++++++++++++----
modules/text_generation.py | 19 ++++++++++---------
3 files changed, 38 insertions(+), 18 deletions(-)
diff --git a/modules/RWKV.py b/modules/RWKV.py
index 70deab28..836d31dc 100644
--- a/modules/RWKV.py
+++ b/modules/RWKV.py
@@ -50,11 +50,11 @@ class RWKVModel:
return context+self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
def generate_with_streaming(self, **kwargs):
- iterable = Iteratorize(self.generate, kwargs, callback=None)
- reply = kwargs['context']
- for token in iterable:
- reply += token
- yield reply
+ with Iteratorize(self.generate, kwargs, callback=None) as generator:
+ reply = kwargs['context']
+ for token in generator:
+ reply += token
+ yield reply
class RWKVTokenizer:
def __init__(self):
diff --git a/modules/callbacks.py b/modules/callbacks.py
index 05e8fafa..e0d1c988 100644
--- a/modules/callbacks.py
+++ b/modules/callbacks.py
@@ -1,3 +1,4 @@
+import gc
from queue import Queue
from threading import Thread
@@ -6,7 +7,6 @@ import transformers
import modules.shared as shared
-
# Copied from https://github.com/PygmalionAI/gradio-ui/
class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria):
@@ -52,17 +52,24 @@ class Iteratorize:
self.q = Queue()
self.sentinel = object()
self.kwargs = kwargs
+ self.stop_now = False
def _callback(val):
+ if self.stop_now:
+ raise ValueError
self.q.put(val)
def gentask():
- ret = self.mfunc(callback=_callback, **self.kwargs)
+ try:
+ ret = self.mfunc(callback=_callback, **self.kwargs)
+ except ValueError:
+ pass
self.q.put(self.sentinel)
if self.c_callback:
self.c_callback(ret)
- Thread(target=gentask).start()
+ self.thread = Thread(target=gentask)
+ self.thread.start()
def __iter__(self):
return self
@@ -75,4 +82,16 @@ class Iteratorize:
return obj
def __del__(self):
- pass
+ clear_torch_cache()
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ self.stop_now = True
+ clear_torch_cache()
+
+def clear_torch_cache():
+ gc.collect()
+ if not shared.args.cpu:
+ torch.cuda.empty_cache()
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 5d01c8cb..7f5aad5e 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -186,17 +186,18 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
return Iteratorize(generate_with_callback, kwargs, callback=None)
yield formatted_outputs(original_question, shared.model_name)
- for output in eval(f"generate_with_streaming({', '.join(generate_params)})"):
- if shared.soft_prompt:
- output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
- reply = decode(output)
+ with eval(f"generate_with_streaming({', '.join(generate_params)})") as generator:
+ for output in generator:
+ if shared.soft_prompt:
+ output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
+ reply = decode(output)
- if not (shared.args.chat or shared.args.cai_chat):
- reply = original_question + apply_extensions(reply[len(question):], "output")
- yield formatted_outputs(reply, shared.model_name)
+ if not (shared.args.chat or shared.args.cai_chat):
+ reply = original_question + apply_extensions(reply[len(question):], "output")
+ yield formatted_outputs(reply, shared.model_name)
- if output[-1] == n:
- break
+ if output[-1] == n:
+ break
# Stream the output naively for FlexGen since it doesn't support 'stopping_criteria'
else:
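
Usage-wise, the change amounts to the pattern sketched below with a minimal stand-in class: leaving the `with` block, even via `break` or an exception, triggers `__exit__`, which in the real `Iteratorize` sets `stop_now` and clears the CUDA cache.

    class StoppableStream:
        # Minimal stand-in mirroring the context-manager protocol added in this patch
        def __init__(self, tokens):
            self.tokens = tokens
            self.stop_now = False
        def __enter__(self):
            return self
        def __exit__(self, exc_type, exc_val, exc_tb):
            self.stop_now = True          # the real class also calls clear_torch_cache() here
        def __iter__(self):
            return iter(self.tokens)

    with StoppableStream(['Hello', ' world', '\n']) as stream:
        for token in stream:
            if token == '\n':
                break                     # __exit__ still runs, so cleanup is guaranteed
    assert stream.stop_now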
From 433f6350bc794e0a904e1a34abaffe49a106a484 Mon Sep 17 00:00:00 2001
From: unknown
Date: Sat, 11 Mar 2023 21:21:30 -0800
Subject: [PATCH 46/69] Load and save character files in UTF-8
---
modules/chat.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/modules/chat.py b/modules/chat.py
index f40f8299..a0cae949 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -332,7 +332,7 @@ def load_character(_character, name1, name2):
shared.history['visible'] = []
if _character != 'None':
shared.character = _character
- data = json.loads(open(Path(f'characters/{_character}.json'), 'r').read())
+ data = json.loads(open(Path(f'characters/{_character}.json'), 'r', encoding='utf-8').read())
name2 = data['char_name']
if 'char_persona' in data and data['char_persona'] != '':
context += f"{data['char_name']}'s Persona: {data['char_persona']}\n"
@@ -372,7 +372,7 @@ def upload_character(json_file, img, tavern=False):
i += 1
if tavern:
outfile_name = f'TavernAI-{outfile_name}'
- with open(Path(f'characters/{outfile_name}.json'), 'w') as f:
+ with open(Path(f'characters/{outfile_name}.json'), 'w', encoding='utf-8') as f:
f.write(json_file)
if img is not None:
img = Image.open(io.BytesIO(img))
From b0e8cb8c889cdadd9779517ba8055114b39357cd Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 12 Mar 2023 02:31:45 -0300
Subject: [PATCH 47/69] Various fixes in chat mode
---
modules/chat.py | 16 +++---
modules/text_generation.py | 102 +++++++++++++++++++------------------
2 files changed, 62 insertions(+), 56 deletions(-)
diff --git a/modules/chat.py b/modules/chat.py
index f40f8299..69d81e94 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -115,14 +115,18 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical
visible_text = visible_text.replace('\n', ' ')
text = apply_extensions(text, "input")
- if custom_generate_chat_prompt is None:
- prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size)
- else:
- prompt = custom_generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size)
-
# Generate
reply = ''
for i in range(chat_generation_attempts):
+
+ # The prompt needs to be generated here because, as the reply
+ # grows, it may become necessary to remove more old messages to
+ # fit into the 2048 tokens window.
+ if custom_generate_chat_prompt is None:
+ prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size-len(encode(' '+reply)[0]))
+ else:
+ prompt = custom_generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size-len(encode(' '+reply)[0]))
+
for reply in generate_reply(f"{prompt}{' ' if len(reply) > 0 else ''}{reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name1}:"):
# Extracting the reply
@@ -156,10 +160,10 @@ def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typ
if 'pygmalion' in shared.model_name.lower():
name1 = "You"
- prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size, impersonate=True)
reply = ''
for i in range(chat_generation_attempts):
+ prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size-len(encode(' '+reply)[0]), impersonate=True)
for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name2}:"):
reply, next_character_found, substring_found = extract_message_from_reply(prompt, reply, name1, name2, check, impersonate=True)
if not substring_found:
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 7f5aad5e..2460df4f 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -159,35 +159,53 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
else:
generate_params.insert(0, "inputs=input_ids")
- # Generate the entire reply at once.
- if shared.args.no_stream:
- with torch.no_grad():
- output = eval(f"shared.model.generate({', '.join(generate_params)}){cuda}")[0]
- if shared.soft_prompt:
- output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
-
- reply = decode(output)
- if not (shared.args.chat or shared.args.cai_chat):
- reply = original_question + apply_extensions(reply[len(question):], "output")
-
- yield formatted_outputs(reply, shared.model_name)
-
- # Stream the reply 1 token at a time.
- # This is based on the trick of using 'stopping_criteria' to create an iterator.
- elif not shared.args.flexgen:
-
- def generate_with_callback(callback=None, **kwargs):
- kwargs['stopping_criteria'].append(Stream(callback_func=callback))
- clear_torch_cache()
+ try:
+ # Generate the entire reply at once.
+ if shared.args.no_stream:
with torch.no_grad():
- shared.model.generate(**kwargs)
+ output = eval(f"shared.model.generate({', '.join(generate_params)}){cuda}")[0]
+ if shared.soft_prompt:
+ output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
- def generate_with_streaming(**kwargs):
- return Iteratorize(generate_with_callback, kwargs, callback=None)
+ reply = decode(output)
+ if not (shared.args.chat or shared.args.cai_chat):
+ reply = original_question + apply_extensions(reply[len(question):], "output")
- yield formatted_outputs(original_question, shared.model_name)
- with eval(f"generate_with_streaming({', '.join(generate_params)})") as generator:
- for output in generator:
+ yield formatted_outputs(reply, shared.model_name)
+
+ # Stream the reply 1 token at a time.
+ # This is based on the trick of using 'stopping_criteria' to create an iterator.
+ elif not shared.args.flexgen:
+
+ def generate_with_callback(callback=None, **kwargs):
+ kwargs['stopping_criteria'].append(Stream(callback_func=callback))
+ clear_torch_cache()
+ with torch.no_grad():
+ shared.model.generate(**kwargs)
+
+ def generate_with_streaming(**kwargs):
+ return Iteratorize(generate_with_callback, kwargs, callback=None)
+
+ yield formatted_outputs(original_question, shared.model_name)
+ with eval(f"generate_with_streaming({', '.join(generate_params)})") as generator:
+ for output in generator:
+ if shared.soft_prompt:
+ output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
+ reply = decode(output)
+
+ if not (shared.args.chat or shared.args.cai_chat):
+ reply = original_question + apply_extensions(reply[len(question):], "output")
+ yield formatted_outputs(reply, shared.model_name)
+
+ if output[-1] == n:
+ break
+
+ # Stream the output naively for FlexGen since it doesn't support 'stopping_criteria'
+ else:
+ for i in range(max_new_tokens//8+1):
+ clear_torch_cache()
+ with torch.no_grad():
+ output = eval(f"shared.model.generate({', '.join(generate_params)})")[0]
if shared.soft_prompt:
output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
reply = decode(output)
@@ -196,30 +214,14 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
reply = original_question + apply_extensions(reply[len(question):], "output")
yield formatted_outputs(reply, shared.model_name)
- if output[-1] == n:
+ if np.count_nonzero(input_ids[0] == n) < np.count_nonzero(output == n):
break
- # Stream the output naively for FlexGen since it doesn't support 'stopping_criteria'
- else:
- for i in range(max_new_tokens//8+1):
- clear_torch_cache()
- with torch.no_grad():
- output = eval(f"shared.model.generate({', '.join(generate_params)})")[0]
- if shared.soft_prompt:
- output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
- reply = decode(output)
+ input_ids = np.reshape(output, (1, output.shape[0]))
+ if shared.soft_prompt:
+ inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids)
- if not (shared.args.chat or shared.args.cai_chat):
- reply = original_question + apply_extensions(reply[len(question):], "output")
- yield formatted_outputs(reply, shared.model_name)
-
- if np.count_nonzero(input_ids[0] == n) < np.count_nonzero(output == n):
- break
-
- input_ids = np.reshape(output, (1, output.shape[0]))
- if shared.soft_prompt:
- inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids)
-
- t1 = time.time()
- print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(original_input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(original_input_ids[0])} tokens)")
- return
+ finally:
+ t1 = time.time()
+ print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(original_input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(original_input_ids[0])} tokens)")
+ return
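
The prompt-budget idea in the chat.py hunk can be read as the small helper below; the name is illustrative and `encode` stands for the tokenizer wrapper used in the diffs. The longer the partial reply already is, the fewer tokens remain for history, so older messages are dropped and the total stays inside the 2048-token window.

    def remaining_prompt_budget(chat_prompt_size: int, reply: str, encode) -> int:
        # Mirrors chat_prompt_size - len(encode(' ' + reply)[0]) from the hunk above
        return chat_prompt_size - len(encode(' ' + reply)[0])

    # Toy check with a whitespace "tokenizer" standing in for the real encode():
    fake_encode = lambda text: [text.split()]
    assert remaining_prompt_budget(2048, 'one two three', fake_encode) == 2045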
From 3baf5fc700c603182456e7b4c3ac4c0f5e9748e8 Mon Sep 17 00:00:00 2001
From: Aleksey Smolenchuk
Date: Sat, 11 Mar 2023 21:40:01 -0800
Subject: [PATCH 48/69] Load and save chat history in utf-8
---
modules/chat.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/modules/chat.py b/modules/chat.py
index a0cae949..8a221526 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -291,7 +291,7 @@ def save_history(timestamp=True):
fname = f"{prefix}persistent.json"
if not Path('logs').exists():
Path('logs').mkdir()
- with open(Path(f'logs/{fname}'), 'w') as f:
+ with open(Path(f'logs/{fname}'), 'w', encoding='utf-8') as f:
f.write(json.dumps({'data': shared.history['internal'], 'data_visible': shared.history['visible']}, indent=2))
return Path(f'logs/{fname}')
@@ -321,7 +321,7 @@ def load_history(file, name1, name2):
def load_default_history(name1, name2):
if Path('logs/persistent.json').exists():
- load_history(open(Path('logs/persistent.json'), 'rb').read(), name1, name2)
+ load_history(open(Path('logs/persistent.json'), 'rb', encoding='utf-8').read(), name1, name2)
else:
shared.history['internal'] = []
shared.history['visible'] = []
@@ -355,7 +355,7 @@ def load_character(_character, name1, name2):
name2 = shared.settings['name2_pygmalion']
if Path(f'logs/{shared.character}_persistent.json').exists():
- load_history(open(Path(f'logs/{shared.character}_persistent.json'), 'rb').read(), name1, name2)
+ load_history(open(Path(f'logs/{shared.character}_persistent.json'), 'rb', encoding='utf-8').read(), name1, name2)
if shared.args.cai_chat:
return name2, context, generate_chat_html(shared.history['visible'], name1, name2, shared.character)
From 341e13503634a0debb684105f055e09772d16c6e Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 12 Mar 2023 02:53:08 -0300
Subject: [PATCH 49/69] Various fixes in chat mode
---
modules/callbacks.py | 1 +
modules/chat.py | 16 ++++++----------
modules/text_generation.py | 29 +++++++++++++++--------------
3 files changed, 22 insertions(+), 24 deletions(-)
diff --git a/modules/callbacks.py b/modules/callbacks.py
index e0d1c988..faa4a5e9 100644
--- a/modules/callbacks.py
+++ b/modules/callbacks.py
@@ -64,6 +64,7 @@ class Iteratorize:
ret = self.mfunc(callback=_callback, **self.kwargs)
except ValueError:
pass
+ clear_torch_cache()
self.q.put(self.sentinel)
if self.c_callback:
self.c_callback(ret)
diff --git a/modules/chat.py b/modules/chat.py
index 69d81e94..f40f8299 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -115,18 +115,14 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical
visible_text = visible_text.replace('\n', ' ')
text = apply_extensions(text, "input")
+ if custom_generate_chat_prompt is None:
+ prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size)
+ else:
+ prompt = custom_generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size)
+
# Generate
reply = ''
for i in range(chat_generation_attempts):
-
- # The prompt needs to be generated here because, as the reply
- # grows, it may become necessary to remove more old messages to
- # fit into the 2048 tokens window.
- if custom_generate_chat_prompt is None:
- prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size-len(encode(' '+reply)[0]))
- else:
- prompt = custom_generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size-len(encode(' '+reply)[0]))
-
for reply in generate_reply(f"{prompt}{' ' if len(reply) > 0 else ''}{reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name1}:"):
# Extracting the reply
@@ -160,10 +156,10 @@ def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typ
if 'pygmalion' in shared.model_name.lower():
name1 = "You"
+ prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size, impersonate=True)
reply = ''
for i in range(chat_generation_attempts):
- prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size-len(encode(' '+reply)[0]), impersonate=True)
for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name2}:"):
reply, next_character_found, substring_found = extract_message_from_reply(prompt, reply, name1, name2, check, impersonate=True)
if not substring_found:
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 2460df4f..7966e126 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -92,21 +92,22 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
# These models are not part of Hugging Face, so we handle them
# separately and terminate the function call earlier
if shared.is_RWKV:
- if shared.args.no_stream:
- reply = shared.model.generate(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k)
- yield formatted_outputs(reply, shared.model_name)
- else:
- yield formatted_outputs(question, shared.model_name)
- # RWKV has proper streaming, which is very nice.
- # No need to generate 8 tokens at a time.
- for reply in shared.model.generate_with_streaming(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k):
+ try:
+ if shared.args.no_stream:
+ reply = shared.model.generate(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k)
yield formatted_outputs(reply, shared.model_name)
-
- t1 = time.time()
- output = encode(reply)[0]
- input_ids = encode(question)
- print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)")
- return
+ else:
+ yield formatted_outputs(question, shared.model_name)
+ # RWKV has proper streaming, which is very nice.
+ # No need to generate 8 tokens at a time.
+ for reply in shared.model.generate_with_streaming(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k):
+ yield formatted_outputs(reply, shared.model_name)
+ finally:
+ t1 = time.time()
+ output = encode(reply)[0]
+ input_ids = encode(question)
+ print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)")
+ return
original_question = question
if not (shared.args.chat or shared.args.cai_chat):
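The try/finally added above guarantees that the timing summary is printed even if the caller stops consuming the stream or generation raises. A standalone sketch of the same pattern, assuming a hypothetical generate_stream callable in place of the RWKV model object:

    import time

    def stream_with_stats(generate_stream, prompt):
        # Count yielded tokens and always report a rate in the finally block,
        # even if the consumer breaks out of the loop early.
        t0 = time.time()
        n_tokens = 0
        try:
            for token in generate_stream(prompt):
                n_tokens += 1
                yield token
        finally:
            dt = max(time.time() - t0, 1e-6)
            print(f"Output generated in {dt:.2f} seconds "
                  f"({n_tokens / dt:.2f} tokens/s, {n_tokens} tokens)")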
From 3f7c3d6559a51a3b95667b3ff74d048ffb722484 Mon Sep 17 00:00:00 2001
From: Aleksey Smolenchuk
Date: Sat, 11 Mar 2023 22:10:57 -0800
Subject: [PATCH 50/69] No need to set encoding on binary read
---
modules/chat.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/modules/chat.py b/modules/chat.py
index 8a221526..ab5dbc2d 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -321,7 +321,7 @@ def load_history(file, name1, name2):
def load_default_history(name1, name2):
if Path('logs/persistent.json').exists():
- load_history(open(Path('logs/persistent.json'), 'rb', encoding='utf-8').read(), name1, name2)
+ load_history(open(Path('logs/persistent.json'), 'rb').read(), name1, name2)
else:
shared.history['internal'] = []
shared.history['visible'] = []
@@ -355,7 +355,7 @@ def load_character(_character, name1, name2):
name2 = shared.settings['name2_pygmalion']
if Path(f'logs/{shared.character}_persistent.json').exists():
- load_history(open(Path(f'logs/{shared.character}_persistent.json'), 'rb', encoding='utf-8').read(), name1, name2)
+ load_history(open(Path(f'logs/{shared.character}_persistent.json'), 'rb').read(), name1, name2)
if shared.args.cai_chat:
return name2, context, generate_chat_html(shared.history['visible'], name1, name2, shared.character)
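The fix above is straightforward: open() rejects an encoding argument in binary mode (it raises "ValueError: binary mode doesn't take an encoding argument"), and json can decode the raw bytes itself. A minimal illustration, with the path shown only as an example:

    import json
    from pathlib import Path

    path = Path('logs/persistent.json')
    if path.exists():
        raw = path.read_bytes()    # binary read: no encoding parameter involved
        history = json.loads(raw)  # json.loads accepts bytes and detects UTF-8/16/32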
From e2da6b9685a40825fa9c299d676aaae1c3d21dcc Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 12 Mar 2023 03:25:56 -0300
Subject: [PATCH 51/69] Fix You You You appearing in chat mode
---
modules/chat.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/modules/chat.py b/modules/chat.py
index 5bf96b1a..a6167d35 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -84,6 +84,7 @@ def extract_message_from_reply(question, reply, name1, name2, check, impersonate
tmp = f"\n{asker}:"
for j in range(1, len(tmp)):
if reply[-j:] == tmp[:j]:
+ reply = reply[:-j]
substring_found = True
return reply, next_character_found, substring_found
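What the added line does: when the tail of a streamed reply is a partial match of the stop string "\n{name}:" (for example a trailing "You" before the ":" has arrived), that partial suffix is now cut off instead of being displayed. A self-contained sketch of the same check, with hypothetical names:

    def trim_partial_stop(reply, name1):
        # If the reply ends with a prefix of "\n{name1}:", drop that partial suffix.
        stop = f"\n{name1}:"
        for j in range(1, len(stop)):
            if reply[-j:] == stop[:j]:
                return reply[:-j], True
        return reply, False

    # trim_partial_stop("Sure, I can help.\nYou", "You") -> ("Sure, I can help.", True)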
From ad14f0e49929d426560413c0b9de19986cbeac9e Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 12 Mar 2023 03:42:29 -0300
Subject: [PATCH 52/69] Fix regenerate (provisory way)
---
modules/chat.py | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/modules/chat.py b/modules/chat.py
index ae089ca5..2048e2c5 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -92,7 +92,7 @@ def extract_message_from_reply(question, reply, name1, name2, check, impersonate
def stop_everything_event():
shared.stop_everything = True
-def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1):
+def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1, regenerate=False):
shared.stop_everything = False
just_started = True
eos_token = '\n' if check else None
@@ -121,8 +121,9 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical
else:
prompt = custom_generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size)
- # Display user input and "*is typing...*" immediately
- yield shared.history['visible']+[[visible_text, '*Is typing...*']]
+ if not regenerate:
+ # Display user input and "*is typing...*" imediately
+ yield shared.history['visible']+[[visible_text, '*Is typing...*']]
# Generate
reply = ''
@@ -189,7 +190,7 @@ def regenerate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typi
last_visible = shared.history['visible'].pop()
last_internal = shared.history['internal'].pop()
- for _history in chatbot_wrapper(last_internal[0], max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts):
+ for _history in chatbot_wrapper(last_internal[0], max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts, regenerate=True):
if shared.args.cai_chat:
shared.history['visible'][-1] = [last_visible[0], _history[-1][1]]
yield generate_chat_html(shared.history['visible'], name1, name2, shared.character)
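The provisional fix above threads a regenerate flag through chatbot_wrapper so the popped user message is not echoed again with an "*Is typing...*" placeholder. An outline of the flow, with hypothetical names (a history dict and a run_generation generator) standing in for the module's globals:

    def regenerate_last(history, run_generation):
        # Drop the last exchange, then re-run generation on the same user input,
        # reusing the stored visible text so the user's message is not duplicated.
        last_visible = history['visible'].pop()
        last_internal = history['internal'].pop()
        for updated in run_generation(last_internal[0], regenerate=True):
            updated[-1] = [last_visible[0], updated[-1][1]]
            yield updated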
From d4afed4e44a748c22d9fa97edb3f818ae8af191f Mon Sep 17 00:00:00 2001
From: Xan <70198941+xanthousm@users.noreply.github.com>
Date: Sun, 12 Mar 2023 17:56:57 +1100
Subject: [PATCH 53/69] Fixes and polish
- Change wav naming to be completely unique by using a timestamp instead of the message ID; this stops the browser from reusing cached audio when new audio is written under the same file name (e.g. after regenerate or clear history). A naming sketch follows after this list.
- Make the autoplay setting actually disable autoplay.
- Make the Settings panel a bit more compact.
- Hide HTML errors when the audio file for a chat history entry is missing.
- Add a button to permanently convert TTS history to normal text messages.
- Change the "show message text" toggle to affect the chat history.
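A minimal sketch of the timestamp-naming idea from the first bullet; the helper name and output directory here are hypothetical, not the extension's actual code:

    import time
    from pathlib import Path

    def unique_wav_path(character, out_dir='extensions/silero_tts/outputs'):
        # A per-call timestamp makes every file name unique, so the browser
        # cannot reuse a cached clip after "regenerate" or "clear history".
        return Path(out_dir) / f'{character}_{time.time_ns()}.wav'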
---
extensions/silero_tts/script.py | 89 ++++++++++++++++++++++++++-------
1 file changed, 72 insertions(+), 17 deletions(-)
diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py
index 7e63d8b7..1a60c901 100644
--- a/extensions/silero_tts/script.py
+++ b/extensions/silero_tts/script.py
@@ -2,8 +2,10 @@ from pathlib import Path
import gradio as gr
import torch
-
+import time
+import re
import modules.shared as shared
+import modules.chat as chat
torch._C._jit_set_profiling_mode(False)
@@ -54,19 +56,57 @@ def remove_surrounded_chars(string):
new_string += char
return new_string
+def remove_tts_from_history():
+ suffix = '_pygmalion' if 'pygmalion' in shared.model_name.lower() else ''
+ for i, entry in enumerate(shared.history['internal']):
+ reply = entry[1]
+ reply = re.sub("(<USER>|<user>|{{user}})", shared.settings[f'name1{suffix}'], reply)
+ if shared.args.chat:
+ reply = reply.replace('\n', ' ')
+ shared.history['visible'][i][1] = reply
+
+ if shared.args.cai_chat:
+ return chat.generate_chat_html(shared.history['visible'], shared.settings[f'name1{suffix}'], shared.settings[f'name1{suffix}'], shared.character)
+ else:
+ return shared.history['visible']
+
+def toggle_text_in_history():
+ suffix = '_pygmalion' if 'pygmalion' in shared.model_name.lower() else ''
+ audio_str='\n\n' # The '\n\n' used after </audio>
+ if shared.args.chat:
+ audio_str='<br><br>'
+
+ if params['show_text']==True:
+ #for i, entry in enumerate(shared.history['internal']):
+ for i, entry in enumerate(shared.history['visible']):
+ vis_reply = entry[1]
+ if vis_reply.startswith('