mirror of https://github.com/oobabooga/text-generation-webui.git
Use 'with' statement to better handle streaming memory
This commit is contained in:
parent
37f0166b2d
commit
0bd5430988
@@ -50,11 +50,11 @@ class RWKVModel:
         return context+self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
 
     def generate_with_streaming(self, **kwargs):
-        iterable = Iteratorize(self.generate, kwargs, callback=None)
-        reply = kwargs['context']
-        for token in iterable:
-            reply += token
-            yield reply
+        with Iteratorize(self.generate, kwargs, callback=None) as generator:
+            reply = kwargs['context']
+            for token in generator:
+                reply += token
+                yield reply
 
 class RWKVTokenizer:
     def __init__(self):
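For reference, the rewritten generate_with_streaming() is still consumed like an ordinary generator; each yielded value is the full reply so far (starting from the prompt), and the with statement only changes how the background worker is cleaned up. A minimal sketch of the calling pattern, where model stands in for an already-loaded RWKVModel-like object (hypothetical helper, not part of this commit):

    # Hypothetical consumer: prints only the newly generated tail of each
    # cumulative reply yielded by generate_with_streaming().
    def stream_to_console(model, context, token_count=64):
        last_len = len(context)
        for partial in model.generate_with_streaming(context=context, token_count=token_count):
            print(partial[last_len:], end='', flush=True)
            last_len = len(partial)
        print()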
@@ -1,3 +1,4 @@
+import gc
 from queue import Queue
 from threading import Thread
 
@@ -6,7 +7,6 @@ import transformers
 
 import modules.shared as shared
 
-
 # Copied from https://github.com/PygmalionAI/gradio-ui/
 class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria):
 
@@ -52,17 +52,24 @@ class Iteratorize:
         self.q = Queue()
         self.sentinel = object()
         self.kwargs = kwargs
+        self.stop_now = False
 
         def _callback(val):
+            if self.stop_now:
+                raise ValueError
             self.q.put(val)
 
         def gentask():
-            ret = self.mfunc(callback=_callback, **self.kwargs)
+            try:
+                ret = self.mfunc(callback=_callback, **self.kwargs)
+            except ValueError:
+                pass
+
             self.q.put(self.sentinel)
             if self.c_callback:
                 self.c_callback(ret)
 
-        Thread(target=gentask).start()
+        self.thread = Thread(target=gentask)
+        self.thread.start()
 
     def __iter__(self):
         return self
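The pattern being extended here is a callback-to-iterator adapter: the wrapped function runs on a worker thread and reports tokens through a callback, the callback pushes them onto a queue, and a sentinel object marks the end of the stream. The new stop_now flag makes the callback raise ValueError so the worker unwinds early once the consumer leaves the with block. A self-contained sketch of that technique, simplified and independent of the project's actual Iteratorize class:

    import queue
    import threading

    # Hypothetical, stripped-down version of the callback-to-iterator idea.
    class CallbackIterator:
        def __init__(self, func, **kwargs):
            self.q = queue.Queue()
            self.sentinel = object()
            self.stop_now = False

            def _callback(val):
                # If the consumer has asked us to stop, raising here unwinds
                # func() on the worker thread.
                if self.stop_now:
                    raise ValueError
                self.q.put(val)

            def task():
                try:
                    func(callback=_callback, **kwargs)
                except ValueError:
                    pass
                self.q.put(self.sentinel)  # always signal completion

            self.thread = threading.Thread(target=task, daemon=True)
            self.thread.start()

        def __iter__(self):
            return self

        def __next__(self):
            obj = self.q.get(True, None)
            if obj is self.sentinel:
                raise StopIteration
            return obj

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            self.stop_now = True  # the next callback call raises and ends the worker

    # Example producer that reports results through a callback.
    def fake_generate(callback=None, n=5):
        for i in range(n):
            callback(f"token{i} ")

    with CallbackIterator(fake_generate, n=3) as gen:
        for tok in gen:
            print(tok, end="")
    print()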
@@ -75,4 +82,16 @@ class Iteratorize:
         return obj
 
     def __del__(self):
-        pass
+        clear_torch_cache()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.stop_now = True
+        clear_torch_cache()
+
+def clear_torch_cache():
+    gc.collect()
+    if not shared.args.cpu:
+        torch.cuda.empty_cache()
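The new module-level helper follows a common cleanup recipe for aborted generations: collect unreferenced Python objects first so the tensors they hold become reclaimable, then ask PyTorch to return its cached CUDA blocks to the driver. A standalone sketch of the same idea, where the cpu_only flag is a stand-in for this project's shared.args.cpu:

    import gc

    import torch

    def clear_torch_cache(cpu_only=False):
        # Drop unreferenced Python objects (and the GPU tensors they own) ...
        gc.collect()
        # ... then release PyTorch's cached, now-unused CUDA memory blocks.
        if not cpu_only and torch.cuda.is_available():
            torch.cuda.empty_cache()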
@@ -186,17 +186,18 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
             return Iteratorize(generate_with_callback, kwargs, callback=None)
 
         yield formatted_outputs(original_question, shared.model_name)
-        for output in eval(f"generate_with_streaming({', '.join(generate_params)})"):
-            if shared.soft_prompt:
-                output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
-            reply = decode(output)
+        with eval(f"generate_with_streaming({', '.join(generate_params)})") as generator:
+            for output in generator:
+                if shared.soft_prompt:
+                    output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
+                reply = decode(output)
 
-            if not (shared.args.chat or shared.args.cai_chat):
-                reply = original_question + apply_extensions(reply[len(question):], "output")
-            yield formatted_outputs(reply, shared.model_name)
+                if not (shared.args.chat or shared.args.cai_chat):
+                    reply = original_question + apply_extensions(reply[len(question):], "output")
+                yield formatted_outputs(reply, shared.model_name)
 
-            if output[-1] == n:
-                break
+                if output[-1] == n:
+                    break
 
     # Stream the output naively for FlexGen since it doesn't support 'stopping_criteria'
     else:
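The consumer-side change has the same shape as in the RWKVModel hunk above: iterate inside a with block so that breaking out of the loop (for example on a stopping token) still runs __exit__, which flags the worker thread to stop and clears the CUDA cache. A stripped-down illustration of that control flow, reusing the hypothetical CallbackIterator sketched earlier:

    # Hypothetical helper: stop streaming as soon as the reply contains stop_text.
    def stream_until(stream, stop_text):
        reply = ""
        with stream as generator:        # __enter__ hands back the iterator
            for token in generator:
                reply += token
                if stop_text in reply:
                    break                # __exit__ still runs, stopping the worker
        return reply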