Use 'with' statement to better handle streaming memory

This commit is contained in:
oobabooga 2023-03-12 02:04:28 -03:00
parent 37f0166b2d
commit 0bd5430988
3 changed files with 38 additions and 18 deletions

View File

@@ -50,11 +50,11 @@ class RWKVModel:
return context+self.pipeline.generate(context, token_count=token_count, args=args, callback=callback) return context+self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
def generate_with_streaming(self, **kwargs): def generate_with_streaming(self, **kwargs):
iterable = Iteratorize(self.generate, kwargs, callback=None) with Iteratorize(self.generate, kwargs, callback=None) as generator:
reply = kwargs['context'] reply = kwargs['context']
for token in iterable: for token in generator:
reply += token reply += token
yield reply yield reply
class RWKVTokenizer: class RWKVTokenizer:
def __init__(self): def __init__(self):

View File

@@ -1,3 +1,4 @@
import gc
from queue import Queue from queue import Queue
from threading import Thread from threading import Thread
@@ -6,7 +7,6 @@ import transformers
import modules.shared as shared import modules.shared as shared
# Copied from https://github.com/PygmalionAI/gradio-ui/ # Copied from https://github.com/PygmalionAI/gradio-ui/
class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria): class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria):
@@ -52,17 +52,24 @@ class Iteratorize:
self.q = Queue() self.q = Queue()
self.sentinel = object() self.sentinel = object()
self.kwargs = kwargs self.kwargs = kwargs
self.stop_now = False
def _callback(val): def _callback(val):
if self.stop_now:
raise ValueError
self.q.put(val) self.q.put(val)
def gentask(): def gentask():
ret = self.mfunc(callback=_callback, **self.kwargs) try:
ret = self.mfunc(callback=_callback, **self.kwargs)
except ValueError:
pass
self.q.put(self.sentinel) self.q.put(self.sentinel)
if self.c_callback: if self.c_callback:
self.c_callback(ret) self.c_callback(ret)
Thread(target=gentask).start() self.thread = Thread(target=gentask)
self.thread.start()
def __iter__(self): def __iter__(self):
return self return self
@@ -75,4 +82,16 @@ class Iteratorize:
return obj return obj
def __del__(self): def __del__(self):
pass clear_torch_cache()
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.stop_now = True
clear_torch_cache()
def clear_torch_cache():
gc.collect()
if not shared.args.cpu:
torch.cuda.empty_cache()

View File

@@ -186,17 +186,18 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
return Iteratorize(generate_with_callback, kwargs, callback=None) return Iteratorize(generate_with_callback, kwargs, callback=None)
yield formatted_outputs(original_question, shared.model_name) yield formatted_outputs(original_question, shared.model_name)
for output in eval(f"generate_with_streaming({', '.join(generate_params)})"): with eval(f"generate_with_streaming({', '.join(generate_params)})") as generator:
if shared.soft_prompt: for output in generator:
output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:])) if shared.soft_prompt:
reply = decode(output) output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
reply = decode(output)
if not (shared.args.chat or shared.args.cai_chat): if not (shared.args.chat or shared.args.cai_chat):
reply = original_question + apply_extensions(reply[len(question):], "output") reply = original_question + apply_extensions(reply[len(question):], "output")
yield formatted_outputs(reply, shared.model_name) yield formatted_outputs(reply, shared.model_name)
if output[-1] == n: if output[-1] == n:
break break
# Stream the output naively for FlexGen since it doesn't support 'stopping_criteria' # Stream the output naively for FlexGen since it doesn't support 'stopping_criteria'
else: else: