From 0bd54309887f6e7adc7e59d4f8675ed6f322bb81 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 12 Mar 2023 02:04:28 -0300
Subject: [PATCH] Use 'with' statement to better handle streaming memory

---
 modules/RWKV.py            | 10 +++++-----
 modules/callbacks.py       | 27 +++++++++++++++++++++++----
 modules/text_generation.py | 19 ++++++++++---------
 3 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/modules/RWKV.py b/modules/RWKV.py
index 70deab28..836d31dc 100644
--- a/modules/RWKV.py
+++ b/modules/RWKV.py
@@ -50,11 +50,11 @@ class RWKVModel:
         return context+self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
 
     def generate_with_streaming(self, **kwargs):
-        iterable = Iteratorize(self.generate, kwargs, callback=None)
-        reply = kwargs['context']
-        for token in iterable:
-            reply += token
-            yield reply
+        with Iteratorize(self.generate, kwargs, callback=None) as generator:
+            reply = kwargs['context']
+            for token in generator:
+                reply += token
+                yield reply
 
 class RWKVTokenizer:
     def __init__(self):
diff --git a/modules/callbacks.py b/modules/callbacks.py
index 05e8fafa..e0d1c988 100644
--- a/modules/callbacks.py
+++ b/modules/callbacks.py
@@ -1,3 +1,4 @@
+import gc
 from queue import Queue
 from threading import Thread
 
@@ -6,7 +7,6 @@ import transformers
 
 import modules.shared as shared
 
-
 # Copied from https://github.com/PygmalionAI/gradio-ui/
 class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria):
 
@@ -52,17 +52,24 @@ class Iteratorize:
         self.q = Queue()
         self.sentinel = object()
         self.kwargs = kwargs
+        self.stop_now = False
 
         def _callback(val):
+            if self.stop_now:
+                raise ValueError
             self.q.put(val)
 
         def gentask():
-            ret = self.mfunc(callback=_callback, **self.kwargs)
+            try:
+                ret = self.mfunc(callback=_callback, **self.kwargs)
+            except ValueError:
+                pass
             self.q.put(self.sentinel)
             if self.c_callback:
                 self.c_callback(ret)
 
-        Thread(target=gentask).start()
+        self.thread = Thread(target=gentask)
+        self.thread.start()
 
     def __iter__(self):
         return self
@@ -75,4 +82,16 @@ class Iteratorize:
         return obj
 
     def __del__(self):
-        pass
+        clear_torch_cache()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.stop_now = True
+        clear_torch_cache()
+
+def clear_torch_cache():
+    gc.collect()
+    if not shared.args.cpu:
+        torch.cuda.empty_cache()
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 5d01c8cb..7f5aad5e 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -186,17 +186,18 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
             return Iteratorize(generate_with_callback, kwargs, callback=None)
 
         yield formatted_outputs(original_question, shared.model_name)
-        for output in eval(f"generate_with_streaming({', '.join(generate_params)})"):
-            if shared.soft_prompt:
-                output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
-            reply = decode(output)
+        with eval(f"generate_with_streaming({', '.join(generate_params)})") as generator:
+            for output in generator:
+                if shared.soft_prompt:
+                    output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
+                reply = decode(output)
 
-            if not (shared.args.chat or shared.args.cai_chat):
-                reply = original_question + apply_extensions(reply[len(question):], "output")
-            yield formatted_outputs(reply, shared.model_name)
+                if not (shared.args.chat or shared.args.cai_chat):
+                    reply = original_question + apply_extensions(reply[len(question):], "output")
+                yield formatted_outputs(reply, shared.model_name)
 
-            if output[-1] == n:
-                break
+                if output[-1] == n:
+                    break
 
     # Stream the output naively for FlexGen since it doesn't support 'stopping_criteria'
     else:
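
For context, the pattern this patch builds on is a small wrapper that turns a callback-based generation function into a lazy iterator and doubles as a context manager, so that leaving the 'with' block signals the background thread to stop and frees memory. Below is a minimal, self-contained sketch of that idea, not the project's actual module: slow_generate is a hypothetical stand-in for the real callback-based model.generate, and the torch cache clearing is reduced to a plain gc.collect() so the sketch runs without a GPU.

import gc
import time
from queue import Queue
from threading import Thread


def slow_generate(callback=None, n_tokens=10):
    # Hypothetical stand-in for a callback-based generator such as model.generate.
    for i in range(n_tokens):
        time.sleep(0.1)
        callback(f"token{i} ")


class Iteratorize:
    # Turns a function that takes a callback into a lazy iterator.
    # Exiting the 'with' block sets stop_now, so the next callback raises
    # and the worker thread stops producing values.
    def __init__(self, func, kwargs={}):
        self.mfunc = func
        self.q = Queue()
        self.sentinel = object()
        self.kwargs = kwargs
        self.stop_now = False

        def _callback(val):
            if self.stop_now:
                raise ValueError
            self.q.put(val)

        def gentask():
            try:
                self.mfunc(callback=_callback, **self.kwargs)
            except ValueError:
                pass
            self.q.put(self.sentinel)

        self.thread = Thread(target=gentask)
        self.thread.start()

    def __iter__(self):
        return self

    def __next__(self):
        obj = self.q.get(True, None)
        if obj is self.sentinel:
            raise StopIteration
        return obj

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.stop_now = True   # the real code also clears the torch CUDA cache here
        gc.collect()           # simplified: no torch.cuda.empty_cache() in this sketch


if __name__ == '__main__':
    with Iteratorize(slow_generate, {'n_tokens': 10}) as generator:
        reply = ''
        for token in generator:
            reply += token
            print(reply)
            if 'token3' in token:
                break          # leaving the block stops the background thread

The queue-plus-sentinel scheme is what lets a callback-driven API be consumed as an ordinary Python iterator, and making the wrapper a context manager gives a deterministic place to stop the worker thread and release memory instead of relying on __del__, which is exactly why the patch replaces the bare loops with 'with' blocks.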