# text-generation-webui/modules/callbacks.py

import gc
import traceback
from queue import Queue
from threading import Thread

import torch
import transformers

import modules.shared as shared


class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria):
    """
    Stopping criterion that fires when the tail of any sample matches one of
    the configured sentinel token sequences.

    Only the part of each sample from ``starting_idx`` onwards is inspected
    (presumably so the prompt itself cannot trigger a stop -- confirm with
    the caller).
    """

    def __init__(self, sentinel_token_ids: list, starting_idx: int):
        """
        sentinel_token_ids: list of tensors; the size of each tensor's last
            dimension is used as that sentinel's length.
        starting_idx: offset into every sample before which tokens are ignored.
        """
        super().__init__()
        self.sentinel_token_ids = sentinel_token_ids
        self.starting_idx = starting_idx
        # Length of the shortest sentinel; samples with fewer inspected
        # tokens than this cannot match anything and are skipped outright.
        self.shortest = min(ids.shape[-1] for ids in sentinel_token_ids)

    def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool:
        """Return True as soon as one sample ends with any sentinel sequence."""
        for row in input_ids:
            generated = row[self.starting_idx:]
            n_generated = generated.shape[-1]

            if n_generated >= self.shortest:
                for sentinel in self.sentinel_token_ids:
                    width = sentinel.shape[-1]
                    # A sentinel longer than what has been produced so far
                    # cannot match yet.
                    if width <= n_generated:
                        tail = generated[-width:]
                        if torch.all(torch.eq(sentinel, tail)):
                            return True

        return False


class Stream(transformers.StoppingCriteria):
    """
    Stopping criterion that never stops anything: each time it is called it
    hands the first sample of ``input_ids`` to ``callback_func`` (when one
    was supplied) and reports that generation should continue.
    """

    def __init__(self, callback_func=None):
        # Callable invoked with input_ids[0] on every call, or None to disable.
        self.callback_func = callback_func

    def __call__(self, input_ids, scores) -> bool:
        notify = self.callback_func
        if notify is None:
            return False

        notify(input_ids[0])
        return False

class Iteratorize:

    """
    Transforms a function that takes a callback
    into a lazy iterator (generator).

    Adapted from: https://stackoverflow.com/a/9969000

    ``func`` is started on a background thread as soon as the object is
    constructed. Every value ``func`` passes to its ``callback`` keyword
    argument is queued and later returned by ``__next__``; a private
    sentinel object marks the end of the stream.

    Parameters:
        func: callable invoked as ``func(*args, callback=..., **kwargs)``.
        args: optional positional arguments for ``func`` (defaults to none).
        kwargs: optional keyword arguments for ``func`` (defaults to none).
        callback: optional callable invoked on the worker thread, after the
            sentinel has been queued, with the return value of ``func``
            (``None`` when ``func`` raised or was aborted).

    The object can also be used as a context manager: leaving the ``with``
    block sets ``stop_now``, which makes the next callback invocation raise
    ``ValueError`` inside ``func`` and thereby aborts it.
    """

    def __init__(self, func, args=None, kwargs=None, callback=None):
        self.mfunc = func
        self.c_callback = callback
        self.q = Queue()
        self.sentinel = object()
        self.args = args or []
        self.kwargs = kwargs or {}
        self.stop_now = False

        def _callback(val):
            # Raising from inside the callback is the abort mechanism: the
            # ValueError propagates out of func and is swallowed in gentask.
            if self.stop_now or shared.stop_everything:
                raise ValueError
            self.q.put(val)

        def gentask():
            # Pre-bind the result so the completion callback below still has
            # a value when func raises or is aborted (previously this path
            # hit an UnboundLocalError).
            ret = None
            try:
                # Use the normalised self.args / self.kwargs: unpacking the
                # raw ``args`` parameter fails with TypeError when it is None.
                ret = self.mfunc(*self.args, callback=_callback, **self.kwargs)
            except ValueError:
                # Raised by _callback to stop generation; note that any other
                # ValueError coming out of func is silenced here as well.
                pass
            except:  # noqa: E722
                # Deliberately broad: whatever goes wrong, the sentinel below
                # must still be queued or the consumer would block forever.
                traceback.print_exc()

            clear_torch_cache()
            self.q.put(self.sentinel)
            if self.c_callback:
                self.c_callback(ret)

        self.thread = Thread(target=gentask)
        self.thread.start()

    def __iter__(self):
        return self

    def __next__(self):
        # Blocks without timeout until the worker queues a value or the sentinel.
        obj = self.q.get(True, None)
        if obj is self.sentinel:
            raise StopIteration
        else:
            return obj

    def __del__(self):
        clear_torch_cache()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Ask the worker to stop at its next callback, then release cached memory.
        self.stop_now = True
        clear_torch_cache()


def clear_torch_cache():
    """
    Run a garbage-collection pass and, unless ``shared.args.cpu`` is set,
    also empty the CUDA caching allocator.
    """
    gc.collect()
    if shared.args.cpu:
        return
    torch.cuda.empty_cache()
Missing import 2023-03-24 02:16:08 +01:00			`import gc`
Gracefully handle CUDA out of memory errors with streaming 2023-03-29 00:20:50 +02:00			`import traceback`
New text streaming method (much faster) 2023-03-08 06:46:35 +01:00			`from queue import Queue`
			`from threading import Thread`

			`import torch`
			`import transformers`

Another missing import 2023-03-24 02:19:01 +01:00			`import modules.shared as shared`

Sort the imports 2023-03-17 15:42:25 +01:00
New text streaming method (much faster) 2023-03-08 06:46:35 +01:00			`class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria):`

Fix `type object is not subscriptable` Fix `type object is not subscriptable` on python 3.8 2023-03-31 13:20:31 +02:00			`def __init__(self, sentinel_token_ids: list, starting_idx: int):`
New text streaming method (much faster) 2023-03-08 06:46:35 +01:00			`transformers.StoppingCriteria.__init__(self)`
			`self.sentinel_token_ids = sentinel_token_ids`
			`self.starting_idx = starting_idx`
optimize stopping strings processing (#1625) 2023-05-02 06:21:54 +02:00			`self.shortest = min([x.shape[-1] for x in sentinel_token_ids])`
New text streaming method (much faster) 2023-03-08 06:46:35 +01:00
Stop the bot from talking for you in chat mode 2023-03-24 01:38:20 +01:00			`def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool:`
New text streaming method (much faster) 2023-03-08 06:46:35 +01:00			`for sample in input_ids:`
			`trimmed_sample = sample[self.starting_idx:]`
optimize stopping strings processing (#1625) 2023-05-02 06:21:54 +02:00			`trimmed_len = trimmed_sample.shape[-1]`
			`if trimmed_len < self.shortest:`
			`continue`
Stop the bot from talking for you in chat mode 2023-03-24 01:38:20 +01:00
optimize stopping strings processing (#1625) 2023-05-02 06:21:54 +02:00			`for sentinel in self.sentinel_token_ids:`
			`sentinel_len = sentinel.shape[-1]`
			`if trimmed_len < sentinel_len:`
Stop the bot from talking for you in chat mode 2023-03-24 01:38:20 +01:00			`continue`
optimize stopping strings processing (#1625) 2023-05-02 06:21:54 +02:00
			`window = trimmed_sample[-sentinel_len:]`
			`if torch.all(torch.eq(sentinel, window)):`
			`return True`

New text streaming method (much faster) 2023-03-08 06:46:35 +01:00			`return False`

Make the code more like PEP8 for readability (#862) 2023-04-07 05:15:45 +02:00
New text streaming method (much faster) 2023-03-08 06:46:35 +01:00			`class Stream(transformers.StoppingCriteria):`
			`def __init__(self, callback_func=None):`
			`self.callback_func = callback_func`

			`def __call__(self, input_ids, scores) -> bool:`
			`if self.callback_func is not None:`
			`self.callback_func(input_ids[0])`
			`return False`

Make the code more like PEP8 for readability (#862) 2023-04-07 05:15:45 +02:00
New text streaming method (much faster) 2023-03-08 06:46:35 +01:00			`class Iteratorize:`

			`"""`
			`Transforms a function that takes a callback`
			`into a lazy iterator (generator).`
Add credits 2023-05-04 02:49:55 +02:00
			`Adapted from: https://stackoverflow.com/a/9969000`
New text streaming method (much faster) 2023-03-08 06:46:35 +01:00			`"""`

Add ExLlama support (#2444) 2023-06-17 01:35:38 +02:00			`def __init__(self, func, args=None, kwargs=None, callback=None):`
Make the code more like PEP8 for readability (#862) 2023-04-07 05:15:45 +02:00			`self.mfunc = func`
			`self.c_callback = callback`
Fix memory leak in new streaming (second attempt) 2023-03-12 03:14:49 +01:00			`self.q = Queue()`
New text streaming method (much faster) 2023-03-08 06:46:35 +01:00			`self.sentinel = object()`
Add ExLlama support (#2444) 2023-06-17 01:35:38 +02:00			`self.args = args or []`
Remove mutable defaults from function signature. (#1663) 2023-05-09 03:55:41 +02:00			`self.kwargs = kwargs or {}`
Use 'with' statement to better handle streaming memory 2023-03-12 06:04:28 +01:00			`self.stop_now = False`
New text streaming method (much faster) 2023-03-08 06:46:35 +01:00
			`def _callback(val):`
Change Stop button behavior 2023-03-27 18:23:59 +02:00			`if self.stop_now or shared.stop_everything:`
Use 'with' statement to better handle streaming memory 2023-03-12 06:04:28 +01:00			`raise ValueError`
New text streaming method (much faster) 2023-03-08 06:46:35 +01:00			`self.q.put(val)`

			`def gentask():`
Use 'with' statement to better handle streaming memory 2023-03-12 06:04:28 +01:00			`try:`
Add ExLlama support (#2444) 2023-06-17 01:35:38 +02:00			`ret = self.mfunc(callback=_callback, *args, **self.kwargs)`
Use 'with' statement to better handle streaming memory 2023-03-12 06:04:28 +01:00			`except ValueError:`
			`pass`
Gracefully handle CUDA out of memory errors with streaming 2023-03-29 00:20:50 +02:00			`except:`
			`traceback.print_exc()`
			`pass`

Various fixes in chat mode 2023-03-12 06:53:08 +01:00			`clear_torch_cache()`
New text streaming method (much faster) 2023-03-08 06:46:35 +01:00			`self.q.put(self.sentinel)`
			`if self.c_callback:`
			`self.c_callback(ret)`

Use 'with' statement to better handle streaming memory 2023-03-12 06:04:28 +01:00			`self.thread = Thread(target=gentask)`
			`self.thread.start()`
New text streaming method (much faster) 2023-03-08 06:46:35 +01:00
			`def __iter__(self):`
			`return self`

			`def __next__(self):`
Make the code more like PEP8 for readability (#862) 2023-04-07 05:15:45 +02:00			`obj = self.q.get(True, None)`
New text streaming method (much faster) 2023-03-08 06:46:35 +01:00			`if obj is self.sentinel:`
			`raise StopIteration`
			`else:`
			`return obj`
Fix memory leak in new streaming (second attempt) 2023-03-12 03:14:49 +01:00
			`def __del__(self):`
Use 'with' statement to better handle streaming memory 2023-03-12 06:04:28 +01:00			`clear_torch_cache()`

			`def __enter__(self):`
			`return self`

			`def __exit__(self, exc_type, exc_val, exc_tb):`
			`self.stop_now = True`
			`clear_torch_cache()`
Fix broken callbacks.py 2023-03-24 02:12:24 +01:00
Make the code more like PEP8 for readability (#862) 2023-04-07 05:15:45 +02:00
Fix broken callbacks.py 2023-03-24 02:12:24 +01:00			`def clear_torch_cache():`
			`gc.collect()`
			`if not shared.args.cpu:`
			`torch.cuda.empty_cache()`