Make the RWKV model cache the RNN state between messages (#1354)

2024-12-25 05:48:55 +01:00 · 2023-05-09 16:12:53 +02:00 · 2023-05-09 16:12:53 +02:00 · cf6caf1830
commit cf6caf1830
parent 641500dcb9
1 changed files with 71 additions and 1 deletions
--- a/modules/RWKV.py
+++ b/modules/RWKV.py
@ -1,3 +1,4 @@
 import copy
 import os
 from pathlib import Path
@ -32,6 +33,10 @@ class RWKVModel:
        result = self()
        result.pipeline = pipeline
        result.model = model
        result.cached_context = ""
        result.cached_model_state = None
        result.cached_output_logits = None
        return result
    def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, repetition_penalty=None, alpha_frequency=0.1, alpha_presence=0.1, token_ban=None, token_stop=None, callback=None):
@ -45,7 +50,17 @@ class RWKVModel:
            token_stop=token_stop or []
        )
-        return self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
+        if self.cached_context != "":
            if context.startswith(self.cached_context):
                context = context[len(self.cached_context):]
            else:
                self.cached_context = ""
                self.cached_model_state = None
                self.cached_output_logits = None
        # out = self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
        out = self.generate_from_cached_state(context, token_count=token_count, args=args, callback=callback)
        return out
    def generate_with_streaming(self, **kwargs):
        with Iteratorize(self.generate, kwargs, callback=None) as generator:
@ -54,6 +69,61 @@ class RWKVModel:
                reply += token
                yield reply
    # Similar to the PIPELINE.generate, but lets us maintain the cached_model_state
    def generate_from_cached_state(self, ctx="", token_count=20, args=None, callback=None):
        all_tokens = []
        out_str = ''
        occurrence = {}
        state = copy.deepcopy(self.cached_model_state) if self.cached_model_state is not None else None
        # if we ended up with an empty context, just reuse the cached logits
        # this can happen if a user undoes a message and then sends the exact message again
        # in that case the full context ends up being the same as the cached_context, so the remaining context is empty.
        if ctx == "":
            out = self.cached_output_logits
        for i in range(token_count):
            # forward
            tokens = self.pipeline.encode(ctx) if i == 0 else [token]
            while len(tokens) > 0:
                out, state = self.model.forward(tokens[:args.chunk_len], state)
                tokens = tokens[args.chunk_len:]
            # cache the model state after scanning the context
            # we don't cache the state after processing our own generated tokens because 
            # the output string might be post-processed arbitrarily. Therefore, what's fed into the model 
            # on the next round of chat might be slightly different what what it output on the previous round
            if i == 0:
                self.cached_context += ctx
                self.cached_model_state = copy.deepcopy(state)
                self.cached_output_logits = copy.deepcopy(out)
            # adjust probabilities
            for n in args.token_ban:
                out[n] = -float('inf')
            for n in occurrence:
                out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)
            # sampler
            token = self.pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p, top_k=args.top_k)
            if token in args.token_stop:
                break
            all_tokens += [token]
            if token not in occurrence:
                occurrence[token] = 1
            else:
                occurrence[token] += 1
            # output
            tmp = self.pipeline.decode([token])
            if '\ufffd' not in tmp: # is valid utf-8 string?
                if callback:
                    callback(tmp)
                out_str += tmp
        return out_str
 class RWKVTokenizer:
    def __init__(self):