Make the RWKV model cache the RNN state between messages (#1354)

2025-01-23 01:59:21 +01:00 · 2023-05-09 16:12:53 +02:00 · 2023-05-09 16:12:53 +02:00 · cf6caf1830
commit cf6caf1830
parent 641500dcb9
1 changed files with 71 additions and 1 deletions
--- a/modules/RWKV.py
+++ b/modules/RWKV.py
@ -1,3 +1,4 @@
+import copy
 import os
 from pathlib import Path

@ -32,6 +33,10 @@ class RWKVModel:

        result = self()
        result.pipeline = pipeline
+        result.model = model
+        result.cached_context = ""
+        result.cached_model_state = None
+        result.cached_output_logits = None
        return result

    def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, repetition_penalty=None, alpha_frequency=0.1, alpha_presence=0.1, token_ban=None, token_stop=None, callback=None):
@ -45,7 +50,17 @@ class RWKVModel:
            token_stop=token_stop or []
        )

-        return self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
+        if self.cached_context != "":
+            if context.startswith(self.cached_context):
+                context = context[len(self.cached_context):]
+            else:
+                self.cached_context = ""
+                self.cached_model_state = None
+                self.cached_output_logits = None
+
+        # out = self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
+        out = self.generate_from_cached_state(context, token_count=token_count, args=args, callback=callback)
+        return out

    def generate_with_streaming(self, **kwargs):
        with Iteratorize(self.generate, kwargs, callback=None) as generator:
@ -54,6 +69,61 @@ class RWKVModel:
                reply += token
                yield reply

+    # Similar to the PIPELINE.generate, but lets us maintain the cached_model_state
+    def generate_from_cached_state(self, ctx="", token_count=20, args=None, callback=None):
+        all_tokens = []
+        out_str = ''
+        occurrence = {}
+        state = copy.deepcopy(self.cached_model_state) if self.cached_model_state is not None else None
+
+        # if we ended up with an empty context, just reuse the cached logits
+        # this can happen if a user undoes a message and then sends the exact message again
+        # in that case the full context ends up being the same as the cached_context, so the remaining context is empty.
+        if ctx == "":
+            out = self.cached_output_logits
+
+        for i in range(token_count):
+
+            # forward
+            tokens = self.pipeline.encode(ctx) if i == 0 else [token]
+            while len(tokens) > 0:
+                out, state = self.model.forward(tokens[:args.chunk_len], state)
+                tokens = tokens[args.chunk_len:]
+
+            # cache the model state after scanning the context
+            # we don't cache the state after processing our own generated tokens because 
+            # the output string might be post-processed arbitrarily. Therefore, what's fed into the model 
+            # on the next round of chat might be slightly different what what it output on the previous round
+            if i == 0:
+                self.cached_context += ctx
+                self.cached_model_state = copy.deepcopy(state)
+                self.cached_output_logits = copy.deepcopy(out)
+            
+            # adjust probabilities
+            for n in args.token_ban:
+                out[n] = -float('inf')
+            for n in occurrence:
+                out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)
+            
+            # sampler
+            token = self.pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p, top_k=args.top_k)
+            if token in args.token_stop:
+                break
+            all_tokens += [token]
+            if token not in occurrence:
+                occurrence[token] = 1
+            else:
+                occurrence[token] += 1
+            
+            # output
+            tmp = self.pipeline.decode([token])
+            if '\ufffd' not in tmp: # is valid utf-8 string?
+                if callback:
+                    callback(tmp)
+                out_str += tmp
+
+        return out_str
+

 class RWKVTokenizer:
    def __init__(self):