Fix: Fixed the tokenization process of a raw dataset and improved its efficiency (#3035)

2025-01-24 18:49:23 +01:00 · 2023-07-13 00:05:37 +09:00 · 2023-07-13 00:05:37 +09:00 · 987d0fe023
commit 987d0fe023
parent 3f19e94c93
1 changed files with 3 additions and 8 deletions
--- a/modules/training.py
+++ b/modules/training.py
@@ -388,12 +388,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
                yield f"Error: overlap_len ({overlap_len}) cannot be greater than or equal to cutoff_len ({cutoff_len})"
                return

-            tokens = list(split_chunks(tokens, step))
-            for i in range(1, len(tokens)):
-                tokens[i] = tokens[i - 1][-overlap_len:] + tokens[i]
-
-            out_tokens.extend(tokens)
-            del tokens
+            out_tokens.extend(split_chunks(tokens, cutoff_len, step))

        del raw_text  # Note: could be a gig for a large dataset, so delete redundant data as we go to be safe on RAM
        text_chunks = [shared.tokenizer.decode(x) for x in out_tokens]
@@ -663,9 +658,9 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
        yield f"Done! LoRA saved to `{lora_file_path}`"


-def split_chunks(arr, step):
+def split_chunks(arr, size, step):
    for i in range(0, len(arr), step):
-        yield arr[i:i + step]
+        yield arr[i:i + size]


 def cut_chunk_for_newline(chunk: str, max_length: int):