From 987d0fe023d4a536193a54dbfc5ef6cf8b54859a Mon Sep 17 00:00:00 2001
From: Fernando Tarin Morales
Date: Thu, 13 Jul 2023 00:05:37 +0900
Subject: [PATCH] Fix: Fixed the tokenization process of a raw dataset and
 improved its efficiency (#3035)

---
 modules/training.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/modules/training.py b/modules/training.py
index 22dce3fc..9388436b 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -388,12 +388,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
                 yield f"Error: overlap_len ({overlap_len}) cannot be greater than or equal to cutoff_len ({cutoff_len})"
                 return
 
-            tokens = list(split_chunks(tokens, step))
-            for i in range(1, len(tokens)):
-                tokens[i] = tokens[i - 1][-overlap_len:] + tokens[i]
-
-            out_tokens.extend(tokens)
-            del tokens
+            out_tokens.extend(split_chunks(tokens, cutoff_len, step))
 
         del raw_text  # Note: could be a gig for a large dataset, so delete redundant data as we go to be safe on RAM
         text_chunks = [shared.tokenizer.decode(x) for x in out_tokens]
@@ -663,9 +658,9 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
     yield f"Done! LoRA saved to `{lora_file_path}`"
 
 
-def split_chunks(arr, step):
+def split_chunks(arr, size, step):
     for i in range(0, len(arr), step):
-        yield arr[i:i + size]
+        yield arr[i:i + size]
 
 
 def cut_chunk_for_newline(chunk: str, max_length: int):
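
For context, a minimal standalone sketch of the chunking behavior after this patch: split_chunks now slices fixed-size windows of cutoff_len tokens while advancing by step = cutoff_len - overlap_len, so consecutive chunks share overlap_len tokens without the separate concatenation loop that was removed. The token list and the cutoff_len/overlap_len values below are illustrative, not taken from the training code.

# Sketch of the post-patch chunking; values are illustrative only.
def split_chunks(arr, size, step):
    for i in range(0, len(arr), step):
        yield arr[i:i + size]

tokens = list(range(10))   # stand-in for shared.tokenizer.encode(text_part)
cutoff_len = 4             # hypothetical training settings
overlap_len = 2
step = cutoff_len - overlap_len

out_tokens = list(split_chunks(tokens, cutoff_len, step))
print(out_tokens)
# [[0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7], [6, 7, 8, 9], [8, 9]]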