mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2025-01-24 18:49:23 +01:00
Fix: Fixed the tokenization process of a raw dataset and improved its efficiency (#3035)
This commit is contained in:
parent
3f19e94c93
commit
987d0fe023
@ -388,12 +388,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
|
||||
yield f"Error: overlap_len ({overlap_len}) cannot be greater than or equal to cutoff_len ({cutoff_len})"
|
||||
return
|
||||
|
||||
tokens = list(split_chunks(tokens, step))
|
||||
for i in range(1, len(tokens)):
|
||||
tokens[i] = tokens[i - 1][-overlap_len:] + tokens[i]
|
||||
|
||||
out_tokens.extend(tokens)
|
||||
del tokens
|
||||
out_tokens.extend(split_chunks(tokens, cutoff_len, step))
|
||||
|
||||
del raw_text # Note: could be a gig for a large dataset, so delete redundant data as we go to be safe on RAM
|
||||
text_chunks = [shared.tokenizer.decode(x) for x in out_tokens]
|
||||
@ -663,9 +658,9 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
|
||||
yield f"Done! LoRA saved to `{lora_file_path}`"
|
||||
|
||||
|
||||
def split_chunks(arr, step):
|
||||
def split_chunks(arr, size, step):
|
||||
for i in range(0, len(arr), step):
|
||||
yield arr[i:i + step]
|
||||
yield arr[i:i + size]
|
||||
|
||||
|
||||
def cut_chunk_for_newline(chunk: str, max_length: int):
|
||||
|
Loading…
Reference in New Issue
Block a user