mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-26 01:30:20 +01:00
Fix: Fixed the tokenization process of a raw dataset and improved its efficiency (#3035)
This commit is contained in:
parent
3f19e94c93
commit
987d0fe023
@ -388,12 +388,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
|
|||||||
yield f"Error: overlap_len ({overlap_len}) cannot be greater than or equal to cutoff_len ({cutoff_len})"
|
yield f"Error: overlap_len ({overlap_len}) cannot be greater than or equal to cutoff_len ({cutoff_len})"
|
||||||
return
|
return
|
||||||
|
|
||||||
tokens = list(split_chunks(tokens, step))
|
out_tokens.extend(split_chunks(tokens, cutoff_len, step))
|
||||||
for i in range(1, len(tokens)):
|
|
||||||
tokens[i] = tokens[i - 1][-overlap_len:] + tokens[i]
|
|
||||||
|
|
||||||
out_tokens.extend(tokens)
|
|
||||||
del tokens
|
|
||||||
|
|
||||||
del raw_text # Note: could be a gig for a large dataset, so delete redundant data as we go to be safe on RAM
|
del raw_text # Note: could be a gig for a large dataset, so delete redundant data as we go to be safe on RAM
|
||||||
text_chunks = [shared.tokenizer.decode(x) for x in out_tokens]
|
text_chunks = [shared.tokenizer.decode(x) for x in out_tokens]
|
||||||
@ -663,9 +658,9 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
|
|||||||
yield f"Done! LoRA saved to `{lora_file_path}`"
|
yield f"Done! LoRA saved to `{lora_file_path}`"
|
||||||
|
|
||||||
|
|
||||||
def split_chunks(arr, step):
|
def split_chunks(arr, size, step):
|
||||||
for i in range(0, len(arr), step):
|
for i in range(0, len(arr), step):
|
||||||
yield arr[i:i + step]
|
yield arr[i:i + size]
|
||||||
|
|
||||||
|
|
||||||
def cut_chunk_for_newline(chunk: str, max_length: int):
|
def cut_chunk_for_newline(chunk: str, max_length: int):
|
||||||
|
Loading…
Reference in New Issue
Block a user