Change training file encoding

oobabooga 2023-04-07 11:15:52 -03:00
parent 6762e62a40
commit 768354239b


@@ -152,7 +152,7 @@ def do_train(lora_name: str, micro_batch_size: int, batch_size: int, epochs: int
     # == Prep the dataset, format, etc ==
     if raw_text_file not in ['None', '']:
         print("Loading raw text file dataset...")
-        with open(clean_path('training/datasets', f'{raw_text_file}.txt'), 'r') as file:
+        with open(clean_path('training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file:
             raw_text = file.read()
         tokens = shared.tokenizer.encode(raw_text)
         del raw_text  # Note: could be a gig for a large dataset, so delete redundant data as we go to be safe on RAM
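For context, a minimal sketch (not part of this commit) of why the explicit encoding argument matters: without it, Python's open() falls back to the platform's locale encoding, which on many Windows systems is not UTF-8, so a UTF-8 dataset containing non-ASCII characters can raise UnicodeDecodeError or be silently mis-decoded. The file name below is hypothetical.

# Sketch illustrating the difference between the locale default and explicit UTF-8.
from pathlib import Path

sample = Path("sample_dataset.txt")  # hypothetical dataset file, not from the repo
sample.write_text("caf\u00e9 r\u00e9sum\u00e9", encoding="utf-8")  # non-ASCII content

# Locale-dependent default: may fail or produce mojibake on non-UTF-8 systems.
# text = sample.read_text()

# Explicit UTF-8, matching the change in this commit: decodes consistently everywhere.
text = sample.read_text(encoding="utf-8")
print(text)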