From 5d513eea222f29f1e1c1b0a57500134873bf7c0f Mon Sep 17 00:00:00 2001 From: kizinfo Date: Wed, 12 Jul 2023 17:44:30 +0300 Subject: [PATCH] Add ability to load all text files from a subdirectory for training (#1997) * Update utils.py returns individual txt files and subdirectories to getdatasets to allow for training from a directory of text files * Update training.py minor tweak to training on raw datasets to detect if a directory is selected, and if so, to load in all the txt files in that directory for training * Update put-trainer-datasets-here.txt document * Minor change * Use pathlib, sort by natural keys * Space --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> --- modules/training.py | 22 ++++++++++++++----- modules/utils.py | 4 ++++ .../datasets/put-trainer-datasets-here.txt | 1 + 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/modules/training.py b/modules/training.py index 442b92b3..2f9a7768 100644 --- a/modules/training.py +++ b/modules/training.py @@ -32,6 +32,7 @@ from modules.evaluate import ( save_past_evaluations ) from modules.logging_colors import logger +from modules.utils import natural_keys # This mapping is from a very recent commit, not yet released. # If not available, default to a backup map for some common model types. 
@@ -354,12 +355,24 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch # == Prep the dataset, format, etc == if raw_text_file not in ['None', '']: - logger.info("Loading raw text file dataset...") - train_template["template_type"] = "raw_text" + logger.info("Loading raw text file dataset...") + train_template["template_type"] = "raw_text" + fullpath = clean_path('training/datasets', f'{raw_text_file}') + fullpath = Path(fullpath) + if fullpath.is_dir(): + logger.info('Training path directory {}'.format(raw_text_file)) + raw_text = "" + file_paths = sorted(fullpath.glob('*.txt'), key=lambda path: natural_keys(path.name)) + for file_path in file_paths: + if file_path.is_file(): + with file_path.open('r', encoding='utf-8') as file: + raw_text += file.read().replace('\r', '') - with open(clean_path('training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file: - raw_text = file.read().replace('\r', '') + logger.info(f"Loaded training file: {file_path.name}") + else: + with open(clean_path('training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file: + raw_text = file.read().replace('\r', '') cut_string = hard_cut_string.replace('\\n', '\n') out_tokens = [] @@ -579,7 +592,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch if WANT_INTERRUPT: yield "Interrupted before start." 
return - + def log_train_dataset(trainer): decoded_entries = [] # Try to decode the entries and write the log file diff --git a/modules/utils.py b/modules/utils.py index 72a0dfa1..8b662be1 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -114,6 +114,10 @@ def get_available_loras(): def get_datasets(path: str, ext: str): + # include subdirectories for raw txt files to allow training from a subdirectory of txt files + if ext == "txt": + return ['None'] + sorted(set([k.stem for k in list(Path(path).glob('*.txt'))+list(Path(path).glob('*/')) if k.stem != 'put-trainer-datasets-here']), key=natural_keys) + return ['None'] + sorted(set([k.stem for k in Path(path).glob(f'*.{ext}') if k.stem != 'put-trainer-datasets-here']), key=natural_keys) diff --git a/training/datasets/put-trainer-datasets-here.txt b/training/datasets/put-trainer-datasets-here.txt index e69de29b..932eacf8 100644 --- a/training/datasets/put-trainer-datasets-here.txt +++ b/training/datasets/put-trainer-datasets-here.txt @@ -0,0 +1 @@ +To load multiple raw text files, create a subdirectory and put them all there.