From 9695bfe1178c9b3ef72861be8cb02bcf484afc7c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 10 May 2023 23:23:37 -0300
Subject: [PATCH] Add an option for cleaning up html in superbooga

---
 extensions/superbooga/script.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/extensions/superbooga/script.py b/extensions/superbooga/script.py
index e239c58a..025714d9 100644
--- a/extensions/superbooga/script.py
+++ b/extensions/superbooga/script.py
@@ -16,6 +16,13 @@ from modules import chat, shared
 logging.info('Intercepting all calls to posthog :)')
 posthog.capture = lambda *args, **kwargs: None
 
+# These parameters are customizable through settings.json
+params = {
+    'chunk_count': 5,
+    'chunk_length': 700,
+    'strong_cleanup': True,
+}
+
 
 class Collecter():
     def __init__(self):
@@ -105,7 +112,7 @@ def feed_file_into_collector(file, chunk_len):
         yield i
 
 
-def feed_url_into_collector(urls, chunk_len):
+def feed_url_into_collector(urls, chunk_len, strong_cleanup=False):
     urls = urls.strip().split('\n')
     all_text = ''
     cumulative = ''
@@ -117,7 +124,11 @@
         for script in soup(["script", "style"]):
             script.extract()
 
-        text = soup.get_text()
+        strings = soup.stripped_strings
+        if strong_cleanup:
+            strings = [s for s in strings if re.search("[A-Za-z] ", s)]
+
+        text = '\n'.join([s.strip() for s in strings])
         lines = (line.strip() for line in text.splitlines())
         chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
         text = '\n\n'.join(chunk for chunk in chunks if chunk)
@@ -251,6 +262,7 @@ def ui():
 
             with gr.Tab("URL input"):
                 url_input = gr.Textbox(lines=10, label='Input URLs', info='Enter one or more URLs separated by newline characters.')
+                strong_cleanup = gr.Checkbox(value=params['strong_cleanup'], label='Strong cleanup', info='Only keeps html elements that look like long-form text.')
                 update_url = gr.Button('Load data')
 
             with gr.Tab("File input"):
@@ -258,15 +270,15 @@ def ui():
                 update_file = gr.Button('Load data')
 
             with gr.Tab("Generation settings"):
-                chunk_count = gr.Number(value=5, label='Chunk count', info='The number of closest-matching chunks to include in the prompt.')
+                chunk_count = gr.Number(value=params['chunk_count'], label='Chunk count', info='The number of closest-matching chunks to include in the prompt.')
                 update_settings = gr.Button('Apply changes')
-                chunk_len = gr.Number(value=700, label='Chunk length', info='In characters, not tokens. This value is used when you click on "Load data".')
+                chunk_len = gr.Number(value=params['chunk_length'], label='Chunk length', info='In characters, not tokens. This value is used when you click on "Load data".')
 
         with gr.Column():
             last_updated = gr.Markdown()
 
     update_data.click(feed_data_into_collector, [data_input, chunk_len], last_updated, show_progress=False)
-    update_url.click(feed_url_into_collector, [url_input, chunk_len], last_updated, show_progress=False)
+    update_url.click(feed_url_into_collector, [url_input, chunk_len, strong_cleanup], last_updated, show_progress=False)
     update_file.click(feed_file_into_collector, [file_input, chunk_len], last_updated, show_progress=False)
 
     update_settings.click(apply_settings, [chunk_count], last_updated, show_progress=False)