import concurrent.futures

import requests


def download_single(url):
    """Download *url* and return the raw response body as bytes.

    A 5-second timeout is applied so one slow host cannot hang a worker
    thread indefinitely.

    Raises:
        Exception: if the server responds with anything other than HTTP 200.
            The message includes the URL and status code so failures are
            diagnosable from the log.
    """
    response = requests.get(url, timeout=5)
    if response.status_code == 200:
        return response.content
    # Include the failing URL and status in the message — a bare
    # "Failed to download URL" gives the caller nothing to act on.
    raise Exception(f"Failed to download {url} (HTTP {response.status_code})")


def download_urls(urls, threads=1):
    """Download *urls* concurrently with a pool of *threads* workers.

    This is a generator: it yields ``(progress_message, results)`` tuples as
    downloads complete, where ``results`` is the list of response bodies
    collected so far (in completion order, not input order). The final yield
    is ``("Done", results)``.

    Failed downloads are skipped — their exception is swallowed so one bad
    URL does not abort the batch — but the progress counter still advances,
    so the reported "i/N" always reaches N instead of stalling on failures.

    Args:
        urls: iterable of URL strings to fetch.
        threads: number of worker threads (default 1).
    """
    results = []
    total = len(urls)
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        futures = [executor.submit(download_single, url) for url in urls]

        completed = 0
        for future in concurrent.futures.as_completed(futures):
            # Count every finished future, success or failure, so the
            # progress message is accurate even when some URLs fail.
            completed += 1
            try:
                results.append(future.result())
            except Exception:
                # Best-effort by design: a failed URL is dropped from the
                # results rather than aborting the whole batch.
                pass

            yield f"{completed}/{total}", results

    yield "Done", results
'chunk_count': 5, 'chunk_length': 700, 'strong_cleanup': True, + 'threads': 4, } @@ -112,15 +114,20 @@ def feed_file_into_collector(file, chunk_len): yield i -def feed_url_into_collector(urls, chunk_len, strong_cleanup=False): - urls = urls.strip().split('\n') +def feed_url_into_collector(urls, chunk_len, strong_cleanup, threads): all_text = '' cumulative = '' - for url in urls: - cumulative += f'Loading {url}...\n\n' - yield cumulative - html = urlopen(url).read() - soup = BeautifulSoup(html, features="html.parser") + + urls = urls.strip().split('\n') + cumulative += f'Loading {len(urls)} URLs with {threads} threads...\n\n' + yield cumulative + for update, contents in download_urls(urls, threads=threads): + yield cumulative + update + + cumulative += 'Processing the HTML sources...' + yield cumulative + for content in contents: + soup = BeautifulSoup(content, features="html.parser") for script in soup(["script", "style"]): script.extract() @@ -217,24 +224,24 @@ def ui(): ### Example - For your convenience, you can use the following prompt as a starting point (for Alpaca models): + For your convenience, you can use the following prompt as a starting point (for Vicuna 1.1 models): ``` - Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. + A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. - ### Instruction: - You are ArxivGPT, trained on millions of Arxiv papers. You always answer the question, even if full context isn't provided to you. The following are snippets from an Arxiv paper. Use the snippets to answer the question. Think about it step by step + USER: + + <|begin-user-input|> + What datasets are mentioned in the text below? + <|end-user-input|> <|injection-point|> - ### Input: - <|begin-user-input|> - What datasets are mentioned in the paper above? 
- <|end-user-input|> - - ### Response: + ASSISTANT: ``` + ⚠️ For best results, make sure to remove the spaces and new line characters after `ASSISTANT:`. + ## Chat mode In chat mode, the extension automatically sorts the history by relevance instead of chronologically, except for the very latest input/reply pair. @@ -263,6 +270,7 @@ def ui(): with gr.Tab("URL input"): url_input = gr.Textbox(lines=10, label='Input URLs', info='Enter one or more URLs separated by newline characters.') strong_cleanup = gr.Checkbox(value=params['strong_cleanup'], label='Strong cleanup', info='Only keeps html elements that look like long-form text.') + threads = gr.Number(value=params['threads'], label='Threads', info='The number of threads to use while downloading the URLs.', precision=0) update_url = gr.Button('Load data') with gr.Tab("File input"): @@ -279,6 +287,6 @@ def ui(): last_updated = gr.Markdown() update_data.click(feed_data_into_collector, [data_input, chunk_len], last_updated, show_progress=False) - update_url.click(feed_url_into_collector, [url_input, chunk_len, strong_cleanup], last_updated, show_progress=False) + update_url.click(feed_url_into_collector, [url_input, chunk_len, strong_cleanup, threads], last_updated, show_progress=False) update_file.click(feed_file_into_collector, [file_input, chunk_len], last_updated, show_progress=False) update_settings.click(apply_settings, [chunk_count], last_updated, show_progress=False) diff --git a/server.py b/server.py index 50c32895..43027f8d 100644 --- a/server.py +++ b/server.py @@ -146,12 +146,13 @@ def load_prompt(fname): } output += utils.replace_all(data['turn_template'].split('<|bot-message|>')[0], replacements) - return output + return output.rstrip(' ') else: with open(Path(f'prompts/{fname}.txt'), 'r', encoding='utf-8') as f: text = f.read() if text[-1] == '\n': text = text[:-1] + return text