mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-22 08:07:56 +01:00
Allow superbooga to download URLs in parallel
This commit is contained in:
parent
df37ba5256
commit
146a9cb393
32
extensions/superbooga/download_urls.py
Normal file
32
extensions/superbooga/download_urls.py
Normal file
@ -0,0 +1,32 @@
|
||||
import concurrent.futures
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
def download_single(url):
    """Download a single URL and return its raw response body.

    Parameters:
        url (str): The URL to fetch.

    Returns:
        bytes: The response content when the server answers HTTP 200.

    Raises:
        Exception: If the response status is not 200. Kept as a plain
            Exception because the caller (download_urls) catches
            Exception broadly for its best-effort semantics.
    """
    # 5-second timeout so one unresponsive host cannot stall a worker
    # thread indefinitely.
    response = requests.get(url, timeout=5)
    if response.status_code == 200:
        return response.content
    # Include the URL and status code so a failed batch download is
    # actually diagnosable from the error message.
    raise Exception(f"Failed to download {url} (status code {response.status_code})")
|
||||
|
||||
|
||||
def download_urls(urls, threads=1):
    """Download several URLs concurrently, yielding progress updates.

    Parameters:
        urls (list[str]): URLs to fetch.
        threads (int): Number of worker threads to use (default 1).

    Yields:
        tuple[str, list[bytes]]: After each URL finishes (successfully or
        not), a ("completed/total", results) pair; finally ("Done", results).
        Failed downloads are skipped, so len(results) may be < len(urls).
    """
    total = len(urls)  # invariant; hoisted out of the progress loop
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        futures = [executor.submit(download_single, url) for url in urls]

        results = []
        completed = 0
        for future in concurrent.futures.as_completed(futures):
            # Count failures too; otherwise the progress readout would
            # stall below `total` whenever a URL fails, looking like a hang.
            completed += 1
            try:
                results.append(future.result())
            except Exception:
                # Best-effort: one bad URL must not abort the whole batch.
                pass
            yield f"{completed}/{total}", results

        yield "Done", results
|
@ -1,7 +1,6 @@
|
||||
import logging
|
||||
import re
|
||||
import textwrap
|
||||
from urllib.request import urlopen
|
||||
|
||||
import chromadb
|
||||
import gradio as gr
|
||||
@ -13,6 +12,8 @@ from sentence_transformers import SentenceTransformer
|
||||
|
||||
from modules import chat, shared
|
||||
|
||||
from .download_urls import download_urls
|
||||
|
||||
logging.info('Intercepting all calls to posthog :)')
|
||||
posthog.capture = lambda *args, **kwargs: None
|
||||
|
||||
@ -21,6 +22,7 @@ params = {
|
||||
'chunk_count': 5,
|
||||
'chunk_length': 700,
|
||||
'strong_cleanup': True,
|
||||
'threads': 4,
|
||||
}
|
||||
|
||||
|
||||
@ -112,15 +114,20 @@ def feed_file_into_collector(file, chunk_len):
|
||||
yield i
|
||||
|
||||
|
||||
def feed_url_into_collector(urls, chunk_len, strong_cleanup=False):
|
||||
urls = urls.strip().split('\n')
|
||||
def feed_url_into_collector(urls, chunk_len, strong_cleanup, threads):
|
||||
all_text = ''
|
||||
cumulative = ''
|
||||
for url in urls:
|
||||
cumulative += f'Loading {url}...\n\n'
|
||||
|
||||
urls = urls.strip().split('\n')
|
||||
cumulative += f'Loading {len(urls)} URLs with {threads} threads...\n\n'
|
||||
yield cumulative
|
||||
html = urlopen(url).read()
|
||||
soup = BeautifulSoup(html, features="html.parser")
|
||||
for update, contents in download_urls(urls, threads=threads):
|
||||
yield cumulative + update
|
||||
|
||||
cumulative += 'Processing the HTML sources...'
|
||||
yield cumulative
|
||||
for content in contents:
|
||||
soup = BeautifulSoup(content, features="html.parser")
|
||||
for script in soup(["script", "style"]):
|
||||
script.extract()
|
||||
|
||||
@ -217,24 +224,24 @@ def ui():
|
||||
|
||||
### Example
|
||||
|
||||
For your convenience, you can use the following prompt as a starting point (for Alpaca models):
|
||||
For your convenience, you can use the following prompt as a starting point (for Vicuna 1.1 models):
|
||||
|
||||
```
|
||||
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
|
||||
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
|
||||
|
||||
### Instruction:
|
||||
You are ArxivGPT, trained on millions of Arxiv papers. You always answer the question, even if full context isn't provided to you. The following are snippets from an Arxiv paper. Use the snippets to answer the question. Think about it step by step
|
||||
USER:
|
||||
|
||||
<|begin-user-input|>
|
||||
What datasets are mentioned in the text below?
|
||||
<|end-user-input|>
|
||||
|
||||
<|injection-point|>
|
||||
|
||||
### Input:
|
||||
<|begin-user-input|>
|
||||
What datasets are mentioned in the paper above?
|
||||
<|end-user-input|>
|
||||
|
||||
### Response:
|
||||
ASSISTANT:
|
||||
```
|
||||
|
||||
⚠️ For best results, make sure to remove the spaces and new line characters after `ASSISTANT:`.
|
||||
|
||||
## Chat mode
|
||||
|
||||
In chat mode, the extension automatically sorts the history by relevance instead of chronologically, except for the very latest input/reply pair.
|
||||
@ -263,6 +270,7 @@ def ui():
|
||||
with gr.Tab("URL input"):
|
||||
url_input = gr.Textbox(lines=10, label='Input URLs', info='Enter one or more URLs separated by newline characters.')
|
||||
strong_cleanup = gr.Checkbox(value=params['strong_cleanup'], label='Strong cleanup', info='Only keeps html elements that look like long-form text.')
|
||||
threads = gr.Number(value=params['threads'], label='Threads', info='The number of threads to use while downloading the URLs.', precision=0)
|
||||
update_url = gr.Button('Load data')
|
||||
|
||||
with gr.Tab("File input"):
|
||||
@ -279,6 +287,6 @@ def ui():
|
||||
last_updated = gr.Markdown()
|
||||
|
||||
update_data.click(feed_data_into_collector, [data_input, chunk_len], last_updated, show_progress=False)
|
||||
update_url.click(feed_url_into_collector, [url_input, chunk_len, strong_cleanup], last_updated, show_progress=False)
|
||||
update_url.click(feed_url_into_collector, [url_input, chunk_len, strong_cleanup, threads], last_updated, show_progress=False)
|
||||
update_file.click(feed_file_into_collector, [file_input, chunk_len], last_updated, show_progress=False)
|
||||
update_settings.click(apply_settings, [chunk_count], last_updated, show_progress=False)
|
||||
|
@ -146,12 +146,13 @@ def load_prompt(fname):
|
||||
}
|
||||
|
||||
output += utils.replace_all(data['turn_template'].split('<|bot-message|>')[0], replacements)
|
||||
return output
|
||||
return output.rstrip(' ')
|
||||
else:
|
||||
with open(Path(f'prompts/{fname}.txt'), 'r', encoding='utf-8') as f:
|
||||
text = f.read()
|
||||
if text[-1] == '\n':
|
||||
text = text[:-1]
|
||||
|
||||
return text
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user