Allow superbooga to download URLs in parallel

2024-11-26 01:30:20 +01:00 · 2023-05-12 14:19:55 -03:00 · 2023-05-12 14:19:55 -03:00 · 146a9cb393
commit 146a9cb393
parent df37ba5256
3 changed files with 61 additions and 20 deletions
--- a/extensions/superbooga/download_urls.py
+++ b/extensions/superbooga/download_urls.py
@ -0,0 +1,32 @@
 import concurrent.futures
 import requests
 def download_single(url):
    """Fetch *url* with a 5-second timeout and return the response body.

    Returns ``response.content`` when the server answers with HTTP 200.
    Any other status code raises a plain ``Exception``; errors raised by
    ``requests.get`` itself propagate unchanged.
    """
    response = requests.get(url, timeout=5)

    # Guard clause: anything other than a plain 200 counts as a failure.
    if response.status_code != 200:
        raise Exception("Failed to download URL")

    return response.content
 def download_urls(urls, threads=1):
    """Download *urls* concurrently, yielding progress as downloads finish.

    This is a generator. After each successful download it yields a
    ``(progress, results)`` tuple: ``progress`` is a string such as
    ``"2/5"`` (successful downloads so far / total number of URLs) and
    ``results`` is the list of downloaded bodies in completion order.
    A final ``("Done", results)`` tuple is always yielded last.

    The same ``results`` list object is yielded every time and keeps
    growing, so a caller that needs a snapshot must copy it.

    Downloads are best-effort: a URL that fails is logged with a warning
    and skipped; its error is never raised to the caller.

    ``threads`` is passed to ``ThreadPoolExecutor`` as ``max_workers``.
    """
    import logging  # function-scope import keeps this block self-contained

    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        # Remember which URL each future belongs to so a failure can be
        # reported with the URL that caused it.
        future_to_url = {executor.submit(download_single, url): url for url in urls}

        results = []
        for future in concurrent.futures.as_completed(future_to_url):
            # Keep the try body minimal: only result() re-raises worker errors.
            try:
                content = future.result()
            except Exception as exc:
                logging.warning("Failed to download %s: %s", future_to_url[future], exc)
                continue

            results.append(content)
            yield f"{len(results)}/{len(urls)}", results

        yield "Done", results
--- a/extensions/superbooga/script.py
+++ b/extensions/superbooga/script.py
@ -1,7 +1,6 @@
 import logging
 import re
 import textwrap
 from urllib.request import urlopen
 import chromadb
 import gradio as gr
@ -13,6 +12,8 @@ from sentence_transformers import SentenceTransformer
 from modules import chat, shared
 from .download_urls import download_urls
 logging.info('Intercepting all calls to posthog :)')
 posthog.capture = lambda *args, **kwargs: None
@ -21,6 +22,7 @@ params = {
    'chunk_count': 5,
    'chunk_length': 700,
    'strong_cleanup': True,
    'threads': 4,
 }
@ -112,15 +114,20 @@ def feed_file_into_collector(file, chunk_len):
        yield i
-def feed_url_into_collector(urls, chunk_len, strong_cleanup=False):
+def feed_url_into_collector(urls, chunk_len, strong_cleanup, threads):
    urls = urls.strip().split('\n')
    all_text = ''
    cumulative = ''
-    for url in urls:
+
-        cumulative += f'Loading {url}...\n\n'
+    urls = urls.strip().split('\n')
    cumulative += f'Loading {len(urls)} URLs with {threads} threads...\n\n'
    yield cumulative
-        html = urlopen(url).read()
+    for update, contents in download_urls(urls, threads=threads):
-        soup = BeautifulSoup(html, features="html.parser")
+        yield cumulative + update
    cumulative += 'Processing the HTML sources...'
    yield cumulative
    for content in contents:
        soup = BeautifulSoup(content, features="html.parser")
        for script in soup(["script", "style"]):
            script.extract()
@ -217,24 +224,24 @@ def ui():
        ### Example
-        For your convenience, you can use the following prompt as a starting point (for Alpaca models):
+        For your convenience, you can use the following prompt as a starting point (for Vicuna 1.1 models):
        ```
-        Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+        A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
-        ### Instruction:
+        USER:
-        You are ArxivGPT, trained on millions of Arxiv papers. You always answer the question, even if full context isn't provided to you. The following are snippets from an Arxiv paper. Use the snippets to answer the question. Think about it step by step
+
        <|begin-user-input|>
        What datasets are mentioned in the text below?
        <|end-user-input|>
        <|injection-point|>
-        ### Input:
+        ASSISTANT:
        <|begin-user-input|>
        What datasets are mentioned in the paper above?
        <|end-user-input|>
        ### Response:
        ```
        ⚠️  For best results, make sure to remove the spaces and new line characters after `ASSISTANT:`.
        ## Chat mode
        In chat mode, the extension automatically sorts the history by relevance instead of chronologically, except for the very latest input/reply pair.
@ -263,6 +270,7 @@ def ui():
                with gr.Tab("URL input"):
                    url_input = gr.Textbox(lines=10, label='Input URLs', info='Enter one or more URLs separated by newline characters.')
                    strong_cleanup = gr.Checkbox(value=params['strong_cleanup'], label='Strong cleanup', info='Only keeps html elements that look like long-form text.')
                    threads = gr.Number(value=params['threads'], label='Threads', info='The number of threads to use while downloading the URLs.', precision=0)
                    update_url = gr.Button('Load data')
                with gr.Tab("File input"):
@ -279,6 +287,6 @@ def ui():
                last_updated = gr.Markdown()
        update_data.click(feed_data_into_collector, [data_input, chunk_len], last_updated, show_progress=False)
-        update_url.click(feed_url_into_collector, [url_input, chunk_len, strong_cleanup], last_updated, show_progress=False)
+        update_url.click(feed_url_into_collector, [url_input, chunk_len, strong_cleanup, threads], last_updated, show_progress=False)
        update_file.click(feed_file_into_collector, [file_input, chunk_len], last_updated, show_progress=False)
        update_settings.click(apply_settings, [chunk_count], last_updated, show_progress=False)
--- a/server.py
+++ b/server.py
@ -146,12 +146,13 @@ def load_prompt(fname):
            }
            output += utils.replace_all(data['turn_template'].split('<|bot-message|>')[0], replacements)
-            return output
+            return output.rstrip(' ')
    else:
        with open(Path(f'prompts/{fname}.txt'), 'r', encoding='utf-8') as f:
            text = f.read()
            if text[-1] == '\n':
                text = text[:-1]
            return text