mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-22 08:07:56 +01:00
Allow superbooga to download URLs in parallel
This commit is contained in:
parent
df37ba5256
commit
146a9cb393
32
extensions/superbooga/download_urls.py
Normal file
32
extensions/superbooga/download_urls.py
Normal file
@ -0,0 +1,32 @@
|
||||
import concurrent.futures
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
def download_single(url):
    """Download a single URL and return its raw response body.

    Parameters:
        url (str): The URL to fetch.

    Returns:
        bytes: The response content when the server answers HTTP 200.

    Raises:
        Exception: If the response status is not 200. Kept as a plain
            Exception because the caller (download_urls) catches
            Exception broadly for its best-effort semantics.
    """
    # 5-second timeout so one unresponsive host cannot stall a worker
    # thread indefinitely.
    response = requests.get(url, timeout=5)
    if response.status_code == 200:
        return response.content
    # Include the URL and status code so a failed batch download is
    # actually diagnosable from the error message.
    raise Exception(f"Failed to download {url} (status code {response.status_code})")
|
||||
|
||||
|
||||
def download_urls(urls, threads=1):
    """Download several URLs concurrently, yielding progress updates.

    Parameters:
        urls (list[str]): URLs to fetch.
        threads (int): Number of worker threads to use (default 1).

    Yields:
        tuple[str, list[bytes]]: After each URL finishes (successfully or
        not), a ("completed/total", results) pair; finally ("Done", results).
        Failed downloads are skipped, so len(results) may be < len(urls).
    """
    total = len(urls)  # invariant; hoisted out of the progress loop
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        futures = [executor.submit(download_single, url) for url in urls]

        results = []
        completed = 0
        for future in concurrent.futures.as_completed(futures):
            # Count failures too; otherwise the progress readout would
            # stall below `total` whenever a URL fails, looking like a hang.
            completed += 1
            try:
                results.append(future.result())
            except Exception:
                # Best-effort: one bad URL must not abort the whole batch.
                pass
            yield f"{completed}/{total}", results

        yield "Done", results
|
@ -1,7 +1,6 @@
|
||||
import logging
|
||||
import re
|
||||
import textwrap
|
||||
from urllib.request import urlopen
|
||||
|
||||
import chromadb
|
||||
import gradio as gr
|
||||
@ -13,6 +12,8 @@ from sentence_transformers import SentenceTransformer
|
||||
|
||||
from modules import chat, shared
|
||||
|
||||
from .download_urls import download_urls
|
||||
|
||||
logging.info('Intercepting all calls to posthog :)')
|
||||
posthog.capture = lambda *args, **kwargs: None
|
||||
|
||||
@ -21,6 +22,7 @@ params = {
|
||||
'chunk_count': 5,
|
||||
'chunk_length': 700,
|
||||
'strong_cleanup': True,
|
||||
'threads': 4,
|
||||
}
|
||||
|
||||
|
||||
@ -112,15 +114,20 @@ def feed_file_into_collector(file, chunk_len):
|
||||
yield i
|
||||
|
||||
|
||||
def feed_url_into_collector(urls, chunk_len, strong_cleanup=False):
|
||||
urls = urls.strip().split('\n')
|
||||
def feed_url_into_collector(urls, chunk_len, strong_cleanup, threads):
|
||||
all_text = ''
|
||||
cumulative = ''
|
||||
for url in urls:
|
||||
cumulative += f'Loading {url}...\n\n'
|
||||
|
||||
urls = urls.strip().split('\n')
|
||||
cumulative += f'Loading {len(urls)} URLs with {threads} threads...\n\n'
|
||||
yield cumulative
|
||||
html = urlopen(url).read()
|
||||
soup = BeautifulSoup(html, features="html.parser")
|
||||
for update, contents in download_urls(urls, threads=threads):
|
||||
yield cumulative + update
|
||||
|
||||
cumulative += 'Processing the HTML sources...'
|
||||
yield cumulative
|
||||
for content in contents:
|
||||
soup = BeautifulSoup(content, features="html.parser")
|
||||
for script in soup(["script", "style"]):
|
||||
script.extract()
|
||||
|
||||
@ -217,24 +224,24 @@ def ui():
|
||||
|
||||
### Example
|
||||
|
||||
For your convenience, you can use the following prompt as a starting point (for Alpaca models):
|
||||
For your convenience, you can use the following prompt as a starting point (for Vicuna 1.1 models):
|
||||
|
||||
```
|
||||
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
|
||||
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
|
||||
|
||||
### Instruction:
|
||||
You are ArxivGPT, trained on millions of Arxiv papers. You always answer the question, even if full context isn't provided to you. The following are snippets from an Arxiv paper. Use the snippets to answer the question. Think about it step by step
|
||||
USER:
|
||||
|
||||
<|begin-user-input|>
|
||||
What datasets are mentioned in the text below?
|
||||
<|end-user-input|>
|
||||
|
||||
<|injection-point|>
|
||||
|
||||
### Input:
|
||||
<|begin-user-input|>
|
||||
What datasets are mentioned in the paper above?
|
||||
<|end-user-input|>
|
||||
|
||||
### Response:
|
||||
ASSISTANT:
|
||||
```
|
||||
|
||||
⚠️ For best results, make sure to remove the spaces and new line characters after `ASSISTANT:`.
|
||||
|
||||
## Chat mode
|
||||
|
||||
In chat mode, the extension automatically sorts the history by relevance instead of chronologically, except for the very latest input/reply pair.
|
||||
@ -263,6 +270,7 @@ def ui():
|
||||
with gr.Tab("URL input"):
|
||||
url_input = gr.Textbox(lines=10, label='Input URLs', info='Enter one or more URLs separated by newline characters.')
|
||||
strong_cleanup = gr.Checkbox(value=params['strong_cleanup'], label='Strong cleanup', info='Only keeps html elements that look like long-form text.')
|
||||
threads = gr.Number(value=params['threads'], label='Threads', info='The number of threads to use while downloading the URLs.', precision=0)
|
||||
update_url = gr.Button('Load data')
|
||||
|
||||
with gr.Tab("File input"):
|
||||
@ -279,6 +287,6 @@ def ui():
|
||||
last_updated = gr.Markdown()
|
||||
|
||||
update_data.click(feed_data_into_collector, [data_input, chunk_len], last_updated, show_progress=False)
|
||||
update_url.click(feed_url_into_collector, [url_input, chunk_len, strong_cleanup], last_updated, show_progress=False)
|
||||
update_url.click(feed_url_into_collector, [url_input, chunk_len, strong_cleanup, threads], last_updated, show_progress=False)
|
||||
update_file.click(feed_file_into_collector, [file_input, chunk_len], last_updated, show_progress=False)
|
||||
update_settings.click(apply_settings, [chunk_count], last_updated, show_progress=False)
|
||||
|
@ -146,12 +146,13 @@ def load_prompt(fname):
|
||||
}
|
||||
|
||||
output += utils.replace_all(data['turn_template'].split('<|bot-message|>')[0], replacements)
|
||||
return output
|
||||
return output.rstrip(' ')
|
||||
else:
|
||||
with open(Path(f'prompts/{fname}.txt'), 'r', encoding='utf-8') as f:
|
||||
text = f.read()
|
||||
if text[-1] == '\n':
|
||||
text = text[:-1]
|
||||
|
||||
return text
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user