mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-10-30 14:10:14 +01:00
65 lines
1.9 KiB
Python
65 lines
1.9 KiB
Python
|
import concurrent.futures
|
||
|
import requests
|
||
|
import re
|
||
|
|
||
|
from bs4 import BeautifulSoup
|
||
|
|
||
|
import extensions.superboogav2.parameters as parameters
|
||
|
|
||
|
from .data_processor import process_and_add_to_collector
|
||
|
from .utils import create_metadata_source
|
||
|
|
||
|
def _download_single(url):
    """Fetch ``url`` with a GET request and return the response body.

    The request is made with ``timeout=5``. Anything other than an HTTP 200
    status is treated as a failure.

    Args:
        url: Address to download.

    Returns:
        ``response.content`` of the successful response.

    Raises:
        Exception: If the status code is not 200. Errors raised by
            ``requests.get`` itself are not caught here.
    """
    reply = requests.get(url, timeout=5)
    if reply.status_code != 200:
        raise Exception("Failed to download URL")

    return reply.content
|
||
|
|
||
|
|
||
|
def _download_urls(urls, threads=1):
|
||
|
with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
|
||
|
futures = []
|
||
|
for url in urls:
|
||
|
future = executor.submit(_download_single, url)
|
||
|
futures.append(future)
|
||
|
|
||
|
results = []
|
||
|
i = 0
|
||
|
for future in concurrent.futures.as_completed(futures):
|
||
|
try:
|
||
|
result = future.result()
|
||
|
results.append(result)
|
||
|
i += 1
|
||
|
yield f"{i}/{len(urls)}", results
|
||
|
except Exception:
|
||
|
pass
|
||
|
|
||
|
yield "Done", results
|
||
|
|
||
|
|
||
|
def feed_url_into_collector(urls, collector):
    """Download the given URLs, extract their text and add it to ``collector``.

    Args:
        urls: String with one URL per line. Surrounding whitespace is
            stripped from each line and blank lines are ignored.
        collector: Passed through unchanged to
            ``process_and_add_to_collector``.

    Yields:
        Status strings for display: first the "Loading ..." message, then
        that message followed by each progress update from
        ``_download_urls``, and finally the message with
        'Processing the HTML sources...' appended.
    """
    # splitlines() copes with "\r\n" endings as well as "\n". Dropping blank
    # entries stops empty input from being treated as one empty URL.
    url_list = [line.strip() for line in urls.splitlines() if line.strip()]

    # Read once so the reported thread count is the one actually used.
    num_threads = parameters.get_num_threads()

    cumulative = f'Loading {len(url_list)} URLs with {num_threads} threads...\n\n'
    yield cumulative

    # After the loop, ``contents`` holds the result list from the last update.
    contents = []
    for update, contents in _download_urls(url_list, threads=num_threads):
        yield cumulative + update

    cumulative += 'Processing the HTML sources...'
    yield cumulative

    page_texts = []
    for content in contents:
        soup = BeautifulSoup(content, features="lxml")
        # Remove <script> and <style> elements so their contents are not
        # picked up as page text.
        for tag in soup(["script", "style"]):
            tag.extract()

        strings = soup.stripped_strings
        if parameters.get_is_strong_cleanup():
            # Keep only strings containing an ASCII letter followed by a space.
            strings = [s for s in strings if re.search("[A-Za-z] ", s)]

        page_texts.append('\n'.join(s.strip() for s in strings))

    # Separate pages with a newline so the last line of one page does not run
    # into the first line of the next.
    all_text = '\n'.join(page_texts)

    # NOTE(review): the meaning of the positional ``False`` is defined by
    # process_and_add_to_collector, which is not visible here.
    process_and_add_to_collector(all_text, collector, False, create_metadata_source('url-download'))
|