mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-21 23:57:58 +01:00
Add an option for cleaning up html in superbooga
This commit is contained in:
parent
c7ba2d4f3f
commit
9695bfe117
@ -16,6 +16,13 @@ from modules import chat, shared
|
||||
logging.info('Intercepting all calls to posthog :)')
|
||||
posthog.capture = lambda *args, **kwargs: None
|
||||
|
||||
# These parameters are customizable through settings.json
|
||||
params = {
|
||||
'chunk_count': 5,
|
||||
'chunk_length': 700
|
||||
'strong_cleanup': True,
|
||||
}
|
||||
|
||||
|
||||
class Collecter():
|
||||
def __init__(self):
|
||||
@ -105,7 +112,7 @@ def feed_file_into_collector(file, chunk_len):
|
||||
yield i
|
||||
|
||||
|
||||
def feed_url_into_collector(urls, chunk_len):
|
||||
def feed_url_into_collector(urls, chunk_len, strong_cleanup=False):
|
||||
urls = urls.strip().split('\n')
|
||||
all_text = ''
|
||||
cumulative = ''
|
||||
@ -117,7 +124,11 @@ def feed_url_into_collector(urls, chunk_len):
|
||||
for script in soup(["script", "style"]):
|
||||
script.extract()
|
||||
|
||||
text = soup.get_text()
|
||||
strings = soup.stripped_strings
|
||||
if strong_cleanup:
|
||||
strings = [s for s in strings if re.search("[A-Za-z] ", s)]
|
||||
|
||||
text = '\n'.join([s.strip() for s in strings])
|
||||
lines = (line.strip() for line in text.splitlines())
|
||||
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
|
||||
text = '\n\n'.join(chunk for chunk in chunks if chunk)
|
||||
@ -251,6 +262,7 @@ def ui():
|
||||
|
||||
with gr.Tab("URL input"):
|
||||
url_input = gr.Textbox(lines=10, label='Input URLs', info='Enter one or more URLs separated by newline characters.')
|
||||
strong_cleanup = gr.Checkbox(value=params['strong_cleanup'], label='Strong cleanup', info='Only keeps html elements that look like long-form text.')
|
||||
update_url = gr.Button('Load data')
|
||||
|
||||
with gr.Tab("File input"):
|
||||
@ -258,15 +270,15 @@ def ui():
|
||||
update_file = gr.Button('Load data')
|
||||
|
||||
with gr.Tab("Generation settings"):
|
||||
chunk_count = gr.Number(value=5, label='Chunk count', info='The number of closest-matching chunks to include in the prompt.')
|
||||
chunk_count = gr.Number(value=params['chunk_count'], label='Chunk count', info='The number of closest-matching chunks to include in the prompt.')
|
||||
update_settings = gr.Button('Apply changes')
|
||||
|
||||
chunk_len = gr.Number(value=700, label='Chunk length', info='In characters, not tokens. This value is used when you click on "Load data".')
|
||||
chunk_len = gr.Number(value=params['chunk_length'], label='Chunk length', info='In characters, not tokens. This value is used when you click on "Load data".')
|
||||
|
||||
with gr.Column():
|
||||
last_updated = gr.Markdown()
|
||||
|
||||
update_data.click(feed_data_into_collector, [data_input, chunk_len], last_updated, show_progress=False)
|
||||
update_url.click(feed_url_into_collector, [url_input, chunk_len], last_updated, show_progress=False)
|
||||
update_url.click(feed_url_into_collector, [url_input, chunk_len, strong_cleanup], last_updated, show_progress=False)
|
||||
update_file.click(feed_file_into_collector, [file_input, chunk_len], last_updated, show_progress=False)
|
||||
update_settings.click(apply_settings, [chunk_count], last_updated, show_progress=False)
|
||||
|
Loading…
Reference in New Issue
Block a user