mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-22 08:07:56 +01:00
Add an option for cleaning up html in superbooga
This commit is contained in:
parent
c7ba2d4f3f
commit
9695bfe117
@ -16,6 +16,13 @@ from modules import chat, shared
|
|||||||
logging.info('Intercepting all calls to posthog :)')
|
logging.info('Intercepting all calls to posthog :)')
|
||||||
posthog.capture = lambda *args, **kwargs: None
|
posthog.capture = lambda *args, **kwargs: None
|
||||||
|
|
||||||
|
# These parameters are customizable through settings.json
params = {
    # Number of closest-matching chunks to include in the prompt.
    'chunk_count': 5,
    # Chunk size in characters (not tokens), used when data is loaded.
    'chunk_length': 700,
    # Initial state of the "Strong cleanup" checkbox for URL input.
    'strong_cleanup': True,
}
|
||||||
|
|
||||||
|
|
||||||
class Collecter():
|
class Collecter():
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@ -105,7 +112,7 @@ def feed_file_into_collector(file, chunk_len):
|
|||||||
yield i
|
yield i
|
||||||
|
|
||||||
|
|
||||||
def feed_url_into_collector(urls, chunk_len):
|
def feed_url_into_collector(urls, chunk_len, strong_cleanup=False):
|
||||||
urls = urls.strip().split('\n')
|
urls = urls.strip().split('\n')
|
||||||
all_text = ''
|
all_text = ''
|
||||||
cumulative = ''
|
cumulative = ''
|
||||||
@ -117,7 +124,11 @@ def feed_url_into_collector(urls, chunk_len):
|
|||||||
for script in soup(["script", "style"]):
|
for script in soup(["script", "style"]):
|
||||||
script.extract()
|
script.extract()
|
||||||
|
|
||||||
text = soup.get_text()
|
strings = soup.stripped_strings
|
||||||
|
if strong_cleanup:
|
||||||
|
strings = [s for s in strings if re.search("[A-Za-z] ", s)]
|
||||||
|
|
||||||
|
text = '\n'.join([s.strip() for s in strings])
|
||||||
lines = (line.strip() for line in text.splitlines())
|
lines = (line.strip() for line in text.splitlines())
|
||||||
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
|
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
|
||||||
text = '\n\n'.join(chunk for chunk in chunks if chunk)
|
text = '\n\n'.join(chunk for chunk in chunks if chunk)
|
||||||
@ -251,6 +262,7 @@ def ui():
|
|||||||
|
|
||||||
with gr.Tab("URL input"):
|
with gr.Tab("URL input"):
|
||||||
url_input = gr.Textbox(lines=10, label='Input URLs', info='Enter one or more URLs separated by newline characters.')
|
url_input = gr.Textbox(lines=10, label='Input URLs', info='Enter one or more URLs separated by newline characters.')
|
||||||
|
strong_cleanup = gr.Checkbox(value=params['strong_cleanup'], label='Strong cleanup', info='Only keeps html elements that look like long-form text.')
|
||||||
update_url = gr.Button('Load data')
|
update_url = gr.Button('Load data')
|
||||||
|
|
||||||
with gr.Tab("File input"):
|
with gr.Tab("File input"):
|
||||||
@ -258,15 +270,15 @@ def ui():
|
|||||||
update_file = gr.Button('Load data')
|
update_file = gr.Button('Load data')
|
||||||
|
|
||||||
with gr.Tab("Generation settings"):
|
with gr.Tab("Generation settings"):
|
||||||
chunk_count = gr.Number(value=5, label='Chunk count', info='The number of closest-matching chunks to include in the prompt.')
|
chunk_count = gr.Number(value=params['chunk_count'], label='Chunk count', info='The number of closest-matching chunks to include in the prompt.')
|
||||||
update_settings = gr.Button('Apply changes')
|
update_settings = gr.Button('Apply changes')
|
||||||
|
|
||||||
chunk_len = gr.Number(value=700, label='Chunk length', info='In characters, not tokens. This value is used when you click on "Load data".')
|
chunk_len = gr.Number(value=params['chunk_length'], label='Chunk length', info='In characters, not tokens. This value is used when you click on "Load data".')
|
||||||
|
|
||||||
with gr.Column():
|
with gr.Column():
|
||||||
last_updated = gr.Markdown()
|
last_updated = gr.Markdown()
|
||||||
|
|
||||||
update_data.click(feed_data_into_collector, [data_input, chunk_len], last_updated, show_progress=False)
|
update_data.click(feed_data_into_collector, [data_input, chunk_len], last_updated, show_progress=False)
|
||||||
update_url.click(feed_url_into_collector, [url_input, chunk_len], last_updated, show_progress=False)
|
update_url.click(feed_url_into_collector, [url_input, chunk_len, strong_cleanup], last_updated, show_progress=False)
|
||||||
update_file.click(feed_file_into_collector, [file_input, chunk_len], last_updated, show_progress=False)
|
update_file.click(feed_file_into_collector, [file_input, chunk_len], last_updated, show_progress=False)
|
||||||
update_settings.click(apply_settings, [chunk_count], last_updated, show_progress=False)
|
update_settings.click(apply_settings, [chunk_count], last_updated, show_progress=False)
|
||||||
|
Loading…
Reference in New Issue
Block a user