mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-22 08:07:56 +01:00
Substitute superbooga Beautiful Soup parser (#2996)
* Add lxml to requirements * Change the Beautiful Soup parser to "lxml", which might be more tolerant of certain kinds of parsing errors than "html.parser" and quicker at the same time.
This commit is contained in:
parent
ab044a5a44
commit
1fc0b5041e
@ -2,3 +2,4 @@ beautifulsoup4==4.12.2
|
|||||||
chromadb==0.3.18
|
chromadb==0.3.18
|
||||||
posthog==2.4.2
|
posthog==2.4.2
|
||||||
sentence_transformers==2.2.2
|
sentence_transformers==2.2.2
|
||||||
|
lxml
|
||||||
|
@ -69,7 +69,7 @@ def feed_url_into_collector(urls, chunk_len, chunk_sep, strong_cleanup, threads)
|
|||||||
cumulative += 'Processing the HTML sources...'
|
cumulative += 'Processing the HTML sources...'
|
||||||
yield cumulative
|
yield cumulative
|
||||||
for content in contents:
|
for content in contents:
|
||||||
soup = BeautifulSoup(content, features="html.parser")
|
soup = BeautifulSoup(content, features="lxml")
|
||||||
for script in soup(["script", "style"]):
|
for script in soup(["script", "style"]):
|
||||||
script.extract()
|
script.extract()
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user