text-generation-webui/modules/html_generator.py

397 lines
14 KiB
Python
Raw Normal View History

2024-03-17 08:29:03 -07:00
import functools
2023-08-16 09:23:29 -07:00
import html
import os
2023-01-06 23:14:08 -03:00
import re
import time
from pathlib import Path
2023-01-06 23:14:08 -03:00
2023-03-15 12:33:26 -03:00
import markdown
2023-04-04 22:52:15 -03:00
from PIL import Image, ImageOps
2024-01-04 04:27:26 +01:00
from modules import shared
from modules.sane_markdown_lists import SaneListExtension
2024-01-22 03:25:55 -08:00
from modules.utils import get_available_chat_styles
# Cache of profile-picture thumbnails, keyed by the source image path.
# Each value is a two-item list written by get_image_cache():
# [mtime of the source file, thumbnail path as a POSIX-style string].
image_cache = {}
def minify_css(css: str) -> str:
    """Return *css* squeezed onto a single line.

    The substitutions are applied strictly in the order listed below; each
    pass works on the output of the previous one.
    """
    passes = (
        # 1. Strip /* ... */ comments (may span several lines)
        (r'/\*.*?\*/', '', re.DOTALL),
        # 2. Trim spaces/tabs at both ends of every line
        (r'^[ \t]*|[ \t]*$', '', re.MULTILINE),
        # 3. Drop whitespace following ':', '{', ';' or ','
        (r'([:{;,])\s+', r'\1', 0),
        # 4. Drop whitespace in front of '{'
        (r'\s+{', '{', 0),
        # 5. Blank out whitespace-only lines
        (r'^\s*$', '', re.MULTILINE),
        # 6. Join everything into one line
        (r'\n', '', 0),
    )

    minified = css
    for regex, substitute, flags in passes:
        minified = re.sub(regex, substitute, minified, flags=flags)

    return minified
2023-03-15 14:19:28 -03:00
# Built-in stylesheets, located relative to this module's own directory.
with open(Path(__file__).resolve().parent / '../css/html_readable_style.css', 'r') as f:
    readable_css = f.read()

with open(Path(__file__).resolve().parent / '../css/html_instruct_style.css', 'r') as f:
    instruct_css = f.read()

# Custom chat styles, keyed by style name.
# NOTE: unlike the two files above, these paths are relative to the current
# working directory rather than to this module.
chat_styles = {}
for k in get_available_chat_styles():
    # Read through a context manager so the file handle is closed right away
    # instead of being left for the garbage collector.
    with open(Path(f'css/chat_style-{k}.css'), 'r') as f:
        chat_styles[k] = f.read()

# Handle styles that derive from other styles: when the first line of a
# stylesheet names another "chat_style-<name>.css", that line is dropped and
# the named style's CSS is prepended. Only existing keys are reassigned, so
# updating the dict while iterating over it is safe.
for k in chat_styles:
    lines = chat_styles[k].split('\n')
    input_string = lines[0]
    match = re.search(r'chat_style-([a-z\-]*)\.css', input_string)
    if match:
        style = match.group(1)
        chat_styles[k] = chat_styles.get(style, '') + '\n\n' + '\n'.join(lines[1:])

# Reduce the size of the CSS sources above
readable_css = minify_css(readable_css)
instruct_css = minify_css(instruct_css)
for k in chat_styles:
    chat_styles[k] = minify_css(chat_styles[k])
2023-03-17 16:06:11 -03:00
def fix_newlines(string):
    """Turn every line break into exactly one blank line and trim the ends.

    Each newline is doubled first, then any run of three or more newlines is
    collapsed back to two, and finally surrounding whitespace is stripped.
    """
    doubled = string.replace('\n', '\n\n')
    return re.sub(r"\n{3,}", "\n\n", doubled).strip()
2023-04-16 21:26:19 -03:00
def replace_quotes(text):
    """Wrap quoted passages in ``<q>`` tags, keeping the quote marks themselves.

    A passage is an opening mark, the shortest possible run of characters
    (newlines included) and the matching closing mark. Both the literal
    characters and their HTML-entity spellings are recognised, so the function
    works on raw as well as HTML-escaped text.

    Args:
        text: The string to scan.

    Returns:
        ``text`` with every quoted passage replaced by ``<q>passage</q>``.
    """
    # (opening, closing) pairs. Earlier pairs win when two could start at the
    # same position.
    quote_pairs = [
        ('"', '"'),  # Double quotes
        ('“', '”'),  # Unicode left and right double quotation marks
        ('‘', '’'),  # Unicode left and right single quotation marks
        ('«', '»'),  # French quotes
        ('„', '“'),  # German quotes
        ('&quot;', '&quot;'),  # Double quotes (named entity)
        ('&ldquo;', '&rdquo;'),  # Double quotation marks (named entities)
        ('&lsquo;', '&rsquo;'),  # Single quotation marks (named entities)
        ('&laquo;', '&raquo;'),  # French quotes (named entities)
        ('&bdquo;', '&ldquo;'),  # German quotes (named entities)
        ('&#8220;', '&#8221;'),  # Unicode quotes (numeric entities)
        ('&#x201C;', '&#x201D;'),  # Unicode quotes (hex entities)
    ]

    # One non-capturing alternative per pair. The whole match is wrapped via
    # group(0): with capturing groups per alternative, only the groups of the
    # alternative that matched are set, so reading fixed group numbers would
    # yield None for every pair but the first.
    pattern = '|'.join(
        f'(?:{re.escape(open_q)}.*?{re.escape(close_q)})'
        for open_q, close_q in quote_pairs
    )

    # A callable replacement avoids backslash processing of the matched text
    return re.sub(pattern, lambda m: f'<q>{m.group(0)}</q>', text, flags=re.DOTALL)
2023-04-16 18:00:12 -03:00
def replace_blockquote(m):
    """Turn a matched blockquote environment into '> '-prefixed text.

    Takes a regex match object; every newline inside the matched text gets a
    '> ' appended after it, and the begin/end blockquote markers are removed.
    """
    quoted = m.group().replace('\n', '\n> ')
    for marker in ('\\begin{blockquote}', '\\end{blockquote}'):
        quoted = quoted.replace(marker, '')
    return quoted
2023-04-16 21:26:19 -03:00
def add_long_list_class(html):
    '''
    Tags every <ul>/<ol> that holds at least one long <li> with the
    "long-list" class (the CSS gives those lists a smaller margin/padding).
    Lists that overlap a <pre> or <code> block are returned untouched.
    '''

    # Character spans of all <pre>...</pre> and <code>...</code> blocks
    protected_spans = [
        found.span()
        for block_regex in (r'<pre.*?>.*?</pre>', r'<code.*?>.*?</code>')
        for found in re.finditer(block_regex, html, re.DOTALL)
    ]

    # Opening tag, inner content and closing tag of a list; content of an item
    list_regex = re.compile(r'(<[uo]l.*?>)(.*?)(</[uo]l>)', re.DOTALL)
    item_regex = re.compile(r'<li.*?>(.*?)</li>', re.DOTALL)

    def touches_protected(span):
        # True when either end of the span lies strictly inside a block
        lo, hi = span
        for block_lo, block_hi in protected_spans:
            if block_lo < lo < block_hi or block_lo < hi < block_hi:
                return True
        return False

    def rewrite_list(found):
        if touches_protected(found.span()):
            return found.group(0)

        open_tag, inner, close_tag = found.groups()

        # Length of the longest stripped item (0 when the list has no items)
        longest = max(
            (len(item.group(1).strip()) for item in item_regex.finditer(inner)),
            default=0,
        )
        if longest > 224:
            if 'class=' in open_tag:
                # Extend the existing class attribute
                open_tag = re.sub(r'class="([^"]*)"', r'class="\1 long-list"', open_tag)
            else:
                # No class yet: insert one just before the closing '>'
                open_tag = open_tag[:-1] + ' class="long-list">'

        return open_tag + inner + close_tag

    return list_regex.sub(rewrite_list, html)
@functools.lru_cache(maxsize=None)
def convert_to_markdown(string):
    """Render a message string to HTML through the ``markdown`` package.

    The input is pre-processed (LaTeX delimiters, backslash escaping, quotes,
    blockquote/code/equation environments, per-line newline handling) before
    being handed to ``markdown.markdown``; the resulting HTML then gets its
    code blocks unescaped and its long lists tagged.

    Results are memoized without a size limit (``maxsize=None``); the
    undecorated function remains reachable as ``convert_to_markdown.__wrapped__``.

    Args:
        string: The text to render. It must be hashable because of the cache.

    Returns:
        The generated HTML as a string.
    """

    # Make \[ \] LaTeX equations inline: a multi-line "\[ ... \]" block is
    # folded into a single "\[ ... \]" span.
    pattern = r'^\s*\\\[\s*\n([\s\S]*?)\n\s*\\\]\s*$'
    replacement = r'\\[ \1 \\]'
    string = re.sub(pattern, replacement, string, flags=re.MULTILINE)

    # Escape backslashes (every backslash is doubled from here on)
    string = string.replace('\\', '\\\\')

    # Quote to <q></q>
    string = replace_quotes(string)

    # Blockquote: turn an escaped "&gt;" at the start of the text or of a line
    # back into a literal ">"
    string = re.sub(r'(^|[\n])&gt;', r'\1>', string)

    # Rewrite \begin{blockquote}...\end{blockquote} spans via replace_blockquote
    pattern = re.compile(r'\\begin{blockquote}(.*?)\\end{blockquote}', re.DOTALL)
    string = pattern.sub(replace_blockquote, string)

    # Code and equation environments become ``` fences and $$ delimiters.
    # NOTE(review): these markers are matched after all backslashes were
    # doubled above, so one backslash is left in front of each replacement.
    # Confirm that this is intended.
    string = string.replace('\\begin{code}', '```')
    string = string.replace('\\end{code}', '```')
    string = string.replace('\\begin{align*}', '$$')
    string = string.replace('\\end{align*}', '$$')
    string = string.replace('\\begin{align}', '$$')
    string = string.replace('\\end{align}', '$$')
    string = string.replace('\\begin{equation}', '$$')
    string = string.replace('\\end{equation}', '$$')
    string = string.replace('\\begin{equation*}', '$$')
    string = string.replace('\\end{equation*}', '$$')

    # Make sure a ``` fence that follows another character starts its own line
    string = re.sub(r"(.)```", r"\1\n```", string)

    # Walk the text line by line, tracking whether the current line is inside
    # a fenced code block or a LaTeX block.
    result = ''
    is_code = False
    is_latex = False
    for line in string.split('\n'):
        stripped_line = line.strip()

        if stripped_line.startswith('```'):
            is_code = not is_code
        elif stripped_line.startswith('$$'):
            is_latex = not is_latex
        elif stripped_line.endswith('$$'):
            is_latex = False
        # The delimiters below carry two backslashes because of the escaping
        # step above
        elif stripped_line.startswith('\\\\['):
            is_latex = True
        elif stripped_line.startswith('\\\\]'):
            is_latex = False
        elif stripped_line.endswith('\\\\]'):
            is_latex = False

        result += line

        # Don't add an extra \n for code, LaTeX, or tables
        if is_code or is_latex or line.startswith('|'):
            result += '\n'
        # Also don't add an extra \n for lists
        # NOTE(review): as written, this branch and the else branch append the
        # same suffix, so the list/blockquote check has no effect. Verify the
        # whitespace inside both literals.
        elif stripped_line.startswith('-') or stripped_line.startswith('*') or stripped_line.startswith('+') or stripped_line.startswith('>') or re.match(r'\d+\.', stripped_line):
            result += ' \n'
        else:
            result += ' \n'

    result = result.strip()
    if is_code:
        result += '\n```'  # Unfinished code block

    # Unfinished list, like "\n1.". A |delete| string is added and then
    # removed to force a <ol> or <ul> to be generated instead of a <p>.
    list_item_pattern = r'(\n\d+\.?|\n\s*[-*+]\s*([*_~]{1,3})?)$'
    if re.search(list_item_pattern, result):
        delete_str = '|delete|'

        # A trailing bare number gets its "." so it reads as an ordered item
        if re.search(r'(\d+\.?)$', result) and not result.endswith('.'):
            result += '.'

        # Add the delete string after the list item
        result = re.sub(list_item_pattern, r'\g<1> ' + delete_str, result)

        # Convert to HTML using markdown
        html_output = markdown.markdown(result, extensions=['fenced_code', 'tables', SaneListExtension()])

        # Remove the delete string from the HTML output (last occurrence only)
        pos = html_output.rfind(delete_str)
        if pos > -1:
            html_output = html_output[:pos] + html_output[pos + len(delete_str):]
    else:
        # Convert to HTML using markdown
        html_output = markdown.markdown(result, extensions=['fenced_code', 'tables', SaneListExtension()])

    # Unescape code blocks: HTML entities inside each <code>...</code> match
    # (tags included) are converted back to characters
    pattern = re.compile(r'<code[^>]*>(.*?)</code>', re.DOTALL)
    html_output = pattern.sub(lambda x: html.unescape(x.group()), html_output)

    # Add "long-list" class to <ul> or <ol> containing a long <li> item
    html_output = add_long_list_class(html_output)

    return html_output
2023-03-17 16:06:11 -03:00
def convert_to_markdown_wrapped(string, use_cache=True):
    '''
    Dispatches to the memoized convert_to_markdown, or to its undecorated
    original when use_cache is false, so that text that is still being
    streamed does not fill the cache.
    '''
    renderer = convert_to_markdown if use_cache else convert_to_markdown.__wrapped__
    return renderer(string)
2023-03-17 16:06:11 -03:00
def generate_basic_html(string):
    """Render *string* as HTML inside the "readable" container.

    The markdown cache is emptied first, then the text is converted and
    wrapped together with the readable stylesheet.
    """
    convert_to_markdown.cache_clear()
    body = convert_to_markdown(string)
    return f'<style>{readable_css}</style><div class="readable-container">{body}</div>'
2023-01-15 16:43:31 -03:00
2023-04-04 23:03:58 -03:00
def make_thumbnail(image):
    """Scale a PIL image to a 350px-wide thumbnail, at most 470px tall.

    The image is first resized to a width of 350 while keeping its aspect
    ratio. If the result is taller than 470, it is cropped/fitted to 350x470.

    Args:
        image: A PIL image (anything exposing ``size`` and ``resize``).

    Returns:
        The resized (and possibly fitted) image.
    """
    target_width, max_height = 350, 470
    src_width, src_height = image.size

    # Extremely wide images would round to a height of 0, which resize()
    # rejects; keep at least one pixel row.
    scaled_height = max(1, round(src_height / src_width * target_width))
    image = image.resize((target_width, scaled_height), Image.Resampling.LANCZOS)

    if image.size[1] > max_height:
        # Use the same Resampling enum as above instead of the legacy
        # module-level constant.
        image = ImageOps.fit(image, (target_width, max_height), Image.Resampling.LANCZOS)

    return image
def get_image_cache(path):
    """Return the path of a cached PNG thumbnail for the image at *path*.

    The thumbnail is (re)generated when *path* has no cache entry yet or when
    the file's modification time differs from the cached one. Thumbnails are
    stored in ``shared.args.disk_cache_dir`` as ``cache_<name>.png``; a file
    using the older ``<name>_cache.png`` naming is moved to the new name first.

    Args:
        path: A ``pathlib.Path`` pointing at the source image (its ``name``
            attribute is used to build the cache file name).

    Returns:
        The thumbnail location as a POSIX-style string.
    """
    cache_folder = Path(shared.args.disk_cache_dir)
    # parents/exist_ok: cope with nested cache locations and with another
    # caller creating the folder at the same moment
    cache_folder.mkdir(parents=True, exist_ok=True)

    mtime = os.stat(path).st_mtime
    cached = image_cache.get(path)
    if cached is None or cached[0] != mtime:
        # Close the source file as soon as the thumbnail has been produced
        with Image.open(path) as source:
            img = make_thumbnail(source)

        old_p = Path(f'{cache_folder}/{path.name}_cache.png')
        p = Path(f'{cache_folder}/cache_{path.name}.png')
        if old_p.exists():
            # replace() overwrites an existing target on every platform,
            # whereas rename() raises on Windows when the target exists
            old_p.replace(p)

        output_file = p
        img.convert('RGBA').save(output_file, format='PNG')
        image_cache[path] = [mtime, output_file.as_posix()]

    return image_cache[path][1]
2023-04-05 11:49:59 -03:00
def generate_instruct_html(history):
    """Build the instruct-mode chat HTML for a list of message rows.

    Each row holds the user entry at index 0 and the assistant entry at
    index 1. Empty user messages are skipped; the assistant message is always
    emitted. The last row is rendered without the markdown cache.
    """
    final_index = len(history) - 1
    chunks = [f'<style>{instruct_css}</style><div class="chat" id="chat"><div class="messages">']

    for index, exchange in enumerate(history):
        rendered = [
            convert_to_markdown_wrapped(entry, use_cache=index != final_index)
            for entry in exchange
        ]

        if rendered[0]:  # Don't display empty user messages
            chunks.append(
                '<div class="user-message">'
                '<div class="text">'
                f'<div class="message-body">{rendered[0]}</div>'
                '</div>'
                '</div>'
            )

        chunks.append(
            '<div class="assistant-message">'
            '<div class="text">'
            f'<div class="message-body">{rendered[1]}</div>'
            '</div>'
            '</div>'
        )

    chunks.append("</div></div>")
    return ''.join(chunks)
2023-12-04 02:45:50 +01:00
def generate_cai_chat_html(history, name1, name2, style, character, reset_cache=False):
    """Build the chat HTML for a custom chat style with profile pictures.

    name1/name2 label the user and bot messages, ``style`` selects the entry
    of ``chat_styles`` to embed. Empty user messages are skipped, and the last
    row is rendered without the markdown cache.
    """
    chunks = [f'<style>{chat_styles[style]}</style><div class="chat" id="chat"><div class="messages">']

    # We use ?character and ?time.time() to force the browser to reset caches
    img_bot = ''
    if Path("cache/pfp_character_thumb.png").exists():
        img_bot = f'<img src="file/cache/pfp_character_thumb.png?{character}" class="pfp_character">'

    img_me = ''
    if Path("cache/pfp_me.png").exists():
        cache_buster = time.time() if reset_cache else ""
        img_me = f'<img src="file/cache/pfp_me.png?{cache_buster}">'

    final_index = len(history) - 1
    for index, exchange in enumerate(history):
        rendered = [
            convert_to_markdown_wrapped(entry, use_cache=index != final_index)
            for entry in exchange
        ]

        if rendered[0]:  # Don't display empty user messages
            chunks.append(
                '<div class="message">'
                f'<div class="circle-you">{img_me}</div>'
                '<div class="text">'
                f'<div class="username">{name1}</div>'
                f'<div class="message-body">{rendered[0]}</div>'
                '</div>'
                '</div>'
            )

        chunks.append(
            '<div class="message">'
            f'<div class="circle-bot">{img_bot}</div>'
            '<div class="text">'
            f'<div class="username">{name2}</div>'
            f'<div class="message-body">{rendered[1]}</div>'
            '</div>'
            '</div>'
        )

    chunks.append("</div></div>")
    return ''.join(chunks)
2023-04-05 11:49:59 -03:00
2023-04-16 16:44:50 -03:00
def generate_chat_html(history, name1, name2, reset_cache=False):
    """Build the chat HTML using the "wpp" chat style.

    name1, name2 and reset_cache are accepted but not used by this renderer.
    Empty user messages are skipped, and the last row is rendered without the
    markdown cache.
    """
    final_index = len(history) - 1
    chunks = [f'<style>{chat_styles["wpp"]}</style><div class="chat" id="chat"><div class="messages">']

    for index, exchange in enumerate(history):
        rendered = [
            convert_to_markdown_wrapped(entry, use_cache=index != final_index)
            for entry in exchange
        ]

        if rendered[0]:  # Don't display empty user messages
            chunks.append(
                '<div class="message">'
                '<div class="text-you">'
                f'<div class="message-body">{rendered[0]}</div>'
                '</div>'
                '</div>'
            )

        chunks.append(
            '<div class="message">'
            '<div class="text-bot">'
            f'<div class="message-body">{rendered[1]}</div>'
            '</div>'
            '</div>'
        )

    chunks.append("</div></div>")
    return ''.join(chunks)
2023-04-05 11:49:59 -03:00
2023-12-04 02:45:50 +01:00
def chat_html_wrapper(history, name1, name2, mode, style, character, reset_cache=False):
    """Pick the HTML renderer for the visible chat history.

    'instruct' mode takes precedence; otherwise the 'wpp' style gets its own
    renderer and every other style goes through the generic styled renderer.
    """
    visible = history['visible']

    if mode == 'instruct':
        return generate_instruct_html(visible)

    if style == 'wpp':
        return generate_chat_html(visible, name1, name2)

    return generate_cai_chat_html(visible, name1, name2, style, character, reset_cache)