mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2025-01-25 02:58:50 +01:00
337 lines
14 KiB
Python
337 lines
14 KiB
Python
|
# Code based on the Sane List Extension for Python-Markdown
|
||
|
# =======================================
|
||
|
|
||
|
# Modify the behavior of Lists in Python-Markdown to act in a sane manner.
|
||
|
|
||
|
# See https://Python-Markdown.github.io/extensions/sane_lists
|
||
|
# for documentation.
|
||
|
|
||
|
# Original code Copyright 2011 [Waylan Limberg](http://achinghead.com)
|
||
|
|
||
|
# All changes Copyright 2011-2014 The Python Markdown Project
|
||
|
|
||
|
# License: [BSD](https://opensource.org/licenses/bsd-license.php)
|
||
|
|
||
|
"""
|
||
|
Modify the behavior of Lists in Python-Markdown to act in a sane manner.
|
||
|
"""
|
||
|
|
||
|
from __future__ import annotations
|
||
|
|
||
|
import re
|
||
|
import xml.etree.ElementTree as etree
|
||
|
from typing import TYPE_CHECKING
|
||
|
|
||
|
from markdown import Extension
|
||
|
from markdown.blockparser import BlockParser
|
||
|
from markdown.blockprocessors import (
|
||
|
ListIndentProcessor,
|
||
|
OListProcessor,
|
||
|
ParagraphProcessor
|
||
|
)
|
||
|
|
||
|
if TYPE_CHECKING: # pragma: no cover
|
||
|
from markdown import blockparser
|
||
|
|
||
|
|
||
|
# Minimum number of additional leading spaces that makes a line start a
# nested list. The list processors in this module also use it as the width of
# one indentation step when stripping indentation.
MIN_NESTED_LIST_INDENT = 2

# Checked with an explicit raise rather than `assert` so the guard is still
# enforced when Python runs with `-O` (which strips assert statements).
if MIN_NESTED_LIST_INDENT <= 1:
    raise ValueError("'MIN_NESTED_LIST_INDENT' must be > 1")
|
||
|
|
||
|
|
||
|
class SaneListIndentProcessor(ListIndentProcessor):
    """ Process the indented children of list items.

    A block is treated as a child of a list item once it starts with at least
    `MIN_NESTED_LIST_INDENT` spaces.

    Example

        * a list item
          process this part

          or this part

    """

    def __init__(self, *args):
        super().__init__(*args)
        # Group 1 captures the whole run of leading spaces of a block.
        self.INDENT_RE = re.compile(r'^(([ ])+)')

    def test(self, parent: etree.Element, block: str) -> bool:
        """ Report whether `block` is an indented child of a list or list item. """
        if not block.startswith(' ' * MIN_NESTED_LIST_INDENT):
            return False
        if self.parser.state.isstate('detabbed'):
            return False
        if parent.tag in self.ITEM_TYPES:
            return True
        # Otherwise the parent's last child has to be a list.
        return (len(parent) and parent[-1] is not None and
                (parent[-1].tag in self.LIST_TYPES))

    def get_level(self, parent: etree.Element, block: str) -> tuple[int, etree.Element]:
        """ Get level of indentation based on list level.

        Returns the list level that was reached together with the element
        that should receive the block.
        """
        leading = self.INDENT_RE.match(block)
        # True division: an indent that is not a multiple of
        # `MIN_NESTED_LIST_INDENT` gives a fractional target.
        target = len(leading.group(1)) / MIN_NESTED_LIST_INDENT if leading else 0

        # In a tight list we are already at the correct parent (level 1);
        # in a loose list the parent still has to be found (level 0).
        level = 1 if self.parser.state.isstate('list') else 0

        # Descend through the last children until the target level is reached.
        while target > level:
            child = self.lastChild(parent)
            if child is None:
                break
            is_list = child.tag in self.LIST_TYPES
            if not (is_list or child.tag in self.ITEM_TYPES):
                # No more child levels. If we're short of the target,
                # we have a code block. So we stop here.
                break
            if is_list:
                level += 1
            parent = child
        return level, parent

    def detab(self, text: str, length: int | None = None) -> tuple[str, str]:
        """ Strip one indentation step from the leading lines of `text`.

        `length` is the number of spaces to strip and defaults to
        `MIN_NESTED_LIST_INDENT`. Processing stops at the first non-blank line
        that is not indented that far. Returns the stripped lines and the
        untouched remainder, each joined with newlines.
        """
        width = MIN_NESTED_LIST_INDENT if length is None else length
        prefix = ' ' * width
        lines = text.split('\n')
        stripped = []
        for line in lines:
            if line.startswith(prefix):
                stripped.append(line[width:])
            elif line.strip():
                # First under-indented line with content ends the run.
                break
            else:
                # Whitespace-only lines are kept as empty lines.
                stripped.append('')
        return '\n'.join(stripped), '\n'.join(lines[len(stripped):])

    def looseDetab(self, text: str, level: int = 1) -> str:
        """ Remove `level` indentation steps from every line that has them.

        Lines that are indented less are left unchanged.
        """
        width = MIN_NESTED_LIST_INDENT * level
        prefix = ' ' * width
        return '\n'.join(
            line[width:] if line.startswith(prefix) else line
            for line in text.split('\n')
        )
|
||
|
|
||
|
|
||
|
class SaneOListProcessor(OListProcessor):
    """ Ordered-list processor with `ul` removed from `SIBLING_TAGS` and `LAZY_OL` set to `False`. """

    # Exclude `ul` from the list of siblings.
    SIBLING_TAGS = ['ol']
    # Disable lazy list behavior.
    LAZY_OL = False

    def __init__(self, parser: blockparser.BlockParser):
        super().__init__(parser)
        # This restriction stems from the 'CodeBlockProcessor' class,
        # which automatically matches blocks with an indent = self.tab_length
        top_level_max = self.tab_length - 1
        child_max = MIN_NESTED_LIST_INDENT - 1
        nested_max = self.tab_length * 2 - 1
        # An item such as `1. item`, optionally preceded by up to two `*`/`_`.
        self.RE = re.compile(r'^[ ]{0,%d}[\*_]{0,2}\d+\.[ ]+(.*)' % top_level_max)
        # Items on secondary lines: group 1 is the `*`/`_` prefix, group 2 the
        # number with its dot, group 4 the item text.
        self.CHILD_RE = re.compile(r'^[ ]{0,%d}([\*_]{0,2})((\d+\.))[ ]+(.*)' % child_max)
        # Indented (nested) items of either list type.
        self.INDENT_RE = re.compile(
            r'^[ ]{%d,%d}[\*_]{0,2}((\d+\.)|[*+-])[ ]+.*' % (MIN_NESTED_LIST_INDENT, nested_max)
        )

    def run(self, parent: etree.Element, blocks: list[str]) -> None:
        """ Consume the first block and attach its list items below `parent`. """
        # A single block may hold several items.
        items = self.get_items(blocks.pop(0))
        sibling = self.lastChild(parent)

        if sibling is not None and sibling.tag in self.SIBLING_TAGS:
            # The previous block was a matching list, so continue that list.
            lst = sibling
            prev_item = lst[-1]
            # If the previous item has bare text it is not wrapped in a `p`
            # yet. Other children may exist, so the `p` is inserted first
            # instead of being appended.
            if prev_item.text:
                wrapper = etree.Element('p')
                wrapper.text = prev_item.text
                prev_item.text = ''
                prev_item.insert(0, wrapper)
            # A tail on the previous item's last child also goes into a `p`;
            # likely only when a header is not followed by a blank line.
            trailing = self.lastChild(prev_item)
            if trailing is not None and trailing.tail:
                etree.SubElement(prev_item, 'p').text = trailing.tail.lstrip()
                trailing.tail = ''

            # The first item is parsed in 'looselist' state as it gets
            # wrapped in a `p`.
            first_li = etree.SubElement(lst, 'li')
            self.parser.state.set('looselist')
            self.parser.parseBlocks(first_li, [items.pop(0)])
            self.parser.state.reset()
        elif parent.tag in ('ol', 'ul'):
            # Edge case of a multi-item indented list whose first item is in
            # a blank parent-list item:
            #     * * subitem1
            #         * subitem2
            # see also `ListIndentProcessor`
            lst = parent
        else:
            # A brand new list: create it with this processor's tag.
            lst = etree.SubElement(parent, self.TAG)
            # Record a custom start number when one was detected.
            if not self.LAZY_OL and self.STARTSWITH != '1':
                lst.attrib['start'] = self.STARTSWITH

        self.parser.state.set('list')
        nested_prefix = " " * MIN_NESTED_LIST_INDENT
        # Parse every remaining item recursively with the appropriate parent.
        for item in items:
            if item.startswith(nested_prefix):
                # Indented item: it belongs to the most recent `li`.
                target = lst[-1]
            else:
                # Regular item: it gets an `li` of its own.
                target = etree.SubElement(lst, 'li')
            self.parser.parseBlocks(target, [item])
        self.parser.state.reset()

    def looseDetab(self, text: str, indent_length: int, level: int = 1) -> str:
        """ Remove `indent_length * level` leading spaces from every line that has them.

        Lines that are indented less are left unchanged.
        """
        width = indent_length * level
        prefix = ' ' * width
        return '\n'.join(
            line[width:] if line.startswith(prefix) else line
            for line in text.split('\n')
        )

    def get_items(self, block: str) -> list[str]:
        """ Break a block into list items.

        As a side effect, `self.STARTSWITH` is set to the number of the first
        item found.
        """
        # If the first level of the list is indented, remove that indentation.
        lead = len(block) - len(block.lstrip())
        if lead > 0:
            block = self.looseDetab(block, lead)

        nested_prefix = ' ' * MIN_NESTED_LIST_INDENT
        items = []
        for line in block.split('\n'):
            found = self.CHILD_RE.match(line)
            if found:
                if not items:
                    # The first item supplies the start index of the list.
                    self.STARTSWITH = re.match(r'(\d+)', found.group(2)).group()
                items.append(found.group(1) + found.group(4))
                continue
            if self.INDENT_RE.match(line) and not items[-1].startswith(nested_prefix):
                # First indented (possibly nested) item after a regular one.
                items.append(line)
            else:
                # Either a further indented item following an indented one, or
                # another line of the previous item: append to that item.
                items[-1] = f'{items[-1]}\n{line}'
        return items
|
||
|
|
||
|
|
||
|
class SaneUListProcessor(SaneOListProcessor):
    """ Unordered-list counterpart: uses the `ul` tag and removes `ol` from `SIBLING_TAGS`. """

    TAG: str = 'ul'
    # Exclude `ol` from the list of siblings.
    SIBLING_TAGS = ['ul']

    def __init__(self, parser: blockparser.BlockParser):
        super().__init__(parser)
        top_level_max = self.tab_length - 1
        child_max = MIN_NESTED_LIST_INDENT - 1
        # An item such as `- item`, `+ item` or `* item`.
        self.RE = re.compile(r'^[ ]{0,%d}[*+-][ ]+(.*)' % top_level_max)
        # Items on secondary lines; group 3 is the item text.
        self.CHILD_RE = re.compile(r'^[ ]{0,%d}(([*+-]))[ ]+(.*)' % child_max)

    def get_items(self, block: str) -> list[str]:
        """ Break a block into list items. """
        # If the first level of the list is indented, remove that indentation.
        lead = len(block) - len(block.lstrip())
        if lead > 0:
            block = self.looseDetab(block, lead)

        nested_prefix = ' ' * MIN_NESTED_LIST_INDENT
        items = []
        for line in block.split('\n'):
            found = self.CHILD_RE.match(line)
            if found:
                # A new list item.
                items.append(found.group(3))
                continue
            if self.INDENT_RE.match(line) and not items[-1].startswith(nested_prefix):
                # First indented (possibly nested) item after a regular one.
                items.append(line)
            else:
                # Either a further indented item following an indented one, or
                # another line of the previous item: append to that item.
                items[-1] = f'{items[-1]}\n{line}'
        return items
|
||
|
|
||
|
|
||
|
class SaneParagraphProcessor(ParagraphProcessor):
    """ Process paragraph blocks, splitting off a list that follows a line ending in two whitespace characters. """

    def __init__(self, parser: BlockParser):
        super().__init__(parser)
        max_list_start_indent = self.tab_length - 1
        # Two whitespace characters and a newline, followed by a line that
        # opens with a digit, `+`, `*` or `-`; group 1 is that line's start.
        self.LIST_RE = re.compile(r"\s{2}\n(\s{0,%d}[\d+*-])" % max_list_start_indent)

    def run(self, parent: etree.Element, blocks: list[str]) -> None:
        """ Consume the first block and add its text below `parent`. """
        block = blocks.pop(0)
        if not block.strip():
            # Blank blocks are thrown away.
            return

        if self.parser.state.isstate('list'):
            # The parent is a tight-list.
            #
            # Check for any children. This will likely only happen in a
            # tight-list when a header isn't followed by a blank line.
            # For example:
            #
            #     * # Header
            #     Line 2 of list item - not part of header.
            sibling = self.lastChild(parent)
            if sibling is None:
                # No children: the text goes onto the parent itself.
                parent.text = f'{parent.text}\n{block}' if parent.text else block.lstrip()
            elif sibling.tail:
                # Children present: the text goes after the last one.
                sibling.tail = f'{sibling.tail}\n{block}'
            else:
                sibling.tail = f'\n{block}'
            return

        # Check whether the paragraph contains a list and split it off.
        list_part = None
        found = self.LIST_RE.search(block)
        if found:
            # Group 1 sits at the very end of the match, so its start is
            # where the list begins.
            split_at = found.start(1)
            list_part = block[split_at:]
            block = block[:split_at]

        # Create a regular paragraph.
        para = etree.SubElement(parent, 'p')
        para.text = block.lstrip()

        # If a list was found, parse its block separately with the paragraph
        # as the parent.
        if list_part:
            self.parser.parseBlocks(para, [list_part])
|
||
|
|
||
|
|
||
|
class SaneListExtension(Extension):
    """ Add sane lists to Markdown. """

    def extendMarkdown(self, md):
        """ Register the sane processors under the names `indent`, `olist`, `ulist` and `paragraph`. """
        # (processor class, registry name, priority), registered in this order.
        replacements = (
            (SaneListIndentProcessor, 'indent', 90),
            (SaneOListProcessor, 'olist', 40),
            (SaneUListProcessor, 'ulist', 30),
            (SaneParagraphProcessor, 'paragraph', 10),
        )
        for processor_cls, name, priority in replacements:
            md.parser.blockprocessors.register(processor_cls(md.parser), name, priority)
|
||
|
|
||
|
|
||
|
def makeExtension(**kwargs):  # pragma: no cover
    """ Build a `SaneListExtension`, forwarding all keyword arguments to it. """
    extension = SaneListExtension(**kwargs)
    return extension
|