# Code based on the Sane List Extension for Python-Markdown # ======================================= # Modify the behavior of Lists in Python-Markdown to act in a sane manner. # See https://Python-Markdown.github.io/extensions/sane_lists # for documentation. # Original code Copyright 2011 [Waylan Limberg](http://achinghead.com) # All changes Copyright 2011-2014 The Python Markdown Project # License: [BSD](https://opensource.org/licenses/bsd-license.php) """ Modify the behavior of Lists in Python-Markdown to act in a sane manner. """ from __future__ import annotations import re import xml.etree.ElementTree as etree from typing import TYPE_CHECKING from markdown import Extension from markdown.blockparser import BlockParser from markdown.blockprocessors import ( ListIndentProcessor, OListProcessor, ParagraphProcessor ) if TYPE_CHECKING: # pragma: no cover from markdown import blockparser # The min. number of added leading spaces needed to start a nested list MIN_NESTED_LIST_INDENT = 2 assert MIN_NESTED_LIST_INDENT > 1, "'MIN_NESTED_LIST_INDENT' must be > 1" class SaneListIndentProcessor(ListIndentProcessor): """ Process children of list items. Example * a list item process this part or this part """ def __init__(self, *args): super().__init__(*args) self.INDENT_RE = re.compile(r'^(([ ])+)') def test(self, parent: etree.Element, block: str) -> bool: return block.startswith(' ' * MIN_NESTED_LIST_INDENT) and \ not self.parser.state.isstate('detabbed') and \ (parent.tag in self.ITEM_TYPES or (len(parent) and parent[-1] is not None and (parent[-1].tag in self.LIST_TYPES))) def get_level(self, parent: etree.Element, block: str) -> tuple[int, etree.Element]: """ Get level of indentation based on list level. """ # Get indent level m = self.INDENT_RE.match(block) if m: indent_level = len(m.group(1)) / MIN_NESTED_LIST_INDENT else: indent_level = 0 if self.parser.state.isstate('list'): # We're in a tight-list - so we already are at correct parent. level = 1 else: # We're in a loose-list - so we need to find parent. level = 0 # Step through children of tree to find matching indent level. while indent_level > level: child = self.lastChild(parent) if (child is not None and (child.tag in self.LIST_TYPES or child.tag in self.ITEM_TYPES)): if child.tag in self.LIST_TYPES: level += 1 parent = child else: # No more child levels. If we're short of `indent_level`, # we have a code block. So we stop here. break return level, parent def detab(self, text: str, length: int | None = None) -> tuple[str, str]: """ Remove a tab from the front of each line of the given text. """ if length is None: length = MIN_NESTED_LIST_INDENT newtext = [] lines = text.split('\n') for line in lines: if line.startswith(' ' * length): newtext.append(line[length:]) elif not line.strip(): newtext.append('') else: break return '\n'.join(newtext), '\n'.join(lines[len(newtext):]) def looseDetab(self, text: str, level: int = 1) -> str: """ Remove indentation from front of lines but allowing dedented lines. """ lines = text.split('\n') for i in range(len(lines)): if lines[i].startswith(' ' * MIN_NESTED_LIST_INDENT * level): lines[i] = lines[i][MIN_NESTED_LIST_INDENT * level:] return '\n'.join(lines) class SaneOListProcessor(OListProcessor): """ Override `SIBLING_TAGS` to not include `ul` and set `LAZY_OL` to `False`. """ SIBLING_TAGS = ['ol'] """ Exclude `ul` from list of siblings. """ LAZY_OL = False """ Disable lazy list behavior. """ def __init__(self, parser: blockparser.BlockParser): super().__init__(parser) # This restriction stems from the 'CodeBlockProcessor' class, # which automatically matches blocks with an indent = self.tab_length max_list_start_indent = self.tab_length - 1 # Detect an item (e.g., `1. item`) self.RE = re.compile(r'^[ ]{0,%d}[\*_]{0,2}\d+\.[ ]+(.*)' % max_list_start_indent) # Detect items on secondary lines. they can be of either list type. self.CHILD_RE = re.compile(r'^[ ]{0,%d}([\*_]{0,2})((\d+\.))[ ]+(.*)' % (MIN_NESTED_LIST_INDENT - 1)) # Detect indented (nested) items of either type self.INDENT_RE = re.compile(r'^[ ]{%d,%d}[\*_]{0,2}((\d+\.)|[*+-])[ ]+.*' % (MIN_NESTED_LIST_INDENT, self.tab_length * 2 - 1)) def run(self, parent: etree.Element, blocks: list[str]) -> None: # Check for multiple items in one block. items = self.get_items(blocks.pop(0)) sibling = self.lastChild(parent) if sibling is not None and sibling.tag in self.SIBLING_TAGS: # Previous block was a list item, so set that as parent lst = sibling # make sure previous item is in a `p` - if the item has text, # then it isn't in a `p` if lst[-1].text: # since it's possible there are other children for this # sibling, we can't just `SubElement` the `p`, we need to # insert it as the first item. p = etree.Element('p') p.text = lst[-1].text lst[-1].text = '' lst[-1].insert(0, p) # if the last item has a tail, then the tail needs to be put in a `p` # likely only when a header is not followed by a blank line lch = self.lastChild(lst[-1]) if lch is not None and lch.tail: p = etree.SubElement(lst[-1], 'p') p.text = lch.tail.lstrip() lch.tail = '' # parse first block differently as it gets wrapped in a `p`. li = etree.SubElement(lst, 'li') self.parser.state.set('looselist') firstitem = items.pop(0) self.parser.parseBlocks(li, [firstitem]) self.parser.state.reset() elif parent.tag in ['ol', 'ul']: # this catches the edge case of a multi-item indented list whose # first item is in a blank parent-list item: # * * subitem1 # * subitem2 # see also `ListIndentProcessor` lst = parent else: # This is a new list so create parent with appropriate tag. lst = etree.SubElement(parent, self.TAG) # Check if a custom start integer is set if not self.LAZY_OL and self.STARTSWITH != '1': lst.attrib['start'] = self.STARTSWITH self.parser.state.set('list') # Loop through items in block, recursively parsing each with the # appropriate parent. for item in items: if item.startswith(" " * MIN_NESTED_LIST_INDENT): # Item is indented. Parse with last item as parent self.parser.parseBlocks(lst[-1], [item]) else: # New item. Create `li` and parse with it as parent li = etree.SubElement(lst, 'li') self.parser.parseBlocks(li, [item]) self.parser.state.reset() def looseDetab(self, text: str, indent_length: int, level: int = 1) -> str: """ Remove indentation from front of lines but allowing dedented lines. """ lines = text.split('\n') for i in range(len(lines)): if lines[i].startswith(' ' * indent_length * level): lines[i] = lines[i][indent_length * level:] return '\n'.join(lines) def get_items(self, block: str) -> list[str]: """ Break a block into list items. """ # If first level of list is indented, remove that indentation if (indent_len := len(block) - len(block.lstrip())) > 0: block = self.looseDetab(block, indent_len) items = [] for line in block.split('\n'): m = self.CHILD_RE.match(line) if m: # This is a new list item # Check first item for the start index if not items: # Detect the integer value of first list item INTEGER_RE = re.compile(r'(\d+)') self.STARTSWITH = INTEGER_RE.match(m.group(2)).group() # Append to the list items.append(m.group(1) + m.group(4)) elif self.INDENT_RE.match(line): # This is an indented (possibly nested) item. if items[-1].startswith(' ' * MIN_NESTED_LIST_INDENT): # Previous item was indented. Append to that item. items[-1] = '{}\n{}'.format(items[-1], line) else: items.append(line) else: # This is another line of previous item. Append to that item. items[-1] = '{}\n{}'.format(items[-1], line) return items class SaneUListProcessor(SaneOListProcessor): """ Override `SIBLING_TAGS` to not include `ol`. """ TAG: str = 'ul' SIBLING_TAGS = ['ul'] """ Exclude `ol` from list of siblings. """ def __init__(self, parser: blockparser.BlockParser): super().__init__(parser) # Detect an item (e.g., `- item` or `+ item` or `* item`). max_list_start_indent = self.tab_length - 1 self.RE = re.compile(r'^[ ]{0,%d}[*+-][ ]+(.*)' % max_list_start_indent) self.CHILD_RE = re.compile(r'^[ ]{0,%d}(([*+-]))[ ]+(.*)' % (MIN_NESTED_LIST_INDENT - 1)) def get_items(self, block: str) -> list[str]: """ Break a block into list items. """ # If first level of list is indented, remove that indentation if (indent_len := len(block) - len(block.lstrip())) > 0: block = self.looseDetab(block, indent_len) items = [] for line in block.split('\n'): m = self.CHILD_RE.match(line) if m: # Append to the list items.append(m.group(3)) elif self.INDENT_RE.match(line): # This is an indented (possibly nested) item. if items[-1].startswith(' ' * MIN_NESTED_LIST_INDENT): # Previous item was indented. Append to that item. items[-1] = '{}\n{}'.format(items[-1], line) else: items.append(line) else: # This is another line of previous item. Append to that item. items[-1] = '{}\n{}'.format(items[-1], line) return items class SaneParagraphProcessor(ParagraphProcessor): """ Process Paragraph blocks. """ def __init__(self, parser: BlockParser): super().__init__(parser) max_list_start_indent = self.tab_length - 1 self.LIST_RE = re.compile(r"\s{2}\n(\s{0,%d}[\d+*-])" % max_list_start_indent) def run(self, parent: etree.Element, blocks: list[str]) -> None: block = blocks.pop(0) if block.strip(): # Not a blank block. Add to parent, otherwise throw it away. if self.parser.state.isstate('list'): # The parent is a tight-list. # # Check for any children. This will likely only happen in a # tight-list when a header isn't followed by a blank line. # For example: # # * # Header # Line 2 of list item - not part of header. sibling = self.lastChild(parent) if sibling is not None: # Insert after sibling. if sibling.tail: sibling.tail = '{}\n{}'.format(sibling.tail, block) else: sibling.tail = '\n%s' % block else: # Append to parent.text if parent.text: parent.text = '{}\n{}'.format(parent.text, block) else: parent.text = block.lstrip() else: # Check if paragraph contains a list next_list_block = None if list_match := self.LIST_RE.search(block): list_start = list_match.end() - len(list_match.group(1)) next_list_block = block[list_start:] block = block[:list_start] # Create a regular paragraph p = etree.SubElement(parent, 'p') p.text = block.lstrip() # If a list was found, parse its block separately with the paragraph as the parent if next_list_block: self.parser.parseBlocks(p, [next_list_block]) class SaneListExtension(Extension): """ Add sane lists to Markdown. """ def extendMarkdown(self, md): """ Override existing Processors. """ md.parser.blockprocessors.register(SaneListIndentProcessor(md.parser), 'indent', 90) md.parser.blockprocessors.register(SaneOListProcessor(md.parser), 'olist', 40) md.parser.blockprocessors.register(SaneUListProcessor(md.parser), 'ulist', 30) md.parser.blockprocessors.register(SaneParagraphProcessor(md.parser), 'paragraph', 10) def makeExtension(**kwargs): # pragma: no cover return SaneListExtension(**kwargs)