From 7871c99a5ef791a5ce24c4b1d016a8b4200baf34 Mon Sep 17 00:00:00 2001 From: Daniel Schadt Date: Thu, 19 Aug 2021 13:10:43 +0200 Subject: Add an internal Document representation Doing everything on strings is kinda wonky, so this adds an intermediate representation. The idea behind this is that the pipeline now goes Wikicode [1]-> Document [2]-> Output String Where step 1 takes care of templates and everything, and step 2 does the actual output formatting. This has the benefit that we can support multiple output types, some with more and some with less features (e.g., adding a Markdown output which keeps some of the original formatting intact), and it has the benefit of being less wonky (no hacks with "" for numbered lists, more streamlined formatting with newlines, ...). --- wikimini/__init__.py | 185 ++++++++--------- wikimini/document.py | 438 ++++++++++++++++++++++++++++++++++++++++ wikimini/templates/cite.py | 3 +- wikimini/templates/convert.py | 9 +- wikimini/templates/language.py | 21 +- wikimini/templates/mainlinks.py | 5 +- wikimini/templates/quotes.py | 11 +- wikimini/templates/various.py | 5 +- 8 files changed, 555 insertions(+), 122 deletions(-) create mode 100644 wikimini/document.py diff --git a/wikimini/__init__.py b/wikimini/__init__.py index b93ef81..fac0c84 100644 --- a/wikimini/__init__.py +++ b/wikimini/__init__.py @@ -4,8 +4,9 @@ import re from tabulate import tabulate -from typing import Union, Tuple +from typing import List, Union, Tuple +from .document import * #: The default API URL, pointing to the english Wikipedia. API_URL = "https://en.wikipedia.org/w/api.php" @@ -77,66 +78,71 @@ class Wikimini: text = revision["slots"]["main"]["content"] return (title, mwp.parse(text)) - def _convert(self, obj): - """Function that does the actual conversion. + def convert( + self, + obj: Union[mwp.wikicode.Wikicode, mwp.nodes.Node], + ) -> Union[Document, List[Node], List[Block]]: + """Function that converts and renders a node. - This is called recursively on each node, and should perform the correct - conversion - based on the node type. + This function is exposed for template implementors, for normal usage, + see :meth:`convert_to_document`. + + The input and output of this function is as follows: + + * If ``obj`` is a :class:`~mwparserfromhell.wikicode.Wikicode`, then + :meth:`convert` will return a :class:`document.Document`. + * If ``obj`` is a :class:`~mwparserfromhell.nodes.Node`, then + :meth:`convert` will return either a list of :class:`document.Node` + or a list of :class:`document.Block`, depending on whether the + converted object is inline (like a link), or a block object (like a + quote). + + Note that in the last case, the empty list ``[]`` might be returned, + indicating that the object should not be included in the output. + + Args: + obj: The object to convert. + + Returns: + The converted object. """ default = lambda obj:\ - mwp.wikicode.Wikicode([obj]).strip_code(collapse=False) + [Plain(mwp.wikicode.Wikicode([obj]).strip_code(collapse=False))] - # This does the actual conversion if isinstance(obj, mwp.wikicode.Wikicode): - converted = [] - iterator = iter(enumerate(obj.nodes)) - for i, node in iterator: - # Pattern: * [[Wikilink]]\n - if (i >= 2 and - i + 1 < len(obj.nodes) and - # Links can have a plural s after them - re.match("s?\n", str(obj.nodes[i+1])) and - isinstance(node, mwp.nodes.wikilink.Wikilink) and - str(obj.nodes[i-1]) == " " and - str(obj.nodes[i-2]) == "*"): - converted.pop() - converted.pop() - _, after = next(iterator) - converted.append("=> {} {}{}".format( - self.page_url(str(node.title)), - self._convert(node), - self._convert(after), - )) - continue - # Pattern: *[[Wikilink]]\n - elif (i >= 1 and - i + 1 < len(obj.nodes) and - re.match("s?\n", str(obj.nodes[i+1])) and - isinstance(node, mwp.nodes.wikilink.Wikilink) and - str(obj.nodes[i-1]) == "*"): - converted.pop() - _, after = next(iterator) - converted.append("=> {} {}{}".format( - self.page_url(str(node.title)), - self._convert(node), - self._convert(after), - )) - continue - - # Default: Just convert the node - converted.append(self._convert(node)) - return "".join(converted) + document = [] + for node in obj.nodes: + current = self.convert(node) + + if current == []: + pass + # Special case: We're starting a list, but we're already in a list + elif (document and len(current) == 1 and + isinstance(current[0], ItemList) and + isinstance(document[-1], ItemList) and + document[-1].ordered == current[0].ordered): + pass + # Special case: We're starting a list! + elif len(current) == 1 and isinstance(current[0], ItemList): + document.extend(current) + elif isinstance(current[0], Block): + document.extend(current) + document.append(Paragraph([])) + elif isinstance(current[0], Node): + for c in current: + insert_into(document, c) + return Document(document) elif isinstance(obj, mwp.nodes.heading.Heading): - return "{} {}\n".format("#" * min(obj.level, 3), obj.title.strip_code()) + return [Heading(obj.level, obj.title.strip_code())] elif isinstance(obj, mwp.nodes.tag.Tag): # Most tags are handled just fine and can be delegated to strip_code # (inline text styles), however we can do a bit better for list tags. if str(obj.wiki_markup) == "*": - return "* " + return [ItemList([], False)] elif str(obj.wiki_markup) == "#": - return " " + return [ItemList([], True)] elif str(obj.tag) == "ref": - return "" + return [] elif str(obj.tag) == "table": rows = [] header = () @@ -151,16 +157,16 @@ class Wikimini: continue if str(node.tag) == "th": row_is_header = True - parsed.append(self._convert(node.contents).strip()) + parsed.append( + self.convert(node.contents).plain().strip() + ) if not row_is_header: rows.append(parsed) else: header = parsed - return "".join([ - "\n```\n", - tabulate(rows, header, tablefmt=self.table_format), - "\n```\n", - ]) + return [ + Verbatim(tabulate(rows, header, tablefmt=self.table_format)) + ] else: return default(obj) elif isinstance(obj, mwp.nodes.template.Template): @@ -175,50 +181,28 @@ class Wikimini: return template(self, obj) elif isinstance(obj, mwp.nodes.wikilink.Wikilink): if str(obj.title).startswith("File:") or str(obj.text).startswith("thumb|"): - return "" + return [] elif str(obj.title).startswith("Category:"): - return "" + return [] else: - return default(obj) + return [InlineLink( + self.page_url(str(obj.title)), + Plain( + extract_plaintext(self.convert(obj.text)) if obj.text + else str(obj.title) + ), + )] else: return default(obj) - def _postprocess(self, gemtext): - # Strip out any more thumbs that have been left. - # This happens because the wikilinks are nested in each other, which the - # parser would only notice after doing the first replacement. We'll just - # take the easy way out here and use a regex to get rid of them. - gemtext = re.sub("^\\[\\[File:.*?\\]\\]$", "", gemtext, flags=re.MULTILINE) - - # Collapse too many empty lines - while "\n\n\n" in gemtext: - gemtext = gemtext.replace("\n\n\n", "\n\n") - - # Shortcut to avoid unnecessary splitting - if "" not in gemtext: - return gemtext - - lines = gemtext.split("\n") - counter = 1 - for idx in range(len(lines)): - line = lines[idx] - if line.startswith(""): - line = line.replace("", str(counter), 1) - lines[idx] = line - counter += 1 - else: - counter = 1 - return "\n".join(lines) - - - def wikicode_to_gemtext( - self, obj: Union[mwp.nodes.Node, mwp.wikicode.Wikicode] - ) -> str: - """Try to turn the given object into a sensible Gemtext representation. + def convert_to_document(self, obj: mwp.wikicode.Wikicode) -> Document: + """Try to turn the given object into a sensible + :class:`~document.Document` representation. - Note that wikicode is much more powerful than Gemtext, so this is a lossy - function. The returned Gemtext tries to mimic the content of the Wikicode - as much as possible (for human consumption). + Note that wikicode is much more powerful than the internal + representation, so this is a lossy function. The returned document tries + to mimic the content of the Wikicode as much as possible (for human + consumption). This function mostly mimics :meth:`~mwparserfromhell.wikicode.Wikicode.strip_code`, with some @@ -228,21 +212,26 @@ class Wikimini: obj: The object to convert. Returns: - The converted Gemtext. + The converted Document. """ # Avoid calling str() on the whole Wikicode here if (isinstance(obj, mwp.wikicode.Wikicode) and str(mwp.wikicode.Wikicode(obj.nodes[:2])) == "#REDIRECT "): + document = Document() title = str(obj.nodes[2].title) if "#" in title: title, section = title.split("#") section = f"Section '{section}'" else: section = "" - return "Redirect:\n=> {} {}\n{}".format( - self.page_url(title), title, section - ) - return self._postprocess(self._convert(obj)) + document.append(BlockLink(self.page_url(title), title)) + if section: + document.append(Paragraph([Plain(section)])) + return document + + document = self.convert(obj) + document.cleanup() + return document # import at the bottom to avoid circular dependencies diff --git a/wikimini/document.py b/wikimini/document.py new file mode 100644 index 0000000..2c901dc --- /dev/null +++ b/wikimini/document.py @@ -0,0 +1,438 @@ +"""The main class of this module is a :class:`Document`, which holds a parsed +and rendered Wikipedia article. + +We distinguish between two kinds of nodes, similar to HTML: +""" +import re +from dataclasses import dataclass, replace +from typing import List, Union + +class Document: + """A rendered Wikipedia article. + + Attributes: + blocks (List[Block]): A list of top-level nodes. + """ + __slots__ = ('blocks',) + + def __init__(self, blocks=None): + self.blocks = [] + if blocks: + self.blocks = blocks + + def __iter__(self): + return iter(self.blocks) + + def append(self, block: "Block"): + """Append a block to the document. + + Args: + block: The block to append. + """ + self.blocks.append(block) + + def cleanup(self): + """Clean up the document by cleaning up every contained block. + + See also :meth:`Block.cleanup`. + """ + for block in self.blocks: + block.cleanup() + self.blocks = [block for block in self.blocks if block] + + def nodes(self) -> List["Node"]: + """Discard the block information and return a list of inner nodes. + + Returns: + A list of all inner nodes. + """ + return [node for block in self.blocks for node in block.to_nodes()] + + def plain(self) -> str: + """Returns the plain text content of this document. + + Returns: + The plain text. + """ + return extract_plaintext(self) + + +@dataclass +class Node: + """Base class for all in-line text elements.""" + + def plain(self) -> str: + """Returns the plain text of this node, stripping all markup. + + Returns: + The plain text. + """ + + def with_text(self, text: str) -> "Node": + """Returns a new node that has the same markup, but the given text. + + Args: + text: The new text. + + Returns: + The new node, usually of the same type as the node this function is + called on. + """ + + def __getitem__(self, index): + if isinstance(index, int): + return self.plain()[index] + elif isinstance(index, slice): + text = self.plain()[index] + return self.with_text(text) + else: + raise TypeError("Node indices must be integers or slices") + + +@dataclass +class Plain(Node): + """A plain text node. + + Attributes: + text: The text content of this node. + """ + __slots__ = ("text",) + text: str + + def plain(self): + return self.text + + def with_text(self, text): + return Plain(text) + + +@dataclass +class Style(Node): + """Text that is styled with inline markup. + + Attributes: + inner: The content. + bold: Whether the text is bold. + italic: Whether the text is cursive. + monospace: Whether the text is monospaced. + """ + __slots__ = ("text", "bold", "italic", "monospace") + inner: Node + bold: bool + italic: bool + monospace: bool + + def plain(self): + return self.inner.plain() + + def with_text(self, text): + return replace(self, inner=self.inner.with_text(text)) + + +@dataclass +class InlineLink(Node): + """An inline link. + + Attributes: + href: The link target. + title: The text that should be shown. + """ + __slots__ = ("href", "title") + href: str + title: Union[Plain, Style] + + def plain(self): + if self.title is None: + return self.href + return self.title.plain() + + def with_text(self, text): + return replace(self, title=self.title.with_text(text)) + + +@dataclass +class Block: + """Base class for all top-level blocks.""" + + def cleanup(self): + """Clean up the content of this block. + + The exact meaning of this is dependent on the type of the block, but it + can involve stripping trailing/leading whitespace or other changes. + + Note that this modifies the block. + """ + + def append(self, node: Node): + """Append the given node to the block. + + Depending on the block, the node can either be inserted as-is (keeping + the markup information), or it is converted to plain text first. + + Args: + node: The node to insert. + """ + + def plain(self) -> str: + """Returns the plain text of this block, stripping all markup. + + Returns: + The plain text. + """ + + def to_nodes(self) -> List[Node]: + """Returns the inner nodes of this block. + + If the block is not made up of nodes, this will create new nodes that + contain the plain text content of this block. + + Returns: + The list of nodes. + """ + return [Plain(self.plain())] + + +@dataclass +class Paragraph(Block): + """A paragraph is a piece of text, which itself can hold inline markup.""" + __slots__ = ("nodes",) + nodes: List[Node] + + def __bool__(self): + return bool(self.nodes) + + def append(self, node): + self.nodes.append(node) + + def plain(self): + return "".join(node.plain() for node in self.nodes) + + def to_nodes(self): + return self.nodes + + def cleanup(self): + while self.nodes and re.match("^\\s+|^$", self.nodes[0].plain()): + self.nodes[0] = self.nodes[0].with_text( + self.nodes[0].plain().lstrip()) + if not self.nodes[0].plain(): + del self.nodes[0] + while self.nodes and re.search("\\s+$|^$", self.nodes[-1].plain()): + self.nodes[-1] = self.nodes[-1].with_text( + self.nodes[-1].plain().rstrip()) + if not self.nodes[-1].plain(): + del self.nodes[-1] + + +@dataclass +class Heading(Block): + """A heading. + + Attributes: + level: The level of the heading. + text: The heading text. + """ + __slots__ = ("level", "text") + level: int + text: str + + def __bool__(self): + return bool(self.text) + + def cleanup(self): + self.text = self.text.strip() + + def append(self, node): + self.text += node.plain() + + def plain(self): + return self.text + + +@dataclass +class Verbatim(Block): + """Text that should appear verbatim in the output, such as code. + + Attributes: + text: The text that should appear. + """ + __slots__ = ("text",) + text: str + + def __bool__(self): + return bool(self.text) + + def append(self, node): + self.text += node.plain() + + def plain(self): + return self.text + + +@dataclass +class ItemList(Block): + """A list of elements. + + Attributes: + items: The list of items. Each item is a list of inline :class:`Node`. + ordered: A flag indicating whether the list should be an ordered + (numbered) list. + """ + __slots__ = ("items", "ordered") + items: List[List[Node]] + ordered: bool + + def __bool__(self): + return bool(self.items) + + def new_item(self): + """Start a new item.""" + self.items.append([]) + + def append(self, node): + if not self.items: + self.new_item() + self.items[-1].append(node) + + def plain(self): + return "\n".join( + "".join(i.plain() for i in item) for item in self.items + ) + + def to_nodes(self): + return [node for item in self.items for node in item] + + def cleanup(self): + i = 0 + while i < len(self.items): + p = Paragraph(self.items[i]) + p.cleanup() + if p: + self.items[i] = p.to_nodes() + i += 1 + else: + del self.items[i] + + +@dataclass +class Blockquote(Block): + """A quote. + + Attributes: + nodes: The content of the blockquote, similar to + :attr:`Paragraph.nodes`. + """ + __slots__ = ("nodes",) + nodes: List[Node] + + def __bool__(self): + return bool(self.nodes) + + def append(self, node): + self.nodes.append(node) + + def plain(self): + return "".join(node.plain() for node in self.nodes) + + def to_nodes(self): + return self.nodes + + +@dataclass +class BlockLink(Block): + """A link on its own line. + + This is important for formats like Gemtext, where inline links will be + discarded. + + Attributes: + href: The target of the link. + title: The link text. + """ + __slots__ = ("href", "title") + href: str + title: str + + def append(self, node): + self.title += node.plain() + + def plain(self): + return self.title + + +def insert_into(blocks: List[Block], node: Node): + """Inserts the given node into the list of blocks. + + The node will always be inserted into the last block. If the list of blocks + is still empty, a fresh :class:`Paragraph` will be started. + + This function takes care of handling newlines properly. That means that a + double newline (``\\n\\n``) will start a new paragraph, and a single + newline (``\\n``) will start a new list item (if the current block is a + list). + + Note that this function will modify the given list of blocks. + + Args: + blocks: The list of blocks. + node: The node to insert. + """ + if not blocks: + blocks.append(Paragraph([])) + + current_block = blocks[-1] + + if isinstance(current_block, Paragraph): + if "\n\n" in node.plain(): + idx = node.plain().index("\n\n") + left = node[:idx] + right = node[idx+2:] + current_block.append(left) + blocks.append(Paragraph([])) + insert_into(blocks, right) + else: + current_block.append(node) + + elif isinstance(current_block, ItemList): + match = re.search("\\n\\n?", node.plain()) + if not match: + current_block.append(node) + else: + left_end, right_start = match.span() + current_block.append(node[:left_end]) + if match.group() == "\n\n": + blocks.append(Paragraph([])) + else: + current_block.new_item() + insert_into(blocks, node[right_start:]) + + else: + current_block.append(node) + + +def extract_plaintext(obj) -> str: + """Tries to extract plaintext from the given object. + + The given object can be one of many things: + + * A list of :class:`Node` + * A list of :class:`Block` + * A single :class:`Node` + * A single :class:`Block` + * A :class:`Document` + + This function is useful if you recursively call + :meth:`wikimini.Wikimini.convert` and want to include the output in + something that only accepts plain text. + """ + if isinstance(obj, Document): + return extract_plaintext(obj.blocks) + + if not isinstance(obj, list): + obj = [obj] + + if not obj: + return "" + elif isinstance(obj[0], Node): + return "".join(node.plain() for node in obj) + elif isinstance(obj[0], Block): + return "\n\n".join(block.plain() for block in obj) diff --git a/wikimini/templates/cite.py b/wikimini/templates/cite.py index ac4f597..e342a5f 100644 --- a/wikimini/templates/cite.py +++ b/wikimini/templates/cite.py @@ -1,5 +1,6 @@ """Citation related templates.""" from . import registry +from ..document import Plain def tmpl_citation(wikimini, obj): @@ -23,7 +24,7 @@ def tmpl_citation(wikimini, obj): names.append(last) elif first: names.append(first) - return "{} ({})".format(title, "; ".join(names)) + return [Plain("{} ({})".format(title, "; ".join(names)))] for name in ["cite", "citation", "cite arXiv", "cite AV media", "cite book", diff --git a/wikimini/templates/convert.py b/wikimini/templates/convert.py index a7a3f44..8bab782 100644 --- a/wikimini/templates/convert.py +++ b/wikimini/templates/convert.py @@ -1,20 +1,21 @@ """Implementations for the unit conversion templates.""" from . import registry +from ..document import Plain def tmpl_convert(wikimini, obj): """Renders the ``{{convert|...}}`` template.""" if str(obj.params[1]) in {"-", "to"}: - return "{0}{3} {1} {2}{3}".format( + return [Plain("{0}{3} {1} {2}{3}".format( obj.params[0].value.strip_code(), obj.params[1].value.strip_code(), obj.params[2].value.strip_code(), obj.params[3].value.strip_code(), - ) - return "{}{}".format( + ))] + return [Plain("{}{}".format( obj.params[0].value.strip_code(), obj.params[1].value.strip_code(), - ) + ))] registry.insert("convert", tmpl_convert) diff --git a/wikimini/templates/language.py b/wikimini/templates/language.py index e8ab738..22320da 100644 --- a/wikimini/templates/language.py +++ b/wikimini/templates/language.py @@ -1,12 +1,15 @@ """Language related templates.""" from . import registry +from ..document import Plain import pycountry def tmpl_ipa(wikimini, obj): """Renders the ``{{IPA|...}}`` template.""" - return "pronounced [{}]".format(wikimini._convert(obj.params[0].value)) + return [Plain("pronounced [{}]".format( + wikimini.convert(obj.params[0].value).plain() + ))] registry.insert("IPA", tmpl_ipa) @@ -14,7 +17,7 @@ registry.insert("IPA", tmpl_ipa) def tmpl_lang(wikimini, obj): """Renders the ``{{Lang|...}}`` template.""" - return wikimini._convert(obj.params[1].value) + return wikimini.convert(obj.params[1].value).nodes() registry.insert("lang", tmpl_lang) @@ -24,18 +27,18 @@ registry.insert("script", tmpl_lang) def tmpl_lang_code(language_name): """Creates a template renderer for a ``{{lang-xx|...}}`` template.""" def inner(wikimini, obj): - return "{}: {}".format( - language_name, wikimini._convert(obj.params[0].value) - ) + return [Plain("{}: {}".format( + language_name, wikimini.convert(obj.params[0].value).plain() + ))] return inner def tmpl_ipa_code(language_name): """Creates a template renderer for a ``{{IPA-xx|...}}`` template.""" def inner(wikimini, obj): - return "{} pronunciation: [{}]".format( - language_name, wikimini._convert(obj.params[0].value) - ) + return [Plain("{} pronunciation: [{}]".format( + language_name, wikimini.convert(obj.params[0].value).plain() + ))] return inner @@ -52,7 +55,7 @@ for language in pycountry.languages: def tmpl_country_flag(country): """Creates a template renderer for ``{{BRA}}`` country flags.""" def inner(wikimini, obj): - return country + return [Plain(country)] return inner diff --git a/wikimini/templates/mainlinks.py b/wikimini/templates/mainlinks.py index ffcbc5e..8c9abbf 100644 --- a/wikimini/templates/mainlinks.py +++ b/wikimini/templates/mainlinks.py @@ -1,14 +1,15 @@ """Renders templates that link to further articles.""" from . import registry +from ..document import Paragraph, Plain, BlockLink def tmpl_main(wikimini, obj): """Renders the ``{{main|...}}`` template.""" links = [ - "=> {} {}".format(wikimini.page_url(str(t.value)), t.value) + BlockLink(wikimini.page_url(str(t.value)), t.value.strip_code()) for t in obj.params ] - return "Main articles:\n{}\n".format("\n".join(links)) + return [Paragraph([Plain("Main articles:")])] + links registry.insert("main", tmpl_main) diff --git a/wikimini/templates/quotes.py b/wikimini/templates/quotes.py index b51d92d..39f6fa2 100644 --- a/wikimini/templates/quotes.py +++ b/wikimini/templates/quotes.py @@ -1,15 +1,15 @@ """Renders various quote related templates.""" from . import registry +from ..document import Blockquote def tmpl_quote(wikimini, obj): """Renders the ``{{blockquote|...}}`` template.""" text = obj.get("text", None) if not text: return "" - content = wikimini._convert(text.value) - lines = content.split("\n") - return "\n".join(f"> {line}" for line in lines) + content = wikimini.convert(text.value).nodes() + return [Blockquote(content)] registry.insert("blockquote", tmpl_quote) @@ -19,9 +19,8 @@ registry.insert("quote", tmpl_quote) def tmpl_cquote(wikimini, obj): """Renders the ``{{cquote|...}}`` template.""" text = obj.params[0] - content = wikimini._convert(text.value) - lines = content.split("\n") - return "\n".join(f"> {line}" for line in lines) + content = wikimini.convert(text.value).nodes() + return [Blockquote(content)] registry.insert("cquote", tmpl_cquote) diff --git a/wikimini/templates/various.py b/wikimini/templates/various.py index 8c6e0d5..6076ac4 100644 --- a/wikimini/templates/various.py +++ b/wikimini/templates/various.py @@ -1,16 +1,17 @@ """Various small templates.""" from . import registry +from ..document import Plain def tmpl_reign(wikimini, obj): """Renders the ``{{reign|...}}`` template.""" if not obj.params: - return "r. " + return [Plain("r. ")] first = obj.params[0].value.strip_code().strip() or "?" second = "" if len(obj.params) > 1: second = obj.params[1].value.strip_code().strip() - return f"r. {first} – {second}" + return [Plain(f"r. {first} – {second}")] registry.insert("reign", tmpl_reign) -- cgit v1.2.3