diff options
-rw-r--r-- | wikimini/__init__.py | 185 | ||||
-rw-r--r-- | wikimini/document.py | 438 | ||||
-rw-r--r-- | wikimini/templates/cite.py | 3 | ||||
-rw-r--r-- | wikimini/templates/convert.py | 9 | ||||
-rw-r--r-- | wikimini/templates/language.py | 21 | ||||
-rw-r--r-- | wikimini/templates/mainlinks.py | 5 | ||||
-rw-r--r-- | wikimini/templates/quotes.py | 11 | ||||
-rw-r--r-- | wikimini/templates/various.py | 5 |
8 files changed, 555 insertions, 122 deletions
diff --git a/wikimini/__init__.py b/wikimini/__init__.py index b93ef81..fac0c84 100644 --- a/wikimini/__init__.py +++ b/wikimini/__init__.py @@ -4,8 +4,9 @@ import re from tabulate import tabulate -from typing import Union, Tuple +from typing import List, Union, Tuple +from .document import * #: The default API URL, pointing to the english Wikipedia. API_URL = "https://en.wikipedia.org/w/api.php" @@ -77,66 +78,71 @@ class Wikimini: text = revision["slots"]["main"]["content"] return (title, mwp.parse(text)) - def _convert(self, obj): - """Function that does the actual conversion. + def convert( + self, + obj: Union[mwp.wikicode.Wikicode, mwp.nodes.Node], + ) -> Union[Document, List[Node], List[Block]]: + """Function that converts and renders a node. - This is called recursively on each node, and should perform the correct - conversion - based on the node type. + This function is exposed for template implementors; for normal usage, + see :meth:`convert_to_document`. + + The input and output of this function are as follows: + + * If ``obj`` is a :class:`~mwparserfromhell.wikicode.Wikicode`, then + :meth:`convert` will return a :class:`document.Document`. + * If ``obj`` is a :class:`~mwparserfromhell.nodes.Node`, then + :meth:`convert` will return either a list of :class:`document.Node` + or a list of :class:`document.Block`, depending on whether the + converted object is inline (like a link), or a block object (like a + quote). + + Note that in the last case, the empty list ``[]`` might be returned, + indicating that the object should not be included in the output. + + Args: + obj: The object to convert. + + Returns: + The converted object. 
""" default = lambda obj:\ - mwp.wikicode.Wikicode([obj]).strip_code(collapse=False) + [Plain(mwp.wikicode.Wikicode([obj]).strip_code(collapse=False))] - # This does the actual conversion if isinstance(obj, mwp.wikicode.Wikicode): - converted = [] - iterator = iter(enumerate(obj.nodes)) - for i, node in iterator: - # Pattern: * [[Wikilink]]\n - if (i >= 2 and - i + 1 < len(obj.nodes) and - # Links can have a plural s after them - re.match("s?\n", str(obj.nodes[i+1])) and - isinstance(node, mwp.nodes.wikilink.Wikilink) and - str(obj.nodes[i-1]) == " " and - str(obj.nodes[i-2]) == "*"): - converted.pop() - converted.pop() - _, after = next(iterator) - converted.append("=> {} {}{}".format( - self.page_url(str(node.title)), - self._convert(node), - self._convert(after), - )) - continue - # Pattern: *[[Wikilink]]\n - elif (i >= 1 and - i + 1 < len(obj.nodes) and - re.match("s?\n", str(obj.nodes[i+1])) and - isinstance(node, mwp.nodes.wikilink.Wikilink) and - str(obj.nodes[i-1]) == "*"): - converted.pop() - _, after = next(iterator) - converted.append("=> {} {}{}".format( - self.page_url(str(node.title)), - self._convert(node), - self._convert(after), - )) - continue - - # Default: Just convert the node - converted.append(self._convert(node)) - return "".join(converted) + document = [] + for node in obj.nodes: + current = self.convert(node) + + if current == []: + pass + # Special case: We're starting a list, but we're already in a list + elif (document and len(current) == 1 and + isinstance(current[0], ItemList) and + isinstance(document[-1], ItemList) and + document[-1].ordered == current[0].ordered): + pass + # Special case: We're starting a list! 
+ elif len(current) == 1 and isinstance(current[0], ItemList): + document.extend(current) + elif isinstance(current[0], Block): + document.extend(current) + document.append(Paragraph([])) + elif isinstance(current[0], Node): + for c in current: + insert_into(document, c) + return Document(document) elif isinstance(obj, mwp.nodes.heading.Heading): - return "{} {}\n".format("#" * min(obj.level, 3), obj.title.strip_code()) + return [Heading(obj.level, obj.title.strip_code())] elif isinstance(obj, mwp.nodes.tag.Tag): # Most tags are handled just fine and can be delegated to strip_code # (inline text styles), however we can do a bit better for list tags. if str(obj.wiki_markup) == "*": - return "* " + return [ItemList([], False)] elif str(obj.wiki_markup) == "#": - return "<!NUM!> " + return [ItemList([], True)] elif str(obj.tag) == "ref": - return "" + return [] elif str(obj.tag) == "table": rows = [] header = () @@ -151,16 +157,16 @@ class Wikimini: continue if str(node.tag) == "th": row_is_header = True - parsed.append(self._convert(node.contents).strip()) + parsed.append( + self.convert(node.contents).plain().strip() + ) if not row_is_header: rows.append(parsed) else: header = parsed - return "".join([ - "\n```\n", - tabulate(rows, header, tablefmt=self.table_format), - "\n```\n", - ]) + return [ + Verbatim(tabulate(rows, header, tablefmt=self.table_format)) + ] else: return default(obj) elif isinstance(obj, mwp.nodes.template.Template): @@ -175,50 +181,28 @@ class Wikimini: return template(self, obj) elif isinstance(obj, mwp.nodes.wikilink.Wikilink): if str(obj.title).startswith("File:") or str(obj.text).startswith("thumb|"): - return "" + return [] elif str(obj.title).startswith("Category:"): - return "" + return [] else: - return default(obj) + return [InlineLink( + self.page_url(str(obj.title)), + Plain( + extract_plaintext(self.convert(obj.text)) if obj.text + else str(obj.title) + ), + )] else: return default(obj) - def _postprocess(self, gemtext): - # Strip 
out any more thumbs that have been left. - # This happens because the wikilinks are nested in each other, which the - # parser would only notice after doing the first replacement. We'll just - # take the easy way out here and use a regex to get rid of them. - gemtext = re.sub("^\\[\\[File:.*?\\]\\]$", "", gemtext, flags=re.MULTILINE) - - # Collapse too many empty lines - while "\n\n\n" in gemtext: - gemtext = gemtext.replace("\n\n\n", "\n\n") - - # Shortcut to avoid unnecessary splitting - if "<!NUM!>" not in gemtext: - return gemtext - - lines = gemtext.split("\n") - counter = 1 - for idx in range(len(lines)): - line = lines[idx] - if line.startswith("<!NUM!>"): - line = line.replace("<!NUM!>", str(counter), 1) - lines[idx] = line - counter += 1 - else: - counter = 1 - return "\n".join(lines) - - - def wikicode_to_gemtext( - self, obj: Union[mwp.nodes.Node, mwp.wikicode.Wikicode] - ) -> str: - """Try to turn the given object into a sensible Gemtext representation. + def convert_to_document(self, obj: mwp.wikicode.Wikicode) -> Document: + """Try to turn the given object into a sensible + :class:`~document.Document` representation. - Note that wikicode is much more powerful than Gemtext, so this is a lossy - function. The returned Gemtext tries to mimic the content of the Wikicode - as much as possible (for human consumption). + Note that wikicode is much more powerful than the internal + representation, so this is a lossy function. The returned document tries + to mimic the content of the Wikicode as much as possible (for human + consumption). This function mostly mimics :meth:`~mwparserfromhell.wikicode.Wikicode.strip_code`, with some @@ -228,21 +212,26 @@ class Wikimini: obj: The object to convert. Returns: - The converted Gemtext. + The converted Document. 
""" # Avoid calling str() on the whole Wikicode here if (isinstance(obj, mwp.wikicode.Wikicode) and str(mwp.wikicode.Wikicode(obj.nodes[:2])) == "#REDIRECT "): + document = Document() title = str(obj.nodes[2].title) if "#" in title: title, section = title.split("#") section = f"Section '{section}'" else: section = "" - return "Redirect:\n=> {} {}\n{}".format( - self.page_url(title), title, section - ) - return self._postprocess(self._convert(obj)) + document.append(BlockLink(self.page_url(title), title)) + if section: + document.append(Paragraph([Plain(section)])) + return document + + document = self.convert(obj) + document.cleanup() + return document # import at the bottom to avoid circular dependencies diff --git a/wikimini/document.py b/wikimini/document.py new file mode 100644 index 0000000..2c901dc --- /dev/null +++ b/wikimini/document.py @@ -0,0 +1,438 @@ +"""The main class of this module is a :class:`Document`, which holds a parsed +and rendered Wikipedia article. + +We distinguish between two kinds of nodes, similar to HTML: in-line text elements (:class:`Node`) and top-level blocks (:class:`Block`). +""" +import re +from dataclasses import dataclass, replace +from typing import List, Union + +class Document: + """A rendered Wikipedia article. + + Attributes: + blocks (List[Block]): A list of top-level nodes. + """ + __slots__ = ('blocks',) + + def __init__(self, blocks=None): + self.blocks = [] + if blocks: + self.blocks = blocks + + def __iter__(self): + return iter(self.blocks) + + def append(self, block: "Block"): + """Append a block to the document. + + Args: + block: The block to append. + """ + self.blocks.append(block) + + def cleanup(self): + """Clean up the document by cleaning up every contained block. + + See also :meth:`Block.cleanup`. + """ + for block in self.blocks: + block.cleanup() + self.blocks = [block for block in self.blocks if block] + + def nodes(self) -> List["Node"]: + """Discard the block information and return a list of inner nodes. + + Returns: + A list of all inner nodes. 
+ """ + return [node for block in self.blocks for node in block.to_nodes()] + + def plain(self) -> str: + """Returns the plain text content of this document. + + Returns: + The plain text. + """ + return extract_plaintext(self) + + +@dataclass +class Node: + """Base class for all in-line text elements.""" + + def plain(self) -> str: + """Returns the plain text of this node, stripping all markup. + + Returns: + The plain text. + """ + + def with_text(self, text: str) -> "Node": + """Returns a new node that has the same markup, but the given text. + + Args: + text: The new text. + + Returns: + The new node, usually of the same type as the node this function is + called on. + """ + + def __getitem__(self, index): + if isinstance(index, int): + return self.plain()[index] + elif isinstance(index, slice): + text = self.plain()[index] + return self.with_text(text) + else: + raise TypeError("Node indices must be integers or slices") + + +@dataclass +class Plain(Node): + """A plain text node. + + Attributes: + text: The text content of this node. + """ + __slots__ = ("text",) + text: str + + def plain(self): + return self.text + + def with_text(self, text): + return Plain(text) + + +@dataclass +class Style(Node): + """Text that is styled with inline markup. + + Attributes: + inner: The content. + bold: Whether the text is bold. + italic: Whether the text is cursive. + monospace: Whether the text is monospaced. + """ + __slots__ = ("text", "bold", "italic", "monospace") + inner: Node + bold: bool + italic: bool + monospace: bool + + def plain(self): + return self.inner.plain() + + def with_text(self, text): + return replace(self, inner=self.inner.with_text(text)) + + +@dataclass +class InlineLink(Node): + """An inline link. + + Attributes: + href: The link target. + title: The text that should be shown. 
+ """ + __slots__ = ("href", "title") + href: str + title: Union[Plain, Style] + + def plain(self): + if self.title is None: + return self.href + return self.title.plain() + + def with_text(self, text): + return replace(self, title=self.title.with_text(text)) + + +@dataclass +class Block: + """Base class for all top-level blocks.""" + + def cleanup(self): + """Clean up the content of this block. + + The exact meaning of this is dependent on the type of the block, but it + can involve stripping trailing/leading whitespace or other changes. + + Note that this modifies the block. + """ + + def append(self, node: Node): + """Append the given node to the block. + + Depending on the block, the node can either be inserted as-is (keeping + the markup information), or it is converted to plain text first. + + Args: + node: The node to insert. + """ + + def plain(self) -> str: + """Returns the plain text of this block, stripping all markup. + + Returns: + The plain text. + """ + + def to_nodes(self) -> List[Node]: + """Returns the inner nodes of this block. + + If the block is not made up of nodes, this will create new nodes that + contain the plain text content of this block. + + Returns: + The list of nodes. 
+ """ + return [Plain(self.plain())] + + +@dataclass +class Paragraph(Block): + """A paragraph is a piece of text, which itself can hold inline markup.""" + __slots__ = ("nodes",) + nodes: List[Node] + + def __bool__(self): + return bool(self.nodes) + + def append(self, node): + self.nodes.append(node) + + def plain(self): + return "".join(node.plain() for node in self.nodes) + + def to_nodes(self): + return self.nodes + + def cleanup(self): + while self.nodes and re.match("^\\s+|^$", self.nodes[0].plain()): + self.nodes[0] = self.nodes[0].with_text( + self.nodes[0].plain().lstrip()) + if not self.nodes[0].plain(): + del self.nodes[0] + while self.nodes and re.search("\\s+$|^$", self.nodes[-1].plain()): + self.nodes[-1] = self.nodes[-1].with_text( + self.nodes[-1].plain().rstrip()) + if not self.nodes[-1].plain(): + del self.nodes[-1] + + +@dataclass +class Heading(Block): + """A heading. + + Attributes: + level: The level of the heading. + text: The heading text. + """ + __slots__ = ("level", "text") + level: int + text: str + + def __bool__(self): + return bool(self.text) + + def cleanup(self): + self.text = self.text.strip() + + def append(self, node): + self.text += node.plain() + + def plain(self): + return self.text + + +@dataclass +class Verbatim(Block): + """Text that should appear verbatim in the output, such as code. + + Attributes: + text: The text that should appear. + """ + __slots__ = ("text",) + text: str + + def __bool__(self): + return bool(self.text) + + def append(self, node): + self.text += node.plain() + + def plain(self): + return self.text + + +@dataclass +class ItemList(Block): + """A list of elements. + + Attributes: + items: The list of items. Each item is a list of inline :class:`Node`. + ordered: A flag indicating whether the list should be an ordered + (numbered) list. 
+ """ + __slots__ = ("items", "ordered") + items: List[List[Node]] + ordered: bool + + def __bool__(self): + return bool(self.items) + + def new_item(self): + """Start a new item.""" + self.items.append([]) + + def append(self, node): + if not self.items: + self.new_item() + self.items[-1].append(node) + + def plain(self): + return "\n".join( + "".join(i.plain() for i in item) for item in self.items + ) + + def to_nodes(self): + return [node for item in self.items for node in item] + + def cleanup(self): + i = 0 + while i < len(self.items): + p = Paragraph(self.items[i]) + p.cleanup() + if p: + self.items[i] = p.to_nodes() + i += 1 + else: + del self.items[i] + + +@dataclass +class Blockquote(Block): + """A quote. + + Attributes: + nodes: The content of the blockquote, similar to + :attr:`Paragraph.nodes`. + """ + __slots__ = ("nodes",) + nodes: List[Node] + + def __bool__(self): + return bool(self.nodes) + + def append(self, node): + self.nodes.append(node) + + def plain(self): + return "".join(node.plain() for node in self.nodes) + + def to_nodes(self): + return self.nodes + + +@dataclass +class BlockLink(Block): + """A link on its own line. + + This is important for formats like Gemtext, where inline links will be + discarded. + + Attributes: + href: The target of the link. + title: The link text. + """ + __slots__ = ("href", "title") + href: str + title: str + + def append(self, node): + self.title += node.plain() + + def plain(self): + return self.title + + +def insert_into(blocks: List[Block], node: Node): + """Inserts the given node into the list of blocks. + + The node will always be inserted into the last block. If the list of blocks + is still empty, a fresh :class:`Paragraph` will be started. + + This function takes care of handling newlines properly. That means that a + double newline (``\\n\\n``) will start a new paragraph, and a single + newline (``\\n``) will start a new list item (if the current block is a + list). 
+ + Note that this function will modify the given list of blocks. + + Args: + blocks: The list of blocks. + node: The node to insert. + """ + if not blocks: + blocks.append(Paragraph([])) + + current_block = blocks[-1] + + if isinstance(current_block, Paragraph): + if "\n\n" in node.plain(): + idx = node.plain().index("\n\n") + left = node[:idx] + right = node[idx+2:] + current_block.append(left) + blocks.append(Paragraph([])) + insert_into(blocks, right) + else: + current_block.append(node) + + elif isinstance(current_block, ItemList): + match = re.search("\\n\\n?", node.plain()) + if not match: + current_block.append(node) + else: + left_end, right_start = match.span() + current_block.append(node[:left_end]) + if match.group() == "\n\n": + blocks.append(Paragraph([])) + else: + current_block.new_item() + insert_into(blocks, node[right_start:]) + + else: + current_block.append(node) + + +def extract_plaintext(obj) -> str: + """Tries to extract plaintext from the given object. + + The given object can be one of many things: + + * A list of :class:`Node` + * A list of :class:`Block` + * A single :class:`Node` + * A single :class:`Block` + * A :class:`Document` + + This function is useful if you recursively call + :meth:`wikimini.Wikimini.convert` and want to include the output in + something that only accepts plain text. + """ + if isinstance(obj, Document): + return extract_plaintext(obj.blocks) + + if not isinstance(obj, list): + obj = [obj] + + if not obj: + return "" + elif isinstance(obj[0], Node): + return "".join(node.plain() for node in obj) + elif isinstance(obj[0], Block): + return "\n\n".join(block.plain() for block in obj) diff --git a/wikimini/templates/cite.py b/wikimini/templates/cite.py index ac4f597..e342a5f 100644 --- a/wikimini/templates/cite.py +++ b/wikimini/templates/cite.py @@ -1,5 +1,6 @@ """Citation related templates.""" from . 
import registry +from ..document import Plain def tmpl_citation(wikimini, obj): @@ -23,7 +24,7 @@ def tmpl_citation(wikimini, obj): names.append(last) elif first: names.append(first) - return "{} ({})".format(title, "; ".join(names)) + return [Plain("{} ({})".format(title, "; ".join(names)))] for name in ["cite", "citation", "cite arXiv", "cite AV media", "cite book", diff --git a/wikimini/templates/convert.py b/wikimini/templates/convert.py index a7a3f44..8bab782 100644 --- a/wikimini/templates/convert.py +++ b/wikimini/templates/convert.py @@ -1,20 +1,21 @@ """Implementations for the unit conversion templates.""" from . import registry +from ..document import Plain def tmpl_convert(wikimini, obj): """Renders the ``{{convert|...}}`` template.""" if str(obj.params[1]) in {"-", "to"}: - return "{0}{3} {1} {2}{3}".format( + return [Plain("{0}{3} {1} {2}{3}".format( obj.params[0].value.strip_code(), obj.params[1].value.strip_code(), obj.params[2].value.strip_code(), obj.params[3].value.strip_code(), - ) - return "{}{}".format( + ))] + return [Plain("{}{}".format( obj.params[0].value.strip_code(), obj.params[1].value.strip_code(), - ) + ))] registry.insert("convert", tmpl_convert) diff --git a/wikimini/templates/language.py b/wikimini/templates/language.py index e8ab738..22320da 100644 --- a/wikimini/templates/language.py +++ b/wikimini/templates/language.py @@ -1,12 +1,15 @@ """Language related templates.""" from . 
import registry +from ..document import Plain import pycountry def tmpl_ipa(wikimini, obj): """Renders the ``{{IPA|...}}`` template.""" - return "pronounced [{}]".format(wikimini._convert(obj.params[0].value)) + return [Plain("pronounced [{}]".format( + wikimini.convert(obj.params[0].value).plain() + ))] registry.insert("IPA", tmpl_ipa) @@ -14,7 +17,7 @@ registry.insert("IPA", tmpl_ipa) def tmpl_lang(wikimini, obj): """Renders the ``{{Lang|...}}`` template.""" - return wikimini._convert(obj.params[1].value) + return wikimini.convert(obj.params[1].value).nodes() registry.insert("lang", tmpl_lang) @@ -24,18 +27,18 @@ registry.insert("script", tmpl_lang) def tmpl_lang_code(language_name): """Creates a template renderer for a ``{{lang-xx|...}}`` template.""" def inner(wikimini, obj): - return "{}: {}".format( - language_name, wikimini._convert(obj.params[0].value) - ) + return [Plain("{}: {}".format( + language_name, wikimini.convert(obj.params[0].value).plain() + ))] return inner def tmpl_ipa_code(language_name): """Creates a template renderer for a ``{{IPA-xx|...}}`` template.""" def inner(wikimini, obj): - return "{} pronunciation: [{}]".format( - language_name, wikimini._convert(obj.params[0].value) - ) + return [Plain("{} pronunciation: [{}]".format( + language_name, wikimini.convert(obj.params[0].value).plain() + ))] return inner @@ -52,7 +55,7 @@ for language in pycountry.languages: def tmpl_country_flag(country): """Creates a template renderer for ``{{BRA}}`` country flags.""" def inner(wikimini, obj): - return country + return [Plain(country)] return inner diff --git a/wikimini/templates/mainlinks.py b/wikimini/templates/mainlinks.py index ffcbc5e..8c9abbf 100644 --- a/wikimini/templates/mainlinks.py +++ b/wikimini/templates/mainlinks.py @@ -1,14 +1,15 @@ """Renders templates that link to further articles.""" from . 
import registry +from ..document import Paragraph, Plain, BlockLink def tmpl_main(wikimini, obj): """Renders the ``{{main|...}}`` template.""" links = [ - "=> {} {}".format(wikimini.page_url(str(t.value)), t.value) + BlockLink(wikimini.page_url(str(t.value)), t.value.strip_code()) for t in obj.params ] - return "Main articles:\n{}\n".format("\n".join(links)) + return [Paragraph([Plain("Main articles:")])] + links registry.insert("main", tmpl_main) diff --git a/wikimini/templates/quotes.py b/wikimini/templates/quotes.py index b51d92d..39f6fa2 100644 --- a/wikimini/templates/quotes.py +++ b/wikimini/templates/quotes.py @@ -1,15 +1,15 @@ """Renders various quote related templates.""" from . import registry +from ..document import Blockquote def tmpl_quote(wikimini, obj): """Renders the ``{{blockquote|...}}`` template.""" text = obj.get("text", None) if not text: return "" - content = wikimini._convert(text.value) - lines = content.split("\n") - return "\n".join(f"> {line}" for line in lines) + content = wikimini.convert(text.value).nodes() + return [Blockquote(content)] registry.insert("blockquote", tmpl_quote) @@ -19,9 +19,8 @@ registry.insert("quote", tmpl_quote) def tmpl_cquote(wikimini, obj): """Renders the ``{{cquote|...}}`` template.""" text = obj.params[0] - content = wikimini._convert(text.value) - lines = content.split("\n") - return "\n".join(f"> {line}" for line in lines) + content = wikimini.convert(text.value).nodes() + return [Blockquote(content)] registry.insert("cquote", tmpl_cquote) diff --git a/wikimini/templates/various.py b/wikimini/templates/various.py index 8c6e0d5..6076ac4 100644 --- a/wikimini/templates/various.py +++ b/wikimini/templates/various.py @@ -1,16 +1,17 @@ """Various small templates.""" from . import registry +from ..document import Plain def tmpl_reign(wikimini, obj): """Renders the ``{{reign|...}}`` template.""" if not obj.params: - return "r. " + return [Plain("r. ")] first = obj.params[0].value.strip_code().strip() or "?" 
second = "" if len(obj.params) > 1: second = obj.params[1].value.strip_code().strip() - return f"r. {first} – {second}" + return [Plain(f"r. {first} – {second}")] registry.insert("reign", tmpl_reign) |