Merge branch 'document-repr'

author: Daniel Schadt <kingdread@gmx.de> 2021-08-20 22:04:44 +0200
committer: Daniel Schadt <kingdread@gmx.de> 2021-08-20 22:04:44 +0200
commit: a9a7c6c77f3b6078e317d455b696ce76272b88cb (patch)
tree: 4d7649a3efe54378141e5fba6778281d2da4b8b4
parent: 469353899ae6d7d0c0b7b105c24baaa4841c6328 (diff)
parent: a114ad49db792ec190a5cb6c96acc47669ac4b03 (diff)
download: wikimini-a9a7c6c77f3b6078e317d455b696ce76272b88cb.tar.gz
wikimini-a9a7c6c77f3b6078e317d455b696ce76272b88cb.tar.bz2
wikimini-a9a7c6c77f3b6078e317d455b696ce76272b88cb.zip
11 files changed, 872 insertions, 134 deletions
diff --git a/wikimini/__init__.py b/wikimini/__init__.py
index b93ef81..d6e49a1 100644
--- a/wikimini/__init__.py
+++ b/wikimini/__init__.py
@@ -1,11 +1,14 @@
 import mwparserfromhell as mwp
 import requests
-import re
 
 from tabulate import tabulate
 
-from typing import Union, Tuple
+from typing import List, Union, Tuple
 
+from .document import (
+    Plain, BlockLink, InlineLink, Verbatim, Document, Node, Block, ItemList,
+    Paragraph, Heading, insert_into, extract_plaintext,
+)
 
 #: The default API URL, pointing to the english Wikipedia.
 API_URL = "https://en.wikipedia.org/w/api.php"
@@ -77,66 +80,73 @@ class Wikimini:
         text = revision["slots"]["main"]["content"]
         return (title, mwp.parse(text))
 
-    def _convert(self, obj):
-        """Function that does the actual conversion.
+    def convert(
+        self,
+        obj: Union[mwp.wikicode.Wikicode, mwp.nodes.Node],
+    ) -> Union[Document, List[Node], List[Block]]:
+        """Function that converts and renders a node.
 
-        This is called recursively on each node, and should perform the correct
-        conversion - based on the node type.
+        This function is exposed for template implementors, for normal usage,
+        see :meth:`convert_to_document`.
+
+        The input and output of this function is as follows:
+
+        * If ``obj`` is a :class:`~mwparserfromhell.wikicode.Wikicode`, then
+          :meth:`convert` will return a :class:`document.Document`.
+        * If ``obj`` is a :class:`~mwparserfromhell.nodes.Node`, then
+          :meth:`convert` will return either a list of :class:`document.Node`
+          or a list of :class:`document.Block`, depending on whether the
+          converted object is inline (like a link), or a block object (like a
+          quote).
+
+        Note that in the last case, the empty list ``[]`` might be returned,
+        indicating that the object should not be included in the output.
+
+        Args:
+            obj: The object to convert.
+
+        Returns:
+            The converted object.
         """
         default = lambda obj:\
-            mwp.wikicode.Wikicode([obj]).strip_code(collapse=False)
+            [Plain(mwp.wikicode.Wikicode([obj]).strip_code(collapse=False))]
 
-        # This does the actual conversion
         if isinstance(obj, mwp.wikicode.Wikicode):
-            converted = []
-            iterator = iter(enumerate(obj.nodes))
-            for i, node in iterator:
-                # Pattern: * [[Wikilink]]\n
-                if (i >= 2 and
-                        i + 1 < len(obj.nodes) and
-                        # Links can have a plural s after them
-                        re.match("s?\n", str(obj.nodes[i+1])) and
-                        isinstance(node, mwp.nodes.wikilink.Wikilink) and
-                        str(obj.nodes[i-1]) == " " and
-                        str(obj.nodes[i-2]) == "*"):
-                    converted.pop()
-                    converted.pop()
-                    _, after = next(iterator)
-                    converted.append("=> {} {}{}".format(
-                        self.page_url(str(node.title)),
-                        self._convert(node),
-                        self._convert(after),
-                    ))
-                    continue
-                # Pattern: *[[Wikilink]]\n
-                elif (i >= 1 and
-                        i + 1 < len(obj.nodes) and
-                        re.match("s?\n", str(obj.nodes[i+1])) and
-                        isinstance(node, mwp.nodes.wikilink.Wikilink) and
-                        str(obj.nodes[i-1]) == "*"):
-                    converted.pop()
-                    _, after = next(iterator)
-                    converted.append("=> {} {}{}".format(
-                        self.page_url(str(node.title)),
-                        self._convert(node),
-                        self._convert(after),
-                    ))
-                    continue
-
-                # Default: Just convert the node
-                converted.append(self._convert(node))
-            return "".join(converted)
+            document = []
+            for node in obj.nodes:
+                current = self.convert(node)
+
+                if current == []:
+                    pass
+                # Special case: We're starting a list, but we're already in a
+                # list
+                elif (document and len(current) == 1
+                        and isinstance(current[0], ItemList)
+                        and isinstance(document[-1], ItemList)
+                        and document[-1].ordered == current[0].ordered):
+                    pass
+                # Special case: We're starting a list!
+                elif len(current) == 1 and isinstance(current[0], ItemList):
+                    document.extend(current)
+                elif isinstance(current[0], Block):
+                    document.extend(current)
+                    document.append(Paragraph([]))
+                elif isinstance(current[0], Node):
+                    for c in current:
+                        insert_into(document, c)
+            return Document(document)
         elif isinstance(obj, mwp.nodes.heading.Heading):
-            return "{} {}\n".format("#" * min(obj.level, 3), obj.title.strip_code())
+            return [Heading(obj.level, obj.title.strip_code())]
         elif isinstance(obj, mwp.nodes.tag.Tag):
-            # Most tags are handled just fine and can be delegated to strip_code
-            # (inline text styles), however we can do a bit better for list tags.
+            # Most tags are handled just fine and can be delegated to
+            # strip_code (inline text styles), however we can do a bit better
+            # for list tags.
             if str(obj.wiki_markup) == "*":
-                return "* "
+                return [ItemList([], False)]
             elif str(obj.wiki_markup) == "#":
-                return "<!NUM!> "
+                return [ItemList([], True)]
             elif str(obj.tag) == "ref":
-                return ""
+                return []
             elif str(obj.tag) == "table":
                 rows = []
                 header = ()
@@ -151,22 +161,22 @@ class Wikimini:
                             continue
                         if str(node.tag) == "th":
                             row_is_header = True
-                        parsed.append(self._convert(node.contents).strip())
+                        parsed.append(
+                            self.convert(node.contents).plain().strip()
+                        )
                     if not row_is_header:
                         rows.append(parsed)
                     else:
                         header = parsed
-                return "".join([
-                    "\n```\n",
-                    tabulate(rows, header, tablefmt=self.table_format),
-                    "\n```\n",
-                ])
+                return [Verbatim(
+                    tabulate(rows, header, tablefmt=self.table_format)
+                )]
             else:
                 return default(obj)
         elif isinstance(obj, mwp.nodes.template.Template):
-            # Most templates are handled fine (and completely stripped), however,
-            # some of them are useful and provide some output that we should mimic
-            # (for example, the convert template).
+            # Most templates are handled fine (and completely stripped),
+            # however, some of them are useful and provide some output that we
+            # should mimic (for example, the convert template).
             name = str(obj.name)
             template = templates.registry.get(name)
             if template is None:
@@ -174,51 +184,30 @@ class Wikimini:
             else:
                 return template(self, obj)
         elif isinstance(obj, mwp.nodes.wikilink.Wikilink):
-            if str(obj.title).startswith("File:") or str(obj.text).startswith("thumb|"):
-                return ""
+            if (str(obj.title).startswith("File:")
+                    or str(obj.text).startswith("thumb|")):
+                return []
             elif str(obj.title).startswith("Category:"):
-                return ""
+                return []
             else:
-                return default(obj)
+                return [InlineLink(
+                    self.page_url(str(obj.title)),
+                    Plain(
+                        extract_plaintext(self.convert(obj.text)) if obj.text
+                        else str(obj.title)
+                    ),
+                )]
         else:
             return default(obj)
 
-    def _postprocess(self, gemtext):
-        # Strip out any more thumbs that have been left.
-        # This happens because the wikilinks are nested in each other, which the
-        # parser would only notice after doing the first replacement. We'll just
-        # take the easy way out here and use a regex to get rid of them.
-        gemtext = re.sub("^\\[\\[File:.*?\\]\\]$", "", gemtext, flags=re.MULTILINE)
-
-        # Collapse too many empty lines
-        while "\n\n\n" in gemtext:
-            gemtext = gemtext.replace("\n\n\n", "\n\n")
-
-        # Shortcut to avoid unnecessary splitting
-        if "<!NUM!>" not in gemtext:
-            return gemtext
-
-        lines = gemtext.split("\n")
-        counter = 1
-        for idx in range(len(lines)):
-            line = lines[idx]
-            if line.startswith("<!NUM!>"):
-                line = line.replace("<!NUM!>", str(counter), 1)
-                lines[idx] = line
-                counter += 1
-            else:
-                counter = 1
-        return "\n".join(lines)
-
-
-    def wikicode_to_gemtext(
-            self, obj: Union[mwp.nodes.Node, mwp.wikicode.Wikicode]
-        ) -> str:
-        """Try to turn the given object into a sensible Gemtext representation.
+    def convert_to_document(self, obj: mwp.wikicode.Wikicode) -> Document:
+        """Try to turn the given object into a sensible
+        :class:`~document.Document` representation.
 
-        Note that wikicode is much more powerful than Gemtext, so this is a lossy
-        function. The returned Gemtext tries to mimic the content of the Wikicode
-        as much as possible (for human consumption).
+        Note that wikicode is much more powerful than the internal
+        representation, so this is a lossy function. The returned document
+        tries to mimic the content of the Wikicode as much as possible (for
+        human consumption).
 
         This function mostly mimics
         :meth:`~mwparserfromhell.wikicode.Wikicode.strip_code`, with some
@@ -228,21 +217,26 @@ class Wikimini:
             obj: The object to convert.
 
         Returns:
-            The converted Gemtext.
+            The converted Document.
         """
         # Avoid calling str() on the whole Wikicode here
-        if (isinstance(obj, mwp.wikicode.Wikicode) and
-                str(mwp.wikicode.Wikicode(obj.nodes[:2])) == "#REDIRECT "):
+        if (isinstance(obj, mwp.wikicode.Wikicode)
+                and str(mwp.wikicode.Wikicode(obj.nodes[:2])) == "#REDIRECT "):
+            document = Document()
             title = str(obj.nodes[2].title)
             if "#" in title:
                 title, section = title.split("#")
                 section = f"Section '{section}'"
             else:
                 section = ""
-            return "Redirect:\n=> {} {}\n{}".format(
-                self.page_url(title), title, section
-            )
-        return self._postprocess(self._convert(obj))
+            document.append(BlockLink(self.page_url(title), title))
+            if section:
+                document.append(Paragraph([Plain(section)]))
+            return document
+
+        document = self.convert(obj)
+        document.cleanup()
+        return document
 
 
 # import at the bottom to avoid circular dependencies
diff --git a/wikimini/document.py b/wikimini/document.py
new file mode 100644
index 0000000..a363c25
--- /dev/null
+++ b/wikimini/document.py
@@ -0,0 +1,490 @@
+"""The main class of this module is a :class:`Document`, which holds a parsed
+and rendered Wikipedia article.
+
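+We distinguish between two kinds of elements, similar to HTML: inline
+elements (subclasses of :class:`Node`) and top-level block elements
+(subclasses of :class:`Block`).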
+"""
+import re
+from dataclasses import dataclass, replace
+from typing import List, Union
+
+
+class Document:
+    """Container for the blocks that make up a rendered Wikipedia article.
+
+    Attributes:
+        blocks (List[Block]): The top-level blocks, in document order.
+    """
+    __slots__ = ('blocks',)
+
+    def __init__(self, blocks=None):
+        # A falsy argument (``None`` or an empty list) yields a fresh list.
+        self.blocks = blocks if blocks else []
+
+    def __iter__(self):
+        return iter(self.blocks)
+
+    def append(self, block: "Block"):
+        """Add a block at the end of the document.
+
+        Args:
+            block: The block to add.
+        """
+        self.blocks.append(block)
+
+    def cleanup(self):
+        """Run :meth:`Block.cleanup` on every block and drop empty blocks.
+
+        A block counts as empty if it is falsy after its own cleanup.
+        """
+        for element in self.blocks:
+            element.cleanup()
+        self.blocks = list(filter(None, self.blocks))
+
+    def nodes(self) -> List["Node"]:
+        """Flatten the document into the inline nodes of all its blocks.
+
+        Returns:
+            The concatenation of ``to_nodes()`` of every block.
+        """
+        collected = []
+        for element in self.blocks:
+            collected.extend(element.to_nodes())
+        return collected
+
+    def plain(self) -> str:
+        """Return the text of the whole document without any markup.
+
+        Returns:
+            The plain text, as produced by :func:`extract_plaintext`.
+        """
+        return extract_plaintext(self)
+
+
+@dataclass
+class Node:
+    """Common base of every in-line text element.
+
+    Subclasses provide :meth:`plain` and :meth:`with_text`; length and
+    indexing are derived from those two methods.
+    """
+
+    def plain(self) -> str:
+        """Return the text of this node without any markup.
+
+        Returns:
+            The plain text.
+        """
+
+    def with_text(self, text: str) -> "Node":
+        """Create a node with identical markup but different text.
+
+        Args:
+            text: The text the new node should carry.
+
+        Returns:
+            The new node, usually of the same type as the node this method
+            is called on.
+        """
+
+    def __len__(self):
+        return len(self.plain())
+
+    def __getitem__(self, index):
+        # Slices keep the markup, integers give a single character.
+        if isinstance(index, slice):
+            return self.with_text(self.plain()[index])
+        if isinstance(index, int):
+            return self.plain()[index]
+        raise TypeError("Node indices must be integers or slices")
+
+
+@dataclass
+class Plain(Node):
+    """An inline node that carries text without any markup.
+
+    Attributes:
+        text: The text held by this node.
+    """
+    __slots__ = ("text",)
+    text: str
+
+    def plain(self):
+        # A plain node is its own plain-text representation.
+        return self.text
+
+    def with_text(self, text):
+        # There is no markup to carry over, so a fresh node is enough.
+        return Plain(text=text)
+
+
+@dataclass
+class Style(Node):
+    """Text that is styled with inline markup.
+
+    Attributes:
+        inner: The content.
+        bold: Whether the text is bold.
+        italic: Whether the text is cursive.
+        monospace: Whether the text is monospaced.
+    """
+    # The slot names must match the dataclass fields declared below; the
+    # field holding the content is called ``inner``, not ``text``.
+    __slots__ = ("inner", "bold", "italic", "monospace")
+    inner: Node
+    bold: bool
+    italic: bool
+    monospace: bool
+
+    def plain(self):
+        """Return the plain text of the wrapped node."""
+        return self.inner.plain()
+
+    def with_text(self, text):
+        """Return a copy with the same style flags but the given text."""
+        return replace(self, inner=self.inner.with_text(text))
+
+
+@dataclass
+class InlineLink(Node):
+    """An inline link.
+
+    Attributes:
+        href: The link target.
+        title: The text that should be shown. May be ``None``, in which
+            case the link target doubles as the visible text.
+    """
+    __slots__ = ("href", "title")
+    href: str
+    title: Union[Plain, Style, None]
+
+    def plain(self):
+        """Return the link text, falling back to the target if untitled."""
+        if self.title is None:
+            return self.href
+        return self.title.plain()
+
+    def with_text(self, text):
+        """Return a copy of this link that shows the given text.
+
+        Args:
+            text: The new link text.
+
+        Returns:
+            A new :class:`InlineLink` with the same target.
+        """
+        # plain() accepts a missing title, so with_text() (and therefore
+        # slicing) has to accept it as well instead of failing on None.
+        if self.title is None:
+            return replace(self, title=Plain(text))
+        return replace(self, title=self.title.with_text(text))
+
+
+@dataclass
+class Block:
+    """Common base of every top-level block."""
+
+    def cleanup(self):
+        """Tidy up the content of this block in place.
+
+        What exactly happens depends on the concrete block type; typical
+        actions are stripping leading/trailing whitespace. The base
+        implementation does nothing.
+        """
+
+    def append(self, node: Node):
+        """Add the given node to the end of this block.
+
+        Block types may either store the node unchanged (keeping its
+        markup) or reduce it to plain text first. The base implementation
+        does nothing.
+
+        Args:
+            node: The node to add.
+        """
+
+    def plain(self) -> str:
+        """Return the text of this block without any markup.
+
+        Returns:
+            The plain text.
+        """
+
+    def to_nodes(self) -> List[Node]:
+        """Return the inline nodes this block consists of.
+
+        Blocks that do not store nodes get a single new :class:`Plain` node
+        that wraps their plain text.
+
+        Returns:
+            The list of nodes.
+        """
+        text = self.plain()
+        return [Plain(text)]
+
+
+@dataclass
+class LineBreak(Block):
+    """Represents an enforced empty line.
+
+    This is a :class:`Block` so that it can be stored in a
+    :class:`Document` like any other block: it inherits the no-op
+    ``cleanup()`` that :meth:`Document.cleanup` calls on every block, and it
+    passes the ``isinstance(..., Block)`` checks used for dispatching.
+    """
+    __slots__ = ()
+
+    def plain(self):
+        """Return the line break as plain text (a single newline)."""
+        return "\n"
+
+
+@dataclass
+class Paragraph(Block):
+    """A paragraph is a piece of text, which itself can hold inline markup.
+
+    Attributes:
+        nodes: The inline nodes that make up the paragraph.
+    """
+    __slots__ = ("nodes",)
+    nodes: List[Node]
+
+    def _find_index(self, idx):
+        """Map an offset into the plain text to a position inside a node.
+
+        Args:
+            idx: Offset into :meth:`plain`. The offset just past the last
+                character is accepted as well.
+
+        Returns:
+            A tuple ``(node_index, offset_in_node)``.
+
+        Raises:
+            IndexError: If the offset is negative, past the end, or the
+                paragraph has no nodes.
+        """
+        if idx < 0:
+            raise IndexError(f"{idx} is out of range")
+        offset = 0
+        for i, node in enumerate(self.nodes):
+            if idx < offset + len(node):
+                return (i, idx - offset)
+            offset += len(node)
+        # The end position belongs to the last node. Guard against an empty
+        # paragraph, where there is no last node to point into.
+        if self.nodes and idx == offset:
+            return (len(self.nodes) - 1, len(self.nodes[-1]))
+        raise IndexError(f"{idx} is out of range")
+
+    def __bool__(self):
+        return bool(self.nodes)
+
+    def append(self, node):
+        """Append the node as-is, keeping its markup."""
+        self.nodes.append(node)
+
+    def plain(self):
+        """Return the concatenated plain text of all nodes."""
+        return "".join(node.plain() for node in self.nodes)
+
+    def to_nodes(self):
+        """Return the list of nodes of this paragraph (not a copy)."""
+        return self.nodes
+
+    def cleanup(self):
+        """Remove leftover file links and surrounding whitespace in place."""
+        # There is a chance that some "thumbnail" links will get through
+        # (mainly if their text also contains links, in which case it'd require
+        # multiple parsing passes). As a quick and dirty fix, we just delete
+        # that stuff here:
+        while match := re.search("\\[\\[File:.+?\\]\\]", self.plain()):
+            start_node, start_pos = self._find_index(match.start())
+            end_node, end_pos = self._find_index(match.end())
+
+            # Keep what precedes the match in the first affected node and
+            # what follows it in the last one; everything between is dropped.
+            new_start = self.nodes[start_node][:start_pos]
+            new_end = self.nodes[end_node][end_pos:]
+            self.nodes[start_node:end_node + 1] = [new_start, new_end]
+
+        # Strip leading and trailing whitespace. Nodes that end up empty
+        # are removed, so the loops continue with their neighbours.
+        while self.nodes and re.match("^\\s+|^$", self.nodes[0].plain()):
+            self.nodes[0] = self.nodes[0].with_text(
+                self.nodes[0].plain().lstrip())
+            if not self.nodes[0].plain():
+                del self.nodes[0]
+        while self.nodes and re.search("\\s+$|^$", self.nodes[-1].plain()):
+            self.nodes[-1] = self.nodes[-1].with_text(
+                self.nodes[-1].plain().rstrip())
+            if not self.nodes[-1].plain():
+                del self.nodes[-1]
+
+    def is_link_paragraph(self) -> bool:
+        """Returns whether the paragraph can be considered a "link item".
+
+        A link item is a paragraph that only consists of a link (and
+        potentially a plural identifier), usually found in the "See also"
+        section on Wikipedia.
+
+        In case of a link paragraph, the first node will be an
+        :class:`InlineLink`.
+
+        Returns:
+            True if the paragraph is a link paragraph.
+        """
+        if not self.nodes:
+            return False
+        return (isinstance(self.nodes[0], InlineLink)
+                and (len(self.nodes) == 1
+                     or len(self.nodes) == 2 and self.nodes[1].plain() == "s")
+                )
+
+
+@dataclass
+class Heading(Block):
+    """A section heading.
+
+    Attributes:
+        level: The nesting level of the heading.
+        text: The text of the heading.
+    """
+    __slots__ = ("level", "text")
+    level: int
+    text: str
+
+    def __bool__(self):
+        # A heading without text is considered empty.
+        return bool(self.text)
+
+    def cleanup(self):
+        # Headings only need their surrounding whitespace removed.
+        stripped = self.text.strip()
+        self.text = stripped
+
+    def append(self, node):
+        # Headings hold plain text only, so the markup is discarded.
+        self.text = self.text + node.plain()
+
+    def plain(self):
+        return self.text
+
+
+@dataclass
+class Verbatim(Block):
+    """A block whose text is emitted unchanged, such as code.
+
+    Attributes:
+        text: The text to emit.
+    """
+    __slots__ = ("text",)
+    text: str
+
+    def __bool__(self):
+        # A verbatim block without text is considered empty.
+        return bool(self.text)
+
+    def append(self, node):
+        # Only the plain text of the node is kept.
+        self.text = self.text + node.plain()
+
+    def plain(self):
+        return self.text
+
+
+@dataclass
+class ItemList(Block):
+    """A list made up of individual items.
+
+    Attributes:
+        items: The items of the list, each of them a :class:`Paragraph`.
+        ordered: Whether the list is an ordered (numbered) list.
+    """
+    __slots__ = ("items", "ordered")
+    items: List[Paragraph]
+    ordered: bool
+
+    def __bool__(self):
+        # A list without items is considered empty.
+        return bool(self.items)
+
+    def new_item(self):
+        """Begin a new, initially empty item at the end of the list."""
+        self.items.append(Paragraph([]))
+
+    def append(self, node):
+        # Nodes always go into the last item; create it on first use.
+        if not self.items:
+            self.new_item()
+        last_item = self.items[-1]
+        last_item.append(node)
+
+    def plain(self):
+        lines = [item.plain() for item in self.items]
+        return "\n".join(lines)
+
+    def to_nodes(self):
+        flattened = []
+        for item in self.items:
+            flattened.extend(item.nodes)
+        return flattened
+
+    def cleanup(self):
+        # Clean every item and keep only those that still have content.
+        # The list object itself is updated in place.
+        survivors = []
+        for item in self.items:
+            item.cleanup()
+            if item:
+                survivors.append(item)
+        self.items[:] = survivors
+
+
+@dataclass
+class BlockQuote(Block):
+    """A quote.
+
+    Attributes:
+        content: The content of the blockquote.
+    """
+    __slots__ = ("content",)
+    content: Paragraph
+
+    def __bool__(self):
+        return bool(self.content)
+
+    def cleanup(self):
+        """Clean up the quoted paragraph.
+
+        Without this, the inherited no-op would leave whitespace-only
+        quotes in the document, since emptiness is judged by the content.
+        """
+        self.content.cleanup()
+
+    def append(self, node):
+        """Append the node to the quoted paragraph, keeping its markup."""
+        self.content.append(node)
+
+    def plain(self):
+        """Return the plain text of the quoted paragraph."""
+        return self.content.plain()
+
+    def to_nodes(self):
+        """Return the nodes of the quoted paragraph.
+
+        This overrides :meth:`Block.to_nodes`, so that the inline markup of
+        the quote is kept instead of being flattened to plain text.
+        """
+        return self.content.to_nodes()
+
+    def to_content(self):
+        """Return the nodes of the quoted paragraph.
+
+        Kept for backwards compatibility, same as :meth:`to_nodes`.
+        """
+        return self.to_nodes()
+
+
+@dataclass
+class BlockLink(Block):
+    """A link that occupies a line of its own.
+
+    Formats such as Gemtext discard inline links, which makes this kind of
+    link important for them.
+
+    Attributes:
+        href: The link target.
+        title: The text of the link.
+    """
+    __slots__ = ("href", "title")
+    href: str
+    title: str
+
+    def append(self, node):
+        # The title is plain text, so the markup of the node is discarded.
+        self.title = self.title + node.plain()
+
+    def plain(self):
+        return self.title
+
+
+def insert_into(blocks: List[Block], node: Node):
+    """Add the given node to the end of the list of blocks.
+
+    The node always goes into the last block; an empty list first receives
+    a fresh :class:`Paragraph`.
+
+    Newlines inside the node are interpreted: a double newline
+    (``\\n\\n``) ends the current block and starts a new paragraph, and
+    inside a list a single newline (``\\n``) starts a new list item.
+
+    The list of blocks is modified in place.
+
+    Args:
+        blocks: The list of blocks.
+        node: The node to insert.
+    """
+    if not blocks:
+        blocks.append(Paragraph([]))
+
+    target = blocks[-1]
+
+    if isinstance(target, Paragraph):
+        text = node.plain()
+        if "\n\n" not in text:
+            target.append(node)
+            return
+        # Split at the first paragraph break and continue with the rest.
+        break_at = text.index("\n\n")
+        head = node[:break_at]
+        tail = node[break_at + 2:]
+        target.append(head)
+        blocks.append(Paragraph([]))
+        insert_into(blocks, tail)
+        return
+
+    if isinstance(target, ItemList):
+        newline = re.search("\\n\\n?", node.plain())
+        if not newline:
+            target.append(node)
+            return
+        head_end, tail_start = newline.span()
+        target.append(node[:head_end])
+        # A double newline ends the list, a single one only the item.
+        if newline.group() == "\n\n":
+            blocks.append(Paragraph([]))
+        else:
+            target.new_item()
+        insert_into(blocks, node[tail_start:])
+        return
+
+    # Every other block type decides by itself how to store the node.
+    target.append(node)
+
+
+def extract_plaintext(obj) -> str:
+    """Tries to extract plaintext from the given object.
+
+    The given object can be one of many things:
+
+    * A list of :class:`Node`
+    * A list of :class:`Block`
+    * A single :class:`Node`
+    * A single :class:`Block`
+    * A :class:`Document`
+
+    This function is useful if you recursively call
+    :meth:`wikimini.Wikimini.convert` and want to include the output in
+    something that only accepts plain text.
+
+    Args:
+        obj: The object to extract the text from.
+
+    Returns:
+        The plain text. Nodes are joined directly, blocks are separated by
+        an empty line. An empty list gives the empty string.
+
+    Raises:
+        TypeError: If the object is not one of the kinds listed above.
+    """
+    if isinstance(obj, Document):
+        return extract_plaintext(obj.blocks)
+
+    if not isinstance(obj, list):
+        obj = [obj]
+
+    if not obj:
+        return ""
+
+    # The first element decides how the list is treated.
+    first = obj[0]
+    if isinstance(first, Node):
+        return "".join(node.plain() for node in obj)
+    # LineBreak is listed explicitly, as it is handled like a block.
+    if isinstance(first, (Block, LineBreak)):
+        return "\n\n".join(block.plain() for block in obj)
+    # Fail loudly instead of silently returning None from a function that
+    # promises a string.
+    raise TypeError(
+        f"Cannot extract plain text from {type(first).__name__}")
diff --git a/wikimini/formats/__init__.py b/wikimini/formats/__init__.py
new file mode 100644
index 0000000..b48486a
--- /dev/null
+++ b/wikimini/formats/__init__.py
@@ -0,0 +1,187 @@
+"""The formats are responsible for turning a
+:class:`~wikimini.document.Document` into an output string.
+
+Formats work by being given a file-like buffer as argument, into which the
+output should be written.
+"""
+import io
+from typing import TextIO, Union
+
+from ..document import (
+    Document, Block, BlockLink, BlockQuote, Heading, ItemList, LineBreak,
+    Paragraph, Verbatim, Node, InlineLink, Plain, Style,
+)
+
+
+class Format:
+    """:class:`Format` is the base class for all output formats.
+
+    Any output format should inherit from this class and override the specific
+    output methods. Note that by default, no output is generated.
+
+    The methods :meth:`render`, :meth:`render_document`, :meth:`render_block`
+    and :meth:`render_node` have sensible default implementations that dispatch
+    to the more specific rendering methods.
+
+    Attributes:
+        writer: The file-like object that output should be written to.
+    """
+    writer: TextIO
+
+    def __init__(self, writer: TextIO):
+        self.writer = writer
+
+    def render(self, obj: Union[Document, Block, Node]):
+        """Renders the given object.
+
+        Args:
+            obj: The object to render.
+
+        Raises:
+            TypeError: If the object is neither a document, a block nor a
+                node.
+        """
+        if isinstance(obj, Document):
+            self.render_document(obj)
+        # LineBreak is named explicitly so that it reaches render_block(),
+        # which knows how to dispatch it.
+        elif isinstance(obj, (Block, LineBreak)):
+            self.render_block(obj)
+        elif isinstance(obj, Node):
+            self.render_node(obj)
+        else:
+            raise TypeError(f"Cannot render {obj}, unknown type")
+
+    def render_document(self, document: Document):
+        """Renders the given document.
+
+        The default implementation renders the blocks one after another.
+
+        Args:
+            document: The document to render.
+        """
+        for block in document:
+            self.render_block(block)
+
+    def render_block(self, block: Block):
+        """Renders a single block.
+
+        Args:
+            block: The block to render.
+
+        Raises:
+            TypeError: If the type of the block is not known.
+        """
+        if isinstance(block, BlockLink):
+            self.render_block_link(block)
+        elif isinstance(block, BlockQuote):
+            self.render_block_quote(block)
+        elif isinstance(block, Heading):
+            self.render_heading(block)
+        elif isinstance(block, ItemList):
+            self.render_item_list(block)
+        elif isinstance(block, LineBreak):
+            self.render_line_break(block)
+        elif isinstance(block, Paragraph):
+            self.render_paragraph(block)
+        elif isinstance(block, Verbatim):
+            self.render_verbatim(block)
+        else:
+            raise TypeError(f"Unknown Block type given: {type(block)}")
+
+    def render_block_link(self, block_link: BlockLink):
+        """Renders a :class:`~wikimini.document.BlockLink`.
+
+        Args:
+            block_link: The block link to render.
+        """
+
+    def render_block_quote(self, block_quote: BlockQuote):
+        """Renders a :class:`~wikimini.document.BlockQuote`.
+
+        Args:
+            block_quote: The block quote to render.
+        """
+
+    def render_heading(self, heading: Heading):
+        """Renders a :class:`~wikimini.document.Heading`.
+
+        Args:
+            heading: The heading to render.
+        """
+
+    def render_item_list(self, item_list: ItemList):
+        """Renders a :class:`~wikimini.document.ItemList`.
+
+        Args:
+            item_list: The item list to render.
+        """
+
+    def render_line_break(self, line_break: LineBreak):
+        """Renders a :class:`~wikimini.document.LineBreak`.
+
+        Args:
+            line_break: The line break to render.
+        """
+
+    def render_paragraph(self, paragraph: Paragraph):
+        """Renders a :class:`~wikimini.document.Paragraph`.
+
+        Args:
+            paragraph: The paragraph to render.
+        """
+
+    def render_verbatim(self, verbatim: Verbatim):
+        """Renders a :class:`~wikimini.document.Verbatim`.
+
+        Args:
+            verbatim: The verbatim to render.
+        """
+
+    def render_node(self, node: Node):
+        """Renders a single node.
+
+        Args:
+            node: The node to render.
+
+        Raises:
+            TypeError: If the type of the node is not known.
+        """
+        if isinstance(node, InlineLink):
+            self.render_inline_link(node)
+        elif isinstance(node, Plain):
+            self.render_plain(node)
+        elif isinstance(node, Style):
+            self.render_style(node)
+        else:
+            raise TypeError(f"Unknown node type: {type(node)}")
+
+    def render_inline_link(self, inline_link: InlineLink):
+        """Renders a :class:`~wikimini.document.InlineLink`.
+
+        Args:
+            inline_link: The inline link to render.
+        """
+
+    def render_plain(self, plain: Plain):
+        """Renders a :class:`~wikimini.document.Plain`.
+
+        Args:
+            plain: The plain text to render.
+        """
+
+    def render_style(self, style: Style):
+        """Renders a :class:`~wikimini.document.Style`.
+
+        Args:
+            style: The styled text to render.
+        """
+
+
+def as_string(formatter: Format, obj: Union[Document, Node, Block]) -> str:
+    """Render the object with the given formatter and return the output.
+
+    For the duration of the call, the writer of the formatter is swapped for
+    an in-memory string buffer. The original writer is put back afterwards,
+    even if rendering fails.
+
+    Args:
+        formatter: The formatter to run.
+        obj: The object to render.
+
+    Returns:
+        Everything the formatter wrote, as a string.
+    """
+    capture = io.StringIO()
+    previous = formatter.writer
+    formatter.writer = capture
+    try:
+        formatter.render(obj)
+    finally:
+        # Always hand the original writer back to the formatter.
+        formatter.writer = previous
+    return capture.getvalue()
diff --git a/wikimini/formats/gemtext.py b/wikimini/formats/gemtext.py
new file mode 100644
index 0000000..39df956
--- /dev/null
+++ b/wikimini/formats/gemtext.py
@@ -0,0 +1,87 @@
+"""This module contains a Gemtext formatter for
+:class:`~wikimini.document.Document`.
+"""
+from itertools import zip_longest
+from . import Format, as_string
+from ..document import LineBreak, BlockLink, InlineLink
+
+
+class Gemtext(Format):
+    """The Gemtext formatter.
+
+    Output is written to ``self.writer``. Inline links are reduced to their
+    title and styled text to its inner content.
+    """
+
+    def render_document(self, document):
+        """Renders every block of the document.
+
+        A newline is written after each block, unless the following block is
+        a line break or a block link. The last block is always followed by a
+        newline.
+        """
+        for block, next_block in zip_longest(
+                document.blocks, document.blocks[1:]):
+            self.render_block(block)
+            # next_block is None for the last block (zip_longest padding).
+            if not isinstance(next_block, (LineBreak, BlockLink)):
+                self.writer.write("\n")
+
+    def render_block_link(self, block_link):
+        """Renders a block link as a ``=> href title`` link line."""
+        self.writer.write(f"=> {block_link.href} {block_link.title}\n")
+
+    def render_block_quote(self, block_quote):
+        """Renders a block quote, prefixing every line with ``> ``."""
+        content = as_string(self, block_quote.content)
+        # The block renderers of this class end their output with a newline.
+        # Strip it before splitting, otherwise the trailing empty string
+        # produces a dangling "> " line after the quote.
+        for line in content.rstrip("\n").split("\n"):
+            self.writer.write(f"> {line}\n")
+
+    def render_heading(self, heading):
+        """Renders a heading line; levels above 3 are rendered as level 3."""
+        level = min(3, heading.level)
+        self.writer.write("#" * level + f" {heading.text}\n")
+
+    def render_inline_link(self, inline_link):
+        """Renders only the title of an inline link; the target is dropped."""
+        self.render(inline_link.title)
+
+    def render_item_list(self, item_list):
+        """Renders a list, one item at a time.
+
+        Items for which ``is_link_paragraph()`` is true become block links
+        (``href`` of their first node, titled with ``item.plain()``); all
+        other items are written as ``* `` bullet lines.
+        """
+        for item in item_list.items:
+            if item.is_link_paragraph():
+                link = item.nodes[0]
+                self.render(BlockLink(link.href, item.plain()))
+            else:
+                self.writer.write("* ")
+                self.render(item)
+
+    def render_line_break(self, _):
+        """Renders a line break as a single newline."""
+        self.writer.write("\n")
+
+    def render_paragraph(self, paragraph):
+        """Renders the paragraph's nodes in order, followed by a newline."""
+        for node in paragraph.nodes:
+            self.render(node)
+        self.writer.write("\n")
+
+    def render_plain(self, plain):
+        """Writes the plain text unchanged."""
+        self.writer.write(plain.text)
+
+    def render_style(self, style):
+        """Renders the styled content without any style markers."""
+        self.render(style.inner)
+
+    def render_verbatim(self, verbatim):
+        """Renders verbatim text enclosed in triple-backtick fence lines."""
+        self.writer.write(f"```\n{verbatim.text}\n```\n")
diff --git a/wikimini/templates/__init__.py b/wikimini/templates/__init__.py
index 9e983e1..360b3fa 100644
--- a/wikimini/templates/__init__.py
+++ b/wikimini/templates/__init__.py
@@ -2,9 +2,9 @@
 
 This module contains functions that mimic Wikipedia's templates.
 
-A template is a function that takes the :class:`~wikimini.Wikimini` instance and the
-:class:`~mwparserfromhell.nodes.template.Template` node to convert, and returns
-a string with the template output (see :const:`Template`).
+A template is a function that takes the :class:`~wikimini.Wikimini` instance
+and the :class:`~mwparserfromhell.nodes.template.Template` node to convert, and
+returns its output as a list of nodes or blocks (see :const:`Template`).
 """
 from typing import Callable, Optional
 
@@ -31,6 +31,7 @@ class Registry:
         Returns:
             The template if found, or :any:`None`.
         """
+        name = name.strip()
         # Are templates case-sensitive?
         #   Yes, except usually the first letter.
         # (https://en.wikipedia.org/wiki/Help:A_quick_guide_to_templates#FAQ)
diff --git a/wikimini/templates/cite.py b/wikimini/templates/cite.py
index ac4f597..e342a5f 100644
--- a/wikimini/templates/cite.py
+++ b/wikimini/templates/cite.py
@@ -1,5 +1,6 @@
 """Citation related templates."""
 from . import registry
+from ..document import Plain
 
 
 def tmpl_citation(wikimini, obj):
@@ -23,7 +24,7 @@ def tmpl_citation(wikimini, obj):
             names.append(last)
         elif first:
             names.append(first)
-    return "{} ({})".format(title, "; ".join(names))
+    return [Plain("{} ({})".format(title, "; ".join(names)))]
 
 
 for name in ["cite", "citation", "cite arXiv", "cite AV media", "cite book",
diff --git a/wikimini/templates/convert.py b/wikimini/templates/convert.py
index a7a3f44..8bab782 100644
--- a/wikimini/templates/convert.py
+++ b/wikimini/templates/convert.py
@@ -1,20 +1,21 @@
 """Implementations for the unit conversion templates."""
 from . import registry
+from ..document import Plain
 
 
 def tmpl_convert(wikimini, obj):
     """Renders the ``{{convert|...}}`` template."""
     if str(obj.params[1]) in {"-", "to"}:
-        return "{0}{3} {1} {2}{3}".format(
+        return [Plain("{0}{3} {1} {2}{3}".format(
             obj.params[0].value.strip_code(),
             obj.params[1].value.strip_code(),
             obj.params[2].value.strip_code(),
             obj.params[3].value.strip_code(),
-        )
-    return "{}{}".format(
+        ))]
+    return [Plain("{}{}".format(
         obj.params[0].value.strip_code(),
         obj.params[1].value.strip_code(),
-    )
+    ))]
 
 
 registry.insert("convert", tmpl_convert)
diff --git a/wikimini/templates/language.py b/wikimini/templates/language.py
index e8ab738..22320da 100644
--- a/wikimini/templates/language.py
+++ b/wikimini/templates/language.py
@@ -1,12 +1,15 @@
 """Language related templates."""
 from . import registry
+from ..document import Plain
 
 import pycountry
 
 
 def tmpl_ipa(wikimini, obj):
     """Renders the ``{{IPA|...}}`` template."""
-    return "pronounced [{}]".format(wikimini._convert(obj.params[0].value))
+    return [Plain("pronounced [{}]".format(
+        wikimini.convert(obj.params[0].value).plain()
+    ))]
 
 
 registry.insert("IPA", tmpl_ipa)
@@ -14,7 +17,7 @@ registry.insert("IPA", tmpl_ipa)
 
 def tmpl_lang(wikimini, obj):
     """Renders the ``{{Lang|...}}`` template."""
-    return wikimini._convert(obj.params[1].value)
+    return wikimini.convert(obj.params[1].value).nodes()
 
 
 registry.insert("lang", tmpl_lang)
@@ -24,18 +27,18 @@ registry.insert("script", tmpl_lang)
 def tmpl_lang_code(language_name):
     """Creates a template renderer for a ``{{lang-xx|...}}`` template."""
     def inner(wikimini, obj):
-        return "{}: {}".format(
-            language_name, wikimini._convert(obj.params[0].value)
-        )
+        return [Plain("{}: {}".format(
+            language_name, wikimini.convert(obj.params[0].value).plain()
+        ))]
     return inner
 
 
 def tmpl_ipa_code(language_name):
     """Creates a template renderer for a ``{{IPA-xx|...}}`` template."""
     def inner(wikimini, obj):
-        return "{} pronunciation: [{}]".format(
-            language_name, wikimini._convert(obj.params[0].value)
-        )
+        return [Plain("{} pronunciation: [{}]".format(
+            language_name, wikimini.convert(obj.params[0].value).plain()
+        ))]
     return inner
 
 
@@ -52,7 +55,7 @@ for language in pycountry.languages:
 def tmpl_country_flag(country):
     """Creates a template renderer for ``{{BRA}}`` country flags."""
     def inner(wikimini, obj):
-        return country
+        return [Plain(country)]
     return inner
 
 
diff --git a/wikimini/templates/mainlinks.py b/wikimini/templates/mainlinks.py
index ffcbc5e..3d945ed 100644
--- a/wikimini/templates/mainlinks.py
+++ b/wikimini/templates/mainlinks.py
@@ -1,14 +1,16 @@
 """Renders templates that link to further articles."""
 from . import registry
 
+from ..document import Paragraph, Plain, BlockLink
+
 
 def tmpl_main(wikimini, obj):
     """Renders the ``{{main|...}}`` template."""
     links = [
-        "=> {} {}".format(wikimini.page_url(str(t.value)), t.value)
+        BlockLink(wikimini.page_url(str(t.value)), t.value.strip_code())
         for t in obj.params
     ]
-    return "Main articles:\n{}\n".format("\n".join(links))
+    return [Paragraph([Plain("Main articles:")])] + links
 
 
 registry.insert("main", tmpl_main)
diff --git a/wikimini/templates/quotes.py b/wikimini/templates/quotes.py
index b51d92d..00b82fb 100644
--- a/wikimini/templates/quotes.py
+++ b/wikimini/templates/quotes.py
@@ -1,15 +1,16 @@
 """Renders various quote related templates."""
 from . import registry
 
+from ..document import BlockQuote, Paragraph
+
 
 def tmpl_quote(wikimini, obj):
-    """Renders the ``{{blockquote|...}}`` template."""
+    """Renders the ``{{blockquote|...}}`` template; empty without ``text``."""
     text = obj.get("text", None)
     if not text:
-        return ""
-    content = wikimini._convert(text.value)
-    lines = content.split("\n")
-    return "\n".join(f"> {line}" for line in lines)
+        return []
+    content = wikimini.convert(text.value).nodes()
+    return [BlockQuote(Paragraph(content))]
 
 
 registry.insert("blockquote", tmpl_quote)
@@ -19,9 +20,8 @@ registry.insert("quote", tmpl_quote)
 def tmpl_cquote(wikimini, obj):
     """Renders the ``{{cquote|...}}`` template."""
     text = obj.params[0]
-    content = wikimini._convert(text.value)
-    lines = content.split("\n")
-    return "\n".join(f"> {line}" for line in lines)
+    content = wikimini.convert(text.value).nodes()
+    return [BlockQuote(Paragraph(content))]
 
 
 registry.insert("cquote", tmpl_cquote)
diff --git a/wikimini/templates/various.py b/wikimini/templates/various.py
index 8c6e0d5..6076ac4 100644
--- a/wikimini/templates/various.py
+++ b/wikimini/templates/various.py
@@ -1,16 +1,17 @@
 """Various small templates."""
 from . import registry
+from ..document import Plain
 
 
 def tmpl_reign(wikimini, obj):
     """Renders the ``{{reign|...}}`` template."""
     if not obj.params:
-        return "r. "
+        return [Plain("r. ")]
     first = obj.params[0].value.strip_code().strip() or "?"
     second = ""
     if len(obj.params) > 1:
         second = obj.params[1].value.strip_code().strip()
-    return f"r. {first} – {second}"
+    return [Plain(f"r. {first} – {second}")]
 
 
 registry.insert("reign", tmpl_reign)
author	Daniel Schadt <kingdread@gmx.de>	2021-08-20 22:04:44 +0200
committer	Daniel Schadt <kingdread@gmx.de>	2021-08-20 22:04:44 +0200
commit	a9a7c6c77f3b6078e317d455b696ce76272b88cb (patch)
tree	4d7649a3efe54378141e5fba6778281d2da4b8b4
parent	469353899ae6d7d0c0b7b105c24baaa4841c6328 (diff)
parent	a114ad49db792ec190a5cb6c96acc47669ac4b03 (diff)
download	wikimini-a9a7c6c77f3b6078e317d455b696ce76272b88cb.tar.gz wikimini-a9a7c6c77f3b6078e317d455b696ce76272b88cb.tar.bz2 wikimini-a9a7c6c77f3b6078e317d455b696ce76272b88cb.zip