summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaniel Schadt <kingdread@gmx.de>2021-08-20 22:04:44 +0200
committerDaniel Schadt <kingdread@gmx.de>2021-08-20 22:04:44 +0200
commita9a7c6c77f3b6078e317d455b696ce76272b88cb (patch)
tree4d7649a3efe54378141e5fba6778281d2da4b8b4
parent469353899ae6d7d0c0b7b105c24baaa4841c6328 (diff)
parenta114ad49db792ec190a5cb6c96acc47669ac4b03 (diff)
downloadwikimini-a9a7c6c77f3b6078e317d455b696ce76272b88cb.tar.gz
wikimini-a9a7c6c77f3b6078e317d455b696ce76272b88cb.tar.bz2
wikimini-a9a7c6c77f3b6078e317d455b696ce76272b88cb.zip
Merge branch 'document-repr'
-rw-r--r--wikimini/__init__.py208
-rw-r--r--wikimini/document.py490
-rw-r--r--wikimini/formats/__init__.py187
-rw-r--r--wikimini/formats/gemtext.py58
-rw-r--r--wikimini/templates/__init__.py7
-rw-r--r--wikimini/templates/cite.py3
-rw-r--r--wikimini/templates/convert.py9
-rw-r--r--wikimini/templates/language.py21
-rw-r--r--wikimini/templates/mainlinks.py6
-rw-r--r--wikimini/templates/quotes.py12
-rw-r--r--wikimini/templates/various.py5
11 files changed, 872 insertions, 134 deletions
diff --git a/wikimini/__init__.py b/wikimini/__init__.py
index b93ef81..d6e49a1 100644
--- a/wikimini/__init__.py
+++ b/wikimini/__init__.py
@@ -1,11 +1,14 @@
import mwparserfromhell as mwp
import requests
-import re
from tabulate import tabulate
-from typing import Union, Tuple
+from typing import List, Union, Tuple
+from .document import (
+ Plain, BlockLink, InlineLink, Verbatim, Document, Node, Block, ItemList,
+ Paragraph, Heading, insert_into, extract_plaintext,
+)
#: The default API URL, pointing to the english Wikipedia.
API_URL = "https://en.wikipedia.org/w/api.php"
@@ -77,66 +80,73 @@ class Wikimini:
text = revision["slots"]["main"]["content"]
return (title, mwp.parse(text))
- def _convert(self, obj):
- """Function that does the actual conversion.
+ def convert(
+ self,
+ obj: Union[mwp.wikicode.Wikicode, mwp.nodes.Node],
+ ) -> Union[Document, List[Node], List[Block]]:
+ """Function that converts and renders a node.
- This is called recursively on each node, and should perform the correct
- conversion - based on the node type.
+ This function is exposed for template implementors, for normal usage,
+ see :meth:`convert_to_document`.
+
+ The input and output of this function is as follows:
+
+ * If ``obj`` is a :class:`~mwparserfromhell.wikicode.Wikicode`, then
+ :meth:`convert` will return a :class:`document.Document`.
+ * If ``obj`` is a :class:`~mwparserfromhell.nodes.Node`, then
+ :meth:`convert` will return either a list of :class:`document.Node`
+ or a list of :class:`document.Block`, depending on whether the
+ converted object is inline (like a link), or a block object (like a
+ quote).
+
+ Note that in the last case, the empty list ``[]`` might be returned,
+ indicating that the object should not be included in the output.
+
+ Args:
+ obj: The object to convert.
+
+ Returns:
+ The converted object.
"""
default = lambda obj:\
- mwp.wikicode.Wikicode([obj]).strip_code(collapse=False)
+ [Plain(mwp.wikicode.Wikicode([obj]).strip_code(collapse=False))]
- # This does the actual conversion
if isinstance(obj, mwp.wikicode.Wikicode):
- converted = []
- iterator = iter(enumerate(obj.nodes))
- for i, node in iterator:
- # Pattern: * [[Wikilink]]\n
- if (i >= 2 and
- i + 1 < len(obj.nodes) and
- # Links can have a plural s after them
- re.match("s?\n", str(obj.nodes[i+1])) and
- isinstance(node, mwp.nodes.wikilink.Wikilink) and
- str(obj.nodes[i-1]) == " " and
- str(obj.nodes[i-2]) == "*"):
- converted.pop()
- converted.pop()
- _, after = next(iterator)
- converted.append("=> {} {}{}".format(
- self.page_url(str(node.title)),
- self._convert(node),
- self._convert(after),
- ))
- continue
- # Pattern: *[[Wikilink]]\n
- elif (i >= 1 and
- i + 1 < len(obj.nodes) and
- re.match("s?\n", str(obj.nodes[i+1])) and
- isinstance(node, mwp.nodes.wikilink.Wikilink) and
- str(obj.nodes[i-1]) == "*"):
- converted.pop()
- _, after = next(iterator)
- converted.append("=> {} {}{}".format(
- self.page_url(str(node.title)),
- self._convert(node),
- self._convert(after),
- ))
- continue
-
- # Default: Just convert the node
- converted.append(self._convert(node))
- return "".join(converted)
+ document = []
+ for node in obj.nodes:
+ current = self.convert(node)
+
+ if current == []:
+ pass
+ # Special case: We're starting a list, but we're already in a
+ # list
+ elif (document and len(current) == 1
+ and isinstance(current[0], ItemList)
+ and isinstance(document[-1], ItemList)
+ and document[-1].ordered == current[0].ordered):
+ pass
+ # Special case: We're starting a list!
+ elif len(current) == 1 and isinstance(current[0], ItemList):
+ document.extend(current)
+ elif isinstance(current[0], Block):
+ document.extend(current)
+ document.append(Paragraph([]))
+ elif isinstance(current[0], Node):
+ for c in current:
+ insert_into(document, c)
+ return Document(document)
elif isinstance(obj, mwp.nodes.heading.Heading):
- return "{} {}\n".format("#" * min(obj.level, 3), obj.title.strip_code())
+ return [Heading(obj.level, obj.title.strip_code())]
elif isinstance(obj, mwp.nodes.tag.Tag):
- # Most tags are handled just fine and can be delegated to strip_code
- # (inline text styles), however we can do a bit better for list tags.
+ # Most tags are handled just fine and can be delegated to
+ # strip_code (inline text styles), however we can do a bit better
+ # for list tags.
if str(obj.wiki_markup) == "*":
- return "* "
+ return [ItemList([], False)]
elif str(obj.wiki_markup) == "#":
- return "<!NUM!> "
+ return [ItemList([], True)]
elif str(obj.tag) == "ref":
- return ""
+ return []
elif str(obj.tag) == "table":
rows = []
header = ()
@@ -151,22 +161,22 @@ class Wikimini:
continue
if str(node.tag) == "th":
row_is_header = True
- parsed.append(self._convert(node.contents).strip())
+ parsed.append(
+ self.convert(node.contents).plain().strip()
+ )
if not row_is_header:
rows.append(parsed)
else:
header = parsed
- return "".join([
- "\n```\n",
- tabulate(rows, header, tablefmt=self.table_format),
- "\n```\n",
- ])
+ return [Verbatim(
+ tabulate(rows, header, tablefmt=self.table_format)
+ )]
else:
return default(obj)
elif isinstance(obj, mwp.nodes.template.Template):
- # Most templates are handled fine (and completely stripped), however,
- # some of them are useful and provide some output that we should mimic
- # (for example, the convert template).
+ # Most templates are handled fine (and completely stripped),
+ # however, some of them are useful and provide some output that we
+ # should mimic (for example, the convert template).
name = str(obj.name)
template = templates.registry.get(name)
if template is None:
@@ -174,51 +184,30 @@ class Wikimini:
else:
return template(self, obj)
elif isinstance(obj, mwp.nodes.wikilink.Wikilink):
- if str(obj.title).startswith("File:") or str(obj.text).startswith("thumb|"):
- return ""
+ if (str(obj.title).startswith("File:")
+ or str(obj.text).startswith("thumb|")):
+ return []
elif str(obj.title).startswith("Category:"):
- return ""
+ return []
else:
- return default(obj)
+ return [InlineLink(
+ self.page_url(str(obj.title)),
+ Plain(
+ extract_plaintext(self.convert(obj.text)) if obj.text
+ else str(obj.title)
+ ),
+ )]
else:
return default(obj)
- def _postprocess(self, gemtext):
- # Strip out any more thumbs that have been left.
- # This happens because the wikilinks are nested in each other, which the
- # parser would only notice after doing the first replacement. We'll just
- # take the easy way out here and use a regex to get rid of them.
- gemtext = re.sub("^\\[\\[File:.*?\\]\\]$", "", gemtext, flags=re.MULTILINE)
-
- # Collapse too many empty lines
- while "\n\n\n" in gemtext:
- gemtext = gemtext.replace("\n\n\n", "\n\n")
-
- # Shortcut to avoid unnecessary splitting
- if "<!NUM!>" not in gemtext:
- return gemtext
-
- lines = gemtext.split("\n")
- counter = 1
- for idx in range(len(lines)):
- line = lines[idx]
- if line.startswith("<!NUM!>"):
- line = line.replace("<!NUM!>", str(counter), 1)
- lines[idx] = line
- counter += 1
- else:
- counter = 1
- return "\n".join(lines)
-
-
- def wikicode_to_gemtext(
- self, obj: Union[mwp.nodes.Node, mwp.wikicode.Wikicode]
- ) -> str:
- """Try to turn the given object into a sensible Gemtext representation.
+ def convert_to_document(self, obj: mwp.wikicode.Wikicode) -> Document:
+ """Try to turn the given object into a sensible
+ :class:`~document.Document` representation.
- Note that wikicode is much more powerful than Gemtext, so this is a lossy
- function. The returned Gemtext tries to mimic the content of the Wikicode
- as much as possible (for human consumption).
+ Note that wikicode is much more powerful than the internal
+ representation, so this is a lossy function. The returned document
+ tries to mimic the content of the Wikicode as much as possible (for
+ human consumption).
This function mostly mimics
:meth:`~mwparserfromhell.wikicode.Wikicode.strip_code`, with some
@@ -228,21 +217,26 @@ class Wikimini:
obj: The object to convert.
Returns:
- The converted Gemtext.
+ The converted Document.
"""
# Avoid calling str() on the whole Wikicode here
- if (isinstance(obj, mwp.wikicode.Wikicode) and
- str(mwp.wikicode.Wikicode(obj.nodes[:2])) == "#REDIRECT "):
+ if (isinstance(obj, mwp.wikicode.Wikicode)
+ and str(mwp.wikicode.Wikicode(obj.nodes[:2])) == "#REDIRECT "):
+ document = Document()
title = str(obj.nodes[2].title)
if "#" in title:
title, section = title.split("#")
section = f"Section '{section}'"
else:
section = ""
- return "Redirect:\n=> {} {}\n{}".format(
- self.page_url(title), title, section
- )
- return self._postprocess(self._convert(obj))
+ document.append(BlockLink(self.page_url(title), title))
+ if section:
+ document.append(Paragraph([Plain(section)]))
+ return document
+
+ document = self.convert(obj)
+ document.cleanup()
+ return document
# import at the bottom to avoid circular dependencies
diff --git a/wikimini/document.py b/wikimini/document.py
new file mode 100644
index 0000000..a363c25
--- /dev/null
+++ b/wikimini/document.py
@@ -0,0 +1,490 @@
+"""The main class of this module is a :class:`Document`, which holds a parsed
+and rendered Wikipedia article.
+
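+We distinguish between two kinds of nodes, similar to HTML:
+
+* Inline elements (:class:`Node` and its subclasses), which carry text and
+  inline markup such as styles and links.
+* Block elements (:class:`Block` and its subclasses), which are the top-level
+  parts of a document, such as paragraphs, headings and lists.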
+"""
+import re
+from dataclasses import dataclass, replace
+from typing import List, Union
+
+
class Document:
    """Container for the blocks that make up a rendered Wikipedia article.

    Attributes:
        blocks (List[Block]): The top-level blocks, in document order.
    """
    __slots__ = ('blocks',)

    def __init__(self, blocks=None):
        # A falsy argument (``None`` or an empty list) yields a fresh list.
        self.blocks = blocks if blocks else []

    def __iter__(self):
        return iter(self.blocks)

    def append(self, block: "Block"):
        """Add a block to the end of the document.

        Args:
            block: The block to add.
        """
        self.blocks.append(block)

    def cleanup(self):
        """Clean up every block, then drop the blocks that are falsy.

        See also :meth:`Block.cleanup`.
        """
        for block in self.blocks:
            block.cleanup()
        self.blocks = list(filter(None, self.blocks))

    def nodes(self) -> List["Node"]:
        """Flatten the document into the inline nodes of all its blocks.

        Returns:
            The nodes of every block, concatenated in document order.
        """
        collected = []
        for block in self.blocks:
            collected.extend(block.to_nodes())
        return collected

    def plain(self) -> str:
        """Return the text of the document without any markup.

        Returns:
            The plain text.
        """
        return extract_plaintext(self)
+
+
@dataclass
class Node:
    """Common base class of the inline (in-line text) elements.

    Subclasses provide :meth:`plain` and :meth:`with_text`; length and
    indexing are derived from the plain text.
    """

    def plain(self) -> str:
        """Return the text of this node with all markup removed.

        Returns:
            The plain text.
        """

    def with_text(self, text: str) -> "Node":
        """Build a node with the same markup as this one but different text.

        Args:
            text: The replacement text.

        Returns:
            The new node, usually of the same type as this node.
        """

    def __len__(self):
        return len(self.plain())

    def __getitem__(self, index):
        # Slices keep the markup, single indices yield a plain character.
        if isinstance(index, slice):
            return self.with_text(self.plain()[index])
        if isinstance(index, int):
            return self.plain()[index]
        raise TypeError("Node indices must be integers or slices")
+
+
@dataclass
class Plain(Node):
    """An inline node that carries text without any markup.

    Attributes:
        text: The text of the node.
    """
    __slots__ = ("text",)
    text: str

    def plain(self):
        # The node has no markup, so the text is already plain.
        return self.text

    def with_text(self, text):
        return Plain(text=text)
+
+
@dataclass
class Style(Node):
    """Text that is styled with inline markup.

    Attributes:
        inner: The content.
        bold: Whether the text is bold.
        italic: Whether the text is cursive.
        monospace: Whether the text is monospaced.
    """
    # The slot names have to match the dataclass fields declared below;
    # the content lives in ``inner`` (there is no ``text`` attribute).
    __slots__ = ("inner", "bold", "italic", "monospace")
    inner: Node
    bold: bool
    italic: bool
    monospace: bool

    def plain(self):
        """Return the plain text of the styled content."""
        return self.inner.plain()

    def with_text(self, text):
        """Return a copy with the same style flags around the new text."""
        return replace(self, inner=self.inner.with_text(text))
+
+
@dataclass
class InlineLink(Node):
    """An inline link.

    Attributes:
        href: The link target.
        title: The text that should be shown, or ``None`` to show the link
            target itself.
    """
    __slots__ = ("href", "title")
    href: str
    title: Union[Plain, Style, None]

    def plain(self):
        """Return the shown text, falling back to the link target."""
        if self.title is None:
            return self.href
        return self.title.plain()

    def with_text(self, text):
        """Return a link to the same target that shows the given text."""
        if self.title is None:
            # There is no title node whose markup could be reused, so the
            # new text becomes a plain title.
            return replace(self, title=Plain(text))
        return replace(self, title=self.title.with_text(text))
+
+
@dataclass
class Block:
    """Common base class of the top-level blocks of a document.

    The methods defined here are the defaults; :meth:`cleanup`,
    :meth:`append` and :meth:`plain` do nothing unless a subclass overrides
    them.
    """

    def cleanup(self):
        """Tidy up the content of this block in place.

        What this means depends on the type of the block; it can involve
        stripping leading/trailing whitespace or other changes.
        """

    def append(self, node: Node):
        """Add the given node to the end of the block.

        Depending on the block, the node is either stored as-is (keeping its
        markup), or converted to plain text first.

        Args:
            node: The node to add.
        """

    def plain(self) -> str:
        """Return the text of this block with all markup removed.

        Returns:
            The plain text.
        """

    def to_nodes(self) -> List[Node]:
        """Return the content of this block as inline nodes.

        Blocks that are not made up of nodes fall back to a single plain
        text node holding :meth:`plain`.

        Returns:
            The list of nodes.
        """
        text = self.plain()
        return [Plain(text)]
+
+
@dataclass
class LineBreak(Block):
    """Represents an enforced empty line.

    A line break is stored between the other blocks of a document, so it
    derives from :class:`Block`: this gives it the no-op ``cleanup`` and
    ``append`` and the default ``to_nodes`` that are invoked on every block
    of a document, and lets formatters dispatch on it like on any block.
    """
    __slots__ = ()

    def plain(self):
        """Return the newline that this break stands for."""
        return "\n"
+
+
@dataclass
class Paragraph(Block):
    """A paragraph is a piece of text, which itself can hold inline markup.

    Attributes:
        nodes: The inline nodes that make up the paragraph, in order.
    """
    __slots__ = ("nodes",)
    nodes: List[Node]

    def _find_index(self, idx):
        """Translate an offset into the plain text to a position in a node.

        Args:
            idx: Offset into the text returned by :meth:`plain`.

        Returns:
            A tuple ``(node_index, offset_in_node)``. The offset just past
            the end of the text maps to the end of the last node.

        Raises:
            IndexError: If ``idx`` is past the end of the text, or the
                paragraph has no nodes.
        """
        offset = 0
        for i, node in enumerate(self.nodes):
            if idx < offset + len(node):
                return (i, idx - offset)
            offset += len(node)
        # Guard against the empty paragraph, where there is no last node to
        # point into.
        if self.nodes and idx == offset:
            return (len(self.nodes) - 1, len(self.nodes[-1]))
        raise IndexError(f"{idx} is out of range")

    def __bool__(self):
        return bool(self.nodes)

    def append(self, node):
        """Append the node as-is, keeping its markup."""
        self.nodes.append(node)

    def plain(self):
        """Return the concatenated plain text of all nodes."""
        return "".join(node.plain() for node in self.nodes)

    def to_nodes(self):
        """Return the nodes of the paragraph (the list itself, not a copy)."""
        return self.nodes

    def cleanup(self):
        """Remove leftover file links and surrounding whitespace in place."""
        # There is a chance that some "thumbnail" links will get through
        # (mainly if their text also contains links, in which case it'd require
        # multiple parsing passes). As a quick and dirty fix, we just delete
        # that stuff here:
        while match := re.search("\\[\\[File:.+?\\]\\]", self.plain()):
            start_node, start_pos = self._find_index(match.start())
            end_node, end_pos = self._find_index(match.end())

            new_start = self.nodes[start_node][:start_pos]
            new_end = self.nodes[end_node][end_pos:]
            # Only keep the remainders that still hold text, so no empty
            # nodes are left behind in the middle of the paragraph.
            self.nodes[start_node:end_node + 1] = [
                piece for piece in (new_start, new_end) if piece.plain()
            ]

        # Strip leading and trailing whitespace
        while self.nodes and re.match("^\\s+|^$", self.nodes[0].plain()):
            self.nodes[0] = self.nodes[0].with_text(
                self.nodes[0].plain().lstrip())
            if not self.nodes[0].plain():
                del self.nodes[0]
        while self.nodes and re.search("\\s+$|^$", self.nodes[-1].plain()):
            self.nodes[-1] = self.nodes[-1].with_text(
                self.nodes[-1].plain().rstrip())
            if not self.nodes[-1].plain():
                del self.nodes[-1]

    def is_link_paragraph(self) -> bool:
        """Returns whether the paragraph can be considered a "link item".

        A link item is a paragraph that only consists of a link (and
        potentially a plural identifier), usually found in the "See also"
        section on Wikipedia.

        In case of a link paragraph, the first node will be an
        :class:`InlineLink`.

        Returns:
            True if the paragraph is a link paragraph.
        """
        if not self.nodes:
            return False
        return (isinstance(self.nodes[0], InlineLink)
                and (len(self.nodes) == 1
                     or len(self.nodes) == 2 and self.nodes[1].plain() == "s")
                )
+
+
@dataclass
class Heading(Block):
    """A section heading.

    Attributes:
        level: The level of the heading.
        text: The text of the heading.
    """
    __slots__ = ("level", "text")
    level: int
    text: str

    def __bool__(self):
        # A heading without text counts as empty.
        return bool(self.text)

    def cleanup(self):
        stripped = self.text.strip()
        self.text = stripped

    def append(self, node):
        # Headings only hold text, so the markup of the node is dropped.
        self.text = self.text + node.plain()

    def plain(self):
        return self.text
+
+
@dataclass
class Verbatim(Block):
    """A block whose text should show up unchanged in the output (e.g. code).

    Attributes:
        text: The text to output.
    """
    __slots__ = ("text",)
    text: str

    def __bool__(self):
        # A verbatim block without text counts as empty.
        return bool(self.text)

    def append(self, node):
        # Only the plain text of the node is kept.
        self.text = self.text + node.plain()

    def plain(self):
        return self.text
+
+
@dataclass
class ItemList(Block):
    """A list of items.

    Attributes:
        items: The items of the list. Each item is a :class:`Paragraph`.
        ordered: Whether the list is an ordered (numbered) list.
    """
    __slots__ = ("items", "ordered")
    items: List[Paragraph]
    ordered: bool

    def __bool__(self):
        # A list without items counts as empty.
        return bool(self.items)

    def new_item(self):
        """Begin a new, initially empty item."""
        self.items.append(Paragraph([]))

    def append(self, node):
        """Append the node to the last item, creating one if needed."""
        if not self.items:
            self.new_item()
        last_item = self.items[-1]
        last_item.append(node)

    def plain(self):
        """Return the plain text of all items, one item per line."""
        lines = [item.plain() for item in self.items]
        return "\n".join(lines)

    def to_nodes(self):
        """Return the nodes of all items in one flat list."""
        flattened = []
        for item in self.items:
            flattened.extend(item.nodes)
        return flattened

    def cleanup(self):
        """Clean up every item and remove the items that end up empty."""
        kept = []
        for item in self.items:
            item.cleanup()
            if item:
                kept.append(item)
        # Update in place so that the list object itself stays the same.
        self.items[:] = kept
+
+
@dataclass
class BlockQuote(Block):
    """A quote.

    Attributes:
        content: The content of the blockquote.
    """
    __slots__ = ("content",)
    content: Paragraph

    def __bool__(self):
        return bool(self.content)

    def cleanup(self):
        """Clean up the quoted paragraph."""
        self.content.cleanup()

    def append(self, node):
        """Append the node to the quoted paragraph, keeping its markup."""
        self.content.append(node)

    def plain(self):
        """Return the plain text of the quoted paragraph."""
        return self.content.plain()

    def to_nodes(self):
        """Return the nodes of the quoted paragraph, keeping their markup."""
        return self.content.to_nodes()

    def to_content(self):
        """Same as :meth:`to_nodes`; kept so existing callers keep working."""
        return self.to_nodes()
+
+
@dataclass
class BlockLink(Block):
    """A link that stands on a line of its own.

    Formats like Gemtext discard inline links, so links that should be
    followable there need to be block links.

    Attributes:
        href: The link target.
        title: The text of the link.
    """
    __slots__ = ("href", "title")
    href: str
    title: str

    def append(self, node):
        # The title is plain text, so the markup of the node is dropped.
        self.title = self.title + node.plain()

    def plain(self):
        return self.title
+
+
def insert_into(blocks: List[Block], node: Node):
    """Add the given node to the end of a list of blocks.

    The node always goes into the last block; an empty list first gets a
    fresh :class:`Paragraph`.

    Newlines in the node text are interpreted along the way: a double
    newline (``\\n\\n``) ends the current paragraph or list and starts a new
    paragraph, and a single newline (``\\n``) inside a list starts a new list
    item. The text after the newline is inserted recursively.

    The list of blocks is modified in place.

    Args:
        blocks: The list of blocks.
        node: The node to insert.
    """
    if not blocks:
        blocks.append(Paragraph([]))

    target = blocks[-1]

    if isinstance(target, Paragraph):
        text = node.plain()
        if "\n\n" not in text:
            target.append(node)
            return
        split_at = text.index("\n\n")
        head = node[:split_at]
        tail = node[split_at + 2:]
        target.append(head)
        blocks.append(Paragraph([]))
        insert_into(blocks, tail)
        return

    if isinstance(target, ItemList):
        newline = re.search("\\n\\n?", node.plain())
        if not newline:
            target.append(node)
            return
        target.append(node[:newline.start()])
        if newline.group() == "\n\n":
            # A blank line ends the list.
            blocks.append(Paragraph([]))
        else:
            target.new_item()
        insert_into(blocks, node[newline.end():])
        return

    # Any other block takes the node without looking at newlines.
    target.append(node)
+
+
def extract_plaintext(obj) -> str:
    """Try to get the plain text out of the given object.

    Accepted inputs:

    * A :class:`Document`
    * A single :class:`Node` or :class:`Block`
    * A list of :class:`Node` (texts are joined without separator)
    * A list of :class:`Block` (texts are joined by an empty line)

    The type of the first list element decides how the list is joined. An
    empty list gives the empty string; for anything else, nothing is
    returned.

    This is handy when the output of a recursive call to
    :meth:`wikimini.Wikimini.convert` has to go into something that only
    takes plain text.
    """
    if isinstance(obj, Document):
        return extract_plaintext(obj.blocks)

    items = obj if isinstance(obj, list) else [obj]
    if not items:
        return ""

    first = items[0]
    if isinstance(first, Node):
        separator = ""
    elif isinstance(first, Block):
        separator = "\n\n"
    else:
        return None
    return separator.join(item.plain() for item in items)
diff --git a/wikimini/formats/__init__.py b/wikimini/formats/__init__.py
new file mode 100644
index 0000000..b48486a
--- /dev/null
+++ b/wikimini/formats/__init__.py
@@ -0,0 +1,187 @@
+"""The formats are responsible for turning a
+:class:`~wikimini.document.Document` into an output string.
+
+Formats work by being given a file-like buffer as argument, into which the
+output should be written.
+"""
+import io
+from typing import TextIO, Union
+
+from ..document import (
+ Document, Block, BlockLink, BlockQuote, Heading, ItemList, LineBreak,
+ Paragraph, Verbatim, Node, InlineLink, Plain, Style,
+)
+
+
class Format:
    """Base class of all output formats.

    An output format inherits from this class and overrides the specific
    ``render_*`` methods; the ones defined here produce no output.

    :meth:`render`, :meth:`render_document`, :meth:`render_block` and
    :meth:`render_node` come with default implementations that dispatch to
    the more specific rendering methods.

    Attributes:
        writer: The file-like object that the output is written to.
    """
    writer: TextIO

    def __init__(self, writer: TextIO):
        self.writer = writer

    def render(self, obj: Union[Document, Block, Node]):
        """Render a document, block or node.

        Args:
            obj: The object to render.

        Raises:
            TypeError: If the object is none of the supported kinds.
        """
        for kind, handler in (
            (Document, self.render_document),
            (Block, self.render_block),
            (Node, self.render_node),
        ):
            if isinstance(obj, kind):
                handler(obj)
                return
        raise TypeError(f"Cannot render {obj}, unknown type")

    def render_document(self, document: Document):
        """Render every block of the document in order.

        Args:
            document: The document to render.
        """
        for block in document:
            self.render_block(block)

    def render_block(self, block: Block):
        """Render one block by dispatching on its type.

        Args:
            block: The block to render.

        Raises:
            TypeError: If the type of the block is not known.
        """
        for kind, handler in (
            (BlockLink, self.render_block_link),
            (BlockQuote, self.render_block_quote),
            (Heading, self.render_heading),
            (ItemList, self.render_item_list),
            (LineBreak, self.render_line_break),
            (Paragraph, self.render_paragraph),
            (Verbatim, self.render_verbatim),
        ):
            if isinstance(block, kind):
                handler(block)
                return
        raise TypeError(f"Unknown Block type given: {type(block)}")

    def render_block_link(self, block_link: BlockLink):
        """Render a :class:`~wikimini.document.BlockLink`.

        Args:
            block_link: The block link to render.
        """

    def render_block_quote(self, block_quote: BlockQuote):
        """Render a :class:`~wikimini.document.BlockQuote`.

        Args:
            block_quote: The block quote to render.
        """

    def render_heading(self, heading: Heading):
        """Render a :class:`~wikimini.document.Heading`.

        Args:
            heading: The heading to render.
        """

    def render_item_list(self, item_list: ItemList):
        """Render a :class:`~wikimini.document.ItemList`.

        Args:
            item_list: The item list to render.
        """

    def render_line_break(self, line_break: LineBreak):
        """Render a :class:`~wikimini.document.LineBreak`.

        Args:
            line_break: The line break to render.
        """

    def render_paragraph(self, paragraph: Paragraph):
        """Render a :class:`~wikimini.document.Paragraph`.

        Args:
            paragraph: The paragraph to render.
        """

    def render_verbatim(self, verbatim: Verbatim):
        """Render a :class:`~wikimini.document.Verbatim`.

        Args:
            verbatim: The verbatim block to render.
        """

    def render_node(self, node: Node):
        """Render one inline node by dispatching on its type.

        Args:
            node: The node to render.

        Raises:
            TypeError: If the type of the node is not known.
        """
        for kind, handler in (
            (InlineLink, self.render_inline_link),
            (Plain, self.render_plain),
            (Style, self.render_style),
        ):
            if isinstance(node, kind):
                handler(node)
                return
        raise TypeError(f"Unknown node type: {type(node)}")

    def render_inline_link(self, inline_link: InlineLink):
        """Render a :class:`~wikimini.document.InlineLink`.

        Args:
            inline_link: The inline link to render.
        """

    def render_plain(self, plain: Plain):
        """Render a :class:`~wikimini.document.Plain`.

        Args:
            plain: The plain text to render.
        """

    def render_style(self, style: Style):
        """Render a :class:`~wikimini.document.Style`.

        Args:
            style: The styled text to render.
        """
+
+
def as_string(formatter: Format, obj: Union[Document, Node, Block]) -> str:
    """Render the object with the given formatter and return the output.

    While rendering, the writer of the formatter is swapped for an in-memory
    text buffer. The previous writer is put back afterwards, even if
    rendering raises.

    Args:
        formatter: The formatter to run.
        obj: The object to render.

    Returns:
        Everything the formatter wrote, as a string.
    """
    previous = formatter.writer
    captured = io.StringIO()
    formatter.writer = captured
    try:
        formatter.render(obj)
        return captured.getvalue()
    finally:
        formatter.writer = previous
diff --git a/wikimini/formats/gemtext.py b/wikimini/formats/gemtext.py
new file mode 100644
index 0000000..39df956
--- /dev/null
+++ b/wikimini/formats/gemtext.py
@@ -0,0 +1,58 @@
+"""This module contains a Gemtext formatter for
+:class:`~wikimini.document.Document`.
+"""
+from itertools import zip_longest
+from . import Format, as_string
+from ..document import LineBreak, BlockLink, InlineLink
+
+
class Gemtext(Format):
    """The Gemtext formatter."""

    def render_document(self, document):
        """Render all blocks, separated by empty lines.

        No separating line is written when the following block is a
        :class:`LineBreak` or a :class:`BlockLink`.
        """
        for block, next_block in zip_longest(
                document.blocks, document.blocks[1:]):
            self.render_block(block)
            if not isinstance(next_block, (LineBreak, BlockLink)):
                self.writer.write("\n")

    def render_block_link(self, block_link):
        """Write the link as a ``=>`` link line."""
        self.writer.write(f"=> {block_link.href} {block_link.title}\n")

    def render_block_quote(self, block_quote):
        """Write every line of the quoted paragraph with a ``> `` prefix."""
        content = as_string(self, block_quote.content)
        # A rendered paragraph ends with a newline. Drop it before
        # splitting, otherwise every quote is followed by an empty "> " line.
        for line in content.rstrip("\n").split("\n"):
            self.writer.write(f"> {line}\n")

    def render_heading(self, heading):
        """Write the heading with one to three ``#`` (deeper levels use 3)."""
        level = min(3, heading.level)
        self.writer.write("#" * level + f" {heading.text}\n")

    def render_inline_link(self, inline_link):
        """Write only the title of the link; the target is dropped."""
        self.render(inline_link.title)

    def render_item_list(self, item_list):
        """Write one line per item.

        Items that only consist of a link become link lines, all other items
        become ``* `` bullet lines.
        """
        for item in item_list.items:
            if item.is_link_paragraph():
                link = item.nodes[0]
                self.render(BlockLink(link.href, item.plain()))
            else:
                self.writer.write("* ")
                self.render(item)

    def render_line_break(self, _):
        """Write an empty line."""
        self.writer.write("\n")

    def render_paragraph(self, paragraph):
        """Write all nodes of the paragraph, followed by a newline."""
        for node in paragraph.nodes:
            self.render(node)
        self.writer.write("\n")

    def render_plain(self, plain):
        """Write the text unchanged."""
        self.writer.write(plain.text)

    def render_style(self, style):
        """Write the styled content without any style markers."""
        self.render(style.inner)

    def render_verbatim(self, verbatim):
        """Write the text between preformatting toggle lines."""
        self.writer.write(f"```\n{verbatim.text}\n```\n")
diff --git a/wikimini/templates/__init__.py b/wikimini/templates/__init__.py
index 9e983e1..360b3fa 100644
--- a/wikimini/templates/__init__.py
+++ b/wikimini/templates/__init__.py
@@ -2,9 +2,9 @@
This module contains functions that mimic Wikipedia's templates.
-A template is a function that takes the :class:`~wikimini.Wikimini` instance and the
-:class:`~mwparserfromhell.nodes.template.Template` node to convert, and returns
-a string with the template output (see :const:`Template`).
+A template is a function that takes the :class:`~wikimini.Wikimini` instance
+and the :class:`~mwparserfromhell.nodes.template.Template` node to convert, and
+returns the template output as a list of document nodes or blocks (see
+:const:`Template`).
"""
from typing import Callable, Optional
@@ -31,6 +31,7 @@ class Registry:
Returns:
The template if found, or :any:`None`.
"""
+ name = name.strip()
# Are templates case-sensitive?
# Yes, except usually the first letter.
# (https://en.wikipedia.org/wiki/Help:A_quick_guide_to_templates#FAQ)
diff --git a/wikimini/templates/cite.py b/wikimini/templates/cite.py
index ac4f597..e342a5f 100644
--- a/wikimini/templates/cite.py
+++ b/wikimini/templates/cite.py
@@ -1,5 +1,6 @@
"""Citation related templates."""
from . import registry
+from ..document import Plain
def tmpl_citation(wikimini, obj):
@@ -23,7 +24,7 @@ def tmpl_citation(wikimini, obj):
names.append(last)
elif first:
names.append(first)
- return "{} ({})".format(title, "; ".join(names))
+ return [Plain("{} ({})".format(title, "; ".join(names)))]
for name in ["cite", "citation", "cite arXiv", "cite AV media", "cite book",
diff --git a/wikimini/templates/convert.py b/wikimini/templates/convert.py
index a7a3f44..8bab782 100644
--- a/wikimini/templates/convert.py
+++ b/wikimini/templates/convert.py
@@ -1,20 +1,21 @@
"""Implementations for the unit conversion templates."""
from . import registry
+from ..document import Plain
def tmpl_convert(wikimini, obj):
    """Renders the ``{{convert|...}}`` template.

    If the second parameter is ``-`` or ``to``, the first four parameters
    are taken as a range (from, separator, to, unit); otherwise the first
    two parameters are joined.
    """
    def stripped(index):
        # Plain text of the parameter at the given position.
        return obj.params[index].value.strip_code()

    if str(obj.params[1]) not in {"-", "to"}:
        return [Plain("{}{}".format(stripped(0), stripped(1)))]
    return [Plain("{0}{3} {1} {2}{3}".format(
        stripped(0), stripped(1), stripped(2), stripped(3),
    ))]
registry.insert("convert", tmpl_convert)
diff --git a/wikimini/templates/language.py b/wikimini/templates/language.py
index e8ab738..22320da 100644
--- a/wikimini/templates/language.py
+++ b/wikimini/templates/language.py
@@ -1,12 +1,15 @@
"""Language related templates."""
from . import registry
+from ..document import Plain
import pycountry
def tmpl_ipa(wikimini, obj):
    """Renders the ``{{IPA|...}}`` template.

    The first parameter is converted to plain text and put in brackets.
    """
    pronunciation = wikimini.convert(obj.params[0].value).plain()
    return [Plain("pronounced [{}]".format(pronunciation))]
registry.insert("IPA", tmpl_ipa)
@@ -14,7 +17,7 @@ registry.insert("IPA", tmpl_ipa)
def tmpl_lang(wikimini, obj):
    """Renders the ``{{Lang|...}}`` template.

    The output consists of the converted nodes of the second parameter.
    """
    document = wikimini.convert(obj.params[1].value)
    return document.nodes()
registry.insert("lang", tmpl_lang)
@@ -24,18 +27,18 @@ registry.insert("script", tmpl_lang)
def tmpl_lang_code(language_name):
    """Creates a template renderer for a ``{{lang-xx|...}}`` template.

    The renderer prefixes the plain text of the first parameter with the
    given language name.
    """
    def inner(wikimini, obj):
        text = wikimini.convert(obj.params[0].value).plain()
        return [Plain("{}: {}".format(language_name, text))]
    return inner
def tmpl_ipa_code(language_name):
    """Creates a template renderer for a ``{{IPA-xx|...}}`` template.

    The renderer puts the plain text of the first parameter in brackets,
    labelled with the given language name.
    """
    def inner(wikimini, obj):
        text = wikimini.convert(obj.params[0].value).plain()
        return [Plain("{} pronunciation: [{}]".format(language_name, text))]
    return inner
@@ -52,7 +55,7 @@ for language in pycountry.languages:
def tmpl_country_flag(country):
    """Creates a template renderer for ``{{BRA}}`` country flags.

    The renderer ignores the template content and outputs the given country
    as plain text.
    """
    def inner(wikimini, obj):
        flag_text = Plain(country)
        return [flag_text]
    return inner
diff --git a/wikimini/templates/mainlinks.py b/wikimini/templates/mainlinks.py
index ffcbc5e..3d945ed 100644
--- a/wikimini/templates/mainlinks.py
+++ b/wikimini/templates/mainlinks.py
@@ -1,14 +1,16 @@
"""Renders templates that link to further articles."""
from . import registry
+from ..document import Paragraph, Plain, BlockLink
+
def tmpl_main(wikimini, obj):
    """Renders the ``{{main|...}}`` template.

    The output is an introducing paragraph, followed by one block link per
    template parameter.
    """
    blocks = [Paragraph([Plain("Main articles:")])]
    for param in obj.params:
        target = wikimini.page_url(str(param.value))
        blocks.append(BlockLink(target, param.value.strip_code()))
    return blocks
registry.insert("main", tmpl_main)
diff --git a/wikimini/templates/quotes.py b/wikimini/templates/quotes.py
index b51d92d..00b82fb 100644
--- a/wikimini/templates/quotes.py
+++ b/wikimini/templates/quotes.py
@@ -1,15 +1,16 @@
"""Renders various quote related templates."""
from . import registry
+from ..document import BlockQuote, Paragraph
+
def tmpl_quote(wikimini, obj):
    """Renders the ``{{blockquote|...}}`` template.

    Returns:
        A list with a single block quote holding the converted ``text``
        parameter, or the empty list if the template has no (or an empty)
        ``text`` parameter.
    """
    text = obj.get("text", None)
    if not text:
        # The converter only skips the empty list; other empty values (such
        # as the empty string) would be indexed there and raise.
        return []
    content = wikimini.convert(text.value).nodes()
    return [BlockQuote(Paragraph(content))]
registry.insert("blockquote", tmpl_quote)
@@ -19,9 +20,8 @@ registry.insert("quote", tmpl_quote)
def tmpl_cquote(wikimini, obj):
    """Renders the ``{{cquote|...}}`` template.

    The first parameter is converted and wrapped in a block quote.
    """
    quoted = wikimini.convert(obj.params[0].value).nodes()
    paragraph = Paragraph(quoted)
    return [BlockQuote(paragraph)]
registry.insert("cquote", tmpl_cquote)
diff --git a/wikimini/templates/various.py b/wikimini/templates/various.py
index 8c6e0d5..6076ac4 100644
--- a/wikimini/templates/various.py
+++ b/wikimini/templates/various.py
@@ -1,16 +1,17 @@
"""Various small templates."""
from . import registry
+from ..document import Plain
def tmpl_reign(wikimini, obj):
    """Renders the ``{{reign|...}}`` template.

    Without parameters only the ``r. `` prefix is output. Otherwise the
    first parameter (``?`` if empty) and the optional second parameter are
    joined by a dash.
    """
    params = obj.params
    if not params:
        return [Plain("r. ")]
    first = params[0].value.strip_code().strip() or "?"
    second = params[1].value.strip_code().strip() if len(params) > 1 else ""
    return [Plain(f"r. {first} – {second}")]
registry.insert("reign", tmpl_reign)