diff options
author | Daniel Schadt <kingdread@gmx.de> | 2021-08-20 22:04:44 +0200 |
---|---|---|
committer | Daniel Schadt <kingdread@gmx.de> | 2021-08-20 22:04:44 +0200 |
commit | a9a7c6c77f3b6078e317d455b696ce76272b88cb (patch) | |
tree | 4d7649a3efe54378141e5fba6778281d2da4b8b4 | |
parent | 469353899ae6d7d0c0b7b105c24baaa4841c6328 (diff) | |
parent | a114ad49db792ec190a5cb6c96acc47669ac4b03 (diff) | |
download | wikimini-a9a7c6c77f3b6078e317d455b696ce76272b88cb.tar.gz wikimini-a9a7c6c77f3b6078e317d455b696ce76272b88cb.tar.bz2 wikimini-a9a7c6c77f3b6078e317d455b696ce76272b88cb.zip |
Merge branch 'document-repr'
-rw-r--r-- | wikimini/__init__.py | 208 | ||||
-rw-r--r-- | wikimini/document.py | 490 | ||||
-rw-r--r-- | wikimini/formats/__init__.py | 187 | ||||
-rw-r--r-- | wikimini/formats/gemtext.py | 58 | ||||
-rw-r--r-- | wikimini/templates/__init__.py | 7 | ||||
-rw-r--r-- | wikimini/templates/cite.py | 3 | ||||
-rw-r--r-- | wikimini/templates/convert.py | 9 | ||||
-rw-r--r-- | wikimini/templates/language.py | 21 | ||||
-rw-r--r-- | wikimini/templates/mainlinks.py | 6 | ||||
-rw-r--r-- | wikimini/templates/quotes.py | 12 | ||||
-rw-r--r-- | wikimini/templates/various.py | 5 |
11 files changed, 872 insertions, 134 deletions
diff --git a/wikimini/__init__.py b/wikimini/__init__.py index b93ef81..d6e49a1 100644 --- a/wikimini/__init__.py +++ b/wikimini/__init__.py @@ -1,11 +1,14 @@ import mwparserfromhell as mwp import requests -import re from tabulate import tabulate -from typing import Union, Tuple +from typing import List, Union, Tuple +from .document import ( + Plain, BlockLink, InlineLink, Verbatim, Document, Node, Block, ItemList, + Paragraph, Heading, insert_into, extract_plaintext, +) #: The default API URL, pointing to the english Wikipedia. API_URL = "https://en.wikipedia.org/w/api.php" @@ -77,66 +80,73 @@ class Wikimini: text = revision["slots"]["main"]["content"] return (title, mwp.parse(text)) - def _convert(self, obj): - """Function that does the actual conversion. + def convert( + self, + obj: Union[mwp.wikicode.Wikicode, mwp.nodes.Node], + ) -> Union[Document, List[Node], List[Block]]: + """Function that converts and renders a node. - This is called recursively on each node, and should perform the correct - conversion - based on the node type. + This function is exposed for template implementors, for normal usage, + see :meth:`convert_to_document`. + + The input and output of this function is as follows: + + * If ``obj`` is a :class:`~mwparserfromhell.wikicode.Wikicode`, then + :meth:`convert` will return a :class:`document.Document`. + * If ``obj`` is a :class:`~mwparserfromhell.nodes.Node`, then + :meth:`convert` will return either a list of :class:`document.Node` + or a list of :class:`document.Block`, depending on whether the + converted object is inline (like a link), or a block object (like a + quote). + + Note that in the last case, the empty list ``[]`` might be returned, + indicating that the object should not be included in the output. + + Args: + obj: The object to convert. + + Returns: + The converted object. """ default = lambda obj:\ - mwp.wikicode.Wikicode([obj]).strip_code(collapse=False) + [Plain(mwp.wikicode.Wikicode([obj]).strip_code(collapse=False))] - # This does the actual conversion if isinstance(obj, mwp.wikicode.Wikicode): - converted = [] - iterator = iter(enumerate(obj.nodes)) - for i, node in iterator: - # Pattern: * [[Wikilink]]\n - if (i >= 2 and - i + 1 < len(obj.nodes) and - # Links can have a plural s after them - re.match("s?\n", str(obj.nodes[i+1])) and - isinstance(node, mwp.nodes.wikilink.Wikilink) and - str(obj.nodes[i-1]) == " " and - str(obj.nodes[i-2]) == "*"): - converted.pop() - converted.pop() - _, after = next(iterator) - converted.append("=> {} {}{}".format( - self.page_url(str(node.title)), - self._convert(node), - self._convert(after), - )) - continue - # Pattern: *[[Wikilink]]\n - elif (i >= 1 and - i + 1 < len(obj.nodes) and - re.match("s?\n", str(obj.nodes[i+1])) and - isinstance(node, mwp.nodes.wikilink.Wikilink) and - str(obj.nodes[i-1]) == "*"): - converted.pop() - _, after = next(iterator) - converted.append("=> {} {}{}".format( - self.page_url(str(node.title)), - self._convert(node), - self._convert(after), - )) - continue - - # Default: Just convert the node - converted.append(self._convert(node)) - return "".join(converted) + document = [] + for node in obj.nodes: + current = self.convert(node) + + if current == []: + pass + # Special case: We're starting a list, but we're already in a + # list + elif (document and len(current) == 1 + and isinstance(current[0], ItemList) + and isinstance(document[-1], ItemList) + and document[-1].ordered == current[0].ordered): + pass + # Special case: We're starting a list! + elif len(current) == 1 and isinstance(current[0], ItemList): + document.extend(current) + elif isinstance(current[0], Block): + document.extend(current) + document.append(Paragraph([])) + elif isinstance(current[0], Node): + for c in current: + insert_into(document, c) + return Document(document) elif isinstance(obj, mwp.nodes.heading.Heading): - return "{} {}\n".format("#" * min(obj.level, 3), obj.title.strip_code()) + return [Heading(obj.level, obj.title.strip_code())] elif isinstance(obj, mwp.nodes.tag.Tag): - # Most tags are handled just fine and can be delegated to strip_code - # (inline text styles), however we can do a bit better for list tags. + # Most tags are handled just fine and can be delegated to + # strip_code (inline text styles), however we can do a bit better + # for list tags. if str(obj.wiki_markup) == "*": - return "* " + return [ItemList([], False)] elif str(obj.wiki_markup) == "#": - return "<!NUM!> " + return [ItemList([], True)] elif str(obj.tag) == "ref": - return "" + return [] elif str(obj.tag) == "table": rows = [] header = () @@ -151,22 +161,22 @@ class Wikimini: continue if str(node.tag) == "th": row_is_header = True - parsed.append(self._convert(node.contents).strip()) + parsed.append( + self.convert(node.contents).plain().strip() + ) if not row_is_header: rows.append(parsed) else: header = parsed - return "".join([ - "\n```\n", - tabulate(rows, header, tablefmt=self.table_format), - "\n```\n", - ]) + return [Verbatim( + tabulate(rows, header, tablefmt=self.table_format) + )] else: return default(obj) elif isinstance(obj, mwp.nodes.template.Template): - # Most templates are handled fine (and completely stripped), however, - # some of them are useful and provide some output that we should mimic - # (for example, the convert template). + # Most templates are handled fine (and completely stripped), + # however, some of them are useful and provide some output that we + # should mimic (for example, the convert template). name = str(obj.name) template = templates.registry.get(name) if template is None: @@ -174,51 +184,30 @@ class Wikimini: else: return template(self, obj) elif isinstance(obj, mwp.nodes.wikilink.Wikilink): - if str(obj.title).startswith("File:") or str(obj.text).startswith("thumb|"): - return "" + if (str(obj.title).startswith("File:") + or str(obj.text).startswith("thumb|")): + return [] elif str(obj.title).startswith("Category:"): - return "" + return [] else: - return default(obj) + return [InlineLink( + self.page_url(str(obj.title)), + Plain( + extract_plaintext(self.convert(obj.text)) if obj.text + else str(obj.title) + ), + )] else: return default(obj) - def _postprocess(self, gemtext): - # Strip out any more thumbs that have been left. - # This happens because the wikilinks are nested in each other, which the - # parser would only notice after doing the first replacement. We'll just - # take the easy way out here and use a regex to get rid of them. - gemtext = re.sub("^\\[\\[File:.*?\\]\\]$", "", gemtext, flags=re.MULTILINE) - - # Collapse too many empty lines - while "\n\n\n" in gemtext: - gemtext = gemtext.replace("\n\n\n", "\n\n") - - # Shortcut to avoid unnecessary splitting - if "<!NUM!>" not in gemtext: - return gemtext - - lines = gemtext.split("\n") - counter = 1 - for idx in range(len(lines)): - line = lines[idx] - if line.startswith("<!NUM!>"): - line = line.replace("<!NUM!>", str(counter), 1) - lines[idx] = line - counter += 1 - else: - counter = 1 - return "\n".join(lines) - - - def wikicode_to_gemtext( - self, obj: Union[mwp.nodes.Node, mwp.wikicode.Wikicode] - ) -> str: - """Try to turn the given object into a sensible Gemtext representation. + def convert_to_document(self, obj: mwp.wikicode.Wikicode) -> Document: + """Try to turn the given object into a sensible + :class:`~document.Document` representation. - Note that wikicode is much more powerful than Gemtext, so this is a lossy - function. The returned Gemtext tries to mimic the content of the Wikicode - as much as possible (for human consumption). + Note that wikicode is much more powerful than the internal + representation, so this is a lossy function. The returned document + tries to mimic the content of the Wikicode as much as possible (for + human consumption). This function mostly mimics :meth:`~mwparserfromhell.wikicode.Wikicode.strip_code`, with some @@ -228,21 +217,26 @@ class Wikimini: obj: The object to convert. Returns: - The converted Gemtext. + The converted Document. """ # Avoid calling str() on the whole Wikicode here - if (isinstance(obj, mwp.wikicode.Wikicode) and - str(mwp.wikicode.Wikicode(obj.nodes[:2])) == "#REDIRECT "): + if (isinstance(obj, mwp.wikicode.Wikicode) + and str(mwp.wikicode.Wikicode(obj.nodes[:2])) == "#REDIRECT "): + document = Document() title = str(obj.nodes[2].title) if "#" in title: title, section = title.split("#") section = f"Section '{section}'" else: section = "" - return "Redirect:\n=> {} {}\n{}".format( - self.page_url(title), title, section - ) - return self._postprocess(self._convert(obj)) + document.append(BlockLink(self.page_url(title), title)) + if section: + document.append(Paragraph([Plain(section)])) + return document + + document = self.convert(obj) + document.cleanup() + return document # import at the bottom to avoid circular dependencies diff --git a/wikimini/document.py b/wikimini/document.py new file mode 100644 index 0000000..a363c25 --- /dev/null +++ b/wikimini/document.py @@ -0,0 +1,490 @@ +"""The main class of this module is a :class:`Document`, which holds a parsed +and rendered Wikipedia article. + +We distinguish between two kinds of nodes, similar to HTML: +""" +import re +from dataclasses import dataclass, replace +from typing import List, Union + + +class Document: + """A rendered Wikipedia article. + + Attributes: + blocks (List[Block]): A list of top-level nodes. + """ + __slots__ = ('blocks',) + + def __init__(self, blocks=None): + self.blocks = [] + if blocks: + self.blocks = blocks + + def __iter__(self): + return iter(self.blocks) + + def append(self, block: "Block"): + """Append a block to the document. + + Args: + block: The block to append. + """ + self.blocks.append(block) + + def cleanup(self): + """Clean up the document by cleaning up every contained block. + + See also :meth:`Block.cleanup`. + """ + for block in self.blocks: + block.cleanup() + self.blocks = [block for block in self.blocks if block] + + def nodes(self) -> List["Node"]: + """Discard the block information and return a list of inner nodes. + + Returns: + A list of all inner nodes. + """ + return [node for block in self.blocks for node in block.to_nodes()] + + def plain(self) -> str: + """Returns the plain text content of this document. + + Returns: + The plain text. + """ + return extract_plaintext(self) + + +@dataclass +class Node: + """Base class for all in-line text elements.""" + + def plain(self) -> str: + """Returns the plain text of this node, stripping all markup. + + Returns: + The plain text. + """ + + def with_text(self, text: str) -> "Node": + """Returns a new node that has the same markup, but the given text. + + Args: + text: The new text. + + Returns: + The new node, usually of the same type as the node this function is + called on. + """ + + def __len__(self): + return len(self.plain()) + + def __getitem__(self, index): + if isinstance(index, int): + return self.plain()[index] + elif isinstance(index, slice): + text = self.plain()[index] + return self.with_text(text) + else: + raise TypeError("Node indices must be integers or slices") + + +@dataclass +class Plain(Node): + """A plain text node. + + Attributes: + text: The text content of this node. + """ + __slots__ = ("text",) + text: str + + def plain(self): + return self.text + + def with_text(self, text): + return Plain(text) + + +@dataclass +class Style(Node): + """Text that is styled with inline markup. + + Attributes: + inner: The content. + bold: Whether the text is bold. + italic: Whether the text is cursive. + monospace: Whether the text is monospaced. + """ + __slots__ = ("text", "bold", "italic", "monospace") + inner: Node + bold: bool + italic: bool + monospace: bool + + def plain(self): + return self.inner.plain() + + def with_text(self, text): + return replace(self, inner=self.inner.with_text(text)) + + +@dataclass +class InlineLink(Node): + """An inline link. + + Attributes: + href: The link target. + title: The text that should be shown. + """ + __slots__ = ("href", "title") + href: str + title: Union[Plain, Style] + + def plain(self): + if self.title is None: + return self.href + return self.title.plain() + + def with_text(self, text): + return replace(self, title=self.title.with_text(text)) + + +@dataclass +class Block: + """Base class for all top-level blocks.""" + + def cleanup(self): + """Clean up the content of this block. + + The exact meaning of this is dependent on the type of the block, but it + can involve stripping trailing/leading whitespace or other changes. + + Note that this modifies the block. + """ + + def append(self, node: Node): + """Append the given node to the block. + + Depending on the block, the node can either be inserted as-is (keeping + the markup information), or it is converted to plain text first. + + Args: + node: The node to insert. + """ + + def plain(self) -> str: + """Returns the plain text of this block, stripping all markup. + + Returns: + The plain text. + """ + + def to_nodes(self) -> List[Node]: + """Returns the inner nodes of this block. + + If the block is not made up of nodes, this will create new nodes that + contain the plain text content of this block. + + Returns: + The list of nodes. + """ + return [Plain(self.plain())] + + +@dataclass +class LineBreak: + """Represents an enforced empty line.""" + __slots__ = () + + def plain(self): + return "\n" + + +@dataclass +class Paragraph(Block): + """A paragraph is a piece of text, which itself can hold inline markup.""" + __slots__ = ("nodes",) + nodes: List[Node] + + def _find_index(self, idx): + offset = 0 + for i, node in enumerate(self.nodes): + if idx < offset + len(node): + return (i, idx - offset) + offset += len(node) + if idx == offset: + return (i, len(node)) + raise IndexError(f"{idx} is out of range") + + def __bool__(self): + return bool(self.nodes) + + def append(self, node): + self.nodes.append(node) + + def plain(self): + return "".join(node.plain() for node in self.nodes) + + def to_nodes(self): + return self.nodes + + def cleanup(self): + # There is a chance that some "thumbnail" links will get through + # (mainly if their text also contains links, in which case it'd require + # multiple parsing passes). As a quick and dirty fix, we just delete + # that stuff here: + while match := re.search("\\[\\[File:.+?\\]\\]", self.plain()): + start_node, start_pos = self._find_index(match.start()) + end_node, end_pos = self._find_index(match.end()) + + new_start = self.nodes[start_node][:start_pos] + new_end = self.nodes[end_node][end_pos:] + self.nodes[start_node:end_node + 1] = [new_start, new_end] + + # Strip leading and trailing whitespace + while self.nodes and re.match("^\\s+|^$", self.nodes[0].plain()): + self.nodes[0] = self.nodes[0].with_text( + self.nodes[0].plain().lstrip()) + if not self.nodes[0].plain(): + del self.nodes[0] + while self.nodes and re.search("\\s+$|^$", self.nodes[-1].plain()): + self.nodes[-1] = self.nodes[-1].with_text( + self.nodes[-1].plain().rstrip()) + if not self.nodes[-1].plain(): + del self.nodes[-1] + + def is_link_paragraph(self) -> bool: + """Returns whether the paragraph can be considered a "link item". + + A link item is a paragraph that only consists of a link (and + potentially a plural identifier), usually found in the "See also" + section on Wikipedia. + + In case of a link paragraph, the first node will be an + :class:`InlineLink`. + + Returns: + True if the paragraph is a link paragraph. + """ + if not self.nodes: + return False + return (isinstance(self.nodes[0], InlineLink) + and (len(self.nodes) == 1 + or len(self.nodes) == 2 and self.nodes[1].plain() == "s") + ) + + +@dataclass +class Heading(Block): + """A heading. + + Attributes: + level: The level of the heading. + text: The heading text. + """ + __slots__ = ("level", "text") + level: int + text: str + + def __bool__(self): + return bool(self.text) + + def cleanup(self): + self.text = self.text.strip() + + def append(self, node): + self.text += node.plain() + + def plain(self): + return self.text + + +@dataclass +class Verbatim(Block): + """Text that should appear verbatim in the output, such as code. + + Attributes: + text: The text that should appear. + """ + __slots__ = ("text",) + text: str + + def __bool__(self): + return bool(self.text) + + def append(self, node): + self.text += node.plain() + + def plain(self): + return self.text + + +@dataclass +class ItemList(Block): + """A list of elements. + + Attributes: + items: The list of items. Each item is a :class:`Paragraph`. + ordered: A flag indicating whether the list should be an ordered + (numbered) list. + """ + __slots__ = ("items", "ordered") + items: List[Paragraph] + ordered: bool + + def __bool__(self): + return bool(self.items) + + def new_item(self): + """Start a new item.""" + self.items.append(Paragraph([])) + + def append(self, node): + if not self.items: + self.new_item() + self.items[-1].append(node) + + def plain(self): + return "\n".join(paragraph.plain() for paragraph in self.items) + + def to_nodes(self): + return [node for item in self.items for node in item.nodes] + + def cleanup(self): + i = 0 + while i < len(self.items): + paragraph = self.items[i] + paragraph.cleanup() + if paragraph: + i += 1 + else: + del self.items[i] + + +@dataclass +class BlockQuote(Block): + """A quote. + + Attributes: + content: The content of the blockquote. + """ + __slots__ = ("content",) + content: Paragraph + + def __bool__(self): + return bool(self.content) + + def append(self, node): + self.content.append(node) + + def plain(self): + return self.content.plain() + + def to_content(self): + return self.content.to_nodes() + + +@dataclass +class BlockLink(Block): + """A link on its own line. + + This is important for formats like Gemtext, where inline links will be + discarded. + + Attributes: + href: The target of the link. + title: The link text. + """ + __slots__ = ("href", "title") + href: str + title: str + + def append(self, node): + self.title += node.plain() + + def plain(self): + return self.title + + +def insert_into(blocks: List[Block], node: Node): + """Inserts the given node into the list of blocks. + + The node will always be inserted into the last block. If the list of blocks + is still empty, a fresh :class:`Paragraph` will be started. + + This function takes care of handling newlines properly. That means that a + double newline (``\\n\\n``) will start a new paragraph, and a single + newline (``\\n``) will start a new list item (if the current block is a + list). + + Note that this function will modify the given list of blocks. + + Args: + blocks: The list of blocks. + node: The node to insert. + """ + if not blocks: + blocks.append(Paragraph([])) + + current_block = blocks[-1] + + if isinstance(current_block, Paragraph): + if "\n\n" in node.plain(): + idx = node.plain().index("\n\n") + left = node[:idx] + right = node[idx + 2:] + current_block.append(left) + blocks.append(Paragraph([])) + insert_into(blocks, right) + else: + current_block.append(node) + + elif isinstance(current_block, ItemList): + match = re.search("\\n\\n?", node.plain()) + if not match: + current_block.append(node) + else: + left_end, right_start = match.span() + current_block.append(node[:left_end]) + if match.group() == "\n\n": + blocks.append(Paragraph([])) + else: + current_block.new_item() + insert_into(blocks, node[right_start:]) + + else: + current_block.append(node) + + +def extract_plaintext(obj) -> str: + """Tries to extract plaintext from the given object. + + The given object can be one of many things: + + * A list of :class:`Node` + * A list of :class:`Block` + * A single :class:`Node` + * A single :class:`Block` + * A :class:`Document` + + This function is useful if you recursively call + :meth:`wikimini.Wikimini.convert` and want to include the output in + something that only accepts plain text. + """ + if isinstance(obj, Document): + return extract_plaintext(obj.blocks) + + if not isinstance(obj, list): + obj = [obj] + + if not obj: + return "" + elif isinstance(obj[0], Node): + return "".join(node.plain() for node in obj) + elif isinstance(obj[0], Block): + return "\n\n".join(block.plain() for block in obj) diff --git a/wikimini/formats/__init__.py b/wikimini/formats/__init__.py new file mode 100644 index 0000000..b48486a --- /dev/null +++ b/wikimini/formats/__init__.py @@ -0,0 +1,187 @@ +"""The formats are responsible for turning a +:class:`~wikimini.document.Document` into an output string. + +Formats work by being given a file-like buffer as argument, into which the +output should be written. +""" +import io +from typing import TextIO, Union + +from ..document import ( + Document, Block, BlockLink, BlockQuote, Heading, ItemList, LineBreak, + Paragraph, Verbatim, Node, InlineLink, Plain, Style, +) + + +class Format: + """:class:`Format` is the base class for all output formats. + + Any output format should inherit from this class and override the specific + output methods. Note that by default, no output is generated. + + The methods :meth:`render`, :meth:`render_document`, :meth:`render_block` + and :meth:`render_node` have sensible default implementations that dispatch + to the more specific rendering methods. + + Attributes: + writer: The file-like object that output should be written to. + """ + writer: TextIO + + def __init__(self, writer: TextIO): + self.writer = writer + + def render(self, obj: Union[Document, Block, Node]): + """Renders the given object. + + Args: + obj: The object to render. + """ + if isinstance(obj, Document): + self.render_document(obj) + elif isinstance(obj, Block): + self.render_block(obj) + elif isinstance(obj, Node): + self.render_node(obj) + else: + raise TypeError(f"Cannot render {obj}, unknown type") + + def render_document(self, document: Document): + """Renders the given document. + + Args: + document: The document to render. + """ + for block in document: + self.render_block(block) + + def render_block(self, block: Block): + """Renders a single block. + + Args: + block: The block to render. + """ + if isinstance(block, BlockLink): + self.render_block_link(block) + elif isinstance(block, BlockQuote): + self.render_block_quote(block) + elif isinstance(block, Heading): + self.render_heading(block) + elif isinstance(block, ItemList): + self.render_item_list(block) + elif isinstance(block, LineBreak): + self.render_line_break(block) + elif isinstance(block, Paragraph): + self.render_paragraph(block) + elif isinstance(block, Verbatim): + self.render_verbatim(block) + else: + raise TypeError(f"Unknown Block type given: {type(block)}") + + def render_block_link(self, block_link: BlockLink): + """Renders a :class:`~wikimini.document.BlockLink`. + + Args: + block_link: The block link to render. + """ + + def render_block_quote(self, block_quote: BlockQuote): + """Renders a :class:`~wikimini.document.BlockQuote`. + + Args: + block_quote: The block quote to render. + """ + + def render_heading(self, heading: Heading): + """Renders a :class:`~wikimini.document.Heading`. + + Args: + heading: The heading to render. + """ + + def render_item_list(self, item_list: ItemList): + """Renders a :class:`~wikimini.document.ItemList`. + + Args: + item_list: The item list to render. + """ + + def render_line_break(self, line_break: LineBreak): + """Renders a :class:`~wikimini.document.LineBreak`. + + Args: + line_break: The line break to render. + """ + + def render_paragraph(self, paragraph: Paragraph): + """Renders a :class:`~wikimini.document.Paragraph`. + + Args: + paragraph: The paragraph to render. + """ + + def render_verbatim(self, verbatim: Verbatim): + """Renders a :class:`~wikimini.document.Verbatim`. + + Args: + verbatim: The verbatim to render. + """ + + def render_node(self, node: Node): + """Renders a single node. + + Args: + node: The node to render. + """ + if isinstance(node, InlineLink): + self.render_inline_link(node) + elif isinstance(node, Plain): + self.render_plain(node) + elif isinstance(node, Style): + self.render_style(node) + else: + raise TypeError(f"Unknown node type: {type(node)}") + + def render_inline_link(self, inline_link: InlineLink): + """Renders a :class:`~wikimini.document.InlineLink`. + + Args: + inline_link: The inline link to render. + """ + + def render_plain(self, plain: Plain): + """Renders a :class:`~wikimini.document.Plain`. + + Args: + plain: The plain text to render. + """ + + def render_style(self, style: Style): + """Renders a :class:`~wikimini.document.Style`. + + Args: + style: The styled text to render. + """ + + +def as_string(formatter: Format, obj: Union[Document, Node, Block]) -> str: + """Runs the given format function and returns the result as a string. + + This temporarily replaces the output writer by an in-memory string object, + runs the render function and then restores the writer. + + Args: + formatter: The formatter to run. + obj: The object to render. + + Returns: + The content, as string. + """ + old_writer = formatter.writer + buffer = io.StringIO() + formatter.writer = buffer + try: + formatter.render(obj) + finally: + formatter.writer = old_writer + return buffer.getvalue() diff --git a/wikimini/formats/gemtext.py b/wikimini/formats/gemtext.py new file mode 100644 index 0000000..39df956 --- /dev/null +++ b/wikimini/formats/gemtext.py @@ -0,0 +1,58 @@ +"""This module contains a Gemtext formatter for +:class:`~wikimini.document.Document`. +""" +from itertools import zip_longest +from . import Format, as_string +from ..document import LineBreak, BlockLink, InlineLink + + +class Gemtext(Format): + """The Gemtext formatter.""" + + def render_document(self, document): + for block, next_block in zip_longest( + document.blocks, document.blocks[1:]): + self.render_block(block) + if not isinstance(next_block, (LineBreak, BlockLink)): + self.writer.write("\n") + + def render_block_link(self, block_link): + self.writer.write(f"=> {block_link.href} {block_link.title}\n") + + def render_block_quote(self, block_quote): + content = as_string(self, block_quote.content) + for line in content.split("\n"): + self.writer.write(f"> {line}\n") + + def render_heading(self, heading): + level = min(3, heading.level) + self.writer.write("#" * level + f" {heading.text}\n") + + def render_inline_link(self, inline_link): + self.render(inline_link.title) + + def render_item_list(self, item_list): + for item in item_list.items: + if item.is_link_paragraph(): + link = item.nodes[0] + self.render(BlockLink(link.href, item.plain())) + else: + self.writer.write("* ") + self.render(item) + + def render_line_break(self, _): + self.writer.write("\n") + + def render_paragraph(self, paragraph): + for node in paragraph.nodes: + self.render(node) + self.writer.write("\n") + + def render_plain(self, plain): + self.writer.write(plain.text) + + def render_style(self, style): + self.render(style.inner) + + def render_verbatim(self, verbatim): + self.writer.write(f"```\n{verbatim.text}\n```\n") diff --git a/wikimini/templates/__init__.py b/wikimini/templates/__init__.py index 9e983e1..360b3fa 100644 --- a/wikimini/templates/__init__.py +++ b/wikimini/templates/__init__.py @@ -2,9 +2,9 @@ This module contains functions that mimic Wikipedia's templates. -A template is a function that takes the :class:`~wikimini.Wikimini` instance and the -:class:`~mwparserfromhell.nodes.template.Template` node to convert, and returns -a string with the template output (see :const:`Template`). +A template is a function that takes the :class:`~wikimini.Wikimini` instance +and the :class:`~mwparserfromhell.nodes.template.Template` node to convert, and +returns a string with the template output (see :const:`Template`). """ from typing import Callable, Optional @@ -31,6 +31,7 @@ class Registry: Returns: The template if found, or :any:`None`. """ + name = name.strip() # Are templates case-sensitive? # Yes, except usually the first letter. # (https://en.wikipedia.org/wiki/Help:A_quick_guide_to_templates#FAQ) diff --git a/wikimini/templates/cite.py b/wikimini/templates/cite.py index ac4f597..e342a5f 100644 --- a/wikimini/templates/cite.py +++ b/wikimini/templates/cite.py @@ -1,5 +1,6 @@ """Citation related templates.""" from . import registry +from ..document import Plain def tmpl_citation(wikimini, obj): @@ -23,7 +24,7 @@ def tmpl_citation(wikimini, obj): names.append(last) elif first: names.append(first) - return "{} ({})".format(title, "; ".join(names)) + return [Plain("{} ({})".format(title, "; ".join(names)))] for name in ["cite", "citation", "cite arXiv", "cite AV media", "cite book", diff --git a/wikimini/templates/convert.py b/wikimini/templates/convert.py index a7a3f44..8bab782 100644 --- a/wikimini/templates/convert.py +++ b/wikimini/templates/convert.py @@ -1,20 +1,21 @@ """Implementations for the unit conversion templates.""" from . import registry +from ..document import Plain def tmpl_convert(wikimini, obj): """Renders the ``{{convert|...}}`` template.""" if str(obj.params[1]) in {"-", "to"}: - return "{0}{3} {1} {2}{3}".format( + return [Plain("{0}{3} {1} {2}{3}".format( obj.params[0].value.strip_code(), obj.params[1].value.strip_code(), obj.params[2].value.strip_code(), obj.params[3].value.strip_code(), - ) - return "{}{}".format( + ))] + return [Plain("{}{}".format( obj.params[0].value.strip_code(), obj.params[1].value.strip_code(), - ) + ))] registry.insert("convert", tmpl_convert) diff --git a/wikimini/templates/language.py b/wikimini/templates/language.py index e8ab738..22320da 100644 --- a/wikimini/templates/language.py +++ b/wikimini/templates/language.py @@ -1,12 +1,15 @@ """Language related templates.""" from . import registry +from ..document import Plain import pycountry def tmpl_ipa(wikimini, obj): """Renders the ``{{IPA|...}}`` template.""" - return "pronounced [{}]".format(wikimini._convert(obj.params[0].value)) + return [Plain("pronounced [{}]".format( + wikimini.convert(obj.params[0].value).plain() + ))] registry.insert("IPA", tmpl_ipa) @@ -14,7 +17,7 @@ registry.insert("IPA", tmpl_ipa) def tmpl_lang(wikimini, obj): """Renders the ``{{Lang|...}}`` template.""" - return wikimini._convert(obj.params[1].value) + return wikimini.convert(obj.params[1].value).nodes() registry.insert("lang", tmpl_lang) @@ -24,18 +27,18 @@ registry.insert("script", tmpl_lang) def tmpl_lang_code(language_name): """Creates a template renderer for a ``{{lang-xx|...}}`` template.""" def inner(wikimini, obj): - return "{}: {}".format( - language_name, wikimini._convert(obj.params[0].value) - ) + return [Plain("{}: {}".format( + language_name, wikimini.convert(obj.params[0].value).plain() + ))] return inner def tmpl_ipa_code(language_name): """Creates a template renderer for a ``{{IPA-xx|...}}`` template.""" def inner(wikimini, obj): - return "{} pronunciation: [{}]".format( - language_name, wikimini._convert(obj.params[0].value) - ) + return [Plain("{} pronunciation: [{}]".format( + language_name, wikimini.convert(obj.params[0].value).plain() + ))] return inner @@ -52,7 +55,7 @@ for language in pycountry.languages: def tmpl_country_flag(country): """Creates a template renderer for ``{{BRA}}`` country flags.""" def inner(wikimini, obj): - return country + return [Plain(country)] return inner diff --git a/wikimini/templates/mainlinks.py b/wikimini/templates/mainlinks.py index ffcbc5e..3d945ed 100644 --- a/wikimini/templates/mainlinks.py +++ b/wikimini/templates/mainlinks.py @@ -1,14 +1,16 @@ """Renders templates that link to further articles.""" from . import registry +from ..document import Paragraph, Plain, BlockLink + def tmpl_main(wikimini, obj): """Renders the ``{{main|...}}`` template.""" links = [ - "=> {} {}".format(wikimini.page_url(str(t.value)), t.value) + BlockLink(wikimini.page_url(str(t.value)), t.value.strip_code()) for t in obj.params ] - return "Main articles:\n{}\n".format("\n".join(links)) + return [Paragraph([Plain("Main articles:")])] + links registry.insert("main", tmpl_main) diff --git a/wikimini/templates/quotes.py b/wikimini/templates/quotes.py index b51d92d..00b82fb 100644 --- a/wikimini/templates/quotes.py +++ b/wikimini/templates/quotes.py @@ -1,15 +1,16 @@ """Renders various quote related templates.""" from . import registry +from ..document import BlockQuote, Paragraph + def tmpl_quote(wikimini, obj): """Renders the ``{{blockquote|...}}`` template.""" text = obj.get("text", None) if not text: return "" - content = wikimini._convert(text.value) - lines = content.split("\n") - return "\n".join(f"> {line}" for line in lines) + content = wikimini.convert(text.value).nodes() + return [BlockQuote(Paragraph(content))] registry.insert("blockquote", tmpl_quote) @@ -19,9 +20,8 @@ registry.insert("quote", tmpl_quote) def tmpl_cquote(wikimini, obj): """Renders the ``{{cquote|...}}`` template.""" text = obj.params[0] - content = wikimini._convert(text.value) - lines = content.split("\n") - return "\n".join(f"> {line}" for line in lines) + content = wikimini.convert(text.value).nodes() + return [BlockQuote(Paragraph(content))] registry.insert("cquote", tmpl_cquote) diff --git a/wikimini/templates/various.py b/wikimini/templates/various.py index 8c6e0d5..6076ac4 100644 --- a/wikimini/templates/various.py +++ b/wikimini/templates/various.py @@ -1,16 +1,17 @@ """Various small templates.""" from . import registry +from ..document import Plain def tmpl_reign(wikimini, obj): """Renders the ``{{reign|...}}`` template.""" if not obj.params: - return "r. " + return [Plain("r. ")] first = obj.params[0].value.strip_code().strip() or "?" second = "" if len(obj.params) > 1: second = obj.params[1].value.strip_code().strip() - return f"r. {first} – {second}" + return [Plain(f"r. {first} – {second}")] registry.insert("reign", tmpl_reign) |