# wikimini.py
#
# From b688a103d5266ebcbccc9d23a334af02102dffc7 Mon Sep 17 00:00:00 2001
# From: Daniel Schadt
# Date: Mon, 16 Aug 2021 15:06:09 +0200
# Subject: Initial commit
#
# This is not even a proper Python package yet, but the output is
# surprisingly good already, so I'd like to take this version and save it.
"""Fetch Wikipedia articles and turn their wikicode into Gemtext."""

import re
from typing import Optional, Union

import mwparserfromhell as mwp
import requests
from tabulate import tabulate


API_URL = "https://en.wikipedia.org/w/api.php"

# See https://pypi.org/project/tabulate/ for a reference of table formats.
TABLEFMT = "fancy_grid"

# Marker that _convert() emits in front of every ordered-list ("#") item and
# that _postprocess() later replaces with the running item number. It must be
# a non-empty string: an empty marker is a prefix of *every* line, which would
# make _postprocess() number all lines of the document.
_NUM_PLACEHOLDER = "\x00NUM\x00"


def page_url(title: str) -> str:
    """Return the link target for a page title (spaces become underscores)."""
    return title.replace(" ", "_")


def retrieve(title: str, timeout: float = 30.0) -> mwp.wikicode.Wikicode:
    """Download the current wikicode of a page and parse it.

    Args:
        title: Title of the page to fetch from ``API_URL``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The parsed content of the main slot of the latest revision.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
        KeyError, IndexError: If the JSON answer does not contain the
            expected ``query -> pages -> revisions`` structure.
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "content",
        "rvslots": "main",
        "rvlimit": 1,
        "titles": title,
        "format": "json",
        "formatversion": "2",
    }
    headers = {"User-Agent": "Wikimini/1.0"}
    # Without a timeout, requests may block forever on a stalled connection.
    response = requests.get(
        API_URL, headers=headers, params=params, timeout=timeout
    )
    response.raise_for_status()
    page = response.json()["query"]["pages"][0]
    revision = page["revisions"][0]
    return mwp.parse(revision["slots"]["main"]["content"])


def render_convert(obj: mwp.nodes.template.Template) -> str:
    """Renders the {{convert|...}} template.

    ``{{convert|A|-|B|unit}}`` / ``{{convert|A|to|B|unit}}`` are rendered as
    ``"Aunit - Bunit"``; everything else as the first two parameters glued
    together (value followed by unit). Templates with fewer parameters than
    the chosen form needs are rendered from whatever is there instead of
    raising an ``IndexError``.
    """
    values = [param.value.strip_code() for param in obj.params]
    is_range = len(values) > 1 and str(obj.params[1]) in {"-", "to"}
    if is_range:
        if len(values) >= 4:
            low, separator, high, unit = values[:4]
            return f"{low}{unit} {separator} {high}{unit}"
        # Incomplete range: keep the readable "A to B" part without a unit.
        return " ".join(values)
    return "".join(values[:2])


def render_reign(obj: mwp.nodes.template.Template) -> str:
    """Renders the {{reign|...}} template.

    The first parameter is the start of the reign ("?" if empty), the
    optional second parameter is its end.
    """
    if not obj.params:
        return "r. "
    first = obj.params[0].value.strip_code().strip() or "?"
    second = ""
    if len(obj.params) > 1:
        second = obj.params[1].value.strip_code().strip()
    return f"r. {first} – {second}"


def _param_text(
        obj: mwp.nodes.template.Template, name: str
    ) -> Optional[str]:
    """Return the stripped text of the named parameter, or None if absent."""
    param = obj.get(name, None)
    if not param:
        return None
    return param.value.strip_code().strip()


def render_cite_book(obj: mwp.nodes.template.Template) -> str:
    """Renders the {{cite book|...}} template.

    The result has the form ``"Title (Last, First; Last, First; ...)"``.
    A missing or empty title is rendered as "Untitled".
    """
    title = _param_text(obj, "title") or "Untitled"
    names = []
    # "%" is substituted with "last"/"first" to build the parameter names,
    # e.g. "last", "last1", ..., "editor1-last".
    for pattern in ["%", "%1", "%2", "%3", "%4", "%5", "editor1-%"]:
        last = _param_text(obj, pattern.replace("%", "last"))
        first = _param_text(obj, pattern.replace("%", "first"))
        if last and first:
            names.append(f"{last}, {first}")
        elif last:
            names.append(last)
        elif first:
            names.append(first)
    return "{} ({})".format(title, "; ".join(names))


def _strip(obj: mwp.nodes.Node) -> str:
    """Render a single node with mwparserfromhell's own strip_code()."""
    return mwp.wikicode.Wikicode([obj]).strip_code(collapse=False)


def _convert_nodes(code: mwp.wikicode.Wikicode) -> str:
    """Convert every node of *code*, turning "* [[Link]]" lines into links."""
    nodes = code.nodes
    converted = []
    iterator = iter(enumerate(nodes))
    for i, node in iterator:
        # Pattern: * [[Wikilink]]\n
        is_link_line = (
            i >= 2
            and i + 1 < len(nodes)
            and isinstance(node, mwp.nodes.wikilink.Wikilink)
            and str(nodes[i - 1]) == " "
            and str(nodes[i - 2]) == "*"
            # Links can have a plural s after them
            and re.match(r"s?\n", str(nodes[i + 1]))
        )
        if is_link_line:
            # Drop the already converted "*" and " " in front of the link ...
            converted.pop()
            converted.pop()
            # ... and consume the node after it so it ends the link line.
            _, after = next(iterator)
            converted.append("=> {} {}{}".format(
                page_url(str(node.title)),
                _convert(node),
                _convert(after),
            ))
            continue

        # Default: Just convert the node
        converted.append(_convert(node))
    return "".join(converted)


def _convert_table(obj: mwp.nodes.tag.Tag) -> str:
    """Render a table tag as a preformatted Gemtext block via tabulate."""
    rows = []
    # tabulate's default for "no headers" is an empty sequence, not None.
    header = ()
    for row in obj.contents.nodes:
        if str(getattr(row, "tag", "")) != "tr":
            continue
        parsed = []
        row_is_header = False
        for cell in row.contents.nodes:
            if str(getattr(cell, "tag", "")) not in {"td", "th"}:
                continue
            # A single "th" cell makes the whole row the header row.
            if str(cell.tag) == "th":
                row_is_header = True
            parsed.append(_convert(cell.contents).strip())
        if row_is_header:
            header = parsed
        else:
            rows.append(parsed)
    return "\n```\n" + tabulate(rows, header, tablefmt=TABLEFMT) + "\n```\n"


def _convert(obj: Union[mwp.nodes.Node, mwp.wikicode.Wikicode]) -> str:
    """Recursively convert a node or a whole Wikicode object to raw Gemtext.

    Ordered-list items are emitted with ``_NUM_PLACEHOLDER`` in front of
    them; ``_postprocess`` replaces the placeholder with the item number.
    """
    # This does the actual conversion
    if isinstance(obj, mwp.wikicode.Wikicode):
        return _convert_nodes(obj)

    if isinstance(obj, mwp.nodes.heading.Heading):
        # Gemtext only knows three heading levels.
        return "{} {}\n".format(
            "#" * min(obj.level, 3), obj.title.strip_code()
        )

    if isinstance(obj, mwp.nodes.tag.Tag):
        # Most tags are handled just fine and can be delegated to strip_code
        # (inline text styles), however we can do a bit better for list tags.
        if str(obj.wiki_markup) == "*":
            return "*"
        if str(obj.wiki_markup) == "#":
            return "{} {}".format(_NUM_PLACEHOLDER, _convert(obj.contents))
        if str(obj.tag) == "ref":
            return ""
        if str(obj.tag) == "table":
            return _convert_table(obj)
        return _strip(obj)

    if isinstance(obj, mwp.nodes.template.Template):
        # Most templates are handled fine (and completely stripped), however,
        # some of them are useful and provide some output that we should mimic
        # (for example, the convert template).
        name = str(obj.name).strip().lower()
        if name in {"cvt", "convert"}:
            return render_convert(obj)
        if name == "lang" and len(obj.params) > 1:
            # The first parameter is skipped, only the second is rendered.
            return _convert(obj.params[1].value)
        if name in {"reign", "r.", "ruled", "rexit"}:
            return render_reign(obj)
        if name in {"cite book", "cite journal", "cite news"}:
            return render_cite_book(obj)
        return _strip(obj)

    if isinstance(obj, mwp.nodes.wikilink.Wikilink):
        title = str(obj.title)
        if title.startswith("File:") or str(obj.text).startswith("thumb|"):
            return ""
        if title.startswith("Category:"):
            return ""
        return _strip(obj)

    return _strip(obj)


def _postprocess(gemtext: str) -> str:
    """Clean up converted text and number the ordered-list items.

    Removes leftover ``[[File:...]]`` lines, collapses runs of empty lines
    and replaces each ``_NUM_PLACEHOLDER`` at the start of a line with a
    counter that restarts at 1 after every line that is not a list item.
    """
    # Strip out any more thumbs that have been left.
    # This happens because the wikilinks are nested in each other, which the
    # parser would only notice after doing the first replacement. We'll just
    # take the easy way out here and use a regex to get rid of them.
    gemtext = re.sub(r"^\[\[File:.*?\]\]$", "", gemtext, flags=re.MULTILINE)

    # Collapse too many empty lines
    while "\n\n\n" in gemtext:
        gemtext = gemtext.replace("\n\n\n", "\n\n")

    # Shortcut to avoid unnecessary splitting
    if _NUM_PLACEHOLDER not in gemtext:
        return gemtext

    lines = gemtext.split("\n")
    counter = 1
    for idx, line in enumerate(lines):
        if line.startswith(_NUM_PLACEHOLDER):
            lines[idx] = line.replace(_NUM_PLACEHOLDER, str(counter), 1)
            counter += 1
        else:
            counter = 1
    return "\n".join(lines)


def wikicode_to_gemtext(
        obj: Union[mwp.nodes.Node, mwp.wikicode.Wikicode]
    ) -> str:
    """Try to turn the given object into a sensible Gemtext representation.

    Note that wikicode is much more powerful than Gemtext, so this is a lossy
    function. The returned Gemtext tries to mimic the content of the Wikicode
    as much as possible (for human consumption).

    This function mostly mimics
    `~mwparserfromhell.wikicode.Wikicode.strip_code`, with some addition to
    better handle things that *can* be represented by Gemtext.

    Args:
        obj: The object to convert.

    Returns:
        The converted Gemtext.
    """
    return _postprocess(_convert(obj))