import re
from typing import Union

import mwparserfromhell as mwp
import requests
from tabulate import tabulate

API_URL = "https://en.wikipedia.org/w/api.php"
# See https://pypi.org/project/tabulate/ for a reference of table formats.
TABLEFMT = "fancy_grid"

# Placeholder that _convert emits in front of every ordered-list ("#") item
# and that _postprocess later swaps for the running item number. It is
# written as an escape sequence on purpose: a literal control character in
# the source is invisible and easily lost by editors and copy/paste, which
# would silently turn the marker into the empty string.
_ORDERED_ITEM_MARKER = "\x00"

# Seconds to wait for the Wikipedia API before giving up.
_REQUEST_TIMEOUT = 30


def page_url(title):
    """Return ``title`` with every space replaced by an underscore.

    The result is used as the link target of generated Gemtext link lines.
    """
    return title.replace(" ", "_")


def retrieve(title: str) -> mwp.wikicode.Wikicode:
    """Fetch the latest revision of a page and parse it as wikicode.

    Args:
        title: The title of the page to fetch.

    Returns:
        The parsed content of the main slot of the newest revision.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
        KeyError, IndexError: If the JSON answer does not have the expected
            ``query -> pages -> revisions -> slots`` structure (presumably
            the case for a page that does not exist).
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "content",
        "rvslots": "main",
        "rvlimit": 1,
        "titles": title,
        "format": "json",
        "formatversion": "2",
    }
    headers = {"User-Agent": "Wikimini/1.0"}
    # Without an explicit timeout, requests waits forever on a stalled
    # connection.
    req = requests.get(
        API_URL, headers=headers, params=params, timeout=_REQUEST_TIMEOUT
    )
    req.raise_for_status()
    res = req.json()
    revision = res["query"]["pages"][0]["revisions"][0]
    text = revision["slots"]["main"]["content"]
    return mwp.parse(text)


def render_convert(obj: mwp.nodes.template.Template) -> str:
    """Renders the {{convert|...}} template.

    A range such as ``{{convert|5|to|10|km}}`` becomes ``5km to 10km``, a
    single value such as ``{{convert|5|km}}`` becomes ``5km``. Templates
    with fewer than two parameters are rendered as the concatenation of
    whatever parameters they do have instead of raising ``IndexError``.
    """
    params = obj.params
    if len(params) < 2:
        return "".join(param.value.strip_code() for param in params)
    # The range form needs four parameters: low, separator, high, unit.
    if len(params) >= 4 and str(params[1]) in {"-", "to"}:
        return "{0}{3} {1} {2}{3}".format(
            params[0].value.strip_code(),
            params[1].value.strip_code(),
            params[2].value.strip_code(),
            params[3].value.strip_code(),
        )
    return "{}{}".format(
        params[0].value.strip_code(),
        params[1].value.strip_code(),
    )


def render_reign(obj: mwp.nodes.template.Template) -> str:
    """Renders the {{reign|...}} template.

    Without parameters only the ``r. `` prefix is returned. Otherwise the
    first parameter (``?`` if it is empty) and the optional second parameter
    are joined by an en dash.
    """
    if not obj.params:
        return "r. "
    first = obj.params[0].value.strip_code().strip() or "?"
    second = ""
    if len(obj.params) > 1:
        second = obj.params[1].value.strip_code().strip()
    return f"r. {first} – {second}"


def _param_text(obj: mwp.nodes.template.Template, name: str):
    """Return the stripped text of parameter ``name``, or the falsy lookup
    result if the template has no such parameter."""
    param = obj.get(name, None)
    if param:
        return param.value.strip_code().strip()
    return param


def render_cite_book(obj: mwp.nodes.template.Template) -> str:
    """Renders the {{cite book|...}} template.

    The output has the form ``Title (Last, First; Last, First)``. A missing
    title is rendered as ``Untitled``. Names are collected from the
    ``last``/``first`` parameters, their numbered variants 1 to 5 and
    ``editor1-last``/``editor1-first``.
    """
    title = obj.get("title", None)
    if title:
        title = title.value.strip_code().strip()
    else:
        title = "Untitled"

    names = []
    # "%" is substituted by "last" and "first" to build the parameter names.
    for pattern in ["%", "%1", "%2", "%3", "%4", "%5", "editor1-%"]:
        last = _param_text(obj, pattern.replace("%", "last"))
        first = _param_text(obj, pattern.replace("%", "first"))
        if last and first:
            names.append(f"{last}, {first}")
        elif last:
            names.append(last)
        elif first:
            names.append(first)
    return "{} ({})".format(title, "; ".join(names))


def _strip_node(node: mwp.nodes.Node) -> str:
    """Fallback conversion: let mwparserfromhell strip the markup of a
    single node without collapsing whitespace."""
    return mwp.wikicode.Wikicode([node]).strip_code(collapse=False)


def _render_table(obj: mwp.nodes.tag.Tag) -> str:
    """Render a table tag as a preformatted Gemtext block.

    Only ``tr`` children and their ``td``/``th`` cells are considered. A row
    that contains at least one ``th`` cell is used as the header (the last
    such row wins); every other row becomes a data row.
    """
    rows = []
    header = None
    for row in obj.contents.nodes:
        if str(getattr(row, "tag", "")) != "tr":
            continue
        parsed = []
        row_is_header = False
        for cell in row.contents.nodes:
            if str(getattr(cell, "tag", "")) not in {"td", "th"}:
                continue
            if str(cell.tag) == "th":
                row_is_header = True
            parsed.append(_convert(cell.contents).strip())
        if row_is_header:
            header = parsed
        else:
            rows.append(parsed)
    # tabulate expects a sequence of headers; () is its documented default
    # and is used when the table has no header row.
    table = tabulate(rows, header or (), tablefmt=TABLEFMT)
    return "\n```\n" + table + "\n```\n"


def _convert(obj: Union[mwp.nodes.Node, mwp.wikicode.Wikicode]) -> str:
    """Recursively convert a node or a whole wikicode object to raw Gemtext.

    The result may still contain ordered-list placeholders and leftover
    file links; ``_postprocess`` cleans those up.
    """
    # This does the actual conversion
    if isinstance(obj, mwp.wikicode.Wikicode):
        nodes = obj.nodes
        converted = []
        iterator = iter(enumerate(nodes))
        for i, node in iterator:
            # Pattern: * [[Wikilink]]\n
            # A list item that consists solely of a link is turned into a
            # Gemtext link line.
            if (
                i >= 2
                and i + 1 < len(nodes)
                # Links can have a plural s after them
                and re.match("s?\n", str(nodes[i + 1]))
                and isinstance(node, mwp.nodes.wikilink.Wikilink)
                and str(nodes[i - 1]) == " "
                and str(nodes[i - 2]) == "*"
            ):
                # Drop the already converted "*" and " " in front of the link.
                converted.pop()
                converted.pop()
                # Consume the node after the link so it is not converted twice.
                _, after = next(iterator)
                converted.append("=> {} {}{}".format(
                    page_url(str(node.title)),
                    _convert(node),
                    _convert(after),
                ))
                continue
            # Default: Just convert the node
            converted.append(_convert(node))
        return "".join(converted)

    if isinstance(obj, mwp.nodes.heading.Heading):
        # Gemtext only knows three heading levels.
        return "{} {}\n".format("#" * min(obj.level, 3), obj.title.strip_code())

    if isinstance(obj, mwp.nodes.tag.Tag):
        # Most tags are handled just fine and can be delegated to strip_code
        # (inline text styles), however we can do a bit better for list tags.
        if str(obj.wiki_markup) == "*":
            return "*"
        if str(obj.wiki_markup) == "#":
            # The marker is replaced by the item number in _postprocess.
            return "{} {}".format(_ORDERED_ITEM_MARKER, _convert(obj.contents))
        if str(obj.tag) == "ref":
            return ""
        if str(obj.tag) == "table":
            return _render_table(obj)
        return _strip_node(obj)

    if isinstance(obj, mwp.nodes.template.Template):
        # Most templates are handled fine (and completely stripped), however,
        # some of them are useful and provide some output that we should mimic
        # (for example, the convert template).
        name = str(obj.name).strip().lower()
        if name in {"cvt", "convert"}:
            return render_convert(obj)
        # {{lang|code|text}}: only the text is of interest. A template that
        # lacks the text parameter falls through to the default stripping.
        if name == "lang" and len(obj.params) > 1:
            return _convert(obj.params[1].value)
        if name in {"reign", "r.", "ruled", "rexit"}:
            return render_reign(obj)
        if name in {"cite book", "cite journal", "cite news"}:
            return render_cite_book(obj)
        return _strip_node(obj)

    if isinstance(obj, mwp.nodes.wikilink.Wikilink):
        title = str(obj.title)
        # Images, thumbnails and category links produce no output.
        if title.startswith(("File:", "Category:")):
            return ""
        if str(obj.text).startswith("thumb|"):
            return ""
        return _strip_node(obj)

    return _strip_node(obj)


def _postprocess(gemtext: str) -> str:
    """Clean up the raw output of ``_convert``.

    Removes leftover file links, collapses runs of empty lines and numbers
    the items of ordered lists. Numbering restarts at 1 after every line
    that is not an ordered-list item.
    """
    # Strip out any more thumbs that have been left.
    # This happens because the wikilinks are nested in each other, which the
    # parser would only notice after doing the first replacement. We'll just
    # take the easy way out here and use a regex to get rid of them.
    gemtext = re.sub(r"^\[\[File:.*?\]\]$", "", gemtext, flags=re.MULTILINE)
    # Collapse too many empty lines
    gemtext = re.sub(r"\n{3,}", "\n\n", gemtext)
    # Shortcut to avoid unnecessary splitting
    if _ORDERED_ITEM_MARKER not in gemtext:
        return gemtext

    lines = gemtext.split("\n")
    counter = 1
    for idx, line in enumerate(lines):
        if line.startswith(_ORDERED_ITEM_MARKER):
            lines[idx] = line.replace(_ORDERED_ITEM_MARKER, str(counter), 1)
            counter += 1
        else:
            counter = 1
    return "\n".join(lines)


def wikicode_to_gemtext(
    obj: Union[mwp.nodes.Node, mwp.wikicode.Wikicode]
) -> str:
    """Try to turn the given object into a sensible Gemtext representation.

    Note that wikicode is much more powerful than Gemtext, so this is a lossy
    function. The returned Gemtext tries to mimic the content of the Wikicode
    as much as possible (for human consumption).

    This function mostly mimics
    `~mwparserfromhell.wikicode.Wikicode.strip_code`, with some addition to
    better handle things that *can* be represented by Gemtext.

    Args:
        obj: The object to convert.

    Returns:
        The converted Gemtext.
    """
    return _postprocess(_convert(obj))