diff options
author | Daniel Schadt <kingdread@gmx.de> | 2021-08-16 15:16:22 +0200 |
---|---|---|
committer | Daniel Schadt <kingdread@gmx.de> | 2021-08-16 15:16:22 +0200 |
commit | e846cbd2b18e43bdd69234930150f0cc97be984d (patch) | |
tree | 8f857fe9e08ca5675eee7d3a7f7adadce279c225 /wikimini.py | |
parent | b688a103d5266ebcbccc9d23a334af02102dffc7 (diff) | |
download | wikimini-e846cbd2b18e43bdd69234930150f0cc97be984d.tar.gz wikimini-e846cbd2b18e43bdd69234930150f0cc97be984d.tar.bz2 wikimini-e846cbd2b18e43bdd69234930150f0cc97be984d.zip |
Add a setup.py
Diffstat (limited to 'wikimini.py')
-rw-r--r-- | wikimini.py | 222 |
1 files changed, 0 insertions, 222 deletions
# wikimini.py (blob 032639a): the 222-line module deleted by this commit,
# written out as plain source instead of "-"-prefixed diff lines.
import re
from typing import Union

import mwparserfromhell as mwp
import requests
from tabulate import tabulate


API_URL = "https://en.wikipedia.org/w/api.php"

# See https://pypi.org/project/tabulate/ for a reference of table formats.
TABLEFMT = "fancy_grid"


def page_url(title: str) -> str:
    """Return the link target for a page title (spaces become underscores)."""
    return title.replace(" ", "_")


def retrieve(title: str, timeout: float = 30.0) -> mwp.wikicode.Wikicode:
    """Fetch the wikitext of a page from ``API_URL`` and parse it.

    Args:
        title: Title of the page to query.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The parsed content of the "main" slot of the single revision that is
        requested (``rvlimit=1``).

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
        KeyError: If the response carries no revision for ``title``.
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "content",
        "rvslots": "main",
        "rvlimit": 1,
        "titles": title,
        "format": "json",
        "formatversion": "2",
    }
    headers = {"User-Agent": "Wikimini/1.0"}
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    req = requests.get(
        API_URL, headers=headers, params=params, timeout=timeout
    )
    req.raise_for_status()
    page = req.json()["query"]["pages"][0]
    if "revisions" not in page:
        # Same exception type as the bare dictionary lookup would raise, but
        # with a message that names the page.
        raise KeyError(f"no revisions returned for page {title!r}")
    text = page["revisions"][0]["slots"]["main"]["content"]
    return mwp.parse(text)


def render_convert(obj: mwp.nodes.template.Template) -> str:
    """Renders the {{convert|...}} template.

    ``{{convert|A|-|B|unit}}`` and ``{{convert|A|to|B|unit}}`` are rendered as
    ``"A<unit> - B<unit>"`` / ``"A<unit> to B<unit>"``; every other call is
    rendered as its first two parameters glued together. Templates with
    fewer parameters than their form needs are rendered from whatever is
    present instead of raising ``IndexError``.
    """
    values = [param.value.strip_code() for param in obj.params]
    is_range = len(values) > 1 and str(obj.params[1]).strip() in {"-", "to"}
    if is_range and len(values) >= 4:
        return "{0}{3} {1} {2}{3}".format(*values[:4])
    if is_range:
        # Incomplete range (no unit and/or no upper bound): keep what we have.
        return " ".join(values)
    return "".join(values[:2])


def render_reign(obj: mwp.nodes.template.Template) -> str:
    """Renders the {{reign|...}} template.

    Produces ``"r. <first> – <second>"``. A missing or empty first parameter
    is shown as ``"?"``, a missing second parameter as the empty string, and
    a template without any parameters as just ``"r. "``.
    """
    if not obj.params:
        return "r. "
    first = obj.params[0].value.strip_code().strip() or "?"
    second = ""
    if len(obj.params) > 1:
        second = obj.params[1].value.strip_code().strip()
    return f"r. {first} – {second}"


def _param_text(obj: mwp.nodes.template.Template, name: str) -> str:
    """Return the stripped text of parameter ``name``, or "" if unset."""
    param = obj.get(name, None)
    if not param:
        return ""
    return param.value.strip_code().strip()


def render_cite_book(obj: mwp.nodes.template.Template) -> str:
    """Renders the {{cite book|...}} template.

    The result is ``"<title> (<name>; <name>; ...)"`` where each name is
    ``"last, first"`` (or whichever of the two is present). The names are
    read from ``last``/``first``, ``last1``..``last5``/``first1``..``first5``
    and ``editor1-last``/``editor1-first``. A missing or blank title becomes
    ``"Untitled"``; when no name is found the parentheses are left out.
    """
    title = _param_text(obj, "title") or "Untitled"
    names = []
    for pattern in ["%", "%1", "%2", "%3", "%4", "%5", "editor1-%"]:
        last = _param_text(obj, pattern.replace("%", "last"))
        first = _param_text(obj, pattern.replace("%", "first"))
        if last and first:
            names.append(f"{last}, {first}")
        elif last or first:
            names.append(last or first)
    if not names:
        return title
    return "{} ({})".format(title, "; ".join(names))


def _default(obj: mwp.nodes.Node) -> str:
    """Fall back to mwparserfromhell's own ``strip_code`` for a single node."""
    return mwp.wikicode.Wikicode([obj]).strip_code(collapse=False)


def _is_link_list_item(nodes: list, i: int) -> bool:
    """Tell whether ``nodes[i]`` is the link in a ``* [[Wikilink]]\\n`` item."""
    return (
        i >= 2
        and i + 1 < len(nodes)
        and isinstance(nodes[i], mwp.nodes.wikilink.Wikilink)
        and str(nodes[i - 1]) == " "
        and str(nodes[i - 2]) == "*"
        # Links can have a plural s after them
        and re.match("s?\n", str(nodes[i + 1])) is not None
    )


def _convert_table(obj: mwp.nodes.tag.Tag) -> str:
    """Render a table tag as a preformatted block drawn by ``tabulate``.

    The first row that consists only of ``th`` cells becomes the header.
    Every other row, including rows that merely start with a ``th`` row
    label, is kept as a data row so that no content is dropped.
    """
    # tabulate iterates over the headers, so "no header" must be an empty
    # sequence rather than None.
    header = ()
    rows = []
    for row in obj.contents.nodes:
        if str(getattr(row, "tag", "")) != "tr":
            continue
        cells = [
            node for node in row.contents.nodes
            if str(getattr(node, "tag", "")) in {"td", "th"}
        ]
        parsed = [_convert(cell.contents).strip() for cell in cells]
        only_th = bool(cells) and all(str(cell.tag) == "th" for cell in cells)
        if only_th and not header:
            header = parsed
        else:
            rows.append(parsed)
    return "\n```\n" + tabulate(rows, header, tablefmt=TABLEFMT) + "\n```\n"


def _convert(obj: Union[mwp.nodes.Node, mwp.wikicode.Wikicode]) -> str:
    """Recursively convert a node or a whole Wikicode object to raw Gemtext.

    The output may still contain ``<!NUM!>`` placeholders for numbered list
    items; ``_postprocess`` replaces them with running numbers.
    """
    # This does the actual conversion
    if isinstance(obj, mwp.wikicode.Wikicode):
        converted = []
        iterator = enumerate(obj.nodes)
        for i, node in iterator:
            # Pattern: * [[Wikilink]]\n
            if _is_link_list_item(obj.nodes, i):
                # Drop the already converted "*" and " " in front of the
                # link, and consume the text node after it so that the whole
                # list item turns into a single Gemtext link line.
                converted.pop()
                converted.pop()
                _, after = next(iterator)
                converted.append("=> {} {}{}".format(
                    page_url(str(node.title)),
                    _convert(node),
                    _convert(after),
                ))
                continue

            # Default: Just convert the node
            converted.append(_convert(node))
        return "".join(converted)

    if isinstance(obj, mwp.nodes.heading.Heading):
        # Heading levels deeper than 3 are rendered as level 3.
        return "{} {}\n".format(
            "#" * min(obj.level, 3), obj.title.strip_code()
        )

    if isinstance(obj, mwp.nodes.tag.Tag):
        # Most tags are handled just fine and can be delegated to strip_code
        # (inline text styles), however we can do a bit better for list tags.
        if str(obj.wiki_markup) == "*":
            return "*"
        if str(obj.wiki_markup) == "#":
            return "<!NUM!> {}".format(_convert(obj.contents))
        if str(obj.tag) == "ref":
            return ""
        if str(obj.tag) == "table":
            return _convert_table(obj)
        return _default(obj)

    if isinstance(obj, mwp.nodes.template.Template):
        # Most templates are handled fine (and completely stripped), however,
        # some of them are useful and provide some output that we should mimic
        # (for example, the convert template).
        name = str(obj.name).strip().lower()
        if name in {"cvt", "convert"}:
            return render_convert(obj)
        if name == "lang" and len(obj.params) > 1:
            # The second parameter is converted as the template's output; a
            # {{lang}} with fewer parameters falls through to the default.
            return _convert(obj.params[1].value)
        if name in {"reign", "r.", "ruled", "rexit"}:
            return render_reign(obj)
        if name in {"cite book", "cite journal", "cite news"}:
            return render_cite_book(obj)
        return _default(obj)

    if isinstance(obj, mwp.nodes.wikilink.Wikilink):
        title = str(obj.title)
        # Links to files, thumbnails and categories produce no output.
        if (title.startswith(("File:", "Category:"))
                or str(obj.text).startswith("thumb|")):
            return ""
        return _default(obj)

    return _default(obj)


def _postprocess(gemtext: str) -> str:
    """Clean up the raw output of ``_convert``.

    Removes leftover ``[[File:...]]`` lines, collapses runs of three or more
    newlines into two, and replaces the ``<!NUM!>`` placeholders at line
    starts with numbers that count up from 1 within each run of consecutive
    placeholder lines.
    """
    # Strip out any more thumbs that have been left.
    # This happens because the wikilinks are nested in each other, which the
    # parser would only notice after doing the first replacement. We'll just
    # take the easy way out here and use a regex to get rid of them.
    gemtext = re.sub(r"^\[\[File:.*?\]\]$", "", gemtext, flags=re.MULTILINE)

    # Collapse too many empty lines
    gemtext = re.sub(r"\n{3,}", "\n\n", gemtext)

    # Shortcut to avoid unnecessary splitting
    if "<!NUM!>" not in gemtext:
        return gemtext

    lines = gemtext.split("\n")
    counter = 1
    for idx, line in enumerate(lines):
        if line.startswith("<!NUM!>"):
            lines[idx] = line.replace("<!NUM!>", str(counter), 1)
            counter += 1
        else:
            # Any other line ends the current numbered list.
            counter = 1
    return "\n".join(lines)


def wikicode_to_gemtext(
        obj: Union[mwp.nodes.Node, mwp.wikicode.Wikicode]
        ) -> str:
    """Try to turn the given object into a sensible Gemtext representation.

    Note that wikicode is much more powerful than Gemtext, so this is a lossy
    function. The returned Gemtext tries to mimic the content of the Wikicode
    as much as possible (for human consumption).

    This function mostly mimics
    `~mwparserfromhell.wikicode.Wikicode.strip_code`, with some addition to
    better handle things that *can* be represented by Gemtext.

    Args:
        obj: The object to convert.

    Returns:
        The converted Gemtext.
    """
    return _postprocess(_convert(obj))