summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaniel Schadt <kingdread@gmx.de>2021-08-16 15:06:09 +0200
committerDaniel Schadt <kingdread@gmx.de>2021-08-16 15:06:09 +0200
commitb688a103d5266ebcbccc9d23a334af02102dffc7 (patch)
tree38c382d5df1e69339f0dada3ae174457c5d432be
downloadwikimini-b688a103d5266ebcbccc9d23a334af02102dffc7.tar.gz
wikimini-b688a103d5266ebcbccc9d23a334af02102dffc7.tar.bz2
wikimini-b688a103d5266ebcbccc9d23a334af02102dffc7.zip
Initial commit
This is not even a proper Python package yet, but the output is surprisingly good already, so I'd like to take this version and save it.
-rw-r--r--wikimini.py222
1 files changed, 222 insertions, 0 deletions
diff --git a/wikimini.py b/wikimini.py
new file mode 100644
index 0000000..032639a
--- /dev/null
+++ b/wikimini.py
@@ -0,0 +1,222 @@
+import mwparserfromhell as mwp
+import requests
+import re
+
+from tabulate import tabulate
+
+from typing import Union
+
+
# Endpoint of the MediaWiki Action API that `retrieve` sends its query to.
API_URL = "https://en.wikipedia.org/w/api.php"

# Name of the tabulate table format used when wiki tables are rendered.
# See https://pypi.org/project/tabulate/ for a reference of table formats.
TABLEFMT = "fancy_grid"
+
+
def page_url(title):
    """Return *title* with every space replaced by an underscore.

    The result is used as the link target when a wikilink is rendered as a
    Gemtext link line.
    """
    words = title.split(" ")
    return "_".join(words)
+
+
def retrieve(title: str, timeout: float = 30.0) -> mwp.wikicode.Wikicode:
    """Fetch the latest revision of a wiki page and parse it.

    Args:
        title: Title of the page to fetch.
        timeout: Seconds to wait for the API before giving up. Without a
            timeout, requests may block forever on an unresponsive server.

    Returns:
        The parsed wikicode of the page's main slot.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within *timeout*.
        KeyError: If the response contains no revision for the page (for
            example because the page does not exist).
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "content",
        "rvslots": "main",
        "rvlimit": 1,
        "titles": title,
        "format": "json",
        "formatversion": "2",
    }
    headers = {"User-Agent": "Wikimini/1.0"}
    response = requests.get(
        API_URL, headers=headers, params=params, timeout=timeout
    )
    # Fail loudly on HTTP errors instead of trying to decode an error page.
    response.raise_for_status()
    page = response.json()["query"]["pages"][0]
    if "revisions" not in page:
        # Still a KeyError (as before), but one that names the page.
        raise KeyError(f"no revisions returned for page {title!r}")
    text = page["revisions"][0]["slots"]["main"]["content"]
    return mwp.parse(text)
+
+
def render_convert(obj: mwp.nodes.template.Template) -> str:
    """Renders the {{convert|...}} template.

    Two shapes are recognised:

    * a range, ``{{convert|<low>|-|<high>|<unit>}}`` (separator ``-`` or
      ``to``), rendered as ``<low><unit> <sep> <high><unit>``;
    * a single value, ``{{convert|<value>|<unit>|...}}``, rendered as
      ``<value><unit>``.

    Templates with fewer parameters than a shape needs are rendered from
    whatever leading parameters are present instead of raising
    ``IndexError``; a template without parameters renders as ``""``.

    Args:
        obj: The template node to render.

    Returns:
        The plain-text rendering of the template.
    """
    params = obj.params
    # The range form needs all four of: low, separator, high, unit.
    if len(params) >= 4 and str(params[1]) in {"-", "to"}:
        low, separator, high, unit = (
            param.value.strip_code() for param in params[:4]
        )
        return "{0}{3} {1} {2}{3}".format(low, separator, high, unit)
    # Single value: the value directly followed by its unit (if given).
    return "".join(param.value.strip_code() for param in params[:2])
+
+
def render_reign(obj: mwp.nodes.template.Template) -> str:
    """Renders the {{reign|...}} template.

    The first two parameters are taken as the start and the end of the
    reign. A blank start is shown as ``?``; a missing end is left empty. A
    template without any parameters renders as just the ``r. `` prefix.
    """
    params = obj.params
    if not params:
        return "r. "
    # Only the first two parameters contribute to the output.
    texts = [param.value.strip_code().strip() for param in params[:2]]
    start = texts[0] if texts[0] else "?"
    end = texts[1] if len(texts) > 1 else ""
    return f"r. {start} – {end}"
+
+
def render_cite_book(obj: mwp.nodes.template.Template) -> str:
    """Renders the {{cite book|...}} template.

    The output has the form ``<title> (<name>; <name>; ...)``, where each
    name is ``<last>, <first>`` or whichever of the two parts is present.
    A template without a ``title`` parameter is rendered as ``Untitled``.
    """

    def text_of(key):
        # Stripped plain text of the parameter, or None when it is absent.
        param = obj.get(key, None)
        if not param:
            return None
        return param.value.strip_code().strip()

    title_param = obj.get("title", None)
    if title_param:
        title = title_param.value.strip_code().strip()
    else:
        title = "Untitled"

    # "%" is the placeholder for "last"/"first" in each parameter name.
    patterns = ["%", "%1", "%2", "%3", "%4", "%5", "editor1-%"]
    names = []
    for pattern in patterns:
        last = text_of(pattern.replace("%", "last"))
        first = text_of(pattern.replace("%", "first"))
        parts = [part for part in (last, first) if part]
        if parts:
            names.append(", ".join(parts))
    return "{} ({})".format(title, "; ".join(names))
+
+
def _strip_node(node: mwp.nodes.Node) -> str:
    """Return the plain-text rendering of a single node via ``strip_code``."""
    return mwp.wikicode.Wikicode([node]).strip_code(collapse=False)


def _convert_wikicode(code: mwp.wikicode.Wikicode) -> str:
    """Convert a node sequence, turning ``* [[Link]]`` lines into link lines."""
    nodes = code.nodes
    converted = []
    # A shared iterator lets the link branch consume the node after the link.
    iterator = iter(enumerate(nodes))
    for i, node in iterator:
        # Pattern: * [[Wikilink]]\n
        if (
            i >= 2
            and i + 1 < len(nodes)
            and isinstance(node, mwp.nodes.wikilink.Wikilink)
            # Links can have a plural s after them
            and re.match("s?\n", str(nodes[i + 1]))
            and str(nodes[i - 1]) == " "
            and str(nodes[i - 2]) == "*"
        ):
            # Drop the already converted "*" and " " in front of the link.
            converted.pop()
            converted.pop()
            _, after = next(iterator)
            converted.append("=> {} {}{}".format(
                page_url(str(node.title)),
                _convert(node),
                _convert(after),
            ))
            continue

        # Default: Just convert the node
        converted.append(_convert(node))
    return "".join(converted)


def _convert_table(table: mwp.nodes.tag.Tag) -> str:
    """Render a table tag as a preformatted block laid out by tabulate."""
    rows = []
    # Empty tuple is tabulate's own "no headers" default. The previous None
    # is not an iterable of header names, so header-less tables failed.
    header = ()
    for row in table.contents.nodes:
        if str(getattr(row, "tag", "")) != "tr":
            continue
        cells = []
        row_is_header = False
        for cell in row.contents.nodes:
            kind = str(getattr(cell, "tag", ""))
            if kind not in {"td", "th"}:
                continue
            if kind == "th":
                # A single <th> cell makes the whole row the header row.
                row_is_header = True
            cells.append(_convert(cell.contents).strip())
        if row_is_header:
            header = cells
        else:
            rows.append(cells)
    return "\n```\n" + tabulate(rows, header, tablefmt=TABLEFMT) + "\n```\n"


def _convert_tag(tag: mwp.nodes.tag.Tag) -> str:
    """Convert a tag node; list markers, refs and tables get special care."""
    # Most tags are handled just fine and can be delegated to strip_code
    # (inline text styles), however we can do a bit better for list tags.
    markup = str(tag.wiki_markup)
    if markup == "*":
        return "*"
    if markup == "#":
        # Placeholder that _postprocess later replaces with the item number.
        return "<!NUM!> {}".format(_convert(tag.contents))
    name = str(tag.tag)
    if name == "ref":
        return ""
    if name == "table":
        return _convert_table(tag)
    return _strip_node(tag)


def _convert_template(template: mwp.nodes.template.Template) -> str:
    """Convert a template node, rendering the few templates we understand."""
    # Most templates are handled fine (and completely stripped), however,
    # some of them are useful and provide some output that we should mimic
    # (for example, the convert template).
    name = str(template.name).strip().lower()
    if name in {"cvt", "convert"}:
        return render_convert(template)
    # {{lang|<code>|<text>}}: only the text is kept. A template lacking the
    # text parameter falls through to the default instead of raising
    # IndexError.
    if name == "lang" and len(template.params) > 1:
        return _convert(template.params[1].value)
    if name in {"reign", "r.", "ruled", "rexit"}:
        return render_reign(template)
    if name in {"cite book", "cite journal", "cite news"}:
        return render_cite_book(template)
    return _strip_node(template)


def _convert_wikilink(link: mwp.nodes.wikilink.Wikilink) -> str:
    """Convert a wikilink node, dropping file, thumb and category links."""
    title = str(link.title)
    if title.startswith(("File:", "Category:")):
        return ""
    if str(link.text).startswith("thumb|"):
        return ""
    return _strip_node(link)


def _convert(obj: Union[mwp.nodes.Node, mwp.wikicode.Wikicode]) -> str:
    """Convert a wikicode object or a single node to (unpolished) Gemtext.

    This does the actual conversion and dispatches on the type of *obj*;
    anything without special handling is rendered the way ``strip_code``
    renders it. Numbered list items are emitted with a ``<!NUM!>``
    placeholder, which ``_postprocess`` is expected to resolve afterwards.

    Args:
        obj: The wikicode or node to convert.

    Returns:
        The converted text.
    """
    if isinstance(obj, mwp.wikicode.Wikicode):
        return _convert_wikicode(obj)
    if isinstance(obj, mwp.nodes.heading.Heading):
        # Heading depth is capped at three "#" characters.
        return "{} {}\n".format("#" * min(obj.level, 3), obj.title.strip_code())
    if isinstance(obj, mwp.nodes.tag.Tag):
        return _convert_tag(obj)
    if isinstance(obj, mwp.nodes.template.Template):
        return _convert_template(obj)
    if isinstance(obj, mwp.nodes.wikilink.Wikilink):
        return _convert_wikilink(obj)
    return _strip_node(obj)
+
+def _postprocess(gemtext: str) -> str:
+ # Strip out any more thumbs that have been left.
+ # This happens because the wikilinks are nested in each other, which the
+ # parser would only notice after doing the first replacement. We'll just
+ # take the easy way out here and use a regex to get rid of them.
+ gemtext = re.sub("^\\[\\[File:.*?\\]\\]$", "", gemtext, flags=re.MULTILINE)
+
+ # Collapse too many empty lines
+ while "\n\n\n" in gemtext:
+ gemtext = gemtext.replace("\n\n\n", "\n\n")
+
+ # Shortcut to avoid unnecessary splitting
+ if "<!NUM!>" not in gemtext:
+ return gemtext
+
+ lines = gemtext.split("\n")
+ counter = 1
+ for idx in range(len(lines)):
+ line = lines[idx]
+ if line.startswith("<!NUM!>"):
+ line = line.replace("<!NUM!>", str(counter), 1)
+ lines[idx] = line
+ counter += 1
+ else:
+ counter = 1
+ return "\n".join(lines)
+
+
def wikicode_to_gemtext(
    obj: Union[mwp.nodes.Node, mwp.wikicode.Wikicode],
) -> str:
    """Turn the given object into a sensible Gemtext representation.

    Wikicode is far more expressive than Gemtext, so the conversion is
    lossy: the result aims to reproduce the content of the wikicode as
    closely as possible for a human reader.

    The behaviour largely follows
    `~mwparserfromhell.wikicode.Wikicode.strip_code`, with additions for
    constructs that Gemtext *can* represent.

    Args:
        obj: The object to convert.

    Returns:
        The converted Gemtext.
    """
    # Two stages: structural conversion first, then textual clean-up.
    raw = _convert(obj)
    return _postprocess(raw)