summaryrefslogtreecommitdiff
path: root/wikimini.py
diff options
context:
space:
mode:
author Daniel Schadt <kingdread@gmx.de> 2021-08-16 15:16:22 +0200
committer Daniel Schadt <kingdread@gmx.de> 2021-08-16 15:16:22 +0200
commit e846cbd2b18e43bdd69234930150f0cc97be984d (patch)
tree 8f857fe9e08ca5675eee7d3a7f7adadce279c225 /wikimini.py
parent b688a103d5266ebcbccc9d23a334af02102dffc7 (diff)
download wikimini-e846cbd2b18e43bdd69234930150f0cc97be984d.tar.gz
wikimini-e846cbd2b18e43bdd69234930150f0cc97be984d.tar.bz2
wikimini-e846cbd2b18e43bdd69234930150f0cc97be984d.zip
Add a setup.py
Diffstat (limited to 'wikimini.py')
-rw-r--r-- wikimini.py 222
1 files changed, 0 insertions, 222 deletions
diff --git a/wikimini.py b/wikimini.py
deleted file mode 100644
index 032639a..0000000
--- a/wikimini.py
+++ /dev/null
@@ -1,222 +0,0 @@
-import mwparserfromhell as mwp
-import requests
-import re
-
-from tabulate import tabulate
-
-from typing import Union
-
-
# Endpoint that `retrieve` sends its query to when fetching page content.
API_URL = "https://en.wikipedia.org/w/api.php"

# Table format name handed to tabulate when a wiki table is rendered; the
# available formats are listed at https://pypi.org/project/tabulate/.
TABLEFMT = "fancy_grid"
-
-
def page_url(title: str) -> str:
    """Turn a page title into the underscore form used as a link target.

    The result is placed in front of the label on a ``=> target label`` line,
    where whitespace separates the two parts, so the target must not contain
    any whitespace itself: leading and trailing whitespace is dropped and
    every inner run of whitespace becomes a single underscore.

    Args:
        title: The page title, e.g. ``"Albert Einstein"``.

    Returns:
        The title with whitespace replaced, e.g. ``"Albert_Einstein"``.
    """
    return "_".join(title.split())
-
-
def retrieve(title: str) -> mwp.wikicode.Wikicode:
    """Fetch the wikitext of a page from ``API_URL`` and parse it.

    Args:
        title: Title of the page to fetch.

    Returns:
        The parsed main-slot content of the single revision that the API
        returns for the page (the query asks for ``rvlimit=1``).

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
        KeyError: If the JSON answer lacks the expected
            query/pages/revisions structure.
        IndexError: If the answer contains no page or no revision entry.
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "content",
        "rvslots": "main",
        "rvlimit": 1,
        "titles": title,
        "format": "json",
        "formatversion": "2",
    }
    headers = {"User-Agent": "Wikimini/1.0"}
    # Without an explicit timeout a stalled connection blocks forever.
    response = requests.get(
        API_URL,
        headers=headers,
        params=params,
        timeout=30,
    )
    # Surface HTTP errors directly instead of failing later while trying to
    # decode an error page as JSON.
    response.raise_for_status()
    payload = response.json()
    revision = payload["query"]["pages"][0]["revisions"][0]
    text = revision["slots"]["main"]["content"]
    return mwp.parse(text)
-
-
def render_convert(obj: mwp.nodes.template.Template) -> str:
    """Renders the {{convert|...}} template.

    Two shapes are recognised:

    * ``{{convert|<value>|<unit>}}`` becomes ``<value><unit>``.
    * ``{{convert|<a>|to|<b>|<unit>}}`` (or ``-`` instead of ``to``) becomes
      ``<a><unit> to <b><unit>``.

    Templates with too few parameters for either shape are rendered from
    whatever parameters are present instead of raising ``IndexError``.

    Args:
        obj: The template node to render.

    Returns:
        The plain-text rendering of the template.
    """
    values = [param.value.strip_code() for param in obj.params]
    if len(values) < 2:
        # No unit to attach: emit the bare value, or nothing at all.
        return "".join(values)
    if str(obj.params[1]) in {"-", "to"}:
        if len(values) >= 4:
            first, joiner, second, unit = values[:4]
            return f"{first}{unit} {joiner} {second}{unit}"
        # Range without a unit: keep what is there rather than failing.
        return " ".join(values)
    return "{}{}".format(values[0], values[1])
-
-
def render_reign(obj: mwp.nodes.template.Template) -> str:
    """Renders the {{reign|...}} template.

    Args:
        obj: The template node to render.

    Returns:
        ``"r. "`` when the template has no parameters, otherwise
        ``"r. <start> – <end>"``. A blank start is shown as ``?`` and a
        missing end is left empty.
    """
    params = obj.params
    if len(params) == 0:
        return "r. "
    # Only the first two parameters take part in the output.
    cleaned = [param.value.strip_code().strip() for param in params[:2]]
    start = cleaned[0] if cleaned[0] else "?"
    end = cleaned[1] if len(cleaned) > 1 else ""
    return "r. {} – {}".format(start, end)
-
-
def render_cite_book(obj: mwp.nodes.template.Template) -> str:
    """Renders the {{cite book|...}} template.

    The output is ``<title> (<name>; <name>; ...)`` where each name is
    ``<last>, <first>`` (or whichever of the two is present). Names are read
    from ``last``/``first``, ``last1``..``last5``/``first1``..``first5`` and
    ``editor1-last``/``editor1-first``.

    A missing or blank title is shown as ``Untitled``, and the parenthesised
    part is omitted when no name is found, so the output never ends in an
    empty ``()``.

    Args:
        obj: The template node to render.

    Returns:
        The plain-text rendering of the citation.
    """

    def field(name):
        # Stripped text of the named parameter, or None if it is absent.
        param = obj.get(name, None)
        return param.value.strip_code().strip() if param else None

    title = field("title") or "Untitled"
    names = []
    for idx in ["%", "%1", "%2", "%3", "%4", "%5", "editor1-%"]:
        last = field(idx.replace("%", "last"))
        first = field(idx.replace("%", "first"))
        parts = [part for part in (last, first) if part]
        if parts:
            names.append(", ".join(parts))
    if not names:
        return title
    return "{} ({})".format(title, "; ".join(names))
-
-
def _strip_node(node: mwp.nodes.Node) -> str:
    """Fallback conversion: wrap the node and let strip_code reduce it."""
    return mwp.wikicode.Wikicode([node]).strip_code(collapse=False)


def _convert_table(table: mwp.nodes.tag.Tag) -> str:
    """Render a table tag as a preformatted block laid out by tabulate."""
    rows = []
    # "No header row" must be an empty sequence (tabulate's own default for
    # the headers argument); None is not a supported value there.
    header = ()
    for row in table.contents.nodes:
        if str(getattr(row, "tag", "")) != "tr":
            continue
        cells = []
        row_is_header = False
        for cell in row.contents.nodes:
            kind = str(getattr(cell, "tag", ""))
            if kind not in {"td", "th"}:
                continue
            if kind == "th":
                row_is_header = True
            cells.append(_convert(cell.contents).strip())
        if row_is_header:
            header = cells
        else:
            rows.append(cells)
    return "\n```\n" + tabulate(rows, header, tablefmt=TABLEFMT) + "\n```\n"


def _convert(obj: Union[mwp.nodes.Node, mwp.wikicode.Wikicode]) -> str:
    """Convert a node or a whole Wikicode object to (unpolished) Gemtext.

    Dispatches on the type of ``obj``:

    * Wikicode: every node is converted and the results are concatenated.
      The sequence ``*``, ``" "``, wikilink, optional ``s`` + newline is
      turned into a ``=> target label`` link line.
    * Heading: ``#`` repeated ``level`` times (at most three) plus the title.
    * Tag: list markers, ``ref`` and ``table`` get special treatment, other
      tags go through the fallback.
    * Template: a few known templates are rendered, the rest go through the
      fallback.
    * Wikilink: file, thumbnail and category links are dropped.
    * Anything else: the fallback, i.e. ``strip_code`` on the single node.

    Numbered list items are emitted with a ``<!NUM!>`` placeholder that is
    expected to be replaced by a later pass.

    Args:
        obj: The node or Wikicode object to convert.

    Returns:
        The converted text.
    """
    if isinstance(obj, mwp.wikicode.Wikicode):
        nodes = obj.nodes
        converted = []
        # An explicit iterator lets the link pattern consume the node that
        # follows the link as well.
        iterator = iter(enumerate(nodes))
        for i, node in iterator:
            # Pattern: * [[Wikilink]]\n
            if (
                i >= 2
                and i + 1 < len(nodes)
                # Links can have a plural s after them
                and re.match("s?\n", str(nodes[i + 1]))
                and isinstance(node, mwp.nodes.wikilink.Wikilink)
                and str(nodes[i - 1]) == " "
                and str(nodes[i - 2]) == "*"
            ):
                # Drop the already converted "*" and " " entries; the link
                # line replaces them.
                converted.pop()
                converted.pop()
                _, after = next(iterator)
                converted.append("=> {} {}{}".format(
                    page_url(str(node.title)),
                    _convert(node),
                    _convert(after),
                ))
                continue

            # Default: Just convert the node
            converted.append(_convert(node))
        return "".join(converted)

    if isinstance(obj, mwp.nodes.heading.Heading):
        return "{} {}\n".format(
            "#" * min(obj.level, 3), obj.title.strip_code()
        )

    if isinstance(obj, mwp.nodes.tag.Tag):
        # Most tags are handled just fine and can be delegated to strip_code
        # (inline text styles), however we can do a bit better for list tags.
        if str(obj.wiki_markup) == "*":
            return "*"
        if str(obj.wiki_markup) == "#":
            return "<!NUM!> {}".format(_convert(obj.contents))
        if str(obj.tag) == "ref":
            return ""
        if str(obj.tag) == "table":
            return _convert_table(obj)
        return _strip_node(obj)

    if isinstance(obj, mwp.nodes.template.Template):
        # Most templates are handled fine (and completely stripped), however,
        # some of them are useful and provide some output that we should mimic
        # (for example, the convert template).
        name = str(obj.name).strip().lower()
        if name in {"cvt", "convert"}:
            return render_convert(obj)
        if name == "lang":
            # The text is the second parameter; without it there is nothing
            # to show, so fall back instead of raising IndexError.
            if len(obj.params) < 2:
                return _strip_node(obj)
            return _convert(obj.params[1].value)
        if name in {"reign", "r.", "ruled", "rexit"}:
            return render_reign(obj)
        if name in {"cite book", "cite journal", "cite news"}:
            return render_cite_book(obj)
        return _strip_node(obj)

    if isinstance(obj, mwp.nodes.wikilink.Wikilink):
        title = str(obj.title)
        if title.startswith("File:") or str(obj.text).startswith("thumb|"):
            return ""
        if title.startswith("Category:"):
            return ""
        return _strip_node(obj)

    return _strip_node(obj)
-
-def _postprocess(gemtext: str) -> str:
- # Strip out any more thumbs that have been left.
- # This happens because the wikilinks are nested in each other, which the
- # parser would only notice after doing the first replacement. We'll just
- # take the easy way out here and use a regex to get rid of them.
- gemtext = re.sub("^\\[\\[File:.*?\\]\\]$", "", gemtext, flags=re.MULTILINE)
-
- # Collapse too many empty lines
- while "\n\n\n" in gemtext:
- gemtext = gemtext.replace("\n\n\n", "\n\n")
-
- # Shortcut to avoid unnecessary splitting
- if "<!NUM!>" not in gemtext:
- return gemtext
-
- lines = gemtext.split("\n")
- counter = 1
- for idx in range(len(lines)):
- line = lines[idx]
- if line.startswith("<!NUM!>"):
- line = line.replace("<!NUM!>", str(counter), 1)
- lines[idx] = line
- counter += 1
- else:
- counter = 1
- return "\n".join(lines)
-
-
def wikicode_to_gemtext(
    obj: Union[mwp.nodes.Node, mwp.wikicode.Wikicode]
) -> str:
    """Produce a readable Gemtext approximation of the given wikicode.

    Gemtext cannot express everything wikicode can, so the conversion loses
    information; the aim is output that a human can read comfortably.

    The behaviour largely follows
    `~mwparserfromhell.wikicode.Wikicode.strip_code`, extended so that the
    constructs Gemtext *does* support are kept instead of being stripped.

    Args:
        obj: The object to convert.

    Returns:
        The converted Gemtext.
    """
    # First the structural conversion, then the textual clean-up pass.
    raw_gemtext = _convert(obj)
    return _postprocess(raw_gemtext)