diff options
author | Daniel Schadt <kingdread@gmx.de> | 2021-08-17 01:42:45 +0200 |
---|---|---|
committer | Daniel Schadt <kingdread@gmx.de> | 2021-08-17 01:42:45 +0200 |
commit | 1543a13c880bf22037466109cbd77b2d0c9f21c3 (patch) | |
tree | 3a3033cff52e12d84e923ec7810b32abc6799067 | |
parent | 02f21a9ca9f9842502f8b89cf77aafc05082bcf1 (diff) | |
download | wikimini-1543a13c880bf22037466109cbd77b2d0c9f21c3.tar.gz wikimini-1543a13c880bf22037466109cbd77b2d0c9f21c3.tar.bz2 wikimini-1543a13c880bf22037466109cbd77b2d0c9f21c3.zip |
Reorganize code
Cramming everything into a single file is not necessarily good, so this
patch splits it up a bit. Furthermore, the templates are no longer
hardcoded, but managed through a registry.
This breaks the lang-ar implementation, which was a weird special case
anyway. A proper fix would be to support all language codes (lang-xx), not just Arabic.
-rw-r--r-- | wikimini/__init__.py | 500 | ||||
-rw-r--r-- | wikimini/templates/__init__.py | 58 | ||||
-rw-r--r-- | wikimini/templates/cite.py | 36 | ||||
-rw-r--r-- | wikimini/templates/convert.py | 21 | ||||
-rw-r--r-- | wikimini/templates/language.py | 19 | ||||
-rw-r--r-- | wikimini/templates/mainlinks.py | 15 | ||||
-rw-r--r-- | wikimini/templates/quotes.py | 27 | ||||
-rw-r--r-- | wikimini/templates/various.py | 19 |
8 files changed, 427 insertions, 268 deletions
diff --git a/wikimini/__init__.py b/wikimini/__init__.py index 397c539..b93ef81 100644 --- a/wikimini/__init__.py +++ b/wikimini/__init__.py @@ -4,282 +4,246 @@ import re from tabulate import tabulate -from typing import Union +from typing import Union, Tuple +#: The default API URL, pointing to the english Wikipedia. API_URL = "https://en.wikipedia.org/w/api.php" -# See https://pypi.org/project/tabulate/ for a reference of table formats. -TABLEFMT = "fancy_grid" - - -def page_url(title): - return title.replace(" ", "_") - - -def retrieve(title: str) -> (str, mwp.wikicode.Wikicode): - params = { - "action": "query", - "prop": "revisions", - "rvprop": "content", - "rvslots": "main", - "rvlimit": 1, - "titles": title, - "format": "json", - "formatversion": "2", - } - headers = {"User-Agent": "Wikimini/1.0"} - req = requests.get(API_URL, headers=headers, params=params) - res = req.json() - page = res["query"]["pages"][0] - title = page["title"] - revision = page["revisions"][0] - text = revision["slots"]["main"]["content"] - return (title, mwp.parse(text)) - - -def render_convert(obj: mwp.nodes.template.Template) -> str: - """Renders the {{convert|...}} template.""" - if str(obj.params[1]) in {"-", "to"}: - return "{0}{3} {1} {2}{3}".format( - obj.params[0].value.strip_code(), - obj.params[1].value.strip_code(), - obj.params[2].value.strip_code(), - obj.params[3].value.strip_code(), - ) - return "{}{}".format( - obj.params[0].value.strip_code(), - obj.params[1].value.strip_code(), - ) - - -def render_reign(obj: mwp.nodes.template.Template) -> str: - """Renders the {{reign|...}} template.""" - if not obj.params: - return "r. " - first = obj.params[0].value.strip_code().strip() or "?" - second = "" - if len(obj.params) > 1: - second = obj.params[1].value.strip_code().strip() - return f"r. 
{first} – {second}" - - -def render_cite_book(obj: mwp.nodes.template.Template) -> str: - """Renders the {{cite book|...}} template.""" - title = obj.get("title", None) - if title: - title = title.value.strip_code().strip() - else: - title = "Untitled" - names = [] - for idx in ["%", "%1", "%2", "%3", "%4", "%5", "editor1-%"]: - last = obj.get(idx.replace("%", "last"), None) - if last: - last = last.value.strip_code().strip() - first = obj.get(idx.replace("%", "first"), None) - if first: - first = first.value.strip_code().strip() - if last and first: - names.append(f"{last}, {first}") - elif last: - names.append(last) - elif first: - names.append(first) - return "{} ({})".format(title, "; ".join(names)) - - -def render_main(obj: mwp.nodes.template.Template) -> str: - """Renders the {{main|...}} template.""" - links = [ - "=> {} {}".format(page_url(str(t.value)), t.value) for t in obj.params - ] - return "Main articles:\n{}\n".format("\n".join(links)) - - -def render_quote(obj): - text = obj.get("text", None) - if not text: - return "" - content = _convert(text.value) - lines = content.split("\n") - return "\n".join(f"> {line}" for line in lines) - - -def render_cquote(obj): - text = obj.params[0] - content = _convert(text.value) - lines = content.split("\n") - return "\n".join(f"> {line}" for line in lines) - - -def _convert(obj: Union[mwp.nodes.Node, mwp.wikicode.Wikicode]) -> str: - default = lambda obj: mwp.wikicode.Wikicode([obj]).strip_code(collapse=False) - # This does the actual conversion - if isinstance(obj, mwp.wikicode.Wikicode): - converted = [] - iterator = iter(enumerate(obj.nodes)) - for i, node in iterator: - # Pattern: * [[Wikilink]]\n - if (i >= 2 and - i + 1 < len(obj.nodes) and - # Links can have a plural s after them - re.match("s?\n", str(obj.nodes[i+1])) and - isinstance(node, mwp.nodes.wikilink.Wikilink) and - str(obj.nodes[i-1]) == " " and - str(obj.nodes[i-2]) == "*"): - converted.pop() - converted.pop() - _, after = next(iterator) - 
converted.append("=> {} {}{}".format( - page_url(str(node.title)), - _convert(node), - _convert(after), - )) - continue - # Pattern: *[[Wikilink]]\n - elif (i >= 1 and - i + 1 < len(obj.nodes) and - re.match("s?\n", str(obj.nodes[i+1])) and - isinstance(node, mwp.nodes.wikilink.Wikilink) and - str(obj.nodes[i-1]) == "*"): - converted.pop() - _, after = next(iterator) - converted.append("=> {} {}{}".format( - page_url(str(node.title)), - _convert(node), - _convert(after), - )) - continue - - # Default: Just convert the node - converted.append(_convert(node)) - return "".join(converted) - elif isinstance(obj, mwp.nodes.heading.Heading): - return "{} {}\n".format("#" * min(obj.level, 3), obj.title.strip_code()) - elif isinstance(obj, mwp.nodes.tag.Tag): - # Most tags are handled just fine and can be delegated to strip_code - # (inline text styles), however we can do a bit better for list tags. - if str(obj.wiki_markup) == "*": - return "* " - elif str(obj.wiki_markup) == "#": - return "<!NUM!> " - elif str(obj.tag) == "ref": - return "" - elif str(obj.tag) == "table": - rows = [] - header = () - for row in obj.contents.nodes: - if str(getattr(row, "tag", "")) != "tr": +#: The default format for tables. +#: +#: See https://pypi.org/project/tabulate/ for a reference of table formats. +TABLE_FORMAT = "fancy_grid" + + +class Wikimini: + """The main object for Wikipedia/Gemini access and conversion. + + Attributes: + api_url (str): The base URL of the API. + table_format (str): The name of the table style, see + :const:`TABLE_FORMAT`. + """ + def __init__(self, api_url=API_URL, table_format=TABLE_FORMAT): + self.api_url = api_url + self.table_format = table_format + + def page_url(self, title: str) -> str: + """Returns the link for page with the given title. + + By default, this only replaces the special characters (like the space). + You can override this function if you need more fine-grained control + over how other Wikipedia articles are linked, e.g. 
if you pass them via + a query string or a special parameter. + + Note that this function should only return the URL, not the full link + line. + + Args: + title: Title of the page that should be linked. + + Returns: + The link to the page. + """ + return title.replace(" ", "_") + + def retrieve(self, title: str) -> Tuple[str, mwp.wikicode.Wikicode]: + """Retrieves and parses the Wikipedia article with the given title. + + This uses the API specified in :attr:`api_url` to get the Wikicode. + + Args: + title: The title of the page to retrieve. + + Returns: + A pair of page title and the parsed Wikicode. + """ + params = { + "action": "query", + "prop": "revisions", + "rvprop": "content", + "rvslots": "main", + "rvlimit": 1, + "titles": title, + "format": "json", + "formatversion": "2", + } + headers = {"User-Agent": "Wikimini/1.0"} + req = requests.get(self.api_url, headers=headers, params=params) + res = req.json() + page = res["query"]["pages"][0] + title = page["title"] + revision = page["revisions"][0] + text = revision["slots"]["main"]["content"] + return (title, mwp.parse(text)) + + def _convert(self, obj): + """Function that does the actual conversion. + + This is called recursively on each node, and should perform the correct + conversion - based on the node type. 
+ """ + default = lambda obj:\ + mwp.wikicode.Wikicode([obj]).strip_code(collapse=False) + + # This does the actual conversion + if isinstance(obj, mwp.wikicode.Wikicode): + converted = [] + iterator = iter(enumerate(obj.nodes)) + for i, node in iterator: + # Pattern: * [[Wikilink]]\n + if (i >= 2 and + i + 1 < len(obj.nodes) and + # Links can have a plural s after them + re.match("s?\n", str(obj.nodes[i+1])) and + isinstance(node, mwp.nodes.wikilink.Wikilink) and + str(obj.nodes[i-1]) == " " and + str(obj.nodes[i-2]) == "*"): + converted.pop() + converted.pop() + _, after = next(iterator) + converted.append("=> {} {}{}".format( + self.page_url(str(node.title)), + self._convert(node), + self._convert(after), + )) + continue + # Pattern: *[[Wikilink]]\n + elif (i >= 1 and + i + 1 < len(obj.nodes) and + re.match("s?\n", str(obj.nodes[i+1])) and + isinstance(node, mwp.nodes.wikilink.Wikilink) and + str(obj.nodes[i-1]) == "*"): + converted.pop() + _, after = next(iterator) + converted.append("=> {} {}{}".format( + self.page_url(str(node.title)), + self._convert(node), + self._convert(after), + )) continue - nodes = row.contents.nodes - parsed = [] - row_is_header = False - for node in nodes: - if str(getattr(node, "tag", "")) not in {"td", "th"}: + + # Default: Just convert the node + converted.append(self._convert(node)) + return "".join(converted) + elif isinstance(obj, mwp.nodes.heading.Heading): + return "{} {}\n".format("#" * min(obj.level, 3), obj.title.strip_code()) + elif isinstance(obj, mwp.nodes.tag.Tag): + # Most tags are handled just fine and can be delegated to strip_code + # (inline text styles), however we can do a bit better for list tags. 
+ if str(obj.wiki_markup) == "*": + return "* " + elif str(obj.wiki_markup) == "#": + return "<!NUM!> " + elif str(obj.tag) == "ref": + return "" + elif str(obj.tag) == "table": + rows = [] + header = () + for row in obj.contents.nodes: + if str(getattr(row, "tag", "")) != "tr": continue - if str(node.tag) == "th": - row_is_header = True - parsed.append(_convert(node.contents).strip()) - if not row_is_header: - rows.append(parsed) - else: - header = parsed - return "\n```\n" + tabulate(rows, header, tablefmt=TABLEFMT) + "\n```\n" - else: - return default(obj) - elif isinstance(obj, mwp.nodes.template.Template): - # Most templates are handled fine (and completely stripped), however, - # some of them are useful and provide some output that we should mimic - # (for example, the convert template). - name = str(obj.name).strip().lower() - if name in {"cvt", "convert"}: - return render_convert(obj) - elif name in {"lang", "script"}: - return _convert(obj.params[1].value) - elif name == "lang-ar": - return "Arabic {}/{}".format( - _convert(obj.params[0].value), - _convert(obj.params[1].value), - ) - elif name in {"reign", "r.", "ruled", "rexit"}: - return render_reign(obj) - elif name in {"cite book", "cite journal", "cite news"}: - return render_cite_book(obj) - elif name.startswith("ipa"): - return "pronounced [{}]".format(_convert(obj.params[0].value)) - elif name in {"main", "main article"}: - return render_main(obj) - elif name in {"quote", "blockquote"}: - return render_quote(obj) - elif name == "cquote": - return render_cquote(obj) + nodes = row.contents.nodes + parsed = [] + row_is_header = False + for node in nodes: + if str(getattr(node, "tag", "")) not in {"td", "th"}: + continue + if str(node.tag) == "th": + row_is_header = True + parsed.append(self._convert(node.contents).strip()) + if not row_is_header: + rows.append(parsed) + else: + header = parsed + return "".join([ + "\n```\n", + tabulate(rows, header, tablefmt=self.table_format), + "\n```\n", + ]) + 
else: + return default(obj) + elif isinstance(obj, mwp.nodes.template.Template): + # Most templates are handled fine (and completely stripped), however, + # some of them are useful and provide some output that we should mimic + # (for example, the convert template). + name = str(obj.name) + template = templates.registry.get(name) + if template is None: + return default(obj) + else: + return template(self, obj) + elif isinstance(obj, mwp.nodes.wikilink.Wikilink): + if str(obj.title).startswith("File:") or str(obj.text).startswith("thumb|"): + return "" + elif str(obj.title).startswith("Category:"): + return "" + else: + return default(obj) else: return default(obj) - elif isinstance(obj, mwp.nodes.wikilink.Wikilink): - if str(obj.title).startswith("File:") or str(obj.text).startswith("thumb|"): - return "" - elif str(obj.title).startswith("Category:"): - return "" - else: - return default(obj) - else: - return default(obj) - -def _postprocess(gemtext: str) -> str: - # Strip out any more thumbs that have been left. - # This happens because the wikilinks are nested in each other, which the - # parser would only notice after doing the first replacement. We'll just - # take the easy way out here and use a regex to get rid of them. - gemtext = re.sub("^\\[\\[File:.*?\\]\\]$", "", gemtext, flags=re.MULTILINE) - - # Collapse too many empty lines - while "\n\n\n" in gemtext: - gemtext = gemtext.replace("\n\n\n", "\n\n") - - # Shortcut to avoid unnecessary splitting - if "<!NUM!>" not in gemtext: - return gemtext - - lines = gemtext.split("\n") - counter = 1 - for idx in range(len(lines)): - line = lines[idx] - if line.startswith("<!NUM!>"): - line = line.replace("<!NUM!>", str(counter), 1) - lines[idx] = line - counter += 1 - else: - counter = 1 - return "\n".join(lines) - - -def wikicode_to_gemtext( - obj: Union[mwp.nodes.Node, mwp.wikicode.Wikicode] - ) -> str: - """Try to turn the given object into a sensible Gemtext representation. 
- Note that wikicode is much more powerful than Gemtext, so this is a lossy - function. The returned Gemtext tries to mimic the content of the Wikicode - as much as possible (for human consumption). - - This function mostly mimics - `~mwparserfromhell.wikicode.Wikicode.strip_code`, with some addition to - better handle things that *can* be represented by Gemtext. + def _postprocess(self, gemtext): + # Strip out any more thumbs that have been left. + # This happens because the wikilinks are nested in each other, which the + # parser would only notice after doing the first replacement. We'll just + # take the easy way out here and use a regex to get rid of them. + gemtext = re.sub("^\\[\\[File:.*?\\]\\]$", "", gemtext, flags=re.MULTILINE) + + # Collapse too many empty lines + while "\n\n\n" in gemtext: + gemtext = gemtext.replace("\n\n\n", "\n\n") + + # Shortcut to avoid unnecessary splitting + if "<!NUM!>" not in gemtext: + return gemtext + + lines = gemtext.split("\n") + counter = 1 + for idx in range(len(lines)): + line = lines[idx] + if line.startswith("<!NUM!>"): + line = line.replace("<!NUM!>", str(counter), 1) + lines[idx] = line + counter += 1 + else: + counter = 1 + return "\n".join(lines) + + + def wikicode_to_gemtext( + self, obj: Union[mwp.nodes.Node, mwp.wikicode.Wikicode] + ) -> str: + """Try to turn the given object into a sensible Gemtext representation. + + Note that wikicode is much more powerful than Gemtext, so this is a lossy + function. The returned Gemtext tries to mimic the content of the Wikicode + as much as possible (for human consumption). + + This function mostly mimics + :meth:`~mwparserfromhell.wikicode.Wikicode.strip_code`, with some + additions to better handle things that *can* be represented by Gemtext. + + Args: + obj: The object to convert. + + Returns: + The converted Gemtext. 
+ """ + # Avoid calling str() on the whole Wikicode here + if (isinstance(obj, mwp.wikicode.Wikicode) and + str(mwp.wikicode.Wikicode(obj.nodes[:2])) == "#REDIRECT "): + title = str(obj.nodes[2].title) + if "#" in title: + title, section = title.split("#") + section = f"Section '{section}'" + else: + section = "" + return "Redirect:\n=> {} {}\n{}".format( + self.page_url(title), title, section + ) + return self._postprocess(self._convert(obj)) - Args: - obj: The object to convert. - Returns: - The converted Gemtext. - """ - # Avoid calling str() on the whole Wikicode here - if (isinstance(obj, mwp.wikicode.Wikicode) and - str(mwp.wikicode.Wikicode(obj.nodes[:2])) == "#REDIRECT "): - title = str(obj.nodes[2].title) - if "#" in title: - title, section = title.split("#") - section = f"Section '{section}'" - else: - section = "" - return "Redirect:\n=> {} {}\n{}".format(page_url(title), title, section) - return _postprocess(_convert(obj)) +# import at the bottom to avoid circular dependencies +from . import templates # pylint: disable=wrong-import-position diff --git a/wikimini/templates/__init__.py b/wikimini/templates/__init__.py new file mode 100644 index 0000000..9e983e1 --- /dev/null +++ b/wikimini/templates/__init__.py @@ -0,0 +1,58 @@ +"""Template substitution for Wikimini. + +This module contains functions that mimic Wikipedia's templates. + +A template is a function that takes the :class:`~wikimini.Wikimini` instance and the +:class:`~mwparserfromhell.nodes.template.Template` node to convert, and returns +a string with the template output (see :const:`Template`). +""" +from typing import Callable, Optional + +import mwparserfromhell as mwp + +from .. import Wikimini + + +#: The type of a template render function. 
+Template = Callable[[Wikimini, mwp.nodes.template.Template], str] + + +class Registry: + """A container for all available templates.""" + def __init__(self): + self.templates = {} + + def get(self, name: str) -> Optional[Template]: + """Retrieves the template with the given name. + + Args: + name: The name of the template. + + Returns: + The template if found, or :any:`None`. + """ + # Are templates case-sensitive? + # Yes, except usually the first letter. + # (https://en.wikipedia.org/wiki/Help:A_quick_guide_to_templates#FAQ) + template = self.templates.get(name) + if template is None: + template = self.templates.get(name[0].swapcase() + name[1:]) + return template + + def insert(self, name: str, template: Template): + """Insert the given template into the registry. + + Args: + name: The name of the template. + template: The template to insert. + """ + self.templates[name] = template + + +#: The global template registry. +registry = Registry() + + +from . import ( # pylint: disable=wrong-import-position + convert, mainlinks, quotes, various, cite, language +) diff --git a/wikimini/templates/cite.py b/wikimini/templates/cite.py new file mode 100644 index 0000000..ac4f597 --- /dev/null +++ b/wikimini/templates/cite.py @@ -0,0 +1,36 @@ +"""Citation related templates.""" +from . 
import registry + + +def tmpl_citation(wikimini, obj): + """Renders the ``{{citation|...}}`` template.""" + title = obj.get("title", None) + if title: + title = title.value.strip_code().strip() + else: + title = "Untitled" + names = [] + for idx in ["%", "%1", "%2", "%3", "%4", "%5", "editor1-%"]: + last = obj.get(idx.replace("%", "last"), None) + if last: + last = last.value.strip_code().strip() + first = obj.get(idx.replace("%", "first"), None) + if first: + first = first.value.strip_code().strip() + if last and first: + names.append(f"{last}, {first}") + elif last: + names.append(last) + elif first: + names.append(first) + return "{} ({})".format(title, "; ".join(names)) + + +for name in ["cite", "citation", "cite arXiv", "cite AV media", "cite book", + "cite conference", "cite encyclopedia", "cite episode", + "cite interview", "cite journal", "cite magazine", + "cite mailing list", "cite map", "cite news", "cite newsgroup", + "cite podcast", "cite press release", "cite report", + "cite serial", "cite sign", "cite speech", "cite techreport", + "cite thesis", "cite web"]: + registry.insert(name, tmpl_citation) diff --git a/wikimini/templates/convert.py b/wikimini/templates/convert.py new file mode 100644 index 0000000..a7a3f44 --- /dev/null +++ b/wikimini/templates/convert.py @@ -0,0 +1,21 @@ +"""Implementations for the unit conversion templates.""" +from . 
import registry + + +def tmpl_convert(wikimini, obj): + """Renders the ``{{convert|...}}`` template.""" + if str(obj.params[1]) in {"-", "to"}: + return "{0}{3} {1} {2}{3}".format( + obj.params[0].value.strip_code(), + obj.params[1].value.strip_code(), + obj.params[2].value.strip_code(), + obj.params[3].value.strip_code(), + ) + return "{}{}".format( + obj.params[0].value.strip_code(), + obj.params[1].value.strip_code(), + ) + + +registry.insert("convert", tmpl_convert) +registry.insert("cvt", tmpl_convert) diff --git a/wikimini/templates/language.py b/wikimini/templates/language.py new file mode 100644 index 0000000..052b7f0 --- /dev/null +++ b/wikimini/templates/language.py @@ -0,0 +1,19 @@ +"""Language related templates.""" +from . import registry + + +def tmpl_ipa(wikimini, obj): + """Renders the ``{{IPA|...}}`` template.""" + return "pronounced [{}]".format(wikimini._convert(obj.params[0].value)) + + +registry.insert("IPA", tmpl_ipa) + + +def tmpl_lang(wikimini, obj): + """Renders the ``{{Lang|...}}`` template.""" + return wikimini._convert(obj.params[1].value) + + +registry.insert("lang", tmpl_lang) +registry.insert("script", tmpl_lang) diff --git a/wikimini/templates/mainlinks.py b/wikimini/templates/mainlinks.py new file mode 100644 index 0000000..ffcbc5e --- /dev/null +++ b/wikimini/templates/mainlinks.py @@ -0,0 +1,15 @@ +"""Renders templates that link to further articles.""" +from . import registry + + +def tmpl_main(wikimini, obj): + """Renders the ``{{main|...}}`` template.""" + links = [ + "=> {} {}".format(wikimini.page_url(str(t.value)), t.value) + for t in obj.params + ] + return "Main articles:\n{}\n".format("\n".join(links)) + + +registry.insert("main", tmpl_main) +registry.insert("main article", tmpl_main) diff --git a/wikimini/templates/quotes.py b/wikimini/templates/quotes.py new file mode 100644 index 0000000..7c1429d --- /dev/null +++ b/wikimini/templates/quotes.py @@ -0,0 +1,27 @@ +"""Renders various quote related templates.""" +from . 
import registry + + +def tmpl_quote(wikimini, obj): + """Renders the ``{{blockquote|...}}`` template.""" + text = obj.get("text", None) + if not text: + return "" + content = text.value.strip_code() + lines = content.split("\n") + return "\n".join(f"> {line}" for line in lines) + + +registry.insert("blockquote", tmpl_quote) +registry.insert("quote", tmpl_quote) + + +def tmpl_cquote(wikimini, obj): + """Renders the ``{{cquote|...}}`` template.""" + text = obj.params[0] + content = text.value.strip_code() + lines = content.split("\n") + return "\n".join(f"> {line}" for line in lines) + + +registry.insert("cquote", tmpl_cquote) diff --git a/wikimini/templates/various.py b/wikimini/templates/various.py new file mode 100644 index 0000000..8c6e0d5 --- /dev/null +++ b/wikimini/templates/various.py @@ -0,0 +1,19 @@ +"""Various small templates.""" +from . import registry + + +def tmpl_reign(wikimini, obj): + """Renders the ``{{reign|...}}`` template.""" + if not obj.params: + return "r. " + first = obj.params[0].value.strip_code().strip() or "?" + second = "" + if len(obj.params) > 1: + second = obj.params[1].value.strip_code().strip() + return f"r. {first} – {second}" + + +registry.insert("reign", tmpl_reign) +registry.insert("ruled", tmpl_reign) +registry.insert("rexit", tmpl_reign) +registry.insert("r.", tmpl_reign) |