From e846cbd2b18e43bdd69234930150f0cc97be984d Mon Sep 17 00:00:00 2001
From: Daniel Schadt <kingdread@gmx.de>
Date: Mon, 16 Aug 2021 15:16:22 +0200
Subject: Add a setup.py

---
 .gitignore           |   2 +
 setup.py             |  15 ++++
 wikimini.py          | 222 ---------------------------------------------------
 wikimini/__init__.py | 222 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 239 insertions(+), 222 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 setup.py
 delete mode 100644 wikimini.py
 create mode 100644 wikimini/__init__.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..127aacf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__
+wikimini.egg-info
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..5af6a59
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+import setuptools
+
+setuptools.setup(
+    name='wikimini',
+    version='0.1',
+    description='Wikipedia to Gemini converter',
+    author='Daniel Schadt',
+    packages=['wikimini'],
+    # The package uses f-strings, so it cannot run on anything before 3.6.
+    python_requires='>=3.6',
+    install_requires=[
+        "requests", "mwparserfromhell", "tabulate",
+    ],
+)
diff --git a/wikimini.py b/wikimini.py
deleted file mode 100644
index 032639a..0000000
--- a/wikimini.py
+++ /dev/null
@@ -1,222 +0,0 @@
-import mwparserfromhell as mwp
-import requests
-import re
-
-from tabulate import tabulate
-
-from typing import Union
-
-
-API_URL = "https://en.wikipedia.org/w/api.php"
-
-# See https://pypi.org/project/tabulate/ for a reference of table formats.
-TABLEFMT = "fancy_grid"
-
-
-def page_url(title):
-    return title.replace(" ", "_")
-
-
-def retrieve(title: str) -> mwp.wikicode.Wikicode:
-    params = {
-        "action": "query",
-        "prop": "revisions",
-        "rvprop": "content",
-        "rvslots": "main",
-        "rvlimit": 1,
-        "titles": title,
-        "format": "json",
-        "formatversion": "2",
-    }
-    headers = {"User-Agent": "Wikimini/1.0"}
-    req = requests.get(API_URL, headers=headers, params=params)
-    res = req.json()
-    revision = res["query"]["pages"][0]["revisions"][0]
-    text = revision["slots"]["main"]["content"]
-    return mwp.parse(text)
-
-
-def render_convert(obj: mwp.nodes.template.Template) -> str:
-    """Renders the {{convert|...}} template."""
-    if str(obj.params[1]) in {"-", "to"}:
-        return "{0}{3} {1} {2}{3}".format(
-            obj.params[0].value.strip_code(),
-            obj.params[1].value.strip_code(),
-            obj.params[2].value.strip_code(),
-            obj.params[3].value.strip_code(),
-        )
-    return "{}{}".format(
-        obj.params[0].value.strip_code(),
-        obj.params[1].value.strip_code(),
-    )
-
-
-def render_reign(obj: mwp.nodes.template.Template) -> str:
-    """Renders the {{reign|...}} template."""
-    if not obj.params:
-        return "r. "
-    first = obj.params[0].value.strip_code().strip() or "?"
-    second = ""
-    if len(obj.params) > 1:
-        second = obj.params[1].value.strip_code().strip()
-    return f"r. {first} – {second}"
-
-
-def render_cite_book(obj: mwp.nodes.template.Template) -> str:
-    """Renders the {{cite book|...}} template."""
-    title = obj.get("title", None)
-    if title:
-        title = title.value.strip_code().strip()
-    else:
-        title = "Untitled"
-    names = []
-    for idx in ["%", "%1", "%2", "%3", "%4", "%5", "editor1-%"]:
-        last = obj.get(idx.replace("%", "last"), None)
-        if last:
-            last = last.value.strip_code().strip()
-        first = obj.get(idx.replace("%", "first"), None)
-        if first:
-            first = first.value.strip_code().strip()
-        if last and first:
-            names.append(f"{last}, {first}")
-        elif last:
-            names.append(last)
-        elif first:
-            names.append(first)
-    return "{} ({})".format(title, "; ".join(names))
-
-
-def _convert(obj: Union[mwp.nodes.Node, mwp.wikicode.Wikicode]) -> str:
-    default = lambda obj: mwp.wikicode.Wikicode([obj]).strip_code(collapse=False)
-    # This does the actual conversion
-    if isinstance(obj, mwp.wikicode.Wikicode):
-        converted = []
-        iterator = iter(enumerate(obj.nodes))
-        for i, node in iterator:
-            # Pattern: * [[Wikilink]]\n
-            if (i >= 2 and
-                    i + 1 < len(obj.nodes) and
-                    # Links can have a plural s after them
-                    re.match("s?\n", str(obj.nodes[i+1])) and
-                    isinstance(node, mwp.nodes.wikilink.Wikilink) and
-                    str(obj.nodes[i-1]) == " " and
-                    str(obj.nodes[i-2]) == "*"):
-                converted.pop()
-                converted.pop()
-                _, after = next(iterator)
-                converted.append("=> {} {}{}".format(
-                    page_url(str(node.title)),
-                    _convert(node),
-                    _convert(after),
-                ))
-                continue
-
-            # Default: Just convert the node
-            converted.append(_convert(node))
-        return "".join(converted)
-    elif isinstance(obj, mwp.nodes.heading.Heading):
-        return "{} {}\n".format("#" * min(obj.level, 3), obj.title.strip_code())
-    elif isinstance(obj, mwp.nodes.tag.Tag):
-        # Most tags are handled just fine and can be delegated to strip_code
-        # (inline text styles), however we can do a bit better for list tags.
-        if str(obj.wiki_markup) == "*":
-            return "*"
-        elif str(obj.wiki_markup) == "#":
-            return "<!NUM!> {}".format(_convert(obj.contents))
-        elif str(obj.tag) == "ref":
-            return ""
-        elif str(obj.tag) == "table":
-            rows = []
-            header = None
-            for row in obj.contents.nodes:
-                if str(getattr(row, "tag", "")) != "tr":
-                    continue
-                nodes = row.contents.nodes
-                parsed = []
-                row_is_header = False
-                for node in nodes:
-                    if str(getattr(node, "tag", "")) not in {"td", "th"}:
-                        continue
-                    if str(node.tag) == "th":
-                        row_is_header = True
-                    parsed.append(_convert(node.contents).strip())
-                if not row_is_header:
-                    rows.append(parsed)
-                else:
-                    header = parsed
-            return "\n```\n" + tabulate(rows, header, tablefmt=TABLEFMT) + "\n```\n"
-        else:
-            return default(obj)
-    elif isinstance(obj, mwp.nodes.template.Template):
-        # Most templates are handled fine (and completely stripped), however,
-        # some of them are useful and provide some output that we should mimic
-        # (for example, the convert template).
-        name = str(obj.name).strip().lower()
-        if name in {"cvt", "convert"}:
-            return render_convert(obj)
-        elif name == "lang":
-            return _convert(obj.params[1].value)
-        elif name in {"reign", "r.", "ruled", "rexit"}:
-            return render_reign(obj)
-        elif name in {"cite book", "cite journal", "cite news"}:
-            return render_cite_book(obj)
-        else:
-            return default(obj)
-    elif isinstance(obj, mwp.nodes.wikilink.Wikilink):
-        if str(obj.title).startswith("File:") or str(obj.text).startswith("thumb|"):
-            return ""
-        elif str(obj.title).startswith("Category:"):
-            return ""
-        else:
-            return default(obj)
-    else:
-        return default(obj)
-
-def _postprocess(gemtext: str) -> str:
-    # Strip out any more thumbs that have been left.
-    # This happens because the wikilinks are nested in each other, which the
-    # parser would only notice after doing the first replacement. We'll just
-    # take the easy way out here and use a regex to get rid of them.
-    gemtext = re.sub("^\\[\\[File:.*?\\]\\]$", "", gemtext, flags=re.MULTILINE)
-
-    # Collapse too many empty lines
-    while "\n\n\n" in gemtext:
-        gemtext = gemtext.replace("\n\n\n", "\n\n")
-
-    # Shortcut to avoid unnecessary splitting
-    if "<!NUM!>" not in gemtext:
-        return gemtext
-
-    lines = gemtext.split("\n")
-    counter = 1
-    for idx in range(len(lines)):
-        line = lines[idx]
-        if line.startswith("<!NUM!>"):
-            line = line.replace("<!NUM!>", str(counter), 1)
-            lines[idx] = line
-            counter += 1
-        else:
-            counter = 1
-    return "\n".join(lines)
-
-
-def wikicode_to_gemtext(
-        obj: Union[mwp.nodes.Node, mwp.wikicode.Wikicode]
-    ) -> str:
-    """Try to turn the given object into a sensible Gemtext representation.
-
-    Note that wikicode is much more powerful than Gemtext, so this is a lossy
-    function. The returned Gemtext tries to mimic the content of the Wikicode
-    as much as possible (for human consumption).
-
-    This function mostly mimics
-    `~mwparserfromhell.wikicode.Wikicode.strip_code`, with some addition to
-    better handle things that *can* be represented by Gemtext.
-
-    Args:
-        obj: The object to convert.
-
-    Returns:
-        The converted Gemtext.
-    """
-    return _postprocess(_convert(obj))
diff --git a/wikimini/__init__.py b/wikimini/__init__.py
new file mode 100644
index 0000000..032639a
--- /dev/null
+++ b/wikimini/__init__.py
@@ -0,0 +1,222 @@
+import mwparserfromhell as mwp
+import requests
+import re
+
+from tabulate import tabulate
+
+from typing import Union
+
+
+API_URL = "https://en.wikipedia.org/w/api.php"  # endpoint queried by retrieve()
+
+# See https://pypi.org/project/tabulate/ for a reference of table formats.
+TABLEFMT = "fancy_grid"  # passed as ``tablefmt`` when _convert renders tables
+
+
+def page_url(title):
+    return "_".join(title.split(" "))  # every space becomes an underscore
+
+
+def retrieve(title: str) -> mwp.wikicode.Wikicode:
+    """Fetch the wikitext of the page ``title`` from the API and parse it.
+
+    Raises ``requests.HTTPError`` if the API answers with an error status.
+    """
+    params = {
+        "action": "query", "prop": "revisions", "rvprop": "content",
+        "rvslots": "main", "rvlimit": 1, "titles": title,
+        "format": "json", "formatversion": "2",
+    }
+    headers = {"User-Agent": "Wikimini/1.0"}
+    # Without a timeout a stalled connection would block the caller forever.
+    req = requests.get(API_URL, headers=headers, params=params, timeout=30)
+    req.raise_for_status()
+    revision = req.json()["query"]["pages"][0]["revisions"][0]
+    text = revision["slots"]["main"]["content"]
+    return mwp.parse(text)
+
+
+def render_convert(obj: mwp.nodes.template.Template) -> str:
+    """Renders the {{convert|...}} template.
+
+    Handles "<value>|<unit>" and the range form "<low>|to|<high>|<unit>";
+    a positional argument that is missing renders as an empty string.
+    """
+    # Pad to four values so that a short template cannot raise IndexError.
+    args = [param.value.strip_code() for param in obj.params[:4]]
+    args += [""] * (4 - len(args))
+    # The range form is still detected on the raw text of the second entry.
+    if len(obj.params) > 1 and str(obj.params[1]) in {"-", "to"}:
+        return "{0}{3} {1} {2}{3}".format(*args)
+    return "{}{}".format(*args[:2])
+
+
+def render_reign(obj: mwp.nodes.template.Template) -> str:
+    """Renders the {{reign|...}} template as "r. <first> – <second>"."""
+    args = obj.params
+    if len(args) == 0:
+        return "r. "
+    # A blank first argument shows as "?", a missing second one stays empty.
+    start = args[0].value.strip_code().strip() or "?"
+    end = args[1].value.strip_code().strip() if len(args) > 1 else ""
+    return f"r. {start} – {end}"
+
+
+def render_cite_book(obj: mwp.nodes.template.Template) -> str:
+    """Renders the {{cite book|...}} template as "<title> (<names>)"."""
+    def text_of(key):
+        # Stripped text of parameter ``key``; an absent (or falsy)
+        # parameter is passed through untouched.
+        param = obj.get(key, None)
+        return param.value.strip_code().strip() if param else param
+
+    heading = obj.get("title", None)
+    heading = heading.value.strip_code().strip() if heading else "Untitled"
+    people = []
+    # "%" is the placeholder that is swapped for "last" and for "first".
+    for pattern in ["%", "%1", "%2", "%3", "%4", "%5", "editor1-%"]:
+        surname = text_of(pattern.replace("%", "last"))
+        given = text_of(pattern.replace("%", "first"))
+        if surname and given:
+            people.append(f"{surname}, {given}")
+        elif surname or given:
+            # Only one half of the name is known, use it on its own.
+            people.append(surname or given)
+    authors = "; ".join(people)
+    return "{} ({})".format(heading, authors)
+
+
+def _convert(obj: Union[mwp.nodes.Node, mwp.wikicode.Wikicode]) -> str:
+    """Recursively turn a node, or a whole Wikicode, into raw Gemtext.
+
+    Whatever has no special rule below is rendered through ``strip_code``.
+    """
+    def default(obj):
+        return mwp.wikicode.Wikicode([obj]).strip_code(collapse=False)
+
+    if isinstance(obj, mwp.wikicode.Wikicode):
+        converted = []
+        iterator = iter(enumerate(obj.nodes))
+        for i, node in iterator:
+            # Pattern: * [[Wikilink]]\n
+            if (i >= 2 and
+                    i + 1 < len(obj.nodes) and
+                    # Links can have a plural s after them
+                    re.match("s?\n", str(obj.nodes[i+1])) and
+                    isinstance(node, mwp.nodes.wikilink.Wikilink) and
+                    str(obj.nodes[i-1]) == " " and
+                    str(obj.nodes[i-2]) == "*"):
+                # Take back the already converted "*" and " " entries.
+                del converted[-2:]
+                _, after = next(iterator)
+                converted.append("=> {} {}{}".format(
+                    page_url(str(node.title)),
+                    _convert(node),
+                    _convert(after),
+                ))
+                continue
+
+            # Default: Just convert the node
+            converted.append(_convert(node))
+        return "".join(converted)
+    elif isinstance(obj, mwp.nodes.heading.Heading):
+        return "{} {}\n".format("#" * min(obj.level, 3), obj.title.strip_code())
+    elif isinstance(obj, mwp.nodes.tag.Tag):
+        # Most tags are handled just fine and can be delegated to strip_code
+        # (inline text styles), however we can do a bit better for list tags.
+        if str(obj.wiki_markup) == "*":
+            return "*"
+        elif str(obj.wiki_markup) == "#":
+            return "<!NUM!> {}".format(_convert(obj.contents))
+        elif str(obj.tag) == "ref":
+            return ""
+        elif str(obj.tag) == "table":
+            rows = []
+            # Start from tabulate's documented default for "no headers"
+            # rather than None, which a table without <th> cells would pass.
+            header = ()
+            for row in obj.contents.nodes:
+                if str(getattr(row, "tag", "")) != "tr":
+                    continue
+                nodes = row.contents.nodes
+                parsed = []
+                row_is_header = False
+                for node in nodes:
+                    if str(getattr(node, "tag", "")) not in {"td", "th"}:
+                        continue
+                    if str(node.tag) == "th":
+                        row_is_header = True
+                    parsed.append(_convert(node.contents).strip())
+                if not row_is_header:
+                    rows.append(parsed)
+                else:
+                    header = parsed
+            return "\n```\n" + tabulate(rows, header, tablefmt=TABLEFMT) + "\n```\n"
+    elif isinstance(obj, mwp.nodes.template.Template):
+        # Most templates are handled fine (and completely stripped), however,
+        # some of them are useful and provide some output that we should mimic
+        # (for example, the convert template).
+        name = str(obj.name).strip().lower()
+        if name in {"cvt", "convert"}:
+            return render_convert(obj)
+        elif name == "lang" and len(obj.params) > 1:
+            return _convert(obj.params[1].value)
+        elif name in {"reign", "r.", "ruled", "rexit"}:
+            return render_reign(obj)
+        elif name in {"cite book", "cite journal", "cite news"}:
+            return render_cite_book(obj)
+    elif isinstance(obj, mwp.nodes.wikilink.Wikilink):
+        hidden = str(obj.title).startswith(("File:", "Category:"))
+        if hidden or str(obj.text).startswith("thumb|"):
+            return ""
+    # Everything not handled above: let mwparserfromhell strip the markup.
+    return default(obj)
+
+def _postprocess(gemtext: str) -> str:
+    """Clean up raw converter output and number the ``<!NUM!>`` markers."""
+    # Nested wikilinks can leave a complete "[[File:...]]" line behind
+    # after the conversion; a regex blanks those lines out.
+    cleaned = re.sub("^\\[\\[File:.*?\\]\\]$", "", gemtext, flags=re.MULTILINE)
+
+    # Shrink every run of three or more newlines down to two.
+    while "\n\n\n" in cleaned:
+        cleaned = cleaned.replace("\n\n\n", "\n\n")
+
+    # Without a placeholder there is nothing left to number.
+    if "<!NUM!>" not in cleaned:
+        return cleaned
+
+    # Replace each marker with its position inside its run of list lines.
+    numbered = []
+    position = 1
+    for row in cleaned.split("\n"):
+        if row.startswith("<!NUM!>"):
+            row = row.replace("<!NUM!>", str(position), 1)
+            position += 1
+        else:
+            # Any other line ends the running list, so start over at 1.
+            position = 1
+        numbered.append(row)
+    return "\n".join(numbered)
+
+
+def wikicode_to_gemtext(
+    obj: Union[mwp.nodes.Node, mwp.wikicode.Wikicode],
+) -> str:
+    """Render ``obj`` as Gemtext, as faithfully as the format allows.
+
+    Gemtext is far less expressive than wikicode, which makes this a lossy
+    conversion; the result aims to mirror the content of the wikicode
+    closely enough for a human reader. It largely follows
+    `~mwparserfromhell.wikicode.Wikicode.strip_code`, but goes further
+    wherever Gemtext has a construct that can represent the markup.
+
+    Args:
+        obj: The node or wikicode to convert.
+
+    Returns:
+        The converted Gemtext.
+    """
+    # Two passes: node-wise conversion first, textual clean-up second.
+    raw = _convert(obj)
+    return _postprocess(raw)
-- 
cgit v1.2.3