summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaniel Schadt <kingdread@gmx.de>2021-08-22 23:52:06 +0200
committerDaniel Schadt <kingdread@gmx.de>2021-08-22 23:52:06 +0200
commit23077006b0fdbbf645652d1f76e4be3fe374537f (patch)
tree743f6f98de9979bf758152bf1fc4e3b82d9688fc
parentf39ec7d2f4b609c0968767590eeb864a48b41401 (diff)
downloadwikimini-23077006b0fdbbf645652d1f76e4be3fe374537f.tar.gz
wikimini-23077006b0fdbbf645652d1f76e4be3fe374537f.tar.bz2
wikimini-23077006b0fdbbf645652d1f76e4be3fe374537f.zip
Add some initial documentation
-rw-r--r--.gitignore2
-rw-r--r--doc/Makefile20
-rw-r--r--doc/conf.py63
-rw-r--r--doc/document.rst5
-rw-r--r--doc/formats.rst5
-rw-r--r--doc/index.rst84
-rw-r--r--doc/make.bat35
-rw-r--r--doc/templates.rst5
-rw-r--r--wikimini/document.py63
-rw-r--r--wikimini/formats/__init__.py58
-rw-r--r--wikimini/formats/gemtext.py6
-rw-r--r--wikimini/templates/__init__.py59
12 files changed, 388 insertions, 17 deletions
diff --git a/.gitignore b/.gitignore
index 127aacf..528d901 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
__pycache__
wikimini.egg-info
+/doc/_build/
+/dist/
diff --git a/doc/Makefile b/doc/Makefile
new file mode 100644
index 0000000..d4bb2cb
--- /dev/null
+++ b/doc/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS ?=
+SPHINXBUILD ?= sphinx-build
+SOURCEDIR = .
+BUILDDIR = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100644
index 0000000..327cc49
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,63 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+
+# -- Project information -----------------------------------------------------
+
+project = 'Wikimini'
+copyright = '2021, Daniel Schadt'
+author = 'Daniel Schadt'
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+ "sphinx.ext.autodoc",
+ "sphinx.ext.napoleon",
+ "sphinx_autodoc_typehints",
+ "sphinx.ext.intersphinx",
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'alabaster'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# -- Intersphinx mappings ----------------------------------------------------
+intersphinx_mapping = {
+ 'mwparserfromhell': ('https://mwparserfromhell.readthedocs.io/en/latest/',
+ None),
+ 'stdlib': ('https://docs.python.org/3', None),
+}
diff --git a/doc/document.rst b/doc/document.rst
new file mode 100644
index 0000000..9acbb93
--- /dev/null
+++ b/doc/document.rst
@@ -0,0 +1,5 @@
+The Document representation
+===========================
+
+.. automodule:: wikimini.document
+ :members:
diff --git a/doc/formats.rst b/doc/formats.rst
new file mode 100644
index 0000000..ddf01be
--- /dev/null
+++ b/doc/formats.rst
@@ -0,0 +1,5 @@
+Output Formats
+==============
+
+.. automodule:: wikimini.formats
+ :members:
diff --git a/doc/index.rst b/doc/index.rst
new file mode 100644
index 0000000..9eaecfc
--- /dev/null
+++ b/doc/index.rst
@@ -0,0 +1,84 @@
+.. Wikimini documentation master file, created by
+ sphinx-quickstart on Tue Aug 17 00:07:39 2021.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+Welcome to Wikimini's documentation!
+====================================
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Contents:
+
+ document
+ templates
+ formats
+
+Wikimini is a library that takes MediaWiki markup and renders it into a text
+format, such as `Gemtext
+<https://gemini.circumlunar.space/docs/gemtext.gmi>`__::
+
+ from wikimini import Wikimini
+ from wikimini.formats.gemtext import Gemtext
+ import sys
+
+ # The English Wikipedia is the default source.
+ wiki = Wikimini()
+ _, markup = wiki.retrieve("Coffee")
+ document = wiki.convert_to_document(markup)
+ Gemtext(sys.stdout).render(document)
+
+The reason why Wikimini is "better" than simply stripping all markup (such as
+:meth:`mwparserfromhell.wikicode.Wikicode.strip_code` does) is that you can
+keep a lot more information: Some interesting bits are implemented as templates
+in Wikipedia (markup like ``{{lang|ar|قَهْوَة}}``), and leaving them out either
+means missing out on the provided information, or having nonsensical
+punctuation in your output.
+
+The Wikimini pipeline is made to work in three steps:
+
+#. We start with the parsed :class:`~mwparserfromhell.wikicode.Wikicode`, which
+ is a parsed representation of Wikipedia's markup language
+#. Then convert the :class:`~mwparserfromhell.wikicode.Wikicode` to our
+ internal representation, the :class:`~wikimini.document.Document`. This step
+ already executes the templates and provides a stripped-down markup that only
+ keeps the essential meta information (like headings).
+#. Lastly, we convert our :class:`~wikimini.document.Document` to our desired
+ format with the help of a :class:`~wikimini.formats.Format`.
+
+Extensibility
+-------------
+
+Wikimini is extensible in multiple ways:
+
+The easiest extension is to make Wikimini work for different MediaWiki
+instances. This can be done by passing the correct API URL to the constructor
+of :class:`~wikimini.Wikimini`.
+
+You can also extend Wikimini by teaching it about more templates; for that, see
+:doc:`templates`.
+
+Additionally, you can implement other output formats, see :doc:`formats` for
+that.
+
+Reference
+---------
+
+Most of the interaction with Wikimini is done through the
+:class:`wikimini.Wikimini` object:
+
+.. autoclass:: wikimini.Wikimini
+ :members:
+
+Additionally, the module defines some constants:
+
+.. autodata:: wikimini.API_URL
+
+.. autodata:: wikimini.TABLE_FORMAT
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
diff --git a/doc/make.bat b/doc/make.bat
new file mode 100644
index 0000000..2119f51
--- /dev/null
+++ b/doc/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+ echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+ echo.installed, then set the SPHINXBUILD environment variable to point
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
+ echo.may add the Sphinx directory to PATH.
+ echo.
+ echo.If you don't have Sphinx installed, grab it from
+ echo.http://sphinx-doc.org/
+ exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/doc/templates.rst b/doc/templates.rst
new file mode 100644
index 0000000..b0b36fb
--- /dev/null
+++ b/doc/templates.rst
@@ -0,0 +1,5 @@
+Templates
+=========
+
+.. automodule:: wikimini.templates
+ :members:
diff --git a/wikimini/document.py b/wikimini/document.py
index c72aa78..8dbd998 100644
--- a/wikimini/document.py
+++ b/wikimini/document.py
@@ -1,7 +1,59 @@
-"""The main class of this module is a :class:`Document`, which holds a parsed
-and rendered Wikipedia article.
-
-We distinguish between two kinds of nodes, similar to HTML:
+"""Wikimini internally represents a parsed article as a :class:`Document`. A
+:class:`Document` contains no more references to Wiki-specific markup or
+templates, and instead represents the contents in a "limited" set of markup
+elements.
+
+Wikimini distinguishes between two kinds of elements: Block-level
+(:class:`Block`) and inline (:class:`Node`). A :class:`Document` consists of
+multiple block-level elements, where each block-level element may contain a
+varying number of inline elements.
+
+Working with block elements
+---------------------------
+
+Any block element inherits from :class:`Block`, so any method defined there may
+be used.
+
+Note that :meth:`Block.append` works in a fairly limited fashion. If you are
+trying to build blocks piece by piece, you might want to consider
+:func:`insert_into`, which does some additional checks — for example, it starts
+new paragraphs when a double linebreak is found (``"\\n\\n"``)::
+
+ >>> from wikimini.document import *
+ >>> blocks = []
+ >>> insert_into(blocks, Plain("Paragraph "))
+ >>> insert_into(blocks, Plain("1\\n\\nParagraph 2"))
+ >>> # blocks == [Paragraph(...), Paragraph(...)]
+ >>> assert len(blocks) == 2
+ >>>
+
+Working with inline elements
+----------------------------
+
+Any inline markup inherits from :class:`Node`, so any method defined there may
+be used. Some inline nodes can have other inline nodes nested, for example
+:class:`Style` can have other :class:`Style` nodes nested.
+
+Since a :class:`Document` does not contain nodes directly, any nodes have to be
+wrapped in an appropriate block-level construct first. The trivial choice is a
+:class:`Paragraph`, although there are multiple blocks that contain nodes:
+
+* :class:`Paragraph`
+* :class:`ItemList` (by having :class:`Paragraph` internally)
+* :class:`BlockQuote` (by having :class:`Paragraph` internally)
+
+If you want to add a node to a block element regardless of the block's type or
+ability to keep the nodes as-is, use :meth:`Block.append`. In the case of block
+elements that can not contain inline nodes, the markup will be stripped and
+only the plain text will be inserted::
+
+ >>> from wikimini.document import *
+ >>> verbatim = Verbatim("")
+ >>> verbatim.append(InlineLink("http://localhost", Plain("Visit here!")))
+ >>> assert verbatim.text == "Visit here!"
+
+Reference
+---------
"""
import re
from dataclasses import dataclass, replace
@@ -12,9 +64,10 @@ class Document:
"""A rendered Wikipedia article.
Attributes:
- blocks (List[Block]): A list of top-level nodes.
+ blocks: A list of top-level blocks.
"""
__slots__ = ('blocks',)
+ blocks: List["Block"]
def __init__(self, blocks=None):
self.blocks = []
diff --git a/wikimini/formats/__init__.py b/wikimini/formats/__init__.py
index b48486a..c85cf1d 100644
--- a/wikimini/formats/__init__.py
+++ b/wikimini/formats/__init__.py
@@ -1,8 +1,58 @@
-"""The formats are responsible for turning a
-:class:`~wikimini.document.Document` into an output string.
+"""A :class:`Format` is responsible for turning a
+:class:`~wikimini.document.Document` into an output string. It works as a
+visitor that has methods for the various kinds of document elements, and each
+method can be overridden to produce the desired output.
-Formats work by being given a file-like buffer as argument, into which the
-output should be written.
+Every :class:`Format` is initialized with a file-like writer object that takes
+unicode (such as a file opened in ``"w"`` mode, or a :class:`io.StringIO`
+object) to which the output is written.
+
+Using Formats
+-------------
+
+In order to use a specific format, create an instance of it with your desired
+output stream and then call :meth:`Format.render` (or
+:meth:`Format.render_document`)::
+
+ from wikimini.document import Document
+ from wikimini.formats.gemtext import Gemtext
+ import sys
+ dummy = Document()
+ Gemtext(sys.stdout).render(dummy)
+ # Equivalent in this case, but more explicit:
+ Gemtext(sys.stdout).render_document(dummy)
+
+If you prefer to have your output as a string, you can use the
+:func:`as_string` helper::
+
+ >>> from wikimini.document import Heading
+ >>> from wikimini.formats import gemtext, as_string
+ >>> as_string(gemtext.Gemtext(None), Heading(1, "Coffee"))
+ '# Coffee\\n'
+ >>>
+
+Implementing Formats
+--------------------
+
+In order to implement a new format, simply subclass :class:`Format` and
+override the stub methods.
+
+Note that :meth:`~Format.render`, :meth:`~Format.render_block` and
+:meth:`~Format.render_node` are simple dispatchers and usually should not be
+overridden — override the more specific methods instead.
+
+:meth:`~Format.render_document` contains a default implementation that simply
+renders each block in the document using :meth:`~Format.render_block`. You may
+choose to override this if you need more fine-grained control over how
+consecutive blocks are joined.
+
+Available Formats
+-----------------
+
+.. autoclass:: wikimini.formats.gemtext.Gemtext
+
+Reference
+---------
"""
import io
from typing import TextIO, Union
diff --git a/wikimini/formats/gemtext.py b/wikimini/formats/gemtext.py
index 39df956..c8c8e83 100644
--- a/wikimini/formats/gemtext.py
+++ b/wikimini/formats/gemtext.py
@@ -7,7 +7,11 @@ from ..document import LineBreak, BlockLink, InlineLink
class Gemtext(Format):
- """The Gemtext formatter."""
+ """The Gemtext formatter.
+
+ For the Gemtext specification, see
+ https://gemini.circumlunar.space/docs/gemtext.gmi
+ """
def render_document(self, document):
for block, next_block in zip_longest(
diff --git a/wikimini/templates/__init__.py b/wikimini/templates/__init__.py
index 533ba5e..6614936 100644
--- a/wikimini/templates/__init__.py
+++ b/wikimini/templates/__init__.py
@@ -1,10 +1,55 @@
-"""Template substitution for Wikimini.
-
-This module contains functions that mimic Wikipedia's templates.
-
-A template is a function that takes the :class:`~wikimini.Wikimini` instance
-and the :class:`~mwparserfromhell.nodes.template.Template` node to convert, and
-returns a string with the template output (see :const:`Template`).
+"""A template in the context of Wikipedia is something like a macro, that is a
+piece of "wikicode" that is included from a different page. Templates on
+Wikipedia are used for semantic markup and to ensure a consistent output across
+Wikipedia pages. For example, the `{{lang-de|...}}
+<https://en.wikipedia.org/wiki/Template:Lang-de>`__ template ensures that the
+text is prefixed with "German:", written in italics, and has the right
+``lang=de`` HTML attribute applied.
+
+A lot of templates can be ignored when converting from Wikicode to a plain text
+format (e.g. ``{{stub}}``), however some templates are both (relatively) easy
+to mimic (at least in their basic functionality), and contain interesting data
+that we'd like to keep in the output (e.g. ``{{lang|...}}``).
+
+Implementation in Wikimini
+--------------------------
+
+The type of a template is given in :data:`Template`. The type alias is hard to
+digest, so here's a breakdown:
+
+* A template is a function.
+* The function takes as arguments the :class:`~wikimini.Wikimini` instance and
+ the :class:`mwparserfromhell.nodes.Node` that represents the invocation.
+* A template may return either a list of :class:`wikimini.document.Block` or
+ :class:`wikimini.document.Node`, depending on whether the template should
+ expand to block-level constructs (like ``{{blockquote}}``) or to inline text
+ (like ``{{lang}}``).
+
+In order to make your template implementation known to Wikimini, you need to
+register it using :meth:`Registry.insert`. There are multiple possibilities for
+that:
+
+You can either register it in the global registry
+:data:`wikimini.templates.registry`, in which case it will be used by any
+:class:`~wikimini.Wikimini` instance that uses the global registry (which is
+the default).
+
+You can start a new registry using only your own templates by creating one and
+registering all your templates::
+
+ from wikimini.templates import Registry
+ from wikimini.document import Plain
+ from wikimini import Wikimini
+ registry = Registry()
+ registry.insert("lang", lambda _wiki, _node: [Plain("Schnitzel!")])
+
+ wiki = Wikimini(registry=registry)
+
+Alternatively, you can also clone an existing registry using
+:meth:`Registry.copy` and then add templates to it.
+
+Reference
+---------
"""
import copy
from typing import Callable, Optional, Union, Sequence