diff options
-rw-r--r-- | .gitignore | 2 | ||||
-rw-r--r-- | doc/Makefile | 20 | ||||
-rw-r--r-- | doc/conf.py | 63 | ||||
-rw-r--r-- | doc/document.rst | 5 | ||||
-rw-r--r-- | doc/formats.rst | 5 | ||||
-rw-r--r-- | doc/index.rst | 84 | ||||
-rw-r--r-- | doc/make.bat | 35 | ||||
-rw-r--r-- | doc/templates.rst | 5 | ||||
-rw-r--r-- | wikimini/document.py | 63 | ||||
-rw-r--r-- | wikimini/formats/__init__.py | 58 | ||||
-rw-r--r-- | wikimini/formats/gemtext.py | 6 | ||||
-rw-r--r-- | wikimini/templates/__init__.py | 59 |
12 files changed, 388 insertions, 17 deletions
@@ -1,2 +1,4 @@ __pycache__ wikimini.egg-info +/doc/_build/ +/dist/ diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 0000000..327cc49 --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,63 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'Wikimini' +copyright = '2021, Daniel Schadt' +author = 'Daniel Schadt' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. 
They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx_autodoc_typehints", + "sphinx.ext.intersphinx", +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# -- Intersphinx mappings ---------------------------------------------------- +intersphinx_mapping = { + 'mwparserfromhell': ('https://mwparserfromhell.readthedocs.io/en/latest/', + None), + 'stdlib': ('https://docs.python.org/3', None), +} diff --git a/doc/document.rst b/doc/document.rst new file mode 100644 index 0000000..9acbb93 --- /dev/null +++ b/doc/document.rst @@ -0,0 +1,5 @@ +The Document representation +=========================== + +.. automodule:: wikimini.document + :members: diff --git a/doc/formats.rst b/doc/formats.rst new file mode 100644 index 0000000..ddf01be --- /dev/null +++ b/doc/formats.rst @@ -0,0 +1,5 @@ +Output Formats +============== + +.. automodule:: wikimini.formats + :members: diff --git a/doc/index.rst b/doc/index.rst new file mode 100644 index 0000000..9eaecfc --- /dev/null +++ b/doc/index.rst @@ -0,0 +1,84 @@ +.. 
Wikimini documentation master file, created by + sphinx-quickstart on Tue Aug 17 00:07:39 2021. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to Wikimini's documentation! +==================================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + document + templates + formats + +Wikimini is a library that takes MediaWiki markup and renders it into a text +format, such as `Gemtext +<https://gemini.circumlunar.space/docs/gemtext.gmi>`__:: + + from wikimini import Wikimini + from wikimini.formats.gemtext import Gemtext + import sys + + # The English Wikipedia is the default source. + wiki = Wikimini() + _, markup = wiki.retrieve("Coffee") + document = wiki.convert_to_document(markup) + Gemtext(sys.stdout).render(document) + +The reason why Wikimini is "better" than simply stripping all markup (such as +:meth:`mwparserfromhell.wikicode.Wikicode.strip_code` does) is that you can +keep a lot more information: Some interesting bits are implemented as templates +in Wikipedia (markup like ``{{lang|ar|قَهْوَة}}``), and leaving them out either +means missing out on the provided information, or having nonsensical +punctuation in your output. + +The Wikimini pipeline is made to work in three steps: + +#. We start with the parsed :class:`~mwparserfromhell.wikicode.Wikicode`, which + is a parsed representation of Wikipedia's markup language. +#. Then we convert the :class:`~mwparserfromhell.wikicode.Wikicode` to our + internal representation, the :class:`~wikimini.document.Document`. This step + already executes the templates and provides a stripped-down markup that only + keeps the essential meta information (like headings). +#. Lastly, we convert our :class:`~wikimini.document.Document` to our desired + format with the help of a :class:`~wikimini.formats.Format`.
+ +Extensibility +------------- + +Wikimini is extensible in multiple ways: + +The easiest extension is to make Wikimini work for different MediaWiki +instances. This can be done by passing the correct API URL to the constructor +of :class:`~wikimini.Wikimini`. + +You can also extend Wikimini by teaching it about more templates; for that, see +:doc:`templates`. + +Additionally, you can implement other output formats; see :doc:`formats` for +that. + +Reference +--------- + +Most of the interaction with Wikimini is done through the +:class:`wikimini.Wikimini` object: + +.. autoclass:: wikimini.Wikimini + :members: + +Additionally, the module defines some constants: + +.. autodata:: wikimini.API_URL + +.. autodata:: wikimini.TABLE_FORMAT + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/doc/make.bat b/doc/make.bat new file mode 100644 index 0000000..2119f51 --- /dev/null +++ b/doc/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+ echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+ echo.installed, then set the SPHINXBUILD environment variable to point
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
+ echo.may add the Sphinx directory to PATH.
+ echo.
+ echo.If you don't have Sphinx installed, grab it from
+ echo.http://sphinx-doc.org/
+ exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/doc/templates.rst b/doc/templates.rst new file mode 100644 index 0000000..b0b36fb --- /dev/null +++ b/doc/templates.rst @@ -0,0 +1,5 @@ +Templates +========= + +.. automodule:: wikimini.templates + :members: diff --git a/wikimini/document.py b/wikimini/document.py index c72aa78..8dbd998 100644 --- a/wikimini/document.py +++ b/wikimini/document.py @@ -1,7 +1,59 @@ -"""The main class of this module is a :class:`Document`, which holds a parsed -and rendered Wikipedia article. - -We distinguish between two kinds of nodes, similar to HTML: +"""Wikimini internally represents a parsed article as a :class:`Document`. A +:class:`Document` contains no more references to Wiki-specific markup or +templates, and instead represents the contents in a "limited" set of markup +elements. + +Wikimini distinguishes between two kinds of elements: Block-level +(:class:`Block`) and inline (:class:`Node`). A :class:`Document` consists of +multiple block-level elements, where each block-level element may contain a +varying number of inline elements. + +Working with block elements +--------------------------- + +Any block element inherits from :class:`Block`, so any method defined there may +be used. + +Note that :meth:`Block.append` works in a fairly limited fashion. If you are +trying to build blocks piece by piece, you might want to consider +:func:`insert_into`, which does some additional checks — for example, it starts +new paragraphs when a double linebreak is found (``"\\n\\n"``):: + + >>> from wikimini.document import * + >>> blocks = [] + >>> insert_into(blocks, Plain("Paragraph ")) + >>> insert_into(blocks, Plain("1\\n\\nParagraph 2")) + >>> # blocks == [Paragraph(...), Paragraph(...)] + >>> assert len(blocks) == 2 + >>> + +Working with inline elements +---------------------------- + +Any inline markup inherits from :class:`Node`, so any method defined there may +be used.
Some inline nodes can have other inline nodes nested, for example +:class:`Style` can have other :class:`Style` nodes nested. + +Since a :class:`Document` does not contain nodes directly, any nodes have to be +wrapped in an appropriate block-level construct first. The trivial choice is a +:class:`Paragraph`, although there are multiple blocks that contain nodes: + +* :class:`Paragraph` +* :class:`ItemList` (by having :class:`Paragraph` internally) +* :class:`BlockQuote` (by having :class:`Paragraph` internally) + +If you want to add a node to a block element regardless of the block's type or +ability to keep the nodes as-is, use :meth:`Block.append`. In the case of block +elements that cannot contain inline nodes, the markup will be stripped and +only the plain text will be inserted:: + + >>> from wikimini.document import * + >>> verbatim = Verbatim("") + >>> verbatim.append(InlineLink("http://localhost", Plain("Visit here!"))) + >>> assert verbatim.text == "Visit here!" + +Reference +--------- """ import re from dataclasses import dataclass, replace @@ -12,9 +64,10 @@ class Document: """A rendered Wikipedia article. Attributes: - blocks (List[Block]): A list of top-level nodes. + blocks: A list of top-level blocks. """ __slots__ = ('blocks',) + blocks: List["Block"] def __init__(self, blocks=None): self.blocks = [] diff --git a/wikimini/formats/__init__.py b/wikimini/formats/__init__.py index b48486a..c85cf1d 100644 --- a/wikimini/formats/__init__.py +++ b/wikimini/formats/__init__.py @@ -1,8 +1,58 @@ -"""The formats are responsible for turning a -:class:`~wikimini.document.Document` into an output string. +"""A :class:`Format` is responsible for turning a +:class:`~wikimini.document.Document` into an output string. It works as a +visitor that has methods for the various kinds of document elements, and each +method can be overridden to produce the desired output. -Formats work by being given a file-like buffer as argument, into which the -output should be written.
+Every :class:`Format` is initialized with a file-like writer object that takes +unicode (such as a file opened in ``"w"`` mode, or a :class:`io.StringIO` +object) to which the output is written. + +Using Formats +------------- + +In order to use a specific format, create an instance of it with your desired +output stream and then call :meth:`Format.render` (or +:meth:`Format.render_document`):: + + from wikimini.document import Document + from wikimini.formats.gemtext import Gemtext + import sys + dummy = Document() + Gemtext(sys.stdout).render(dummy) + # Equivalent in this case, but more explicit: + Gemtext(sys.stdout).render_document(dummy) + +If you prefer to have your output as a string, you can use the +:func:`as_string` helper:: + + >>> from wikimini.document import Heading + >>> from wikimini.formats import gemtext, as_string + >>> as_string(gemtext.Gemtext(None), Heading(1, "Coffee")) + '# Coffee\\n' + >>> + +Implementing Formats +-------------------- + +In order to implement a new format, simply subclass :class:`Format` and +override the stub methods. + +Note that :meth:`~Format.render`, :meth:`~Format.render_block` and +:meth:`~Format.render_node` are simple dispatchers and usually should not be +overridden — override the more specific methods instead. + +:meth:`~Format.render_document` contains a default implementation that simply +renders each block in the document using :meth:`~Format.render_block`. You may +choose to override this if you need more fine-grained control over how +consecutive blocks are joined. + +Available Formats +----------------- + +..
autoclass:: wikimini.formats.gemtext.Gemtext + +Reference +--------- """ import io from typing import TextIO, Union diff --git a/wikimini/formats/gemtext.py b/wikimini/formats/gemtext.py index 39df956..c8c8e83 100644 --- a/wikimini/formats/gemtext.py +++ b/wikimini/formats/gemtext.py @@ -7,7 +7,11 @@ from ..document import LineBreak, BlockLink, InlineLink class Gemtext(Format): - """The Gemtext formatter.""" + """The Gemtext formatter. + + For the Gemtext specification, see + https://gemini.circumlunar.space/docs/gemtext.gmi + """ def render_document(self, document): for block, next_block in zip_longest( diff --git a/wikimini/templates/__init__.py b/wikimini/templates/__init__.py index 533ba5e..6614936 100644 --- a/wikimini/templates/__init__.py +++ b/wikimini/templates/__init__.py @@ -1,10 +1,55 @@ -"""Template substitution for Wikimini. - -This module contains functions that mimic Wikipedia's templates. - -A template is a function that takes the :class:`~wikimini.Wikimini` instance -and the :class:`~mwparserfromhell.nodes.template.Template` node to convert, and -returns a string with the template output (see :const:`Template`). +"""A template in the context of Wikipedia is something like a macro, that is a +piece of "wikicode" that is included from a different page. Templates on +Wikipedia are used for semantic markup and to ensure a consistent output across +Wikipedia pages. For example, the `{{lang-de|...}} +<https://en.wikipedia.org/wiki/Template:Lang-de>`__ template ensures that the +text is prefixed with "German:", written in italics, and has the right +``lang=de`` HTML attribute applied. + +A lot of templates can be ignored when converting from Wikicode to a plain text +format (e.g. ``{{stub}}``), however some templates are both (relatively) easy +to mimic (at least in their basic functionality), and contain interesting data +that we'd like to keep in the output (e.g. ``{{lang|...}}``).
+Implementation in Wikimini +-------------------------- + +The type of a template is given in :data:`Template`. The type alias is hard to +digest, so here's a breakdown: + +* A template is a function. +* The function takes as arguments the :class:`~wikimini.Wikimini` instance and + the :class:`mwparserfromhell.nodes.Node` that represents the invocation. +* A template may return either a list of :class:`wikimini.document.Block` or + :class:`wikimini.document.Node`, depending on whether the template should + expand to block-level constructs (like ``{{blockquote}}``) or to inline text + (like ``{{lang}}``). + +In order to make your template implementation known to Wikimini, you need to +register it using :meth:`Registry.insert`. There are multiple possibilities for +that: + +You can either register it in the global registry +:data:`wikimini.templates.registry`, in which case it will be used by any +:class:`~wikimini.Wikimini` instance that uses the global registry (which is +the default). + +You can start a new registry using only your own templates by creating one and +registering all your templates:: + + from wikimini.templates import Registry + from wikimini.document import Plain + from wikimini import Wikimini + registry = Registry() + registry.insert("lang", lambda _wiki, _node: [Plain("Schnitzel!")]) + + wiki = Wikimini(registry=registry) + +Alternatively, you can also clone an existing registry using +:meth:`Registry.copy` and then add templates to it. + +Reference +--------- """ import copy from typing import Callable, Optional, Union, Sequence |