summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Daniel Schadt <kingdread@gmx.de> 2021-08-20 12:35:03 +0200
committer: Daniel Schadt <kingdread@gmx.de> 2021-08-20 12:35:03 +0200
commit: d6e7479fb0b845415c9d1bdcc42936a4f36dde39 (patch)
tree: d90f8ae6595d1635f85212467ad46b04605ba2a5
parent: a05368c8c8f9b97d727dc8d2efcf847743b29f66 (diff)
download: wikimini-d6e7479fb0b845415c9d1bdcc42936a4f36dde39.tar.gz
wikimini-d6e7479fb0b845415c9d1bdcc42936a4f36dde39.tar.bz2
wikimini-d6e7479fb0b845415c9d1bdcc42936a4f36dde39.zip
properly strip File: links that got through
-rw-r--r-- wikimini/document.py | 26
1 files changed, 26 insertions, 0 deletions
diff --git a/wikimini/document.py b/wikimini/document.py
index 1e18a8b..a7da2a2 100644
--- a/wikimini/document.py
+++ b/wikimini/document.py
@@ -80,6 +80,9 @@ class Node:
called on.
"""
+ def __len__(self):
+ return len(self.plain())
+
def __getitem__(self, index):
if isinstance(index, int):
return self.plain()[index]
@@ -208,6 +211,16 @@ class Paragraph(Block):
__slots__ = ("nodes",)
nodes: List[Node]
+ def _find_index(self, idx):
+ offset = 0
+ for i, node in enumerate(self.nodes):
+ if idx < offset + len(node):
+ return (i, idx - offset)
+ offset += len(node)
+ if idx == offset:
+ return (i, len(node))
+ raise IndexError(f"{idx} is out of range")
+
def __bool__(self):
return bool(self.nodes)
@@ -221,6 +234,19 @@ class Paragraph(Block):
return self.nodes
def cleanup(self):
+ # There is a chance that some "thumbnail" links will get through
+ # (mainly if their text also contains links, in which case it'd require
+ # multiple parsing passes). As a quick and dirty fix, we just delete
+ # that stuff here:
+ while match := re.search("\\[\\[File:.+?\\]\\]", self.plain()):
+ start_node, start_pos = self._find_index(match.start())
+ end_node, end_pos = self._find_index(match.end())
+
+ new_start = self.nodes[start_node][:start_pos]
+ new_end = self.nodes[end_node][end_pos:]
+ self.nodes[start_node:end_node + 1] = [new_start, new_end]
+
+ # Strip leading and trailing whitespace
while self.nodes and re.match("^\\s+|^$", self.nodes[0].plain()):
self.nodes[0] = self.nodes[0].with_text(
self.nodes[0].plain().lstrip())