summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- wikimini/document.py 26
1 files changed, 26 insertions, 0 deletions
diff --git a/wikimini/document.py b/wikimini/document.py
index 1e18a8b..a7da2a2 100644
--- a/wikimini/document.py
+++ b/wikimini/document.py
@@ -80,6 +80,9 @@ class Node:
called on.
"""
+ def __len__(self):
+ return len(self.plain())
+
def __getitem__(self, index):
if isinstance(index, int):
return self.plain()[index]
@@ -208,6 +211,16 @@ class Paragraph(Block):
__slots__ = ("nodes",)
nodes: List[Node]
+ def _find_index(self, idx):
+ offset = 0
+ for i, node in enumerate(self.nodes):
+ if idx < offset + len(node):
+ return (i, idx - offset)
+ offset += len(node)
+ if idx == offset:
+ return (i, len(node))
+ raise IndexError(f"{idx} is out of range")
+
def __bool__(self):
return bool(self.nodes)
@@ -221,6 +234,19 @@ class Paragraph(Block):
return self.nodes
def cleanup(self):
+ # There is a chance that some "thumbnail" links will get through
+ # (mainly if their text also contains links, in which case it'd require
+ # multiple parsing passes). As a quick and dirty fix, we just delete
+ # that stuff here:
+ while match := re.search("\\[\\[File:.+?\\]\\]", self.plain()):
+ start_node, start_pos = self._find_index(match.start())
+ end_node, end_pos = self._find_index(match.end())
+
+ new_start = self.nodes[start_node][:start_pos]
+ new_end = self.nodes[end_node][end_pos:]
+ self.nodes[start_node:end_node + 1] = [new_start, new_end]
+
+ # Strip leading and trailing whitespace
while self.nodes and re.match("^\\s+|^$", self.nodes[0].plain()):
self.nodes[0] = self.nodes[0].with_text(
self.nodes[0].plain().lstrip())