diff options
-rw-r--r-- | wikimini/document.py | 26 |
1 files changed, 26 insertions, 0 deletions
diff --git a/wikimini/document.py b/wikimini/document.py
index 1e18a8b..a7da2a2 100644
--- a/wikimini/document.py
+++ b/wikimini/document.py
@@ -80,6 +80,9 @@ class Node:
         called on.
         """
 
+    def __len__(self):
+        return len(self.plain())
+
     def __getitem__(self, index):
         if isinstance(index, int):
             return self.plain()[index]
@@ -208,6 +211,16 @@ class Paragraph(Block):
     __slots__ = ("nodes",)
     nodes: List[Node]
 
+    def _find_index(self, idx):  # text offset -> (node index, pos)
+        offset = 0
+        for i, node in enumerate(self.nodes):
+            if idx < offset + len(node):
+                return (i, idx - offset)
+            offset += len(node)
+        if self.nodes and idx == offset:  # i/node are unbound if empty
+            return (i, len(node))
+        raise IndexError(f"{idx} is out of range")
+
     def __bool__(self):
         return bool(self.nodes)
 
@@ -221,6 +234,19 @@ class Paragraph(Block):
         return self.nodes
 
     def cleanup(self):
+        # There is a chance that some "thumbnail" links will get through
+        # (mainly if their text also contains links, in which case it'd require
+        # multiple parsing passes). As a quick and dirty fix, we just delete
+        # that stuff here:
+        while match := re.search("\\[\\[File:.+?\\]\\]", self.plain()):
+            start_node, start_pos = self._find_index(match.start())
+            end_node, end_pos = self._find_index(match.end())
+
+            new_start = self.nodes[start_node][:start_pos]
+            new_end = self.nodes[end_node][end_pos:]
+            self.nodes[start_node:end_node + 1] = [new_start, new_end]
+
+        # Strip leading and trailing whitespace
         while self.nodes and re.match("^\\s+|^$", self.nodes[0].plain()):
             self.nodes[0] = self.nodes[0].with_text(
                 self.nodes[0].plain().lstrip())