diff options
author | Daniel Schadt <kingdread@gmx.de> | 2021-08-21 21:58:50 +0200 |
---|---|---|
committer | Daniel Schadt <kingdread@gmx.de> | 2021-08-21 21:58:50 +0200 |
commit | 01f52aa2481c314511f43560757c5bd63ca05ac2 (patch) | |
tree | 802376c7873d449bc1ccc0230093e233298a6519 | |
parent | a9a7c6c77f3b6078e317d455b696ce76272b88cb (diff) | |
download | wikimini-01f52aa2481c314511f43560757c5bd63ca05ac2.tar.gz wikimini-01f52aa2481c314511f43560757c5bd63ca05ac2.tar.bz2 wikimini-01f52aa2481c314511f43560757c5bd63ca05ac2.zip |
generalize File: stripping
There are two reasons for this change:
First, there are more namespace prefixes than just File: that we might
want to strip. Category: was another example, but there are others as
well - User:, Help:, ... Using \w+ should catch them all.
Secondly, and maybe more importantly, different languages have their
namespaces localized as well. For example, in German, we have Datei:
instead of File:, or Kategorie: instead of Category:. This fix makes the
stripping work properly there as well.
One future change that might have to be made is to expand the regex to
also catch namespaces that contain a space (\w already matches
underscores, but not spaces).
-rw-r--r-- | wikimini/__init__.py | 7 | ||||
-rw-r--r-- | wikimini/document.py | 2 |
2 files changed, 4 insertions, 5 deletions
diff --git a/wikimini/__init__.py b/wikimini/__init__.py index d6e49a1..876c7a7 100644 --- a/wikimini/__init__.py +++ b/wikimini/__init__.py @@ -1,3 +1,5 @@ +import re + import mwparserfromhell as mwp import requests @@ -184,10 +186,7 @@ class Wikimini: else: return template(self, obj) elif isinstance(obj, mwp.nodes.wikilink.Wikilink): - if (str(obj.title).startswith("File:") - or str(obj.text).startswith("thumb|")): - return [] - elif str(obj.title).startswith("Category:"): + if re.match("\\w+:", str(obj.title)): return [] else: return [InlineLink( diff --git a/wikimini/document.py b/wikimini/document.py index a363c25..c72aa78 100644 --- a/wikimini/document.py +++ b/wikimini/document.py @@ -238,7 +238,7 @@ class Paragraph(Block): # (mainly if their text also contains links, in which case it'd require # multiple parsing passes). As a quick and dirty fix, we just delete # that stuff here: - while match := re.search("\\[\\[File:.+?\\]\\]", self.plain()): + while match := re.search("\\[\\[\\w+:.+?\\]\\]", self.plain()): start_node, start_pos = self._find_index(match.start()) end_node, end_pos = self._find_index(match.end()) |