summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Daniel Schadt <kingdread@gmx.de>, 2021-08-21 21:58:50 +0200
committer: Daniel Schadt <kingdread@gmx.de>, 2021-08-21 21:58:50 +0200
commit01f52aa2481c314511f43560757c5bd63ca05ac2 (patch)
tree802376c7873d449bc1ccc0230093e233298a6519
parenta9a7c6c77f3b6078e317d455b696ce76272b88cb (diff)
downloadwikimini-01f52aa2481c314511f43560757c5bd63ca05ac2.tar.gz
wikimini-01f52aa2481c314511f43560757c5bd63ca05ac2.tar.bz2
wikimini-01f52aa2481c314511f43560757c5bd63ca05ac2.zip
generalize File: stripping
There are two reasons for this change. First, there is more than just File: that we might want to strip. Category: was another example, but there are more: User:, Help:, and so on. Using \w+ should catch them all. Second, and maybe more importantly, different languages have their namespaces localized as well. For example, German has Datei: instead of File:, and Kategorie: instead of Category:. This fix makes the stripping work properly there as well. One future change that might have to be made is to expand the regex to catch namespaces with a space/underscore in them (note that \w already matches underscores, so only spaces are currently missed).
-rw-r--r--  wikimini/__init__.py  7
-rw-r--r--  wikimini/document.py  2
2 files changed, 4 insertions, 5 deletions
diff --git a/wikimini/__init__.py b/wikimini/__init__.py
index d6e49a1..876c7a7 100644
--- a/wikimini/__init__.py
+++ b/wikimini/__init__.py
@@ -1,3 +1,5 @@
+import re
+
import mwparserfromhell as mwp
import requests
@@ -184,10 +186,7 @@ class Wikimini:
else:
return template(self, obj)
elif isinstance(obj, mwp.nodes.wikilink.Wikilink):
- if (str(obj.title).startswith("File:")
- or str(obj.text).startswith("thumb|")):
- return []
- elif str(obj.title).startswith("Category:"):
+ if re.match("\\w+:", str(obj.title)):
return []
else:
return [InlineLink(
diff --git a/wikimini/document.py b/wikimini/document.py
index a363c25..c72aa78 100644
--- a/wikimini/document.py
+++ b/wikimini/document.py
@@ -238,7 +238,7 @@ class Paragraph(Block):
# (mainly if their text also contains links, in which case it'd require
# multiple parsing passes). As a quick and dirty fix, we just delete
# that stuff here:
- while match := re.search("\\[\\[File:.+?\\]\\]", self.plain()):
+ while match := re.search("\\[\\[\\w+:.+?\\]\\]", self.plain()):
start_node, start_pos = self._find_index(match.start())
end_node, end_pos = self._find_index(match.end())