From 01f52aa2481c314511f43560757c5bd63ca05ac2 Mon Sep 17 00:00:00 2001
From: Daniel Schadt <kingdread@gmx.de>
Date: Sat, 21 Aug 2021 21:58:50 +0200
Subject: generalize File: stripping

There are two reasons for this change:

First, File: is not the only namespace prefix we might want to strip.
Category: was another example, but there are more: User:, Help:, ...
Using \w+ should catch them all.

Secondly, and maybe more importantly, different languages have their
namespaces localized as well. For example, in German, we have Datei:
instead of File:, or Kategorie: instead of Category:. This fix makes the
stripping work properly there as well.

One future change that might have to be made is to expand the regex to
catch namespaces that contain a space (an underscore is already matched
by \w).
---
 wikimini/__init__.py | 7 +++----
 wikimini/document.py | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/wikimini/__init__.py b/wikimini/__init__.py
index d6e49a1..876c7a7 100644
--- a/wikimini/__init__.py
+++ b/wikimini/__init__.py
@@ -1,3 +1,5 @@
+import re
+
 import mwparserfromhell as mwp
 import requests
 
@@ -184,10 +186,7 @@ class Wikimini:
             else:
                 return template(self, obj)
         elif isinstance(obj, mwp.nodes.wikilink.Wikilink):
-            if (str(obj.title).startswith("File:")
-                    or str(obj.text).startswith("thumb|")):
-                return []
-            elif str(obj.title).startswith("Category:"):
+            if re.match("\\w+:", str(obj.title)):
                 return []
             else:
                 return [InlineLink(
diff --git a/wikimini/document.py b/wikimini/document.py
index a363c25..c72aa78 100644
--- a/wikimini/document.py
+++ b/wikimini/document.py
@@ -238,7 +238,7 @@ class Paragraph(Block):
         # (mainly if their text also contains links, in which case it'd require
         # multiple parsing passes). As a quick and dirty fix, we just delete
         # that stuff here:
-        while match := re.search("\\[\\[File:.+?\\]\\]", self.plain()):
+        while match := re.search("\\[\\[\\w+:.+?\\]\\]", self.plain()):
             start_node, start_pos = self._find_index(match.start())
             end_node, end_pos = self._find_index(match.end())
 
-- 
cgit v1.2.3