From 01f52aa2481c314511f43560757c5bd63ca05ac2 Mon Sep 17 00:00:00 2001 From: Daniel Schadt Date: Sat, 21 Aug 2021 21:58:50 +0200 Subject: generalize File: stripping This has two reasons: First, there's more than just File: we might want to strip. Category: was another example, but there's more - User:, Help:, ... Using \w+ should catch them all. Secondly, and maybe more importantly, different languages have their namespaces localized as well. For example, in German, we have Datei: instead of File:, or Kategorie: instead of Category:. This fix makes the stripping work properly there as well. One future change that might have to be done is to expand the regex to catch namespaces with a space/underscore in them. --- wikimini/__init__.py | 7 +++---- wikimini/document.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/wikimini/__init__.py b/wikimini/__init__.py index d6e49a1..876c7a7 100644 --- a/wikimini/__init__.py +++ b/wikimini/__init__.py @@ -1,3 +1,5 @@ +import re + import mwparserfromhell as mwp import requests @@ -184,10 +186,7 @@ class Wikimini: else: return template(self, obj) elif isinstance(obj, mwp.nodes.wikilink.Wikilink): - if (str(obj.title).startswith("File:") - or str(obj.text).startswith("thumb|")): - return [] - elif str(obj.title).startswith("Category:"): + if re.match("\\w+:", str(obj.title)): return [] else: return [InlineLink( diff --git a/wikimini/document.py b/wikimini/document.py index a363c25..c72aa78 100644 --- a/wikimini/document.py +++ b/wikimini/document.py @@ -238,7 +238,7 @@ class Paragraph(Block): # (mainly if their text also contains links, in which case it'd require # multiple parsing passes). As a quick and dirty fix, we just delete # that stuff here: - while match := re.search("\\[\\[File:.+?\\]\\]", self.plain()): + while match := re.search("\\[\\[\\w+:.+?\\]\\]", self.plain()): start_node, start_pos = self._find_index(match.start()) end_node, end_pos = self._find_index(match.end()) -- cgit v1.2.3