From eb9fbf5f24b5e41cadd1eac8ca970ba819ecb6a5 Mon Sep 17 00:00:00 2001 From: Chris McDonough Date: Wed, 23 Sep 2009 10:24:42 +0000 Subject: Features -------- - Speed up ``repoze.bfg.encode.urlencode`` (nee' ``repoze.bfg.url.urlencode``) slightly. - Speed up ``repoze.bfg.traversal.model_path`` and ``repoze.bfg.traversal.model_path_tuple`` slightly. Internal -------- - Move ``repoze.bfg.traversal._url_quote`` into ``repoze.bfg.encode`` as ``url_quote``. Backwards Incompatibilities --------------------------- - We previously had a Unicode-aware wrapper for the ``urllib.urlencode`` function named ``repoze.bfg.url.urlencode`` which delegated to the stdlib function, but which marshalled all unicode values to utf-8 strings before calling the stdlib version. A newer replacement now lives in ``repoze.bfg.encode`` (old imports will still work). The replacement does not delegate to the stdlib. The replacement diverges from the stdlib implementation and the previous ``repoze.bfg.url`` url implementation inasmuch as its ``doseq`` argument is a decoy: it always behaves in the ``doseq=True`` way (which is the only sane behavior) for speed purposes. The old import location (``repoze.bfg.url.urlencode``) still functions and has not been deprecated. --- CHANGES.txt | 30 +++++++++++ repoze/bfg/encode.py | 107 ++++++++++++++++++++++++++++++++++++++++ repoze/bfg/tests/test_encode.py | 61 +++++++++++++++++++++++ repoze/bfg/tests/test_url.py | 28 ----------- repoze/bfg/traversal.py | 66 ++++++------------------- repoze/bfg/url.py | 47 +----------------- repoze/bfg/urldispatch.py | 6 +-- 7 files changed, 217 insertions(+), 128 deletions(-) create mode 100644 repoze/bfg/encode.py create mode 100644 repoze/bfg/tests/test_encode.py diff --git a/CHANGES.txt b/CHANGES.txt index 42a87940c..8e8a901f1 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -6,6 +6,36 @@ Features - Speed up ``repoze.bfg.location.lineage`` slightly. +- Speed up ``repoze.bfg.encode.urlencode`` (nee' + ``repoze.bfg.url.urlencode``) slightly. + +- Speed up ``repoze.bfg.traversal.model_path`` and + ``repoze.bfg.traversal.model_path_tuple`` slightly. + +Internal +-------- + +- Move ``repoze.bfg.traversal._url_quote`` into ``repoze.bfg.encode`` + as ``url_quote``. + +Backwards Incompatibilities +--------------------------- + +- We previously had a Unicode-aware wrapper for the + ``urllib.urlencode`` function named ``repoze.bfg.url.urlencode`` + which delegated to the stdlib function, but which marshalled all + unicode values to utf-8 strings before calling the stdlib version. + A newer replacement now lives in ``repoze.bfg.encode`` (old imports + will still work). The replacement does not delegate to the stdlib. + + The replacement diverges from the stdlib implementation and the + previous ``repoze.bfg.url`` url implementation inasmuch as its + ``doseq`` argument is a decoy: it always behaves in the + ``doseq=True`` way (which is the only sane behavior) for speed + purposes. + + The old import location (``repoze.bfg.url.urlencode``) still + functions and has not been deprecated. 1.1a4 (2009-09-23) ================== diff --git a/repoze/bfg/encode.py b/repoze/bfg/encode.py new file mode 100644 index 000000000..127c405ed --- /dev/null +++ b/repoze/bfg/encode.py @@ -0,0 +1,107 @@ +import re + +always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' + 'abcdefghijklmnopqrstuvwxyz' + '0123456789' '_.-') +_safemaps = {} +_must_quote = {} + +def url_quote(s, safe=''): + """quote('abc def') -> 'abc%20def' + + Faster version of Python stdlib urllib.quote which also quotes + the '/' character. + + Each part of a URL, e.g. the path info, the query, etc., has a + different set of reserved characters that must be quoted. + + RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists + the following reserved characters. + + reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | + "$" | "," + + Each of these characters is reserved in some component of a URL, + but not necessarily in all of them. + + Unlike the default version of this function in the Python stdlib, + by default, the quote function is intended for quoting individual + path segments instead of an already composed path that might have + '/' characters in it. Thus, it *will* encode any '/' character it + finds in a string. + """ + cachekey = (safe, always_safe) + try: + safe_map = _safemaps[cachekey] + if not _must_quote[cachekey].search(s): + return s + except KeyError: + safe += always_safe + _must_quote[cachekey] = re.compile(r'[^%s]' % safe) + safe_map = {} + for i in range(256): + c = chr(i) + safe_map[c] = (c in safe) and c or ('%%%02X' % i) + _safemaps[cachekey] = safe_map + res = map(safe_map.__getitem__, s) + return ''.join(res) + +def quote_plus(s, safe=''): + """ Version of stdlib quote_plus which uses faster url_quote """ + if ' ' in s: + s = url_quote(s, safe + ' ') + return s.replace(' ', '+') + return url_quote(s, safe) + +def urlencode(query, doseq=True): + """ + An alternate implementation of Python's stdlib `urllib.urlencode + function `_ which + accepts unicode keys and values within the ``query`` + dict/sequence; all Unicode keys and values are first converted to + UTF-8 before being used to compose the query string. + + The value of ``query`` must be a sequence of two-tuples + representing key/value pairs *or* an object (often a dictionary) + with an ``.items()`` method that returns a sequence of two-tuples + representing key/value pairs. + + For minimal calling convention backwards compatibility, this + version of urlencode accepts *but ignores* a second argument + conventionally named ``doseq``. The Python stdlib version behaves + differently when ``doseq`` is False and when a sequence is + presented as one of the values. This version always behaves in + the ``doseq=True`` mode, no matter what the value of the second + argument. + + See the Python stdlib documentation for ``urllib.urlencode`` for + more information. + """ + try: + # presumed to be a dictionary + query = query.items() + except AttributeError: + pass + + result = '' + prefix = '' + + for (k, v) in query: + if k.__class__ is unicode: + k = k.encode('utf-8') + k = quote_plus(str(k)) + if hasattr(v, '__iter__'): + for x in v: + if x.__class__ is unicode: + x = x.encode('utf-8') + x = quote_plus(str(x)) + result += '%s%s=%s' % (prefix, k, x) + prefix = '&' + else: + if v.__class__ is unicode: + v = v.encode('utf-8') + v = quote_plus(str(v)) + result += '%s%s=%s' % (prefix, k, v) + prefix = '&' + + return result diff --git a/repoze/bfg/tests/test_encode.py b/repoze/bfg/tests/test_encode.py new file mode 100644 index 000000000..364247fb3 --- /dev/null +++ b/repoze/bfg/tests/test_encode.py @@ -0,0 +1,61 @@ +import unittest + +class UrlEncodeTests(unittest.TestCase): + def _callFUT(self, query, doseq=False): + from repoze.bfg.encode import urlencode + return urlencode(query, doseq) + + def test_ascii_only(self): + result = self._callFUT([('a',1), ('b',2)]) + self.assertEqual(result, 'a=1&b=2') + + def test_unicode_key(self): + la = unicode('LaPe\xc3\xb1a', 'utf-8') + result = self._callFUT([(la, 1), ('b',2)]) + self.assertEqual(result, 'LaPe%C3%B1a=1&b=2') + + def test_unicode_val_single(self): + la = unicode('LaPe\xc3\xb1a', 'utf-8') + result = self._callFUT([('a', la), ('b',2)]) + self.assertEqual(result, 'a=LaPe%C3%B1a&b=2') + + def test_unicode_val_multiple(self): + la = [unicode('LaPe\xc3\xb1a', 'utf-8')] * 2 + result = self._callFUT([('a', la), ('b',2)], doseq=True) + self.assertEqual(result, 'a=LaPe%C3%B1a&a=LaPe%C3%B1a&b=2') + + def test_dict(self): + result = self._callFUT({'a':1}) + self.assertEqual(result, 'a=1') + +class URLQuoteTests(unittest.TestCase): + def _callFUT(self, val, safe=''): + from repoze.bfg.encode import url_quote + return url_quote(val, safe) + + def test_it_default(self): + la = 'La/Pe\xc3\xb1a' + result = self._callFUT(la) + self.assertEqual(result, 'La%2FPe%C3%B1a') + + def test_it_with_safe(self): + la = 'La/Pe\xc3\xb1a' + result = self._callFUT(la, '/') + self.assertEqual(result, 'La/Pe%C3%B1a') + +class TestQuotePlus(unittest.TestCase): + def _callFUT(self, val, safe=''): + from repoze.bfg.encode import quote_plus + return quote_plus(val, safe) + + def test_it_default(self): + la = 'La Pe\xc3\xb1a' + result = self._callFUT(la) + self.assertEqual(result, 'La+Pe%C3%B1a') + + def test_it_with_safe(self): + la = 'La /Pe\xc3\xb1a' + result = self._callFUT(la, '/') + self.assertEqual(result, 'La+/Pe%C3%B1a') + + diff --git a/repoze/bfg/tests/test_url.py b/repoze/bfg/tests/test_url.py index 5833b8880..1199328e3 100644 --- a/repoze/bfg/tests/test_url.py +++ b/repoze/bfg/tests/test_url.py @@ -129,34 +129,6 @@ class ModelURLTests(unittest.TestCase): result = self._callFUT(root, request) self.assertEqual(result, 'http://example.com:5432/') -class UrlEncodeTests(unittest.TestCase): - def _callFUT(self, query, doseq=False): - from repoze.bfg.url import urlencode - return urlencode(query, doseq) - - def test_ascii_only(self): - result = self._callFUT([('a',1), ('b',2)]) - self.assertEqual(result, 'a=1&b=2') - - def test_unicode_key(self): - la = unicode('LaPe\xc3\xb1a', 'utf-8') - result = self._callFUT([(la, 1), ('b',2)]) - self.assertEqual(result, 'LaPe%C3%B1a=1&b=2') - - def test_unicode_val_single(self): - la = unicode('LaPe\xc3\xb1a', 'utf-8') - result = self._callFUT([('a', la), ('b',2)]) - self.assertEqual(result, 'a=LaPe%C3%B1a&b=2') - - def test_unicode_val_multiple(self): - la = [unicode('LaPe\xc3\xb1a', 'utf-8')] * 2 - result = self._callFUT([('a', la), ('b',2)], doseq=True) - self.assertEqual(result, 'a=LaPe%C3%B1a&a=LaPe%C3%B1a&b=2') - - def test_dict(self): - result = self._callFUT({'a':1}) - self.assertEqual(result, 'a=1') - class TestRouteUrl(unittest.TestCase): def setUp(self): cleanUp() diff --git a/repoze/bfg/traversal.py b/repoze/bfg/traversal.py index 108174924..b43bbc295 100644 --- a/repoze/bfg/traversal.py +++ b/repoze/bfg/traversal.py @@ -1,4 +1,3 @@ -import re import urllib from zope.component import queryMultiAdapter @@ -16,6 +15,7 @@ from repoze.bfg.interfaces import ITraverserFactory from repoze.bfg.interfaces import VH_ROOT_KEY from repoze.bfg.location import lineage +from repoze.bfg.encode import url_quote def find_root(model): """ Find the root node in the graph to which ``model`` @@ -137,8 +137,13 @@ def model_path(model, *elements): will be prepended to the generated path rather than a single leading '/' character. """ - path = _model_path_list(model, *elements) - return path and '/'.join([quote_path_segment(x) for x in path]) or '/' + # joining strings is a bit expensive so we delegate to a function + # which caches the joined result for us + return _join_path_tuple(model_path_tuple(model, *elements)) + +@lru_cache(1000) +def _join_path_tuple(tuple): + return tuple and '/'.join([quote_path_segment(x) for x in tuple]) or '/' def traverse(model, path): """Given a model object as ``model`` and a string or tuple @@ -345,8 +350,8 @@ def model_path_tuple(model, *elements): def _model_path_list(model, *elements): """ Implementation detail shared by model_path and model_path_tuple """ - lpath = reversed(list(lineage(model))) - path = [ location.__name__ or '' for location in lpath ] + path = [loc.__name__ or '' for loc in lineage(model)] + path.reverse() path.extend(elements) return path @@ -485,9 +490,9 @@ def quote_path_segment(segment): return _segment_cache[segment] except KeyError: if segment.__class__ is unicode: # isinstance slighly slower (~15%) - result = _url_quote(segment.encode('utf-8')) + result = url_quote(segment.encode('utf-8')) else: - result = _url_quote(segment) + result = url_quote(segment) # we don't need a lock to mutate _segment_cache, as the below # will generate exactly one Python bytecode (STORE_SUBSCR) _segment_cache[segment] = result @@ -643,48 +648,7 @@ class TraversalContextURL(object): app_url = request.application_url # never ends in a slash return app_url + path -always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' - 'abcdefghijklmnopqrstuvwxyz' - '0123456789' '_.-') -_safemaps = {} -_must_quote = {} - -def _url_quote(s, safe = ''): - """quote('abc def') -> 'abc%20def' - - Faster version of Python stdlib urllib.quote which also quotes - the '/' character. - - Each part of a URL, e.g. the path info, the query, etc., has a - different set of reserved characters that must be quoted. - - RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists - the following reserved characters. - - reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | - "$" | "," - - Each of these characters is reserved in some component of a URL, - but not necessarily in all of them. +@lru_cache(1000) +def _join_path_tuple(tuple): + return tuple and '/'.join([quote_path_segment(x) for x in tuple]) or '/' - Unlike the default version of this function in the Python stdlib, - by default, the quote function is intended for quoting individual - path segments instead of an already composed path that might have - '/' characters in it. Thus, it *will* encode any '/' character it - finds in a string. - """ - cachekey = (safe, always_safe) - try: - safe_map = _safemaps[cachekey] - if not _must_quote[cachekey].search(s): - return s - except KeyError: - safe += always_safe - _must_quote[cachekey] = re.compile(r'[^%s]' % safe) - safe_map = {} - for i in range(256): - c = chr(i) - safe_map[c] = (c in safe) and c or ('%%%02X' % i) - _safemaps[cachekey] = safe_map - res = map(safe_map.__getitem__, s) - return ''.join(res) diff --git a/repoze/bfg/url.py b/repoze/bfg/url.py index 5d1f08faa..1fd9bab1e 100644 --- a/repoze/bfg/url.py +++ b/repoze/bfg/url.py @@ -1,7 +1,6 @@ """ Utility functions for dealing with URLs in repoze.bfg """ import os -import urllib from zope.component import getUtility from zope.component import queryMultiAdapter @@ -9,6 +8,7 @@ from zope.component import queryMultiAdapter from repoze.bfg.interfaces import IContextURL from repoze.bfg.interfaces import IRoutesMapper +from repoze.bfg.encode import urlencode from repoze.bfg.path import caller_package from repoze.bfg.static import StaticRootFactory from repoze.bfg.traversal import TraversalContextURL @@ -249,48 +249,3 @@ def static_url(path, request, **kw): raise ValueError('No static URL definition matching %s' % path) -def urlencode(query, doseq=False): - """ - A wrapper around Python's stdlib `urllib.urlencode function - `_ which accepts - unicode keys and values within the ``query`` dict/sequence; all - Unicode keys and values are first converted to UTF-8 before being - used to compose the query string. The behavior of the function is - otherwise the same as the stdlib version. - - The value of ``query`` must be a sequence of two-tuples - representing key/value pairs *or* an object (often a dictionary) - with an ``.items()`` method that returns a sequence of two-tuples - representing key/value pairs. ``doseq`` controls what happens - when a sequence is presented as one of the values. See the Python - stdlib documentation for ``urllib.urlencode`` for more - information. - """ - if hasattr(query, 'items'): - # presumed to be a dictionary - query = query.items() - - newquery = [] - for k, v in query: - - if k.__class__ is unicode: - k = k.encode('utf-8') - - try: - v.__iter__ - except AttributeError: - if v.__class__ is unicode: - v = v.encode('utf-8') - else: - L = [] - for x in v: - if x.__class__ is unicode: - x = x.encode('utf-8') - L.append(x) - v = L - - newquery.append((k, v)) - - return urllib.urlencode(newquery, doseq=doseq) - - diff --git a/repoze/bfg/urldispatch.py b/repoze/bfg/urldispatch.py index c1d1f71e4..58ea192c6 100644 --- a/repoze/bfg/urldispatch.py +++ b/repoze/bfg/urldispatch.py @@ -1,9 +1,9 @@ import re from urllib import unquote -from repoze.bfg.traversal import _url_quote -from repoze.bfg.traversal import quote_path_segment from repoze.bfg.traversal import traversal_path +from repoze.bfg.traversal import quote_path_segment +from repoze.bfg.encode import url_quote _marker = object() @@ -111,7 +111,7 @@ def _compile_route(route): v = '/'.join([quote_path_segment(x) for x in v]) elif k != star: try: - v = _url_quote(v) + v = url_quote(v) except TypeError: pass newdict[k] = v -- cgit v1.2.3