diff options
| -rw-r--r-- | CHANGES.txt | 60 | ||||
| -rw-r--r-- | docs/narr/urldispatch.rst | 118 | ||||
| -rw-r--r-- | pyramid/compat.py | 24 | ||||
| -rw-r--r-- | pyramid/config/testing.py | 10 | ||||
| -rw-r--r-- | pyramid/config/util.py | 6 | ||||
| -rw-r--r-- | pyramid/tests/test_config/test_util.py | 16 | ||||
| -rw-r--r-- | pyramid/tests/test_traversal.py | 60 | ||||
| -rw-r--r-- | pyramid/tests/test_urldispatch.py | 89 | ||||
| -rw-r--r-- | pyramid/traversal.py | 96 | ||||
| -rw-r--r-- | pyramid/url.py | 2 | ||||
| -rw-r--r-- | pyramid/urldispatch.py | 128 |
11 files changed, 504 insertions, 105 deletions
diff --git a/CHANGES.txt b/CHANGES.txt index 07c0b564d..8aafeef74 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -27,6 +27,66 @@ Bug Fixes now been fixed and the code matches the documentation. See also https://github.com/Pylons/pyramid/issues/386 +- Literal portions of route patterns were not URL-quoted when ``route_url`` + or ``route_path`` was used to generate a URL or path. + +- The result of ``route_path`` or ``route_url`` might have been ``unicode`` + or ``str`` depending on the input. It is now guaranteed to always be + ``str``. + +- URL matching when the pattern contained non-ASCII characters in literal + parts was indeterminate. Now the pattern supplied to ``add_route`` is + assumed to be either: a ``unicode`` value, or a ``str`` value that contains + only ASCII characters. If you now want to match the path info from a URL + that contains high order characters, you can pass the Unicode + representation of the decoded path portion in the pattern. + +- When using a ``traverse=`` route predicate, traversal would fail with a + URLDecodeError if there were any high-order characters in the traversal + pattern or in the matched dynamic segments. + +- Using a dynamic segment named ``traverse`` in a route pattern like this:: + + config.add_route('trav_route', 'traversal/{traverse:.*}') + + Would cause a ``UnicodeDecodeError`` when the route was matched and the + matched portion of the URL contained any high-order characters. See + https://github.com/Pylons/pyramid/issues/385 . + +- When using a ``*traverse`` stararg in a route pattern, a URL that matched + that possessed a ``@@`` in its name (signifying a view name) would be + inappropriately quoted by the traversal machinery during traversal, + resulting in the view not being found properly. See + https://github.com/Pylons/pyramid/issues/382 and + https://github.com/Pylons/pyramid/issues/375 . 
+ +Backwards Incompatibilities +--------------------------- + +- String values passed to ``route_url`` or ``route_path`` that are meant to + replace "remainder" matches will now be URL-quoted except for embedded + slashes. For example:: + + config.add_route('remain', '/foo*remainder') + request.route_path('remain', remainder='abc / def') + # -> '/foo/abc%20/%20def' + + Previously string values passed as remainder replacements were tacked on + untouched, without any URL-quoting. But this doesn't really work logically + if the value passed is Unicode (raw unicode cannot be placed in a URL or in + a path) and it is inconsistent with the rest of the URL generation + machinery if the value is a string (it won't be quoted unless by the + caller). + + Some folks will have been relying on the older behavior to tack on query + string elements and anchor portions of the URL; sorry, you'll need to + change your code to use the ``_query`` and/or ``_anchor`` arguments to + ``route_path`` or ``route_url`` to do this now. + +- If you pass a bytestring that contains non-ASCII characters to + ``add_route`` as a pattern, it will now fail at startup time. Use Unicode + instead. + 1.3a3 (2011-12-21) ================== diff --git a/docs/narr/urldispatch.rst b/docs/narr/urldispatch.rst index 35613ea1b..6d9dfdd92 100644 --- a/docs/narr/urldispatch.rst +++ b/docs/narr/urldispatch.rst @@ -235,7 +235,7 @@ When matching the following URL: .. code-block:: text - foo/La%20Pe%C3%B1a + http://example.com/foo/La%20Pe%C3%B1a The matchdict will look like so (the value is URL-decoded / UTF-8 decoded): @@ -243,6 +243,51 @@ The matchdict will look like so (the value is URL-decoded / UTF-8 decoded): {'bar':u'La Pe\xf1a'} +Literal strings in the path segment should represent the *decoded* value of +the ``PATH_INFO`` provided to Pyramid. You don't want to use a URL-encoded +value or a bytestring representing the literal's UTF-8 in the pattern. For +example, rather than this: + +.. 
code-block:: text + + /Foo%20Bar/{baz} + +You'll want to use something like this: + +.. code-block:: text + + /Foo Bar/{baz} + +For patterns that contain "high-order" characters in their literals, you'll +want to use a Unicode value as the pattern as opposed to any URL-encoded or +UTF-8-encoded value. For example, you might be tempted to use a bytestring +pattern like this: + +.. code-block:: text + + /La Pe\xc3\xb1a/{x} + +But this will either cause an error at startup time or it won't match +properly. You'll want to use a Unicode value as the pattern rather +than raw bytestring escapes. You can use a high-order Unicode value as the +pattern by using `Python source file encoding +<http://www.python.org/dev/peps/pep-0263/>`_ plus the "real" character in the +Unicode pattern in the source, like so: + +.. code-block:: text + + /La Peña/{x} + +Or you can ignore source file encoding and use equivalent Unicode escape +characters in the pattern. + +.. code-block:: text + + /La Pe\xf1a/{x} + +Dynamic segment names cannot contain high-order characters, so this applies +only to literals in the pattern. + If the pattern has a ``*`` in it, the name which follows it is considered a "remainder match". A remainder match *must* come at the end of the pattern. Unlike segment replacement markers, it does not need to be preceded by a @@ -612,7 +657,6 @@ Use the :meth:`pyramid.request.Request.route_url` method to generate URLs based on route patterns. For example, if you've configured a route with the ``name`` "foo" and the ``pattern`` "{a}/{b}/{c}", you might do this. -.. ignore-next-block .. code-block:: python :linenos: @@ -620,8 +664,74 @@ based on route patterns. For example, if you've configured a route with the This would return something like the string ``http://example.com/1/2/3`` (at least if the current protocol and hostname implied ``http://example.com``). -See the :meth:`~pyramid.request.Request.route_url` API documentation for more -information. 
+ +To generate only the *path* portion of a URL from a route, use the +:meth:`pyramid.request.Request.route_path` API instead of +:meth:`~pyramid.request.Request.route_url`. + +.. code-block:: python + + url = request.route_path('foo', a='1', b='2', c='3') + +This will return the string ``/1/2/3`` rather than a full URL. + +Replacement values passed to ``route_url`` or ``route_path`` must be Unicode +or bytestrings encoded in UTF-8. One exception to this rule exists: if +you're trying to replace a "remainder" match value (a ``*stararg`` +replacement value), the value may be a tuple containing Unicode strings or +UTF-8 strings. + +Note that URLs and paths generated by ``route_path`` and ``route_url`` are +always URL-quoted string types (they contain no non-ASCII characters). +Therefore, if you've added a route like so: + +.. code-block:: python + + config.add_route('la', u'/La Peña/{city}') + +And you later generate a URL using ``route_path`` or ``route_url`` like so: + +.. code-block:: python + + url = request.route_path('la', city=u'Québec') + +You will wind up with the path encoded to UTF-8 and URL quoted like so: + +.. code-block:: text + + /La%20Pe%C3%B1a/Qu%C3%A9bec + +If you have a ``*stararg`` remainder dynamic part of your route pattern: + +.. code-block:: python + + config.add_route('abc', 'a/b/c/*foo') + +And you later generate a URL using ``route_path`` or ``route_url`` using a +*string* as the replacement value: + +.. code-block:: python + + url = request.route_path('abc', foo=u'Québec/biz') + +The value you pass will be URL-quoted except for embedded slashes in the +result: + +.. code-block:: text + + /a/b/c/Qu%C3%A9bec/biz + +You can get a similar result by passing a tuple composed of path elements: + +.. code-block:: python + + url = request.route_path('abc', foo=(u'Québec', u'biz')) + +Each value in the tuple will be url-quoted and joined by slashes in this case: + +.. code-block:: text + + /a/b/c/Qu%C3%A9bec/biz .. 
index:: single: static routes diff --git a/pyramid/compat.py b/pyramid/compat.py index 12b8c9f37..948a1c3be 100644 --- a/pyramid/compat.py +++ b/pyramid/compat.py @@ -41,7 +41,7 @@ def text_(s, encoding='latin-1', errors='strict'): def bytes_(s, encoding='latin-1', errors='strict'): """ If ``s`` is an instance of ``text_type``, return ``s.encode(encoding, errors)``, otherwise return ``s``""" - if isinstance(s, text_type): + if isinstance(s, text_type): # pragma: no cover return s.encode(encoding, errors) return s @@ -105,10 +105,10 @@ else: from urllib import unquote as url_unquote from urllib import urlencode as url_encode from urllib2 import urlopen as url_open - def url_unquote_text(v, encoding='utf-8', errors='replace'): + def url_unquote_text(v, encoding='utf-8', errors='replace'): # pragma: no cover v = url_unquote(v) return v.decode(encoding, errors) - def url_unquote_native(v, encoding='utf-8', errors='replace'): + def url_unquote_native(v, encoding='utf-8', errors='replace'): # pragma: no cover return native_(url_unquote_text(v, encoding, errors)) @@ -213,3 +213,21 @@ except ImportError: # pragma: no cover import json +if PY3: # pragma: no cover + # see PEP 3333 for why we encode WSGI PATH_INFO to latin-1 before + # decoding it to utf-8 + def decode_path_info(path): + return path.encode('latin-1').decode('utf-8') +else: + def decode_path_info(path): + return path.decode('utf-8') + +if PY3: # pragma: no cover + # see PEP 3333 for why we decode the path to latin-1 + from urllib.parse import unquote_to_bytes + def unquote_bytes_to_wsgi(bytestring): + return unquote_to_bytes(bytestring).decode('latin-1') +else: + from urlparse import unquote as unquote_to_bytes + def unquote_bytes_to_wsgi(bytestring): + return unquote_to_bytes(bytestring) diff --git a/pyramid/config/testing.py b/pyramid/config/testing.py index 3cdc1aa24..f40cf25a7 100644 --- a/pyramid/config/testing.py +++ b/pyramid/config/testing.py @@ -8,7 +8,11 @@ from pyramid.interfaces import ( ) from 
pyramid.renderers import RendererHelper -from pyramid.traversal import traversal_path_info + +from pyramid.traversal import ( + decode_path_info, + split_path_info, + ) from pyramid.config.util import action_method @@ -66,9 +70,9 @@ class TestingConfiguratorMixin(object): self.context = context def __call__(self, request): - path = request.environ['PATH_INFO'] + path = decode_path_info(request.environ['PATH_INFO']) ob = resources[path] - traversed = traversal_path_info(path) + traversed = split_path_info(path) return {'context':ob, 'view_name':'','subpath':(), 'traversed':traversed, 'virtual_root':ob, 'virtual_root_path':(), 'root':ob} diff --git a/pyramid/config/util.py b/pyramid/config/util.py index b0e873de3..79f13e4a0 100644 --- a/pyramid/config/util.py +++ b/pyramid/config/util.py @@ -15,7 +15,7 @@ from pyramid.exceptions import ConfigurationError from pyramid.traversal import ( find_interface, - traversal_path_info, + traversal_path, ) from hashlib import md5 @@ -268,8 +268,8 @@ def make_predicates(xhr=None, request_method=None, path_info=None, if 'traverse' in context: return True m = context['match'] - tvalue = tgenerate(m) - m['traverse'] = traversal_path_info(tvalue) + tvalue = tgenerate(m) # tvalue will be urlquoted string + m['traverse'] = traversal_path(tvalue) # will be seq of unicode return True # This isn't actually a predicate, it's just a infodict # modifier that injects ``traverse`` into the matchdict. 
As a diff --git a/pyramid/tests/test_config/test_util.py b/pyramid/tests/test_config/test_util.py index 1180e7e29..ebf308929 100644 --- a/pyramid/tests/test_config/test_util.py +++ b/pyramid/tests/test_config/test_util.py @@ -1,4 +1,5 @@ import unittest +from pyramid.compat import text_ class Test__make_predicates(unittest.TestCase): def _callFUT(self, **kw): @@ -227,6 +228,21 @@ class Test__make_predicates(unittest.TestCase): self.assertEqual(info, {'match': {'a':'a', 'b':'b', 'traverse':('1', 'a', 'b')}}) + def test_traverse_matches_with_highorder_chars(self): + order, predicates, phash = self._callFUT( + traverse=text_(b'/La Pe\xc3\xb1a/{x}', 'utf-8')) + self.assertEqual(len(predicates), 1) + pred = predicates[0] + info = {'match':{'x':text_(b'Qu\xc3\xa9bec', 'utf-8')}} + request = DummyRequest() + result = pred(info, request) + self.assertEqual(result, True) + self.assertEqual( + info['match']['traverse'], + (text_(b'La Pe\xc3\xb1a', 'utf-8'), + text_(b'Qu\xc3\xa9bec', 'utf-8')) + ) + def test_custom_predicates_can_affect_traversal(self): def custom(info, request): m = info['match'] diff --git a/pyramid/tests/test_traversal.py b/pyramid/tests/test_traversal.py index 72192b23b..1f9971ca5 100644 --- a/pyramid/tests/test_traversal.py +++ b/pyramid/tests/test_traversal.py @@ -1,10 +1,13 @@ import unittest from pyramid.testing import cleanUp -from pyramid.compat import text_ -from pyramid.compat import native_ -from pyramid.compat import text_type -from pyramid.compat import url_quote +from pyramid.compat import ( + text_, + native_, + text_type, + url_quote, + PY3, + ) class TraversalPathTests(unittest.TestCase): def _callFUT(self, path): @@ -131,6 +134,28 @@ class ResourceTreeTraverserTests(unittest.TestCase): self.assertEqual(result['virtual_root'], policy.root) self.assertEqual(result['virtual_root_path'], ()) + def test_call_with_pathinfo_highorder(self): + foo = DummyContext(None, text_(b'Qu\xc3\xa9bec', 'utf-8')) + root = DummyContext(foo, 'root') + policy = 
self._makeOne(root) + if PY3: # pragma: no cover + path_info = b'/Qu\xc3\xa9bec'.decode('latin-1') + else: + path_info = b'/Qu\xc3\xa9bec' + environ = self._getEnviron(PATH_INFO=path_info) + request = DummyRequest(environ) + result = policy(request) + self.assertEqual(result['context'], foo) + self.assertEqual(result['view_name'], '') + self.assertEqual(result['subpath'], ()) + self.assertEqual( + result['traversed'], + (text_(b'Qu\xc3\xa9bec', 'utf-8'),) + ) + self.assertEqual(result['root'], policy.root) + self.assertEqual(result['virtual_root'], policy.root) + self.assertEqual(result['virtual_root_path'], ()) + def test_call_pathel_with_no_getitem(self): policy = self._makeOne(None) environ = self._getEnviron(PATH_INFO='/foo/bar') @@ -295,6 +320,33 @@ class ResourceTreeTraverserTests(unittest.TestCase): self.assertEqual(result['virtual_root'], policy.root) self.assertEqual(result['virtual_root_path'], ()) + def test_call_with_vh_root_highorder(self): + bar = DummyContext(None, 'bar') + foo = DummyContext(bar, text_(b'Qu\xc3\xa9bec', 'utf-8')) + root = DummyContext(foo, 'root') + policy = self._makeOne(root) + if PY3: # pragma: no cover + vhm_root = b'/Qu\xc3\xa9bec'.decode('latin-1') + else: + vhm_root = b'/Qu\xc3\xa9bec' + environ = self._getEnviron(HTTP_X_VHM_ROOT=vhm_root, + PATH_INFO='/bar') + request = DummyRequest(environ) + result = policy(request) + self.assertEqual(result['context'], bar) + self.assertEqual(result['view_name'], '') + self.assertEqual(result['subpath'], ()) + self.assertEqual( + result['traversed'], + (text_(b'Qu\xc3\xa9bec', 'utf-8'), text_('bar')) + ) + self.assertEqual(result['root'], policy.root) + self.assertEqual(result['virtual_root'], foo) + self.assertEqual( + result['virtual_root_path'], + (text_(b'Qu\xc3\xa9bec', 'utf-8'),) + ) + def test_non_utf8_path_segment_unicode_path_segments_fails(self): from pyramid.exceptions import URLDecodeError foo = DummyContext() diff --git a/pyramid/tests/test_urldispatch.py 
b/pyramid/tests/test_urldispatch.py index be823b045..370f072ff 100644 --- a/pyramid/tests/test_urldispatch.py +++ b/pyramid/tests/test_urldispatch.py @@ -115,6 +115,12 @@ class RoutesMapperTests(unittest.TestCase): self.assertEqual(mapper.routelist[0].pattern, 'archives/:action/:article2') + def test___call__pathinfo_cant_be_decoded(self): + from pyramid.exceptions import URLDecodeError + mapper = self._makeOne() + request = self._getRequest(PATH_INFO=b'\xff\xfe\xe6\x00') + self.assertRaises(URLDecodeError, mapper, request) + def test___call__route_matches(self): mapper = self._makeOne() mapper.connect('foo', 'archives/:action/:article') @@ -292,12 +298,6 @@ class TestCompileRoute(unittest.TestCase): self.assertEqual(matcher('foo/baz/biz/buz/bar'), None) self.assertEqual(generator({'baz':1, 'buz':2}), '/foo/1/biz/2/bar') - def test_url_decode_error(self): - from pyramid.exceptions import URLDecodeError - matcher, generator = self._callFUT('/:foo') - self.assertRaises(URLDecodeError, matcher, - native_(b'/\xff\xfe\x8b\x00')) - def test_custom_regex(self): matcher, generator = self._callFUT('foo/{baz}/biz/{buz:[^/\.]+}.{bar}') self.assertEqual(matcher('/foo/baz/biz/buz.bar'), @@ -328,7 +328,8 @@ class TestCompileRoute(unittest.TestCase): self.assertEqual(generator({'buz':2001}), '/2001') def test_custom_regex_with_embedded_squigglies3(self): - matcher, generator = self._callFUT('/{buz:(\d{2}|\d{4})-[a-zA-Z]{3,4}-\d{2}}') + matcher, generator = self._callFUT( + '/{buz:(\d{2}|\d{4})-[a-zA-Z]{3,4}-\d{2}}') self.assertEqual(matcher('/2001-Nov-15'), {'buz':'2001-Nov-15'}) self.assertEqual(matcher('/99-June-10'), {'buz':'99-June-10'}) self.assertEqual(matcher('/2-Nov-15'), None) @@ -337,6 +338,63 @@ class TestCompileRoute(unittest.TestCase): self.assertEqual(generator({'buz':'2001-Nov-15'}), '/2001-Nov-15') self.assertEqual(generator({'buz':'99-June-10'}), '/99-June-10') + def test_pattern_with_high_order_literal(self): + pattern = text_(b'/La Pe\xc3\xb1a/{x}', 'utf-8') + 
matcher, generator = self._callFUT(pattern) + self.assertEqual(matcher(text_(b'/La Pe\xc3\xb1a/x', 'utf-8')), + {'x':'x'}) + self.assertEqual(generator({'x':'1'}), '/La%20Pe%C3%B1a/1') + + def test_pattern_generate_with_high_order_dynamic(self): + pattern = '/{x}' + _, generator = self._callFUT(pattern) + self.assertEqual( + generator({'x':text_(b'La Pe\xc3\xb1a', 'utf-8')}), + '/La%20Pe%C3%B1a') + + def test_docs_sample_generate(self): + # sample from urldispatch.rst + pattern = text_(b'/La Pe\xc3\xb1a/{city}', 'utf-8') + _, generator = self._callFUT(pattern) + self.assertEqual( + generator({'city':text_(b'Qu\xc3\xa9bec', 'utf-8')}), + '/La%20Pe%C3%B1a/Qu%C3%A9bec') + + def test_generate_with_mixedtype_values(self): + pattern = '/{city}/{state}' + _, generator = self._callFUT(pattern) + result = generator( + {'city': text_(b'Qu\xc3\xa9bec', 'utf-8'), + 'state': b'La Pe\xc3\xb1a'} + ) + self.assertEqual(result, '/Qu%C3%A9bec/La%20Pe%C3%B1a') + # should be a native string + self.assertEqual(type(result), str) + + def test_highorder_pattern_utf8(self): + pattern = b'/La Pe\xc3\xb1a/{city}' + self.assertRaises(ValueError, self._callFUT, pattern) + + def test_generate_with_string_remainder_and_unicode_replacement(self): + pattern = text_(b'/abc*remainder', 'utf-8') + _, generator = self._callFUT(pattern) + result = generator( + {'remainder': text_(b'/Qu\xc3\xa9bec/La Pe\xc3\xb1a', 'utf-8')} + ) + self.assertEqual(result, '/abc/Qu%C3%A9bec/La%20Pe%C3%B1a') + # should be a native string + self.assertEqual(type(result), str) + + def test_generate_with_string_remainder_and_nonstring_replacement(self): + pattern = text_(b'/abc/*remainder', 'utf-8') + _, generator = self._callFUT(pattern) + result = generator( + {'remainder': None} + ) + self.assertEqual(result, '/abc/None') + # should be a native string + self.assertEqual(type(result), str) + class TestCompileRouteFunctional(unittest.TestCase): def matches(self, pattern, path, expected): from pyramid.urldispatch import 
_compile_route @@ -368,11 +426,11 @@ class TestCompileRouteFunctional(unittest.TestCase): self.matches('*traverse', '/zzz/abc', {'traverse':('zzz', 'abc')}) self.matches('*traverse', '/zzz/ abc', {'traverse':('zzz', ' abc')}) #'/La%20Pe%C3%B1a' - self.matches('{x}', native_(b'/La Pe\xc3\xb1a'), - {'x':text_(b'La Pe\xf1a')}) + self.matches('{x}', text_(b'/La Pe\xc3\xb1a', 'utf-8'), + {'x':text_(b'La Pe\xc3\xb1a', 'utf-8')}) # '/La%20Pe%C3%B1a/x' - self.matches('*traverse', native_(b'/La Pe\xc3\xb1a/x'), - {'traverse':(text_(b'La Pe\xf1a'), 'x')}) + self.matches('*traverse', text_(b'/La Pe\xc3\xb1a/x'), + {'traverse':(text_(b'La Pe\xc3\xb1a'), 'x')}) self.matches('/foo/{id}.html', '/foo/bar.html', {'id':'bar'}) self.matches('/{num:[0-9]+}/*traverse', '/555/abc/def', {'num':'555', 'traverse':('abc', 'def')}) @@ -394,11 +452,12 @@ class TestCompileRouteFunctional(unittest.TestCase): self.matches('*traverse', '/zzz/abc', {'traverse':('zzz', 'abc')}) self.matches('*traverse', '/zzz/ abc', {'traverse':('zzz', ' abc')}) #'/La%20Pe%C3%B1a' - self.matches(':x', native_(b'/La Pe\xc3\xb1a'), - {'x':text_(b'La Pe\xf1a')}) + # pattern, path, expected + self.matches(':x', text_(b'/La Pe\xc3\xb1a', 'utf-8'), + {'x':text_(b'La Pe\xc3\xb1a', 'utf-8')}) # '/La%20Pe%C3%B1a/x' - self.matches('*traverse', native_(b'/La Pe\xc3\xb1a/x'), - {'traverse':(text_(b'La Pe\xf1a'), 'x')}) + self.matches('*traverse', text_(b'/La Pe\xc3\xb1a/x', 'utf-8'), + {'traverse':(text_(b'La Pe\xc3\xb1a', 'utf-8'), 'x')}) self.matches('/foo/:id.html', '/foo/bar.html', {'id':'bar'}) self.matches('/foo/:id_html', '/foo/bar_html', {'id_html':'bar_html'}) self.matches('zzz/:_', '/zzz/abc', {'_':'abc'}) diff --git a/pyramid/traversal.py b/pyramid/traversal.py index cd624fd30..84dcd33ec 100644 --- a/pyramid/traversal.py +++ b/pyramid/traversal.py @@ -16,12 +16,12 @@ from pyramid.compat import ( PY3, native_, text_, - bytes_, ascii_native_, text_type, binary_type, - url_unquote_native, is_nonstr_iter, + 
decode_path_info, + unquote_bytes_to_wsgi, ) from pyramid.encode import url_quote @@ -429,33 +429,46 @@ def virtual_root(resource, request): def traversal_path(path): """ Variant of :func:`pyramid.traversal.traversal_path_info` suitable for - decoding paths that are URL-encoded.""" - path = ascii_native_(path) - path = url_unquote_native(path, 'latin-1', 'strict') - return traversal_path_info(path) + decoding paths that are URL-encoded. + + If this function is passed a Unicode object instead of a sequence of + bytes as ``path``, that Unicode object *must* be directly encodable to + ASCII. For example, u'/foo' will work but u'/<unprintable unicode>' (a + Unicode object with characters that cannot be encoded to ascii) will + not. A :exc:`UnicodeEncodeError` will be raised if the Unicode cannot be + encoded directly to ASCII. + """ + if isinstance(path, text_type): + # must not possess characters outside ascii + path = path.encode('ascii') + # we unquote this path exactly like a PEP 3333 server would + path = unquote_bytes_to_wsgi(path) # result will be a native string + return traversal_path_info(path) # result will be a tuple of unicode @lru_cache(1000) def traversal_path_info(path): - """ Given a ``PATH_INFO`` environ value (slash-separated path segments), - return a tuple representing that path which can be used to traverse a - resource tree. - - ``PATH_INFO`` is assumed to already be URL-decoded. It is encoded to - bytes using the Latin-1 encoding; the resulting set of bytes is - subsequently decoded to text using the UTF-8 encoding; a - :exc:`pyramid.exc.URLDecodeError` is raised if a the URL cannot be - decoded. - - The ``PATH_INFO`` is split on slashes, creating a list of segments. Each - segment subsequently decoded into Unicode. If a segment name is empty or - if it is ``.``, it is ignored. If a segment name is ``..``, the previous - segment is deleted, and the ``..`` is ignored. 
- If this function is passed a Unicode object instead of a string, that - Unicode object *must* directly encodeable to ASCII. For example, u'/foo' - will work but u'/<unprintable unicode>' (a Unicode object with characters - that cannot be encoded to ascii) will not. A :exc:`UnicodeError` will be - raised if the Unicode cannot be encoded directly to ASCII. + """ Given ``path``, return a tuple representing that path which can be + used to traverse a resource tree. ``path`` is assumed to be an + already-URL-decoded ``str`` type as if it had come to us from an upstream + WSGI server as the ``PATH_INFO`` environ variable. + + The ``path`` is first decoded from its WSGI representation to Unicode; + it is decoded differently depending on platform: + + - On Python 2, ``path`` is decoded to Unicode from bytes using the UTF-8 + decoding directly; a :exc:`pyramid.exc.URLDecodeError` is raised if the + URL cannot be decoded. + + - On Python 3, as per the PEP 3333 spec, ``path`` is first encoded to + bytes using the Latin-1 encoding; the resulting set of bytes is + subsequently decoded to text using the UTF-8 encoding; a + :exc:`pyramid.exc.URLDecodeError` is raised if the URL cannot be + decoded. + + The ``path`` is split on slashes, creating a list of segments. If a + segment name is empty or if it is ``.``, it is ignored. If a segment + name is ``..``, the previous segment is deleted, and the ``..`` is + ignored. Examples: @@ -504,9 +517,15 @@ def traversal_path_info(path): applications in :app:`Pyramid`. 
""" try: - path = bytes_(path, 'latin-1').decode('utf-8') + path = decode_path_info(path) # result will be Unicode except UnicodeDecodeError as e: raise URLDecodeError(e.encoding, e.object, e.start, e.end, e.reason) + return split_path_info(path) # result will be tuple of Unicode + +@lru_cache(1000) +def split_path_info(path): + # suitable for splitting an already-unquoted-already-decoded (unicode) + # path value path = path.strip('/') clean = [] for segment in path.split('/'): @@ -622,25 +641,34 @@ class ResourceTreeTraverser(object): path = matchdict.get('traverse', '/') or '/' if is_nonstr_iter(path): # this is a *traverse stararg (not a {traverse}) - path = '/'.join([quote_path_segment(x) for x in path]) or '/' + # routing has already decoded these elements, so we just + # need to join them + path = '/'.join(path) or '/' subpath = matchdict.get('subpath', ()) if not is_nonstr_iter(subpath): # this is not a *subpath stararg (just a {subpath}) - subpath = traversal_path_info(subpath) + # routing has already decoded this string, so we just need + # to split it + subpath = split_path_info(subpath) else: # this request did not match a route subpath = () try: - path = environ['PATH_INFO'] or '/' + # empty if mounted under a path in mod_wsgi, for example + path = decode_path_info(environ['PATH_INFO'] or '/') except KeyError: path = '/' + except UnicodeDecodeError as e: + raise URLDecodeError(e.encoding, e.object, e.start, e.end, + e.reason) if VH_ROOT_KEY in environ: - vroot_path = environ[VH_ROOT_KEY] - vroot_tuple = traversal_path_info(vroot_path) - vpath = vroot_path + path + # HTTP_X_VHM_ROOT + vroot_path = decode_path_info(environ[VH_ROOT_KEY]) + vroot_tuple = split_path_info(vroot_path) + vpath = vroot_path + path # both will (must) be unicode or asciistr vroot_idx = len(vroot_tuple) -1 else: vroot_tuple = () @@ -660,7 +688,7 @@ class ResourceTreeTraverser(object): # and this hurts readability; apologies i = 0 view_selector = self.VIEW_SELECTOR - vpath_tuple = 
traversal_path_info(vpath) + vpath_tuple = split_path_info(vpath) for segment in vpath_tuple: if segment[:2] == view_selector: return {'context':ob, diff --git a/pyramid/url.py b/pyramid/url.py index afb602d3a..e6a508c17 100644 --- a/pyramid/url.py +++ b/pyramid/url.py @@ -67,7 +67,7 @@ class URLMethodsMixin(object): encoded to UTF-8. The resulting strings are joined with slashes and rendered into the URL. If a string is passed as a ``*remainder`` replacement value, it is tacked on to the URL - untouched. + after being URL-quoted-except-for-embedded-slashes. If a keyword argument ``_query`` is present, it will be used to compose a query string that will be tacked on to the end of the diff --git a/pyramid/urldispatch.py b/pyramid/urldispatch.py index c7520b8d2..bd1da8f71 100644 --- a/pyramid/urldispatch.py +++ b/pyramid/urldispatch.py @@ -7,19 +7,21 @@ from pyramid.interfaces import ( ) from pyramid.compat import ( + PY3, native_, - bytes_, + text_, text_type, string_types, + binary_type, is_nonstr_iter, - url_quote, ) from pyramid.exceptions import URLDecodeError from pyramid.traversal import ( - traversal_path_info, quote_path_segment, + decode_path_info, + split_path_info, ) _marker = object() @@ -70,9 +72,11 @@ class RoutesMapper(object): environ = request.environ try: # empty if mounted under a path in mod_wsgi, for example - path = environ['PATH_INFO'] or '/' + path = decode_path_info(environ['PATH_INFO'] or '/') except KeyError: path = '/' + except UnicodeDecodeError as e: + raise URLDecodeError(e.encoding, e.object, e.start, e.end, e.reason) for route in self.routelist: match = route.match(path) @@ -100,80 +104,128 @@ def update_pattern(matchobj): return '{%s}' % name[1:] def _compile_route(route): + # This function really wants to consume Unicode patterns natively, but if + # someone passes us a bytestring, we allow it by converting it to Unicode + # using the ASCII decoding. 
We decode it using ASCII because we dont + # want to accept bytestrings with high-order characters in them here as + # we have no idea what the encoding represents. + if route.__class__ is not text_type: + try: + route = text_(route, 'ascii') + except UnicodeDecodeError: + raise ValueError( + 'The pattern value passed to add_route must be ' + 'either a Unicode string or a plain string without ' + 'any non-ASCII characters (you provided %r).' % route) + if old_route_re.search(route) and not route_re.search(route): route = old_route_re.sub(update_pattern, route) if not route.startswith('/'): route = '/' + route - star = None + remainder = None if star_at_end.search(route): - route, star = route.rsplit('*', 1) + route, remainder = route.rsplit('*', 1) pat = route_re.split(route) + + # every element in "pat" will be Unicode (regardless of whether the + # route_re regex pattern is itself Unicode or str) pat.reverse() rpat = [] gen = [] prefix = pat.pop() # invar: always at least one element (route='/'+route) - rpat.append(re.escape(prefix)) - gen.append(prefix) + + # We want to generate URL-encoded URLs, so we url-quote the prefix, being + # careful not to quote any embedded slashes. We have to replace '%' with + # '%%' afterwards, as the strings that go into "gen" are used as string + # replacement targets. 
+ gen.append(quote_path_segment(prefix, safe='/').replace('%', '%%')) # native + rpat.append(re.escape(prefix)) # unicode while pat: - name = pat.pop() + name = pat.pop() # unicode name = name[1:-1] if ':' in name: name, reg = name.split(':') else: reg = '[^/]+' - gen.append('%%(%s)s' % name) - name = '(?P<%s>%s)' % (name, reg) + gen.append('%%(%s)s' % native_(name)) # native + name = '(?P<%s>%s)' % (name, reg) # unicode rpat.append(name) - s = pat.pop() + s = pat.pop() # unicode if s: - rpat.append(re.escape(s)) - gen.append(s) + rpat.append(re.escape(s)) # unicode + # We want to generate URL-encoded URLs, so we url-quote this + # literal in the pattern, being careful not to quote the embedded + # slashes. We have to replace '%' with '%%' afterwards, as the + # strings that go into "gen" are used as string replacement + # targets. What is appended to gen is a native string. + gen.append(quote_path_segment(s, safe='/').replace('%', '%%')) - if star: - rpat.append('(?P<%s>.*?)' % star) - gen.append('%%(%s)s' % star) + if remainder: + rpat.append('(?P<%s>.*?)' % remainder) # unicode + gen.append('%%(%s)s' % native_(remainder)) # native - pattern = ''.join(rpat) + '$' + pattern = ''.join(rpat) + '$' # unicode match = re.compile(pattern).match def matcher(path): + # This function really wants to consume Unicode patterns natively, + # but if someone passes us a bytestring, we allow it by converting it + # to Unicode using the ASCII decoding. We decode it using ASCII + # because we dont want to accept bytestrings with high-order + # characters in them here as we have no idea what the encoding + # represents. 
+ if path.__class__ is not text_type: + path = text_(path, 'ascii') m = match(path) if m is None: - return m + return None d = {} for k, v in m.groupdict().items(): - if k == star: - d[k] = traversal_path_info(v) + # k and v will be Unicode 2.6.4 and lower doesnt accept unicode + # kwargs as **kw, so we explicitly cast the keys to native + # strings in case someone wants to pass the result as **kw + nk = native_(k, 'ascii') + if k == remainder: + d[nk] = split_path_info(v) else: - try: - val = bytes_(v).decode('utf-8', 'strict') - d[k] = val - except UnicodeDecodeError as e: - raise URLDecodeError( - e.encoding, e.object, e.start, e.end, e.reason - ) - - + d[nk] = v return d - gen = ''.join(gen) def generator(dict): newdict = {} for k, v in dict.items(): - if v.__class__ is text_type: - v = native_(v, 'utf-8') - if k == star and is_nonstr_iter(v): - v = '/'.join([quote_path_segment(x) for x in v]) - elif k != star: + if PY3: # pragma: no cover + if v.__class__ is binary_type: + # url_quote below needs a native string, not bytes on Py3 + v = v.decode('utf-8') + else: + if v.__class__ is text_type: + # url_quote below needs bytes, not unicode on Py2 + v = v.encode('utf-8') + + if k == remainder: + # a stararg argument + if is_nonstr_iter(v): + v = '/'.join([quote_path_segment(x) for x in v]) # native + else: + if v.__class__ not in string_types: + v = str(v) + v = quote_path_segment(v, safe='/') + else: if v.__class__ not in string_types: v = str(v) - v = url_quote(v, safe='') + # v may be bytes (py2) or native string (py3) + v = quote_path_segment(v) + + # at this point, the value will be a native string newdict[k] = v - return gen % newdict + + result = gen % newdict # native string result + return result return matcher, generator |
