From 5c43d5725f7d3266f56ec58b412f797a82078aa0 Mon Sep 17 00:00:00 2001 From: Chris McDonough Date: Wed, 4 Jan 2012 01:16:50 -0500 Subject: untested work --- pyramid/compat.py | 18 +++++++++++ pyramid/traversal.py | 84 +++++++++++++++++++++++++++++++++------------------- 2 files changed, 71 insertions(+), 31 deletions(-) diff --git a/pyramid/compat.py b/pyramid/compat.py index 12b8c9f37..7376278ac 100644 --- a/pyramid/compat.py +++ b/pyramid/compat.py @@ -213,3 +213,21 @@ except ImportError: # pragma: no cover import json +if PY3: # pragma: no cover + # see PEP 3333 for why we encode WSGI PATH_INFO to latin-1 before + # decoding it to utf-8 + def decode_path_info(path): + return path.encode('latin-1').decode('utf-8') +else: + def decode_path_info(path): + return path.decode('utf-8') + +if PY3: # pragma: no cover + # see PEP 3333 for why we decode the path to latin-1 + from urllib.parse import unquote_to_bytes + def unquote_bytes_to_wsgi(bytestring): + return unquote_to_bytes(bytestring).decode('latin-1') +else: + from urlparse import unquote as unquote_to_bytes + def unquote_bytes_to_wsgi(bytestring): + return unquote_to_bytes(bytestring) diff --git a/pyramid/traversal.py b/pyramid/traversal.py index cd624fd30..3489dade0 100644 --- a/pyramid/traversal.py +++ b/pyramid/traversal.py @@ -16,12 +16,12 @@ from pyramid.compat import ( PY3, native_, text_, - bytes_, ascii_native_, text_type, binary_type, - url_unquote_native, is_nonstr_iter, + decode_path_info, + unquote_bytes_to_wsgi, ) from pyramid.encode import url_quote @@ -429,33 +429,44 @@ def virtual_root(resource, request): def traversal_path(path): """ Variant of :func:`pyramid.traversal.traversal_path_info` suitable for - decoding paths that are URL-encoded.""" - path = ascii_native_(path) - path = url_unquote_native(path, 'latin-1', 'strict') + decoding paths that are URL-encoded. 
+ + If this function is passed a Unicode object instead of a sequence of + bytes as ``path``, that Unicode object *must* be directly encodable to + ASCII. For example, u'/foo' will work but u'/<unprintable unicode>' (a + Unicode object with characters that cannot be encoded to ascii) will + not. A :exc:`UnicodeEncodeError` will be raised if the Unicode cannot be + encoded directly to ASCII. + """ + # we unquote this path exactly like a PEP 3333 server would + if isinstance(path, text_type): + path = path.encode('ascii') + path = unquote_bytes_to_wsgi(path) # result will be a native string return traversal_path_info(path) -@lru_cache(1000) def traversal_path_info(path): - """ Given a ``PATH_INFO`` environ value (slash-separated path segments), - return a tuple representing that path which can be used to traverse a - resource tree. - - ``PATH_INFO`` is assumed to already be URL-decoded. It is encoded to - bytes using the Latin-1 encoding; the resulting set of bytes is - subsequently decoded to text using the UTF-8 encoding; a - :exc:`pyramid.exc.URLDecodeError` is raised if a the URL cannot be - decoded. - - The ``PATH_INFO`` is split on slashes, creating a list of segments. Each - segment subsequently decoded into Unicode. If a segment name is empty or - if it is ``.``, it is ignored. If a segment name is ``..``, the previous - segment is deleted, and the ``..`` is ignored. - - If this function is passed a Unicode object instead of a string, that - Unicode object *must* directly encodeable to ASCII. For example, u'/foo' - will work but u'/<unprintable unicode>' (a Unicode object with characters - that cannot be encoded to ascii) will not. A :exc:`UnicodeError` will be - raised if the Unicode cannot be encoded directly to ASCII. + """ Given ``path``, return a tuple representing that path which can be + used to traverse a resource tree. ``path`` is assumed to be an + already-URL-decoded ``str`` type as if it had come to us from an upstream + WSGI server as the ``PATH_INFO`` environment variable. 
+ + The ``path`` is first decoded from its WSGI representation to Unicode; + it is decoded differently depending on platform: + + - On Python 2, ``path`` is decoded to Unicode from bytes using the UTF-8 + decoding directly; a :exc:`pyramid.exc.URLDecodeError` is raised if the + URL cannot be decoded. + + - On Python 3, as per the WSGI spec, ``path`` is first encoded to bytes + using the Latin-1 encoding; the resulting set of bytes is subsequently + decoded to text using the UTF-8 encoding; a + :exc:`pyramid.exc.URLDecodeError` is raised if the URL cannot be + decoded. + + The ``path`` is split on slashes, creating a list of segments. If a + segment name is empty or if it is ``.``, it is ignored. If a segment + name is ``..``, the previous segment is deleted, and the ``..`` is + ignored. Examples: @@ -504,9 +515,15 @@ def traversal_path_info(path): applications in :app:`Pyramid`. """ try: - path = bytes_(path, 'latin-1').decode('utf-8') + path = decode_path_info(path) except UnicodeDecodeError as e: raise URLDecodeError(e.encoding, e.object, e.start, e.end, e.reason) + return split_path_info(path) + +@lru_cache(1000) +def split_path_info(path): + # suitable for splitting an already-unquoted-already-decoded path_info + # string path = path.strip('/') clean = [] for segment in path.split('/'): @@ -622,23 +639,28 @@ class ResourceTreeTraverser(object): path = matchdict.get('traverse', '/') or '/' if is_nonstr_iter(path): # this is a *traverse stararg (not a {traverse}) - path = '/'.join([quote_path_segment(x) for x in path]) or '/' + # routing has already decoded these elements, so we just + # need to join them + path = '/'.join(path) or '/' subpath = matchdict.get('subpath', ()) if not is_nonstr_iter(subpath): # this is not a *subpath stararg (just a {subpath}) - subpath = traversal_path_info(subpath) + # routing has already decoded this string, so we just need + # to split it + subpath = split_path_info(subpath) else: # this request did not match a route 
subpath = () try: - path = environ['PATH_INFO'] or '/' + path = decode_path_info(environ['PATH_INFO'] or '/') except KeyError: path = '/' if VH_ROOT_KEY in environ: - vroot_path = environ[VH_ROOT_KEY] + # HTTP_X_VHM_ROOT + vroot_path = decode_path_info(environ[VH_ROOT_KEY]) vroot_tuple = traversal_path_info(vroot_path) vpath = vroot_path + path vroot_idx = len(vroot_tuple) -1 -- cgit v1.2.3