From 5c43d5725f7d3266f56ec58b412f797a82078aa0 Mon Sep 17 00:00:00 2001
From: Chris McDonough <chrism@plope.com>
Date: Wed, 4 Jan 2012 01:16:50 -0500
Subject: untested work

---
 pyramid/compat.py    | 18 +++++++++++
 pyramid/traversal.py | 84 +++++++++++++++++++++++++++++++++-------------------
 2 files changed, 71 insertions(+), 31 deletions(-)
diff --git a/pyramid/compat.py b/pyramid/compat.py
index 12b8c9f37..7376278ac 100644
--- a/pyramid/compat.py
+++ b/pyramid/compat.py
@@ -213,3 +213,21 @@ except ImportError: # pragma: no cover
 import json
 
     
+if PY3: # pragma: no cover
+    # see PEP 3333 for why we encode WSGI PATH_INFO to latin-1 before
+    # decoding it to utf-8
+    def decode_path_info(path):
+        return path.encode('latin-1').decode('utf-8')
+else:
+    def decode_path_info(path):
+        return path.decode('utf-8')
+
+if PY3: # pragma: no cover
+    # see PEP 3333 for why we decode the unquoted bytes as latin-1
+    from urllib.parse import unquote_to_bytes
+    def unquote_bytes_to_wsgi(bytestring):
+        return unquote_to_bytes(bytestring).decode('latin-1')
+else:
+    from urlparse import unquote as unquote_to_bytes
+    def unquote_bytes_to_wsgi(bytestring):
+        return unquote_to_bytes(bytestring)
diff --git a/pyramid/traversal.py b/pyramid/traversal.py
index cd624fd30..3489dade0 100644
--- a/pyramid/traversal.py
+++ b/pyramid/traversal.py
@@ -16,12 +16,12 @@ from pyramid.compat import (
     PY3,
     native_,
     text_,
-    bytes_,
     ascii_native_,
     text_type,
     binary_type,
-    url_unquote_native,
     is_nonstr_iter,
+    decode_path_info,
+    unquote_bytes_to_wsgi,
     )
 
 from pyramid.encode import url_quote
@@ -429,33 +429,44 @@ def virtual_root(resource, request):
 
 def traversal_path(path):
     """ Variant of :func:`pyramid.traversal.traversal_path_info` suitable for
-    decoding paths that are URL-encoded."""
-    path = ascii_native_(path)
-    path = url_unquote_native(path, 'latin-1', 'strict')
+    decoding paths that are URL-encoded.
+
+    If this function is passed a Unicode object instead of a sequence of
+    bytes as ``path``, that Unicode object *must* be directly encodable to
+    ASCII.  For example, u'/foo' will work but u'/<unprintable unicode>' (a
+    Unicode object with characters that cannot be encoded to ascii) will
+    not. A :exc:`UnicodeEncodeError` will be raised if the Unicode cannot be
+    encoded directly to ASCII.
+    """
+    # we unquote this path exactly like a PEP 3333 server would
+    if isinstance(path, text_type):
+        path = path.encode('ascii')
+    path = unquote_bytes_to_wsgi(path) # result will be a native string
     return traversal_path_info(path)
 
-@lru_cache(1000)
 def traversal_path_info(path):
-    """ Given a ``PATH_INFO`` environ value (slash-separated path segments),
-    return a tuple representing that path which can be used to traverse a
-    resource tree.
-
-    ``PATH_INFO`` is assumed to already be URL-decoded.  It is encoded to
-    bytes using the Latin-1 encoding; the resulting set of bytes is
-    subsequently decoded to text using the UTF-8 encoding; a
-    :exc:`pyramid.exc.URLDecodeError` is raised if a the URL cannot be
-    decoded.
-
-    The ``PATH_INFO`` is split on slashes, creating a list of segments.  Each
-    segment subsequently decoded into Unicode.  If a segment name is empty or
-    if it is ``.``, it is ignored.  If a segment name is ``..``, the previous
-    segment is deleted, and the ``..`` is ignored.
-
-    If this function is passed a Unicode object instead of a string, that
-    Unicode object *must* directly encodeable to ASCII.  For example, u'/foo'
-    will work but u'/<unprintable unicode>' (a Unicode object with characters
-    that cannot be encoded to ascii) will not. A :exc:`UnicodeError` will be
-    raised if the Unicode cannot be encoded directly to ASCII.
+    """ Given ``path``, return a tuple representing that path which can be
+    used to traverse a resource tree.  ``path`` is assumed to be an
+    already-URL-decoded ``str`` type as if it had come to us from an upstream
+    WSGI server as the ``PATH_INFO`` environment variable.
+
+    The ``path`` is first decoded from its WSGI representation to Unicode;
+    it is decoded differently depending on platform:
+
+    - On Python 2, ``path`` is decoded to Unicode from bytes using the UTF-8
+      encoding directly; a :exc:`pyramid.exc.URLDecodeError` is raised if the
+      URL cannot be decoded.
+
+    - On Python 3, as per the WSGI spec, ``path`` is first encoded to bytes
+      using the Latin-1 encoding; the resulting set of bytes is subsequently
+      decoded to text using the UTF-8 encoding; a
+      :exc:`pyramid.exc.URLDecodeError` is raised if the URL cannot be
+      decoded.
+
+    The ``path`` is split on slashes, creating a list of segments.  If a
+    segment name is empty or if it is ``.``, it is ignored.  If a segment
+    name is ``..``, the previous segment is deleted, and the ``..`` is
+    ignored.
 
     Examples:
 
@@ -504,9 +515,15 @@ def traversal_path_info(path):
       applications in :app:`Pyramid`.
     """
     try:
-        path = bytes_(path, 'latin-1').decode('utf-8')
+        path = decode_path_info(path)
     except UnicodeDecodeError as e:
         raise URLDecodeError(e.encoding, e.object, e.start, e.end, e.reason)
+    return split_path_info(path)
+
+@lru_cache(1000)
+def split_path_info(path):
+    # suitable for splitting an already-unquoted-already-decoded path_info
+    # string
     path = path.strip('/')
     clean = []
     for segment in path.split('/'):
@@ -622,23 +639,28 @@ class ResourceTreeTraverser(object):
             path = matchdict.get('traverse', '/') or '/'
             if is_nonstr_iter(path):
                 # this is a *traverse stararg (not a {traverse})
-                path = '/'.join([quote_path_segment(x) for x in path]) or '/'
+                # routing has already decoded these elements, so we just
+                # need to join them
+                path = '/'.join(path) or '/'
 
             subpath = matchdict.get('subpath', ())
             if not is_nonstr_iter(subpath):
                 # this is not a *subpath stararg (just a {subpath})
-                subpath = traversal_path_info(subpath)
+                # routing has already decoded this string, so we just need
+                # to split it
+                subpath = split_path_info(subpath)
 
         else:
             # this request did not match a route
             subpath = ()
             try:
-                path = environ['PATH_INFO'] or '/'
+                path = decode_path_info(environ['PATH_INFO'] or '/')
             except KeyError:
                 path = '/'
 
         if VH_ROOT_KEY in environ:
-            vroot_path = environ[VH_ROOT_KEY]
+            # HTTP_X_VHM_ROOT
+            vroot_path = decode_path_info(environ[VH_ROOT_KEY]) 
             vroot_tuple = traversal_path_info(vroot_path)
             vpath = vroot_path + path
             vroot_idx = len(vroot_tuple) -1
-- 
cgit v1.2.3