fix urldispatch matching and generation to cope with various inputs

author: Chris McDonough <chrism@plope.com> 2012-01-05 02:41:32 -0500
committer: Chris McDonough <chrism@plope.com> 2012-01-05 02:41:32 -0500
commit: a511b1423334f855e996bb06714b36aa86f861e9 (patch)
tree: 62e4bcabdb465b28545110e0ac02b5b3ec55a364
parent: f2ef9a3026723cabbfaeffd128d7b7a874f74002 (diff)
download: pyramid-a511b1423334f855e996bb06714b36aa86f861e9.tar.gz
pyramid-a511b1423334f855e996bb06714b36aa86f861e9.tar.bz2
pyramid-a511b1423334f855e996bb06714b36aa86f861e9.zip
3 files changed, 200 insertions, 45 deletions
diff --git a/docs/narr/urldispatch.rst b/docs/narr/urldispatch.rst
index 35613ea1b..7e485f8ae 100644
--- a/docs/narr/urldispatch.rst
+++ b/docs/narr/urldispatch.rst
@@ -235,7 +235,7 @@ When matching the following URL:
 
 .. code-block:: text
 
-   foo/La%20Pe%C3%B1a
+   http://example.com/foo/La%20Pe%C3%B1a
 
 The matchdict will look like so (the value is URL-decoded / UTF-8 decoded):
 
@@ -243,6 +243,50 @@ The matchdict will look like so (the value is URL-decoded / UTF-8 decoded):
 
    {'bar':u'La Pe\xf1a'}
 
+Literal strings in the path segment should represent the *decoded* value of
+the ``PATH_INFO`` provided to Pyramid.  You don't want to use a URL-encoded
+value or a bytestring representing the literal's UTF-8 in the pattern.  For
+example, rather than this:
+
+.. code-block:: text
+
+   /Foo%20Bar/{baz}
+
+You'll want to use something like this:
+
+.. code-block:: text
+
+   /Foo Bar/{baz}
+
+For patterns that contain "high-order" characters in their literals, you'll
+want to use a Unicode value as the pattern as opposed to any URL-encoded or
+UTF-8-encoded value.  For example, you might be tempted to use a bytestring
+pattern like this:
+
+.. code-block:: text
+
+   /La Pe\xc3\xb1a/{x}
+
+But that probably won't match as you expect it to.  You'll want to use a
+Unicode value as the pattern rather than raw bytestring escapes.  You
+can use a high-order Unicode value as the pattern by using `Python source
+file encoding <http://www.python.org/dev/peps/pep-0263/>`_ plus the "real"
+character in the Unicode pattern in the source, like so:
+
+.. code-block:: text
+
+   /La Peña/{x}
+
+Or you can ignore source file encoding and use equivalent Unicode escape
+characters in the pattern.
+
+.. code-block:: text
+
+   /La Pe\xf1a/{x}
+
+Dynamic segment names cannot contain high-order characters, so this applies
+only to literals in the pattern.
+
 If the pattern has a ``*`` in it, the name which follows it is considered a
 "remainder match".  A remainder match *must* come at the end of the pattern.
 Unlike segment replacement markers, it does not need to be preceded by a
@@ -612,7 +656,6 @@ Use the :meth:`pyramid.request.Request.route_url` method to generate URLs
 based on route patterns.  For example, if you've configured a route with the
 ``name`` "foo" and the ``pattern`` "{a}/{b}/{c}", you might do this.
 
-.. ignore-next-block
 .. code-block:: python
    :linenos:
 
@@ -620,7 +663,45 @@ based on route patterns.  For example, if you've configured a route with the
 
 This would return something like the string ``http://example.com/1/2/3`` (at
 least if the current protocol and hostname implied ``http://example.com``).
-See the :meth:`~pyramid.request.Request.route_url` API documentation for more
+
+To get only the *path* of a route, use the
+:meth:`pyramid.request.Request.route_path` API instead of
+:meth:`~pyramid.request.Request.route_url`.
+
+.. code-block:: python
+
+   url = request.route_path('foo', a='1', b='2', c='3')
+
+This will return the string ``/1/2/3`` rather than a full URL.
+
+Note that URLs and paths generated by ``route_path`` and ``route_url`` are
+always URL-quoted string types (which contain no non-ASCII characters).
+Therefore, if you've added a route like so:
+
+.. code-block:: python
+
+   config.add_route('la', u'/La Peña/{city}')
+
+And you later generate a URL using ``route_path`` or ``route_url`` like so:
+
+.. code-block:: python
+
+   url = request.route_path('la', city=u'Québec')
+
+You will wind up with the path encoded to UTF-8 and URL quoted like so:
+
+.. code-block:: text
+
+   /La%20Pe%C3%B1a/Qu%C3%A9bec
+
+.. note::
+
+   Generating URL-quoted URLs and paths is new as of Pyramid 1.3 (and Pyramid
+   1.2 after 1.2.6).  Previous versions generated unquoted URLs and paths
+   (which was broken).
+
+See the :meth:`~pyramid.request.Request.route_url` and
+:meth:`~pyramid.request.Request.route_path` API documentation for more
 information.
 
 .. index::
diff --git a/pyramid/tests/test_urldispatch.py b/pyramid/tests/test_urldispatch.py
index be823b045..856bdcb78 100644
--- a/pyramid/tests/test_urldispatch.py
+++ b/pyramid/tests/test_urldispatch.py
@@ -292,12 +292,6 @@ class TestCompileRoute(unittest.TestCase):
         self.assertEqual(matcher('foo/baz/biz/buz/bar'), None)
         self.assertEqual(generator({'baz':1, 'buz':2}), '/foo/1/biz/2/bar')
 
-    def test_url_decode_error(self):
-        from pyramid.exceptions import URLDecodeError
-        matcher, generator = self._callFUT('/:foo')
-        self.assertRaises(URLDecodeError, matcher,
-                          native_(b'/\xff\xfe\x8b\x00'))
-    
     def test_custom_regex(self):
         matcher, generator = self._callFUT('foo/{baz}/biz/{buz:[^/\.]+}.{bar}')
         self.assertEqual(matcher('/foo/baz/biz/buz.bar'),
@@ -328,7 +322,8 @@ class TestCompileRoute(unittest.TestCase):
         self.assertEqual(generator({'buz':2001}), '/2001')
 
     def test_custom_regex_with_embedded_squigglies3(self):
-        matcher, generator = self._callFUT('/{buz:(\d{2}|\d{4})-[a-zA-Z]{3,4}-\d{2}}')
+        matcher, generator = self._callFUT(
+            '/{buz:(\d{2}|\d{4})-[a-zA-Z]{3,4}-\d{2}}')
         self.assertEqual(matcher('/2001-Nov-15'), {'buz':'2001-Nov-15'})
         self.assertEqual(matcher('/99-June-10'), {'buz':'99-June-10'})
         self.assertEqual(matcher('/2-Nov-15'), None)
@@ -337,6 +332,39 @@ class TestCompileRoute(unittest.TestCase):
         self.assertEqual(generator({'buz':'2001-Nov-15'}), '/2001-Nov-15')
         self.assertEqual(generator({'buz':'99-June-10'}), '/99-June-10')
 
+    def test_pattern_with_high_order_literal(self):
+        pattern = text_(b'/La Pe\xc3\xb1a/{x}', 'utf-8')
+        matcher, generator = self._callFUT(pattern)
+        self.assertEqual(matcher(text_(b'/La Pe\xc3\xb1a/x', 'utf-8')),
+                         {'x':'x'})
+        self.assertEqual(generator({'x':'1'}), '/La%20Pe%C3%B1a/1')
+
+    def test_pattern_generate_with_high_order_dynamic(self):
+        pattern = '/{x}'
+        _, generator = self._callFUT(pattern)
+        self.assertEqual(
+            generator({'x':text_(b'La Pe\xc3\xb1a', 'utf-8')}),
+            '/La%20Pe%C3%B1a')
+
+    def test_docs_sample_generate(self):
+        # sample from urldispatch.rst
+        pattern = text_(b'/La Pe\xc3\xb1a/{city}', 'utf-8')
+        _, generator = self._callFUT(pattern)
+        self.assertEqual(
+            generator({'city':text_(b'Qu\xc3\xa9bec', 'utf-8')}),
+            '/La%20Pe%C3%B1a/Qu%C3%A9bec')
+
+    def test_generate_with_mixedtype_values(self):
+        pattern = '/{city}/{state}'
+        _, generator = self._callFUT(pattern)
+        result = generator(
+            {'city': text_(b'Qu\xc3\xa9bec', 'utf-8'),
+             'state': b'La Pe\xc3\xb1a'}
+            )
+        self.assertEqual(result, '/Qu%C3%A9bec/La%20Pe%C3%B1a')
+        # should be a native string
+        self.assertEqual(type(result), str)
+
 class TestCompileRouteFunctional(unittest.TestCase):
     def matches(self, pattern, path, expected):
         from pyramid.urldispatch import _compile_route
@@ -368,11 +396,11 @@ class TestCompileRouteFunctional(unittest.TestCase):
         self.matches('*traverse', '/zzz/abc', {'traverse':('zzz', 'abc')})
         self.matches('*traverse', '/zzz/ abc', {'traverse':('zzz', ' abc')})
         #'/La%20Pe%C3%B1a'
-        self.matches('{x}', native_(b'/La Pe\xc3\xb1a'),
-                     {'x':text_(b'La Pe\xf1a')})
+        self.matches('{x}', text_(b'/La Pe\xc3\xb1a', 'utf-8'),
+                     {'x':text_(b'La Pe\xc3\xb1a', 'utf-8')})
         # '/La%20Pe%C3%B1a/x'
-        self.matches('*traverse', native_(b'/La Pe\xc3\xb1a/x'),
-                     {'traverse':(text_(b'La Pe\xf1a'), 'x')})
+        self.matches('*traverse', text_(b'/La Pe\xc3\xb1a/x'),
+                     {'traverse':(text_(b'La Pe\xc3\xb1a'), 'x')})
         self.matches('/foo/{id}.html', '/foo/bar.html', {'id':'bar'})
         self.matches('/{num:[0-9]+}/*traverse', '/555/abc/def',
                      {'num':'555', 'traverse':('abc', 'def')})
@@ -394,11 +422,12 @@ class TestCompileRouteFunctional(unittest.TestCase):
         self.matches('*traverse', '/zzz/abc', {'traverse':('zzz', 'abc')})
         self.matches('*traverse', '/zzz/ abc', {'traverse':('zzz', ' abc')})
         #'/La%20Pe%C3%B1a'
-        self.matches(':x', native_(b'/La Pe\xc3\xb1a'),
-                     {'x':text_(b'La Pe\xf1a')})
+        # pattern, path, expected
+        self.matches(':x', text_(b'/La Pe\xc3\xb1a', 'utf-8'),
+                     {'x':text_(b'La Pe\xc3\xb1a', 'utf-8')})
         # '/La%20Pe%C3%B1a/x'
-        self.matches('*traverse', native_(b'/La Pe\xc3\xb1a/x'),
-                     {'traverse':(text_(b'La Pe\xf1a'), 'x')})
+        self.matches('*traverse', text_(b'/La Pe\xc3\xb1a/x', 'utf-8'),
+                     {'traverse':(text_(b'La Pe\xc3\xb1a', 'utf-8'), 'x')})
         self.matches('/foo/:id.html', '/foo/bar.html', {'id':'bar'})
         self.matches('/foo/:id_html', '/foo/bar_html', {'id_html':'bar_html'})
         self.matches('zzz/:_', '/zzz/abc', {'_':'abc'})
diff --git a/pyramid/urldispatch.py b/pyramid/urldispatch.py
index 73875b675..cb0e57c4d 100644
--- a/pyramid/urldispatch.py
+++ b/pyramid/urldispatch.py
@@ -7,10 +7,12 @@ from pyramid.interfaces import (
     )
 
 from pyramid.compat import (
+    PY3,
     native_,
-    bytes_,
+    text_,
     text_type,
     string_types,
+    binary_type,
     is_nonstr_iter,
     url_quote,
     )
@@ -103,72 +105,115 @@ def update_pattern(matchobj):
     return '{%s}' % name[1:]
 
 def _compile_route(route):
+    # This function really wants to consume Unicode patterns natively, but if
+    # someone passes us a bytestring, we allow it by converting it to Unicode
+    # using the ASCII decoding.  We decode it using ASCII because we don't
+    # want to accept bytestrings with high-order characters in them here as
+    # we have no idea what the encoding represents.
+    if route.__class__ is not text_type:
+        route = text_(route, 'ascii') 
+
     if old_route_re.search(route) and not route_re.search(route):
         route = old_route_re.sub(update_pattern, route)
 
     if not route.startswith('/'):
         route = '/' + route
 
-    star = None
+    remainder = None
     if star_at_end.search(route):
-        route, star = route.rsplit('*', 1)
+        route, remainder = route.rsplit('*', 1)
 
     pat = route_re.split(route)
+
+    # every element in "pat" will be Unicode (regardless of whether the
+    # route_re regex pattern is itself Unicode or str)
     pat.reverse()
     rpat = []
     gen = []
     prefix = pat.pop() # invar: always at least one element (route='/'+route)
-    rpat.append(re.escape(prefix))
-    gen.append(prefix)
+
+    # We want to generate URL-encoded URLs, so we url-quote the prefix, being
+    # careful not to quote any embedded slashes.  We have to replace '%' with
+    # '%%' afterwards, as the strings that go into "gen" are used as string
+    # replacement targets.
+    gen.append(quote_path_segment(prefix, safe='/').replace('%', '%%')) # native
+    rpat.append(re.escape(prefix)) # unicode
 
     while pat:
-        name = pat.pop()
+        name = pat.pop() # unicode
         name = name[1:-1]
         if ':' in name:
             name, reg = name.split(':')
         else:
             reg = '[^/]+'
-        gen.append('%%(%s)s' % name)
-        name = '(?P<%s>%s)' % (name, reg)
+        gen.append('%%(%s)s' % native_(name)) # native
+        name = '(?P<%s>%s)' % (name, reg) # unicode
         rpat.append(name)
-        s = pat.pop()
+        s = pat.pop() # unicode
         if s:
-            rpat.append(re.escape(s))
-            gen.append(s)
+            rpat.append(re.escape(s)) # unicode
+            # We want to generate URL-encoded URLs, so we url-quote this
+            # literal in the pattern, being careful not to quote the embedded
+            # slashes.  We have to replace '%' with '%%' afterwards, as the
+            # strings that go into "gen" are used as string replacement
+            # targets.  What is appended to gen is a native string.
+            gen.append(quote_path_segment(s, safe='/').replace('%', '%%'))
 
-    if star:
-        rpat.append('(?P<%s>.*?)' % star)
-        gen.append('%%(%s)s' % star)
+    if remainder:
+        rpat.append('(?P<%s>.*?)' % remainder) # unicode
+        gen.append('%%(%s)s' % native_(remainder)) # native
 
-    pattern = ''.join(rpat) + '$'
+    pattern = ''.join(rpat) + '$' # unicode
 
     match = re.compile(pattern).match
     def matcher(path):
+        # This function really wants to consume Unicode paths natively,
+        # but if someone passes us a bytestring, we allow it by converting it
+        # to Unicode using the ASCII decoding.  We decode it using ASCII
+        # because we don't want to accept bytestrings with high-order
+        # characters in them here as we have no idea what the encoding
+        # represents.
+        if path.__class__ is not text_type:
+            path = text_(path, 'ascii')
         m = match(path)
         if m is None:
-            return m
+            return None
         d = {}
         for k, v in m.groupdict().items():
-            if k == star:
-                d[k] = split_path_info(v)
+            # k and v will be Unicode.  Python 2.6.4 and lower doesn't accept
+            # unicode kwargs as **kw, so we explicitly cast the keys to native
+            # strings in case someone wants to pass the result as **kw
+            nk = native_(k, 'ascii')
+            if k == remainder:
+                d[nk] = split_path_info(v)
             else:
-                d[k] = v
+                d[nk] = v
         return d
-                    
 
     gen = ''.join(gen)
     def generator(dict):
         newdict = {}
         for k, v in dict.items():
-            if v.__class__ is text_type:
-                v = native_(v, 'utf-8')
-            if k == star and is_nonstr_iter(v):
-                v = '/'.join([quote_path_segment(x) for x in v])
-            elif k != star:
+            if PY3:
+                if v.__class__ is binary_type:
+                    # url_quote below needs a native string, not bytes on Py3
+                    v = v.decode('utf-8')
+            else:
+                if v.__class__ is text_type:
+                    # url_quote below needs bytes, not unicode on Py2
+                    v = v.encode('utf-8')
+            if k == remainder and is_nonstr_iter(v):
+                v = '/'.join([quote_path_segment(x) for x in v]) # native
+            elif k != remainder:
                 if v.__class__ not in string_types:
                     v = str(v)
-                v = url_quote(v, safe='')
+                # v may be bytes (py2) or native string (py3)
+                v = url_quote(v, safe='') # defaults to utf8 encoding on py3
+
+            # at this point, the value will be a native string
             newdict[k] = v
-        return gen % newdict
+
+        result = gen % newdict # native string result
+        return result
 
     return matcher, generator
author	Chris McDonough <chrism@plope.com>	2012-01-05 02:41:32 -0500
committer	Chris McDonough <chrism@plope.com>	2012-01-05 02:41:32 -0500
commit	a511b1423334f855e996bb06714b36aa86f861e9 (patch)
tree	62e4bcabdb465b28545110e0ac02b5b3ec55a364
parent	f2ef9a3026723cabbfaeffd128d7b7a874f74002 (diff)
download	pyramid-a511b1423334f855e996bb06714b36aa86f861e9.tar.gz pyramid-a511b1423334f855e996bb06714b36aa86f861e9.tar.bz2 pyramid-a511b1423334f855e996bb06714b36aa86f861e9.zip