summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChris McDonough <chrism@agendaless.com>2010-09-24 22:10:00 +0000
committerChris McDonough <chrism@agendaless.com>2010-09-24 22:10:00 +0000
commit841313c5ff30e9c7322b230051d1a099d3369461 (patch)
treef05de38fea3a6e954cbd77f543307092ff731796
parentf0fc3e5c1dee40d4262a85204c356516d1d2ea5a (diff)
downloadpyramid-841313c5ff30e9c7322b230051d1a099d3369461.tar.gz
pyramid-841313c5ff30e9c7322b230051d1a099d3369461.tar.bz2
pyramid-841313c5ff30e9c7322b230051d1a099d3369461.zip
- The ``repoze.bfg.traversal.traversal_path`` API now eagerly attempts
to encode a Unicode ``path`` into ASCII before attempting to split it and decode its segments. This is for convenience, effectively to allow a (stored-as-Unicode-in-a-database, or retrieved-as-Unicode-from-a-request-parameter) Unicode path to be passed to ``find_model``, which eventually internally uses the ``traversal_path`` function under the hood. In version 1.2 and prior, if the ``path`` was Unicode, that Unicode was split on slashes and each resulting segment value was Unicode. An inappropriate call to the ``decode()`` method of a resulting Unicode path segment could cause a ``UnicodeDecodeError`` to occur even if the Unicode representation of the path contained no 'high order' characters (it effectively did a "double decode"). By converting the Unicode path argument to ASCII before we attempt to decode and split, genuine errors will occur in a more obvious place while also allowing us to handle (for convenience) the case that it's a Unicode representation formed entirely from ASCII-compatible characters.
-rw-r--r--CHANGES.txt24
-rw-r--r--repoze/bfg/tests/test_traversal.py12
-rw-r--r--repoze/bfg/traversal.py27
3 files changed, 60 insertions, 3 deletions
diff --git a/CHANGES.txt b/CHANGES.txt
index 9550a88fd..8d50d7061 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,27 @@
+Next release
+------------
+
+Features
+--------
+
+- The ``repoze.bfg.traversal.traversal_path`` API now eagerly attempts
+ to encode a Unicode ``path`` into ASCII before attempting to split
+ it and decode its segments. This is for convenience, effectively to
+ allow a (stored-as-Unicode-in-a-database, or
+ retrieved-as-Unicode-from-a-request-parameter) Unicode path to be
+ passed to ``find_model``, which eventually internally uses the
+ ``traversal_path`` function under the hood. In version 1.2 and
+ prior, if the ``path`` was Unicode, that Unicode was split on
+ slashes and each resulting segment value was Unicode. An
+ inappropriate call to the ``decode()`` method of a resulting Unicode
+ path segment could cause a ``UnicodeDecodeError`` to occur even if
+ the Unicode representation of the path contained no 'high order'
+ characters (it effectively did a "double decode"). By converting
+ the Unicode path argument to ASCII before we attempt to decode and
+ split, genuine errors will occur in a more obvious place while also
+ allowing us to handle (for convenience) the case that it's a Unicode
+ representation formed entirely from ASCII-compatible characters.
+
1.3a14 (2010-09-14)
===================
diff --git a/repoze/bfg/tests/test_traversal.py b/repoze/bfg/tests/test_traversal.py
index a9727902e..1cae0a05f 100644
--- a/repoze/bfg/tests/test_traversal.py
+++ b/repoze/bfg/tests/test_traversal.py
@@ -50,6 +50,18 @@ class TraversalPathTests(unittest.TestCase):
path = '/'.join([encoded, encoded])
self.assertRaises(URLDecodeError, self._callFUT, path)
+ def test_unicode_highorder_chars(self):
+ path = u'/%E6%B5%81%E8%A1%8C%E8%B6%8B%E5%8A%BF'
+ self.assertEqual(self._callFUT(path), (u'\u6d41\u884c\u8d8b\u52bf',))
+
+ def test_unicode_simple(self):
+ path = u'/abc'
+ self.assertEqual(self._callFUT(path), (u'abc',))
+
+ def test_unicode_undecodeable_to_ascii(self):
+ path = unicode('/La Pe\xc3\xb1a', 'utf-8')
+ self.assertRaises(UnicodeEncodeError, self._callFUT, path)
+
class ModelGraphTraverserTests(unittest.TestCase):
def setUp(self):
cleanUp()
diff --git a/repoze/bfg/traversal.py b/repoze/bfg/traversal.py
index 6bd54cd31..84365e2ff 100644
--- a/repoze/bfg/traversal.py
+++ b/repoze/bfg/traversal.py
@@ -371,14 +371,33 @@ def virtual_root(model, request):
def traversal_path(path):
""" Given a ``PATH_INFO`` string (slash-separated path segments),
return a tuple representing that path which can be used to
- traverse a graph. The ``PATH_INFO`` is split on slashes, creating
- a list of segments. Each segment is URL-unquoted, and decoded
+ traverse a graph.
+
+ The ``PATH_INFO`` is split on slashes, creating a list of
+ segments. Each segment is URL-unquoted, and subsequently decoded
into Unicode. Each segment is assumed to be encoded using the
UTF-8 encoding (or a subset, such as ASCII); a
:exc:`repoze.bfg.exceptions.URLDecodeError` is raised if a segment
cannot be decoded. If a segment name is empty or if it is ``.``,
it is ignored. If a segment name is ``..``, the previous segment
- is deleted, and the ``..`` is ignored. Examples:
+ is deleted, and the ``..`` is ignored.
+
+ If this function is passed a Unicode object instead of a string,
+ that Unicode object *must* be directly encodable to ASCII. For
+ example, u'/foo' will work but u'/<unprintable unicode>' (a
+ Unicode object with characters that cannot be encoded to ASCII)
+ will not.
+
+ .. note:: New in version 1.3, this API eagerly attempts to encode a
+ Unicode ``path`` into ASCII before attempting to split it and
+ decode its segments. This is for convenience. In version 1.2
+ and prior, if the path was Unicode, an inappropriate call to
+ the ``decode()`` method of a Unicode path segment could cause a
+ ``UnicodeDecodeError`` to occur even if the Unicode
+ representation of the path contained no 'high order'
+ characters.
+
+ Examples:
``/``
@@ -425,6 +444,8 @@ def traversal_path(path):
their own traversal machinery, as opposed to users
writing applications in :mod:`repoze.bfg`.
"""
+ if isinstance(path, unicode):
+ path = path.encode('ascii')
path = path.strip('/')
clean = []
for segment in path.split('/'):