diff options
| author | Chris McDonough <chrism@agendaless.com> | 2010-09-24 22:10:00 +0000 |
|---|---|---|
| committer | Chris McDonough <chrism@agendaless.com> | 2010-09-24 22:10:00 +0000 |
| commit | 841313c5ff30e9c7322b230051d1a099d3369461 (patch) | |
| tree | f05de38fea3a6e954cbd77f543307092ff731796 | |
| parent | f0fc3e5c1dee40d4262a85204c356516d1d2ea5a (diff) | |
| download | pyramid-841313c5ff30e9c7322b230051d1a099d3369461.tar.gz pyramid-841313c5ff30e9c7322b230051d1a099d3369461.tar.bz2 pyramid-841313c5ff30e9c7322b230051d1a099d3369461.zip | |
- The ``repoze.bfg.traversal.traversal_path`` API now eagerly attempts
to encode a Unicode ``path`` into ASCII before attempting to split
it and decode its segments. This is for convenience, effectively to
allow a (stored-as-Unicode-in-a-database, or
retrieved-as-Unicode-from-a-request-parameter) Unicode path to be
passed to ``find_model``, which eventually internally uses the
``traversal_path`` function under the hood. In version 1.2 and
prior, if the ``path`` was Unicode, that Unicode was split on
slashes and each resulting segment value was Unicode. An
inappropriate call to the ``decode()`` method of a resulting Unicode
path segment could cause a ``UnicodeDecodeError`` to occur even if
the Unicode representation of the path contained no 'high order'
characters (it effectively did a "double decode"). By converting
the Unicode path argument to ASCII before we attempt to decode and
split, genuine errors will occur in a more obvious place while also
allowing us to handle (for convenience) the case that it's a Unicode
representation formed entirely from ASCII-compatible characters.
| -rw-r--r-- | CHANGES.txt | 24 | ||||
| -rw-r--r-- | repoze/bfg/tests/test_traversal.py | 12 | ||||
| -rw-r--r-- | repoze/bfg/traversal.py | 27 |
3 files changed, 60 insertions, 3 deletions
diff --git a/CHANGES.txt b/CHANGES.txt index 9550a88fd..8d50d7061 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,3 +1,27 @@ +Next release +------------ + +Features +-------- + +- The ``repoze.bfg.traversal.traversal_path`` API now eagerly attempts + to encode a Unicode ``path`` into ASCII before attempting to split + it and decode its segments. This is for convenience, effectively to + allow a (stored-as-Unicode-in-a-database, or + retrieved-as-Unicode-from-a-request-parameter) Unicode path to be + passed to ``find_model``, which eventually internally uses the + ``traversal_path`` function under the hood. In version 1.2 and + prior, if the ``path`` was Unicode, that Unicode was split on + slashes and each resulting segment value was Unicode. An + inappropriate call to the ``decode()`` method of a resulting Unicode + path segment could cause a ``UnicodeDecodeError`` to occur even if + the Unicode representation of the path contained no 'high order' + characters (it effectively did a "double decode"). By converting + the Unicode path argument to ASCII before we attempt to decode and + split, genuine errors will occur in a more obvious place while also + allowing us to handle (for convenience) the case that it's a Unicode + representation formed entirely from ASCII-compatible characters. 
+ 1.3a14 (2010-09-14) =================== diff --git a/repoze/bfg/tests/test_traversal.py b/repoze/bfg/tests/test_traversal.py index a9727902e..1cae0a05f 100644 --- a/repoze/bfg/tests/test_traversal.py +++ b/repoze/bfg/tests/test_traversal.py @@ -50,6 +50,18 @@ class TraversalPathTests(unittest.TestCase): path = '/'.join([encoded, encoded]) self.assertRaises(URLDecodeError, self._callFUT, path) + def test_unicode_highorder_chars(self): + path = u'/%E6%B5%81%E8%A1%8C%E8%B6%8B%E5%8A%BF' + self.assertEqual(self._callFUT(path), (u'\u6d41\u884c\u8d8b\u52bf',)) + + def test_unicode_simple(self): + path = u'/abc' + self.assertEqual(self._callFUT(path), (u'abc',)) + + def test_unicode_undecodeable_to_ascii(self): + path = unicode('/La Pe\xc3\xb1a', 'utf-8') + self.assertRaises(UnicodeEncodeError, self._callFUT, path) + class ModelGraphTraverserTests(unittest.TestCase): def setUp(self): cleanUp() diff --git a/repoze/bfg/traversal.py b/repoze/bfg/traversal.py index 6bd54cd31..84365e2ff 100644 --- a/repoze/bfg/traversal.py +++ b/repoze/bfg/traversal.py @@ -371,14 +371,33 @@ def virtual_root(model, request): def traversal_path(path): """ Given a ``PATH_INFO`` string (slash-separated path segments), return a tuple representing that path which can be used to - traverse a graph. The ``PATH_INFO`` is split on slashes, creating - a list of segments. Each segment is URL-unquoted, and decoded + traverse a graph. + + The ``PATH_INFO`` is split on slashes, creating a list of + segments. Each segment is URL-unquoted, and subsequently decoded into Unicode. Each segment is assumed to be encoded using the UTF-8 encoding (or a subset, such as ASCII); a :exc:`repoze.bfg.exceptions.URLDecodeError` is raised if a segment cannot be decoded. If a segment name is empty or if it is ``.``, it is ignored. If a segment name is ``..``, the previous segment - is deleted, and the ``..`` is ignored. Examples: + is deleted, and the ``..`` is ignored. 
+ + If this function is passed a Unicode object instead of a string, + that Unicode object *must* be directly encodable to ASCII. For + example, u'/foo' will work but u'/<unprintable unicode>' (a + Unicode object with characters that cannot be encoded to ascii) + will not. + + .. note:: New in version 1.3, this API eagerly attempts to encode a + Unicode ``path`` into ASCII before attempting to split it and + decode its segments. This is for convenience. In version 1.2 + and prior, if the path was Unicode, an inappropriate call to + the ``decode()`` method of a Unicode path segment could cause a + ``UnicodeDecodeError`` to occur even if the Unicode + representation of the path contained no 'high order' + characters. + + Examples: ``/`` @@ -425,6 +444,8 @@ def traversal_path(path): their own traversal machinery, as opposed to users writing applications in :mod:`repoze.bfg`. """ + if isinstance(path, unicode): + path = path.encode('ascii') path = path.strip('/') clean = [] for segment in path.split('/'): |
