# repoze/bfg/url.py

""" Utility functions for dealing with URLs in repoze.bfg """

import re
import urllib

from repoze.bfg.location import lineage

def model_url(model, request, *elements, **kw):
    """
    Return the absolute URL of ``model`` as a string.

    The URL is composed of three parts, concatenated in this order:

    1. ``request.application_url`` (expected not to end in a slash).

    2. A path built from the ``__name__`` of every object in the
       lineage of ``model`` (root first), followed by any positional
       ``elements``.  The ``model`` passed in must therefore be
       :term:`location`-aware.

    3. An optional query string (see ``query`` below).

    .. note:: Each name in the lineage is URL-quoted; a unicode name
              is converted to UTF-8 before being quoted.  Empty names
              in the model graph are ignored when the path is
              composed.

    Any positional arguments passed in as ``elements`` must be strings
    or unicode objects.  Each element is URL-quoted (unicode elements
    are converted to a UTF-8 bytestring first), and the quoted
    elements are joined by slashes and appended to the model path.

    .. warning:: If no ``elements`` are specified, the generated URL
                 ends with a trailing slash.  If any ``elements`` are
                 used, the generated URL does *not* end in a trailing
                 slash.

    If a keyword argument ``query`` is present, it is used to compose
    a query string that is tacked on to the end of the URL.  The value
    of ``query`` must be a sequence of two-tuples *or* a data
    structure with an ``.items()`` method that returns a sequence of
    two-tuples (presumably a dictionary).  It is turned into a query
    string by the ``urlencode`` function of this module, a leading
    ``?`` is prepended, and the result is appended to the URL.

    .. note:: ``query`` is encoded with ``doseq=True``, i.e. under the
              same rules as ``urllib.urlencode`` with the ``doseq``
              argument equal to ``True``.  This means that sequences
              can be passed as values, and a k=v pair will be placed
              into the query string for each value.
    """
    # The query string is computed first so that a malformed ``query``
    # fails before the model graph is walked.
    if 'query' in kw:
        query_string = '?' + urlencode(kw['query'], doseq=True)
    else:
        query_string = ''

    # lineage() yields the model first and its ancestors afterwards;
    # collect the quoted, non-empty names and flip them to root-first.
    quoted_names = []
    for node in lineage(model):
        node_name = node.__name__
        if not node_name:
            continue
        quoted_names.append(_urlsegment(node_name))
    quoted_names.reverse()

    model_path = '/'.join(quoted_names)
    element_path = '/'.join(map(_urlsegment, elements))

    # With no elements this leaves the trailing slash described above.
    full_path = model_path + '/' + element_path
    if not full_path.startswith('/'):
        full_path = '/' + full_path

    return request.application_url + full_path + query_string

def urlencode(query, doseq=False):
    """
    Compose a query string from ``query``, tolerating unicode.

    This is a thin wrapper around Python's stdlib `urllib.urlencode
    function <http://docs.python.org/library/urllib.html>`_.  Before
    the data is handed to the stdlib function, every key and value
    that is a unicode object is converted to a UTF-8 bytestring; the
    same is done for each member of a value that is a tuple or a list
    (such a value is always passed on as a list).  Everything else is
    passed through untouched.

    ``query`` must be a sequence of two-tuples representing key/value
    pairs *or* an object (often a dictionary) with an ``.items()``
    method that returns such a sequence.  ``doseq`` is forwarded to
    the stdlib function and controls what happens when a sequence is
    presented as one of the values.  See the Python stdlib
    documentation for more information.
    """
    def _utf8(obj):
        # Deliberately an exact class check rather than isinstance():
        # only plain ``unicode`` objects are encoded.
        if obj.__class__ is unicode:
            return obj.encode('utf-8')
        return obj

    if hasattr(query, 'items'):
        # dictionary-like: flatten to key/value pairs
        pairs = query.items()
    else:
        # presumed to already be a sequence of two-tuples
        pairs = query

    encoded = []
    for key, value in pairs:
        key = _utf8(key)
        if isinstance(value, (tuple, list)):
            value = [_utf8(member) for member in value]
        else:
            value = _utf8(value)
        encoded.append((key, value))

    return urllib.urlencode(encoded, doseq=doseq)

# Maps an original path segment (string or unicode) to its URL-quoted form.
_segment_cache = {}

def _urlsegment(s):
    """ Return the URL-quoted form of the path segment ``s``; a
    unicode ``s`` is converted to UTF-8 before it is quoted.

    As an optimization, every computed result is remembered in the
    module-scope ``_segment_cache`` dictionary, keyed by the original
    string (or unicode) value, so a segment that has been seen before
    is looked up instead of being re-encoded and re-quoted """
    try:
        return _segment_cache[s]
    except KeyError:
        pass

    segment = s
    if segment.__class__ is unicode: # isinstance slightly slower (~15%)
        segment = segment.encode('utf-8')
    quoted = _url_quote(segment)
    # no lock is taken around the cache mutation: the original author
    # notes that the store below is a single Python bytecode
    # (STORE_SUBSCR)
    _segment_cache[s] = quoted
    return quoted


always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')
_safemaps = {}
_must_quote = {}

def _url_quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Faster version of Python stdlib urllib.quote.  See
    http://bugs.python.org/issue1285086 for more information.

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.
    """
    cachekey = (safe, always_safe)
    try:
        safe_map = _safemaps[cachekey]
        if not _must_quote[cachekey].search(s):
            return s
    except KeyError:
        safe += always_safe
        _must_quote[cachekey] = re.compile(r'[^%s]' % safe)
        safe_map = {}
        for i in range(256):
            c = chr(i)
            safe_map[c] = (c in safe) and c or ('%%%02X' % i)
        _safemaps[cachekey] = safe_map
    res = map(safe_map.__getitem__, s)
    return ''.join(res)