""" Utility functions for dealing with URLs in repoze.bfg """

import re
import urllib

from repoze.bfg.location import lineage

def model_url(model, request, *elements, **kw):
    """
    Generate a string representing the absolute URL of the model
    object based on the ``wsgi.url_scheme``, ``HTTP_HOST`` or
    ``SERVER_NAME`` in the request, plus any ``SCRIPT_NAME``.  If a
    ``query`` keyword argument is provided, a query string based on
    its value will be composed and appended to the generated URL
    string (see details below).  The overall result of this function
    is always a string (never unicode).  The ``model`` passed in must
    be :term:`location`-aware.

    .. note:: If any model in the lineage has a unicode name, it will
              be converted to UTF-8 before being attached to the URL.
              When composing the path based on the model lineage,
              empty names in the model graph are ignored.

    Any positional arguments passed in as ``elements`` must be strings
    or unicode objects.  These will be joined by slashes and appended
    to the generated model URL.  Each of the elements passed in is
    URL-quoted before being appended; if any element is unicode, it
    will be converted to a UTF-8 bytestring before being URL-quoted.

    .. warning:: if no ``elements`` arguments are specified, the model
                 URL will end with a trailing slash.  If any
                 ``elements`` are used, the generated URL will *not*
                 end in a trailing slash.

    If a keyword argument ``query`` is present, it will be used to
    compose a query string that will be tacked on to the end of the
    URL.  The value of ``query`` must be a sequence of two-tuples *or*
    a data structure with an ``.items()`` method that returns a
    sequence of two-tuples (presumably a dictionary).  This data
    structure will be turned into a query string per the documentation
    of the ``urlencode`` function defined in this module.  After the
    query data is turned into a query string, a leading ``?`` is
    prepended, and the resulting string is appended to the generated
    URL.

    .. note:: Python data structures that are passed as ``query``
              which are sequences or dictionaries are turned into a
              string under the same rules as when run through
              urllib.urlencode with the ``doseq`` argument equal to
              ``True``.  This means that sequences can be passed as
              values, and a k=v pair will be placed into the query
              string for each value.

    Keyword arguments other than ``query`` are accepted but ignored.
    The return value is ``request.application_url`` followed by the
    quoted path and the optional query string.
    """
    qs = ''
    if 'query' in kw:
        qs = '?' + urlencode(kw['query'], doseq=True)

    # lineage() is walked in the order it yields locations and the
    # collected segments are reversed afterwards to build the path.
    rpath = []
    for location in lineage(model):
        name = location.__name__
        if name:
            rpath.append(_urlsegment(name))
    prefix = '/'.join(reversed(rpath))
    suffix = '/'.join([_urlsegment(s) for s in elements])
    # Joining prefix and suffix with a slash is what produces the
    # trailing slash when no elements were given (suffix == '').
    path = '/'.join([prefix, suffix])
    if not path.startswith('/'):
        path = '/' + path

    app_url = request.application_url # never ends in a slash
    return app_url + path + qs

def urlencode(query, doseq=False):
    """
    A wrapper around Python's stdlib ``urllib.urlencode`` function
    which accepts unicode keys and values within the ``query``
    dict/sequence; all Unicode keys and values are first converted to
    UTF-8 before being used to compose the query string.  The behavior
    of the function is otherwise the same as the stdlib version.

    The value of ``query`` must be a sequence of two-tuples
    representing key/value pairs *or* an object (often a dictionary)
    with an ``.items()`` method that returns a sequence of two-tuples
    representing key/value pairs.  ``doseq`` controls what happens
    when a sequence is presented as one of the values.  See the Python
    stdlib documentation for more information.

    Only values that are ``tuple`` or ``list`` instances have their
    members encoded individually; such a value is passed on to
    ``urllib.urlencode`` as a list.  The return value is whatever
    ``urllib.urlencode`` returns for the converted pairs.
    """
    if hasattr(query, 'items'):
        # dictionary
        query = query.items()
    # presumed to be a sequence of two-tuples
    newquery = []
    for k, v in query:
        if k.__class__ is unicode:
            k = k.encode('utf-8')

        if isinstance(v, (tuple, list)):
            L = []
            for x in v:
                if x.__class__ is unicode:
                    x = x.encode('utf-8')
                L.append(x)
            v = L
        elif v.__class__ is unicode:
            v = v.encode('utf-8')
        newquery.append((k, v))
    return urllib.urlencode(newquery, doseq=doseq)

# Maps an original path segment (string or unicode) to its URL-quoted
# form; filled lazily by _urlsegment and never pruned.
_segment_cache = {}

def _urlsegment(s):
    """ Return the URL-quoted form of the path segment ``s``; a
    unicode ``s`` is encoded to UTF-8 before being quoted.

    The bit of this code that deals with ``_segment_cache`` is an
    optimization: we cache all the computation of URL path segments
    in this module-scope dictionary with the original string (or
    unicode value) as the key, so we can look it up later without
    needing to reencode or re-url-quote it """
    result = _segment_cache.get(s)
    if result is None:
        if s.__class__ is unicode: # isinstance slightly slower (~15%)
            result = _url_quote(s.encode('utf-8'))
        else:
            result = _url_quote(s)
        # we don't need a lock to mutate _segment_cache, as the below
        # will generate exactly one Python bytecode (STORE_SUBSCR)
        _segment_cache[s] = result
    return result


# Characters that _url_quote never quotes, whatever ``safe`` it is given.
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')
# Both caches are keyed by ``(safe, always_safe)``: _safemaps holds the
# per-character replacement tables, _must_quote the compiled patterns
# that match any character needing to be quoted.
_safemaps = {}
_must_quote = {}

def _url_quote(s, safe='/'):
    """quote('abc def') -> 'abc%20def'

    Faster version of Python stdlib urllib.quote.  See
    http://bugs.python.org/issue1285086 for more information.

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    ``s`` is the string to quote and ``safe`` names additional
    characters (besides ``always_safe``) that are left untouched.
    Every other character is replaced by ``%XX``, its ordinal in
    upper-case hexadecimal.  The replacement table only covers the
    256 values of ``chr(i)``, so a character outside that table
    raises ``KeyError``.  If nothing in ``s`` needs quoting and the
    tables for this ``safe`` value already exist, ``s`` itself is
    returned.
    """
    cachekey = (safe, always_safe)
    try:
        safe_map = _safemaps[cachekey]
        if not _must_quote[cachekey].search(s):
            # fast path: no character of ``s`` needs quoting
            return s
    except KeyError:
        safe += always_safe
        # ``safe`` is escaped before being placed inside the character
        # set; otherwise a caller-supplied ``-`` or backslash would be
        # read as a range or an escape and the pattern would disagree
        # with ``safe_map`` below.
        # The pattern is stored before the map, so whenever the
        # ``_safemaps`` lookup above succeeds the pattern exists too.
        _must_quote[cachekey] = re.compile(r'[^%s]' % re.escape(safe))
        safe_map = {}
        for i in range(256):
            c = chr(i)
            if c in safe:
                safe_map[c] = c
            else:
                safe_map[c] = '%%%02X' % i
        _safemaps[cachekey] = safe_map
    res = map(safe_map.__getitem__, s)
    return ''.join(res)