sima/lib/http.py

   1 # -*- coding: utf-8 -*-
   2
   3 # Copyright (c) 2014 Jack Kaliko <kaliko@azylum.org>
   4 # Copyright (c) 2012, 2013 Eric Larson <eric@ionrock.org>
   5 #
   6 #   This program is free software: you can redistribute it and/or modify
   7 #   it under the terms of the GNU General Public License as published by
   8 #   the Free Software Foundation, either version 3 of the License, or
   9 #   (at your option) any later version.
  10 #
  11 #   This program is distributed in the hope that it will be useful,
  12 #   but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 #   GNU General Public License for more details.
  15 #
  16 #   You should have received a copy of the GNU General Public License
  17 #   along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 #
  19 #
  20 """
  21 The httplib2 algorithms ported for use with requests.
  22 """
  23 import re
  24 import calendar
  25 import time
  26
  27 import email.utils
  28
  29 from .cache import DictCache
  30
  31
  32 URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")
  33
  34
  35 def parse_uri(uri):
  36     """Parses a URI using the regex given in Appendix B of RFC 3986.
  37
  38         (scheme, authority, path, query, fragment) = parse_uri(uri)
  39     """
  40     groups = URI.match(uri).groups()
  41     return (groups[1], groups[3], groups[4], groups[6], groups[8])
  42
  43
  44 class CacheController(object):
  45     """An interface to see if request should cached or not.
  46     """
  47     def __init__(self, cache=None, cache_etags=True):
  48         self.cache = cache or DictCache()
  49         self.cache_etags = cache_etags
  50
  51     def _urlnorm(self, uri):
  52         """Normalize the URL to create a safe key for the cache"""
  53         (scheme, authority, path, query, _) = parse_uri(uri)
  54         if not scheme or not authority:
  55             raise Exception("Only absolute URIs are allowed. uri = %s" % uri)
  56         authority = authority.lower()
  57         scheme = scheme.lower()
  58         if not path:
  59             path = "/"
  60
  61         # Order of params might changed
  62         query = ''.join(sorted(query.split('&')))
  63         # Could do syntax based normalization of the URI before
  64         # computing the digest. See Section 6.2.2 of Std 66.
  65         request_uri = query and "?".join([path, query]) or path
  66         scheme = scheme.lower()
  67         defrag_uri = scheme + "://" + authority + request_uri
  68
  69         return defrag_uri
  70
  71     def cache_url(self, uri):
  72         return self._urlnorm(uri)
  73
  74     def parse_cache_control(self, headers):
  75         """
  76         Parse the cache control headers returning a dictionary with values
  77         for the different directives.
  78         """
  79         retval = {}
  80
  81         # requests provides a CaseInsensitiveDict as headers
  82         cc_header = 'cache-control'
  83         if cc_header in headers:
  84             parts = headers[cc_header].split(',')
  85             parts_with_args = [
  86                 tuple([x.strip().lower() for x in part.split("=", 1)])
  87                 for part in parts if -1 != part.find("=")]
  88             parts_wo_args = [(name.strip().lower(), 1)
  89                              for name in parts if -1 == name.find("=")]
  90             retval = dict(parts_with_args + parts_wo_args)
  91         return retval
  92
  93     def cached_request(self, url, headers):
  94         """Return the cached resquest if available and fresh
  95         """
  96         cache_url = self.cache_url(url)
  97         cc = self.parse_cache_control(headers)
  98
  99         # non-caching states
 100         no_cache = True if 'no-cache' in cc else False
 101         if 'max-age' in cc and cc['max-age'] == 0:
 102             no_cache = True
 103
 104         # see if it is in the cache anyways
 105         in_cache = self.cache.get(cache_url)
 106         if no_cache or not in_cache:
 107             return False
 108
 109         # It is in the cache, so lets see if it is going to be
 110         # fresh enough
 111         resp = self.cache.get(cache_url)
 112
 113         # Check our Vary header to make sure our request headers match
 114         # up. We don't delete it from the though, we just don't return
 115         # our cached value.
 116         #
 117         # NOTE: Because httplib2 stores raw content, it denotes
 118         #       headers that were sent in the original response by
 119         #       adding -varied-$name. We don't have to do that b/c we
 120         #       are storing the object which has a reference to the
 121         #       original request. If that changes, then I'd propose
 122         #       using the varied headers in the cache key to avoid the
 123         #       situation all together.
 124         if 'vary' in resp.headers:
 125             varied_headers = resp.headers['vary'].replace(' ', '').split(',')
 126             original_headers = resp.request.headers
 127             for header in varied_headers:
 128                 # If our headers don't match for the headers listed in
 129                 # the vary header, then don't use the cached response
 130                 if headers.get(header, None) != original_headers.get(header):
 131                     return False
 132
 133         now = time.time()
 134         date = calendar.timegm(
 135             email.utils.parsedate_tz(resp.headers['date'])
 136         )
 137         current_age = max(0, now - date)
 138
 139         # TODO: There is an assumption that the result will be a
 140         # requests response object. This may not be best since we
 141         # could probably avoid instantiating or constructing the
 142         # response until we know we need it.
 143         resp_cc = self.parse_cache_control(resp.headers)
 144
 145         # determine freshness
 146         freshness_lifetime = 0
 147         if 'max-age' in resp_cc and resp_cc['max-age'].isdigit():
 148             freshness_lifetime = int(resp_cc['max-age'])
 149         elif 'expires' in resp.headers:
 150             expires = email.utils.parsedate_tz(resp.headers['expires'])
 151             if expires is not None:
 152                 expire_time = calendar.timegm(expires) - date
 153                 freshness_lifetime = max(0, expire_time)
 154
 155         # determine if we are setting freshness limit in the req
 156         if 'max-age' in cc:
 157             try:
 158                 freshness_lifetime = int(cc['max-age'])
 159             except ValueError:
 160                 freshness_lifetime = 0
 161
 162         if 'min-fresh' in cc:
 163             try:
 164                 min_fresh = int(cc['min-fresh'])
 165             except ValueError:
 166                 min_fresh = 0
 167             # adjust our current age by our min fresh
 168             current_age += min_fresh
 169
 170         # see how fresh we actually are
 171         fresh = (freshness_lifetime > current_age)
 172
 173         if fresh:
 174             # make sure we set the from_cache to true
 175             resp.from_cache = True
 176             return resp
 177
 178         # we're not fresh. If we don't have an Etag, clear it out
 179         if 'etag' not in resp.headers:
 180             self.cache.delete(cache_url)
 181
 182         if 'etag' in resp.headers:
 183             headers['If-None-Match'] = resp.headers['ETag']
 184
 185         if 'last-modified' in resp.headers:
 186             headers['If-Modified-Since'] = resp.headers['Last-Modified']
 187
 188         # return the original handler
 189         return False
 190
 191     def add_headers(self, url):
 192         resp = self.cache.get(url)
 193         if resp and 'etag' in resp.headers:
 194             return {'If-None-Match': resp.headers['etag']}
 195         return {}
 196
 197     def cache_response(self, request, resp):
 198         """
 199         Algorithm for caching requests.
 200
 201         This assumes a requests Response object.
 202         """
 203         # From httplib2: Don't cache 206's since we aren't going to
 204         # handle byte range requests
 205         if resp.status_code not in [200, 203]:
 206             return
 207
 208         cc_req = self.parse_cache_control(request.headers)
 209         cc_resp = self.parse_cache_control(resp.headers)
 210
 211         cache_url = self.cache_url(request.url)
 212
 213         # Delete it from the cache if we happen to have it stored there
 214         no_store = cc_resp.get('no-store') or cc_req.get('no-store')
 215         if no_store and self.cache.get(cache_url):
 216             self.cache.delete(cache_url)
 217
 218         # If we've been given an etag, then keep the response
 219         if self.cache_etags and 'etag' in resp.headers:
 220             self.cache.set(cache_url, resp)
 221
 222         # Add to the cache if the response headers demand it. If there
 223         # is no date header then we can't do anything about expiring
 224         # the cache.
 225         elif 'date' in resp.headers:
 226             # cache when there is a max-age > 0
 227             if cc_resp and cc_resp.get('max-age'):
 228                 if int(cc_resp['max-age']) > 0:
 229                     self.cache.set(cache_url, resp)
 230
 231             # If the request can expire, it means we should cache it
 232             # in the meantime.
 233             elif 'expires' in resp.headers:
 234                 if resp.headers['expires']:
 235                     self.cache.set(cache_url, resp)
 236
 237     def update_cached_response(self, request, response):
 238         """On a 304 we will get a new set of headers that we want to
 239         update our cached value with, assuming we have one.
 240
 241         This should only ever be called when we've sent an ETag and
 242         gotten a 304 as the response.
 243         """
 244         cache_url = self.cache_url(request.url)
 245
 246         resp = self.cache.get(cache_url)
 247
 248         if not resp:
 249             # we didn't have a cached response
 250             return response
 251
 252         # did so lets update our headers
 253         resp.headers.update(response.headers)
 254
 255         # we want a 200 b/c we have content via the cache
 256         request.status_code = 200
 257
 258         # update the request as it has the if-none-match header + any
 259         # other headers that the server might have updated (ie Date,
 260         # Cache-Control, Expires, etc.)
 261         resp.request = request
 262
 263         # update our cache
 264         self.cache.set(cache_url, resp)
 265
 266         # Let everyone know this was from the cache.
 267         resp.from_cache = True
 268
 269         return resp