sima/lib/http.py

   1 # -*- coding: utf-8 -*-
   2
   3 # Copyright (c) 2014 Jack Kaliko <kaliko@azylum.org>
   4 # Copyright (c) 2012, 2013 Eric Larson <eric@ionrock.org>
   5 #
   6 #   This program is free software: you can redistribute it and/or modify
   7 #   it under the terms of the GNU General Public License as published by
   8 #   the Free Software Foundation, either version 3 of the License, or
   9 #   (at your option) any later version.
  10 #
  11 #   This program is distributed in the hope that it will be useful,
  12 #   but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 #   GNU General Public License for more details.
  15 #
  16 #   You should have received a copy of the GNU General Public License
  17 #   along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 #
  19 #
  20 """
  21 The httplib2 algorithms ported for use with requests.
  22 """
  23 import re
  24 import calendar
  25 import time
  26
  27 import email.utils
  28
  29 from .cache import DictCache
  30
  31
  32 URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")
  33
  34
  35 def parse_uri(uri):
  36     """Parses a URI using the regex given in Appendix B of RFC 3986.
  37
  38         (scheme, authority, path, query, fragment) = parse_uri(uri)
  39     """
  40     groups = URI.match(uri).groups()
  41     return (groups[1], groups[3], groups[4], groups[6], groups[8])
  42
  43
  44 class CacheController(object):
  45     """An interface to see if request should cached or not.
  46     """
  47     def __init__(self, cache=None, cache_etags=True):
  48         self.cache = cache or DictCache()
  49         self.cache_etags = cache_etags
  50
  51     def _urlnorm(self, uri):
  52         """Normalize the URL to create a safe key for the cache"""
  53         (scheme, authority, path, query, _) = parse_uri(uri)
  54         if not scheme or not authority:
  55             raise Exception("Only absolute URIs are allowed. uri = %s" % uri)
  56         authority = authority.lower()
  57         scheme = scheme.lower()
  58         if not path:
  59             path = "/"
  60
  61         # Could do syntax based normalization of the URI before
  62         # computing the digest. See Section 6.2.2 of Std 66.
  63         request_uri = query and "?".join([path, query]) or path
  64         scheme = scheme.lower()
  65         defrag_uri = scheme + "://" + authority + request_uri
  66
  67         return defrag_uri
  68
  69     def cache_url(self, uri):
  70         return self._urlnorm(uri)
  71
  72     def parse_cache_control(self, headers):
  73         """
  74         Parse the cache control headers returning a dictionary with values
  75         for the different directives.
  76         """
  77         retval = {}
  78
  79         # requests provides a CaseInsensitiveDict as headers
  80         cc_header = 'cache-control'
  81         if cc_header in headers:
  82             parts = headers[cc_header].split(',')
  83             parts_with_args = [
  84                 tuple([x.strip().lower() for x in part.split("=", 1)])
  85                 for part in parts if -1 != part.find("=")]
  86             parts_wo_args = [(name.strip().lower(), 1)
  87                              for name in parts if -1 == name.find("=")]
  88             retval = dict(parts_with_args + parts_wo_args)
  89         return retval
  90
  91     def cached_request(self, url, headers):
  92         """Return the cached resquest if available and fresh
  93         """
  94         cache_url = self.cache_url(url)
  95         cc = self.parse_cache_control(headers)
  96
  97         # non-caching states
  98         no_cache = True if 'no-cache' in cc else False
  99         if 'max-age' in cc and cc['max-age'] == 0:
 100             no_cache = True
 101
 102         # see if it is in the cache anyways
 103         in_cache = self.cache.get(cache_url)
 104         if no_cache or not in_cache:
 105             return False
 106
 107         # It is in the cache, so lets see if it is going to be
 108         # fresh enough
 109         resp = self.cache.get(cache_url)
 110
 111         # Check our Vary header to make sure our request headers match
 112         # up. We don't delete it from the though, we just don't return
 113         # our cached value.
 114         #
 115         # NOTE: Because httplib2 stores raw content, it denotes
 116         #       headers that were sent in the original response by
 117         #       adding -varied-$name. We don't have to do that b/c we
 118         #       are storing the object which has a reference to the
 119         #       original request. If that changes, then I'd propose
 120         #       using the varied headers in the cache key to avoid the
 121         #       situation all together.
 122         if 'vary' in resp.headers:
 123             varied_headers = resp.headers['vary'].replace(' ', '').split(',')
 124             original_headers = resp.request.headers
 125             for header in varied_headers:
 126                 # If our headers don't match for the headers listed in
 127                 # the vary header, then don't use the cached response
 128                 if headers.get(header, None) != original_headers.get(header):
 129                     return False
 130
 131         now = time.time()
 132         date = calendar.timegm(
 133             email.utils.parsedate_tz(resp.headers['date'])
 134         )
 135         current_age = max(0, now - date)
 136
 137         # TODO: There is an assumption that the result will be a
 138         # requests response object. This may not be best since we
 139         # could probably avoid instantiating or constructing the
 140         # response until we know we need it.
 141         resp_cc = self.parse_cache_control(resp.headers)
 142
 143         # determine freshness
 144         freshness_lifetime = 0
 145         if 'max-age' in resp_cc and resp_cc['max-age'].isdigit():
 146             freshness_lifetime = int(resp_cc['max-age'])
 147         elif 'expires' in resp.headers:
 148             expires = email.utils.parsedate_tz(resp.headers['expires'])
 149             if expires is not None:
 150                 expire_time = calendar.timegm(expires) - date
 151                 freshness_lifetime = max(0, expire_time)
 152
 153         # determine if we are setting freshness limit in the req
 154         if 'max-age' in cc:
 155             try:
 156                 freshness_lifetime = int(cc['max-age'])
 157             except ValueError:
 158                 freshness_lifetime = 0
 159
 160         if 'min-fresh' in cc:
 161             try:
 162                 min_fresh = int(cc['min-fresh'])
 163             except ValueError:
 164                 min_fresh = 0
 165             # adjust our current age by our min fresh
 166             current_age += min_fresh
 167
 168         # see how fresh we actually are
 169         fresh = (freshness_lifetime > current_age)
 170
 171         if fresh:
 172             # make sure we set the from_cache to true
 173             resp.from_cache = True
 174             return resp
 175
 176         # we're not fresh. If we don't have an Etag, clear it out
 177         if 'etag' not in resp.headers:
 178             self.cache.delete(cache_url)
 179
 180         if 'etag' in resp.headers:
 181             headers['If-None-Match'] = resp.headers['ETag']
 182
 183         if 'last-modified' in resp.headers:
 184             headers['If-Modified-Since'] = resp.headers['Last-Modified']
 185
 186         # return the original handler
 187         return False
 188
 189     def add_headers(self, url):
 190         resp = self.cache.get(url)
 191         if resp and 'etag' in resp.headers:
 192             return {'If-None-Match': resp.headers['etag']}
 193         return {}
 194
 195     def cache_response(self, request, resp):
 196         """
 197         Algorithm for caching requests.
 198
 199         This assumes a requests Response object.
 200         """
 201         # From httplib2: Don't cache 206's since we aren't going to
 202         # handle byte range requests
 203         if resp.status_code not in [200, 203]:
 204             return
 205
 206         cc_req = self.parse_cache_control(request.headers)
 207         cc_resp = self.parse_cache_control(resp.headers)
 208
 209         cache_url = self.cache_url(request.url)
 210
 211         # Delete it from the cache if we happen to have it stored there
 212         no_store = cc_resp.get('no-store') or cc_req.get('no-store')
 213         if no_store and self.cache.get(cache_url):
 214             self.cache.delete(cache_url)
 215
 216         # If we've been given an etag, then keep the response
 217         if self.cache_etags and 'etag' in resp.headers:
 218             self.cache.set(cache_url, resp)
 219
 220         # Add to the cache if the response headers demand it. If there
 221         # is no date header then we can't do anything about expiring
 222         # the cache.
 223         elif 'date' in resp.headers:
 224             # cache when there is a max-age > 0
 225             if cc_resp and cc_resp.get('max-age'):
 226                 if int(cc_resp['max-age']) > 0:
 227                     self.cache.set(cache_url, resp)
 228
 229             # If the request can expire, it means we should cache it
 230             # in the meantime.
 231             elif 'expires' in resp.headers:
 232                 if resp.headers['expires']:
 233                     self.cache.set(cache_url, resp)
 234
 235     def update_cached_response(self, request, response):
 236         """On a 304 we will get a new set of headers that we want to
 237         update our cached value with, assuming we have one.
 238
 239         This should only ever be called when we've sent an ETag and
 240         gotten a 304 as the response.
 241         """
 242         cache_url = self.cache_url(request.url)
 243
 244         resp = self.cache.get(cache_url)
 245
 246         if not resp:
 247             # we didn't have a cached response
 248             return response
 249
 250         # did so lets update our headers
 251         resp.headers.update(response.headers)
 252
 253         # we want a 200 b/c we have content via the cache
 254         request.status_code = 200
 255
 256         # update the request as it has the if-none-match header + any
 257         # other headers that the server might have updated (ie Date,
 258         # Cache-Control, Expires, etc.)
 259         resp.request = request
 260
 261         # update our cache
 262         self.cache.set(cache_url, resp)
 263
 264         # Let everyone know this was from the cache.
 265         resp.from_cache = True
 266
 267         return resp