# -*- coding: utf-8 -*-
#
# Copyright (c) 2014-2015 Jack Kaliko <kaliko@azylum.org>
# Copyright (c) 2012, 2013 Eric Larson <eric@ionrock.org>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""
The httplib2 algorithms ported for use with requests.
"""
import calendar
import email.utils
import re
import time

from requests import Session, Request, Timeout, ConnectionError

from sima import SOCKET_TIMEOUT, WAIT_BETWEEN_REQUESTS
from sima.utils.utils import WSError, WSTimeout, WSHTTPError, Throttle
from .cache import DictCache
36 URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")
40 """Parses a URI using the regex given in Appendix B of RFC 3986.
42 (scheme, authority, path, query, fragment) = parse_uri(uri)
44 groups = URI.match(uri).groups()
45 return (groups[1], groups[3], groups[4], groups[6], groups[8])
48 class CacheController(object):
49 """An interface to see if request should cached or not.
53 def __init__(self, cache=None, cache_etags=True):
54 self.cache = cache or DictCache()
55 self.cache_etags = cache_etags
57 def _urlnorm(self, uri):
58 """Normalize the URL to create a safe key for the cache"""
59 (scheme, authority, path, query, _) = parse_uri(uri)
60 if not scheme or not authority:
61 raise Exception("Only absolute URIs are allowed. uri = %s" % uri)
62 authority = authority.lower()
63 scheme = scheme.lower()
67 # Could do syntax based normalization of the URI before
68 # computing the digest. See Section 6.2.2 of Std 66.
69 request_uri = query and "?".join([path, query]) or path
70 scheme = scheme.lower()
71 defrag_uri = scheme + "://" + authority + request_uri
75 def cache_url(self, uri):
76 return self._urlnorm(uri)
78 def parse_cache_control(self, headers):
80 Parse the cache control headers returning a dictionary with values
81 for the different directives.
85 # requests provides a CaseInsensitiveDict as headers
86 cc_header = 'cache-control'
87 if cc_header in headers:
88 parts = headers[cc_header].split(',')
90 tuple([x.strip().lower() for x in part.split("=", 1)])
91 for part in parts if -1 != part.find("=")]
92 parts_wo_args = [(name.strip().lower(), 1)
93 for name in parts if -1 == name.find("=")]
94 retval = dict(parts_with_args + parts_wo_args)
97 def cached_request(self, request):
98 """Return the cached resquest if available and fresh
100 cache_url = self.cache_url(request.url)
101 cc = self.parse_cache_control(request.headers)
104 no_cache = True if 'no-cache' in cc else False
105 if 'max-age' in cc and cc['max-age'] == 0:
107 # see if it is in the cache anyways
108 in_cache = self.cache.get(cache_url)
109 if no_cache or not in_cache:
112 # It is in the cache, so lets see if it is going to be
114 resp = self.cache.get(cache_url)
116 # Check our Vary header to make sure our request headers match
117 # up. We don't delete it from the though, we just don't return
120 # NOTE: Because httplib2 stores raw content, it denotes
121 # headers that were sent in the original response by
122 # adding -varied-$name. We don't have to do that b/c we
123 # are storing the object which has a reference to the
124 # original request. If that changes, then I'd propose
125 # using the varied headers in the cache key to avoid the
126 # situation all together.
127 if 'vary' in resp.headers:
128 varied_headers = resp.headers['vary'].replace(' ', '').split(',')
129 original_headers = resp.request.headers
130 for header in varied_headers:
131 # If our headers don't match for the headers listed in
132 # the vary header, then don't use the cached response
133 if request.headers.get(header, None) != original_headers.get(header):
137 date = calendar.timegm(
138 email.utils.parsedate_tz(resp.headers['date'])
140 current_age = max(0, now - date)
142 # TODO: There is an assumption that the result will be a
143 # requests response object. This may not be best since we
144 # could probably avoid instantiating or constructing the
145 # response until we know we need it.
146 resp_cc = self.parse_cache_control(resp.headers)
148 # determine freshness
149 freshness_lifetime = 0
150 if 'max-age' in resp_cc and resp_cc['max-age'].isdigit():
151 freshness_lifetime = int(resp_cc['max-age'])
152 elif 'expires' in resp.headers:
153 expires = email.utils.parsedate_tz(resp.headers['expires'])
154 if expires is not None:
155 expire_time = calendar.timegm(expires) - date
156 freshness_lifetime = max(0, expire_time)
158 # determine if we are setting freshness limit in the req
161 freshness_lifetime = int(cc['max-age'])
163 freshness_lifetime = 0
165 if 'min-fresh' in cc:
167 min_fresh = int(cc['min-fresh'])
170 # adjust our current age by our min fresh
171 current_age += min_fresh
173 # see how fresh we actually are
174 fresh = (freshness_lifetime > current_age)
177 # make sure we set the from_cache to true
178 resp.from_cache = True
181 # we're not fresh. If we don't have an Etag, clear it out
182 if 'etag' not in resp.headers:
183 self.cache.delete(cache_url)
185 if 'etag' in resp.headers:
186 request.headers['If-None-Match'] = resp.headers['ETag']
188 if 'last-modified' in resp.headers:
189 request.headers['If-Modified-Since'] = resp.headers['Last-Modified']
191 # return the original handler
194 def cache_response(self, request, resp):
196 Algorithm for caching requests.
198 This assumes a requests Response object.
200 # From httplib2: Don't cache 206's since we aren't going to
201 # handle byte range requests
202 if resp.status_code not in [200, 203]:
205 cc_req = self.parse_cache_control(request.headers)
206 cc_resp = self.parse_cache_control(resp.headers)
208 cache_url = self.cache_url(request.url)
210 # Delete it from the cache if we happen to have it stored there
211 no_store = cc_resp.get('no-store') or cc_req.get('no-store')
212 if no_store and self.cache.get(cache_url):
213 self.cache.delete(cache_url)
215 # If we've been given an etag, then keep the response
216 if self.cache_etags and 'etag' in resp.headers:
217 self.cache.set(cache_url, resp)
219 # Add to the cache if the response headers demand it. If there
220 # is no date header then we can't do anything about expiring
222 elif 'date' in resp.headers:
223 # cache when there is a max-age > 0
224 if cc_resp and cc_resp.get('max-age'):
225 if int(cc_resp['max-age']) > 0:
226 self.cache.set(cache_url, resp)
228 # If the request can expire, it means we should cache it
230 elif 'expires' in resp.headers:
231 if resp.headers['expires']:
232 self.cache.set(cache_url, resp)
233 # Force one month max age if no Cache-Control header is found
234 # Overriding header disappearance on LastFM web service...
235 # https://getsatisfaction.com/lastfm/topics/-web-api-http-cache-control-header
236 elif CacheController.CACHE_ANYWAY:
237 resp.headers['Cache-Control'] = 'max-age=2419200'
238 self.cache.set(cache_url, resp)
240 def update_cached_response(self, request, response):
241 """On a 304 we will get a new set of headers that we want to
242 update our cached value with, assuming we have one.
244 This should only ever be called when we've sent an ETag and
245 gotten a 304 as the response.
247 cache_url = self.cache_url(request.url)
249 resp = self.cache.get(cache_url)
252 # we didn't have a cached response
255 # did so lets update our headers
256 resp.headers.update(response.headers)
258 # we want a 200 b/c we have content via the cache
259 request.status_code = 200
261 # update the request as it has the if-none-match header + any
262 # other headers that the server might have updated (ie Date,
263 # Cache-Control, Expires, etc.)
264 resp.request = request
267 self.cache.set(cache_url, resp)
269 # Let everyone know this was from the cache.
270 resp.from_cache = True
276 def __init__(self, cache=None, stats=None):
279 Use cached elements or proceed http request
282 self.controller = CacheController(cache)
284 def __call__(self, ress, payload):
285 req = Request('GET', ress, params=payload,).prepare()
287 self.stats.update(total=self.stats.get('total')+1)
288 cached_response = self.controller.cached_request(req)
291 self.stats.update(ccontrol=self.stats.get('ccontrol')+1)
292 return cached_response
294 return self.fetch_ws(req)
296 raise WSTimeout('Failed to reach server within {0}s'.format(
298 except ConnectionError as err:
301 @Throttle(WAIT_BETWEEN_REQUESTS)
302 def fetch_ws(self, prepreq):
303 """fetch from web service"""
305 settings = sess.merge_environment_settings(prepreq.url, {}, None, False, None)
306 resp = sess.send(prepreq, timeout=SOCKET_TIMEOUT, **settings)
307 if resp.status_code == 304:
308 self.stats.update(etag=self.stats.get('etag')+1)
309 resp = self.controller.update_cached_response(prepreq, resp)
310 elif resp.status_code != 200:
311 raise WSHTTPError('{0.status_code}: {0.reason}'.format(resp))
312 self.controller.cache_response(resp.request, resp)
316 # vim: ai ts=4 sw=4 sts=4 expandtab