sima/lib/http.py

   1 # -*- coding: utf-8 -*-
   2
   3 # Copyright (c) 2014-2015 Jack Kaliko <kaliko@azylum.org>
   4 # Copyright (c) 2012, 2013 Eric Larson <eric@ionrock.org>
   5 #
   6 #   This program is free software: you can redistribute it and/or modify
   7 #   it under the terms of the GNU General Public License as published by
   8 #   the Free Software Foundation, either version 3 of the License, or
   9 #   (at your option) any later version.
  10 #
  11 #   This program is distributed in the hope that it will be useful,
  12 #   but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 #   GNU General Public License for more details.
  15 #
  16 #   You should have received a copy of the GNU General Public License
  17 #   along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 #
  19 #
  20 """
  21 The httplib2 algorithms ported for use with requests.
  22 """
  23 import re
  24 import calendar
  25 import time
  26
  27 import email.utils
  28
  29 from requests import Session, Request, Timeout, ConnectionError
  30
  31 from sima import SOCKET_TIMEOUT, WAIT_BETWEEN_REQUESTS
  32 from sima.utils.utils import WSError, WSTimeout, WSHTTPError, Throttle
  33 from .cache import DictCache
  34
  35
  36 URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")
  37
  38
  39 def parse_uri(uri):
  40     """Parses a URI using the regex given in Appendix B of RFC 3986.
  41
  42         (scheme, authority, path, query, fragment) = parse_uri(uri)
  43     """
  44     groups = URI.match(uri).groups()
  45     return (groups[1], groups[3], groups[4], groups[6], groups[8])
  46
  47
  48 class CacheController(object):
  49     """An interface to see if request should cached or not.
  50     """
  51     def __init__(self, cache=None, cache_etags=True):
  52         self.cache = cache or DictCache()
  53         self.cache_etags = cache_etags
  54
  55     def _urlnorm(self, uri):
  56         """Normalize the URL to create a safe key for the cache"""
  57         (scheme, authority, path, query, _) = parse_uri(uri)
  58         if not scheme or not authority:
  59             raise Exception("Only absolute URIs are allowed. uri = %s" % uri)
  60         authority = authority.lower()
  61         scheme = scheme.lower()
  62         if not path:
  63             path = "/"
  64
  65         # Could do syntax based normalization of the URI before
  66         # computing the digest. See Section 6.2.2 of Std 66.
  67         request_uri = query and "?".join([path, query]) or path
  68         scheme = scheme.lower()
  69         defrag_uri = scheme + "://" + authority + request_uri
  70
  71         return defrag_uri
  72
  73     def cache_url(self, uri):
  74         return self._urlnorm(uri)
  75
  76     def parse_cache_control(self, headers):
  77         """
  78         Parse the cache control headers returning a dictionary with values
  79         for the different directives.
  80         """
  81         retval = {}
  82
  83         # requests provides a CaseInsensitiveDict as headers
  84         cc_header = 'cache-control'
  85         if cc_header in headers:
  86             parts = headers[cc_header].split(',')
  87             parts_with_args = [
  88                 tuple([x.strip().lower() for x in part.split("=", 1)])
  89                 for part in parts if -1 != part.find("=")]
  90             parts_wo_args = [(name.strip().lower(), 1)
  91                              for name in parts if -1 == name.find("=")]
  92             retval = dict(parts_with_args + parts_wo_args)
  93         return retval
  94
  95     def cached_request(self, request):
  96         """Return the cached resquest if available and fresh
  97         """
  98         cache_url = self.cache_url(request.url)
  99         cc = self.parse_cache_control(request.headers)
 100
 101         # non-caching states
 102         no_cache = True if 'no-cache' in cc else False
 103         if 'max-age' in cc and cc['max-age'] == 0:
 104             no_cache = True
 105
 106         # see if it is in the cache anyways
 107         in_cache = self.cache.get(cache_url)
 108         if no_cache or not in_cache:
 109             return False
 110
 111         # It is in the cache, so lets see if it is going to be
 112         # fresh enough
 113         resp = self.cache.get(cache_url)
 114
 115         # Check our Vary header to make sure our request headers match
 116         # up. We don't delete it from the though, we just don't return
 117         # our cached value.
 118         #
 119         # NOTE: Because httplib2 stores raw content, it denotes
 120         #       headers that were sent in the original response by
 121         #       adding -varied-$name. We don't have to do that b/c we
 122         #       are storing the object which has a reference to the
 123         #       original request. If that changes, then I'd propose
 124         #       using the varied headers in the cache key to avoid the
 125         #       situation all together.
 126         if 'vary' in resp.headers:
 127             varied_headers = resp.headers['vary'].replace(' ', '').split(',')
 128             original_headers = resp.request.headers
 129             for header in varied_headers:
 130                 # If our headers don't match for the headers listed in
 131                 # the vary header, then don't use the cached response
 132                 if request.headers.get(header, None) != original_headers.get(header):
 133                     return False
 134
 135         now = time.time()
 136         date = calendar.timegm(
 137             email.utils.parsedate_tz(resp.headers['date'])
 138         )
 139         current_age = max(0, now - date)
 140
 141         # TODO: There is an assumption that the result will be a
 142         # requests response object. This may not be best since we
 143         # could probably avoid instantiating or constructing the
 144         # response until we know we need it.
 145         resp_cc = self.parse_cache_control(resp.headers)
 146
 147         # determine freshness
 148         freshness_lifetime = 0
 149         if 'max-age' in resp_cc and resp_cc['max-age'].isdigit():
 150             freshness_lifetime = int(resp_cc['max-age'])
 151         elif 'expires' in resp.headers:
 152             expires = email.utils.parsedate_tz(resp.headers['expires'])
 153             if expires is not None:
 154                 expire_time = calendar.timegm(expires) - date
 155                 freshness_lifetime = max(0, expire_time)
 156
 157         # determine if we are setting freshness limit in the req
 158         if 'max-age' in cc:
 159             try:
 160                 freshness_lifetime = int(cc['max-age'])
 161             except ValueError:
 162                 freshness_lifetime = 0
 163
 164         if 'min-fresh' in cc:
 165             try:
 166                 min_fresh = int(cc['min-fresh'])
 167             except ValueError:
 168                 min_fresh = 0
 169             # adjust our current age by our min fresh
 170             current_age += min_fresh
 171
 172         # see how fresh we actually are
 173         fresh = (freshness_lifetime > current_age)
 174
 175         if fresh:
 176             # make sure we set the from_cache to true
 177             resp.from_cache = True
 178             return resp
 179
 180         # we're not fresh. If we don't have an Etag, clear it out
 181         if 'etag' not in resp.headers:
 182             self.cache.delete(cache_url)
 183
 184         if 'etag' in resp.headers:
 185             request.headers['If-None-Match'] = resp.headers['ETag']
 186
 187         if 'last-modified' in resp.headers:
 188             request.headers['If-Modified-Since'] = resp.headers['Last-Modified']
 189
 190         # return the original handler
 191         return False
 192
 193     def add_headers(self, url):
 194         resp = self.cache.get(url)
 195         if resp and 'etag' in resp.headers:
 196             return {'If-None-Match': resp.headers['etag']}
 197         return {}
 198
 199     def cache_response(self, request, resp):
 200         """
 201         Algorithm for caching requests.
 202
 203         This assumes a requests Response object.
 204         """
 205         # From httplib2: Don't cache 206's since we aren't going to
 206         # handle byte range requests
 207         if resp.status_code not in [200, 203]:
 208             return
 209
 210         cc_req = self.parse_cache_control(request.headers)
 211         cc_resp = self.parse_cache_control(resp.headers)
 212
 213         cache_url = self.cache_url(request.url)
 214
 215         # Delete it from the cache if we happen to have it stored there
 216         no_store = cc_resp.get('no-store') or cc_req.get('no-store')
 217         if no_store and self.cache.get(cache_url):
 218             self.cache.delete(cache_url)
 219
 220         # If we've been given an etag, then keep the response
 221         if self.cache_etags and 'etag' in resp.headers:
 222             self.cache.set(cache_url, resp)
 223
 224         # Add to the cache if the response headers demand it. If there
 225         # is no date header then we can't do anything about expiring
 226         # the cache.
 227         elif 'date' in resp.headers:
 228             # cache when there is a max-age > 0
 229             if cc_resp and cc_resp.get('max-age'):
 230                 if int(cc_resp['max-age']) > 0:
 231                     self.cache.set(cache_url, resp)
 232
 233             # If the request can expire, it means we should cache it
 234             # in the meantime.
 235             elif 'expires' in resp.headers:
 236                 if resp.headers['expires']:
 237                     self.cache.set(cache_url, resp)
 238
 239     def update_cached_response(self, request, response):
 240         """On a 304 we will get a new set of headers that we want to
 241         update our cached value with, assuming we have one.
 242
 243         This should only ever be called when we've sent an ETag and
 244         gotten a 304 as the response.
 245         """
 246         cache_url = self.cache_url(request.url)
 247
 248         resp = self.cache.get(cache_url)
 249
 250         if not resp:
 251             # we didn't have a cached response
 252             return response
 253
 254         # did so lets update our headers
 255         resp.headers.update(response.headers)
 256
 257         # we want a 200 b/c we have content via the cache
 258         request.status_code = 200
 259
 260         # update the request as it has the if-none-match header + any
 261         # other headers that the server might have updated (ie Date,
 262         # Cache-Control, Expires, etc.)
 263         resp.request = request
 264
 265         # update our cache
 266         self.cache.set(cache_url, resp)
 267
 268         # Let everyone know this was from the cache.
 269         resp.from_cache = True
 270
 271         return resp
 272
 273
 274 class HttpClient:
 275     def __init__(self, cache=None, stats=None):
 276         """
 277         Prepare http request
 278         Use cached elements or proceed http request
 279         """
 280         self.stats = stats
 281         self.controller = CacheController(cache)
 282
 283     def __call__(self, ress, payload):
 284         req = Request('GET', ress, params=payload,).prepare()
 285         if self.stats:
 286             self.stats.update(total=self.stats.get('total')+1)
 287         cached_response = self.controller.cached_request(req)
 288         if cached_response:
 289             if self.stats:
 290                 self.stats.update(ccontrol=self.stats.get('ccontrol')+1)
 291             return cached_response
 292         try:
 293             return self.fetch_ws(req)
 294         except Timeout:
 295             raise WSTimeout('Failed to reach server within {0}s'.format(
 296                 SOCKET_TIMEOUT))
 297         except ConnectionError as err:
 298             raise WSError(err)
 299
 300     @Throttle(WAIT_BETWEEN_REQUESTS)
 301     def fetch_ws(self, prepreq):
 302         """fetch from web service"""
 303         sess = Session()
 304         settings = sess.merge_environment_settings(prepreq.url, {}, None, False, None)
 305         resp = sess.send(prepreq, timeout=SOCKET_TIMEOUT, **settings)
 306         if resp.status_code == 304:
 307             self.stats.update(etag=self.stats.get('etag')+1)
 308             resp = self.controller.update_cached_response(prepreq, resp)
 309         elif resp.status_code != 200:
 310             raise WSHTTPError('{0.status_code}: {0.reason}'.format(resp))
 311         ratelimit = resp.headers.get('x-ratelimit-remaining', None)
 312         if ratelimit and self.stats:
 313             minrl = min(int(ratelimit), self.stats.get('minrl'))
 314             self.stats.update(minrl=minrl)
 315         self.controller.cache_response(resp.request, resp)
 316         return resp
 317
 318 # VIM MODLINE
 319 # vim: ai ts=4 sw=4 sts=4 expandtab