# -*- coding: utf-8 -*-
#
# Copyright (c) 2014-2015 Jack Kaliko <kaliko@azylum.org>
# Copyright (c) 2012, 2013 Eric Larson <eric@ionrock.org>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""
The httplib2 algorithms ported for use with requests.
"""
import calendar
import email.utils
import re
import time

from requests import Session, Request, Timeout, ConnectionError

from sima import SOCKET_TIMEOUT, WAIT_BETWEEN_REQUESTS
from sima.utils.utils import WSError, WSTimeout, WSHTTPError, Throttle
from .cache import DictCache
36 URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")
40 """Parses a URI using the regex given in Appendix B of RFC 3986.
42 (scheme, authority, path, query, fragment) = parse_uri(uri)
44 groups = URI.match(uri).groups()
45 return (groups[1], groups[3], groups[4], groups[6], groups[8])
48 class CacheController(object):
49 """An interface to see if request should cached or not.
53 def __init__(self, cache=None, cache_etags=True):
54 self.cache = cache or DictCache()
55 self.cache_etags = cache_etags
57 def _urlnorm(self, uri):
58 """Normalize the URL to create a safe key for the cache"""
59 (scheme, authority, path, query, _) = parse_uri(uri)
60 if not scheme or not authority:
61 raise Exception("Only absolute URIs are allowed. uri = %s" % uri)
62 authority = authority.lower()
63 scheme = scheme.lower()
67 # Could do syntax based normalization of the URI before
68 # computing the digest. See Section 6.2.2 of Std 66.
69 request_uri = query and "?".join([path, query]) or path
70 scheme = scheme.lower()
71 defrag_uri = scheme + "://" + authority + request_uri
75 def cache_url(self, uri):
76 return self._urlnorm(uri)
78 def parse_cache_control(self, headers):
80 Parse the cache control headers returning a dictionary with values
81 for the different directives.
85 # requests provides a CaseInsensitiveDict as headers
86 cc_header = 'cache-control'
87 if cc_header in headers:
88 parts = headers[cc_header].split(',')
90 tuple([x.strip().lower() for x in part.split("=", 1)])
91 for part in parts if -1 != part.find("=")]
92 parts_wo_args = [(name.strip().lower(), 1)
93 for name in parts if -1 == name.find("=")]
94 retval = dict(parts_with_args + parts_wo_args)
97 def cached_request(self, request):
98 """Return the cached resquest if available and fresh
100 cache_url = self.cache_url(request.url)
101 cc = self.parse_cache_control(request.headers)
104 no_cache = True if 'no-cache' in cc else False
105 if 'max-age' in cc and cc['max-age'] == 0:
107 # see if it is in the cache anyways
108 in_cache = self.cache.get(cache_url)
109 if no_cache or not in_cache:
112 # It is in the cache, so lets see if it is going to be
114 resp = self.cache.get(cache_url)
116 # Check our Vary header to make sure our request headers match
117 # up. We don't delete it from the though, we just don't return
120 # NOTE: Because httplib2 stores raw content, it denotes
121 # headers that were sent in the original response by
122 # adding -varied-$name. We don't have to do that b/c we
123 # are storing the object which has a reference to the
124 # original request. If that changes, then I'd propose
125 # using the varied headers in the cache key to avoid the
126 # situation all together.
127 if 'vary' in resp.headers:
128 varied_headers = resp.headers['vary'].replace(' ', '').split(',')
129 original_headers = resp.request.headers
130 for header in varied_headers:
131 # If our headers don't match for the headers listed in
132 # the vary header, then don't use the cached response
133 if request.headers.get(header, None) != original_headers.get(header):
137 date = calendar.timegm(
138 email.utils.parsedate_tz(resp.headers['date'])
140 current_age = max(0, now - date)
142 # TODO: There is an assumption that the result will be a
143 # requests response object. This may not be best since we
144 # could probably avoid instantiating or constructing the
145 # response until we know we need it.
146 resp_cc = self.parse_cache_control(resp.headers)
148 # determine freshness
149 freshness_lifetime = 0
150 if 'max-age' in resp_cc and resp_cc['max-age'].isdigit():
151 freshness_lifetime = int(resp_cc['max-age'])
152 elif 'expires' in resp.headers:
153 expires = email.utils.parsedate_tz(resp.headers['expires'])
154 if expires is not None:
155 expire_time = calendar.timegm(expires) - date
156 freshness_lifetime = max(0, expire_time)
158 # determine if we are setting freshness limit in the req
161 freshness_lifetime = int(cc['max-age'])
163 freshness_lifetime = 0
165 if 'min-fresh' in cc:
167 min_fresh = int(cc['min-fresh'])
170 # adjust our current age by our min fresh
171 current_age += min_fresh
173 # see how fresh we actually are
174 fresh = (freshness_lifetime > current_age)
177 # make sure we set the from_cache to true
178 resp.from_cache = True
181 # we're not fresh. If we don't have an Etag, clear it out
182 if 'etag' not in resp.headers:
183 self.cache.delete(cache_url)
185 if 'etag' in resp.headers:
186 request.headers['If-None-Match'] = resp.headers['ETag']
188 if 'last-modified' in resp.headers:
189 request.headers['If-Modified-Since'] = resp.headers['Last-Modified']
191 # return the original handler
194 def cache_response(self, request, resp):
196 Algorithm for caching requests.
198 This assumes a requests Response object.
200 # From httplib2: Don't cache 206's since we aren't going to
201 # handle byte range requests
202 if resp.status_code not in [200, 203]:
205 cc_req = self.parse_cache_control(request.headers)
206 cc_resp = self.parse_cache_control(resp.headers)
208 cache_url = self.cache_url(request.url)
210 # Delete it from the cache if we happen to have it stored there
211 no_store = cc_resp.get('no-store') or cc_req.get('no-store')
212 if no_store and self.cache.get(cache_url):
213 self.cache.delete(cache_url)
215 # If we've been given an etag, then keep the response
216 if self.cache_etags and 'etag' in resp.headers:
217 self.cache.set(cache_url, resp)
219 # Add to the cache if the response headers demand it. If there
220 # is no date header then we can't do anything about expiring
222 elif 'date' in resp.headers:
223 # cache when there is a max-age > 0
224 if cc_resp and cc_resp.get('max-age'):
225 if int(cc_resp['max-age']) > 0:
226 self.cache.set(cache_url, resp)
228 # If the request can expire, it means we should cache it
230 elif 'expires' in resp.headers:
231 if resp.headers['expires']:
232 self.cache.set(cache_url, resp)
233 # Force one month max age if no Cache-Control header is found
234 # Overriding header disappearance on LastFM web service...
235 # https://getsatisfaction.com/lastfm/topics/-web-api-http-cache-control-header
236 elif CacheController.CACHE_ANYWAY:
237 resp.headers['Cache-Control'] = 'max-age=2419200'
238 self.cache.set(cache_url, resp)
240 def update_cached_response(self, request, response):
241 """On a 304 we will get a new set of headers that we want to
242 update our cached value with, assuming we have one.
244 This should only ever be called when we've sent an ETag and
245 gotten a 304 as the response.
247 cache_url = self.cache_url(request.url)
249 resp = self.cache.get(cache_url)
252 # we didn't have a cached response
255 # did so lets update our headers
256 resp.headers.update(response.headers)
258 # we want a 200 b/c we have content via the cache
259 request.status_code = 200
261 # update the request as it has the if-none-match header + any
262 # other headers that the server might have updated (ie Date,
263 # Cache-Control, Expires, etc.)
264 resp.request = request
267 self.cache.set(cache_url, resp)
269 # Let everyone know this was from the cache.
270 resp.from_cache = True
276 def __init__(self, cache=None, stats=None):
279 Use cached elements or proceed http request
282 self.controller = CacheController(cache)
284 def __call__(self, ress, payload):
285 req = Request('GET', ress, params=payload,).prepare()
287 self.stats.update(total=self.stats.get('total')+1)
288 cached_response = self.controller.cached_request(req)
291 self.stats.update(ccontrol=self.stats.get('ccontrol')+1)
292 return cached_response
294 return self.fetch_ws(req)
296 raise WSTimeout('Failed to reach server within {0}s'.format(
298 except ConnectionError as err:
301 @Throttle(WAIT_BETWEEN_REQUESTS)
302 def fetch_ws(self, prepreq):
303 """fetch from web service"""
305 settings = sess.merge_environment_settings(prepreq.url, {}, None, False, None)
306 resp = sess.send(prepreq, timeout=SOCKET_TIMEOUT, **settings)
307 if resp.status_code == 304:
308 self.stats.update(etag=self.stats.get('etag')+1)
309 resp = self.controller.update_cached_response(prepreq, resp)
310 elif resp.status_code != 200:
311 raise WSHTTPError('{0.status_code}: {0.reason}'.format(resp))
312 self.controller.cache_response(resp.request, resp)
316 # vim: ai ts=4 sw=4 sts=4 expandtab