# -*- coding: utf-8 -*-
# Copyright (c) 2014-2015 Jack Kaliko <kaliko@azylum.org>
# Copyright (c) 2012, 2013 Eric Larson <eric@ionrock.org>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
The httplib2 algorithms ported for use with requests.
"""

import calendar
import email.utils
import re
import time

from requests import Session, Request, Timeout, ConnectionError

from sima import SOCKET_TIMEOUT, WAIT_BETWEEN_REQUESTS
from sima.utils.utils import WSError, WSTimeout, WSHTTPError, Throttle
from .cache import DictCache


URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")
40 """Parses a URI using the regex given in Appendix B of RFC 3986.
42 (scheme, authority, path, query, fragment) = parse_uri(uri)
44 groups = URI.match(uri).groups()
45 return (groups[1], groups[3], groups[4], groups[6], groups[8])
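
# Illustrative example (hypothetical URI, not from the original source):
#   parse_uri('http://example.com/path?q=1')
#   -> ('http', 'example.com', '/path', 'q=1', None)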


class CacheController(object):
    """An interface to see if a request should be cached or not.
    """
    def __init__(self, cache=None, cache_etags=True):
        self.cache = cache or DictCache()
        self.cache_etags = cache_etags

    def _urlnorm(self, uri):
        """Normalize the URL to create a safe key for the cache"""
        (scheme, authority, path, query, _) = parse_uri(uri)
        if not scheme or not authority:
            raise Exception("Only absolute URIs are allowed. uri = %s" % uri)
        authority = authority.lower()
        scheme = scheme.lower()
        if not path:
            path = "/"

        # Could do syntax based normalization of the URI before
        # computing the digest. See Section 6.2.2 of Std 66.
        request_uri = query and "?".join([path, query]) or path
        defrag_uri = scheme + "://" + authority + request_uri

        return defrag_uri

    def cache_url(self, uri):
        return self._urlnorm(uri)
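
    # Illustrative cache-key normalization (hypothetical URI):
    #   cache_url('HTTP://Example.COM/path?q=1#frag')
    #   -> 'http://example.com/path?q=1'   (case folded, fragment dropped)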

    def parse_cache_control(self, headers):
        """
        Parse the cache control headers returning a dictionary with values
        for the different directives.
        """
        retval = {}

        # requests provides a CaseInsensitiveDict as headers
        cc_header = 'cache-control'
        if cc_header in headers:
            parts = headers[cc_header].split(',')
            parts_with_args = [
                tuple([x.strip().lower() for x in part.split("=", 1)])
                for part in parts if -1 != part.find("=")]
            parts_wo_args = [(name.strip().lower(), 1)
                             for name in parts if -1 == name.find("=")]
            retval = dict(parts_with_args + parts_wo_args)
        return retval
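
    # Illustrative parse (hypothetical header value): 'max-age=300, no-cache'
    # parses to {'max-age': '300', 'no-cache': 1} -- directive values stay
    # strings, while bare directives are mapped to 1.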

    def cached_request(self, request):
        """Return the cached request if available and fresh
        """
        cache_url = self.cache_url(request.url)
        cc = self.parse_cache_control(request.headers)

        # non-caching states
        no_cache = 'no-cache' in cc
        # parse_cache_control() keeps directive values as strings,
        # so compare max-age against '0' rather than the integer 0
        if 'max-age' in cc and cc['max-age'] == '0':
            no_cache = True

        # see if it is in the cache anyways
        in_cache = self.cache.get(cache_url)
        if no_cache or not in_cache:
            return False

        # It is in the cache, so lets see if it is going to be
        # fresh enough
        resp = self.cache.get(cache_url)

        # Check our Vary header to make sure our request headers match
        # up. We don't delete it from the cache though, we just don't
        # return our cached value.
        #
        # NOTE: Because httplib2 stores raw content, it denotes
        #       headers that were sent in the original response by
        #       adding -varied-$name. We don't have to do that b/c we
        #       are storing the object which has a reference to the
        #       original request. If that changes, then I'd propose
        #       using the varied headers in the cache key to avoid the
        #       situation all together.
        if 'vary' in resp.headers:
            varied_headers = resp.headers['vary'].replace(' ', '').split(',')
            original_headers = resp.request.headers
            for header in varied_headers:
                # If our headers don't match for the headers listed in
                # the vary header, then don't use the cached response
                if request.headers.get(header, None) != original_headers.get(header):
                    return False

        now = time.time()
        date = calendar.timegm(
            email.utils.parsedate_tz(resp.headers['date'])
        )
        current_age = max(0, now - date)

        # TODO: There is an assumption that the result will be a
        # requests response object. This may not be best since we
        # could probably avoid instantiating or constructing the
        # response until we know we need it.
        resp_cc = self.parse_cache_control(resp.headers)

        # determine freshness
        freshness_lifetime = 0
        if 'max-age' in resp_cc and resp_cc['max-age'].isdigit():
            freshness_lifetime = int(resp_cc['max-age'])
        elif 'expires' in resp.headers:
            expires = email.utils.parsedate_tz(resp.headers['expires'])
            if expires is not None:
                expire_time = calendar.timegm(expires) - date
                freshness_lifetime = max(0, expire_time)

        # determine if we are setting freshness limit in the req
        if 'max-age' in cc:
            try:
                freshness_lifetime = int(cc['max-age'])
            except ValueError:
                freshness_lifetime = 0

        if 'min-fresh' in cc:
            try:
                min_fresh = int(cc['min-fresh'])
            except ValueError:
                min_fresh = 0
            # adjust our current age by our min fresh
            current_age += min_fresh
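
        # Freshness illustration (hypothetical numbers): a response dated
        # 120s ago carrying 'max-age=300' gives freshness_lifetime=300 and
        # current_age=120, so the comparison below deems it fresh.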
        # see how fresh we actually are
        fresh = (freshness_lifetime > current_age)

        if fresh:
            # make sure we set the from_cache to true
            resp.from_cache = True
            return resp

        # we're not fresh. If we don't have an Etag, clear it out
        if 'etag' not in resp.headers:
            self.cache.delete(cache_url)

        if 'etag' in resp.headers:
            request.headers['If-None-Match'] = resp.headers['ETag']

        if 'last-modified' in resp.headers:
            request.headers['If-Modified-Since'] = resp.headers['Last-Modified']

        # return the original handler
        return False

    def add_headers(self, url):
        resp = self.cache.get(url)
        if resp and 'etag' in resp.headers:
            return {'If-None-Match': resp.headers['etag']}
        return {}
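
    # Illustrative: with a cached response carrying ETag '"abc123"'
    # (hypothetical value), add_headers(url) returns
    # {'If-None-Match': '"abc123"'}; with no cached entry it returns {}.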

    def cache_response(self, request, resp):
        """
        Algorithm for caching requests.

        This assumes a requests Response object.
        """
        # From httplib2: Don't cache 206's since we aren't going to
        # handle byte range requests
        if resp.status_code not in [200, 203]:
            return

        cc_req = self.parse_cache_control(request.headers)
        cc_resp = self.parse_cache_control(resp.headers)

        cache_url = self.cache_url(request.url)

        # Delete it from the cache if we happen to have it stored there
        no_store = cc_resp.get('no-store') or cc_req.get('no-store')
        if no_store and self.cache.get(cache_url):
            self.cache.delete(cache_url)

        # If we've been given an etag, then keep the response
        if self.cache_etags and 'etag' in resp.headers:
            self.cache.set(cache_url, resp)

        # Add to the cache if the response headers demand it. If there
        # is no date header then we can't do anything about expiring
        # the cache.
        elif 'date' in resp.headers:
            # cache when there is a max-age > 0
            if cc_resp and cc_resp.get('max-age'):
                if cc_resp['max-age'].isdigit() and int(cc_resp['max-age']) > 0:
                    self.cache.set(cache_url, resp)

            # If the request can expire, it means we should cache it
            # in the meantime.
            elif 'expires' in resp.headers:
                if resp.headers['expires']:
                    self.cache.set(cache_url, resp)
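
        # Decision order above: ETag-bearing responses are always kept when
        # cache_etags is set; otherwise a dated response is kept on a
        # positive max-age, or failing that on an Expires header.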

    def update_cached_response(self, request, response):
        """On a 304 we will get a new set of headers that we want to
        update our cached value with, assuming we have one.

        This should only ever be called when we've sent an ETag and
        gotten a 304 as the response.
        """
        cache_url = self.cache_url(request.url)

        resp = self.cache.get(cache_url)

        if not resp:
            # we didn't have a cached response
            return response

        # did so lets update our headers
        resp.headers.update(response.headers)

        # we want a 200 b/c we have content via the cache
        resp.status_code = 200

        # update the request as it has the if-none-match header + any
        # other headers that the server might have updated (ie Date,
        # Cache-Control, Expires, etc.)
        resp.request = request

        # update our cache
        self.cache.set(cache_url, resp)

        # Let everyone know this was from the cache.
        resp.from_cache = True

        return resp


class HttpClient:
    def __init__(self, cache=None, stats=None):
        """
        Prepare the HTTP request;
        use cached elements or proceed with the HTTP request.
        """
        self.stats = stats
        self.controller = CacheController(cache)

    def __call__(self, ress, payload):
        req = Request('GET', ress, params=payload).prepare()
        if self.stats:
            self.stats.update(total=self.stats.get('total')+1)
        cached_response = self.controller.cached_request(req)
        if cached_response:
            if self.stats:
                self.stats.update(ccontrol=self.stats.get('ccontrol')+1)
            return cached_response
        try:
            return self.fetch_ws(req)
        except Timeout:
            raise WSTimeout('Failed to reach server within {0}s'.format(
                SOCKET_TIMEOUT))
        except ConnectionError as err:
            raise WSError(err)
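
    # Illustrative usage (hypothetical endpoint and stats dict):
    #   stats = {'total': 0, 'ccontrol': 0, 'etag': 0, 'minrl': 120}
    #   fetch = HttpClient(cache=DictCache(), stats=stats)
    #   resp = fetch('http://ws.example.org/2.0/', {'method': 'artist.getinfo'})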

    @Throttle(WAIT_BETWEEN_REQUESTS)
    def fetch_ws(self, prepreq):
        """fetch from web service"""
        sess = Session()
        settings = sess.merge_environment_settings(prepreq.url, {}, None, False, None)
        resp = sess.send(prepreq, timeout=SOCKET_TIMEOUT, **settings)
        if resp.status_code == 304:
            if self.stats:
                self.stats.update(etag=self.stats.get('etag')+1)
            resp = self.controller.update_cached_response(prepreq, resp)
        elif resp.status_code != 200:
            raise WSHTTPError('{0.status_code}: {0.reason}'.format(resp))
        ratelimit = resp.headers.get('x-ratelimit-remaining', None)
        if ratelimit and self.stats:
            minrl = min(int(ratelimit), self.stats.get('minrl'))
            self.stats.update(minrl=minrl)
        self.controller.cache_response(resp.request, resp)
        return resp

# vim: ai ts=4 sw=4 sts=4 expandtab