1 # -*- coding: utf-8 -*-
3 # Copyright (c) 2014 Jack Kaliko <kaliko@azylum.org>
4 # Copyright (c) 2012, 2013 Eric Larson <eric@ionrock.org>
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
The httplib2 algorithms ported for use with requests.
"""
import calendar
import email.utils
import re
import time

from .cache import DictCache
# Regex from Appendix B of RFC 3986; groups capture scheme, authority,
# path, query and fragment of a URI.
URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")


def parse_uri(uri):
    """Parses a URI using the regex given in Appendix B of RFC 3986.

        (scheme, authority, path, query, fragment) = parse_uri(uri)

    Components that are absent from *uri* come back as ``None``.
    """
    groups = URI.match(uri).groups()
    # groups 2/4/5/7/9 are the unwrapped component values (the odd
    # groups hold the delimiters such as "//" and "?").
    return (groups[1], groups[3], groups[4], groups[6], groups[8])
class CacheController(object):
    """An interface to see if request should cached or not."""

    def __init__(self, cache=None, cache_etags=True):
        """
        :param cache: storage object providing ``get``/``set``/``delete``;
            defaults to an in-memory :class:`DictCache`.
        :param cache_etags: when True, any response carrying an ETag is
            cached regardless of freshness headers.
        """
        self.cache = cache or DictCache()
        self.cache_etags = cache_etags

    def _urlnorm(self, uri):
        """Normalize the URL to create a safe key for the cache"""
        (scheme, authority, path, query, _) = parse_uri(uri)
        if not scheme or not authority:
            raise Exception("Only absolute URIs are allowed. uri = %s" % uri)
        authority = authority.lower()
        scheme = scheme.lower()
        if not path:
            path = "/"

        # Order of params might have changed between requests: sort them
        # so equivalent URLs normalize to the same cache key.
        # NOTE(fix): join with '&' — joining with '' could make distinct
        # queries collide in the cache key.
        query = '&'.join(sorted(query.split('&')))

        # Could do syntax based normalization of the URI before
        # computing the digest. See Section 6.2.2 of Std 66.
        request_uri = query and "?".join([path, query]) or path
        defrag_uri = scheme + "://" + authority + request_uri

        return defrag_uri

    def cache_url(self, uri):
        """Return the normalized cache key for *uri*."""
        return self._urlnorm(uri)

    def parse_cache_control(self, headers):
        """
        Parse the cache control headers returning a dictionary with values
        for the different directives.

        Directives with an argument map to their (string) value, bare
        directives map to 1.
        """
        retval = {}

        # requests provides a CaseInsensitiveDict as headers
        cc_header = 'cache-control'
        if cc_header in headers:
            parts = headers[cc_header].split(',')
            parts_with_args = [
                tuple([x.strip().lower() for x in part.split("=", 1)])
                for part in parts if -1 != part.find("=")]
            parts_wo_args = [(name.strip().lower(), 1)
                             for name in parts if -1 == name.find("=")]
            retval = dict(parts_with_args + parts_wo_args)

        return retval

    def cached_request(self, url, headers):
        """Return the cached response if available and fresh, else False.

        May mutate *headers*: when the cached entry is stale but can be
        revalidated, conditional headers (If-None-Match /
        If-Modified-Since) are added for the caller to send.
        """
        cache_url = self.cache_url(url)
        cc = self.parse_cache_control(headers)

        # non-caching states
        no_cache = True if 'no-cache' in cc else False
        if 'max-age' in cc:
            # NOTE(fix): directive values are strings; comparing against
            # the int 0 could never be true.
            try:
                if int(cc['max-age']) == 0:
                    no_cache = True
            except ValueError:
                pass

        # see if it is in the cache anyways
        in_cache = self.cache.get(cache_url)
        if no_cache or not in_cache:
            return False

        # It is in the cache, so lets see if it is going to be
        # fresh enough
        resp = self.cache.get(cache_url)

        # Check our Vary header to make sure our request headers match
        # up. We don't delete it from the cache though, we just don't
        # return our cached value.
        #
        # NOTE: Because httplib2 stores raw content, it denotes
        #       headers that were sent in the original response by
        #       adding -varied-$name. We don't have to do that b/c we
        #       are storing the object which has a reference to the
        #       original request. If that changes, then I'd propose
        #       using the varied headers in the cache key to avoid the
        #       situation all together.
        if 'vary' in resp.headers:
            varied_headers = resp.headers['vary'].replace(' ', '').split(',')
            original_headers = resp.request.headers
            for header in varied_headers:
                # If our headers don't match for the headers listed in
                # the vary header, then don't use the cached response
                if headers.get(header, None) != original_headers.get(header):
                    return False

        now = time.time()
        date = calendar.timegm(
            email.utils.parsedate_tz(resp.headers['date'])
        )
        current_age = max(0, now - date)

        # TODO: There is an assumption that the result will be a
        #       requests response object. This may not be best since we
        #       could probably avoid instantiating or constructing the
        #       response until we know we need it.
        resp_cc = self.parse_cache_control(resp.headers)

        # determine freshness
        freshness_lifetime = 0
        if 'max-age' in resp_cc and resp_cc['max-age'].isdigit():
            freshness_lifetime = int(resp_cc['max-age'])
        elif 'expires' in resp.headers:
            expires = email.utils.parsedate_tz(resp.headers['expires'])
            if expires is not None:
                expire_time = calendar.timegm(expires) - date
                freshness_lifetime = max(0, expire_time)

        # determine if we are setting freshness limit in the req
        if 'max-age' in cc:
            try:
                freshness_lifetime = int(cc['max-age'])
            except ValueError:
                freshness_lifetime = 0

        if 'min-fresh' in cc:
            try:
                min_fresh = int(cc['min-fresh'])
            except ValueError:
                min_fresh = 0

            # adjust our current age by our min fresh
            current_age += min_fresh

        # see how fresh we actually are
        fresh = (freshness_lifetime > current_age)

        if fresh:
            # make sure we set the from_cache to true
            resp.from_cache = True
            return resp

        # we're not fresh. If we don't have an Etag, clear it out
        if 'etag' not in resp.headers:
            self.cache.delete(cache_url)

        # stale but revalidatable: hand the caller conditional headers
        if 'etag' in resp.headers:
            headers['If-None-Match'] = resp.headers['ETag']

        if 'last-modified' in resp.headers:
            headers['If-Modified-Since'] = resp.headers['Last-Modified']

        # return the original handler
        return False

    def add_headers(self, url):
        """Return an If-None-Match header dict for *url* when we hold a
        cached response with an ETag, else an empty dict.

        NOTE(review): *url* is used as the raw cache key here (no
        ``cache_url`` normalization) — confirm callers pass a
        pre-normalized URL.
        """
        resp = self.cache.get(url)
        if resp and 'etag' in resp.headers:
            return {'If-None-Match': resp.headers['etag']}
        return {}

    def cache_response(self, request, resp):
        """
        Algorithm for caching requests.

        This assumes a requests Response object.
        """
        # From httplib2: Don't cache 206's since we aren't going to
        # handle byte range requests
        if resp.status_code not in [200, 203]:
            return

        cc_req = self.parse_cache_control(request.headers)
        cc_resp = self.parse_cache_control(resp.headers)

        cache_url = self.cache_url(request.url)

        # Delete it from the cache if we happen to have it stored there
        no_store = cc_resp.get('no-store') or cc_req.get('no-store')
        if no_store and self.cache.get(cache_url):
            self.cache.delete(cache_url)

        # If we've been given an etag, then keep the response
        if self.cache_etags and 'etag' in resp.headers:
            self.cache.set(cache_url, resp)

        # Add to the cache if the response headers demand it. If there
        # is no date header then we can't do anything about expiring
        # the cache.
        elif 'date' in resp.headers:
            # cache when there is a max-age > 0
            if cc_resp and cc_resp.get('max-age'):
                # NOTE(fix): guard against a malformed max-age value
                # instead of letting int() raise.
                try:
                    max_age = int(cc_resp['max-age'])
                except ValueError:
                    max_age = 0
                if max_age > 0:
                    self.cache.set(cache_url, resp)

            # If the request can expire, it means we should cache it
            # too.
            elif 'expires' in resp.headers:
                if resp.headers['expires']:
                    self.cache.set(cache_url, resp)

    def update_cached_response(self, request, response):
        """On a 304 we will get a new set of headers that we want to
        update our cached value with, assuming we have one.

        This should only ever be called when we've sent an ETag and
        gotten a 304 as the response.
        """
        cache_url = self.cache_url(request.url)

        resp = self.cache.get(cache_url)

        if not resp:
            # we didn't have a cached response
            return response

        # did so lets update our headers
        resp.headers.update(response.headers)

        # we want a 200 b/c we have content via the cache
        # NOTE(fix): set the status on the cached response; it was
        # previously (meaninglessly) set on the request object.
        resp.status_code = 200

        # update the request as it has the if-none-match header + any
        # other headers that the server might have updated (ie Date,
        # Cache-Control, Expires, etc.)
        resp.request = request

        # update our cache
        self.cache.set(cache_url, resp)

        # Let everyone know this was from the cache.
        resp.from_cache = True

        return resp