# -*- coding: utf-8 -*-
# Copyright (c) 2014 Jack Kaliko <kaliko@azylum.org>
# Copyright (c) 2012, 2013 Eric Larson <eric@ionrock.org>
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
The httplib2 algorithms ported for use with requests.
"""
import calendar
import email.utils
import re
import time

from .cache import DictCache
# Generic URI splitter: the regular expression from RFC 3986, Appendix B.
URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")


def parse_uri(uri):
    """Parses a URI using the regex given in Appendix B of RFC 3986.

        (scheme, authority, path, query, fragment) = parse_uri(uri)

    Components that are absent from ``uri`` are returned as ``None``,
    except ``path`` which is always a string (possibly empty) because
    its group in the pattern is not optional.  No validation or
    normalisation is performed; the values are returned exactly as they
    appear in ``uri``.
    """
    # Every part of the pattern is optional, so it matches any string and
    # ``match`` never returns None here.
    groups = URI.match(uri).groups()
    # Odd-numbered groups include the delimiters ("http:", "//host",
    # "?query", "#frag"); only the bare components are returned.
    return (groups[1], groups[3], groups[4], groups[6], groups[8])
class CacheController(object):
    """An interface to decide whether a request should be cached or not.

    The controller stores response objects in ``self.cache`` (any object
    offering ``get(key)``, ``set(key, value)`` and ``delete(key)``) keyed
    by the normalised request URL, and applies the Cache-Control, Expires,
    Vary and ETag rules to decide when a stored response may be reused.
    """

    def __init__(self, cache=None, cache_etags=True):
        """Create a controller.

        :param cache: cache backend to use; a new ``DictCache`` is created
            when ``None`` is given.
        :param cache_etags: when true, any response carrying an ETag is
            stored so that it can be revalidated later.
        """
        # Compare against None explicitly: an *empty* cache object may be
        # falsy and must not be silently replaced by a fresh DictCache.
        self.cache = DictCache() if cache is None else cache
        self.cache_etags = cache_etags

    @staticmethod
    def _as_int(value, default=None):
        """Return ``value`` converted to ``int``, or ``default`` if it is not numeric."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return default

    def _urlnorm(self, uri):
        """Normalize the URL to create a safe key for the cache.

        The scheme and authority are lower-cased, an empty path becomes
        "/" and the fragment is dropped.

        :raises Exception: if ``uri`` has no scheme or no authority.
        """
        (scheme, authority, path, query, _) = parse_uri(uri)
        if not scheme or not authority:
            raise Exception("Only absolute URIs are allowed. uri = %s" % uri)
        scheme = scheme.lower()
        authority = authority.lower()
        # "http://host" and "http://host/" name the same resource
        # (RFC 3986, section 6.2.3), so they must share one cache key.
        if not path:
            path = "/"

        # Could do syntax based normalization of the URI before
        # computing the digest. See Section 6.2.2 of Std 66.
        request_uri = "?".join([path, query]) if query else path
        return scheme + "://" + authority + request_uri

    def cache_url(self, uri):
        """Return the cache key used for ``uri`` (see ``_urlnorm``)."""
        return self._urlnorm(uri)

    def parse_cache_control(self, headers):
        """
        Parse the cache control headers returning a dictionary with values
        for the different directives.

        Directive names and values are stripped and lower-cased.  A
        directive with an argument (``max-age=60``) maps to its argument
        as a *string*; a bare directive (``no-cache``) maps to ``1``.
        An empty dictionary is returned when there is no Cache-Control
        header.
        """
        with_args = {}
        without_args = {}

        # requests provides a CaseInsensitiveDict as headers
        cc_header = 'cache-control'
        if cc_header in headers:
            for part in headers[cc_header].split(','):
                name, has_arg, arg = part.partition('=')
                name = name.strip().lower()
                if not name:
                    # stray comma ("max-age=60,") -- not a directive
                    continue
                if has_arg:
                    with_args[name] = arg.strip().lower()
                else:
                    without_args[name] = 1

        # A bare directive wins over a same-named directive with argument.
        with_args.update(without_args)
        return with_args

    def _is_fresh(self, resp, cc):
        """Return True if the cached ``resp`` is fresh for request directives ``cc``."""
        raw_date = resp.headers.get('date')
        parsed_date = email.utils.parsedate_tz(raw_date) if raw_date else None
        if parsed_date is None:
            # Without a usable Date header the age cannot be computed, so
            # the response can never be considered fresh.
            return False

        date = calendar.timegm(parsed_date)
        current_age = max(0, time.time() - date)

        # TODO: There is an assumption that the result will be a
        # requests response object. This may not be best since we
        # could probably avoid instantiating or constructing the
        # response until we know we need it.
        resp_cc = self.parse_cache_control(resp.headers)

        # determine freshness
        freshness_lifetime = 0
        resp_max_age = resp_cc.get('max-age')
        if resp_max_age is not None and str(resp_max_age).isdigit():
            freshness_lifetime = self._as_int(resp_max_age, 0)
        elif 'expires' in resp.headers:
            expires = email.utils.parsedate_tz(resp.headers['expires'])
            if expires is not None:
                expire_time = calendar.timegm(expires) - date
                freshness_lifetime = max(0, expire_time)

        # determine if we are setting freshness limit in the req
        if 'max-age' in cc:
            freshness_lifetime = self._as_int(cc['max-age'], 0)

        if 'min-fresh' in cc:
            # adjust our current age by our min fresh
            current_age += self._as_int(cc['min-fresh'], 0)

        # see how fresh we actually are
        return freshness_lifetime > current_age

    def cached_request(self, url, headers):
        """Return the cached response if available and fresh.

        :param url: absolute URL of the request.
        :param headers: request headers.  When a stale entry is found,
            ``If-None-Match`` / ``If-Modified-Since`` are added to this
            mapping so the caller can revalidate.
        :returns: the cached response (with ``from_cache`` set to True)
            or ``False`` when the request has to go to the network.
        """
        cache_url = self.cache_url(url)
        cc = self.parse_cache_control(headers)

        # non-caching states
        no_cache = 'no-cache' in cc
        # Directive values are parsed as strings, so compare numerically.
        if 'max-age' in cc and self._as_int(cc['max-age']) == 0:
            no_cache = True

        # see if it is in the cache anyways
        resp = self.cache.get(cache_url)
        if no_cache or not resp:
            return False

        # Check our Vary header to make sure our request headers match
        # up. We don't delete it from the cache though, we just don't
        # return our cached value.
        #
        # NOTE: Because httplib2 stores raw content, it denotes
        #       headers that were sent in the original response by
        #       adding -varied-$name. We don't have to do that b/c we
        #       are storing the object which has a reference to the
        #       original request. If that changes, then I'd propose
        #       using the varied headers in the cache key to avoid the
        #       situation all together.
        if 'vary' in resp.headers:
            varied_headers = resp.headers['vary'].replace(' ', '').split(',')
            original_headers = resp.request.headers
            for header in varied_headers:
                # "Vary: *" never matches any request.
                if header == '*':
                    return False
                # If our headers don't match for the headers listed in
                # the vary header, then don't use the cached response
                if headers.get(header, None) != original_headers.get(header):
                    return False

        if self._is_fresh(resp, cc):
            # make sure we set the from_cache to true
            resp.from_cache = True
            return resp

        # we're not fresh. If we don't have an Etag, clear it out;
        # otherwise ask the server to revalidate it.
        if 'etag' in resp.headers:
            headers['If-None-Match'] = resp.headers['ETag']
        else:
            self.cache.delete(cache_url)

        if 'last-modified' in resp.headers:
            headers['If-Modified-Since'] = resp.headers['Last-Modified']

        # nothing usable in the cache
        return False

    def add_headers(self, url):
        """Return the conditional headers for ``url``.

        :returns: ``{'If-None-Match': <etag>}`` when a response with an
            ETag is cached under ``url``, otherwise an empty dict.
        """
        # NOTE(review): ``url`` is used as the cache key as-is, whereas the
        # other methods normalise it with ``cache_url`` first -- confirm
        # callers pass an already normalised URL.
        resp = self.cache.get(url)
        if resp and 'etag' in resp.headers:
            return {'If-None-Match': resp.headers['etag']}
        return {}

    def cache_response(self, request, resp):
        """
        Algorithm for caching requests.

        This assumes a requests Response object.

        The response is stored when it has status 200 or 203, neither
        side sent ``no-store``, and either it carries an ETag (with
        ``cache_etags`` enabled) or it has a Date header together with a
        positive ``max-age`` or a non-empty Expires header.
        """
        # From httplib2: Don't cache 206's since we aren't going to
        # handle byte range requests
        if resp.status_code not in [200, 203]:
            return

        cc_req = self.parse_cache_control(request.headers)
        cc_resp = self.parse_cache_control(resp.headers)

        cache_url = self.cache_url(request.url)

        # Delete it from the cache if we happen to have it stored there,
        # and make sure a "no-store" exchange is never stored again below.
        no_store = cc_resp.get('no-store') or cc_req.get('no-store')
        if no_store:
            if self.cache.get(cache_url):
                self.cache.delete(cache_url)
            return

        # If we've been given an etag, then keep the response
        if self.cache_etags and 'etag' in resp.headers:
            self.cache.set(cache_url, resp)

        # Add to the cache if the response headers demand it. If there
        # is no date header then we can't do anything about expiring
        # the cache.
        elif 'date' in resp.headers:
            # cache when there is a max-age > 0; a malformed value is
            # treated as 0 instead of raising
            if cc_resp and cc_resp.get('max-age'):
                if self._as_int(cc_resp['max-age'], 0) > 0:
                    self.cache.set(cache_url, resp)

            # If the request can expire, it means we should cache it
            # in the meantime.
            elif 'expires' in resp.headers:
                if resp.headers['expires']:
                    self.cache.set(cache_url, resp)

    def update_cached_response(self, request, response):
        """On a 304 we will get a new set of headers that we want to
        update our cached value with, assuming we have one.

        This should only ever be called when we've sent an ETag and
        gotten a 304 as the response.

        :returns: the refreshed cached response (``from_cache`` set to
            True), or ``response`` unchanged when nothing is cached for
            the request URL.
        """
        cache_url = self.cache_url(request.url)

        resp = self.cache.get(cache_url)

        if not resp:
            # we didn't have a cached response
            return response

        # did so lets update our headers
        resp.headers.update(response.headers)

        # we want a 200 b/c we have content via the cache
        # NOTE(review): this sets the attribute on the *request*; the
        # comment suggests the response was meant -- confirm before
        # changing, since a cached 203 would then be rewritten to 200.
        request.status_code = 200

        # update the request as it has the if-none-match header + any
        # other headers that the server might have updated (ie Date,
        # Cache-Control, Expires, etc.)
        resp.request = request

        # update our cache
        self.cache.set(cache_url, resp)

        # Let everyone know this was from the cache.
        resp.from_cache = True

        return resp