X-Git-Url: https://git.kaliko.me/?a=blobdiff_plain;f=sima%2Flib%2Fhttp.py;h=5014516e82cc228b06b421d2372a6f04eeb95f62;hb=HEAD;hp=1040c2a98f949daae3a9e15261136b3e77a0df6a;hpb=71500abd7ef16784d027a8a20aa28b06e8a13a4f;p=mpd-sima.git diff --git a/sima/lib/http.py b/sima/lib/http.py index 1040c2a..5014516 100644 --- a/sima/lib/http.py +++ b/sima/lib/http.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright (c) 2014 Jack Kaliko +# Copyright (c) 2014-2015, 2020, 2021 kaliko # Copyright (c) 2012, 2013 Eric Larson # # This program is free software: you can redistribute it and/or modify @@ -26,6 +26,10 @@ import time import email.utils +from requests import Session, Request, Timeout, ConnectionError as HTTPConnectionError + +from sima import SOCKET_TIMEOUT, WAIT_BETWEEN_REQUESTS +from sima.utils.utils import WSError, WSTimeout, WSHTTPError, Throttle from .cache import DictCache @@ -41,9 +45,11 @@ def parse_uri(uri): return (groups[1], groups[3], groups[4], groups[6], groups[8]) -class CacheController(object): +class CacheController: """An interface to see if request should cached or not. """ + CACHE_ANYWAY = False + def __init__(self, cache=None, cache_etags=True): self.cache = cache or DictCache() self.cache_etags = cache_etags @@ -52,7 +58,7 @@ class CacheController(object): """Normalize the URL to create a safe key for the cache""" (scheme, authority, path, query, _) = parse_uri(uri) if not scheme or not authority: - raise Exception("Only absolute URIs are allowed. uri = %s" % uri) + raise Exception(f'Only absolute URIs are allowed. uri = {uri}') authority = authority.lower() scheme = scheme.lower() if not path: @@ -60,7 +66,7 @@ class CacheController(object): # Could do syntax based normalization of the URI before # computing the digest. See Section 6.2.2 of Std 66. - request_uri = query and "?".join([path, query]) or path + request_uri = "?".join([path, query]) if query else path scheme = scheme.lower() defrag_uri = scheme + "://" + authority + request_uri @@ -88,17 +94,16 @@ class CacheController(object): retval = dict(parts_with_args + parts_wo_args) return retval - def cached_request(self, url, headers): + def cached_request(self, request): """Return the cached resquest if available and fresh """ - cache_url = self.cache_url(url) - cc = self.parse_cache_control(headers) + cache_url = self.cache_url(request.url) + cc = self.parse_cache_control(request.headers) # non-caching states - no_cache = True if 'no-cache' in cc else False + no_cache = bool('no-cache' in cc) if 'max-age' in cc and cc['max-age'] == 0: no_cache = True - # see if it is in the cache anyways in_cache = self.cache.get(cache_url) if no_cache or not in_cache: @@ -125,7 +130,7 @@ class CacheController(object): for header in varied_headers: # If our headers don't match for the headers listed in # the vary header, then don't use the cached response - if headers.get(header, None) != original_headers.get(header): + if request.headers.get(header, None) != original_headers.get(header): return False now = time.time() @@ -173,8 +178,16 @@ class CacheController(object): resp.from_cache = True return resp - # we're not fresh. - self.cache.delete(cache_url) + # we're not fresh. If we don't have an Etag, clear it out + if 'etag' not in resp.headers: + self.cache.delete(cache_url) + + if 'etag' in resp.headers: + request.headers['If-None-Match'] = resp.headers['ETag'] + + if 'last-modified' in resp.headers: + request.headers['If-Modified-Since'] = resp.headers['Last-Modified'] + # return the original handler return False @@ -199,10 +212,14 @@ class CacheController(object): if no_store and self.cache.get(cache_url): self.cache.delete(cache_url) + # If we've been given an etag, then keep the response + if self.cache_etags and 'etag' in resp.headers: + self.cache.set(cache_url, resp) + # Add to the cache if the response headers demand it. If there # is no date header then we can't do anything about expiring # the cache. - if 'date' in resp.headers: + elif 'date' in resp.headers: # cache when there is a max-age > 0 if cc_resp and cc_resp.get('max-age'): if int(cc_resp['max-age']) > 0: @@ -213,3 +230,86 @@ class CacheController(object): elif 'expires' in resp.headers: if resp.headers['expires']: self.cache.set(cache_url, resp) + # Force one month max age if no Cache-Control header is found + # Overriding header disappearance on LastFM web service... + # https://gitlab.com/kaliko/sima/-/issues/7 + elif CacheController.CACHE_ANYWAY: + resp.headers['Cache-Control'] = 'max-age=2419200' + self.cache.set(cache_url, resp) + + def update_cached_response(self, request, response): + """On a 304 we will get a new set of headers that we want to + update our cached value with, assuming we have one. + + This should only ever be called when we've sent an ETag and + gotten a 304 as the response. + """ + cache_url = self.cache_url(request.url) + + resp = self.cache.get(cache_url) + + if not resp: + # we didn't have a cached response + return response + + # did so lets update our headers + resp.headers.update(response.headers) + + # we want a 200 b/c we have content via the cache + request.status_code = 200 + + # update the request as it has the if-none-match header + any + # other headers that the server might have updated (ie Date, + # Cache-Control, Expires, etc.) + resp.request = request + + # update our cache + self.cache.set(cache_url, resp) + + # Let everyone know this was from the cache. + resp.from_cache = True + + return resp + + +class HttpClient: + def __init__(self, cache=None, stats=None): + """ + Prepare http request + Use cached elements or proceed http request + """ + self.stats = stats + self.controller = CacheController(cache) + self.sess = Session() + + def __call__(self, ress, payload): + req = Request('GET', ress, params=payload,).prepare() + if self.stats: + self.stats.update(total=self.stats.get('total')+1) + cached_response = self.controller.cached_request(req) + if cached_response: + if self.stats: + self.stats.update(ccontrol=self.stats.get('ccontrol')+1) + return cached_response + try: + return self.fetch_ws(req) + except Timeout as err: + raise WSTimeout(f'Failed to reach server within {SOCKET_TIMEOUT}s') from err + except HTTPConnectionError as err: + raise WSError(err) from err + + @Throttle(WAIT_BETWEEN_REQUESTS) + def fetch_ws(self, prepreq): + """fetch from web service""" + settings = self.sess.merge_environment_settings(prepreq.url, {}, None, False, None) + resp = self.sess.send(prepreq, timeout=SOCKET_TIMEOUT, **settings) + if resp.status_code == 304: + self.stats.update(etag=self.stats.get('etag')+1) + resp = self.controller.update_cached_response(prepreq, resp) + elif resp.status_code != 200: + raise WSHTTPError(f'{resp.status_code}: {resp.reason}') + self.controller.cache_response(resp.request, resp) + return resp + +# VIM MODLINE +# vim: ai ts=4 sw=4 sts=4 expandtab