kaliko git repositories - mpd-sima.git/commitdiff
Persistent HTTP cache based on CacheControl
authorkaliko <efrim@azylum.org>
Thu, 20 Feb 2014 19:09:29 +0000 (20:09 +0100)
committerkaliko <efrim@azylum.org>
Thu, 20 Feb 2014 19:56:30 +0000 (20:56 +0100)
sima/lib/httpcli/__init__.py [new file with mode: 0644]
sima/lib/httpcli/cache.py [new file with mode: 0644]
sima/lib/httpcli/controller.py [new file with mode: 0644]
sima/lib/httpcli/filelock.py [new file with mode: 0644]
sima/lib/simaecho.py

diff --git a/sima/lib/httpcli/__init__.py b/sima/lib/httpcli/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/sima/lib/httpcli/cache.py b/sima/lib/httpcli/cache.py
new file mode 100644 (file)
index 0000000..22c751d
--- /dev/null
@@ -0,0 +1,76 @@
+"""
+The cache object API for implementing caches. The default is just a
+dictionary, which in turn means it is not threadsafe for writing.
+"""
+
+import os
+import base64
+import codecs
+
+from hashlib import md5
+from pickle import load, dump
+from threading import Lock
+
+from .filelock import FileLock
+
+
class BaseCache:
    """Abstract cache API: concrete caches implement get/set/delete.

    Fix: the original raised ``NotImplemented()`` — that is the comparison
    singleton, not an exception class; calling it raises ``TypeError``.
    ``NotImplementedError`` is the correct way to mark required overrides.
    """

    def get(self, key):
        """Return the cached value for *key*, or None when absent."""
        raise NotImplementedError()

    def set(self, key, value):
        """Store *value* under *key*."""
        raise NotImplementedError()

    def delete(self, key):
        """Remove *key* from the cache."""
        raise NotImplementedError()
+
+
class DictCache(BaseCache):
    """In-memory cache backed by a plain dict; writes are lock-protected."""

    def __init__(self, init_dict=None):
        self.lock = Lock()
        self.data = init_dict if init_dict else {}

    def get(self, key):
        """Return the cached value for *key*, or None when absent."""
        return self.data.get(key)

    def set(self, key, value):
        """Store *value* under *key* (serialized by the instance lock)."""
        with self.lock:
            self.data[key] = value

    def delete(self, key):
        """Drop *key* if present (serialized by the instance lock)."""
        with self.lock:
            self.data.pop(key, None)
+
+
class FileCache:
    """Cache persisting pickled values as files under *directory*.

    Keys are hashed (md5) into fixed-length, filesystem-safe names.
    With *forever* set, ``delete`` becomes a no-op.
    """

    def __init__(self, directory, forever=False):
        self.directory = directory
        self.forever = forever

        if not os.path.isdir(self.directory):
            os.mkdir(self.directory)

    def encode(self, x):
        """Return a safe, fixed-length file name derived from key *x*."""
        return md5(x.encode('utf-8')).hexdigest()

    def _fn(self, name):
        """Full path of the cache file for key *name*."""
        return os.path.join(self.directory, self.encode(name))

    def get(self, key):
        """Return the unpickled value for *key*, or None when absent.

        EAFP open avoids the exists/open race of the original, and the
        context manager closes the handle (the original leaked it).
        """
        try:
            with open(self._fn(key), 'rb') as fhandle:
                return load(fhandle)
        except FileNotFoundError:
            return None

    def set(self, key, value):
        """Pickle *value* to disk under *key*, guarded by a file lock."""
        name = self._fn(key)
        with FileLock(name):
            with open(name, 'wb') as fhandle:
                dump(value, fhandle)

    def delete(self, key):
        """Remove the entry for *key* unless the cache is permanent.

        Deleting a missing entry is a no-op (the original raised).
        """
        if not self.forever:
            try:
                os.remove(self._fn(key))
            except FileNotFoundError:
                pass
diff --git a/sima/lib/httpcli/controller.py b/sima/lib/httpcli/controller.py
new file mode 100644 (file)
index 0000000..c447895
--- /dev/null
@@ -0,0 +1,213 @@
+"""
+The httplib2 algorithms ported for use with requests.
+"""
+import re
+import calendar
+import time
+
+from sima.lib.httpcli.cache import DictCache
+import email.utils
+
+
URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")


def parse_uri(uri):
    """Split *uri* per the regex of RFC 3986, Appendix B.

        (scheme, authority, path, query, fragment) = parse_uri(uri)

    Missing components come back as None.
    """
    parts = URI.match(uri).groups()
    scheme, authority, path = parts[1], parts[3], parts[4]
    query, fragment = parts[6], parts[8]
    return scheme, authority, path, query, fragment
+
+
class CacheController(object):
    """Decide whether a request can be served from cache and whether a
    response should be stored (httplib2's caching algorithm adapted to
    requests-style request/response objects).
    """
    def __init__(self, cache=None, cache_etags=True):
        # cache must expose the BaseCache API (get/set/delete)
        self.cache = cache or DictCache()
        self.cache_etags = cache_etags

    def _urlnorm(self, uri):
        """Normalize the URL to create a safe key for the cache."""
        (scheme, authority, path, query, fragment) = parse_uri(uri)
        if not scheme or not authority:
            raise Exception("Only absolute URIs are allowed. uri = %s" % uri)
        authority = authority.lower()
        scheme = scheme.lower()
        if not path:
            path = "/"

        # Could do syntax based normalization of the URI before
        # computing the digest. See Section 6.2.2 of Std 66.
        request_uri = query and "?".join([path, query]) or path
        defrag_uri = scheme + "://" + authority + request_uri

        return defrag_uri

    def cache_url(self, uri):
        """Public alias: the normalized URL is the cache key."""
        return self._urlnorm(uri)

    def parse_cache_control(self, headers):
        """
        Parse the cache control headers returning a dictionary with values
        for the different directives.

        Directives with an argument map to their *string* value, e.g.
        {'max-age': '3600'}; bare directives map to 1, e.g. {'no-cache': 1}.
        """
        retval = {}

        cc_header = 'cache-control'
        if 'Cache-Control' in headers:
            cc_header = 'Cache-Control'

        if cc_header in headers:
            parts = headers[cc_header].split(',')
            parts_with_args = [
                tuple([x.strip().lower() for x in part.split("=", 1)])
                for part in parts if -1 != part.find("=")]
            parts_wo_args = [(name.strip().lower(), 1)
                             for name in parts if -1 == name.find("=")]
            retval = dict(parts_with_args + parts_wo_args)
        return retval

    def cached_request(self, url, headers):
        """Return the cached response for url/headers if still fresh,
        False otherwise.

        When a stale entry carries validators, conditional headers
        (If-None-Match / If-Modified-Since) are injected into *headers*
        so the caller can revalidate.
        """
        cache_url = self.cache_url(url)
        cc = self.parse_cache_control(headers)

        # non-caching states requested by the client.
        # NOTE: parse_cache_control yields string values; the original
        # compared cc['max-age'] to int 0, which could never match.
        no_cache = 'no-cache' in cc
        try:
            if int(cc.get('max-age', -1)) == 0:
                no_cache = True
        except ValueError:
            pass  # malformed max-age: ignore the directive

        # single cache lookup (the original fetched the entry twice)
        resp = self.cache.get(cache_url)
        if no_cache or not resp:
            return False

        # Check our Vary header to make sure our request headers match
        # up. We don't delete it from the cache though, we just don't
        # return our cached value.
        #
        # NOTE: Because httplib2 stores raw content, it denotes
        #       headers that were sent in the original response by
        #       adding -varied-$name. We don't have to do that b/c we
        #       are storing the object which has a reference to the
        #       original request. If that changes, then I'd propose
        #       using the varied headers in the cache key to avoid the
        #       situation all together.
        if 'vary' in resp.headers:
            varied_headers = resp.headers['vary'].replace(' ', '').split(',')
            original_headers = resp.request.headers
            for header in varied_headers:
                # If our headers don't match for the headers listed in
                # the vary header, then don't use the cached response
                if headers.get(header, None) != original_headers.get(header):
                    return False

        # NOTE(review): assumes every cached response carries a Date
        # header; an etag-only entry without one would raise KeyError
        # here — confirm cache_response callers always see Date.
        now = time.time()
        date = calendar.timegm(
            email.utils.parsedate_tz(resp.headers['date'])
        )
        current_age = max(0, now - date)

        # TODO: There is an assumption that the result will be a
        # requests response object. This may not be best since we
        # could probably avoid instantiating or constructing the
        # response until we know we need it.
        resp_cc = self.parse_cache_control(resp.headers)

        # determine freshness
        freshness_lifetime = 0
        if 'max-age' in resp_cc and resp_cc['max-age'].isdigit():
            freshness_lifetime = int(resp_cc['max-age'])
        elif 'expires' in resp.headers:
            expires = email.utils.parsedate_tz(resp.headers['expires'])
            if expires is not None:
                expire_time = calendar.timegm(expires) - date
                freshness_lifetime = max(0, expire_time)

        # determine if we are setting freshness limit in the req
        if 'max-age' in cc:
            try:
                freshness_lifetime = int(cc['max-age'])
            except ValueError:
                freshness_lifetime = 0

        if 'min-fresh' in cc:
            try:
                min_fresh = int(cc['min-fresh'])
            except ValueError:
                min_fresh = 0
            # adjust our current age by our min fresh
            current_age += min_fresh

        # see how fresh we actually are
        fresh = (freshness_lifetime > current_age)

        if fresh:
            # make sure we set the from_cache to true
            resp.from_cache = True
            return resp

        # we're not fresh; without a validator the entry is useless,
        # otherwise ask the caller to revalidate with it
        if 'etag' not in resp.headers:
            self.cache.delete(cache_url)
        else:
            headers['If-None-Match'] = resp.headers['ETag']

        if 'last-modified' in resp.headers:
            headers['If-Modified-Since'] = resp.headers['Last-Modified']

        # return the original handler
        return False

    def add_headers(self, url):
        """Return conditional headers (If-None-Match) for *url* when an
        etagged response is cached, otherwise an empty dict."""
        # entries are stored under the *normalized* key; the original
        # looked up the raw url and could miss
        resp = self.cache.get(self.cache_url(url))
        if resp and 'etag' in resp.headers:
            return {'If-None-Match': resp.headers['etag']}
        return {}

    def cache_response(self, request, resp):
        """
        Algorithm for caching requests.

        This assumes a requests Response object.
        """
        # From httplib2: Don't cache 206's since we aren't going to
        # handle byte range requests
        if resp.status_code not in [200, 203]:
            return

        cc_req = self.parse_cache_control(request.headers)
        cc = self.parse_cache_control(resp.headers)

        cache_url = self.cache_url(request.url)

        # Delete it from the cache if we happen to have it stored there
        no_store = cc.get('no-store') or cc_req.get('no-store')
        if no_store and self.cache.get(cache_url):
            self.cache.delete(cache_url)

        # If we've been given an etag, then keep the response
        if self.cache_etags and 'etag' in resp.headers:
            self.cache.set(cache_url, resp)

        # Add to the cache if the response headers demand it. If there
        # is no date header then we can't do anything about expiring
        # the cache.
        elif 'date' in resp.headers:
            # cache when there is a max-age > 0; guard int() as the
            # freshness path does — a malformed value raised ValueError
            if cc and cc.get('max-age'):
                if cc['max-age'].isdigit() and int(cc['max-age']) > 0:
                    self.cache.set(cache_url, resp)

            # If the request can expire, it means we should cache it
            # in the meantime.
            elif 'expires' in resp.headers:
                if resp.headers['expires']:
                    self.cache.set(cache_url, resp)
diff --git a/sima/lib/httpcli/filelock.py b/sima/lib/httpcli/filelock.py
new file mode 100644 (file)
index 0000000..6dc331b
--- /dev/null
@@ -0,0 +1,80 @@
+# -*- coding: utf-8 -*-\r
+# https://github.com/dmfrey/FileLock\r
+\r
+import os\r
+import time\r
+import errno\r
+\r
class FileLockException(Exception):
    """Raised when the lock file cannot be acquired before the timeout."""
    pass
+\r
class FileLock:
    """ A file locking mechanism that has context-manager support so
        you can use it in a with statement. This should be relatively cross
        compatible as it doesn't rely on msvcrt or fcntl for the locking.
    """

    def __init__(self, file_name, timeout=10, delay=.05):
        """ Prepare the file locker. Specify the file to lock and optionally
            the maximum timeout and the delay between each attempt to lock.
        """
        self.is_locked = False
        # NOTE(review): relative file names resolve against the current
        # working directory at construction time; absolute paths pass
        # through os.path.join unchanged.
        self.lockfile = os.path.join(os.getcwd(), "%s.lock" % file_name)
        self.file_name = file_name
        self.timeout = timeout
        self.delay = delay

    def acquire(self):
        """ Acquire the lock, if possible. If the lock is in use, it checks
            again every `delay` seconds. It does this until it either gets
            the lock or exceeds `timeout` seconds, in which case it raises
            FileLockException.
        """
        start_time = time.time()
        while True:
            try:
                # O_CREAT|O_EXCL makes creation atomic: it fails with
                # EEXIST if another process holds the lock file.
                self.fd = os.open(self.lockfile,
                                  os.O_CREAT | os.O_EXCL | os.O_RDWR)
                break
            except OSError as err:
                if err.errno != errno.EEXIST:
                    raise
                if (time.time() - start_time) >= self.timeout:
                    # typo fix: was "occured"
                    raise FileLockException("Timeout occurred.")
                time.sleep(self.delay)
        self.is_locked = True

    def release(self):
        """ Get rid of the lock by deleting the lockfile.
            When working in a `with` statement, this gets automatically
            called at the end.  A no-op when the lock is not held.
        """
        if self.is_locked:
            os.close(self.fd)
            os.unlink(self.lockfile)
            self.is_locked = False

    def __enter__(self):
        """ Activated when used in the with statement.
            Acquires the lock (unless already held) and returns self.
        """
        if not self.is_locked:
            self.acquire()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """ Activated at the end of the with statement.
            Releases the lock if it is held.  (Parameter renamed so
            `type` no longer shadows the builtin.)
        """
        if self.is_locked:
            self.release()

    def __del__(self):
        """ Make sure that the FileLock instance doesn't leave a lockfile
            lying around.
        """
        self.release()
index bbfc11416d8fafd8a4f7eaa4b1e064af924848d5..592ea0387f9fd718312cf50c3028e3f70f271f07 100644 (file)
 Consume EchoNest web service
 """
 
 Consume EchoNest web service
 """
 
-__version__ = '0.0.1'
+__version__ = '0.0.2'
 __author__ = 'Jack Kaliko'
 
 
 from datetime import datetime, timedelta
 
 __author__ = 'Jack Kaliko'
 
 
 from datetime import datetime, timedelta
 
-from requests import get, Request, Timeout, ConnectionError
+from requests import Session, Request, Timeout, ConnectionError
 
 from sima import ECH
 from sima.lib.meta import Artist
 from sima.lib.track import Track
 
 from sima import ECH
 from sima.lib.meta import Artist
 from sima.lib.track import Track
+from sima.lib.httpcli.controller import CacheController
+from sima.lib.httpcli.cache import FileCache
 from sima.utils.utils import WSError, WSNotFound, WSTimeout, WSHTTPError
 from sima.utils.utils import WSError, WSNotFound, WSTimeout, WSHTTPError
-from sima.utils.utils import getws, Throttle, Cache, purge_cache
+from sima.utils.utils import getws, Throttle
 if len(ECH.get('apikey')) == 23:  # simple hack allowing imp.reload
     getws(ECH)
 
 if len(ECH.get('apikey')) == 23:  # simple hack allowing imp.reload
     getws(ECH)
 
@@ -46,26 +48,27 @@ class SimaEch:
     """EchoNest http client
     """
     root_url = 'http://{host}/api/{version}'.format(**ECH)
     """EchoNest http client
     """
     root_url = 'http://{host}/api/{version}'.format(**ECH)
-    cache = {}
     timestamp = datetime.utcnow()
     ratelimit = None
     name = 'EchoNest'
     timestamp = datetime.utcnow()
     ratelimit = None
     name = 'EchoNest'
+    cache = FileCache('/home/kaliko/.local/share/mpd_sima/http')
 
 
-    def __init__(self, cache=True):
-        self.artist = None
+    def __init__(self):
         self._ressource = None
         self.current_element = None
         self._ressource = None
         self.current_element = None
-        self.caching = cache
-        purge_cache(self.__class__)
+        self.controller = CacheController(self.cache)
 
     def _fetch(self, payload):
         """Use cached elements or proceed http request"""
 
     def _fetch(self, payload):
         """Use cached elements or proceed http request"""
-        url = Request('GET', self._ressource, params=payload,).prepare().url
-        if url in SimaEch.cache:
-            self.current_element = SimaEch.cache.get(url).elem
-            return
+        req = Request('GET', self._ressource, params=payload,
+                      ).prepare()
+        if self.cache:
+            cached_response = self.controller.cached_request(req.url, req.headers)
+            if cached_response:
+                return cached_response.json()
+
         try:
         try:
-            self._fetch_ws(payload)
+            return self._fetch_ws(req)
         except Timeout:
             raise WSTimeout('Failed to reach server within {0}s'.format(
                                SOCKET_TIMEOUT))
         except Timeout:
             raise WSTimeout('Failed to reach server within {0}s'.format(
                                SOCKET_TIMEOUT))
@@ -73,28 +76,28 @@ class SimaEch:
             raise WSError(err)
 
     @Throttle(WAIT_BETWEEN_REQUESTS)
             raise WSError(err)
 
     @Throttle(WAIT_BETWEEN_REQUESTS)
-    def _fetch_ws(self, payload):
+    def _fetch_ws(self, prepreq):
         """fetch from web service"""
         """fetch from web service"""
-        req = get(self._ressource, params=payload,
-                            timeout=SOCKET_TIMEOUT)
-        self.__class__.ratelimit = req.headers.get('x-ratelimit-remaining', None)
-        if req.status_code is not 200:
-            raise WSHTTPError('{0.status_code}: {0.reason}'.format(req))
-        self.current_element = req.json()
-        self._controls_answer()
-        if self.caching:
-            SimaEch.cache.update({req.url:
-                                 Cache(self.current_element)})
-
-    def _controls_answer(self):
+        sess = Session()
+        resp = sess.send(prepreq, timeout=SOCKET_TIMEOUT)
+        self.__class__.ratelimit = resp.headers.get('x-ratelimit-remaining', None)
+        if resp.status_code is not 200:
+            raise WSHTTPError('{0.status_code}: {0.reason}'.format(resp))
+        ans = resp.json()
+        self._controls_answer(ans)
+        if self.cache:
+            self.controller.cache_response(resp.request, resp)
+        return ans
+
+    def _controls_answer(self, ans):
         """Controls answer.
         """
         """Controls answer.
         """
-        status = self.current_element.get('response').get('status')
+        status = ans.get('response').get('status')
         code = status.get('code')
         if code is 0:
             return True
         if code is 5:
         code = status.get('code')
         if code is 0:
             return True
         if code is 5:
-            raise WSNotFound('Artist not found: "{0}"'.format(self.artist))
+            raise WSNotFound('Artist not found')
         raise WSError(status.get('message'))
 
     def _forge_payload(self, artist, top=False):
         raise WSError(status.get('message'))
 
     def _forge_payload(self, artist, top=False):
@@ -103,7 +106,6 @@ class SimaEch:
         payload = {'api_key': ECH.get('apikey')}
         if not isinstance(artist, Artist):
             raise TypeError('"{0!r}" not an Artist object'.format(artist))
         payload = {'api_key': ECH.get('apikey')}
         if not isinstance(artist, Artist):
             raise TypeError('"{0!r}" not an Artist object'.format(artist))
-        self.artist = artist
         if artist.mbid:
             payload.update(
                     id='musicbrainz:artist:{0}'.format(artist.mbid))
         if artist.mbid:
             payload.update(
                     id='musicbrainz:artist:{0}'.format(artist.mbid))
@@ -128,8 +130,8 @@ class SimaEch:
         payload = self._forge_payload(artist)
         # Construct URL
         self._ressource = '{0}/artist/similar'.format(SimaEch.root_url)
         payload = self._forge_payload(artist)
         # Construct URL
         self._ressource = '{0}/artist/similar'.format(SimaEch.root_url)
-        self._fetch(payload)
-        for art in self.current_element.get('response').get('artists'):
+        ans = self._fetch(payload)
+        for art in ans.get('response').get('artists'):
             artist = {}
             mbid = None
             if 'foreign_ids' in art:
             artist = {}
             mbid = None
             if 'foreign_ids' in art:
@@ -145,13 +147,13 @@ class SimaEch:
         payload = self._forge_payload(artist, top=True)
         # Construct URL
         self._ressource = '{0}/song/search'.format(SimaEch.root_url)
         payload = self._forge_payload(artist, top=True)
         # Construct URL
         self._ressource = '{0}/song/search'.format(SimaEch.root_url)
-        self._fetch(payload)
+        ans = self._fetch(payload)
         titles = list()
         artist = {
                 'artist': artist.name,
                 'musicbrainz_artistid': artist.mbid,
                 }
         titles = list()
         artist = {
                 'artist': artist.name,
                 'musicbrainz_artistid': artist.mbid,
                 }
-        for song in self.current_element.get('response').get('songs'):
+        for song in ans.get('response').get('songs'):
             title = song.get('title')
             if title not in titles:
                 titles.append(title)
             title = song.get('title')
             if title not in titles:
                 titles.append(title)