Add Levenshtein fuzzy method to SimaStr

author kaliko <efrim@azylum.org>

Fri, 1 Nov 2013 10:58:23 +0000 (11:58 +0100)

committer kaliko <efrim@azylum.org>

Fri, 1 Nov 2013 10:58:23 +0000 (11:58 +0100)
author kaliko <efrim@azylum.org>
Fri, 1 Nov 2013 10:58:23 +0000 (11:58 +0100)
committer kaliko <efrim@azylum.org>
Fri, 1 Nov 2013 10:58:23 +0000 (11:58 +0100)
diff --git a/sima/client.py b/sima/client.py

index 929f66281fd7489cc2d001a75efa5755e3ffe138..fba50fdaf468c259febb6af304f85c2c4395ba99 100644 (file)
--- a/sima/client.py
+++ b/sima/client.py
@@ -22,7 +22,6 @@ except ImportError as err:
  from .lib.player import Player
  from .lib.track import Track
  from .lib.simastr import SimaStr
-from .utils.leven import levenshtein_ratio
  
  
  class PlayerError(Exception):
@@ -151,21 +150,14 @@ class PlayerClient(Player):
                  matching_artists.append(fuzz_art)
                  self.log.debug('"%s" matches "%s".' % (fuzz_art, artist))
                  return matching_artists
-            # Proceed with levenshtein and SimaStr
-            leven = levenshtein_ratio(artist.stripped.lower(),
-                    SimaStr(fuzz_art).stripped.lower())
-            # SimaStr string __eq__, not regular string comparison here
+            # SimaStr string __eq__ (not regular string comparison here)
              if artist == fuzz_art:
                  matching_artists.append(fuzz_art)
                  self.log.info('"%s" quite probably matches "%s" (SimaStr)' %
                                (fuzz_art, artist))
-            elif leven >= 0.82:  # PARAM
-                matching_artists.append(fuzz_art)
-                self.log.debug('FZZZ: "%s" should match "%s" (lr=%1.3f)' %
-                               (fuzz_art, artist, leven))
              else:
-                self.log.debug('FZZZ: "%s" does not match "%s" (lr=%1.3f)' %
-                               (fuzz_art, artist, leven))
+                self.log.debug('FZZZ: "%s" does not match "%s"' %
+                               (fuzz_art, artist))
          return matching_artists
  
      def find_album(self, artist, album):
diff --git a/sima/lib/simastr.py b/sima/lib/simastr.py

index 7e7668c891979562b3232c614a053b345866f42e..56cd2423a0474b540df2c4f010ee89f017585b0e 100644 (file)
--- a/sima/lib/simastr.py
+++ b/sima/lib/simastr.py
@@ -24,12 +24,11 @@ SimaStr
  Special unicode() subclass to perform fuzzy match on specific strings with
  known noise.
  
-Artist names often contain a leading 'The ' which might, or might not be
-present. Some other noise sources in artist name are 'and' words :
-    'and'/'&'/'n'/'N'.
-
-The SimaStr() object removes these words and compute equality on "stripped"
-strings.
+ * SimaStr() object removes specific patterns from the string
+ * Diacritics are removed
+ * Equality test is done on lower-cased string
+ * Equality test is not an exact comparison: the Levenshtein ratio between
+   stripped and filtered strings is compared against a threshold
  
  >>> from simastr import SimaStr
  >>> art0 = SimaStr('The Desert Sessions & PJ Harvey')
@@ -38,9 +37,14 @@ strings.
  >>> True
  >>> art0 == 'Desert Sessions And PJ Harvey'
  >>> True
+>>> # diacritic filter + Levenshtein example
+>>> art0 = sima.lib.simastr.SimaStr('Hubert Félix Thiéphaine')
+>>> art1 = sima.lib.simastr.SimaStr('Hubert-Felix Thiephaine')
+>>> art0 == art1
+>>> True
  >>>
  
-Current stripped word patterns (usually English followed by French andx
+Current stripped word patterns (usually English followed by French and
  Spanish alternatives)
      leading (case-insensitive):
              "the","le","la","les","el","los"
@@ -50,10 +54,9 @@ Spanish alternatives)
              combination of "[- !?\.]+" "\(? ?[Ll]ive ?\)?"
  
  
-Possibility to access to stripped string :
+The stripped string can be accessed directly:
  
  >>> art0 = SimaStr('The Desert Sessions & PJ Harvey')
->>> art.stripped
  >>> print (art0, art0.stripped)
  >>> ('The Desert Sessions & PJ Harvey', 'Desert Sessions PJ Harvey')
  
@@ -63,11 +66,14 @@ TODO:
  """
  
  __author__ = 'Jack Kaliko'
-__version__ = '0.3'
+__version__ = '0.4'
  
  # IMPORTS
+import unicodedata
  from re import (compile, U, I)
  
+from ..utils.leven import levenshtein_ratio
+
  
  class SimaStr(str):
      """
@@ -95,11 +101,12 @@ class SimaStr(str):
      def __init__(self, fuzzstr):
          """
          """
-        str().__init__(fuzzstr)
+        super().__init__(fuzzstr)
          self.orig = str(fuzzstr)
          self.stripped = str(fuzzstr.strip())
          # fuzzy computation
          self._get_root()
+        self.remove_diacritics()
  
      def _get_root(self):
          """
@@ -121,13 +128,22 @@ class SimaStr(str):
              #print sea.groupdict()
              self.stripped = sea.group('root0')
  
+    def remove_diacritics(self):
+        self.stripped = ''.join(x for x in
+                                unicodedata.normalize('NFKD', self.stripped)
+                                if unicodedata.category(x) != 'Mn')
+
      def __hash__(self):
          return hash(self.stripped)
  
      def __eq__(self, other):
          if not isinstance(other, SimaStr):
-            return hash(self) == hash(SimaStr(other))
-        return hash(self) == hash(other)
+            other = SimaStr(other)
+        levenr = levenshtein_ratio(self.stripped.lower(),
+                                   other.stripped.lower())
+        if hash(self) == hash(other):
+            return True
+        return levenr >= 0.82
  
      def __ne__(self, other):
          if not isinstance(other, SimaStr):
author	kaliko <efrim@azylum.org>
	Fri, 1 Nov 2013 10:58:23 +0000 (11:58 +0100)
committer	kaliko <efrim@azylum.org>
	Fri, 1 Nov 2013 10:58:23 +0000 (11:58 +0100)
sima/client.py		patch \| blob \| history
sima/lib/simastr.py		patch \| blob \| history