X-Git-Url: https://git.kaliko.me/?a=blobdiff_plain;f=sima%2Flib%2Fsimastr.py;h=ec82d91510bd57c724e829ff968ff34bb9e27e3e;hb=HEAD;hp=7e7668c891979562b3232c614a053b345866f42e;hpb=c1bda032095902bdcd183c530a9c4de28f3c828a;p=mpd-sima.git

diff --git a/sima/lib/simastr.py b/sima/lib/simastr.py
index 7e7668c..ec82d91 100644
--- a/sima/lib/simastr.py
+++ b/sima/lib/simastr.py
@@ -1,7 +1,6 @@
 # -*- coding: utf-8 -*-
-
 #
-# Copyright (c) 2009, 2010, 2013 Jack Kaliko <kaliko@azylum.org>
+# Copyright (c) 2009, 2010, 2013, 2021 kaliko <kaliko@azylum.org>
 #
 #  This program is free software; you can redistribute it and/or modify
 #  it under the terms of the GNU General Public License as
@@ -18,18 +17,17 @@
 #  If not, see <http://www.gnu.org/licenses/>.
 #
 
-"""
+r"""
 SimaStr
 
 Special unicode() subclass to perform fuzzy match on specific strings with
 known noise.
 
-Artist names often contain a leading 'The ' which might, or might not be
-present. Some other noise sources in artist name are 'and' words :
-    'and'/'&'/'n'/'N'.
-
-The SimaStr() object removes these words and compute equality on "stripped"
-strings.
+ * SimaStr() object removes specific patterns from the string
+ * Diacritic are removed
+ * Equality test is done on lower-cased string
+ * Equality test is not an exact comparison, the levenshtein edition distance
+   between stripped and filtered strings is used
 
 >>> from simastr import SimaStr
 >>> art0 = SimaStr('The Desert Sessions & PJ Harvey')
@@ -38,9 +36,14 @@ strings.
 >>> True
 >>> art0 == 'Desert Sessions And PJ Harvey'
 >>> True
+>>> # diacritic filter + levenshtein  example
+>>> art0 = sima.lib.simastr.SimaStr('Hubert FÃ©lix ThiÃ©phaine')
+>>> art1 = sima.lib.simastr.SimaStr('Hubert-Felix Thiephaine')
+>>> art0 == art1
+>>> True
 >>>
 
-Current stripped word patterns (usually English followed by French andx
+Current stripped word patterns (usually English followed by French and
 Spanish alternatives)
     leading (case-insensitive):
             "the","le","la","les","el","los"
@@ -50,10 +53,9 @@ Spanish alternatives)
             combination of "[- !?\.]+" "\(? ?[Ll]ive ?\)?"
 
 
-Possibility to access to stripped string :
+Possibility to access to stripped string:
 
 >>> art0 = SimaStr('The Desert Sessions & PJ Harvey')
->>> art.stripped
 >>> print (art0, art0.stripped)
 >>> ('The Desert Sessions & PJ Harvey', 'Desert Sessions PJ Harvey')
 
@@ -63,10 +65,13 @@ TODO:
 """
 
 __author__ = 'Jack Kaliko'
-__version__ = '0.3'
+__version__ = '0.4'
 
 # IMPORTS
-from re import (compile, U, I)
+import unicodedata
+from re import compile as re_compile, U, I
+
+from ..utils.leven import levenshtein_ratio
 
 
 class SimaStr(str):
@@ -74,7 +79,9 @@ class SimaStr(str):
     Specific string object for artist names and song titles.
     Here follows some class variables for regex to run on strings.
     """
-    regexp_dict = dict()
+    diafilter = True
+    leven_ratio = 0.82
+    regexp_dict = {}
 
     # Leading patterns: The Le Les
     # case-insensitive matching for this RE
@@ -86,20 +93,25 @@ class SimaStr(str):
     # Trailing patterns: ! ? live
     # TODO: add "concert" key word
     #       add "Live at <somewhere>"
-    regexp_dict.update({'trail': '([- !?\.]|\(? ?[Ll]ive ?\)?)'})
+    regexp_dict.update({'trail': r'([- !?\.]|\(? ?[Ll]ive ?\)?)'})
 
-    reg_lead = compile('^(?P<lead>%(lead)s )(?P<root0>.*)$' % regexp_dict, I | U)
-    reg_midl = compile('^(?P<root0>.*)(?P<mid> %(mid)s )(?P<root1>.*)' % regexp_dict, U)
-    reg_trail = compile('^(?P<root0>.*?)(?P<trail>%(trail)s+$)' % regexp_dict, U)
+    reg_lead = re_compile('^(?P<lead>%(lead)s )(?P<root0>.*)$' % regexp_dict, I | U)
+    reg_midl = re_compile('^(?P<root0>.*)(?P<mid> %(mid)s )(?P<root1>.*)' % regexp_dict, U)
+    reg_trail = re_compile('^(?P<root0>.*?)(?P<trail>%(trail)s+$)' % regexp_dict, U)
 
     def __init__(self, fuzzstr):
         """
         """
-        str().__init__(fuzzstr)
+        super().__init__()
         self.orig = str(fuzzstr)
         self.stripped = str(fuzzstr.strip())
         # fuzzy computation
         self._get_root()
+        if self.__class__.diafilter:
+            self.remove_diacritics()
+
+    def __new__(cls, fuzzstr):
+        return super(SimaStr, cls).__new__(cls, fuzzstr)
 
     def _get_root(self):
         """
@@ -107,27 +119,34 @@ class SimaStr(str):
         """
         sea = SimaStr.reg_lead.search(self.stripped)
         if sea:
-            #print sea.groupdict()
             self.stripped = sea.group('root0')
 
         sea = SimaStr.reg_midl.search(self.stripped)
         if sea:
-            #print sea.groupdict()
             self.stripped = str().join([sea.group('root0'), ' ',
                                         sea.group('root1')])
 
         sea = SimaStr.reg_trail.search(self.stripped)
         if sea:
-            #print sea.groupdict()
             self.stripped = sea.group('root0')
 
+    def remove_diacritics(self):
+        """converting diacritics"""
+        self.stripped = ''.join(x for x in
+                                unicodedata.normalize('NFKD', self.stripped)
+                                if unicodedata.category(x) != 'Mn')
+
     def __hash__(self):
         return hash(self.stripped)
 
     def __eq__(self, other):
         if not isinstance(other, SimaStr):
-            return hash(self) == hash(SimaStr(other))
-        return hash(self) == hash(other)
+            other = SimaStr(other)
+        levenr = levenshtein_ratio(self.stripped.lower(),
+                                   other.stripped.lower())
+        if hash(self) == hash(other):
+            return True
+        return levenr >= self.__class__.leven_ratio
 
     def __ne__(self, other):
         if not isinstance(other, SimaStr):
@@ -135,40 +154,5 @@ class SimaStr(str):
         return hash(self) != hash(other)
 
 
-# Script starts here
-if __name__ == "__main__":
-    import time
-    print(SimaStr('KÃ©tanoue'))
-    #from leven import levenshtein_ratio
-    CASES_LIST = list([
-        dict({
-                    'got': 'Guns N\' Roses (live)!! !',
-                'look for': 'Guns And Roses'}),
-        dict({
-                     'got': 'Jesus & Mary Chains',
-                'look for': 'The Jesus and Mary Chains - live'}),
-        dict({
-                         'got': 'Desert sessions',
-                    'look for': 'The Desert Sessions'}),
-        dict({
-                         'got': 'TÃªtes Raides',
-                    'look for': 'Les TÃªtes Raides'}),
-        dict({
-                         'got': 'Noir DÃ©sir',
-                    'look for': 'Noir DÃ©sir'}),
-        dict({
-                         'got': 'No Future',
-                    'look for': 'Future'})])
-
-    for case in CASES_LIST[:]:
-        str0 = case.get('got')
-        str1 = case.get('look for')
-        fz_str0 = SimaStr(str0)
-        fz_str1 = SimaStr(str1)
-        print(fz_str0, '\n', fz_str1)
-        print(fz_str0.stripped == fz_str1.stripped)
-        #print levenshtein_ratio(fz_str0.lower(), fz_str1.lower())
-        time.sleep(1)
-
 # VIM MODLINE
 # vim: ai ts=4 sw=4 sts=4 expandtab