X-Git-Url: https://git.kaliko.me/?a=blobdiff_plain;f=sima%2Flib%2Fsimastr.py;h=ec82d91510bd57c724e829ff968ff34bb9e27e3e;hb=HEAD;hp=7e7668c891979562b3232c614a053b345866f42e;hpb=c1bda032095902bdcd183c530a9c4de28f3c828a;p=mpd-sima.git diff --git a/sima/lib/simastr.py b/sima/lib/simastr.py index 7e7668c..ec82d91 100644 --- a/sima/lib/simastr.py +++ b/sima/lib/simastr.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- - # -# Copyright (c) 2009, 2010, 2013 Jack Kaliko +# Copyright (c) 2009, 2010, 2013, 2021 kaliko # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as @@ -18,18 +17,17 @@ # If not, see . # -""" +r""" SimaStr Special unicode() subclass to perform fuzzy match on specific strings with known noise. -Artist names often contain a leading 'The ' which might, or might not be -present. Some other noise sources in artist name are 'and' words : - 'and'/'&'/'n'/'N'. - -The SimaStr() object removes these words and compute equality on "stripped" -strings. + * SimaStr() object removes specific patterns from the string + * Diacritic are removed + * Equality test is done on lower-cased string + * Equality test is not an exact comparison, the levenshtein edition distance + between stripped and filtered strings is used >>> from simastr import SimaStr >>> art0 = SimaStr('The Desert Sessions & PJ Harvey') @@ -38,9 +36,14 @@ strings. >>> True >>> art0 == 'Desert Sessions And PJ Harvey' >>> True +>>> # diacritic filter + levenshtein example +>>> art0 = sima.lib.simastr.SimaStr('Hubert Félix Thiéphaine') +>>> art1 = sima.lib.simastr.SimaStr('Hubert-Felix Thiephaine') +>>> art0 == art1 +>>> True >>> -Current stripped word patterns (usually English followed by French andx +Current stripped word patterns (usually English followed by French and Spanish alternatives) leading (case-insensitive): "the","le","la","les","el","los" @@ -50,10 +53,9 @@ Spanish alternatives) combination of "[- !?\.]+" "\(? ?[Ll]ive ?\)?" -Possibility to access to stripped string : +Possibility to access to stripped string: >>> art0 = SimaStr('The Desert Sessions & PJ Harvey') ->>> art.stripped >>> print (art0, art0.stripped) >>> ('The Desert Sessions & PJ Harvey', 'Desert Sessions PJ Harvey') @@ -63,10 +65,13 @@ TODO: """ __author__ = 'Jack Kaliko' -__version__ = '0.3' +__version__ = '0.4' # IMPORTS -from re import (compile, U, I) +import unicodedata +from re import compile as re_compile, U, I + +from ..utils.leven import levenshtein_ratio class SimaStr(str): @@ -74,7 +79,9 @@ class SimaStr(str): Specific string object for artist names and song titles. Here follows some class variables for regex to run on strings. """ - regexp_dict = dict() + diafilter = True + leven_ratio = 0.82 + regexp_dict = {} # Leading patterns: The Le Les # case-insensitive matching for this RE @@ -86,20 +93,25 @@ class SimaStr(str): # Trailing patterns: ! ? live # TODO: add "concert" key word # add "Live at " - regexp_dict.update({'trail': '([- !?\.]|\(? ?[Ll]ive ?\)?)'}) + regexp_dict.update({'trail': r'([- !?\.]|\(? ?[Ll]ive ?\)?)'}) - reg_lead = compile('^(?P%(lead)s )(?P.*)$' % regexp_dict, I | U) - reg_midl = compile('^(?P.*)(?P %(mid)s )(?P.*)' % regexp_dict, U) - reg_trail = compile('^(?P.*?)(?P%(trail)s+$)' % regexp_dict, U) + reg_lead = re_compile('^(?P%(lead)s )(?P.*)$' % regexp_dict, I | U) + reg_midl = re_compile('^(?P.*)(?P %(mid)s )(?P.*)' % regexp_dict, U) + reg_trail = re_compile('^(?P.*?)(?P%(trail)s+$)' % regexp_dict, U) def __init__(self, fuzzstr): """ """ - str().__init__(fuzzstr) + super().__init__() self.orig = str(fuzzstr) self.stripped = str(fuzzstr.strip()) # fuzzy computation self._get_root() + if self.__class__.diafilter: + self.remove_diacritics() + + def __new__(cls, fuzzstr): + return super(SimaStr, cls).__new__(cls, fuzzstr) def _get_root(self): """ @@ -107,27 +119,34 @@ class SimaStr(str): """ sea = SimaStr.reg_lead.search(self.stripped) if sea: - #print sea.groupdict() self.stripped = sea.group('root0') sea = SimaStr.reg_midl.search(self.stripped) if sea: - #print sea.groupdict() self.stripped = str().join([sea.group('root0'), ' ', sea.group('root1')]) sea = SimaStr.reg_trail.search(self.stripped) if sea: - #print sea.groupdict() self.stripped = sea.group('root0') + def remove_diacritics(self): + """converting diacritics""" + self.stripped = ''.join(x for x in + unicodedata.normalize('NFKD', self.stripped) + if unicodedata.category(x) != 'Mn') + def __hash__(self): return hash(self.stripped) def __eq__(self, other): if not isinstance(other, SimaStr): - return hash(self) == hash(SimaStr(other)) - return hash(self) == hash(other) + other = SimaStr(other) + levenr = levenshtein_ratio(self.stripped.lower(), + other.stripped.lower()) + if hash(self) == hash(other): + return True + return levenr >= self.__class__.leven_ratio def __ne__(self, other): if not isinstance(other, SimaStr): @@ -135,40 +154,5 @@ class SimaStr(str): return hash(self) != hash(other) -# Script starts here -if __name__ == "__main__": - import time - print(SimaStr('Kétanoue')) - #from leven import levenshtein_ratio - CASES_LIST = list([ - dict({ - 'got': 'Guns N\' Roses (live)!! !', - 'look for': 'Guns And Roses'}), - dict({ - 'got': 'Jesus & Mary Chains', - 'look for': 'The Jesus and Mary Chains - live'}), - dict({ - 'got': 'Desert sessions', - 'look for': 'The Desert Sessions'}), - dict({ - 'got': 'Têtes Raides', - 'look for': 'Les Têtes Raides'}), - dict({ - 'got': 'Noir Désir', - 'look for': 'Noir Désir'}), - dict({ - 'got': 'No Future', - 'look for': 'Future'})]) - - for case in CASES_LIST[:]: - str0 = case.get('got') - str1 = case.get('look for') - fz_str0 = SimaStr(str0) - fz_str1 = SimaStr(str1) - print(fz_str0, '\n', fz_str1) - print(fz_str0.stripped == fz_str1.stripped) - #print levenshtein_ratio(fz_str0.lower(), fz_str1.lower()) - time.sleep(1) - # VIM MODLINE # vim: ai ts=4 sw=4 sts=4 expandtab