X-Git-Url: https://git.kaliko.me/?a=blobdiff_plain;f=sima%2Flib%2Fsimastr.py;h=56edbb4d7e68b82a816ad2d30cb6a99698d78864;hb=71500abd7ef16784d027a8a20aa28b06e8a13a4f;hp=c7162144cd95cb3b3d65b793e3e18871ed16e220;hpb=380d9fb347d1d367eb1a421f32c9ebce640c1639;p=mpd-sima.git diff --git a/sima/lib/simastr.py b/sima/lib/simastr.py index c716214..56edbb4 100644 --- a/sima/lib/simastr.py +++ b/sima/lib/simastr.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- - # # Copyright (c) 2009, 2010, 2013 Jack Kaliko # @@ -18,7 +17,7 @@ # If not, see . # -""" +r""" SimaStr Special unicode() subclass to perform fuzzy match on specific strings with @@ -70,7 +69,7 @@ __version__ = '0.4' # IMPORTS import unicodedata -from re import (compile, U, I) +from re import compile as re_compile, U, I from ..utils.leven import levenshtein_ratio @@ -80,6 +79,8 @@ class SimaStr(str): Specific string object for artist names and song titles. Here follows some class variables for regex to run on strings. """ + diafilter = True + leven_ratio = 0.82 regexp_dict = dict() # Leading patterns: The Le Les @@ -92,11 +93,11 @@ class SimaStr(str): # Trailing patterns: ! ? live # TODO: add "concert" key word # add "Live at " - regexp_dict.update({'trail': '([- !?\.]|\(? ?[Ll]ive ?\)?)'}) + regexp_dict.update({'trail': r'([- !?\.]|\(? ?[Ll]ive ?\)?)'}) - reg_lead = compile('^(?P%(lead)s )(?P.*)$' % regexp_dict, I | U) - reg_midl = compile('^(?P.*)(?P %(mid)s )(?P.*)' % regexp_dict, U) - reg_trail = compile('^(?P.*?)(?P%(trail)s+$)' % regexp_dict, U) + reg_lead = re_compile('^(?P%(lead)s )(?P.*)$' % regexp_dict, I | U) + reg_midl = re_compile('^(?P.*)(?P %(mid)s )(?P.*)' % regexp_dict, U) + reg_trail = re_compile('^(?P.*?)(?P%(trail)s+$)' % regexp_dict, U) def __init__(self, fuzzstr): """ @@ -105,7 +106,8 @@ class SimaStr(str): self.stripped = str(fuzzstr.strip()) # fuzzy computation self._get_root() - self.remove_diacritics() + if self.__class__.diafilter: + self.remove_diacritics() def __new__(cls, fuzzstr): return super(SimaStr, cls).__new__(cls, fuzzstr) @@ -131,6 +133,7 @@ class SimaStr(str): self.stripped = sea.group('root0') def remove_diacritics(self): + """converting diacritics""" self.stripped = ''.join(x for x in unicodedata.normalize('NFKD', self.stripped) if unicodedata.category(x) != 'Mn') @@ -145,7 +148,7 @@ class SimaStr(str): other.stripped.lower()) if hash(self) == hash(other): return True - return levenr >= 0.82 + return levenr >= self.__class__.leven_ratio def __ne__(self, other): if not isinstance(other, SimaStr):