X-Git-Url: http://git.kaliko.me/?a=blobdiff_plain;f=sima%2Flib%2Fsimastr.py;h=56edbb4d7e68b82a816ad2d30cb6a99698d78864;hb=e0c9aed7aff3a338a023eb4dd9d2732546387ba0;hp=56cd2423a0474b540df2c4f010ee89f017585b0e;hpb=1d41464ccb6ff66441947eef0305518e3ce79a77;p=mpd-sima.git diff --git a/sima/lib/simastr.py b/sima/lib/simastr.py index 56cd242..56edbb4 100644 --- a/sima/lib/simastr.py +++ b/sima/lib/simastr.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- - # # Copyright (c) 2009, 2010, 2013 Jack Kaliko # @@ -18,7 +17,7 @@ # If not, see . # -""" +r""" SimaStr Special unicode() subclass to perform fuzzy match on specific strings with @@ -70,7 +69,7 @@ __version__ = '0.4' # IMPORTS import unicodedata -from re import (compile, U, I) +from re import compile as re_compile, U, I from ..utils.leven import levenshtein_ratio @@ -80,6 +79,8 @@ class SimaStr(str): Specific string object for artist names and song titles. Here follows some class variables for regex to run on strings. """ + diafilter = True + leven_ratio = 0.82 regexp_dict = dict() # Leading patterns: The Le Les @@ -92,21 +93,24 @@ class SimaStr(str): # Trailing patterns: ! ? live # TODO: add "concert" key word # add "Live at " - regexp_dict.update({'trail': '([- !?\.]|\(? ?[Ll]ive ?\)?)'}) + regexp_dict.update({'trail': r'([- !?\.]|\(? ?[Ll]ive ?\)?)'}) - reg_lead = compile('^(?P%(lead)s )(?P.*)$' % regexp_dict, I | U) - reg_midl = compile('^(?P.*)(?P %(mid)s )(?P.*)' % regexp_dict, U) - reg_trail = compile('^(?P.*?)(?P%(trail)s+$)' % regexp_dict, U) + reg_lead = re_compile('^(?P%(lead)s )(?P.*)$' % regexp_dict, I | U) + reg_midl = re_compile('^(?P.*)(?P %(mid)s )(?P.*)' % regexp_dict, U) + reg_trail = re_compile('^(?P.*?)(?P%(trail)s+$)' % regexp_dict, U) def __init__(self, fuzzstr): """ """ - super().__init__(fuzzstr) self.orig = str(fuzzstr) self.stripped = str(fuzzstr.strip()) # fuzzy computation self._get_root() - self.remove_diacritics() + if self.__class__.diafilter: + self.remove_diacritics() + + def __new__(cls, fuzzstr): + return super(SimaStr, cls).__new__(cls, fuzzstr) def _get_root(self): """ @@ -129,6 +133,7 @@ class SimaStr(str): self.stripped = sea.group('root0') def remove_diacritics(self): + """converting diacritics""" self.stripped = ''.join(x for x in unicodedata.normalize('NFKD', self.stripped) if unicodedata.category(x) != 'Mn') @@ -143,7 +148,7 @@ class SimaStr(str): other.stripped.lower()) if hash(self) == hash(other): return True - return levenr >= 0.82 + return levenr >= self.__class__.leven_ratio def __ne__(self, other): if not isinstance(other, SimaStr):