# -*- coding: utf-8 -*-
-
#
# Copyright (c) 2009, 2010, 2013 Jack Kaliko <kaliko@azylum.org>
#
# If not, see <http://www.gnu.org/licenses/>.
#
-"""
+r"""
SimaStr
Special unicode() subclass to perform fuzzy match on specific strings with
known noise.
-Artist names often contain a leading 'The ' which might, or might not be
-present. Some other noise sources in artist name are 'and' words :
- 'and'/'&'/'n'/'N'.
-
-The SimaStr() object removes these words and compute equality on "stripped"
-strings.
+ * SimaStr() object removes specific patterns from the string
+ * Diacritics are removed
+ * Equality test is done on lower-cased strings
+ * Equality test is not an exact comparison: the Levenshtein edit distance
+ between stripped and filtered strings is used
>>> from simastr import SimaStr
>>> art0 = SimaStr('The Desert Sessions & PJ Harvey')
>>> True
>>> art0 == 'Desert Sessions And PJ Harvey'
>>> True
+>>> # diacritic filter + levenshtein example
+>>> art0 = SimaStr('Hubert Félix Thiéphaine')
+>>> art1 = SimaStr('Hubert-Felix Thiephaine')
+>>> art0 == art1
+>>> True
>>>
-Current stripped word patterns (usually English followed by French andx
+Current stripped word patterns (usually English followed by French and
Spanish alternatives)
leading (case-insensitive):
"the","le","la","les","el","los"
combination of "[- !?\.]+" "\(? ?[Ll]ive ?\)?"
-Possibility to access to stripped string :
+The stripped string is accessible through the "stripped" attribute:
>>> art0 = SimaStr('The Desert Sessions & PJ Harvey')
->>> art.stripped
>>> print (art0, art0.stripped)
>>> ('The Desert Sessions & PJ Harvey', 'Desert Sessions PJ Harvey')
"""
__author__ = 'Jack Kaliko'
-__version__ = '0.3'
+__version__ = '0.4'
# IMPORTS
-from re import (compile, U, I)
+import unicodedata
+from re import compile as re_compile, U, I
+
+from ..utils.leven import levenshtein_ratio
class SimaStr(str):
Specific string object for artist names and song titles.
Here follows some class variables for regex to run on strings.
"""
+ # When true, __init__ also calls remove_diacritics() on the stripped string.
+ diafilter = True
+ # Threshold used by __eq__: two instances are equal when levenshtein_ratio()
+ # of their lower-cased stripped strings is greater than or equal to this.
+ leven_ratio = 0.82
regexp_dict = dict()
# Leading patterns: The Le Les
# Trailing patterns: ! ? live
# TODO: add "concert" key word
# add "Live at <somewhere>"
- regexp_dict.update({'trail': '([- !?\.]|\(? ?[Ll]ive ?\)?)'})
+ regexp_dict.update({'trail': r'([- !?\.]|\(? ?[Ll]ive ?\)?)'})
- reg_lead = compile('^(?P<lead>%(lead)s )(?P<root0>.*)$' % regexp_dict, I | U)
- reg_midl = compile('^(?P<root0>.*)(?P<mid> %(mid)s )(?P<root1>.*)' % regexp_dict, U)
- reg_trail = compile('^(?P<root0>.*?)(?P<trail>%(trail)s+$)' % regexp_dict, U)
+ reg_lead = re_compile('^(?P<lead>%(lead)s )(?P<root0>.*)$' % regexp_dict, I | U)
+ reg_midl = re_compile('^(?P<root0>.*)(?P<mid> %(mid)s )(?P<root1>.*)' % regexp_dict, U)
+ reg_trail = re_compile('^(?P<root0>.*?)(?P<trail>%(trail)s+$)' % regexp_dict, U)
def __init__(self, fuzzstr):
"""
+ Keep the original text in "orig" and build the reduced "stripped" form.
+
+ fuzzstr is the string to wrap; it must provide a strip() method.
"""
- str().__init__(fuzzstr)
self.orig = str(fuzzstr)
+ # Start from the whitespace-trimmed text; the calls below reduce it further.
self.stripped = str(fuzzstr.strip())
# fuzzy computation
self._get_root()
+ # Diacritic removal is optional, driven by the class-level switch.
+ if self.__class__.diafilter:
+ self.remove_diacritics()
+
+ def __new__(cls, fuzzstr):
+ """Create the underlying str value from fuzzstr."""
+ # str is immutable: its value has to be set here, not in __init__.
+ return super(SimaStr, cls).__new__(cls, fuzzstr)
def _get_root(self):
"""
#print sea.groupdict()
self.stripped = sea.group('root0')
+ def remove_diacritics(self):
+ """Strip diacritics from self.stripped (updates the attribute in place).
+
+ The string is decomposed with NFKD normalization, then every character
+ whose Unicode category is 'Mn' (nonspacing mark) is dropped.
+ """
+ self.stripped = ''.join(x for x in
+ unicodedata.normalize('NFKD', self.stripped)
+ if unicodedata.category(x) != 'Mn')
+
def __hash__(self):
+ # Hash the reduced form rather than the original text.
+ # NOTE(review): __eq__ is fuzzy (Levenshtein ratio on lower-cased text), so
+ # two instances may compare equal while hashing differently -- confirm this
+ # is acceptable wherever instances are used in sets or as dict keys.
return hash(self.stripped)
def __eq__(self, other):
+ """Fuzzy equality on the stripped strings.
+
+ A non-SimaStr operand is first wrapped in SimaStr. The result is True
+ when both stripped strings hash alike, or when the levenshtein_ratio
+ of their lower-cased forms reaches the class-level leven_ratio.
+ """
if not isinstance(other, SimaStr):
- return hash(self) == hash(SimaStr(other))
- return hash(self) == hash(other)
+ other = SimaStr(other)
+ # Cheap shortcut first: same hash of the stripped strings means there is
+ # no need to pay for the Levenshtein computation.
+ if hash(self) == hash(other):
+ return True
+ levenr = levenshtein_ratio(self.stripped.lower(),
+ other.stripped.lower())
+ return levenr >= self.__class__.leven_ratio
def __ne__(self, other):
if not isinstance(other, SimaStr):
return hash(self) != hash(other)
-# Script starts here
-if __name__ == "__main__":
- import time
- print(SimaStr('Kétanoue'))
- #from leven import levenshtein_ratio
- CASES_LIST = list([
- dict({
- 'got': 'Guns N\' Roses (live)!! !',
- 'look for': 'Guns And Roses'}),
- dict({
- 'got': 'Jesus & Mary Chains',
- 'look for': 'The Jesus and Mary Chains - live'}),
- dict({
- 'got': 'Desert sessions',
- 'look for': 'The Desert Sessions'}),
- dict({
- 'got': 'Têtes Raides',
- 'look for': 'Les Têtes Raides'}),
- dict({
- 'got': 'Noir Désir',
- 'look for': 'Noir Désir'}),
- dict({
- 'got': 'No Future',
- 'look for': 'Future'})])
-
- for case in CASES_LIST[:]:
- str0 = case.get('got')
- str1 = case.get('look for')
- fz_str0 = SimaStr(str0)
- fz_str1 = SimaStr(str1)
- print(fz_str0, '\n', fz_str1)
- print(fz_str0.stripped == fz_str1.stripped)
- #print levenshtein_ratio(fz_str0.lower(), fz_str1.lower())
- time.sleep(1)
-
# VIM MODLINE
# vim: ai ts=4 sw=4 sts=4 expandtab