1 # -*- coding: utf-8 -*-
3 # Copyright (c) 2009, 2010, 2013 kaliko <kaliko@azylum.org>
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as
7 # published by the Free Software Foundation; either version 3 of the
8 # License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful, but
11 # WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 # General Public License for more details.
15 # You should have received a copy of the GNU General Public
16 # License along with this program.
17 # If not, see <http://www.gnu.org/licenses/>.
23 Special unicode() subclass to perform fuzzy match on specific strings with
26 * SimaStr() object removes specific patterns from the string
* Diacritics are removed
28 * Equality test is done on lower-cased string
* Equality test is not an exact comparison: the Levenshtein edit-distance
  ratio between the stripped and filtered strings is used
32 >>> from simastr import SimaStr
33 >>> art0 = SimaStr('The Desert Sessions & PJ Harvey')
34 >>> art1 = SimaStr('Desert Sessions And PJ Harvey')
37 >>> art0 == 'Desert Sessions And PJ Harvey'
39 >>> # diacritic filter + levenshtein example
40 >>> art0 = sima.lib.simastr.SimaStr('Hubert Félix Thiéphaine')
41 >>> art1 = sima.lib.simastr.SimaStr('Hubert-Felix Thiephaine')
46 Current stripped word patterns (usually English followed by French and
48 leading (case-insensitive):
"the","le","la","les","las","el","los"
"[Aa]nd","&","[Nn]'?","et"
53 combination of "[- !?\.]+" "\(? ?[Ll]ive ?\)?"
The stripped string can be accessed through the "stripped" attribute:
58 >>> art0 = SimaStr('The Desert Sessions & PJ Harvey')
>>> (art0, art0.stripped)
('The Desert Sessions & PJ Harvey', 'Desert Sessions PJ Harvey')
* Have a look at difflib.SequenceMatcher to find possible improvements
* Find a way to allow user-defined patterns.
67 __author__ = 'Jack Kaliko'
72 from re import compile as re_compile, U, I
74 from ..utils.leven import levenshtein_ratio
79 Specific string object for artist names and song titles.
80 Here follows some class variables for regex to run on strings.
# Leading articles stripped from the start of a name: the, le, la, les,
# las, los, el. The regex built from this fragment is compiled with the
# case-insensitive flag.
regexp_dict['lead'] = '(the|l[ae][s]?|los|el)'

# Joining words stripped from the middle of a name: And, and, &, N or n
# (optionally followed by an apostrophe), et. Matching is case-sensitive.
regexp_dict['mid'] = "(And|&|and|[Nn]'?|et)"

# Trailing noise: "-", space, "!", "?", "." or an optionally parenthesised
# "live" / "Live".
# TODO: add "concert" key word
#       add "Live at <somewhere>"
regexp_dict['trail'] = r'([- !?\.]|\(? ?[Ll]ive ?\)?)'

# Each compiled expression captures the part to keep in "root0" (and
# "root1" for the middle pattern).
reg_lead = re_compile(
    '^(?P<lead>%(lead)s )(?P<root0>.*)$' % regexp_dict,
    I | U,
)
reg_midl = re_compile(
    '^(?P<root0>.*)(?P<mid> %(mid)s )(?P<root1>.*)' % regexp_dict,
    U,
)
reg_trail = re_compile(
    '^(?P<root0>.*?)(?P<trail>%(trail)s+$)' % regexp_dict,
    U,
)
def __init__(self, fuzzstr):
    """Set up the original and the comparison form of *fuzzstr*.

    ``orig`` keeps the text as given; ``stripped`` starts as the text
    without surrounding whitespace and is the value later filters rewrite.
    """
    self.orig = str(fuzzstr)
    self.stripped = str(fuzzstr.strip())
    # NOTE(review): nothing visible here runs the regex-based pattern
    # stripping before the diacritics filter -- confirm it is invoked.
    # The diacritics filter is applied only when the class-level
    # ``diafilter`` switch is truthy.
    if self.__class__.diafilter:
        self.remove_diacritics()
def __new__(cls, fuzzstr):
    """Create the instance through the parent class, using *fuzzstr* as value."""
    parent = super(SimaStr, cls)
    return parent.__new__(cls, fuzzstr)
117 Remove all patterns in string.
119 sea = SimaStr.reg_lead.search(self.stripped)
121 #print sea.groupdict()
122 self.stripped = sea.group('root0')
124 sea = SimaStr.reg_midl.search(self.stripped)
126 #print sea.groupdict()
127 self.stripped = str().join([sea.group('root0'), ' ',
130 sea = SimaStr.reg_trail.search(self.stripped)
132 #print sea.groupdict()
133 self.stripped = sea.group('root0')
def remove_diacritics(self):
    """Drop diacritical marks from ``self.stripped`` (updated in place).

    The text is decomposed with NFKD normalisation, then every character
    whose Unicode category is ``Mn`` (non-spacing mark) is discarded.
    """
    decomposed = unicodedata.normalize('NFKD', self.stripped)
    kept = [char for char in decomposed
            if unicodedata.category(char) != 'Mn']
    self.stripped = ''.join(kept)
142 return hash(self.stripped)
def __eq__(self, other):
    """Fuzzy equality between two names.

    *other* may be a ``SimaStr`` or a plain string; a plain string is
    wrapped in ``SimaStr`` first so both sides go through the same
    filtering. Any other type yields ``NotImplemented`` so Python falls
    back to its default comparison instead of failing while trying to
    build a ``SimaStr`` from it.

    Returns ``True`` when both filtered strings hash identically, or
    otherwise when the value returned by ``levenshtein_ratio`` for the
    lower-cased filtered strings reaches the class-level ``leven_ratio``
    threshold.
    """
    if not isinstance(other, SimaStr):
        if not isinstance(other, str):
            return NotImplemented
        other = SimaStr(other)
    # Cheap check first: identical filtered strings need no distance
    # computation.
    if hash(self) == hash(other):
        return True
    levenr = levenshtein_ratio(self.stripped.lower(),
                               other.stripped.lower())
    return levenr >= self.__class__.leven_ratio
def __ne__(self, other):
    """Return the logical negation of the ``==`` test.

    Delegating to ``__eq__`` keeps both operators consistent: two values
    that compare equal can never also compare as different.
    ``NotImplemented`` is passed through untouched so Python can apply
    its fallback comparison.
    """
    outcome = self.__eq__(other)
    if outcome is NotImplemented:
        return outcome
    return not outcome
160 # vim: ai ts=4 sw=4 sts=4 expandtab