1 # -*- coding: utf-8 -*-
4 # Copyright (c) 2009, 2010, 2013 Jack Kaliko <kaliko@azylum.org>
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as
8 # published by the Free Software Foundation; either version 3 of the
9 # License, or (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public
17 # License along with this program.
18 # If not, see <http://www.gnu.org/licenses/>.
24 Special unicode() subclass to perform fuzzy match on specific strings with
27 * SimaStr() object removes specific patterns from the string
28 * Diacritics are removed
29 * Equality test is done on lower-cased string
30 * Equality test is not an exact comparison: the Levenshtein edit distance
31 between stripped and filtered strings is used
33 >>> from simastr import SimaStr
34 >>> art0 = SimaStr('The Desert Sessions & PJ Harvey')
35 >>> art1 = SimaStr('Desert Sessions And PJ Harvey')
38 >>> art0 == 'Desert Sessions And PJ Harvey'
40 >>> # diacritic filter + levenshtein example
41 >>> art0 = SimaStr('Hubert Félix Thiéphaine')
42 >>> art1 = SimaStr('Hubert-Felix Thiephaine')
47 Current stripped word patterns (usually English followed by French and
49 leading (case-insensitive):
50 "the","le","la","les","el","los"
52 "[Aa]nd","&","[Nn]'?","[Ee]t"
54 combination of "[- !?\.]+" "\(? ?[Ll]ive ?\)?"
57 The stripped string is accessible through the ``stripped`` attribute:
59 >>> art0 = SimaStr('The Desert Sessions & PJ Harvey')
60 >>> print((art0, art0.stripped))
61 ('The Desert Sessions & PJ Harvey', 'Desert Sessions PJ Harvey')
64 * Have a look at difflib.SequenceMatcher to find possible improvements
65 * Find a way to allow user-defined patterns.
68 __author__ = 'Jack Kaliko'
73 from re import (compile, U, I)
75 from ..utils.leven import levenshtein_ratio
80 Specific string object for artist names and song titles.
81 Here follow some class variables holding the regexps run on strings.
# Regex fragments used to strip noise words from names. Each fragment is
# stored in regexp_dict under a key and interpolated by name (``%(key)s``)
# into the compiled expressions at the end of this section.
#
# All patterns are raw string literals so that backslashes such as ``\.``
# and ``\(`` reach the regex engine untouched; in ordinary literals these
# are invalid escape sequences that newer Python versions warn about.

# Leading patterns: The Le La Les Los El
# case-insensitive matching for this RE (reg_lead is compiled with I)
regexp_dict.update({'lead': r'(the|l[ae][s]?|los|el)'})

# Middle patterns: And & Et N
# NOTE(review): reg_midl is compiled without the I flag and only lower-case
# "et" is listed here, so "Et" is not matched -- confirm this is intended.
regexp_dict.update({'mid': r"(And|&|and|[Nn]'?|et)"})

# Trailing patterns: ! ? live
# TODO: add "concert" key word
#       add "Live at <somewhere>"
regexp_dict.update({'trail': r'([- !?\.]|\(? ?[Ll]ive ?\)?)'})

# Leading word followed by a space; the remainder is captured as "root0".
reg_lead = compile(
    r'^(?P<lead>%(lead)s )(?P<root0>.*)$' % regexp_dict, I | U)
# Space-delimited middle word; text on each side is "root0" / "root1".
reg_midl = compile(
    r'^(?P<root0>.*)(?P<mid> %(mid)s )(?P<root1>.*)' % regexp_dict, U)
# One or more trailing fragments at end of string; the non-greedy "root0"
# keeps everything that precedes them.
reg_trail = compile(
    r'^(?P<root0>.*?)(?P<trail>%(trail)s+$)' % regexp_dict, U)
def __init__(self, fuzzstr):
    """Record the original text and build its stripped counterpart.

    ``orig`` receives ``str(fuzzstr)``. ``stripped`` starts as ``fuzzstr``
    without surrounding whitespace and then has its diacritics removed by
    ``remove_diacritics()``.
    """
    self.orig = str(fuzzstr)
    trimmed = fuzzstr.strip()
    self.stripped = str(trimmed)
    self.remove_diacritics()
def __new__(cls, fuzzstr):
    """Create the instance through the parent class, passing *fuzzstr* on.

    NOTE(review): the class header is not visible from here; the module
    docstring describes SimaStr as a string subclass -- confirm.
    """
    parent = super(SimaStr, cls)
    return parent.__new__(cls, fuzzstr)
115 Remove all patterns in string.
117 sea = SimaStr.reg_lead.search(self.stripped)
119 #print sea.groupdict()
120 self.stripped = sea.group('root0')
122 sea = SimaStr.reg_midl.search(self.stripped)
124 #print sea.groupdict()
125 self.stripped = str().join([sea.group('root0'), ' ',
128 sea = SimaStr.reg_trail.search(self.stripped)
130 #print sea.groupdict()
131 self.stripped = sea.group('root0')
def remove_diacritics(self):
    """Remove diacritics from ``self.stripped`` in place.

    The text is decomposed with NFKD normalization, then every character
    whose Unicode category is ``Mn`` (nonspacing mark) is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', self.stripped)
    kept_chars = [char for char in decomposed
                  if unicodedata.category(char) != 'Mn']
    self.stripped = ''.join(kept_chars)
139 return hash(self.stripped)
def __eq__(self, other):
    """Fuzzy equality test.

    ``other`` is wrapped in a SimaStr when it is not one already. Two
    objects with the same hash are equal outright; otherwise they are
    considered equal when the Levenshtein ratio of their lower-cased
    ``stripped`` strings is at least 0.82.
    """
    if not isinstance(other, SimaStr):
        other = SimaStr(other)
    # Cheap check first: when the hashes agree the answer is already known,
    # so the Levenshtein ratio does not need to be computed.
    if hash(self) == hash(other):
        return True
    levenr = levenshtein_ratio(self.stripped.lower(),
                               other.stripped.lower())
    return levenr >= 0.82
def __ne__(self, other):
    """Hash-based inequality test.

    Compares ``hash(self)`` with the hash of ``other``, wrapping ``other``
    in a SimaStr first when it is not one already. No Levenshtein ratio is
    involved here.

    NOTE(review): this is not the negation of the fuzzy ``__eq__``, so two
    objects can compare both ``==`` and ``!=`` -- presumably intentional,
    to tell exact matches from fuzzy ones; confirm against callers.
    """
    candidate = other if isinstance(other, SimaStr) else SimaStr(other)
    return hash(self) != hash(candidate)
157 if __name__ == "__main__":
159 print(SimaStr('Kétanoue'))
160 #from leven import levenshtein_ratio
163 'got': 'Guns N\' Roses (live)!! !',
164 'look for': 'Guns And Roses'}),
166 'got': 'Jesus & Mary Chains',
167 'look for': 'The Jesus and Mary Chains - live'}),
169 'got': 'Desert sessions',
170 'look for': 'The Desert Sessions'}),
172 'got': 'Têtes Raides',
173 'look for': 'Les Têtes Raides'}),
176 'look for': 'Noir Désir'}),
179 'look for': 'Future'})])
# Walk a copy of the sample cases, printing both names and whether their
# stripped forms are exactly equal (plain string comparison, not fuzzy ==).
for sample in CASES_LIST[:]:
    got_name = sample.get('got')
    wanted_name = sample.get('look for')
    fz_got = SimaStr(got_name)
    fz_wanted = SimaStr(wanted_name)
    print(fz_got, '\n', fz_wanted)
    stripped_match = fz_got.stripped == fz_wanted.stripped
    print(stripped_match)
192 # vim: ai ts=4 sw=4 sts=4 expandtab