1 # -*- coding: utf-8 -*-
4 # Copyright (c) 2009, 2010, 2013 Jack Kaliko <kaliko@azylum.org>
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as
8 # published by the Free Software Foundation; either version 3 of the
9 # License, or (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public
17 # License along with this program.
18 # If not, see <http://www.gnu.org/licenses/>.
24 Special unicode() subclass to perform fuzzy match on specific strings with
27 Artist names often contain a leading 'The ' which might, or might not be
28 present. Some other noise sources in artist name are 'and' words :
31 The SimaStr() object removes these words and computes equality on "stripped"
34 >>> from simastr import SimaStr
35 >>> art0 = SimaStr('The Desert Sessions & PJ Harvey')
36 >>> art1 = SimaStr('Desert Sessions And PJ Harvey')
39 >>> art0 == 'Desert Sessions And PJ Harvey'
43 Current stripped word patterns (usually English followed by French and
45 leading (case-insensitive):
46 "the","le","la","les","el","los"
48 "[Aa]nd","&","[Nn]'?","[Ee]t"
50 combination of "[- !?\.]+" "\(? ?[Ll]ive ?\)?"
53 The stripped string can be accessed directly:
55 >>> art0 = SimaStr('The Desert Sessions & PJ Harvey')
57 >>> print (art0, art0.stripped)
58 >>> ('The Desert Sessions & PJ Harvey', 'Desert Sessions PJ Harvey')
61 * Have a look at difflib.SequenceMatcher to find possible improvements
62 * Find a way to allow user-defined patterns.
65 __author__ = 'Jack Kaliko'
69 from re import (compile, U, I)
74 Specific string object for artist names and song titles.
75 Here follow some class variables holding the regexes to run on strings.
# All patterns below are raw strings: they contain regex escapes such as
# "\." and "\(" which are invalid escape sequences in a plain string
# literal (SyntaxWarning on recent Python versions). The resulting pattern
# text is unchanged.

# Leading patterns: The Le Les
# case-insensitive matching for this RE
regexp_dict.update({'lead': r'(the|l[ae][s]?|los|el)'})

# Middle patterns: And & Et N
# NOTE(review): the module documentation lists "[Ee]t" but only lowercase
# "et" is matched here (reg_midl is compiled without the I flag) -- confirm
# which one is intended.
regexp_dict.update({'mid': r"(And|&|and|[Nn]'?|et)"})

# Trailing patterns: ! ? live
# TODO: add "concert" key word
#       add "Live at <somewhere>"
regexp_dict.update({'trail': r'([- !?\.]|\(? ?[Ll]ive ?\)?)'})

# Compiled expressions; the sub-patterns above are interpolated by name.
# Each exposes the text to keep in the named group(s) root0 (and root1).
reg_lead = compile(r'^(?P<lead>%(lead)s )(?P<root0>.*)$' % regexp_dict,
                   I | U)
reg_midl = compile(
    r'^(?P<root0>.*)(?P<mid> %(mid)s )(?P<root1>.*)' % regexp_dict, U)
reg_trail = compile(
    r'^(?P<root0>.*?)(?P<trail>%(trail)s+$)' % regexp_dict, U)
95 def __init__(self, fuzzstr):
98 str().__init__(fuzzstr)
99 self.orig = str(fuzzstr)
100 self.stripped = str(fuzzstr.strip())
106 Remove all patterns in string.
108 sea = SimaStr.reg_lead.search(self.stripped)
110 #print sea.groupdict()
111 self.stripped = sea.group('root0')
113 sea = SimaStr.reg_midl.search(self.stripped)
115 #print sea.groupdict()
116 self.stripped = str().join([sea.group('root0'), ' ',
119 sea = SimaStr.reg_trail.search(self.stripped)
121 #print sea.groupdict()
122 self.stripped = sea.group('root0')
125 return hash(self.stripped)
def __eq__(self, other):
    """Fuzzy equality: compare the ``stripped`` form of both operands.

    A plain string operand is wrapped in SimaStr first so that its
    ``stripped`` attribute is available. Any operand that is not a string
    yields NotImplemented, which lets Python try the reflected comparison
    instead of failing inside the SimaStr constructor.
    """
    if not isinstance(other, SimaStr):
        if not isinstance(other, str):
            return NotImplemented
        other = SimaStr(other)
    # Compare the stripped strings themselves rather than their hashes:
    # two different strings may share the same hash value.
    return self.stripped == other.stripped
def __ne__(self, other):
    """Fuzzy inequality: the exact negation of ``self == other``.

    Delegating to ``==`` keeps a single implementation of the comparison,
    so the two operators can never disagree.
    """
    return not self == other
139 if __name__ == "__main__":
141 print(SimaStr('Kétanoue'))
142 #from leven import levenshtein_ratio
145 'got': 'Guns N\' Roses (live)!! !',
146 'look for': 'Guns And Roses'}),
148 'got': 'Jesus & Mary Chains',
149 'look for': 'The Jesus and Mary Chains - live'}),
151 'got': 'Desert sessions',
152 'look for': 'The Desert Sessions'}),
154 'got': 'Têtes Raides',
155 'look for': 'Les Têtes Raides'}),
158 'look for': 'Noir Désir'}),
161 'look for': 'Future'})])
163 for case in CASES_LIST[:]:
164 str0 = case.get('got')
165 str1 = case.get('look for')
166 fz_str0 = SimaStr(str0)
167 fz_str1 = SimaStr(str1)
168 print(fz_str0, '\n', fz_str1)
169 print(fz_str0.stripped == fz_str1.stripped)
170 #print levenshtein_ratio(fz_str0.lower(), fz_str1.lower())
174 # vim: ai ts=4 sw=4 sts=4 expandtab