sima/lib/simastr.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # Copyright (c) 2009, 2010, 2013 Jack Kaliko <kaliko@azylum.org>
   4 #
   5 #  This program is free software; you can redistribute it and/or modify
   6 #  it under the terms of the GNU General Public License as
   7 #  published by the Free Software Foundation; either version 3 of the
   8 #  License, or (at your option) any later version.
   9 #
  10 #  This program is distributed in the hope that it will be useful, but
  11 #  WITHOUT ANY WARRANTY; without even the implied warranty of
  12 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 #  General Public License for more details.
  14 #
  15 #  You should have received a copy of the GNU General Public
  16 #  License along with this program.
  17 #  If not, see <http://www.gnu.org/licenses/>.
  18 #
  19
  20 r"""
  21 SimaStr
  22
Special str subclass performing fuzzy matching on specific strings with
known noise.
  25
  26  * SimaStr() object removes specific patterns from the string
 * Diacritics are removed
 * Equality test is done on lower-cased strings
 * Equality test is not an exact comparison: the Levenshtein edit distance
   between the stripped and filtered strings is used
  31
>>> from simastr import SimaStr
>>> art0 = SimaStr('The Desert Sessions & PJ Harvey')
>>> art1 = SimaStr('Desert Sessions And PJ Harvey')
>>> art0 == art1
True
>>> art0 == 'Desert Sessions And PJ Harvey'
True
>>> # diacritic filter + levenshtein example
>>> art0 = SimaStr('Hubert Félix Thiéphaine')
>>> art1 = SimaStr('Hubert-Felix Thiephaine')
>>> art0 == art1
True
  44 >>>
  45
  46 Current stripped word patterns (usually English followed by French and
  47 Spanish alternatives)
  48     leading (case-insensitive):
  49             "the","le","la","les","el","los"
  50     middle:
  51             "[Aa]nd","&","[Nn]'?","[Ee]t"
  52     trailing:
  53             combination of "[- !?\.]+" "\(? ?[Ll]ive ?\)?"
  54
  55
The stripped string is accessible through the ``stripped`` attribute:

>>> art0 = SimaStr('The Desert Sessions & PJ Harvey')
>>> (art0, art0.stripped)
('The Desert Sessions & PJ Harvey', 'Desert Sessions PJ Harvey')
  61
  62 TODO:
  63     * Have a look to difflib.SequenceMatcher to find possible improvements
  64     * Find a way to allow users patterns.
  65 """
  66
# Module metadata
__author__ = 'Jack Kaliko'
__version__ = '0.4'
  69
  70 # IMPORTS
  71 import unicodedata
  72 from re import compile as re_compile, U, I
  73
  74 from ..utils.leven import levenshtein_ratio
  75
  76
  77 class SimaStr(str):
  78     """
  79     Specific string object for artist names and song titles.
  80     Here follows some class variables for regex to run on strings.
  81     """
  82     diafilter = True
  83     leven_ratio = 0.82
  84     regexp_dict = dict()
  85
  86     # Leading patterns: The Le Les
  87     # case-insensitive matching for this RE
  88     regexp_dict.update({'lead': '(the|l[ae][s]?|los|el)'})
  89
  90     # Middle patterns: And & Et N
  91     regexp_dict.update({'mid': '(And|&|and|[Nn]\'?|et)'})
  92
  93     # Trailing patterns: ! ? live
  94     # TODO: add "concert" key word
  95     #       add "Live at <somewhere>"
  96     regexp_dict.update({'trail': r'([- !?\.]|\(? ?[Ll]ive ?\)?)'})
  97
  98     reg_lead = re_compile('^(?P<lead>%(lead)s )(?P<root0>.*)$' % regexp_dict, I | U)
  99     reg_midl = re_compile('^(?P<root0>.*)(?P<mid> %(mid)s )(?P<root1>.*)' % regexp_dict, U)
 100     reg_trail = re_compile('^(?P<root0>.*?)(?P<trail>%(trail)s+$)' % regexp_dict, U)
 101
 102     def __init__(self, fuzzstr):
 103         """
 104         """
 105         self.orig = str(fuzzstr)
 106         self.stripped = str(fuzzstr.strip())
 107         # fuzzy computation
 108         self._get_root()
 109         if self.__class__.diafilter:
 110             self.remove_diacritics()
 111
 112     def __new__(cls, fuzzstr):
 113         return super(SimaStr, cls).__new__(cls, fuzzstr)
 114
 115     def _get_root(self):
 116         """
 117         Remove all patterns in string.
 118         """
 119         sea = SimaStr.reg_lead.search(self.stripped)
 120         if sea:
 121             #print sea.groupdict()
 122             self.stripped = sea.group('root0')
 123
 124         sea = SimaStr.reg_midl.search(self.stripped)
 125         if sea:
 126             #print sea.groupdict()
 127             self.stripped = str().join([sea.group('root0'), ' ',
 128                                         sea.group('root1')])
 129
 130         sea = SimaStr.reg_trail.search(self.stripped)
 131         if sea:
 132             #print sea.groupdict()
 133             self.stripped = sea.group('root0')
 134
 135     def remove_diacritics(self):
 136         """converting diacritics"""
 137         self.stripped = ''.join(x for x in
 138                                 unicodedata.normalize('NFKD', self.stripped)
 139                                 if unicodedata.category(x) != 'Mn')
 140
 141     def __hash__(self):
 142         return hash(self.stripped)
 143
 144     def __eq__(self, other):
 145         if not isinstance(other, SimaStr):
 146             other = SimaStr(other)
 147         levenr = levenshtein_ratio(self.stripped.lower(),
 148                                    other.stripped.lower())
 149         if hash(self) == hash(other):
 150             return True
 151         return levenr >= self.__class__.leven_ratio
 152
 153     def __ne__(self, other):
 154         if not isinstance(other, SimaStr):
 155             return hash(self) != hash(SimaStr(other))
 156         return hash(self) != hash(other)
 157
 158
 159 # Script starts here
 160 if __name__ == "__main__":
 161     import time
 162     print(SimaStr('Kétanoue'))
 163     #from leven import levenshtein_ratio
 164     CASES_LIST = list([
 165         dict({
 166                     'got': 'Guns N\' Roses (live)!! !',
 167                 'look for': 'Guns And Roses'}),
 168         dict({
 169                      'got': 'Jesus & Mary Chains',
 170                 'look for': 'The Jesus and Mary Chains - live'}),
 171         dict({
 172                          'got': 'Desert sessions',
 173                     'look for': 'The Desert Sessions'}),
 174         dict({
 175                          'got': 'Têtes Raides',
 176                     'look for': 'Les Têtes Raides'}),
 177         dict({
 178                          'got': 'Noir Désir',
 179                     'look for': 'Noir Désir'}),
 180         dict({
 181                          'got': 'No Future',
 182                     'look for': 'Future'})])
 183
 184     for case in CASES_LIST[:]:
 185         str0 = case.get('got')
 186         str1 = case.get('look for')
 187         fz_str0 = SimaStr(str0)
 188         fz_str1 = SimaStr(str1)
 189         print(fz_str0, '\n', fz_str1)
 190         print(fz_str0.stripped == fz_str1.stripped)
 191         #print levenshtein_ratio(fz_str0.lower(), fz_str1.lower())
 192         time.sleep(1)
 193
 194 # VIM MODLINE
 195 # vim: ai ts=4 sw=4 sts=4 expandtab