sima/lib/simastr.py

   1 # -*- coding: utf-8 -*-
   2
   3 #
   4 # Copyright (c) 2009, 2010, 2013 Jack Kaliko <kaliko@azylum.org>
   5 #
   6 #  This program is free software; you can redistribute it and/or modify
   7 #  it under the terms of the GNU General Public License as
   8 #  published by the Free Software Foundation; either version 3 of the
   9 #  License, or (at your option) any later version.
  10 #
  11 #  This program is distributed in the hope that it will be useful, but
  12 #  WITHOUT ANY WARRANTY; without even the implied warranty of
  13 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 #  General Public License for more details.
  15 #
  16 #  You should have received a copy of the GNU General Public
  17 #  License along with this program.
  18 #  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20
  21 """
  22 SimaStr
  23
  24 Special str subclass performing fuzzy matching on specific strings with
  25 known noise.
  26
  27  * A SimaStr() object removes specific patterns from the string
  28  * Diacritics are removed
  29  * Equality tests are done on lower-cased strings
  30  * Equality is not an exact comparison: the Levenshtein edit distance
  31    between the stripped and filtered strings is used
  32
  33 >>> from sima.lib.simastr import SimaStr
  34 >>> art0 = SimaStr('The Desert Sessions & PJ Harvey')
  35 >>> art1 = SimaStr('Desert Sessions And PJ Harvey')
  36 >>> art0 == art1
  37 True
  38 >>> art0 == 'Desert Sessions And PJ Harvey'
  39 True
  40 >>> # diacritic filter + Levenshtein example
  41 >>> art0 = SimaStr('Hubert Félix Thiéphaine')
  42 >>> art1 = SimaStr('Hubert-Felix Thiephaine')
  43 >>> art0 == art1
  44 True
  45 >>>
  46
  47 Current stripped word patterns (usually English followed by French and
  48 Spanish alternatives)
  49     leading (case-insensitive):
  50             "the","le","la","les","el","los"
  51     middle:
  52             "[Aa]nd","&","[Nn]'?","[Ee]t"
  53     trailing:
  54             combination of "[- !?\.]+" "\(? ?[Ll]ive ?\)?"
  55
  56
  57 The stripped string is available through the `stripped` attribute:
  58
  59 >>> art0 = SimaStr('The Desert Sessions & PJ Harvey')
  60 >>> print(art0, art0.stripped)
  61 The Desert Sessions & PJ Harvey Desert Sessions PJ Harvey
  62
  63 TODO:
  64     * Have a look at difflib.SequenceMatcher to find possible improvements
  65     * Find a way to allow user-defined patterns.
  66 """
  67
  68 __author__ = 'Jack Kaliko'
  69 __version__ = '0.4'
  70
  71 # IMPORTS
  72 import unicodedata
  73 from re import (compile, U, I)
  74
  75 from ..utils.leven import levenshtein_ratio
  76
  77
  78 class SimaStr(str):
  79     """
  80     Specific string object for artist names and song titles.
  81     Here follows some class variables for regex to run on strings.
  82     """
  83     regexp_dict = dict()
  84
  85     # Leading patterns: The Le Les
  86     # case-insensitive matching for this RE
  87     regexp_dict.update({'lead': '(the|l[ae][s]?|los|el)'})
  88
  89     # Middle patterns: And & Et N
  90     regexp_dict.update({'mid': '(And|&|and|[Nn]\'?|et)'})
  91
  92     # Trailing patterns: ! ? live
  93     # TODO: add "concert" key word
  94     #       add "Live at <somewhere>"
  95     regexp_dict.update({'trail': '([- !?\.]|\(? ?[Ll]ive ?\)?)'})
  96
  97     reg_lead = compile('^(?P<lead>%(lead)s )(?P<root0>.*)$' % regexp_dict, I | U)
  98     reg_midl = compile('^(?P<root0>.*)(?P<mid> %(mid)s )(?P<root1>.*)' % regexp_dict, U)
  99     reg_trail = compile('^(?P<root0>.*?)(?P<trail>%(trail)s+$)' % regexp_dict, U)
 100
 101     def __init__(self, fuzzstr):
 102         """
 103         """
 104         super().__init__(fuzzstr)
 105         self.orig = str(fuzzstr)
 106         self.stripped = str(fuzzstr.strip())
 107         # fuzzy computation
 108         self._get_root()
 109         self.remove_diacritics()
 110
 111     def _get_root(self):
 112         """
 113         Remove all patterns in string.
 114         """
 115         sea = SimaStr.reg_lead.search(self.stripped)
 116         if sea:
 117             #print sea.groupdict()
 118             self.stripped = sea.group('root0')
 119
 120         sea = SimaStr.reg_midl.search(self.stripped)
 121         if sea:
 122             #print sea.groupdict()
 123             self.stripped = str().join([sea.group('root0'), ' ',
 124                                         sea.group('root1')])
 125
 126         sea = SimaStr.reg_trail.search(self.stripped)
 127         if sea:
 128             #print sea.groupdict()
 129             self.stripped = sea.group('root0')
 130
 131     def remove_diacritics(self):
 132         self.stripped = ''.join(x for x in
 133                                 unicodedata.normalize('NFKD', self.stripped)
 134                                 if unicodedata.category(x) != 'Mn')
 135
 136     def __hash__(self):
 137         return hash(self.stripped)
 138
 139     def __eq__(self, other):
 140         if not isinstance(other, SimaStr):
 141             other = SimaStr(other)
 142         levenr = levenshtein_ratio(self.stripped.lower(),
 143                                    other.stripped.lower())
 144         if hash(self) == hash(other):
 145             return True
 146         return levenr >= 0.82
 147
 148     def __ne__(self, other):
 149         if not isinstance(other, SimaStr):
 150             return hash(self) != hash(SimaStr(other))
 151         return hash(self) != hash(other)
 152
 153
 154 # Script starts here
 155 if __name__ == "__main__":
 156     import time
 157     print(SimaStr('Kétanoue'))
 158     #from leven import levenshtein_ratio
 159     CASES_LIST = list([
 160         dict({
 161                     'got': 'Guns N\' Roses (live)!! !',
 162                 'look for': 'Guns And Roses'}),
 163         dict({
 164                      'got': 'Jesus & Mary Chains',
 165                 'look for': 'The Jesus and Mary Chains - live'}),
 166         dict({
 167                          'got': 'Desert sessions',
 168                     'look for': 'The Desert Sessions'}),
 169         dict({
 170                          'got': 'Têtes Raides',
 171                     'look for': 'Les Têtes Raides'}),
 172         dict({
 173                          'got': 'Noir Désir',
 174                     'look for': 'Noir Désir'}),
 175         dict({
 176                          'got': 'No Future',
 177                     'look for': 'Future'})])
 178
 179     for case in CASES_LIST[:]:
 180         str0 = case.get('got')
 181         str1 = case.get('look for')
 182         fz_str0 = SimaStr(str0)
 183         fz_str1 = SimaStr(str1)
 184         print(fz_str0, '\n', fz_str1)
 185         print(fz_str0.stripped == fz_str1.stripped)
 186         #print levenshtein_ratio(fz_str0.lower(), fz_str1.lower())
 187         time.sleep(1)
 188
 189 # VIM MODLINE
 190 # vim: ai ts=4 sw=4 sts=4 expandtab