lib/simastr.py

   1 # -*- coding: utf-8 -*-
   2
   3 #
   4 # Copyright (c) 2009, 2010, 2013 Jack Kaliko <kaliko@azylum.org>
   5 #
   6 #  This program is free software; you can redistribute it and/or modify
   7 #  it under the terms of the GNU General Public License as
   8 #  published by the Free Software Foundation; either version 3 of the
   9 #  License, or (at your option) any later version.
  10 #
  11 #  This program is distributed in the hope that it will be useful, but
  12 #  WITHOUT ANY WARRANTY; without even the implied warranty of
  13 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 #  General Public License for more details.
  15 #
  16 #  You should have received a copy of the GNU General Public
  17 #  License along with this program.
  18 #  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20
  21 """
  22 SimaStr
  23
  24 Special str subclass to perform fuzzy matching on specific strings with
  25 known noise.
  26
  27 Artist names often contain a leading 'The ' which might, or might not be
  28 present. Some other noise sources in artist name are 'and' words :
  29     'and'/'&'/'n'/'N'.
  30
  31 The SimaStr() object removes these words and computes equality on "stripped"
  32 strings.
  33
  34 >>> from simastr import SimaStr
  35 >>> art0 = SimaStr('The Desert Sessions & PJ Harvey')
  36 >>> art1 = SimaStr('Desert Sessions And PJ Harvey')
  37 >>> art0 == art1
  38 True
  39 >>> art0 == 'Desert Sessions And PJ Harvey'
  40 True
  41 >>>
  42
  43 Current stripped word patterns (usually English followed by French and
  44 Spanish alternatives)
  45     leading (case-insensitive):
  46             "the","le","la","les","las","el","los"
  47     middle (case-sensitive):
  48             "[Aa]nd","&","[Nn]'?","et"
  49     trailing:
  50             combination of "[- !?\.]+" "\(? ?[Ll]ive ?\)?"
  51
  52
  53 The stripped string is available through the "stripped" attribute:
  54
  55 >>> art0 = SimaStr('The Desert Sessions & PJ Harvey')
  56 >>> # both the original and the stripped strings stay available
  57 >>> (str(art0), art0.stripped)
  58 ('The Desert Sessions & PJ Harvey', 'Desert Sessions PJ Harvey')
  59
  60 TODO:
  61     * Have a look to difflib.SequenceMatcher to find possible improvements
  62     * Find a way to allow users patterns.
  63 """
  64
  65 __author__ = 'Jack Kaliko'
  66 __version__ = '0.3'
  67
  68 # IMPORTS
  69 from re import (compile, U, I)
  70
  71
  72 class SimaStr(str):
  73     """
  74     Specific string object for artist names and song titles.
  75     Here follows some class variables for regex to run on strings.
  76     """
  77     regexp_dict = dict()
  78
  79     # Leading patterns: The Le Les
  80     # case-insensitive matching for this RE
  81     regexp_dict.update({'lead': '(the|l[ae][s]?|los|el)'})
  82
  83     # Middle patterns: And & Et N
  84     regexp_dict.update({'mid': '(And|&|and|[Nn]\'?|et)'})
  85
  86     # Trailing patterns: ! ? live
  87     # TODO: add "concert" key word
  88     #       add "Live at <somewhere>"
  89     regexp_dict.update({'trail': '([- !?\.]|\(? ?[Ll]ive ?\)?)'})
  90
  91     reg_lead = compile('^(?P<lead>%(lead)s )(?P<root0>.*)$' % regexp_dict, I | U)
  92     reg_midl = compile('^(?P<root0>.*)(?P<mid> %(mid)s )(?P<root1>.*)' % regexp_dict, U)
  93     reg_trail = compile('^(?P<root0>.*?)(?P<trail>%(trail)s+$)' % regexp_dict, U)
  94
  95     def __init__(self, fuzzstr):
  96         """
  97         """
  98         str().__init__(fuzzstr)
  99         self.orig = str(fuzzstr)
 100         self.stripped = str(fuzzstr.strip())
 101         # fuzzy computation
 102         self._get_root()
 103
 104     def _get_root(self):
 105         """
 106         Remove all patterns in string.
 107         """
 108         sea = SimaStr.reg_lead.search(self.stripped)
 109         if sea:
 110             #print sea.groupdict()
 111             self.stripped = sea.group('root0')
 112
 113         sea = SimaStr.reg_midl.search(self.stripped)
 114         if sea:
 115             #print sea.groupdict()
 116             self.stripped = str().join([sea.group('root0'), ' ',
 117                                         sea.group('root1')])
 118
 119         sea = SimaStr.reg_trail.search(self.stripped)
 120         if sea:
 121             #print sea.groupdict()
 122             self.stripped = sea.group('root0')
 123
 124     def __hash__(self):
 125         return hash(self.stripped)
 126
 127     def __eq__(self, other):
 128         if not isinstance(other, SimaStr):
 129             return hash(self) == hash(SimaStr(other))
 130         return hash(self) == hash(other)
 131
 132     def __ne__(self, other):
 133         if not isinstance(other, SimaStr):
 134             return hash(self) != hash(SimaStr(other))
 135         return hash(self) != hash(other)
 136
 137
 138 # Script starts here
 139 if __name__ == "__main__":
 140     import time
 141     print(SimaStr('Kétanoue'))
 142     #from leven import levenshtein_ratio
 143     CASES_LIST = list([
 144         dict({
 145                     'got': 'Guns N\' Roses (live)!! !',
 146                 'look for': 'Guns And Roses'}),
 147         dict({
 148                      'got': 'Jesus & Mary Chains',
 149                 'look for': 'The Jesus and Mary Chains - live'}),
 150         dict({
 151                          'got': 'Desert sessions',
 152                     'look for': 'The Desert Sessions'}),
 153         dict({
 154                          'got': 'Têtes Raides',
 155                     'look for': 'Les Têtes Raides'}),
 156         dict({
 157                          'got': 'Noir Désir',
 158                     'look for': 'Noir Désir'}),
 159         dict({
 160                          'got': 'No Future',
 161                     'look for': 'Future'})])
 162
 163     for case in CASES_LIST[:]:
 164         str0 = case.get('got')
 165         str1 = case.get('look for')
 166         fz_str0 = SimaStr(str0)
 167         fz_str1 = SimaStr(str1)
 168         print(fz_str0, '\n', fz_str1)
 169         print(fz_str0.stripped == fz_str1.stripped)
 170         #print levenshtein_ratio(fz_str0.lower(), fz_str1.lower())
 171         time.sleep(1)
 172
 173 # VIM MODLINE
 174 # vim: ai ts=4 sw=4 sts=4 expandtab