# -*- coding: utf-8 -*-
-
#
-# Copyright (c) 2009, 2010, 2013 Jack Kaliko <kaliko@azylum.org>
+# Copyright (c) 2009, 2010, 2013 kaliko <kaliko@azylum.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as
# If not, see <http://www.gnu.org/licenses/>.
#
-"""
+r"""
SimaStr
Special unicode() subclass to perform fuzzy match on specific strings with
# IMPORTS
import unicodedata
-from re import (compile, U, I)
+from re import compile as re_compile, U, I
from ..utils.leven import levenshtein_ratio
Specific string object for artist names and song titles.
Here follows some class variables for regex to run on strings.
"""
+ diafilter = True
+ leven_ratio = 0.82
regexp_dict = dict()
# Leading patterns: The Le Les
# Trailing patterns: ! ? live
# TODO: add "concert" key word
# add "Live at <somewhere>"
- regexp_dict.update({'trail': '([- !?\.]|\(? ?[Ll]ive ?\)?)'})
+ regexp_dict.update({'trail': r'([- !?\.]|\(? ?[Ll]ive ?\)?)'})
- reg_lead = compile('^(?P<lead>%(lead)s )(?P<root0>.*)$' % regexp_dict, I | U)
- reg_midl = compile('^(?P<root0>.*)(?P<mid> %(mid)s )(?P<root1>.*)' % regexp_dict, U)
- reg_trail = compile('^(?P<root0>.*?)(?P<trail>%(trail)s+$)' % regexp_dict, U)
+ reg_lead = re_compile('^(?P<lead>%(lead)s )(?P<root0>.*)$' % regexp_dict, I | U)
+ reg_midl = re_compile('^(?P<root0>.*)(?P<mid> %(mid)s )(?P<root1>.*)' % regexp_dict, U)
+ reg_trail = re_compile('^(?P<root0>.*?)(?P<trail>%(trail)s+$)' % regexp_dict, U)
def __init__(self, fuzzstr):
"""
self.stripped = str(fuzzstr.strip())
# fuzzy computation
self._get_root()
- self.remove_diacritics()
+ if self.__class__.diafilter:
+ self.remove_diacritics()
def __new__(cls, fuzzstr):
return super(SimaStr, cls).__new__(cls, fuzzstr)
self.stripped = sea.group('root0')
def remove_diacritics(self):
+ """Strip diacritics from self.stripped: NFKD-normalize it and drop 'Mn' (nonspacing mark) characters"""
self.stripped = ''.join(x for x in
unicodedata.normalize('NFKD', self.stripped)
if unicodedata.category(x) != 'Mn')
other.stripped.lower())
if hash(self) == hash(other):
return True
- return levenr >= 0.82
+ return levenr >= self.__class__.leven_ratio
def __ne__(self, other):
if not isinstance(other, SimaStr):
return hash(self) != hash(other)
-# Script starts here
-if __name__ == "__main__":
- import time
- print(SimaStr('Kétanoue'))
- #from leven import levenshtein_ratio
- CASES_LIST = list([
- dict({
- 'got': 'Guns N\' Roses (live)!! !',
- 'look for': 'Guns And Roses'}),
- dict({
- 'got': 'Jesus & Mary Chains',
- 'look for': 'The Jesus and Mary Chains - live'}),
- dict({
- 'got': 'Desert sessions',
- 'look for': 'The Desert Sessions'}),
- dict({
- 'got': 'Têtes Raides',
- 'look for': 'Les Têtes Raides'}),
- dict({
- 'got': 'Noir Désir',
- 'look for': 'Noir Désir'}),
- dict({
- 'got': 'No Future',
- 'look for': 'Future'})])
-
- for case in CASES_LIST[:]:
- str0 = case.get('got')
- str1 = case.get('look for')
- fz_str0 = SimaStr(str0)
- fz_str1 = SimaStr(str1)
- print(fz_str0, '\n', fz_str1)
- print(fz_str0.stripped == fz_str1.stripped)
- #print levenshtein_ratio(fz_str0.lower(), fz_str1.lower())
- time.sleep(1)
-
# VIM MODLINE
# vim: ai ts=4 sw=4 sts=4 expandtab