sima/utils/leven.py

   1 # -*- coding: utf-8 -*-
   2 # Copyright (c) 2009, 2010, 2013 kaliko <kaliko@azylum.org>
   3 #
   4 #  This file is part of sima
   5 #
   6 #  sima is free software: you can redistribute it and/or modify
   7 #  it under the terms of the GNU General Public License as published by
   8 #  the Free Software Foundation, either version 3 of the License, or
   9 #  (at your option) any later version.
  10 #
  11 #  sima is distributed in the hope that it will be useful,
  12 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 #  GNU General Public License for more details.
  15 #
  16 #  You should have received a copy of the GNU General Public License
  17 #  along with sima.  If not, see <http://www.gnu.org/licenses/>.
  18 #
  19 """Computes levenshtein distance/ratio"""
  20
  21 def levenshtein(a_st, b_st):
  22     """Computes the Levenshtein distance between two strings."""
  23     n_a, m_b = len(a_st), len(b_st)
  24     if n_a > m_b:
  25         # Make sure n <= m, to use O(min(n_a,m_b)) space
  26         a_st, b_st = b_st, a_st
  27         n_a, m_b = m_b, n_a
  28
  29     current = list(range(n_a+1))
  30     for i in range(1, m_b+1):
  31         previous, current = current, [i]+[0]*n_a
  32         for j in range(1, n_a+1):
  33             add, delete = previous[j] + 1, current[j-1] + 1
  34             change = previous[j-1]
  35             if a_st[j-1] != b_st[i-1]:
  36                 change = change + 1
  37             current[j] = min(add, delete, change)
  38
  39     return current[n_a]
  40
  41 def levenshtein_ratio(string, strong):
  42     """
  43     Compute levenshtein ratio.
  44         Ratio = levenshtein distance / lenght of longer string
  45     The longer string length is the upper bound of levenshtein distance.
  46     """
  47     lev_dist = levenshtein(string, strong)
  48     max_len = max(len(string), len(strong))
  49     ratio = 1 - (float(lev_dist) / float(max_len))
  50     return ratio
  51
  52
  53 # VIM MODLINE
  54 # vim: ai ts=4 sw=4 sts=4 expandtab