001/*
002 * The contents of this file are subject to the terms of the Common Development and
003 * Distribution License (the License). You may not use this file except in compliance with the
004 * License.
005 *
006 * You can obtain a copy of the License at legal/CDDLv1.0.txt. See the License for the
007 * specific language governing permission and limitations under the License.
008 *
009 * When distributing Covered Software, include this CDDL Header Notice in each file and include
010 * the License file at legal/CDDLv1.0.txt. If applicable, add the following below the CDDL
011 * Header, with the fields enclosed by brackets [] replaced by your own identifying
012 * information: "Portions Copyright [year] [name of copyright owner]".
013 *
014 * Copyright 2008 Sun Microsystems, Inc.
015 * Portions Copyright 2014 ForgeRock AS.
016 */
017package org.opends.server.util;
018
019
020
021import static org.forgerock.util.Reject.*;
022
023
024
025/**
026 * This class provides an implementation of the Levenshtein distance algorithm,
027 * which may be used to determine the minimum number of changes required to
028 * transform one string into another.  For the purpose of this algorithm, a
029 * change is defined as replacing one character with another, inserting a new
030 * character, or removing an existing character.
031 * <BR><BR>
032 * The basic algorithm works as follows for a source string S of length X and
033 * a target string T of length Y:
034 * <OL>
035 *   <LI>Create a matrix M with dimensions of X+1, Y+1.</LI>
036 *   <LI>Fill the first row with sequentially-increasing values 0 through
037 *       X.</LI>
038 *   <LI>Fill the first column with sequentially-increasing values 0 through
039 *       Y.</LI>
040 *   <LI>Create a nested loop iterating over the characters in the strings.  In
041 *       the outer loop, iterate through characters in S from 0 to X-1 using an
042 *       iteration counter I.  In the inner loop, iterate through the characters
043 *       in T from 0 to Y-1 using an iterator counter J.  Calculate the
044 *       following three things and place the smallest value in the matrix in
045 *       row I+1 column J+1:
046 *     <UL>
047 *       <LI>One more than the value in the matrix position immediately to the
048 *           left (i.e., 1 + M[I][J+1]).</LI>
049 *       <LI>One more than the value in the matrix position immediately above
050 *           (i.e., 1 + M[I+1][J]).</LI>
051 *       <LI>Define a value V to be zero if S[I] is the same as T[J], or one if
052 *           they are different.  Add that value to the value in the matrix
053 *           position immediately above and to the left (i.e.,
054 *           V + M[I][J]).</LI>
055 *     </UL>
056 *   </LI>
057 *   <LI>The Levenshtein distance value, which is the least number of changes
058 *       needed to transform the source string into the target string, will be
059 *       the value in the bottom right corner of the matrix (i.e.,
060 *       M[X][Y]).</LI>
061 * </OL>
062 * <BR><BR>
063 * Note that this is a completely "clean room" implementation, developed from a
064 * description of the algorithm, rather than copying an existing implementation.
065 * Doing it in this way eliminates copyright and licensing concerns associated
066 * with using an existing implementation.
067 */
068@org.opends.server.types.PublicAPI(
069     stability=org.opends.server.types.StabilityLevel.UNCOMMITTED,
070     mayInstantiate=false,
071     mayExtend=false,
072     mayInvoke=true)
073public final class LevenshteinDistance
074{
075  /**
076   * Calculates the Levenshtein distance between the provided string values.
077   *
078   * @param  source  The source string to compare.  It must not be {@code null}.
079   * @param  target  The target string to compare.  It must not be {@code null}.
080   *
081   * @return  The minimum number of changes required to turn the source string
082   *          into the target string.
083   */
084  public static int calculate(String source, String target)
085  {
086    ifNull(source, target);
087
088    // sl == source length; tl == target length
089    int sl = source.length();
090    int tl = target.length();
091
092
093    // If either of the lengths is zero, then the distance is the length of the
094    // other string.
095    if (sl == 0)
096    {
097      return tl;
098    }
099    else if (tl == 0)
100    {
101      return sl;
102    }
103
104
105    // w == matrix width; h == matrix height
106    int w = sl+1;
107    int h = tl+1;
108
109
110    // m == matrix array
111    // Create the array and fill it with values 0..sl in the first dimension and
112    // 0..tl in the second dimension.
113    int[][] m = new int[w][h];
114    for (int i=0; i < w; i++)
115    {
116      m[i][0] = i;
117    }
118
119    for (int i=1; i < h; i++)
120    {
121      m[0][i] = i;
122    }
123
124    for (int i=0,x=1; i < sl; i++,x++)
125    {
126      char s = source.charAt(i);
127
128      for (int j=0,y=1; j < tl; j++,y++)
129      {
130        char t = target.charAt(j);
131
132
133        // Figure out what to put in the appropriate matrix slot.  It should be
134        // the lowest of:
135        // - One more than the value to the left
136        // - One more than the value to the top
137        // - If the characters are equal, the value to the upper left, otherwise
138        //   one more than the value to the upper left.
139        m[x][y] = Math.min(Math.min((m[i][y] + 1), (m[x][j] + 1)),
140                           (m[i][j] + ((s == t) ? 0 : 1)));
141      }
142    }
143
144    // The Levenshtein distance should now be the value in the lower right
145    // corner of the matrix.
146    return m[sl][tl];
147  }
148}
149