001/* 002 * The contents of this file are subject to the terms of the Common Development and 003 * Distribution License (the License). You may not use this file except in compliance with the 004 * License. 005 * 006 * You can obtain a copy of the License at legal/CDDLv1.0.txt. See the License for the 007 * specific language governing permission and limitations under the License. 008 * 009 * When distributing Covered Software, include this CDDL Header Notice in each file and include 010 * the License file at legal/CDDLv1.0.txt. If applicable, add the following below the CDDL 011 * Header, with the fields enclosed by brackets [] replaced by your own identifying 012 * information: "Portions Copyright [year] [name of copyright owner]". 013 * 014 * Copyright 2008 Sun Microsystems, Inc. 015 * Portions Copyright 2014 ForgeRock AS. 016 */ 017package org.opends.server.util; 018 019 020 021import static org.forgerock.util.Reject.*; 022 023 024 025/** 026 * This class provides an implementation of the Levenshtein distance algorithm, 027 * which may be used to determine the minimum number of changes required to 028 * transform one string into another. For the purpose of this algorithm, a 029 * change is defined as replacing one character with another, inserting a new 030 * character, or removing an existing character. 031 * <BR><BR> 032 * The basic algorithm works as follows for a source string S of length X and 033 * a target string T of length Y: 034 * <OL> 035 * <LI>Create a matrix M with dimensions of X+1, Y+1.</LI> 036 * <LI>Fill the first row with sequentially-increasing values 0 through 037 * X.</LI> 038 * <LI>Fill the first column with sequentially-increasing values 0 through 039 * Y.</LI> 040 * <LI>Create a nested loop iterating over the characters in the strings. In 041 * the outer loop, iterate through characters in S from 0 to X-1 using an 042 * iteration counter I. In the inner loop, iterate through the characters 043 * in T from 0 to Y-1 using an iterator counter J. Calculate the 044 * following three things and place the smallest value in the matrix in 045 * row I+1 column J+1: 046 * <UL> 047 * <LI>One more than the value in the matrix position immediately to the 048 * left (i.e., 1 + M[I][J+1]).</LI> 049 * <LI>One more than the value in the matrix position immediately above 050 * (i.e., 1 + M[I+1][J]).</LI> 051 * <LI>Define a value V to be zero if S[I] is the same as T[J], or one if 052 * they are different. Add that value to the value in the matrix 053 * position immediately above and to the left (i.e., 054 * V + M[I][J]).</LI> 055 * </UL> 056 * </LI> 057 * <LI>The Levenshtein distance value, which is the least number of changes 058 * needed to transform the source string into the target string, will be 059 * the value in the bottom right corner of the matrix (i.e., 060 * M[X][Y]).</LI> 061 * </OL> 062 * <BR><BR> 063 * Note that this is a completely "clean room" implementation, developed from a 064 * description of the algorithm, rather than copying an existing implementation. 065 * Doing it in this way eliminates copyright and licensing concerns associated 066 * with using an existing implementation. 067 */ 068@org.opends.server.types.PublicAPI( 069 stability=org.opends.server.types.StabilityLevel.UNCOMMITTED, 070 mayInstantiate=false, 071 mayExtend=false, 072 mayInvoke=true) 073public final class LevenshteinDistance 074{ 075 /** 076 * Calculates the Levenshtein distance between the provided string values. 077 * 078 * @param source The source string to compare. It must not be {@code null}. 079 * @param target The target string to compare. It must not be {@code null}. 080 * 081 * @return The minimum number of changes required to turn the source string 082 * into the target string. 083 */ 084 public static int calculate(String source, String target) 085 { 086 ifNull(source, target); 087 088 // sl == source length; tl == target length 089 int sl = source.length(); 090 int tl = target.length(); 091 092 093 // If either of the lengths is zero, then the distance is the length of the 094 // other string. 095 if (sl == 0) 096 { 097 return tl; 098 } 099 else if (tl == 0) 100 { 101 return sl; 102 } 103 104 105 // w == matrix width; h == matrix height 106 int w = sl+1; 107 int h = tl+1; 108 109 110 // m == matrix array 111 // Create the array and fill it with values 0..sl in the first dimension and 112 // 0..tl in the second dimension. 113 int[][] m = new int[w][h]; 114 for (int i=0; i < w; i++) 115 { 116 m[i][0] = i; 117 } 118 119 for (int i=1; i < h; i++) 120 { 121 m[0][i] = i; 122 } 123 124 for (int i=0,x=1; i < sl; i++,x++) 125 { 126 char s = source.charAt(i); 127 128 for (int j=0,y=1; j < tl; j++,y++) 129 { 130 char t = target.charAt(j); 131 132 133 // Figure out what to put in the appropriate matrix slot. It should be 134 // the lowest of: 135 // - One more than the value to the left 136 // - One more than the value to the top 137 // - If the characters are equal, the value to the upper left, otherwise 138 // one more than the value to the upper left. 139 m[x][y] = Math.min(Math.min((m[i][y] + 1), (m[x][j] + 1)), 140 (m[i][j] + ((s == t) ? 0 : 1))); 141 } 142 } 143 144 // The Levenshtein distance should now be the value in the lower right 145 // corner of the matrix. 146 return m[sl][tl]; 147 } 148} 149