; Computes the number of typos (Damerau-Levenshtein distance) between two short strings. ; Four types of differences are counted: ; insertion of a character, abcd ab#cd ; deletion of a character, abcd acd ; exchange of a character abcd ab$d ; inversion of adjacent chars abcd acbd ; ; This function does NOT satisfy the so-called "triangle inequality", which means ; more simply that it makes NO attempt to compute the MINIMUM edit distance in all ; cases. If you need that, you should use more complex algorithms. ; ; This simple function allows a fuzzy compare for e.g. recovering from typical ; human typos in short strings like names, address, cities... while getting rid of ; minor scripting differences. ; ; Strings are lowercased. ; String $st2 can be used as a pattern similar to the SQL 'LIKE' operator: ; '_' and trailing '%' act as in LIKE. These wildcards can be passed as parameters ; but these should contain exactly one character for the function to work properly. ; ; Complexity is in O(n^2) so don't use with long strings! ; Func _Typos(Const $st1, Const $st2, $anychar = '_', $anytail = '%') Local $s1, $s2, $pen, $del, $ins, $subst If Not IsString($st1) Then Return SetError(-1, -1, -1) If Not IsString($st2) Then Return SetError(-2, -2, -1) If $st2 = '' Then Return StringLen($st1) If $st2 == $anytail Then Return 0 If $st1 = '' Then Return(StringInStr($st2 & $anytail, $anytail, 1) - 1) EndIf ;~ $s1 = StringSplit(_LowerUnaccent($st1)), "", 2) ;; _LowerUnaccent() addon function not available here ;~ $s2 = StringSplit(_LowerUnaccent($st2)), "", 2) ;; _LowerUnaccent() addon function not available here $s1 = StringSplit(StringLower($st1), "", 2) $s2 = StringSplit(StringLower($st2), "", 2) Local $l1 = UBound($s1), $l2 = UBound($s2) Local $r[$l1 + 1][$l2 + 1] For $x = 0 To $l2 - 1 Switch $s2[$x] Case $anychar If $x < $l1 Then $s2[$x] = $s1[$x] EndIf Case $anytail $l2 = $x If $l1 > $l2 Then $l1 = $l2 EndIf ExitLoop EndSwitch $r[0][$x] = $x Next $r[0][$l2] = $l2 For $x = 0 To $l1 $r[$x][0] = $x Next For $x = 1 To $l1 For $y = 1 To $l2 $pen = Not ($s1[$x - 1] == $s2[$y - 1]) $del = $r[$x-1][$y] + 1 $ins = $r[$x][$y-1] + 1 $subst = $r[$x-1][$y-1] + $pen If $del > $ins Then $del = $ins If $del > $subst Then $del = $subst $r[$x][$y] = $del If ($pen And $x > 1 And $y > 1 And $s1[$x-1] == $s2[$y-2] And $s1[$x-2] == $s2[$y-1]) Then If $r[$x][$y] >= $r[$x-2][$y-2] Then $r[$x][$y] = $r[$x-2][$y-2] + 1 $r[$x-1][$y-1] = $r[$x][$y] EndIf Next Next Return ($r[$l1][$l2]) EndFunc ;; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ;; example usage ;; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include Local $reference = "lexicographically" Local $Words[11][2] = [ _ [$reference], _ ["Lexicôgraphicaly"], _ ["lexkographicaly"], _ ["Lexico9raphically"], _ ["lexioo9raphically"], _ ["Lexicographical"], _ ["lexicographlcally"], _ ["Lex1cogr@phically"], _ ["lexic0graphïca1yl"], _ ["lexIcOgraphically"], _ ["Lexlcographically"] _ ] For $i = 0 To UBound($Words) - 1 $Words[$i][1] = _Typos($Words[$i][0], $reference) Next _ArrayDisplay($Words, "Number of typos") ConsoleWrite("Usage of '_' and '%' wildcards in pattern:" & @LF & @TAB & "_Typos('lex1c0gr@fhlâofznho', 'LEx_c_gr%') = " & _Typos('lex1c0gr@fhlofznho', 'lex_c_gr%') & @LF) ConsoleWrite("Does not always return the absolute minimum edit distance:" & @LF & @TAB & "_Typos('bdac', 'abcd') = " & _Typos('bdac', 'abcd') & @LF) ; abcd --> bacd swap ab --> ba = 1 typo ; bacd --> badc swap cd --> dc = 2 typos ; badc --> bdac swap ad --> da = 3 typos but the function returns 4