idun/vendor/github.com/xrash/smetrics/jaro.go
kekskurse 8ad61755e6 init
2023-10-12 10:20:53 +02:00

86 lines
2 KiB
Go

package smetrics
import (
"math"
)
// The Jaro distance. The result is 1 for equal strings, and 0 for completely different strings.
func Jaro(a, b string) float64 {
// If both strings are zero-length, they are completely equal,
// therefore return 1.
if len(a) == 0 && len(b) == 0 {
return 1
}
// If one string is zero-length, strings are completely different,
// therefore return 0.
if len(a) == 0 || len(b) == 0 {
return 0
}
// Define the necessary variables for the algorithm.
la := float64(len(a))
lb := float64(len(b))
matchRange := int(math.Max(0, math.Floor(math.Max(la, lb)/2.0)-1))
matchesA := make([]bool, len(a))
matchesB := make([]bool, len(b))
var matches float64 = 0
// Step 1: Matches
// Loop through each character of the first string,
// looking for a matching character in the second string.
for i := 0; i < len(a); i++ {
start := int(math.Max(0, float64(i-matchRange)))
end := int(math.Min(lb-1, float64(i+matchRange)))
for j := start; j <= end; j++ {
if matchesB[j] {
continue
}
if a[i] == b[j] {
matchesA[i] = true
matchesB[j] = true
matches++
break
}
}
}
// If there are no matches, strings are completely different,
// therefore return 0.
if matches == 0 {
return 0
}
// Step 2: Transpositions
// Loop through the matches' arrays, looking for
// unaligned matches. Count the number of unaligned matches.
unaligned := 0
j := 0
for i := 0; i < len(a); i++ {
if !matchesA[i] {
continue
}
for !matchesB[j] {
j++
}
if a[i] != b[j] {
unaligned++
}
j++
}
// The number of unaligned matches divided by two, is the number of _transpositions_.
transpositions := math.Floor(float64(unaligned / 2))
// Jaro distance is the average between these three numbers:
// 1. matches / length of string A
// 2. matches / length of string B
// 3. (matches - transpositions/matches)
// So, all that divided by three is the final result.
return ((matches / la) + (matches / lb) + ((matches - transpositions) / matches)) / 3.0
}