idun/vendor/github.com/xrash/smetrics/jaro-winkler.go
kekskurse 8ad61755e6 init
2023-10-12 10:20:53 +02:00

28 lines
999 B
Go

package smetrics
import (
"math"
)
// The Jaro-Winkler distance. The result is 1 for equal strings, and 0 for completely different strings. It is commonly used on Record Linkage stuff, thus it tries to be accurate for common typos when writing real names such as person names and street names.
// Jaro-Winkler is a modification of the Jaro algorithm. It works by first running Jaro, then boosting the score of exact matches at the beginning of the strings. Because of that, it introduces two more parameters: the boostThreshold and the prefixSize. These are commonly set to 0.7 and 4, respectively.
func JaroWinkler(a, b string, boostThreshold float64, prefixSize int) float64 {
j := Jaro(a, b)
if j <= boostThreshold {
return j
}
prefixSize = int(math.Min(float64(len(a)), math.Min(float64(prefixSize), float64(len(b)))))
var prefixMatch float64
for i := 0; i < prefixSize; i++ {
if a[i] == b[i] {
prefixMatch++
} else {
break
}
}
return j + 0.1*prefixMatch*(1.0-j)
}