Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Jaro-Winkler distance for external ranking #29

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions distance_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package fuzzy

import (
"testing"
)

// TestLevshtein checks the Levenshtein distance of one fixed pair of strings
// against its known value.
func TestLevshtein(t *testing.T) {
	from, to := "hello", "hollaaaa"

	// Levenshtein takes its arguments by pointer.
	if lev := Levenshtein(&from, &to); lev != 5 {
		t.Errorf("Lev %v", lev)
	}
}

// TestJaro checks the Jaro score of one fixed pair of strings against its
// known value.
func TestJaro(t *testing.T) {
	const (
		want      = 0.6833333333333332
		tolerance = 1e-9
	)

	s1 := "hello"
	s2 := "hollaaaa"
	j := Jaro(s1, s2)

	// Compare within a tolerance instead of with !=, so the test does not
	// depend on the exact order of floating-point operations. The condition is
	// written in negated form so that a NaN result also fails.
	if diff := j - want; !(diff <= tolerance && diff >= -tolerance) {
		t.Errorf("J %v", j)
	}
}

// TestJaroWinkler checks the Jaro-Winkler score of one fixed pair of strings
// against its known value.
func TestJaroWinkler(t *testing.T) {
	const (
		want      = 0.8666666666666667
		tolerance = 1e-9
	)

	s1 := "LATE"
	s2 := "LACE"
	jw := JaroWinkler(s1, s2)

	// Compare within a tolerance instead of with !=, so the test does not
	// depend on the exact order of floating-point operations. The condition is
	// written in negated form so that a NaN result also fails.
	if diff := jw - want; !(diff <= tolerance && diff >= -tolerance) {
		t.Errorf("JW %v", jw)
	}
}
132 changes: 132 additions & 0 deletions distances.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
package fuzzy

// Levenshtein returns the edit distance between the strings pointed to by a
// and b: the minimum number of single-byte insertions, deletions and
// substitutions needed to turn one into the other.
//
// The strings are compared byte by byte, and only one row of the distance
// table (len(*a)+1 ints) is kept in memory.
func Levenshtein(a, b *string) int {
	src, dst := *a, *b

	// row[c] holds the distance between the first c bytes of src and the
	// prefix of dst processed so far; initially that prefix is empty.
	row := make([]int, len(src)+1)
	for c := range row {
		row[c] = c
	}

	for r := 0; r < len(dst); r++ {
		// diag is the previous row's value one column to the left.
		diag := r
		row[0] = r + 1

		for c := 0; c < len(src); c++ {
			above := row[c+1]

			// Cheapest of deletion, insertion and substitution/match.
			best := above + 1
			if left := row[c] + 1; left < best {
				best = left
			}
			subst := diag
			if src[c] != dst[r] {
				subst++
			}
			if subst < best {
				best = subst
			}

			row[c+1] = best
			diag = above
		}
	}
	return row[len(src)]
}

// JaroWinkler returns the Jaro score of s1 and s2, raised for pairs that share
// a common prefix. The boost is only applied when the Jaro score is above 0.7,
// and at most the first 4 matching leading bytes are counted, each one closing
// 10% of the remaining gap to 1.
func JaroWinkler(s1, s2 string) float64 {
	score := Jaro(s1, s2)

	if score > 0.7 {
		// Length of the common prefix, compared byte by byte.
		limit := Min(len(s1), len(s2))
		shared := 0
		for shared < limit && s1[shared] == s2[shared] {
			shared++
		}

		// Only the first four prefix bytes contribute to the boost.
		shared = Min(4, shared)

		score += 0.1 * float64(shared) * (1 - score)
	}

	return score
}

// Jaro returns the Jaro similarity of s1 and s2: a value between 0 and 1 where
// 1 is returned for identical strings and 0 when either string is empty or no
// characters match. Higher values mean the strings are more alike.
//
// The strings are compared byte by byte, so a multi-byte UTF-8 character
// counts as several positions.
func Jaro(s1, s2 string) float64 {
	if s1 == s2 {
		return 1.0
	}

	len1, len2 := len(s1), len(s2)
	if len1 == 0 || len2 == 0 {
		return 0.0
	}

	// Two bytes only count as a match when their positions differ by at most
	// floor(max(len1, len2)/2) - 1.
	longest := len1
	if len2 > longest {
		longest = len2
	}
	maxDistance := longest/2 - 1

	matched1 := make([]bool, len1)
	matched2 := make([]bool, len2)
	matches := 0

	for i := 0; i < len1; i++ {
		// Candidate positions in s2: a window centred on i, clamped to s2.
		// The lower edge must follow i; a fixed lower edge would skip
		// position 0 for short strings and never narrow for long ones.
		lo := i - maxDistance
		if lo < 0 {
			lo = 0
		}
		hi := i + maxDistance + 1
		if hi > len2 {
			hi = len2
		}

		for j := lo; j < hi; j++ {
			// Each byte of s2 may be paired with at most one byte of s1.
			if !matched2[j] && s1[i] == s2[j] {
				matched1[i] = true
				matched2[j] = true
				matches++
				break
			}
		}
	}

	if matches == 0 {
		return 0.0
	}

	// Walk the matched bytes of both strings in order and count the positions
	// where they disagree; every two such positions make one transposition.
	outOfOrder := 0
	k := 0
	for i := 0; i < len1; i++ {
		if !matched1[i] {
			continue
		}
		// Advance to the next matched byte of s2.
		for !matched2[k] {
			k++
		}
		if s1[i] != s2[k] {
			outOfOrder++
		}
		k++
	}

	m := float64(matches)
	t := float64(outOfOrder) / 2

	return (m/float64(len1) +
		m/float64(len2) +
		(m-t)/m) / 3
}

// Max returns the larger of x and y.
func Max(x, y int) int {
	larger := x
	if y > larger {
		larger = y
	}
	return larger
}

// Min returns the smaller of x and y.
func Min(x, y int) int {
	smaller := x
	if y < smaller {
		smaller = y
	}
	return smaller
}
73 changes: 31 additions & 42 deletions fuzzy.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,11 @@ const (
)

// Potential describes one candidate suggestion for an input term, together
// with the metrics recorded for it.
// NOTE(review): the fields appear twice here because this span looks like a
// rendered diff (old lines followed by the re-aligned new lines); only the
// JaroWinkler field is new. Confirm against the real file.
type Potential struct {
Term string // Potential term string
Score int // Score
Leven int // Levenshtein distance from the suggestion to the input
Method Method // How this potential was matched
Term string // Potential term string
Score int // Score
Leven int // Levenshtein distance from the suggestion to the input
JaroWinkler float64 // Jaro-Winkler score between the suggestion and the input
Method Method // How this potential was matched
}

type Counts struct {
Expand Down Expand Up @@ -233,40 +234,6 @@ func (model *Model) SetDivergenceThreshold(val int) {
model.Unlock()
}

// Levenshtein returns the edit distance between the strings pointed to by a
// and b: the minimum number of single-byte insertions, deletions and
// substitutions needed to turn one into the other.
//
// The strings are compared byte by byte, and only one row of the distance
// table (len(*a)+1 ints) is kept in memory.
func Levenshtein(a, b *string) int {
	src, dst := *a, *b

	// row[c] holds the distance between the first c bytes of src and the
	// prefix of dst processed so far; initially that prefix is empty.
	row := make([]int, len(src)+1)
	for c := range row {
		row[c] = c
	}

	for r := 0; r < len(dst); r++ {
		// diag is the previous row's value one column to the left.
		diag := r
		row[0] = r + 1

		for c := 0; c < len(src); c++ {
			above := row[c+1]

			// Cheapest of deletion, insertion and substitution/match.
			best := above + 1
			if left := row[c] + 1; left < best {
				best = left
			}
			subst := diag
			if src[c] != dst[r] {
				subst++
			}
			if subst < best {
				best = subst
			}

			row[c+1] = best
			diag = above
		}
	}
	return row[len(src)]
}

// Add an array of words to train the model in bulk
func (model *Model) Train(terms []string) {
for _, term := range terms {
Expand Down Expand Up @@ -328,6 +295,27 @@ func (model *Model) TrainQuery(term string) {
}
}

// TrainQueryWithUserCount trains the model using a search query term and a
// caller-supplied query count. This builds a second popularity index of terms
// used to search, as opposed to terms generally occurring in corpus text, and
// the supplied count can be used to advise ranking (see SetCount for
// inspiration).
//
// If the term already exists in the model its query count is advanced by
// count; otherwise count is the starting query count, as opposed to the 1 used
// by the standard TrainQuery.
func (model *Model) TrainQueryWithUserCount(term string, count int) {
	model.Lock()
	if t, ok := model.Data[term]; ok {
		t.Query += count
	} else {
		// Use a keyed literal so that count seeds the query counter no matter
		// how the fields of Counts are ordered; all other counters start at
		// their zero value.
		model.Data[term] = &Counts{Query: count}
	}
	model.SuffDivergence++
	update := model.SuffDivergence > model.SuffDivergenceThreshold
	model.Unlock()

	// The suffix array is rebuilt only after the lock has been released.
	// NOTE(review): presumably updateSuffixArr takes the lock itself - confirm.
	if update {
		model.updateSuffixArr()
	}
}

// For a given term, create the partially deleted lookup keys
func (model *Model) createSuggestKeys(term string) {
edits := model.EditsMulti(term, model.Depth)
Expand Down Expand Up @@ -482,7 +470,7 @@ func (model *Model) suggestPotential(input string, exhaustive bool) map[string]*

// 0 - If this is a dictionary term we're all good, no need to go further
if model.corpusCount(input) > model.Threshold {
suggestions[input] = &Potential{Term: input, Score: model.corpusCount(input), Leven: 0, Method: MethodIsWord}
suggestions[input] = &Potential{Term: input, Score: model.corpusCount(input), Leven: 0, JaroWinkler: 0.0, Method: MethodIsWord}
if !exhaustive {
return suggestions
}
Expand All @@ -492,7 +480,7 @@ func (model *Model) suggestPotential(input string, exhaustive bool) map[string]*
if sugg, ok := model.Suggest[input]; ok {
for _, pot := range sugg {
if _, ok := suggestions[pot]; !ok {
suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: Levenshtein(&input, &pot), Method: MethodSuggestMapsToInput}
suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: Levenshtein(&input, &pot), JaroWinkler: JaroWinkler(input, pot), Method: MethodSuggestMapsToInput}
}
}

Expand All @@ -508,7 +496,7 @@ func (model *Model) suggestPotential(input string, exhaustive bool) map[string]*
score := model.corpusCount(edit)
if score > 0 && len(edit) > 2 {
if _, ok := suggestions[edit]; !ok {
suggestions[edit] = &Potential{Term: edit, Score: score, Leven: Levenshtein(&input, &edit), Method: MethodInputDeleteMapsToDict}
suggestions[edit] = &Potential{Term: edit, Score: score, Leven: Levenshtein(&input, &edit), JaroWinkler: JaroWinkler(input, edit), Method: MethodInputDeleteMapsToDict}
}
if score > max {
max = score
Expand All @@ -530,9 +518,10 @@ func (model *Model) suggestPotential(input string, exhaustive bool) map[string]*
// Is this a real transpose or replace?
for _, pot := range sugg {
lev := Levenshtein(&input, &pot)
jw := JaroWinkler(input, pot)
if lev <= model.Depth+1 { // The +1 doesn't seem to impact speed, but has greater coverage when the depth is not sufficient to make suggestions
if _, ok := suggestions[pot]; !ok {
suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: lev, Method: MethodInputDeleteMapsToSuggest}
suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: lev, JaroWinkler: jw, Method: MethodInputDeleteMapsToSuggest}
}
}
}
Expand Down