sajari · gleicon · Apr 15, 2021 · Apr 15, 2021 · Apr 15, 2021 · Apr 16, 2021
diff --git a/distance_test.go b/distance_test.go
@@ -0,0 +1,35 @@
+package fuzzy
+
+import (
+	"testing"
+)
+
+func TestLevshtein(t *testing.T) {
+	// "hello" -> "hollaaaa": substitute e->o and o->a, then append "aaa".
+	s1, s2 := "hello", "hollaaaa"
+	const want = 5
+
+	if got := Levenshtein(&s1, &s2); got != want {
+		t.Errorf("Levenshtein(%q, %q) = %d, want %d", s1, s2, got, want)
+	}
+}
+
+func TestJaro(t *testing.T) {
+	// Compare within a tolerance: exact float equality breaks on harmless
+	// changes to the order of the arithmetic inside Jaro.
+	const want, eps = 0.6833333333333332, 1e-12
+	got := Jaro("hello", "hollaaaa")
+	if diff := got - want; diff > eps || diff < -eps {
+		t.Errorf("Jaro(%q, %q) = %v, want %v", "hello", "hollaaaa", got, want)
+	}
+}
+
+func TestJaroWinkler(t *testing.T) {
+	// Compare within a tolerance: exact float equality breaks on harmless
+	// changes to the order of the arithmetic inside JaroWinkler.
+	const want, eps = 0.8666666666666667, 1e-12
+	got := JaroWinkler("LATE", "LACE")
+	if diff := got - want; diff > eps || diff < -eps {
+		t.Errorf("JaroWinkler(%q, %q) = %v, want %v", "LATE", "LACE", got, want)
+	}
+}
diff --git a/distances.go b/distances.go
@@ -0,0 +1,132 @@
+package fuzzy
+
+// Levenshtein returns the edit distance between the strings a and b point
+// to: the fewest single-byte insertions, deletions and substitutions that
+// turn one into the other. Strings are compared byte by byte, and only one
+// row of the distance table, len(*a)+1 ints, is held at a time.
+func Levenshtein(a, b *string) int {
+	src, dst := *a, *b
+	row := make([]int, len(src)+1)
+	for col := range row {
+		row[col] = col // cost of deleting the first col bytes of src
+	}
+
+	// Fold in dst one byte at a time, overwriting the row in place.
+	for r := 0; r < len(dst); r++ {
+		diag := row[0] // cell up and to the left of the one being filled
+		row[0] = r + 1
+		for c := 0; c < len(src); c++ {
+			above := row[c+1]
+			best := diag // substitution, free when the bytes agree
+			if src[c] != dst[r] {
+				best++
+			}
+			if left := row[c] + 1; left < best {
+				best = left
+			}
+			if up := above + 1; up < best {
+				best = up
+			}
+			row[c+1] = best
+			diag = above
+		}
+	}
+	return row[len(src)]
+}
+
+// JaroWinkler returns the Jaro-Winkler similarity of s1 and s2. Higher means
+// more alike; identical strings score 1. It starts from Jaro(s1, s2) and,
+// when that exceeds 0.7, adds 0.1 * prefix * (1 - jaro), where prefix is the
+// number of leading bytes the two strings share, capped at 4.
+func JaroWinkler(s1, s2 string) float64 {
+	sim := Jaro(s1, s2)
+	// Weak matches are returned as they are, without the prefix bonus.
+	if sim <= 0.7 {
+		return sim
+	}
+
+	// Length of the common prefix, capped at 4 and at the shorter input.
+	limit := Min(4, Min(len(s1), len(s2)))
+	shared := 0
+	for shared < limit && s1[shared] == s2[shared] {
+		shared++
+	}
+
+	// Winkler bonus: move the score toward 1 in proportion to the prefix.
+	sim += 0.1 * float64(shared) * (1 - sim)
+	return sim
+}
+
+// Jaro returns the Jaro similarity of s1 and s2, compared byte by byte:
+// 1 for identical strings, 0 when either is empty or no bytes match, and
+// otherwise (m/len(s1) + m/len(s2) + (m-t)/m) / 3, where m counts matching
+// bytes and t is half the number of matched bytes that are out of order.
+func Jaro(s1, s2 string) float64 {
+	if s1 == s2 {
+		return 1.0
+	}
+
+	len1, len2 := len(s1), len(s2)
+	if len1 == 0 || len2 == 0 {
+		return 0.0
+	}
+
+	// Equal bytes count as a match only when their positions differ by at
+	// most window. It is -1 for two single-byte inputs, which then cannot
+	// match; those inputs are unequal here, so 0 is the right answer.
+	window := Max(len1, len2)/2 - 1
+
+	matches := 0.0
+	matched1 := make([]bool, len1)
+	matched2 := make([]bool, len2)
+	for i := 0; i < len1; i++ {
+		// The window is centred on i. The lower bound must follow i: a
+		// constant bound skips s2[0] for short inputs and admits matches
+		// that are too far apart for long ones.
+		lo, hi := Max(0, i-window), Min(len2, i+window+1)
+		for j := lo; j < hi; j++ {
+			if !matched2[j] && s1[i] == s2[j] {
+				matched1[i], matched2[j] = true, true
+				matches++
+				break
+			}
+		}
+	}
+	if matches == 0 {
+		return 0.0
+	}
+
+	// Pair up the matched bytes of s1 and s2 in order of appearance; every
+	// pair that differs is one half of a transposition.
+	halfTranspositions := 0.0
+	next := 0
+	for i := 0; i < len1; i++ {
+		if !matched1[i] {
+			continue
+		}
+		for !matched2[next] {
+			next++
+		}
+		if s1[i] != s2[next] {
+			halfTranspositions++
+		}
+		next++
+	}
+	t := halfTranspositions / 2
+
+	return (matches/float64(len1) + matches/float64(len2) + (matches-t)/matches) / 3
+}
+
+func Max(x, y int) int {
+	if y > x {
+		x = y // y is strictly larger, so it replaces x as the result
+	}
+	return x
+}
+
+func Min(x, y int) int {
+	if y < x {
+		x = y // y is strictly smaller, so it replaces x as the result
+	}
+	return x
+}
diff --git a/fuzzy.go b/fuzzy.go
@@ -36,10 +36,11 @@ const (
 )
 
 type Potential struct {
-	Term   string // Potential term string
-	Score  int    // Score
-	Leven  int    // Levenstein distance from the suggestion to the input
-	Method Method // How this potential was matched
+	Term        string  // Potential term string
+	Score       int     // Score; suggestPotential fills it with the term's corpus count
+	Leven       int     // Levenshtein distance from the suggestion to the input
+	JaroWinkler float64 // Jaro-Winkler similarity of the suggestion and the input (1 = identical; NOTE(review): exact-word hits store 0.0)
+	Method      Method  // How this potential was matched
 }
 
 type Counts struct {
@@ -233,40 +234,6 @@ func (model *Model) SetDivergenceThreshold(val int) {
 	model.Unlock()
 }
 
-// Calculate the Levenshtein distance between two strings
-func Levenshtein(a, b *string) int {
-	la := len(*a)
-	lb := len(*b)
-	d := make([]int, la+1)
-	var lastdiag, olddiag, temp int
-
-	for i := 1; i <= la; i++ {
-		d[i] = i
-	}
-	for i := 1; i <= lb; i++ {
-		d[0] = i
-		lastdiag = i - 1
-		for j := 1; j <= la; j++ {
-			olddiag = d[j]
-			min := d[j] + 1
-			if (d[j-1] + 1) < min {
-				min = d[j-1] + 1
-			}
-			if (*a)[j-1] == (*b)[i-1] {
-				temp = 0
-			} else {
-				temp = 1
-			}
-			if (lastdiag + temp) < min {
-				min = lastdiag + temp
-			}
-			d[j] = min
-			lastdiag = olddiag
-		}
-	}
-	return d[la]
-}
-
 // Add an array of words to train the model in bulk
 func (model *Model) Train(terms []string) {
 	for _, term := range terms {
@@ -328,6 +295,27 @@ func (model *Model) TrainQuery(term string) {
 	}
 }
 
+// TrainQueryWithUserCount trains the model with a search query term that
+// has been seen count times. A term already in the model has its query
+// count advanced by count; a new term starts with a query count of count
+// (as opposed to 1 in the standard TrainQuery).
+func (model *Model) TrainQueryWithUserCount(term string, count int) {
+	model.Lock()
+	if t, ok := model.Data[term]; ok {
+		t.Query += count
+	} else {
+		// Set Query by name so that the starting query count is count
+		// whatever the field order of Counts; other fields stay zero.
+		model.Data[term] = &Counts{Query: count}
+	}
+	model.SuffDivergence++
+	update := model.SuffDivergence > model.SuffDivergenceThreshold
+	model.Unlock()
+	if update {
+		model.updateSuffixArr()
+	}
+}
+
 // For a given term, create the partially deleted lookup keys
 func (model *Model) createSuggestKeys(term string) {
 	edits := model.EditsMulti(term, model.Depth)
@@ -482,7 +470,7 @@ func (model *Model) suggestPotential(input string, exhaustive bool) map[string]*
 
 	// 0 - If this is a dictionary term we're all good, no need to go further
 	if model.corpusCount(input) > model.Threshold {
-		suggestions[input] = &Potential{Term: input, Score: model.corpusCount(input), Leven: 0, Method: MethodIsWord}
+		suggestions[input] = &Potential{Term: input, Score: model.corpusCount(input), Leven: 0, JaroWinkler: 0.0, Method: MethodIsWord}
 		if !exhaustive {
 			return suggestions
 		}
@@ -492,7 +480,7 @@ func (model *Model) suggestPotential(input string, exhaustive bool) map[string]*
 	if sugg, ok := model.Suggest[input]; ok {
 		for _, pot := range sugg {
 			if _, ok := suggestions[pot]; !ok {
-				suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: Levenshtein(&input, &pot), Method: MethodSuggestMapsToInput}
+				suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: Levenshtein(&input, &pot), JaroWinkler: JaroWinkler(input, pot), Method: MethodSuggestMapsToInput}
 			}
 		}
 
@@ -508,7 +496,7 @@ func (model *Model) suggestPotential(input string, exhaustive bool) map[string]*
 		score := model.corpusCount(edit)
 		if score > 0 && len(edit) > 2 {
 			if _, ok := suggestions[edit]; !ok {
-				suggestions[edit] = &Potential{Term: edit, Score: score, Leven: Levenshtein(&input, &edit), Method: MethodInputDeleteMapsToDict}
+				suggestions[edit] = &Potential{Term: edit, Score: score, Leven: Levenshtein(&input, &edit), JaroWinkler: JaroWinkler(input, edit), Method: MethodInputDeleteMapsToDict}
 			}
 			if score > max {
 				max = score
@@ -530,9 +518,10 @@ func (model *Model) suggestPotential(input string, exhaustive bool) map[string]*
 			// Is this a real transpose or replace?
 			for _, pot := range sugg {
 				lev := Levenshtein(&input, &pot)
+				jw := JaroWinkler(input, pot)
 				if lev <= model.Depth+1 { // The +1 doesn't seem to impact speed, but has greater coverage when the depth is not sufficient to make suggestions
 					if _, ok := suggestions[pot]; !ok {
-						suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: lev, Method: MethodInputDeleteMapsToSuggest}
+						suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: lev, JaroWinkler: jw, Method: MethodInputDeleteMapsToSuggest}
 					}
 				}
 			}