xref: /aosp_15_r20/external/licenseclassifier/stringclassifier/classifier_test.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Select test data comes from
// The Project Gutenberg eBook of The humour of Ireland, by D. J., (David James), (1866-1917) O'Donoghue
17*46c4c49dSIbrahim Kanouche
18*46c4c49dSIbrahim Kanouchepackage stringclassifier
19*46c4c49dSIbrahim Kanouche
20*46c4c49dSIbrahim Kanoucheimport (
21*46c4c49dSIbrahim Kanouche	"reflect"
22*46c4c49dSIbrahim Kanouche	"regexp"
23*46c4c49dSIbrahim Kanouche	"sort"
24*46c4c49dSIbrahim Kanouche	"testing"
25*46c4c49dSIbrahim Kanouche
26*46c4c49dSIbrahim Kanouche	"github.com/sergi/go-diff/diffmatchpatch"
27*46c4c49dSIbrahim Kanouche)
28*46c4c49dSIbrahim Kanouche
// Fixtures shared by the tests and the benchmark in this file.
//
// Several literals contain multi-byte UTF-8 punctuation (’ “ ” —). The
// offsets expected by TestClassify_MultipleMatch depend on the exact byte
// length of these strings: fellowInTheGoatSkin is 845 bytes (825 once
// removeNonWords has stripped its ASCII punctuation), and the length of
// humourOfIreland feeds the 1750 offset. Do not replace the typographic
// characters with ASCII or re-wrap the text.
var (
	// gettysburg is registered as a known value by the tests.
	gettysburg = `Four score and seven years ago our fathers brought forth
on this continent, a new nation, conceived in Liberty, and dedicated to the
proposition that all men are created equal.`
	// modifiedGettysburg is gettysburg with "a new nation" reworded.
	modifiedGettysburg = `Four score and seven years ago our fathers brought forth
on this continent, a nation that was new and improved, conceived in Liberty, and
dedicated to the proposition that all men are created equal.`
	// gettysburgExtraWord is gettysburg with "Foobar" glued onto the final
	// period, with no separating whitespace.
	gettysburgExtraWord = `Four score and seven years ago our fathers brought forth
on this continent, a new nation, conceived in Liberty, and dedicated to the
proposition that all men are created equal.Foobar`

	// declaration is registered as a known value by the tests.
	declaration = `When in the Course of human events, it becomes necessary
for one people to dissolve the political bands which have connected them with
another, and to assume among the powers of the earth, the separate and equal
station to which the Laws of Nature and of Nature's God entitle them, a decent
respect to the opinions of mankind requires that they should declare the causes
which impel them to the separation.`

	// loremipsum is registered as a known value by the tests.
	loremipsum = `Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla
varius enim mattis, rhoncus lectus id, aliquet sem. Phasellus eget ex in dolor
feugiat ultricies. Etiam interdum sit amet nisl in placerat.  Sed vitae enim
vulputate, tempus leo commodo, accumsan nulla.`
	// modifiedLorem is loremipsum with words dropped, moved and repeated, and
	// with extra runs of spaces.
	modifiedLorem = `Lorem ipsum dolor amet, consectetur adipiscing elit. Nulla
varius enim mattis, lectus id, aliquet rhoncus  sem. Phasellus eget ex in dolor
feugiat ultricies. Etiam interdum sit amet sit  nisl in placerat.  Sed vitae enim
vulputate, tempus leo commodo, accumsan nulla.`
	// lessModifiedLorem is loremipsum with only the word "sem" removed.
	lessModifiedLorem = `Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla
varius enim mattis, rhoncus lectus id, aliquet. Phasellus eget ex in dolor
feugiat ultricies. Etiam interdum sit amet nisl in placerat.  Sed vitae enim
vulputate, tempus leo commodo, accumsan nulla.`

	// humourOfIreland, fellowInTheGoatSkin and oldCrowYoungCrow are unrelated
	// passages that the tests place around the texts to be detected.
	humourOfIreland = `As a rule, Irish poets have not extracted a pessimistic
philosophy from liquor; they are “elevated,” not depressed, and do not deem
it essential to the production of a poem that its author should be a cynic or
an evil prophet. One of the best attributes of Irish poetry is its constant
expression of the natural emotions. Previous to the close of the
seventeenth[xvi] century, it is said, drunkenness was not suggested by the
poets as common in Ireland—the popularity of Bacchanalian songs since that
date seems to prove that the vice soon became a virtue. Maginn is the
noisiest of modern revellers, and easily roars the others down.
`
	fellowInTheGoatSkin = `There was a poor widow living down there near the Iron
Forge when the country was all covered with forests, and you might walk on
the tops of trees from Carnew to the Lady’s Island, and she had one boy. She
was very poor, as I said before, and was not able to buy clothes for her son.
So when she was going out she fixed him snug and combustible in the ash-pit,
and piled the warm ashes about him. The boy knew no better, and was as happy
as the day was long; and he was happier still when a neighbour[10] gave his
mother a kid to keep him company when herself was abroad. The kid and the lad
played like two may-boys; and when she was old enough to give milk, wasn’t it
a godsend to the little family? You won’t prevent the boy from growing up
into a young man, but not a screed of clothes had he then no more than when
he was a gorsoon.
`
	// NOTE(review): no expected offset in this file pins the byte length of
	// oldCrowYoungCrow (it is only ever appended after the last expected
	// match); confirm its typographic quotes and apostrophes against upstream.
	oldCrowYoungCrow = `There was an old crow teaching a young crow one day, and
he said to him, “Now, my son,” says he, “listen to the advice I’m going to
give you. If you see a person coming near you and stooping, mind yourself,
and be on your keeping; he’s stooping for a stone to throw at you.”

“But tell me,” says the young crow, “what should I do if he had a stone
already down in his pocket?”

“Musha, go ’long out of that,” says the old crow, “you’ve learned enough; the
devil another learning I’m able to give you.”
`
	// nullifiable consists only of punctuation, underscores and whitespace;
	// TestClassify_MultipleMatch expects no match for it.
	nullifiable = `[[ , _ , _ , _
? _ : _
? _ : _
? _ : _
]
}
`
	// nonWords matches one or more consecutive characters of the POSIX
	// punct class.
	nonWords = regexp.MustCompile("[[:punct:]]+")
)
102*46c4c49dSIbrahim Kanouche
103*46c4c49dSIbrahim Kanouche// removeNonWords removes non-words from the string, replacing them with empty
104*46c4c49dSIbrahim Kanouche// string. (This is meant to exercise tokenization problems.)
105*46c4c49dSIbrahim Kanouchefunc removeNonWords(s string) string {
106*46c4c49dSIbrahim Kanouche	return nonWords.ReplaceAllString(s, "")
107*46c4c49dSIbrahim Kanouche}
108*46c4c49dSIbrahim Kanouche
109*46c4c49dSIbrahim Kanouchefunc TestClassify_NearestMatch(t *testing.T) {
110*46c4c49dSIbrahim Kanouche	c := New(DefaultConfidenceThreshold, FlattenWhitespace)
111*46c4c49dSIbrahim Kanouche	c.AddValue("gettysburg", gettysburg)
112*46c4c49dSIbrahim Kanouche	c.AddValue("declaration", declaration)
113*46c4c49dSIbrahim Kanouche	c.AddValue("loremipsum", loremipsum)
114*46c4c49dSIbrahim Kanouche
115*46c4c49dSIbrahim Kanouche	tests := []struct {
116*46c4c49dSIbrahim Kanouche		description string
117*46c4c49dSIbrahim Kanouche		input       string  // input string to match
118*46c4c49dSIbrahim Kanouche		name        string  // name of expected nearest match
119*46c4c49dSIbrahim Kanouche		minConf     float64 // the lowest confidence accepted for the match
120*46c4c49dSIbrahim Kanouche		maxConf     float64 // the highest confidence we expect for this match
121*46c4c49dSIbrahim Kanouche	}{
122*46c4c49dSIbrahim Kanouche		{
123*46c4c49dSIbrahim Kanouche			description: "Full Declaration",
124*46c4c49dSIbrahim Kanouche			input:       declaration,
125*46c4c49dSIbrahim Kanouche			name:        "declaration",
126*46c4c49dSIbrahim Kanouche			minConf:     1.0,
127*46c4c49dSIbrahim Kanouche			maxConf:     1.0,
128*46c4c49dSIbrahim Kanouche		},
129*46c4c49dSIbrahim Kanouche		{
130*46c4c49dSIbrahim Kanouche			description: "Modified Lorem",
131*46c4c49dSIbrahim Kanouche			input:       modifiedLorem,
132*46c4c49dSIbrahim Kanouche			name:        "loremipsum",
133*46c4c49dSIbrahim Kanouche			minConf:     0.90,
134*46c4c49dSIbrahim Kanouche			maxConf:     0.91,
135*46c4c49dSIbrahim Kanouche		},
136*46c4c49dSIbrahim Kanouche		{
137*46c4c49dSIbrahim Kanouche			description: "Modified Gettysburg",
138*46c4c49dSIbrahim Kanouche			input:       modifiedGettysburg,
139*46c4c49dSIbrahim Kanouche			name:        "gettysburg",
140*46c4c49dSIbrahim Kanouche			minConf:     0.86,
141*46c4c49dSIbrahim Kanouche			maxConf:     0.87,
142*46c4c49dSIbrahim Kanouche		},
143*46c4c49dSIbrahim Kanouche	}
144*46c4c49dSIbrahim Kanouche
145*46c4c49dSIbrahim Kanouche	for _, tt := range tests {
146*46c4c49dSIbrahim Kanouche		m := c.NearestMatch(tt.input)
147*46c4c49dSIbrahim Kanouche
148*46c4c49dSIbrahim Kanouche		if got, want := m.Name, tt.name; got != want {
149*46c4c49dSIbrahim Kanouche			t.Errorf("NearestMatch(%q) = %q, want %q", tt.description, got, want)
150*46c4c49dSIbrahim Kanouche		}
151*46c4c49dSIbrahim Kanouche		if got, want := m.Confidence, tt.minConf; got < want {
152*46c4c49dSIbrahim Kanouche			t.Errorf("NearestMatch(%q) returned confidence %v, want minimum of %v", tt.description, got, want)
153*46c4c49dSIbrahim Kanouche		}
154*46c4c49dSIbrahim Kanouche		if got, want := m.Confidence, tt.maxConf; got > want {
155*46c4c49dSIbrahim Kanouche			t.Errorf("NearestMatch(%q) = %v, want maxiumum of %v", tt.description, got, want)
156*46c4c49dSIbrahim Kanouche		}
157*46c4c49dSIbrahim Kanouche	}
158*46c4c49dSIbrahim Kanouche}
159*46c4c49dSIbrahim Kanouche
// result describes one match that TestClassify_MultipleMatch expects back:
// which known value matched, where, and the accepted confidence window.
type result struct {
	key    string // key of expected nearest match
	offset int    // offset of match in unknown string

	// The confidence values are retrieved by simply running the classifier
	// and noting the output. A value greater than the "max" is fine and
	// the tests can be adjusted to account for it. A value less than "min"
	// should be carefully scrutinized before adjusting the tests.
	minConf float64 // the lowest confidence accepted for the match
	maxConf float64 // the highest confidence we expect for this match
}
171*46c4c49dSIbrahim Kanouche
172*46c4c49dSIbrahim Kanouchefunc TestClassify_MultipleMatch(t *testing.T) {
173*46c4c49dSIbrahim Kanouche	c := New(DefaultConfidenceThreshold, FlattenWhitespace)
174*46c4c49dSIbrahim Kanouche	c.AddValue("gettysburg", gettysburg)
175*46c4c49dSIbrahim Kanouche	c.AddValue("declaration", declaration)
176*46c4c49dSIbrahim Kanouche	c.AddValue("declaration-close", declaration[:len(declaration)/2-1]+"_"+declaration[len(declaration)/2:])
177*46c4c49dSIbrahim Kanouche	c.AddValue("loremipsum", loremipsum)
178*46c4c49dSIbrahim Kanouche
179*46c4c49dSIbrahim Kanouche	cNormalize := New(DefaultConfidenceThreshold, FlattenWhitespace, removeNonWords)
180*46c4c49dSIbrahim Kanouche	cNormalize.AddValue("gettysburg", gettysburg)
181*46c4c49dSIbrahim Kanouche
182*46c4c49dSIbrahim Kanouche	tests := []struct {
183*46c4c49dSIbrahim Kanouche		description string
184*46c4c49dSIbrahim Kanouche		c           *Classifier
185*46c4c49dSIbrahim Kanouche		input       string // input string to match
186*46c4c49dSIbrahim Kanouche		want        []result
187*46c4c49dSIbrahim Kanouche	}{
188*46c4c49dSIbrahim Kanouche		{
189*46c4c49dSIbrahim Kanouche			description: "Exact text match",
190*46c4c49dSIbrahim Kanouche			c:           c,
191*46c4c49dSIbrahim Kanouche			input:       fellowInTheGoatSkin + declaration + humourOfIreland,
192*46c4c49dSIbrahim Kanouche			want: []result{
193*46c4c49dSIbrahim Kanouche				{
194*46c4c49dSIbrahim Kanouche					key:     "declaration",
195*46c4c49dSIbrahim Kanouche					offset:  845,
196*46c4c49dSIbrahim Kanouche					minConf: 1.0,
197*46c4c49dSIbrahim Kanouche					maxConf: 1.0,
198*46c4c49dSIbrahim Kanouche				},
199*46c4c49dSIbrahim Kanouche			},
200*46c4c49dSIbrahim Kanouche		},
201*46c4c49dSIbrahim Kanouche		{
202*46c4c49dSIbrahim Kanouche			description: "Partial text match",
203*46c4c49dSIbrahim Kanouche			c:           c,
204*46c4c49dSIbrahim Kanouche			input:       fellowInTheGoatSkin + modifiedLorem + humourOfIreland,
205*46c4c49dSIbrahim Kanouche			want: []result{
206*46c4c49dSIbrahim Kanouche				{
207*46c4c49dSIbrahim Kanouche					key:     "loremipsum",
208*46c4c49dSIbrahim Kanouche					offset:  845,
209*46c4c49dSIbrahim Kanouche					minConf: 0.90,
210*46c4c49dSIbrahim Kanouche					maxConf: 0.91,
211*46c4c49dSIbrahim Kanouche				},
212*46c4c49dSIbrahim Kanouche			},
213*46c4c49dSIbrahim Kanouche		},
214*46c4c49dSIbrahim Kanouche		{
215*46c4c49dSIbrahim Kanouche			description: "Two partial matches",
216*46c4c49dSIbrahim Kanouche			c:           c,
217*46c4c49dSIbrahim Kanouche			input:       fellowInTheGoatSkin + modifiedLorem + humourOfIreland + modifiedGettysburg + oldCrowYoungCrow,
218*46c4c49dSIbrahim Kanouche			want: []result{
219*46c4c49dSIbrahim Kanouche				{
220*46c4c49dSIbrahim Kanouche					key:     "loremipsum",
221*46c4c49dSIbrahim Kanouche					offset:  845,
222*46c4c49dSIbrahim Kanouche					minConf: 0.90,
223*46c4c49dSIbrahim Kanouche					maxConf: 0.91,
224*46c4c49dSIbrahim Kanouche				},
225*46c4c49dSIbrahim Kanouche				{
226*46c4c49dSIbrahim Kanouche					key:     "gettysburg",
227*46c4c49dSIbrahim Kanouche					offset:  1750,
228*46c4c49dSIbrahim Kanouche					minConf: 0.86,
229*46c4c49dSIbrahim Kanouche					maxConf: 0.87,
230*46c4c49dSIbrahim Kanouche				},
231*46c4c49dSIbrahim Kanouche			},
232*46c4c49dSIbrahim Kanouche		},
233*46c4c49dSIbrahim Kanouche		{
234*46c4c49dSIbrahim Kanouche			description: "Partial matches of similar text",
235*46c4c49dSIbrahim Kanouche			c:           c,
236*46c4c49dSIbrahim Kanouche			input:       fellowInTheGoatSkin + modifiedLorem + humourOfIreland + lessModifiedLorem + oldCrowYoungCrow,
237*46c4c49dSIbrahim Kanouche			want: []result{
238*46c4c49dSIbrahim Kanouche				{
239*46c4c49dSIbrahim Kanouche					key:     "loremipsum",
240*46c4c49dSIbrahim Kanouche					offset:  1750,
241*46c4c49dSIbrahim Kanouche					minConf: 0.98,
242*46c4c49dSIbrahim Kanouche					maxConf: 0.99,
243*46c4c49dSIbrahim Kanouche				},
244*46c4c49dSIbrahim Kanouche				{
245*46c4c49dSIbrahim Kanouche					key:     "loremipsum",
246*46c4c49dSIbrahim Kanouche					offset:  845,
247*46c4c49dSIbrahim Kanouche					minConf: 0.90,
248*46c4c49dSIbrahim Kanouche					maxConf: 0.91,
249*46c4c49dSIbrahim Kanouche				},
250*46c4c49dSIbrahim Kanouche			},
251*46c4c49dSIbrahim Kanouche		},
252*46c4c49dSIbrahim Kanouche		{
253*46c4c49dSIbrahim Kanouche			description: "Nullifiable text",
254*46c4c49dSIbrahim Kanouche			c:           c,
255*46c4c49dSIbrahim Kanouche			input:       nullifiable,
256*46c4c49dSIbrahim Kanouche			want:        nil,
257*46c4c49dSIbrahim Kanouche		},
258*46c4c49dSIbrahim Kanouche		{
259*46c4c49dSIbrahim Kanouche			description: "No match",
260*46c4c49dSIbrahim Kanouche			c:           c,
261*46c4c49dSIbrahim Kanouche			input:       fellowInTheGoatSkin + humourOfIreland,
262*46c4c49dSIbrahim Kanouche			want:        nil,
263*46c4c49dSIbrahim Kanouche		},
264*46c4c49dSIbrahim Kanouche		{
265*46c4c49dSIbrahim Kanouche			description: "Exact text match, with extra word and non-word normalizer",
266*46c4c49dSIbrahim Kanouche			c:           cNormalize,
267*46c4c49dSIbrahim Kanouche			input:       fellowInTheGoatSkin + gettysburgExtraWord + humourOfIreland,
268*46c4c49dSIbrahim Kanouche			want: []result{
269*46c4c49dSIbrahim Kanouche				{
270*46c4c49dSIbrahim Kanouche					key:     "gettysburg",
271*46c4c49dSIbrahim Kanouche					offset:  825,
272*46c4c49dSIbrahim Kanouche					minConf: 1.0,
273*46c4c49dSIbrahim Kanouche					maxConf: 1.0,
274*46c4c49dSIbrahim Kanouche				},
275*46c4c49dSIbrahim Kanouche			},
276*46c4c49dSIbrahim Kanouche		},
277*46c4c49dSIbrahim Kanouche	}
278*46c4c49dSIbrahim Kanouche
279*46c4c49dSIbrahim Kanouche	for _, tt := range tests {
280*46c4c49dSIbrahim Kanouche		matches := tt.c.MultipleMatch(tt.input)
281*46c4c49dSIbrahim Kanouche		if len(matches) != len(tt.want) {
282*46c4c49dSIbrahim Kanouche			t.Errorf("MultipleMatch(%q) not enough matches = %v, want %v", tt.description, len(matches), len(tt.want))
283*46c4c49dSIbrahim Kanouche		}
284*46c4c49dSIbrahim Kanouche
285*46c4c49dSIbrahim Kanouche		for i := 0; i < len(matches); i++ {
286*46c4c49dSIbrahim Kanouche			m := matches[i]
287*46c4c49dSIbrahim Kanouche			w := tt.want[i]
288*46c4c49dSIbrahim Kanouche			if got, want := m.Name, w.key; got != want {
289*46c4c49dSIbrahim Kanouche				t.Errorf("MultipleMatch(%q) = %q, want %q", tt.description, got, want)
290*46c4c49dSIbrahim Kanouche			}
291*46c4c49dSIbrahim Kanouche			if got, want := m.Confidence, w.minConf; got < want {
292*46c4c49dSIbrahim Kanouche				t.Errorf("MultipleMatch(%q) %q = %v, want minimum of %v", tt.description, w.key, got, want)
293*46c4c49dSIbrahim Kanouche			}
294*46c4c49dSIbrahim Kanouche			if got, want := m.Confidence, w.maxConf; got > want {
295*46c4c49dSIbrahim Kanouche				t.Errorf("MultipleMatch(%q) %q = %v, want maximum of %v", tt.description, w.key, got, want)
296*46c4c49dSIbrahim Kanouche			}
297*46c4c49dSIbrahim Kanouche			if got, want := m.Offset, w.offset; got != want {
298*46c4c49dSIbrahim Kanouche				t.Errorf("MultipleMatch(%q) %q = %v, want offset of %v", tt.description, w.key, got, want)
299*46c4c49dSIbrahim Kanouche			}
300*46c4c49dSIbrahim Kanouche		}
301*46c4c49dSIbrahim Kanouche	}
302*46c4c49dSIbrahim Kanouche}
303*46c4c49dSIbrahim Kanouche
304*46c4c49dSIbrahim Kanouchefunc TestClassify_DiffRatio(t *testing.T) {
305*46c4c49dSIbrahim Kanouche	tests := []struct {
306*46c4c49dSIbrahim Kanouche		x, y string
307*46c4c49dSIbrahim Kanouche		want float64
308*46c4c49dSIbrahim Kanouche	}{
309*46c4c49dSIbrahim Kanouche		{"", "", 1.0},
310*46c4c49dSIbrahim Kanouche		{"a", "b", 1.0},
311*46c4c49dSIbrahim Kanouche		{"", "abc", 0},
312*46c4c49dSIbrahim Kanouche		{"ab", "c", 0.5},
313*46c4c49dSIbrahim Kanouche		{"a", "bc", 0.5},
314*46c4c49dSIbrahim Kanouche		{"a", "bcde", 0.25},
315*46c4c49dSIbrahim Kanouche	}
316*46c4c49dSIbrahim Kanouche
317*46c4c49dSIbrahim Kanouche	for _, tt := range tests {
318*46c4c49dSIbrahim Kanouche		if got, want := diffRatio(tt.x, tt.y), tt.want; got != want {
319*46c4c49dSIbrahim Kanouche			t.Errorf("diffRatio(%q, %q) = %f, want %f", tt.x, tt.y, got, want)
320*46c4c49dSIbrahim Kanouche		}
321*46c4c49dSIbrahim Kanouche	}
322*46c4c49dSIbrahim Kanouche}
323*46c4c49dSIbrahim Kanouche
324*46c4c49dSIbrahim Kanouchefunc TestClassify_Matches(t *testing.T) {
325*46c4c49dSIbrahim Kanouche	tests := []struct {
326*46c4c49dSIbrahim Kanouche		description string
327*46c4c49dSIbrahim Kanouche		matches     Matches
328*46c4c49dSIbrahim Kanouche		want        Matches
329*46c4c49dSIbrahim Kanouche	}{
330*46c4c49dSIbrahim Kanouche		{
331*46c4c49dSIbrahim Kanouche			description: "Different names, same confidences, same offset",
332*46c4c49dSIbrahim Kanouche			matches: Matches{
333*46c4c49dSIbrahim Kanouche				&Match{
334*46c4c49dSIbrahim Kanouche					Name:       "b",
335*46c4c49dSIbrahim Kanouche					Confidence: 0.42,
336*46c4c49dSIbrahim Kanouche					Offset:     0,
337*46c4c49dSIbrahim Kanouche				},
338*46c4c49dSIbrahim Kanouche				&Match{
339*46c4c49dSIbrahim Kanouche					Name:       "a",
340*46c4c49dSIbrahim Kanouche					Confidence: 0.42,
341*46c4c49dSIbrahim Kanouche					Offset:     0,
342*46c4c49dSIbrahim Kanouche				},
343*46c4c49dSIbrahim Kanouche			},
344*46c4c49dSIbrahim Kanouche			want: Matches{
345*46c4c49dSIbrahim Kanouche				&Match{
346*46c4c49dSIbrahim Kanouche					Name:       "a",
347*46c4c49dSIbrahim Kanouche					Confidence: 0.42,
348*46c4c49dSIbrahim Kanouche					Offset:     0,
349*46c4c49dSIbrahim Kanouche				},
350*46c4c49dSIbrahim Kanouche				&Match{
351*46c4c49dSIbrahim Kanouche					Name:       "b",
352*46c4c49dSIbrahim Kanouche					Confidence: 0.42,
353*46c4c49dSIbrahim Kanouche					Offset:     0,
354*46c4c49dSIbrahim Kanouche				},
355*46c4c49dSIbrahim Kanouche			},
356*46c4c49dSIbrahim Kanouche		},
357*46c4c49dSIbrahim Kanouche		{
358*46c4c49dSIbrahim Kanouche			description: "Same names, different confidences, same offset",
359*46c4c49dSIbrahim Kanouche			matches: Matches{
360*46c4c49dSIbrahim Kanouche				&Match{
361*46c4c49dSIbrahim Kanouche					Name:       "b",
362*46c4c49dSIbrahim Kanouche					Confidence: 0.42,
363*46c4c49dSIbrahim Kanouche					Offset:     0,
364*46c4c49dSIbrahim Kanouche				},
365*46c4c49dSIbrahim Kanouche				&Match{
366*46c4c49dSIbrahim Kanouche					Name:       "b",
367*46c4c49dSIbrahim Kanouche					Confidence: 0.90,
368*46c4c49dSIbrahim Kanouche					Offset:     0,
369*46c4c49dSIbrahim Kanouche				},
370*46c4c49dSIbrahim Kanouche			},
371*46c4c49dSIbrahim Kanouche			want: Matches{
372*46c4c49dSIbrahim Kanouche				&Match{
373*46c4c49dSIbrahim Kanouche					Name:       "b",
374*46c4c49dSIbrahim Kanouche					Confidence: 0.90,
375*46c4c49dSIbrahim Kanouche					Offset:     0,
376*46c4c49dSIbrahim Kanouche				},
377*46c4c49dSIbrahim Kanouche				&Match{
378*46c4c49dSIbrahim Kanouche					Name:       "b",
379*46c4c49dSIbrahim Kanouche					Confidence: 0.42,
380*46c4c49dSIbrahim Kanouche					Offset:     0,
381*46c4c49dSIbrahim Kanouche				},
382*46c4c49dSIbrahim Kanouche			},
383*46c4c49dSIbrahim Kanouche		},
384*46c4c49dSIbrahim Kanouche		{
385*46c4c49dSIbrahim Kanouche			description: "Same names, same confidences, different offsets",
386*46c4c49dSIbrahim Kanouche			matches: Matches{
387*46c4c49dSIbrahim Kanouche				&Match{
388*46c4c49dSIbrahim Kanouche					Name:       "b",
389*46c4c49dSIbrahim Kanouche					Confidence: 0.42,
390*46c4c49dSIbrahim Kanouche					Offset:     42,
391*46c4c49dSIbrahim Kanouche				},
392*46c4c49dSIbrahim Kanouche				&Match{
393*46c4c49dSIbrahim Kanouche					Name:       "b",
394*46c4c49dSIbrahim Kanouche					Confidence: 0.42,
395*46c4c49dSIbrahim Kanouche					Offset:     0,
396*46c4c49dSIbrahim Kanouche				},
397*46c4c49dSIbrahim Kanouche			},
398*46c4c49dSIbrahim Kanouche			want: Matches{
399*46c4c49dSIbrahim Kanouche				&Match{
400*46c4c49dSIbrahim Kanouche					Name:       "b",
401*46c4c49dSIbrahim Kanouche					Confidence: 0.42,
402*46c4c49dSIbrahim Kanouche					Offset:     0,
403*46c4c49dSIbrahim Kanouche				},
404*46c4c49dSIbrahim Kanouche				&Match{
405*46c4c49dSIbrahim Kanouche					Name:       "b",
406*46c4c49dSIbrahim Kanouche					Confidence: 0.42,
407*46c4c49dSIbrahim Kanouche					Offset:     42,
408*46c4c49dSIbrahim Kanouche				},
409*46c4c49dSIbrahim Kanouche			},
410*46c4c49dSIbrahim Kanouche		},
411*46c4c49dSIbrahim Kanouche
412*46c4c49dSIbrahim Kanouche		{
413*46c4c49dSIbrahim Kanouche			description: "Different names, different confidences, same offset",
414*46c4c49dSIbrahim Kanouche			matches: Matches{
415*46c4c49dSIbrahim Kanouche				&Match{
416*46c4c49dSIbrahim Kanouche					Name:       "b",
417*46c4c49dSIbrahim Kanouche					Confidence: 0.42,
418*46c4c49dSIbrahim Kanouche					Offset:     0,
419*46c4c49dSIbrahim Kanouche				},
420*46c4c49dSIbrahim Kanouche				&Match{
421*46c4c49dSIbrahim Kanouche					Name:       "a",
422*46c4c49dSIbrahim Kanouche					Confidence: 0.90,
423*46c4c49dSIbrahim Kanouche					Offset:     0,
424*46c4c49dSIbrahim Kanouche				},
425*46c4c49dSIbrahim Kanouche			},
426*46c4c49dSIbrahim Kanouche			want: Matches{
427*46c4c49dSIbrahim Kanouche				&Match{
428*46c4c49dSIbrahim Kanouche					Name:       "a",
429*46c4c49dSIbrahim Kanouche					Confidence: 0.90,
430*46c4c49dSIbrahim Kanouche					Offset:     0,
431*46c4c49dSIbrahim Kanouche				},
432*46c4c49dSIbrahim Kanouche				&Match{
433*46c4c49dSIbrahim Kanouche					Name:       "b",
434*46c4c49dSIbrahim Kanouche					Confidence: 0.42,
435*46c4c49dSIbrahim Kanouche					Offset:     0,
436*46c4c49dSIbrahim Kanouche				},
437*46c4c49dSIbrahim Kanouche			},
438*46c4c49dSIbrahim Kanouche		},
439*46c4c49dSIbrahim Kanouche		{
440*46c4c49dSIbrahim Kanouche			description: "Different names, same confidences, different offset",
441*46c4c49dSIbrahim Kanouche			matches: Matches{
442*46c4c49dSIbrahim Kanouche				&Match{
443*46c4c49dSIbrahim Kanouche					Name:       "b",
444*46c4c49dSIbrahim Kanouche					Confidence: 0.42,
445*46c4c49dSIbrahim Kanouche					Offset:     37,
446*46c4c49dSIbrahim Kanouche				},
447*46c4c49dSIbrahim Kanouche				&Match{
448*46c4c49dSIbrahim Kanouche					Name:       "a",
449*46c4c49dSIbrahim Kanouche					Confidence: 0.42,
450*46c4c49dSIbrahim Kanouche					Offset:     0,
451*46c4c49dSIbrahim Kanouche				},
452*46c4c49dSIbrahim Kanouche			},
453*46c4c49dSIbrahim Kanouche			want: Matches{
454*46c4c49dSIbrahim Kanouche				&Match{
455*46c4c49dSIbrahim Kanouche					Name:       "a",
456*46c4c49dSIbrahim Kanouche					Confidence: 0.42,
457*46c4c49dSIbrahim Kanouche					Offset:     0,
458*46c4c49dSIbrahim Kanouche				},
459*46c4c49dSIbrahim Kanouche				&Match{
460*46c4c49dSIbrahim Kanouche					Name:       "b",
461*46c4c49dSIbrahim Kanouche					Confidence: 0.42,
462*46c4c49dSIbrahim Kanouche					Offset:     37,
463*46c4c49dSIbrahim Kanouche				},
464*46c4c49dSIbrahim Kanouche			},
465*46c4c49dSIbrahim Kanouche		},
466*46c4c49dSIbrahim Kanouche		{
467*46c4c49dSIbrahim Kanouche			description: "Different names, different confidences, different offset",
468*46c4c49dSIbrahim Kanouche			matches: Matches{
469*46c4c49dSIbrahim Kanouche				&Match{
470*46c4c49dSIbrahim Kanouche					Name:       "a",
471*46c4c49dSIbrahim Kanouche					Confidence: 0.42,
472*46c4c49dSIbrahim Kanouche					Offset:     0,
473*46c4c49dSIbrahim Kanouche				},
474*46c4c49dSIbrahim Kanouche				&Match{
475*46c4c49dSIbrahim Kanouche					Name:       "b",
476*46c4c49dSIbrahim Kanouche					Confidence: 0.90,
477*46c4c49dSIbrahim Kanouche					Offset:     37,
478*46c4c49dSIbrahim Kanouche				},
479*46c4c49dSIbrahim Kanouche			},
480*46c4c49dSIbrahim Kanouche			want: Matches{
481*46c4c49dSIbrahim Kanouche				&Match{
482*46c4c49dSIbrahim Kanouche					Name:       "b",
483*46c4c49dSIbrahim Kanouche					Confidence: 0.90,
484*46c4c49dSIbrahim Kanouche					Offset:     37,
485*46c4c49dSIbrahim Kanouche				},
486*46c4c49dSIbrahim Kanouche				&Match{
487*46c4c49dSIbrahim Kanouche					Name:       "a",
488*46c4c49dSIbrahim Kanouche					Confidence: 0.42,
489*46c4c49dSIbrahim Kanouche					Offset:     0,
490*46c4c49dSIbrahim Kanouche				},
491*46c4c49dSIbrahim Kanouche			},
492*46c4c49dSIbrahim Kanouche		},
493*46c4c49dSIbrahim Kanouche	}
494*46c4c49dSIbrahim Kanouche
495*46c4c49dSIbrahim Kanouche	for _, tt := range tests {
496*46c4c49dSIbrahim Kanouche		sort.Sort(tt.matches)
497*46c4c49dSIbrahim Kanouche		if !reflect.DeepEqual(tt.matches, tt.want) {
498*46c4c49dSIbrahim Kanouche			for _, x := range tt.matches {
499*46c4c49dSIbrahim Kanouche				t.Errorf("got: %v", x)
500*46c4c49dSIbrahim Kanouche			}
501*46c4c49dSIbrahim Kanouche			for _, x := range tt.want {
502*46c4c49dSIbrahim Kanouche				t.Errorf("want: %v", x)
503*46c4c49dSIbrahim Kanouche			}
504*46c4c49dSIbrahim Kanouche			t.Errorf("MatchesSort(%q) = %v, want %v", tt.description, tt.matches, tt.want)
505*46c4c49dSIbrahim Kanouche		}
506*46c4c49dSIbrahim Kanouche	}
507*46c4c49dSIbrahim Kanouche}
508*46c4c49dSIbrahim Kanouche
509*46c4c49dSIbrahim Kanouchefunc TestClassify_DiffRangeEnd(t *testing.T) {
510*46c4c49dSIbrahim Kanouche	dmp := diffmatchpatch.New()
511*46c4c49dSIbrahim Kanouche	tests := []struct {
512*46c4c49dSIbrahim Kanouche		description string
513*46c4c49dSIbrahim Kanouche		unknown     string
514*46c4c49dSIbrahim Kanouche		known       string
515*46c4c49dSIbrahim Kanouche		end         int
516*46c4c49dSIbrahim Kanouche	}{
517*46c4c49dSIbrahim Kanouche		{
518*46c4c49dSIbrahim Kanouche			description: "identical",
519*46c4c49dSIbrahim Kanouche			unknown:     declaration,
520*46c4c49dSIbrahim Kanouche			known:       declaration,
521*46c4c49dSIbrahim Kanouche			end:         1,
522*46c4c49dSIbrahim Kanouche		},
523*46c4c49dSIbrahim Kanouche		{
524*46c4c49dSIbrahim Kanouche			description: "lorem",
525*46c4c49dSIbrahim Kanouche			unknown:     lessModifiedLorem,
526*46c4c49dSIbrahim Kanouche			known:       loremipsum,
527*46c4c49dSIbrahim Kanouche			end:         3,
528*46c4c49dSIbrahim Kanouche		},
529*46c4c49dSIbrahim Kanouche		{
530*46c4c49dSIbrahim Kanouche			description: "gettysburg",
531*46c4c49dSIbrahim Kanouche			unknown:     modifiedGettysburg,
532*46c4c49dSIbrahim Kanouche			known:       gettysburg,
533*46c4c49dSIbrahim Kanouche			end:         19,
534*46c4c49dSIbrahim Kanouche		},
535*46c4c49dSIbrahim Kanouche	}
536*46c4c49dSIbrahim Kanouche
537*46c4c49dSIbrahim Kanouche	for _, tt := range tests {
538*46c4c49dSIbrahim Kanouche		diffs := dmp.DiffMain(tt.unknown, tt.known, true)
539*46c4c49dSIbrahim Kanouche		if e := diffRangeEnd(tt.known, diffs); e != tt.end {
540*46c4c49dSIbrahim Kanouche			t.Errorf("DiffRangeEnd(%q) = end %v, want %v", tt.description, e, tt.end)
541*46c4c49dSIbrahim Kanouche		}
542*46c4c49dSIbrahim Kanouche	}
543*46c4c49dSIbrahim Kanouche}
544*46c4c49dSIbrahim Kanouche
545*46c4c49dSIbrahim Kanouchefunc BenchmarkClassifier(b *testing.B) {
546*46c4c49dSIbrahim Kanouche	c := New(DefaultConfidenceThreshold, FlattenWhitespace)
547*46c4c49dSIbrahim Kanouche	c.AddValue("gettysburg", gettysburg)
548*46c4c49dSIbrahim Kanouche	c.AddValue("declaration", declaration)
549*46c4c49dSIbrahim Kanouche	c.AddValue("loremipsum", loremipsum)
550*46c4c49dSIbrahim Kanouche
551*46c4c49dSIbrahim Kanouche	b.ResetTimer()
552*46c4c49dSIbrahim Kanouche	for i := 0; i < b.N; i++ {
553*46c4c49dSIbrahim Kanouche		c.NearestMatch(modifiedLorem)
554*46c4c49dSIbrahim Kanouche	}
555*46c4c49dSIbrahim Kanouche}
556