1*46c4c49dSIbrahim Kanouche// Copyright 2017 Google Inc. 2*46c4c49dSIbrahim Kanouche// 3*46c4c49dSIbrahim Kanouche// Licensed under the Apache License, Version 2.0 (the "License"); 4*46c4c49dSIbrahim Kanouche// you may not use this file except in compliance with the License. 5*46c4c49dSIbrahim Kanouche// You may obtain a copy of the License at 6*46c4c49dSIbrahim Kanouche// 7*46c4c49dSIbrahim Kanouche// http://www.apache.org/licenses/LICENSE-2.0 8*46c4c49dSIbrahim Kanouche// 9*46c4c49dSIbrahim Kanouche// Unless required by applicable law or agreed to in writing, software 10*46c4c49dSIbrahim Kanouche// distributed under the License is distributed on an "AS IS" BASIS, 11*46c4c49dSIbrahim Kanouche// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12*46c4c49dSIbrahim Kanouche// See the License for the specific language governing permissions and 13*46c4c49dSIbrahim Kanouche// limitations under the License. 14*46c4c49dSIbrahim Kanouche// 15*46c4c49dSIbrahim Kanouche// Select test data comes from 16*46c4c49dSIbrahim Kanouche// The Project Gutenberg eBook of The humour of Ireland, by D. J., (David James), (1866-1917) O'Donoghue 17*46c4c49dSIbrahim Kanouche 18*46c4c49dSIbrahim Kanouchepackage stringclassifier 19*46c4c49dSIbrahim Kanouche 20*46c4c49dSIbrahim Kanoucheimport ( 21*46c4c49dSIbrahim Kanouche "reflect" 22*46c4c49dSIbrahim Kanouche "regexp" 23*46c4c49dSIbrahim Kanouche "sort" 24*46c4c49dSIbrahim Kanouche "testing" 25*46c4c49dSIbrahim Kanouche 26*46c4c49dSIbrahim Kanouche "github.com/sergi/go-diff/diffmatchpatch" 27*46c4c49dSIbrahim Kanouche) 28*46c4c49dSIbrahim Kanouche 29*46c4c49dSIbrahim Kanouchevar ( 30*46c4c49dSIbrahim Kanouche gettysburg = `Four score and seven years ago our fathers brought forth 31*46c4c49dSIbrahim Kanoucheon this continent, a new nation, conceived in Liberty, and dedicated to the 32*46c4c49dSIbrahim Kanoucheproposition that all men are created equal.` 33*46c4c49dSIbrahim Kanouche modifiedGettysburg = `Four score and seven years ago our fathers brought forth 34*46c4c49dSIbrahim Kanoucheon this continent, a nation that was new and improved, conceived in Liberty, and 35*46c4c49dSIbrahim Kanouchededicated to the proposition that all men are created equal.` 36*46c4c49dSIbrahim Kanouche gettysburgExtraWord = `Four score and seven years ago our fathers brought forth 37*46c4c49dSIbrahim Kanoucheon this continent, a new nation, conceived in Liberty, and dedicated to the 38*46c4c49dSIbrahim Kanoucheproposition that all men are created equal.Foobar` 39*46c4c49dSIbrahim Kanouche 40*46c4c49dSIbrahim Kanouche declaration = `When in the Course of human events, it becomes necessary 41*46c4c49dSIbrahim Kanouchefor one people to dissolve the political bands which have connected them with 42*46c4c49dSIbrahim Kanoucheanother, and to assume among the powers of the earth, the separate and equal 43*46c4c49dSIbrahim Kanouchestation to which the Laws of Nature and of Nature's God entitle them, a decent 44*46c4c49dSIbrahim Kanoucherespect to the opinions of mankind requires that they should declare the causes 45*46c4c49dSIbrahim Kanouchewhich impel them to the separation.` 46*46c4c49dSIbrahim Kanouche 47*46c4c49dSIbrahim Kanouche loremipsum = `Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla 48*46c4c49dSIbrahim Kanouchevarius enim mattis, rhoncus lectus id, aliquet sem. Phasellus eget ex in dolor 49*46c4c49dSIbrahim Kanouchefeugiat ultricies. Etiam interdum sit amet nisl in placerat. Sed vitae enim 50*46c4c49dSIbrahim Kanouchevulputate, tempus leo commodo, accumsan nulla.` 51*46c4c49dSIbrahim Kanouche modifiedLorem = `Lorem ipsum dolor amet, consectetur adipiscing elit. Nulla 52*46c4c49dSIbrahim Kanouchevarius enim mattis, lectus id, aliquet rhoncus sem. Phasellus eget ex in dolor 53*46c4c49dSIbrahim Kanouchefeugiat ultricies. Etiam interdum sit amet sit nisl in placerat. Sed vitae enim 54*46c4c49dSIbrahim Kanouchevulputate, tempus leo commodo, accumsan nulla.` 55*46c4c49dSIbrahim Kanouche lessModifiedLorem = `Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla 56*46c4c49dSIbrahim Kanouchevarius enim mattis, rhoncus lectus id, aliquet. Phasellus eget ex in dolor 57*46c4c49dSIbrahim Kanouchefeugiat ultricies. Etiam interdum sit amet nisl in placerat. Sed vitae enim 58*46c4c49dSIbrahim Kanouchevulputate, tempus leo commodo, accumsan nulla.` 59*46c4c49dSIbrahim Kanouche humourOfIreland = `As a rule, Irish poets have not extracted a pessimistic 60*46c4c49dSIbrahim Kanouchephilosophy from liquor; they are “elevated,” not depressed, and do not deem 61*46c4c49dSIbrahim Kanoucheit essential to the production of a poem that its author should be a cynic or 62*46c4c49dSIbrahim Kanouchean evil prophet. One of the best attributes of Irish poetry is its constant 63*46c4c49dSIbrahim Kanoucheexpression of the natural emotions. Previous to the close of the 64*46c4c49dSIbrahim Kanoucheseventeenth[xvi] century, it is said, drunkenness was not suggested by the 65*46c4c49dSIbrahim Kanouchepoets as common in Ireland—the popularity of Bacchanalian songs since that 66*46c4c49dSIbrahim Kanouchedate seems to prove that the vice soon became a virtue. Maginn is the 67*46c4c49dSIbrahim Kanouchenoisiest of modern revellers, and easily roars the others down. 68*46c4c49dSIbrahim Kanouche` 69*46c4c49dSIbrahim Kanouche fellowInTheGoatSkin = `There was a poor widow living down there near the Iron 70*46c4c49dSIbrahim KanoucheForge when the country was all covered with forests, and you might walk on 71*46c4c49dSIbrahim Kanouchethe tops of trees from Carnew to the Lady’s Island, and she had one boy. She 72*46c4c49dSIbrahim Kanouchewas very poor, as I said before, and was not able to buy clothes for her son. 73*46c4c49dSIbrahim KanoucheSo when she was going out she fixed him snug and combustible in the ash-pit, 74*46c4c49dSIbrahim Kanoucheand piled the warm ashes about him. The boy knew no better, and was as happy 75*46c4c49dSIbrahim Kanoucheas the day was long; and he was happier still when a neighbour[10] gave his 76*46c4c49dSIbrahim Kanouchemother a kid to keep him company when herself was abroad. The kid and the lad 77*46c4c49dSIbrahim Kanoucheplayed like two may-boys; and when she was old enough to give milk, wasn’t it 78*46c4c49dSIbrahim Kanouchea godsend to the little family? You won’t prevent the boy from growing up 79*46c4c49dSIbrahim Kanoucheinto a young man, but not a screed of clothes had he then no more than when 80*46c4c49dSIbrahim Kanouchehe was a gorsoon. 81*46c4c49dSIbrahim Kanouche` 82*46c4c49dSIbrahim Kanouche oldCrowYoungCrow = `There was an old crow teaching a young crow one day, and 83*46c4c49dSIbrahim Kanouchehe said to him, “Now, my son,” says he, “listen to the advice I’m going to 84*46c4c49dSIbrahim Kanouchegive you. If you see a person coming near you and stooping, mind yourself, 85*46c4c49dSIbrahim Kanoucheand be on your keeping; he’s stooping for a stone to throw at you.” 86*46c4c49dSIbrahim Kanouche 87*46c4c49dSIbrahim Kanouche“But tell me,” says the young crow, “what should I do if he had a stone 88*46c4c49dSIbrahim Kanouchealready down in his pocket?” 89*46c4c49dSIbrahim Kanouche 90*46c4c49dSIbrahim Kanouche“Musha, go ’long out of that,” says the old crow, “you’ve learned enough; the 91*46c4c49dSIbrahim Kanouchedevil another learning I’m able to give you.” 92*46c4c49dSIbrahim Kanouche` 93*46c4c49dSIbrahim Kanouche nullifiable = `[[ , _ , _ , _ 94*46c4c49dSIbrahim Kanouche? _ : _ 95*46c4c49dSIbrahim Kanouche? _ : _ 96*46c4c49dSIbrahim Kanouche? _ : _ 97*46c4c49dSIbrahim Kanouche] 98*46c4c49dSIbrahim Kanouche} 99*46c4c49dSIbrahim Kanouche` 100*46c4c49dSIbrahim Kanouche nonWords = regexp.MustCompile("[[:punct:]]+") 101*46c4c49dSIbrahim Kanouche) 102*46c4c49dSIbrahim Kanouche 103*46c4c49dSIbrahim Kanouche// removeNonWords removes non-words from the string, replacing them with empty 104*46c4c49dSIbrahim Kanouche// string. (This is meant to exercise tokenization problems.) 105*46c4c49dSIbrahim Kanouchefunc removeNonWords(s string) string { 106*46c4c49dSIbrahim Kanouche return nonWords.ReplaceAllString(s, "") 107*46c4c49dSIbrahim Kanouche} 108*46c4c49dSIbrahim Kanouche 109*46c4c49dSIbrahim Kanouchefunc TestClassify_NearestMatch(t *testing.T) { 110*46c4c49dSIbrahim Kanouche c := New(DefaultConfidenceThreshold, FlattenWhitespace) 111*46c4c49dSIbrahim Kanouche c.AddValue("gettysburg", gettysburg) 112*46c4c49dSIbrahim Kanouche c.AddValue("declaration", declaration) 113*46c4c49dSIbrahim Kanouche c.AddValue("loremipsum", loremipsum) 114*46c4c49dSIbrahim Kanouche 115*46c4c49dSIbrahim Kanouche tests := []struct { 116*46c4c49dSIbrahim Kanouche description string 117*46c4c49dSIbrahim Kanouche input string // input string to match 118*46c4c49dSIbrahim Kanouche name string // name of expected nearest match 119*46c4c49dSIbrahim Kanouche minConf float64 // the lowest confidence accepted for the match 120*46c4c49dSIbrahim Kanouche maxConf float64 // the highest confidence we expect for this match 121*46c4c49dSIbrahim Kanouche }{ 122*46c4c49dSIbrahim Kanouche { 123*46c4c49dSIbrahim Kanouche description: "Full Declaration", 124*46c4c49dSIbrahim Kanouche input: declaration, 125*46c4c49dSIbrahim Kanouche name: "declaration", 126*46c4c49dSIbrahim Kanouche minConf: 1.0, 127*46c4c49dSIbrahim Kanouche maxConf: 1.0, 128*46c4c49dSIbrahim Kanouche }, 129*46c4c49dSIbrahim Kanouche { 130*46c4c49dSIbrahim Kanouche description: "Modified Lorem", 131*46c4c49dSIbrahim Kanouche input: modifiedLorem, 132*46c4c49dSIbrahim Kanouche name: "loremipsum", 133*46c4c49dSIbrahim Kanouche minConf: 0.90, 134*46c4c49dSIbrahim Kanouche maxConf: 0.91, 135*46c4c49dSIbrahim Kanouche }, 136*46c4c49dSIbrahim Kanouche { 137*46c4c49dSIbrahim Kanouche description: "Modified Gettysburg", 138*46c4c49dSIbrahim Kanouche input: modifiedGettysburg, 139*46c4c49dSIbrahim Kanouche name: "gettysburg", 140*46c4c49dSIbrahim Kanouche minConf: 0.86, 141*46c4c49dSIbrahim Kanouche maxConf: 0.87, 142*46c4c49dSIbrahim Kanouche }, 143*46c4c49dSIbrahim Kanouche } 144*46c4c49dSIbrahim Kanouche 145*46c4c49dSIbrahim Kanouche for _, tt := range tests { 146*46c4c49dSIbrahim Kanouche m := c.NearestMatch(tt.input) 147*46c4c49dSIbrahim Kanouche 148*46c4c49dSIbrahim Kanouche if got, want := m.Name, tt.name; got != want { 149*46c4c49dSIbrahim Kanouche t.Errorf("NearestMatch(%q) = %q, want %q", tt.description, got, want) 150*46c4c49dSIbrahim Kanouche } 151*46c4c49dSIbrahim Kanouche if got, want := m.Confidence, tt.minConf; got < want { 152*46c4c49dSIbrahim Kanouche t.Errorf("NearestMatch(%q) returned confidence %v, want minimum of %v", tt.description, got, want) 153*46c4c49dSIbrahim Kanouche } 154*46c4c49dSIbrahim Kanouche if got, want := m.Confidence, tt.maxConf; got > want { 155*46c4c49dSIbrahim Kanouche t.Errorf("NearestMatch(%q) = %v, want maxiumum of %v", tt.description, got, want) 156*46c4c49dSIbrahim Kanouche } 157*46c4c49dSIbrahim Kanouche } 158*46c4c49dSIbrahim Kanouche} 159*46c4c49dSIbrahim Kanouche 160*46c4c49dSIbrahim Kanouchetype result struct { 161*46c4c49dSIbrahim Kanouche key string // key of expected nearest match 162*46c4c49dSIbrahim Kanouche offset int // offset of match in unknown string 163*46c4c49dSIbrahim Kanouche 164*46c4c49dSIbrahim Kanouche // The confidence values are retrieved by simply running the classifier 165*46c4c49dSIbrahim Kanouche // and noting the output. A value greater than the "max" is fine and 166*46c4c49dSIbrahim Kanouche // the tests can be adjusted to account for it. A value less than "min" 167*46c4c49dSIbrahim Kanouche // should be carefully scrutinzed before adjusting the tests. 168*46c4c49dSIbrahim Kanouche minConf float64 // the lowest confidence accepted for the match 169*46c4c49dSIbrahim Kanouche maxConf float64 // the highest confidence we expect for this match 170*46c4c49dSIbrahim Kanouche} 171*46c4c49dSIbrahim Kanouche 172*46c4c49dSIbrahim Kanouchefunc TestClassify_MultipleMatch(t *testing.T) { 173*46c4c49dSIbrahim Kanouche c := New(DefaultConfidenceThreshold, FlattenWhitespace) 174*46c4c49dSIbrahim Kanouche c.AddValue("gettysburg", gettysburg) 175*46c4c49dSIbrahim Kanouche c.AddValue("declaration", declaration) 176*46c4c49dSIbrahim Kanouche c.AddValue("declaration-close", declaration[:len(declaration)/2-1]+"_"+declaration[len(declaration)/2:]) 177*46c4c49dSIbrahim Kanouche c.AddValue("loremipsum", loremipsum) 178*46c4c49dSIbrahim Kanouche 179*46c4c49dSIbrahim Kanouche cNormalize := New(DefaultConfidenceThreshold, FlattenWhitespace, removeNonWords) 180*46c4c49dSIbrahim Kanouche cNormalize.AddValue("gettysburg", gettysburg) 181*46c4c49dSIbrahim Kanouche 182*46c4c49dSIbrahim Kanouche tests := []struct { 183*46c4c49dSIbrahim Kanouche description string 184*46c4c49dSIbrahim Kanouche c *Classifier 185*46c4c49dSIbrahim Kanouche input string // input string to match 186*46c4c49dSIbrahim Kanouche want []result 187*46c4c49dSIbrahim Kanouche }{ 188*46c4c49dSIbrahim Kanouche { 189*46c4c49dSIbrahim Kanouche description: "Exact text match", 190*46c4c49dSIbrahim Kanouche c: c, 191*46c4c49dSIbrahim Kanouche input: fellowInTheGoatSkin + declaration + humourOfIreland, 192*46c4c49dSIbrahim Kanouche want: []result{ 193*46c4c49dSIbrahim Kanouche { 194*46c4c49dSIbrahim Kanouche key: "declaration", 195*46c4c49dSIbrahim Kanouche offset: 845, 196*46c4c49dSIbrahim Kanouche minConf: 1.0, 197*46c4c49dSIbrahim Kanouche maxConf: 1.0, 198*46c4c49dSIbrahim Kanouche }, 199*46c4c49dSIbrahim Kanouche }, 200*46c4c49dSIbrahim Kanouche }, 201*46c4c49dSIbrahim Kanouche { 202*46c4c49dSIbrahim Kanouche description: "Partial text match", 203*46c4c49dSIbrahim Kanouche c: c, 204*46c4c49dSIbrahim Kanouche input: fellowInTheGoatSkin + modifiedLorem + humourOfIreland, 205*46c4c49dSIbrahim Kanouche want: []result{ 206*46c4c49dSIbrahim Kanouche { 207*46c4c49dSIbrahim Kanouche key: "loremipsum", 208*46c4c49dSIbrahim Kanouche offset: 845, 209*46c4c49dSIbrahim Kanouche minConf: 0.90, 210*46c4c49dSIbrahim Kanouche maxConf: 0.91, 211*46c4c49dSIbrahim Kanouche }, 212*46c4c49dSIbrahim Kanouche }, 213*46c4c49dSIbrahim Kanouche }, 214*46c4c49dSIbrahim Kanouche { 215*46c4c49dSIbrahim Kanouche description: "Two partial matches", 216*46c4c49dSIbrahim Kanouche c: c, 217*46c4c49dSIbrahim Kanouche input: fellowInTheGoatSkin + modifiedLorem + humourOfIreland + modifiedGettysburg + oldCrowYoungCrow, 218*46c4c49dSIbrahim Kanouche want: []result{ 219*46c4c49dSIbrahim Kanouche { 220*46c4c49dSIbrahim Kanouche key: "loremipsum", 221*46c4c49dSIbrahim Kanouche offset: 845, 222*46c4c49dSIbrahim Kanouche minConf: 0.90, 223*46c4c49dSIbrahim Kanouche maxConf: 0.91, 224*46c4c49dSIbrahim Kanouche }, 225*46c4c49dSIbrahim Kanouche { 226*46c4c49dSIbrahim Kanouche key: "gettysburg", 227*46c4c49dSIbrahim Kanouche offset: 1750, 228*46c4c49dSIbrahim Kanouche minConf: 0.86, 229*46c4c49dSIbrahim Kanouche maxConf: 0.87, 230*46c4c49dSIbrahim Kanouche }, 231*46c4c49dSIbrahim Kanouche }, 232*46c4c49dSIbrahim Kanouche }, 233*46c4c49dSIbrahim Kanouche { 234*46c4c49dSIbrahim Kanouche description: "Partial matches of similar text", 235*46c4c49dSIbrahim Kanouche c: c, 236*46c4c49dSIbrahim Kanouche input: fellowInTheGoatSkin + modifiedLorem + humourOfIreland + lessModifiedLorem + oldCrowYoungCrow, 237*46c4c49dSIbrahim Kanouche want: []result{ 238*46c4c49dSIbrahim Kanouche { 239*46c4c49dSIbrahim Kanouche key: "loremipsum", 240*46c4c49dSIbrahim Kanouche offset: 1750, 241*46c4c49dSIbrahim Kanouche minConf: 0.98, 242*46c4c49dSIbrahim Kanouche maxConf: 0.99, 243*46c4c49dSIbrahim Kanouche }, 244*46c4c49dSIbrahim Kanouche { 245*46c4c49dSIbrahim Kanouche key: "loremipsum", 246*46c4c49dSIbrahim Kanouche offset: 845, 247*46c4c49dSIbrahim Kanouche minConf: 0.90, 248*46c4c49dSIbrahim Kanouche maxConf: 0.91, 249*46c4c49dSIbrahim Kanouche }, 250*46c4c49dSIbrahim Kanouche }, 251*46c4c49dSIbrahim Kanouche }, 252*46c4c49dSIbrahim Kanouche { 253*46c4c49dSIbrahim Kanouche description: "Nullifiable text", 254*46c4c49dSIbrahim Kanouche c: c, 255*46c4c49dSIbrahim Kanouche input: nullifiable, 256*46c4c49dSIbrahim Kanouche want: nil, 257*46c4c49dSIbrahim Kanouche }, 258*46c4c49dSIbrahim Kanouche { 259*46c4c49dSIbrahim Kanouche description: "No match", 260*46c4c49dSIbrahim Kanouche c: c, 261*46c4c49dSIbrahim Kanouche input: fellowInTheGoatSkin + humourOfIreland, 262*46c4c49dSIbrahim Kanouche want: nil, 263*46c4c49dSIbrahim Kanouche }, 264*46c4c49dSIbrahim Kanouche { 265*46c4c49dSIbrahim Kanouche description: "Exact text match, with extra word and non-word normalizer", 266*46c4c49dSIbrahim Kanouche c: cNormalize, 267*46c4c49dSIbrahim Kanouche input: fellowInTheGoatSkin + gettysburgExtraWord + humourOfIreland, 268*46c4c49dSIbrahim Kanouche want: []result{ 269*46c4c49dSIbrahim Kanouche { 270*46c4c49dSIbrahim Kanouche key: "gettysburg", 271*46c4c49dSIbrahim Kanouche offset: 825, 272*46c4c49dSIbrahim Kanouche minConf: 1.0, 273*46c4c49dSIbrahim Kanouche maxConf: 1.0, 274*46c4c49dSIbrahim Kanouche }, 275*46c4c49dSIbrahim Kanouche }, 276*46c4c49dSIbrahim Kanouche }, 277*46c4c49dSIbrahim Kanouche } 278*46c4c49dSIbrahim Kanouche 279*46c4c49dSIbrahim Kanouche for _, tt := range tests { 280*46c4c49dSIbrahim Kanouche matches := tt.c.MultipleMatch(tt.input) 281*46c4c49dSIbrahim Kanouche if len(matches) != len(tt.want) { 282*46c4c49dSIbrahim Kanouche t.Errorf("MultipleMatch(%q) not enough matches = %v, want %v", tt.description, len(matches), len(tt.want)) 283*46c4c49dSIbrahim Kanouche } 284*46c4c49dSIbrahim Kanouche 285*46c4c49dSIbrahim Kanouche for i := 0; i < len(matches); i++ { 286*46c4c49dSIbrahim Kanouche m := matches[i] 287*46c4c49dSIbrahim Kanouche w := tt.want[i] 288*46c4c49dSIbrahim Kanouche if got, want := m.Name, w.key; got != want { 289*46c4c49dSIbrahim Kanouche t.Errorf("MultipleMatch(%q) = %q, want %q", tt.description, got, want) 290*46c4c49dSIbrahim Kanouche } 291*46c4c49dSIbrahim Kanouche if got, want := m.Confidence, w.minConf; got < want { 292*46c4c49dSIbrahim Kanouche t.Errorf("MultipleMatch(%q) %q = %v, want minimum of %v", tt.description, w.key, got, want) 293*46c4c49dSIbrahim Kanouche } 294*46c4c49dSIbrahim Kanouche if got, want := m.Confidence, w.maxConf; got > want { 295*46c4c49dSIbrahim Kanouche t.Errorf("MultipleMatch(%q) %q = %v, want maximum of %v", tt.description, w.key, got, want) 296*46c4c49dSIbrahim Kanouche } 297*46c4c49dSIbrahim Kanouche if got, want := m.Offset, w.offset; got != want { 298*46c4c49dSIbrahim Kanouche t.Errorf("MultipleMatch(%q) %q = %v, want offset of %v", tt.description, w.key, got, want) 299*46c4c49dSIbrahim Kanouche } 300*46c4c49dSIbrahim Kanouche } 301*46c4c49dSIbrahim Kanouche } 302*46c4c49dSIbrahim Kanouche} 303*46c4c49dSIbrahim Kanouche 304*46c4c49dSIbrahim Kanouchefunc TestClassify_DiffRatio(t *testing.T) { 305*46c4c49dSIbrahim Kanouche tests := []struct { 306*46c4c49dSIbrahim Kanouche x, y string 307*46c4c49dSIbrahim Kanouche want float64 308*46c4c49dSIbrahim Kanouche }{ 309*46c4c49dSIbrahim Kanouche {"", "", 1.0}, 310*46c4c49dSIbrahim Kanouche {"a", "b", 1.0}, 311*46c4c49dSIbrahim Kanouche {"", "abc", 0}, 312*46c4c49dSIbrahim Kanouche {"ab", "c", 0.5}, 313*46c4c49dSIbrahim Kanouche {"a", "bc", 0.5}, 314*46c4c49dSIbrahim Kanouche {"a", "bcde", 0.25}, 315*46c4c49dSIbrahim Kanouche } 316*46c4c49dSIbrahim Kanouche 317*46c4c49dSIbrahim Kanouche for _, tt := range tests { 318*46c4c49dSIbrahim Kanouche if got, want := diffRatio(tt.x, tt.y), tt.want; got != want { 319*46c4c49dSIbrahim Kanouche t.Errorf("diffRatio(%q, %q) = %f, want %f", tt.x, tt.y, got, want) 320*46c4c49dSIbrahim Kanouche } 321*46c4c49dSIbrahim Kanouche } 322*46c4c49dSIbrahim Kanouche} 323*46c4c49dSIbrahim Kanouche 324*46c4c49dSIbrahim Kanouchefunc TestClassify_Matches(t *testing.T) { 325*46c4c49dSIbrahim Kanouche tests := []struct { 326*46c4c49dSIbrahim Kanouche description string 327*46c4c49dSIbrahim Kanouche matches Matches 328*46c4c49dSIbrahim Kanouche want Matches 329*46c4c49dSIbrahim Kanouche }{ 330*46c4c49dSIbrahim Kanouche { 331*46c4c49dSIbrahim Kanouche description: "Different names, same confidences, same offset", 332*46c4c49dSIbrahim Kanouche matches: Matches{ 333*46c4c49dSIbrahim Kanouche &Match{ 334*46c4c49dSIbrahim Kanouche Name: "b", 335*46c4c49dSIbrahim Kanouche Confidence: 0.42, 336*46c4c49dSIbrahim Kanouche Offset: 0, 337*46c4c49dSIbrahim Kanouche }, 338*46c4c49dSIbrahim Kanouche &Match{ 339*46c4c49dSIbrahim Kanouche Name: "a", 340*46c4c49dSIbrahim Kanouche Confidence: 0.42, 341*46c4c49dSIbrahim Kanouche Offset: 0, 342*46c4c49dSIbrahim Kanouche }, 343*46c4c49dSIbrahim Kanouche }, 344*46c4c49dSIbrahim Kanouche want: Matches{ 345*46c4c49dSIbrahim Kanouche &Match{ 346*46c4c49dSIbrahim Kanouche Name: "a", 347*46c4c49dSIbrahim Kanouche Confidence: 0.42, 348*46c4c49dSIbrahim Kanouche Offset: 0, 349*46c4c49dSIbrahim Kanouche }, 350*46c4c49dSIbrahim Kanouche &Match{ 351*46c4c49dSIbrahim Kanouche Name: "b", 352*46c4c49dSIbrahim Kanouche Confidence: 0.42, 353*46c4c49dSIbrahim Kanouche Offset: 0, 354*46c4c49dSIbrahim Kanouche }, 355*46c4c49dSIbrahim Kanouche }, 356*46c4c49dSIbrahim Kanouche }, 357*46c4c49dSIbrahim Kanouche { 358*46c4c49dSIbrahim Kanouche description: "Same names, different confidences, same offset", 359*46c4c49dSIbrahim Kanouche matches: Matches{ 360*46c4c49dSIbrahim Kanouche &Match{ 361*46c4c49dSIbrahim Kanouche Name: "b", 362*46c4c49dSIbrahim Kanouche Confidence: 0.42, 363*46c4c49dSIbrahim Kanouche Offset: 0, 364*46c4c49dSIbrahim Kanouche }, 365*46c4c49dSIbrahim Kanouche &Match{ 366*46c4c49dSIbrahim Kanouche Name: "b", 367*46c4c49dSIbrahim Kanouche Confidence: 0.90, 368*46c4c49dSIbrahim Kanouche Offset: 0, 369*46c4c49dSIbrahim Kanouche }, 370*46c4c49dSIbrahim Kanouche }, 371*46c4c49dSIbrahim Kanouche want: Matches{ 372*46c4c49dSIbrahim Kanouche &Match{ 373*46c4c49dSIbrahim Kanouche Name: "b", 374*46c4c49dSIbrahim Kanouche Confidence: 0.90, 375*46c4c49dSIbrahim Kanouche Offset: 0, 376*46c4c49dSIbrahim Kanouche }, 377*46c4c49dSIbrahim Kanouche &Match{ 378*46c4c49dSIbrahim Kanouche Name: "b", 379*46c4c49dSIbrahim Kanouche Confidence: 0.42, 380*46c4c49dSIbrahim Kanouche Offset: 0, 381*46c4c49dSIbrahim Kanouche }, 382*46c4c49dSIbrahim Kanouche }, 383*46c4c49dSIbrahim Kanouche }, 384*46c4c49dSIbrahim Kanouche { 385*46c4c49dSIbrahim Kanouche description: "Same names, same confidences, different offsets", 386*46c4c49dSIbrahim Kanouche matches: Matches{ 387*46c4c49dSIbrahim Kanouche &Match{ 388*46c4c49dSIbrahim Kanouche Name: "b", 389*46c4c49dSIbrahim Kanouche Confidence: 0.42, 390*46c4c49dSIbrahim Kanouche Offset: 42, 391*46c4c49dSIbrahim Kanouche }, 392*46c4c49dSIbrahim Kanouche &Match{ 393*46c4c49dSIbrahim Kanouche Name: "b", 394*46c4c49dSIbrahim Kanouche Confidence: 0.42, 395*46c4c49dSIbrahim Kanouche Offset: 0, 396*46c4c49dSIbrahim Kanouche }, 397*46c4c49dSIbrahim Kanouche }, 398*46c4c49dSIbrahim Kanouche want: Matches{ 399*46c4c49dSIbrahim Kanouche &Match{ 400*46c4c49dSIbrahim Kanouche Name: "b", 401*46c4c49dSIbrahim Kanouche Confidence: 0.42, 402*46c4c49dSIbrahim Kanouche Offset: 0, 403*46c4c49dSIbrahim Kanouche }, 404*46c4c49dSIbrahim Kanouche &Match{ 405*46c4c49dSIbrahim Kanouche Name: "b", 406*46c4c49dSIbrahim Kanouche Confidence: 0.42, 407*46c4c49dSIbrahim Kanouche Offset: 42, 408*46c4c49dSIbrahim Kanouche }, 409*46c4c49dSIbrahim Kanouche }, 410*46c4c49dSIbrahim Kanouche }, 411*46c4c49dSIbrahim Kanouche 412*46c4c49dSIbrahim Kanouche { 413*46c4c49dSIbrahim Kanouche description: "Different names, different confidences, same offset", 414*46c4c49dSIbrahim Kanouche matches: Matches{ 415*46c4c49dSIbrahim Kanouche &Match{ 416*46c4c49dSIbrahim Kanouche Name: "b", 417*46c4c49dSIbrahim Kanouche Confidence: 0.42, 418*46c4c49dSIbrahim Kanouche Offset: 0, 419*46c4c49dSIbrahim Kanouche }, 420*46c4c49dSIbrahim Kanouche &Match{ 421*46c4c49dSIbrahim Kanouche Name: "a", 422*46c4c49dSIbrahim Kanouche Confidence: 0.90, 423*46c4c49dSIbrahim Kanouche Offset: 0, 424*46c4c49dSIbrahim Kanouche }, 425*46c4c49dSIbrahim Kanouche }, 426*46c4c49dSIbrahim Kanouche want: Matches{ 427*46c4c49dSIbrahim Kanouche &Match{ 428*46c4c49dSIbrahim Kanouche Name: "a", 429*46c4c49dSIbrahim Kanouche Confidence: 0.90, 430*46c4c49dSIbrahim Kanouche Offset: 0, 431*46c4c49dSIbrahim Kanouche }, 432*46c4c49dSIbrahim Kanouche &Match{ 433*46c4c49dSIbrahim Kanouche Name: "b", 434*46c4c49dSIbrahim Kanouche Confidence: 0.42, 435*46c4c49dSIbrahim Kanouche Offset: 0, 436*46c4c49dSIbrahim Kanouche }, 437*46c4c49dSIbrahim Kanouche }, 438*46c4c49dSIbrahim Kanouche }, 439*46c4c49dSIbrahim Kanouche { 440*46c4c49dSIbrahim Kanouche description: "Different names, same confidences, different offset", 441*46c4c49dSIbrahim Kanouche matches: Matches{ 442*46c4c49dSIbrahim Kanouche &Match{ 443*46c4c49dSIbrahim Kanouche Name: "b", 444*46c4c49dSIbrahim Kanouche Confidence: 0.42, 445*46c4c49dSIbrahim Kanouche Offset: 37, 446*46c4c49dSIbrahim Kanouche }, 447*46c4c49dSIbrahim Kanouche &Match{ 448*46c4c49dSIbrahim Kanouche Name: "a", 449*46c4c49dSIbrahim Kanouche Confidence: 0.42, 450*46c4c49dSIbrahim Kanouche Offset: 0, 451*46c4c49dSIbrahim Kanouche }, 452*46c4c49dSIbrahim Kanouche }, 453*46c4c49dSIbrahim Kanouche want: Matches{ 454*46c4c49dSIbrahim Kanouche &Match{ 455*46c4c49dSIbrahim Kanouche Name: "a", 456*46c4c49dSIbrahim Kanouche Confidence: 0.42, 457*46c4c49dSIbrahim Kanouche Offset: 0, 458*46c4c49dSIbrahim Kanouche }, 459*46c4c49dSIbrahim Kanouche &Match{ 460*46c4c49dSIbrahim Kanouche Name: "b", 461*46c4c49dSIbrahim Kanouche Confidence: 0.42, 462*46c4c49dSIbrahim Kanouche Offset: 37, 463*46c4c49dSIbrahim Kanouche }, 464*46c4c49dSIbrahim Kanouche }, 465*46c4c49dSIbrahim Kanouche }, 466*46c4c49dSIbrahim Kanouche { 467*46c4c49dSIbrahim Kanouche description: "Different names, different confidences, different offset", 468*46c4c49dSIbrahim Kanouche matches: Matches{ 469*46c4c49dSIbrahim Kanouche &Match{ 470*46c4c49dSIbrahim Kanouche Name: "a", 471*46c4c49dSIbrahim Kanouche Confidence: 0.42, 472*46c4c49dSIbrahim Kanouche Offset: 0, 473*46c4c49dSIbrahim Kanouche }, 474*46c4c49dSIbrahim Kanouche &Match{ 475*46c4c49dSIbrahim Kanouche Name: "b", 476*46c4c49dSIbrahim Kanouche Confidence: 0.90, 477*46c4c49dSIbrahim Kanouche Offset: 37, 478*46c4c49dSIbrahim Kanouche }, 479*46c4c49dSIbrahim Kanouche }, 480*46c4c49dSIbrahim Kanouche want: Matches{ 481*46c4c49dSIbrahim Kanouche &Match{ 482*46c4c49dSIbrahim Kanouche Name: "b", 483*46c4c49dSIbrahim Kanouche Confidence: 0.90, 484*46c4c49dSIbrahim Kanouche Offset: 37, 485*46c4c49dSIbrahim Kanouche }, 486*46c4c49dSIbrahim Kanouche &Match{ 487*46c4c49dSIbrahim Kanouche Name: "a", 488*46c4c49dSIbrahim Kanouche Confidence: 0.42, 489*46c4c49dSIbrahim Kanouche Offset: 0, 490*46c4c49dSIbrahim Kanouche }, 491*46c4c49dSIbrahim Kanouche }, 492*46c4c49dSIbrahim Kanouche }, 493*46c4c49dSIbrahim Kanouche } 494*46c4c49dSIbrahim Kanouche 495*46c4c49dSIbrahim Kanouche for _, tt := range tests { 496*46c4c49dSIbrahim Kanouche sort.Sort(tt.matches) 497*46c4c49dSIbrahim Kanouche if !reflect.DeepEqual(tt.matches, tt.want) { 498*46c4c49dSIbrahim Kanouche for _, x := range tt.matches { 499*46c4c49dSIbrahim Kanouche t.Errorf("got: %v", x) 500*46c4c49dSIbrahim Kanouche } 501*46c4c49dSIbrahim Kanouche for _, x := range tt.want { 502*46c4c49dSIbrahim Kanouche t.Errorf("want: %v", x) 503*46c4c49dSIbrahim Kanouche } 504*46c4c49dSIbrahim Kanouche t.Errorf("MatchesSort(%q) = %v, want %v", tt.description, tt.matches, tt.want) 505*46c4c49dSIbrahim Kanouche } 506*46c4c49dSIbrahim Kanouche } 507*46c4c49dSIbrahim Kanouche} 508*46c4c49dSIbrahim Kanouche 509*46c4c49dSIbrahim Kanouchefunc TestClassify_DiffRangeEnd(t *testing.T) { 510*46c4c49dSIbrahim Kanouche dmp := diffmatchpatch.New() 511*46c4c49dSIbrahim Kanouche tests := []struct { 512*46c4c49dSIbrahim Kanouche description string 513*46c4c49dSIbrahim Kanouche unknown string 514*46c4c49dSIbrahim Kanouche known string 515*46c4c49dSIbrahim Kanouche end int 516*46c4c49dSIbrahim Kanouche }{ 517*46c4c49dSIbrahim Kanouche { 518*46c4c49dSIbrahim Kanouche description: "identical", 519*46c4c49dSIbrahim Kanouche unknown: declaration, 520*46c4c49dSIbrahim Kanouche known: declaration, 521*46c4c49dSIbrahim Kanouche end: 1, 522*46c4c49dSIbrahim Kanouche }, 523*46c4c49dSIbrahim Kanouche { 524*46c4c49dSIbrahim Kanouche description: "lorem", 525*46c4c49dSIbrahim Kanouche unknown: lessModifiedLorem, 526*46c4c49dSIbrahim Kanouche known: loremipsum, 527*46c4c49dSIbrahim Kanouche end: 3, 528*46c4c49dSIbrahim Kanouche }, 529*46c4c49dSIbrahim Kanouche { 530*46c4c49dSIbrahim Kanouche description: "gettysburg", 531*46c4c49dSIbrahim Kanouche unknown: modifiedGettysburg, 532*46c4c49dSIbrahim Kanouche known: gettysburg, 533*46c4c49dSIbrahim Kanouche end: 19, 534*46c4c49dSIbrahim Kanouche }, 535*46c4c49dSIbrahim Kanouche } 536*46c4c49dSIbrahim Kanouche 537*46c4c49dSIbrahim Kanouche for _, tt := range tests { 538*46c4c49dSIbrahim Kanouche diffs := dmp.DiffMain(tt.unknown, tt.known, true) 539*46c4c49dSIbrahim Kanouche if e := diffRangeEnd(tt.known, diffs); e != tt.end { 540*46c4c49dSIbrahim Kanouche t.Errorf("DiffRangeEnd(%q) = end %v, want %v", tt.description, e, tt.end) 541*46c4c49dSIbrahim Kanouche } 542*46c4c49dSIbrahim Kanouche } 543*46c4c49dSIbrahim Kanouche} 544*46c4c49dSIbrahim Kanouche 545*46c4c49dSIbrahim Kanouchefunc BenchmarkClassifier(b *testing.B) { 546*46c4c49dSIbrahim Kanouche c := New(DefaultConfidenceThreshold, FlattenWhitespace) 547*46c4c49dSIbrahim Kanouche c.AddValue("gettysburg", gettysburg) 548*46c4c49dSIbrahim Kanouche c.AddValue("declaration", declaration) 549*46c4c49dSIbrahim Kanouche c.AddValue("loremipsum", loremipsum) 550*46c4c49dSIbrahim Kanouche 551*46c4c49dSIbrahim Kanouche b.ResetTimer() 552*46c4c49dSIbrahim Kanouche for i := 0; i < b.N; i++ { 553*46c4c49dSIbrahim Kanouche c.NearestMatch(modifiedLorem) 554*46c4c49dSIbrahim Kanouche } 555*46c4c49dSIbrahim Kanouche} 556