1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package syntax
6
7import "testing"
8
9var simplifyTests = []struct {
10	Regexp string
11	Simple string
12}{
13	// Already-simple constructs
14	{`a`, `a`},
15	{`ab`, `ab`},
16	{`a|b`, `[ab]`},
17	{`ab|cd`, `ab|cd`},
18	{`(ab)*`, `(ab)*`},
19	{`(ab)+`, `(ab)+`},
20	{`(ab)?`, `(ab)?`},
21	{`.`, `(?s:.)`},
22	{`^`, `(?m:^)`},
23	{`$`, `(?m:$)`},
24	{`[ac]`, `[ac]`},
25	{`[^ac]`, `[^ac]`},
26
27	// Posix character classes
28	{`[[:alnum:]]`, `[0-9A-Za-z]`},
29	{`[[:alpha:]]`, `[A-Za-z]`},
30	{`[[:blank:]]`, `[\t ]`},
31	{`[[:cntrl:]]`, `[\x00-\x1f\x7f]`},
32	{`[[:digit:]]`, `[0-9]`},
33	{`[[:graph:]]`, `[!-~]`},
34	{`[[:lower:]]`, `[a-z]`},
35	{`[[:print:]]`, `[ -~]`},
36	{`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"},
37	{`[[:space:]]`, `[\t-\r ]`},
38	{`[[:upper:]]`, `[A-Z]`},
39	{`[[:xdigit:]]`, `[0-9A-Fa-f]`},
40
41	// Perl character classes
42	{`\d`, `[0-9]`},
43	{`\s`, `[\t\n\f\r ]`},
44	{`\w`, `[0-9A-Z_a-z]`},
45	{`\D`, `[^0-9]`},
46	{`\S`, `[^\t\n\f\r ]`},
47	{`\W`, `[^0-9A-Z_a-z]`},
48	{`[\d]`, `[0-9]`},
49	{`[\s]`, `[\t\n\f\r ]`},
50	{`[\w]`, `[0-9A-Z_a-z]`},
51	{`[\D]`, `[^0-9]`},
52	{`[\S]`, `[^\t\n\f\r ]`},
53	{`[\W]`, `[^0-9A-Z_a-z]`},
54
55	// Posix repetitions
56	{`a{1}`, `a`},
57	{`a{2}`, `aa`},
58	{`a{5}`, `aaaaa`},
59	{`a{0,1}`, `a?`},
60	// The next three are illegible because Simplify inserts (?:)
61	// parens instead of () parens to avoid creating extra
62	// captured subexpressions. The comments show a version with fewer parens.
63	{`(a){0,2}`, `(?:(a)(a)?)?`},                       //       (aa?)?
64	{`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`},       //   (a(a(aa?)?)?)?
65	{`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)?
66	{`a{0,2}`, `(?:aa?)?`},                             //       (aa?)?
67	{`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`},                 //   (a(a(aa?)?)?)?
68	{`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`},               // aa(a(a(aa?)?)?)?
69	{`a{0,}`, `a*`},
70	{`a{1,}`, `a+`},
71	{`a{2,}`, `aa+`},
72	{`a{5,}`, `aaaaa+`},
73
74	// Test that operators simplify their arguments.
75	{`(?:a{1,}){1,}`, `a+`},
76	{`(a{1,}b{1,})`, `(a+b+)`},
77	{`a{1,}|b{1,}`, `a+|b+`},
78	{`(?:a{1,})*`, `(?:a+)*`},
79	{`(?:a{1,})+`, `a+`},
80	{`(?:a{1,})?`, `(?:a+)?`},
81	{``, `(?:)`},
82	{`a{0}`, `(?:)`},
83
84	// Character class simplification
85	{`[ab]`, `[ab]`},
86	{`[abc]`, `[a-c]`},
87	{`[a-za-za-z]`, `[a-z]`},
88	{`[A-Za-zA-Za-z]`, `[A-Za-z]`},
89	{`[ABCDEFGH]`, `[A-H]`},
90	{`[AB-CD-EF-GH]`, `[A-H]`},
91	{`[W-ZP-XE-R]`, `[E-Z]`},
92	{`[a-ee-gg-m]`, `[a-m]`},
93	{`[a-ea-ha-m]`, `[a-m]`},
94	{`[a-ma-ha-e]`, `[a-m]`},
95	{`[a-zA-Z0-9 -~]`, `[ -~]`},
96
97	// Empty character classes
98	{`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`},
99
100	// Full character classes
101	{`[[:cntrl:][:^cntrl:]]`, `(?s:.)`},
102
103	// Unicode case folding.
104	{`(?i)A`, `(?i:A)`},
105	{`(?i)a`, `(?i:A)`},
106	{`(?i)[A]`, `(?i:A)`},
107	{`(?i)[a]`, `(?i:A)`},
108	{`(?i)K`, `(?i:K)`},
109	{`(?i)k`, `(?i:K)`},
110	{`(?i)\x{212a}`, "(?i:K)"},
111	{`(?i)[K]`, "[Kk\u212A]"},
112	{`(?i)[k]`, "[Kk\u212A]"},
113	{`(?i)[\x{212a}]`, "[Kk\u212A]"},
114	{`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"},
115	{`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"},
116	{`(?i)[\x00-\x{10FFFF}]`, `(?s:.)`},
117
118	// Empty string as a regular expression.
119	// The empty string must be preserved inside parens in order
120	// to make submatches work right, so these tests are less
121	// interesting than they might otherwise be. String inserts
122	// explicit (?:) in place of non-parenthesized empty strings,
123	// to make them easier to spot for other parsers.
124	{`(a|b|c|)`, `([a-c]|(?:))`},
125	{`(a|b|)`, `([ab]|(?:))`},
126	{`(|)`, `()`},
127	{`a()`, `a()`},
128	{`(()|())`, `(()|())`},
129	{`(a|)`, `(a|(?:))`},
130	{`ab()cd()`, `ab()cd()`},
131	{`()`, `()`},
132	{`()*`, `()*`},
133	{`()+`, `()+`},
134	{`()?`, `()?`},
135	{`(){0}`, `(?:)`},
136	{`(){1}`, `()`},
137	{`(){1,}`, `()+`},
138	{`(){0,2}`, `(?:()()?)?`},
139}
140
141func TestSimplify(t *testing.T) {
142	for _, tt := range simplifyTests {
143		re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine)
144		if err != nil {
145			t.Errorf("Parse(%#q) = error %v", tt.Regexp, err)
146			continue
147		}
148		s := re.Simplify().String()
149		if s != tt.Simple {
150			t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple)
151		}
152	}
153}
154