xref: /aosp_15_r20/external/antlr/runtime/Python/tests/t012lexerXML.py (revision 16467b971bd3e2009fad32dd79016f2c7e421deb)
1*16467b97STreehugger Robotimport antlr3
2*16467b97STreehugger Robotimport testbase
3*16467b97STreehugger Robotimport unittest
4*16467b97STreehugger Robotimport os
5*16467b97STreehugger Robotimport sys
6*16467b97STreehugger Robotfrom cStringIO import StringIO
7*16467b97STreehugger Robotimport difflib
8*16467b97STreehugger Robotimport textwrap
9*16467b97STreehugger Robot
class t012lexerXML(testbase.ANTLRTest):
    """Lexer tests for the ``t012lexerXMLLexer.g`` grammar.

    ``testValid`` lexes the sibling ``.input`` file and compares what the
    lexer wrote to its ``outbuf`` against the sibling ``.output`` file.
    The ``testMalformedInput*`` tests feed small broken XML documents and
    check the type and location of the exception raised by the lexer.

    NOTE: uses ``with`` and ``except ... as ...``, i.e. Python 2.6 or
    newer is required.
    """

    def setUp(self):
        """Have the test base class prepare the grammar under test."""
        self.compileGrammar('t012lexerXMLLexer.g')


    def lexerClass(self, base):
        """Return a subclass of *base* that never recovers from errors.

        The returned class drops error messages and re-raises every
        reported error, so the tests see the original exception object.
        Presumably invoked by the test base class when ``getLexer`` builds
        the lexer -- confirm against ``testbase``.
        """
        class TLexer(base):
            def emitErrorMessage(self, msg):
                # report errors to /dev/null
                pass

            def reportError(self, re):
                # no error recovery yet, just crash!
                raise re

        return TLexer


    def _readUtf8(self, path):
        """Return the content of the file at *path* decoded from UTF-8.

        The file is closed deterministically instead of being left to the
        garbage collector.
        """
        with open(path) as fileObj:
            raw = fileObj.read()
        return unicode(raw, 'utf-8')


    def _drainLexer(self, lexer, eofType):
        """Pull tokens from *lexer* until one of type *eofType* appears.

        Any exception raised by ``lexer.nextToken()`` propagates to the
        caller.
        """
        while True:
            token = lexer.nextToken()
            if token.type == eofType:
                break


    def _assertLexerError(self, text, excClass, unexpectedType, line,
                          charPositionInLine):
        """Lex *text* and require that it fails with *excClass*.

        The caught exception must report the given *unexpectedType*,
        *line* and *charPositionInLine*.  The test fails if the lexer
        reaches EOF without raising; exceptions of any other class
        propagate unchanged.
        """
        lexer = self.getLexer(antlr3.StringStream(text))

        try:
            self._drainLexer(lexer, antlr3.EOF)
        except excClass as exc:
            # assertEqual (unlike a bare assert) still runs under
            # ``python -O`` and reports both values on mismatch.
            self.assertEqual(exc.unexpectedType, unexpectedType)
            self.assertEqual(exc.charPositionInLine, charPositionInLine)
            self.assertEqual(exc.line, line)
        else:
            self.fail(
                'lexer reached EOF without raising %s' % excClass.__name__
                )


    def testValid(self):
        """Lex the ``.input`` file and compare with the ``.output`` file.

        On mismatch the test fails with a line diff in which non-ASCII
        characters are backslash-escaped.
        """
        basePath = os.path.splitext(__file__)[0]

        stream = antlr3.StringStream(self._readUtf8(basePath + '.input'))
        lexer = self.getLexer(stream)

        self._drainLexer(lexer, self.lexerModule.EOF)

        output = unicode(lexer.outbuf.getvalue(), 'utf-8')
        testOutput = self._readUtf8(basePath + '.output')

        if output != testOutput:
            differ = difflib.Differ()
            delta = differ.compare(
                output.splitlines(True), testOutput.splitlines(True)
                )
            self.fail(
                ''.join(
                    [line.encode('ascii', 'backslashreplace')
                     for line in delta]
                    )
                )


    def testMalformedInput1(self):
        """A lone ``d`` inside the start tag must fail at the ``>``."""
        document = textwrap.dedent("""\
        <?xml version='1.0'?>
        <document d>
        </document>
        """)

        self._assertLexerError(
            document, antlr3.NoViableAltException,
            unexpectedType='>', line=2, charPositionInLine=11
            )


    def testMalformedInput2(self):
        """``<?tml`` instead of ``<?xml`` must fail at the ``t``."""
        document = textwrap.dedent("""\
        <?tml version='1.0'?>
        <document>
        </document>
        """)

        self._assertLexerError(
            document, antlr3.MismatchedSetException,
            unexpectedType='t', line=1, charPositionInLine=2
            )


    def testMalformedInput3(self):
        """A space inside the tag name must fail at the ``a`` of ``attr``."""
        document = textwrap.dedent("""\
        <?xml version='1.0'?>
        <docu ment attr="foo">
        </document>
        """)

        self._assertLexerError(
            document, antlr3.NoViableAltException,
            unexpectedType='a', line=2, charPositionInLine=11
            )
123*16467b97STreehugger Robot
124*16467b97STreehugger Robot
125*16467b97STreehugger Robot
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
128*16467b97STreehugger Robot
129*16467b97STreehugger Robot
130*16467b97STreehugger Robot## # run an infinite loop with randomly mangled input
131*16467b97STreehugger Robot## while True:
132*16467b97STreehugger Robot##     print "ping"
133*16467b97STreehugger Robot
134*16467b97STreehugger Robot##     input = """\
135*16467b97STreehugger Robot## <?xml version='1.0'?>
136*16467b97STreehugger Robot## <!DOCTYPE component [
137*16467b97STreehugger Robot## <!ELEMENT component (PCDATA|sub)*>
138*16467b97STreehugger Robot## <!ATTLIST component
139*16467b97STreehugger Robot##           attr CDATA #IMPLIED
140*16467b97STreehugger Robot##           attr2 CDATA #IMPLIED
141*16467b97STreehugger Robot## >
142*16467b97STreehugger Robot## <!ELMENT sub EMPTY>
143*16467b97STreehugger Robot
144*16467b97STreehugger Robot## ]>
145*16467b97STreehugger Robot## <component attr="val'ue" attr2='val"ue'>
146*16467b97STreehugger Robot## <!-- This is a comment -->
147*16467b97STreehugger Robot## Text
148*16467b97STreehugger Robot## <![CDATA[huhu]]>
149*16467b97STreehugger Robot## &amp;
150*16467b97STreehugger Robot## &lt;
151*16467b97STreehugger Robot## <?xtal cursor='11'?>
152*16467b97STreehugger Robot## <sub/>
153*16467b97STreehugger Robot## <sub></sub>
154*16467b97STreehugger Robot## </component>
155*16467b97STreehugger Robot## """
156*16467b97STreehugger Robot
157*16467b97STreehugger Robot##     import random
158*16467b97STreehugger Robot##     input = list(input) # make it mutable
159*16467b97STreehugger Robot##     for _ in range(3):
160*16467b97STreehugger Robot##         p1 = random.randrange(len(input))
161*16467b97STreehugger Robot##         p2 = random.randrange(len(input))
162*16467b97STreehugger Robot
163*16467b97STreehugger Robot##         c1 = input[p1]
164*16467b97STreehugger Robot##         input[p1] = input[p2]
165*16467b97STreehugger Robot##         input[p2] = c1
166*16467b97STreehugger Robot##     input = ''.join(input) # back to string
167*16467b97STreehugger Robot
168*16467b97STreehugger Robot##     stream = antlr3.StringStream(input)
169*16467b97STreehugger Robot##     lexer = Lexer(stream)
170*16467b97STreehugger Robot
171*16467b97STreehugger Robot##     try:
172*16467b97STreehugger Robot##         while True:
173*16467b97STreehugger Robot##             token = lexer.nextToken()
174*16467b97STreehugger Robot##             if token.type == EOF:
175*16467b97STreehugger Robot##                 break
176*16467b97STreehugger Robot
177*16467b97STreehugger Robot##     except antlr3.RecognitionException, exc:
178*16467b97STreehugger Robot##         print exc
179*16467b97STreehugger Robot##         for l in input.splitlines()[0:exc.line]:
180*16467b97STreehugger Robot##             print l
181*16467b97STreehugger Robot##         print ' '*exc.charPositionInLine + '^'
182*16467b97STreehugger Robot
183*16467b97STreehugger Robot##     except BaseException, exc:
184*16467b97STreehugger Robot##         print '\n'.join(['%02d: %s' % (idx+1, l) for idx, l in enumerate(input.splitlines())])
185*16467b97STreehugger Robot##         print "%s at %d:%d" % (exc, stream.line, stream.charPositionInLine)
186*16467b97STreehugger Robot##         print
187*16467b97STreehugger Robot
188*16467b97STreehugger Robot##         raise
189*16467b97STreehugger Robot
190