blob: 6524768d1fc3dc7e8483acb914672a3d322609c9 [file] [log] [blame]
Greg Steindb84a102009-01-31 07:40:26 +00001#
2# simple scanner for Thrift. emits tokens.
3#
4
5__all__ = ['Scanner', 'SimpleScanner', 'Token', 'TYPE_INT',
6 'ExpectedError', 'ExpectedType', 'UnexpectedEOF',
7 'UnknownToken', 'IncorrectSyntax',
8 ]
9
10import re
11
12re_int = re.compile('[+-]?[0-9]+$') # special handling
13re_hex = re.compile('0x[0-9A-Fa-f]+')
14re_dub = re.compile(r'[+-]?[0-9]*(\.[0-9]+)?([eE][+-]?[0-9]+)?')
15
16re_white = re.compile('[ \t\r\n]+')
17re_silly = re.compile(r'/\*+\*/')
18re_multi = re.compile(r'/\*[^*]/*([^*/]|[^*]/|\*[^/])*\*+/')
19re_comment = re.compile('//[^\n]*')
20re_unix = re.compile('#[^\n]*')
21
22re_doc = re.compile(r'/\*\*([^*/]|[^*]/|\*[^/])*\*+/')
23
24re_ident = re.compile('[a-zA-Z_][\.a-zA-Z_0-9]*')
25re_symbol = re.compile(r'[:;,{}()=<>\[\]]')
26re_dliteral = re.compile('"[^"]*"')
27re_sliteral = re.compile("'[^']*'")
28re_st_ident = re.compile('[a-zA-Z-][.a-zA-Z_0-9-]*')
29
30skip_re = [re_white, re_silly, re_multi, re_comment, re_unix]
31
32types = [
33 ('HEX', re_hex), # keep before re_dub
34 ('DUB', re_dub),
35 ('DOC', re_doc),
36 ('ID', re_ident),
37 ('SYM', re_symbol),
38 ('LIT', re_dliteral),
39 ('LIT', re_sliteral),
40 ('STID', re_st_ident),
41 ]
42
43for key, pattern in types:
44 globals()['TYPE_' + key] = key
45 __all__.append('TYPE_' + key)
46TYPE_INT = 'INT'
47
48
49class SimpleScanner(object):
50
51 def __init__(self, contents):
52 self.contents = contents
53 self.lineno = 1
54
55 def get(self):
56 """Get the next token.
57
58 Consumes and returns the next token. Note that leading whitespace is
59 skipped.
60
61 Returns None if there are no more tokens.
62 """
63 self._skip()
64
65 if not self.contents:
66 return None
67
68 for ttype, pattern in types:
69 m = pattern.match(self.contents)
70 if m:
71 if m.end() == 0:
72 continue
73 tvalue = m.group()
74 if pattern is re_dub and re_int.match(tvalue):
75 ttype = TYPE_INT
76 elif ttype == TYPE_LIT:
77 # strip quotes
78 tvalue = tvalue[1:-1]
79 ### fold TYPE_HEX into TYPE_INT? convert INT/DUB away from string?
80 token = Token(ttype, tvalue)
81 self._chomp(m.end())
82 return token
83
84 raise UnknownToken(self.lineno)
85
86 def _skip(self):
87 "Skip over leading whitespace."
88
89 while True:
90 for pattern in skip_re:
91 m = pattern.match(self.contents)
92 if m:
93 self._chomp(m.end())
94 break
95 else:
96 # nothing matched. all done.
97 return
98
99 def _chomp(self, amt):
100 "Chomp AMT bytes off the front of the contents. Count newlines."
101 self.lineno += self.contents[:amt].count('\n')
102 self.contents = self.contents[amt:]
103
104
105class Scanner(SimpleScanner):
106 def __init__(self, contents):
107 SimpleScanner.__init__(self, contents)
108
109 self.doc = None
110 self.pending = None
111
112 def get(self, eof_allowed=True):
113 if self.pending is not None:
114 token = self.pending
115 self.pending = None
116 return token
117
118 self.doc = None
119 while True:
120 t = SimpleScanner.get(self)
121 if t is None:
122 if eof_allowed:
123 return None
124 raise UnexpectedEOF(self.lineno)
125 if t.ttype != TYPE_DOC:
126 #print 'TOKEN:', t
127 return t
128 self.doc = t
129
130 def get_type(self, ttype):
131 "Get the next token, ensuring it is of the given type."
132 t = self.get(eof_allowed=False)
133 if t.ttype != ttype:
134 raise ExpectedType(ttype, t.ttype, self.lineno)
135 return t
136
137 def value_of(self, ttype):
138 "Get the next token's value, ensuring it is of the given type."
139 return self.get_type(ttype).tvalue
140
141 def pushback(self, token):
142 "Push a token back into the scanner; it was unused."
143 assert token is not None
144 assert self.pending is None
145 self.pending = token
146
147 def eat_commasemi(self):
148 "Eat a comma or a semicolon, if present."
149 t = self.get()
150 if t != SYM_COMMA and t != SYM_SEMI:
151 self.pushback(t)
152
153 def eat_expected(self, token):
154 "Eat the expected token, or raise a ExpectedError."
155 t = self.get()
156 if t != token:
157 raise ExpectedError(token, t, self.lineno)
158
159
160class Token(object):
161 def __init__(self, ttype, tvalue=None):
162 self.ttype = ttype
163 self.tvalue = tvalue
164
165 def __str__(self):
166 if self.tvalue is None:
167 return 'T(%s)' % self.ttype
168 return 'T(%s, "%s")' % (self.ttype, self.tvalue)
169
170 def __eq__(self, other):
171 return self.ttype == other.ttype and self.tvalue == other.tvalue
172
173 def __ne__(self, other):
174 return self.ttype != other.ttype or self.tvalue != other.tvalue
175
176 def __hash__(self):
177 return hash((self.ttype, self.tvalue))
178
179
180for ident in ['namespace',
181 'cpp_namespace',
182 'cpp_include',
183 'cpp_type',
184 'java_package',
185 'cocoa_prefix',
186 'csharp_namespace',
187 'php_namespace',
188 'py_module',
189 'perl_package',
190 'ruby_namespace',
191 'smalltalk_category',
192 'smalltalk_prefix',
193 'xsd_all',
194 'xsd_optional',
195 'xsd_nillable',
196 'xsd_namespace',
197 'xsd_attrs',
198 'include',
199 'void',
200 'bool',
201 'byte',
202 'i16',
203 'i32',
204 'i64',
205 'double',
206 'string',
207 'binary',
208 'slist',
209 'senum',
210 'map',
211 'list',
212 'set',
213 'async',
214 'typedef',
215 'struct',
216 'exception',
217 'extends',
218 'throws',
219 'service',
220 'enum',
221 'const',
222 'required',
223 'optional',
224 ]:
225 name = 'ID_' + ident.upper()
226 globals()[name] = Token(TYPE_ID, ident)
227 __all__.append(name)
228
229
230for name, sym in [('COLON', ':'),
231 ('SEMI', ';'),
232 ('COMMA', ','),
233 ('LBRACE', '{'),
234 ('RBRACE', '}'),
235 ('LPAREN', '('),
236 ('RPAREN', ')'),
237 ('LBRKT', '['),
238 ('RBRKT', ']'),
239 ('EQ', '='),
240 ('LT', '<'),
241 ('GT', '>'),
242 ]:
243 globals()['SYM_' + name] = Token(TYPE_SYM, sym)
244 __all__.append('SYM_' + name)
245
246
247class ExpectedError(Exception):
248 "Expected token was not present."
249
250class ExpectedType(Exception):
251 "Expected token type was not present."
252
253class UnexpectedEOF(Exception):
254 "EOF reached unexpectedly."
255
256class UnknownToken(Exception):
257 "Unknown token encountered."
258
259class IncorrectSyntax(Exception):
260 "Incorrect syntax encountered."
261
262
263if __name__ == '__main__':
264 import sys
265
266 s = Scanner(open(sys.argv[1]).read())
267 while True:
268 token = s.get()
269 if token is None:
270 break
271 print token