blob: 6524768d1fc3dc7e8483acb914672a3d322609c9 [file] [log] [blame]
#
# simple scanner for Thrift. emits tokens.
#
# Names exported by ``from <module> import *``. The TYPE_*, ID_* and SYM_*
# constants are appended to this list by the loops further down the module.
__all__ = [
    'Scanner',
    'SimpleScanner',
    'Token',
    'TYPE_INT',
    'ExpectedError',
    'ExpectedType',
    'UnexpectedEOF',
    'UnknownToken',
    'IncorrectSyntax',
]
import re
# Token patterns. The scanner applies each of them with .match(), i.e.
# anchored at the start of the remaining input.

# special handling: not part of the token table; applied to the text of a
# DUB match to decide whether it is really an integer.
re_int = re.compile('[+-]?[0-9]+$')
re_hex = re.compile('0x[0-9A-Fa-f]+')
# Every component is optional, so this pattern can match the empty string;
# the scanner ignores zero-length matches.
re_dub = re.compile(r'[+-]?[0-9]*(\.[0-9]+)?([eE][+-]?[0-9]+)?')
re_white = re.compile('[ \t\r\n]+')
re_silly = re.compile(r'/\*+\*/')
re_multi = re.compile(r'/\*[^*]/*([^*/]|[^*]/|\*[^/])*\*+/')
re_comment = re.compile('//[^\n]*')
re_unix = re.compile('#[^\n]*')
re_doc = re.compile(r'/\*\*([^*/]|[^*]/|\*[^/])*\*+/')
# Raw string: '\.' is not a valid escape sequence in a plain string literal
# and triggers a warning on newer Python versions. The pattern is unchanged.
re_ident = re.compile(r'[a-zA-Z_][\.a-zA-Z_0-9]*')
re_symbol = re.compile(r'[:;,{}()=<>\[\]]')
re_dliteral = re.compile('"[^"]*"')
re_sliteral = re.compile("'[^']*'")
re_st_ident = re.compile('[a-zA-Z-][.a-zA-Z_0-9-]*')

# Patterns whose matches are discarded before a token is read.
skip_re = [re_white, re_silly, re_multi, re_comment, re_unix]
# Token table: (type name, pattern) pairs, tried in this order by the
# scanner. 'LIT' is listed twice (double- and single-quoted forms).
types = [
    # keep before re_dub: re_dub would otherwise match the leading '0'
    # of a hex literal.
    ('HEX', re_hex),
    ('DUB', re_dub),
    ('DOC', re_doc),
    ('ID', re_ident),
    ('SYM', re_symbol),
    ('LIT', re_dliteral),
    ('LIT', re_sliteral),
    ('STID', re_st_ident),
]

# Publish a module-level TYPE_<name> constant (whose value is the type name
# itself) for every table entry, and export it.
for key, pattern in types:
    globals()['TYPE_%s' % key] = key
    __all__.append('TYPE_%s' % key)

# INT has no table entry of its own; the scanner derives it from DUB matches.
TYPE_INT = 'INT'
class SimpleScanner(object):
    """Tokenizer over a string held in memory.

    ``contents`` is the text still to be scanned; ``lineno`` is the
    1-based line number of the current position.
    """

    def __init__(self, contents):
        self.contents = contents
        self.lineno = 1

    def get(self):
        """Consume and return the next token.

        Anything matched by the skip patterns in front of the token is
        discarded first.

        Returns None once the input is exhausted. Raises UnknownToken
        (carrying the current line number) if no token pattern matches.
        """
        self._skip()
        if not self.contents:
            return None

        for kind, regex in types:
            found = regex.match(self.contents)
            # A zero-length match is treated as no match at all.
            if not found or found.end() == 0:
                continue

            text = found.group()
            if regex is re_dub and re_int.match(text):
                # The DUB pattern also accepts plain integers.
                kind = TYPE_INT
            elif kind == TYPE_LIT:
                # Drop the surrounding quote characters.
                text = text[1:-1]
            ### fold TYPE_HEX into TYPE_INT? convert INT/DUB away from string?
            result = Token(kind, text)
            self._chomp(found.end())
            return result

        raise UnknownToken(self.lineno)

    def _skip(self):
        "Discard everything at the front of the input matched by skip_re."
        progressed = True
        while progressed:
            progressed = False
            for regex in skip_re:
                found = regex.match(self.contents)
                if found:
                    self._chomp(found.end())
                    # Start again from the first skip pattern.
                    progressed = True
                    break

    def _chomp(self, amt):
        "Drop the first AMT characters of the contents, counting newlines."
        dropped, remainder = self.contents[:amt], self.contents[amt:]
        self.lineno += dropped.count('\n')
        self.contents = remainder
class Scanner(SimpleScanner):
    """Scanner with one-token pushback, DOC capture and checked reads.

    DOC tokens are never returned by get(); the last one seen while
    fetching a token is kept in ``self.doc``.
    """

    def __init__(self, contents):
        SimpleScanner.__init__(self, contents)
        # Last DOC token skipped by the most recent non-pushback get().
        self.doc = None
        # Token handed back through pushback(); returned by the next get().
        self.pending = None

    def get(self, eof_allowed=True):
        """Get the next non-DOC token.

        A pushed-back token, if any, is returned first and self.doc is left
        untouched. Otherwise self.doc is reset and tokens are read from
        SimpleScanner.get(); each DOC token is stored in self.doc instead
        of being returned.

        Returns None at end of input when EOF_ALLOWED is true, and raises
        UnexpectedEOF (with the current line number) otherwise.
        """
        if self.pending is not None:
            token = self.pending
            self.pending = None
            return token
        self.doc = None
        while True:
            t = SimpleScanner.get(self)
            if t is None:
                if eof_allowed:
                    return None
                raise UnexpectedEOF(self.lineno)
            if t.ttype != TYPE_DOC:
                return t
            self.doc = t

    def get_type(self, ttype):
        """Get the next token, ensuring it is of the given type.

        Raises UnexpectedEOF at end of input and ExpectedType when the
        token has a different type.
        """
        t = self.get(eof_allowed=False)
        if t.ttype != ttype:
            raise ExpectedType(ttype, t.ttype, self.lineno)
        return t

    def value_of(self, ttype):
        "Get the next token's value, ensuring it is of the given type."
        return self.get_type(ttype).tvalue

    def pushback(self, token):
        "Push a token back into the scanner; it was unused."
        assert token is not None
        assert self.pending is None
        self.pending = token

    def eat_commasemi(self):
        "Eat a comma or a semicolon, if present."
        t = self.get()
        if t is None:
            # End of input: there is no separator to eat and nothing to push
            # back. Comparing None with a Token (or pushing None back) would
            # fail.
            return
        if t != SYM_COMMA and t != SYM_SEMI:
            self.pushback(t)

    def eat_expected(self, token):
        """Eat the expected token, or raise a ExpectedError.

        End of input counts as a mismatch; the error then carries None as
        the token that was found.
        """
        t = self.get()
        # Test for None first so that EOF is reported as ExpectedError rather
        # than failing inside the Token comparison.
        if t is None or t != token:
            raise ExpectedError(token, t, self.lineno)
class Token(object):
    """A scanned token: a type name plus an optional value.

    Tokens compare equal when both their type and value are equal, and are
    hashable consistently with that equality. Comparing a Token with an
    object that is not a Token yields "not equal" instead of raising.
    """

    def __init__(self, ttype, tvalue=None):
        self.ttype = ttype
        self.tvalue = tvalue

    def __str__(self):
        if self.tvalue is None:
            return 'T(%s)' % self.ttype
        return 'T(%s, "%s")' % (self.ttype, self.tvalue)

    def __eq__(self, other):
        if not isinstance(other, Token):
            # Let Python fall back to its default comparison, e.g. for
            # ``token == None``, rather than failing on a missing attribute.
            return NotImplemented
        return self.ttype == other.ttype and self.tvalue == other.tvalue

    def __ne__(self, other):
        if not isinstance(other, Token):
            return NotImplemented
        return self.ttype != other.ttype or self.tvalue != other.tvalue

    def __hash__(self):
        return hash((self.ttype, self.tvalue))
# For every keyword below, define a module-level ID_<KEYWORD> constant
# holding the matching TYPE_ID token, and export it through __all__.
for ident in [
        'namespace', 'cpp_namespace', 'cpp_include', 'cpp_type',
        'java_package', 'cocoa_prefix', 'csharp_namespace',
        'php_namespace', 'py_module', 'perl_package', 'ruby_namespace',
        'smalltalk_category', 'smalltalk_prefix',
        'xsd_all', 'xsd_optional', 'xsd_nillable', 'xsd_namespace',
        'xsd_attrs',
        'include',
        'void', 'bool', 'byte', 'i16', 'i32', 'i64', 'double', 'string',
        'binary', 'slist', 'senum',
        'map', 'list', 'set',
        'async', 'typedef', 'struct', 'exception', 'extends', 'throws',
        'service', 'enum', 'const', 'required', 'optional',
]:
    name = 'ID_%s' % ident.upper()
    globals()[name] = Token(TYPE_ID, ident)
    __all__.append(name)
# For every (name, character) pair below, define a module-level SYM_<NAME>
# constant holding the matching TYPE_SYM token, and export it through
# __all__.
for name, sym in [
        ('COLON', ':'), ('SEMI', ';'), ('COMMA', ','),
        ('LBRACE', '{'), ('RBRACE', '}'),
        ('LPAREN', '('), ('RPAREN', ')'),
        ('LBRKT', '['), ('RBRKT', ']'),
        ('EQ', '='),
        ('LT', '<'), ('GT', '>'),
]:
    globals()['SYM_%s' % name] = Token(TYPE_SYM, sym)
    __all__.append('SYM_%s' % name)
class ExpectedError(Exception):
    """Expected token was not present.

    Scanner.eat_expected() raises this with the arguments
    (expected token, token actually read, line number).
    """
class ExpectedType(Exception):
    """Expected token type was not present.

    Scanner.get_type() raises this with the arguments
    (expected type, type actually read, line number).
    """
class UnexpectedEOF(Exception):
    """EOF reached unexpectedly.

    Scanner.get() raises this with the current line number when the input
    ends and the caller passed eof_allowed=False.
    """
class UnknownToken(Exception):
    """Unknown token encountered.

    SimpleScanner.get() raises this with the current line number when none
    of the token patterns matches the remaining input.
    """
class IncorrectSyntax(Exception):
    """Incorrect syntax encountered.

    Exported through __all__; the scanner classes in this module do not
    raise it themselves.
    """
if __name__ == '__main__':
    # Debug driver: tokenize the file named by the first command-line
    # argument and print one token per line.
    import sys

    # Close the file as soon as it has been read instead of leaking the
    # handle until interpreter exit.
    with open(sys.argv[1]) as source:
        s = Scanner(source.read())

    while True:
        token = s.get()
        if token is None:
            break
        # The call form with a single argument prints the same text under
        # Python 2 and is also valid Python 3 syntax.
        print(token)