| # |
| # simple scanner for Thrift. emits tokens. |
| # |
| |
| __all__ = ['Scanner', 'SimpleScanner', 'Token', 'TYPE_INT', |
| 'ExpectedError', 'ExpectedType', 'UnexpectedEOF', |
| 'UnknownToken', 'IncorrectSyntax', |
| ] |
| |
| import re |
| |
| re_int = re.compile('[+-]?[0-9]+$') # special handling |
| re_hex = re.compile('0x[0-9A-Fa-f]+') |
| re_dub = re.compile(r'[+-]?[0-9]*(\.[0-9]+)?([eE][+-]?[0-9]+)?') |
| |
| re_white = re.compile('[ \t\r\n]+') |
| re_silly = re.compile(r'/\*+\*/') |
| re_multi = re.compile(r'/\*[^*]/*([^*/]|[^*]/|\*[^/])*\*+/') |
| re_comment = re.compile('//[^\n]*') |
| re_unix = re.compile('#[^\n]*') |
| |
| re_doc = re.compile(r'/\*\*([^*/]|[^*]/|\*[^/])*\*+/') |
| |
| re_ident = re.compile('[a-zA-Z_][\.a-zA-Z_0-9]*') |
| re_symbol = re.compile(r'[:;,{}()=<>\[\]]') |
| re_dliteral = re.compile('"[^"]*"') |
| re_sliteral = re.compile("'[^']*'") |
| re_st_ident = re.compile('[a-zA-Z-][.a-zA-Z_0-9-]*') |
| |
| skip_re = [re_white, re_silly, re_multi, re_comment, re_unix] |
| |
| types = [ |
| ('HEX', re_hex), # keep before re_dub |
| ('DUB', re_dub), |
| ('DOC', re_doc), |
| ('ID', re_ident), |
| ('SYM', re_symbol), |
| ('LIT', re_dliteral), |
| ('LIT', re_sliteral), |
| ('STID', re_st_ident), |
| ] |
| |
| for key, pattern in types: |
| globals()['TYPE_' + key] = key |
| __all__.append('TYPE_' + key) |
| TYPE_INT = 'INT' |
| |
| |
| class SimpleScanner(object): |
| |
| def __init__(self, contents): |
| self.contents = contents |
| self.lineno = 1 |
| |
| def get(self): |
| """Get the next token. |
| |
| Consumes and returns the next token. Note that leading whitespace is |
| skipped. |
| |
| Returns None if there are no more tokens. |
| """ |
| self._skip() |
| |
| if not self.contents: |
| return None |
| |
| for ttype, pattern in types: |
| m = pattern.match(self.contents) |
| if m: |
| if m.end() == 0: |
| continue |
| tvalue = m.group() |
| if pattern is re_dub and re_int.match(tvalue): |
| ttype = TYPE_INT |
| elif ttype == TYPE_LIT: |
| # strip quotes |
| tvalue = tvalue[1:-1] |
| ### fold TYPE_HEX into TYPE_INT? convert INT/DUB away from string? |
| token = Token(ttype, tvalue) |
| self._chomp(m.end()) |
| return token |
| |
| raise UnknownToken(self.lineno) |
| |
| def _skip(self): |
| "Skip over leading whitespace." |
| |
| while True: |
| for pattern in skip_re: |
| m = pattern.match(self.contents) |
| if m: |
| self._chomp(m.end()) |
| break |
| else: |
| # nothing matched. all done. |
| return |
| |
| def _chomp(self, amt): |
| "Chomp AMT bytes off the front of the contents. Count newlines." |
| self.lineno += self.contents[:amt].count('\n') |
| self.contents = self.contents[amt:] |
| |
| |
| class Scanner(SimpleScanner): |
| def __init__(self, contents): |
| SimpleScanner.__init__(self, contents) |
| |
| self.doc = None |
| self.pending = None |
| |
| def get(self, eof_allowed=True): |
| if self.pending is not None: |
| token = self.pending |
| self.pending = None |
| return token |
| |
| self.doc = None |
| while True: |
| t = SimpleScanner.get(self) |
| if t is None: |
| if eof_allowed: |
| return None |
| raise UnexpectedEOF(self.lineno) |
| if t.ttype != TYPE_DOC: |
| #print 'TOKEN:', t |
| return t |
| self.doc = t |
| |
| def get_type(self, ttype): |
| "Get the next token, ensuring it is of the given type." |
| t = self.get(eof_allowed=False) |
| if t.ttype != ttype: |
| raise ExpectedType(ttype, t.ttype, self.lineno) |
| return t |
| |
| def value_of(self, ttype): |
| "Get the next token's value, ensuring it is of the given type." |
| return self.get_type(ttype).tvalue |
| |
| def pushback(self, token): |
| "Push a token back into the scanner; it was unused." |
| assert token is not None |
| assert self.pending is None |
| self.pending = token |
| |
| def eat_commasemi(self): |
| "Eat a comma or a semicolon, if present." |
| t = self.get() |
| if t != SYM_COMMA and t != SYM_SEMI: |
| self.pushback(t) |
| |
| def eat_expected(self, token): |
| "Eat the expected token, or raise a ExpectedError." |
| t = self.get() |
| if t != token: |
| raise ExpectedError(token, t, self.lineno) |
| |
| |
| class Token(object): |
| def __init__(self, ttype, tvalue=None): |
| self.ttype = ttype |
| self.tvalue = tvalue |
| |
| def __str__(self): |
| if self.tvalue is None: |
| return 'T(%s)' % self.ttype |
| return 'T(%s, "%s")' % (self.ttype, self.tvalue) |
| |
| def __eq__(self, other): |
| return self.ttype == other.ttype and self.tvalue == other.tvalue |
| |
| def __ne__(self, other): |
| return self.ttype != other.ttype or self.tvalue != other.tvalue |
| |
| def __hash__(self): |
| return hash((self.ttype, self.tvalue)) |
| |
| |
| for ident in ['namespace', |
| 'cpp_namespace', |
| 'cpp_include', |
| 'cpp_type', |
| 'java_package', |
| 'cocoa_prefix', |
| 'csharp_namespace', |
| 'php_namespace', |
| 'py_module', |
| 'perl_package', |
| 'ruby_namespace', |
| 'smalltalk_category', |
| 'smalltalk_prefix', |
| 'xsd_all', |
| 'xsd_optional', |
| 'xsd_nillable', |
| 'xsd_namespace', |
| 'xsd_attrs', |
| 'include', |
| 'void', |
| 'bool', |
| 'byte', |
| 'i16', |
| 'i32', |
| 'i64', |
| 'double', |
| 'string', |
| 'binary', |
| 'slist', |
| 'senum', |
| 'map', |
| 'list', |
| 'set', |
| 'async', |
| 'typedef', |
| 'struct', |
| 'exception', |
| 'extends', |
| 'throws', |
| 'service', |
| 'enum', |
| 'const', |
| 'required', |
| 'optional', |
| ]: |
| name = 'ID_' + ident.upper() |
| globals()[name] = Token(TYPE_ID, ident) |
| __all__.append(name) |
| |
| |
| for name, sym in [('COLON', ':'), |
| ('SEMI', ';'), |
| ('COMMA', ','), |
| ('LBRACE', '{'), |
| ('RBRACE', '}'), |
| ('LPAREN', '('), |
| ('RPAREN', ')'), |
| ('LBRKT', '['), |
| ('RBRKT', ']'), |
| ('EQ', '='), |
| ('LT', '<'), |
| ('GT', '>'), |
| ]: |
| globals()['SYM_' + name] = Token(TYPE_SYM, sym) |
| __all__.append('SYM_' + name) |
| |
| |
| class ExpectedError(Exception): |
| "Expected token was not present." |
| |
| class ExpectedType(Exception): |
| "Expected token type was not present." |
| |
| class UnexpectedEOF(Exception): |
| "EOF reached unexpectedly." |
| |
| class UnknownToken(Exception): |
| "Unknown token encountered." |
| |
| class IncorrectSyntax(Exception): |
| "Incorrect syntax encountered." |
| |
| |
| if __name__ == '__main__': |
| import sys |
| |
| s = Scanner(open(sys.argv[1]).read()) |
| while True: |
| token = s.get() |
| if token is None: |
| break |
| print token |