Snapshot (back up) my work-in-progress before I hop on a plane.
git-svn-id: https://svn.apache.org/repos/asf/incubator/thrift/branches/py-compiler@739520 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/compiler/py/src/scanner.py b/compiler/py/src/scanner.py
new file mode 100644
index 0000000..6524768
--- /dev/null
+++ b/compiler/py/src/scanner.py
@@ -0,0 +1,271 @@
+#
+# simple scanner for Thrift. emits tokens.
+#
+
+# Public names; the TYPE_*, ID_* and SYM_* constants are appended further down.
+__all__ = ['Scanner', 'SimpleScanner', 'Token', 'TYPE_INT']
+__all__ += ['ExpectedError', 'ExpectedType', 'UnexpectedEOF',
+            'UnknownToken', 'IncorrectSyntax']
+
+import re
+
+re_int = re.compile('[+-]?[0-9]+$')  # only applied to re_dub matches
+re_hex = re.compile('0x[0-9A-Fa-f]+')
+# Needs a mantissa digit, so a bare sign or a name like e5 is not a number.
+re_dub = re.compile(r'[+-]?([0-9]+(\.[0-9]+)?|\.[0-9]+)([eE][+-]?[0-9]+)?')
+
+# Skipped between tokens: whitespace and non-doc comments.
+re_white = re.compile('[ \t\r\n]+')
+re_silly = re.compile(r'/\*+\*/')
+re_multi = re.compile(r'/\*[^*]/*([^*/]|[^*]/|\*[^/])*\*+/')
+re_comment = re.compile('//[^\n]*')
+re_unix = re.compile('#[^\n]*')
+skip_re = [re_white, re_silly, re_multi, re_comment, re_unix]
+
+re_doc = re.compile(r'/\*\*([^*/]|[^*]/|\*[^/])*\*+/')
+re_ident = re.compile(r'[a-zA-Z_][\.a-zA-Z_0-9]*')
+re_symbol = re.compile(r'[:;,{}()=<>\[\]]')
+re_dliteral = re.compile('"[^"]*"')
+re_sliteral = re.compile("'[^']*'")
+re_st_ident = re.compile('[a-zA-Z-][.a-zA-Z_0-9-]*')
+
+# (token type, pattern) pairs tried in order; first non-empty match wins.
+types = [
+    ('HEX', re_hex),  # keep before re_dub
+    ('DUB', re_dub),
+    ('DOC', re_doc),
+    ('ID', re_ident),
+    ('SYM', re_symbol),
+    ('LIT', re_dliteral),
+    ('LIT', re_sliteral),
+    ('STID', re_st_ident),
+]
+for key, pattern in types:
+    globals()['TYPE_' + key] = key
+    __all__.append('TYPE_' + key)
+TYPE_INT = 'INT'
+
+
+class SimpleScanner(object):
+    """Tokenizer that consumes a string of Thrift source from the front.
+
+    `contents` holds the text not yet consumed; `lineno` is the 1-based
+    line number of its first character.
+    """
+
+    def __init__(self, contents):
+        self.lineno = 1
+        self.contents = contents
+
+    def get(self):
+        """Consume and return the next Token, or None at end of input.
+
+        Whitespace and skippable comments in front of the token are
+        discarded first.  Raises UnknownToken (carrying the current line
+        number) when no token pattern matches.
+        """
+        self._skip()
+        if not self.contents:
+            return None
+
+        for ttype, pattern in types:
+            found = pattern.match(self.contents)
+            if not found or found.end() == 0:
+                # no match, or a zero-length one that would consume nothing
+                continue
+            text = found.group()
+            if pattern is re_dub and re_int.match(text):
+                ttype = TYPE_INT
+            elif ttype == TYPE_LIT:
+                text = text[1:-1]  # drop the surrounding quotes
+            # TODO: fold TYPE_HEX into TYPE_INT? convert INT/DUB from string?
+            self._chomp(found.end())
+            return Token(ttype, text)
+
+        raise UnknownToken(self.lineno)
+
+    def _skip(self):
+        "Discard leading whitespace and comments until none remain."
+        progressed = True
+        while progressed:
+            progressed = False
+            for pattern in skip_re:
+                found = pattern.match(self.contents)
+                if found:
+                    self._chomp(found.end())
+                    progressed = True
+                    break
+
+    def _chomp(self, amt):
+        "Drop AMT characters from the front of contents, counting newlines."
+        consumed, self.contents = self.contents[:amt], self.contents[amt:]
+        self.lineno += consumed.count('\n')
+
+
+class Scanner(SimpleScanner):
+    "SimpleScanner plus one-token pushback; DOC tokens are kept in self.doc."
+
+    def __init__(self, contents):
+        SimpleScanner.__init__(self, contents)
+        self.doc = None      # last DOC token skipped by get(), if any
+        self.pending = None  # token handed back through pushback()
+
+    def get(self, eof_allowed=True):
+        "Get the next non-DOC token (None, or UnexpectedEOF, at end of input)."
+        if self.pending is not None:
+            token, self.pending = self.pending, None
+            return token
+        self.doc = None
+        while True:
+            t = SimpleScanner.get(self)
+            if t is None:
+                if eof_allowed:
+                    return None
+                raise UnexpectedEOF(self.lineno)  # caller required a token
+            if t.ttype != TYPE_DOC:
+                return t
+            self.doc = t
+
+    def get_type(self, ttype):
+        "Get the next token; raise ExpectedType unless its type is TTYPE."
+        t = self.get(eof_allowed=False)
+        if t.ttype != ttype:
+            raise ExpectedType(ttype, t.ttype, self.lineno)
+        return t
+
+    def value_of(self, ttype):
+        "Get the next token's value, ensuring it is of the given type."
+        return self.get_type(ttype).tvalue
+
+    def pushback(self, token):
+        "Push an unused token back; at most one may be pending."
+        assert token is not None
+        assert self.pending is None
+        self.pending = token
+
+    def eat_commasemi(self):
+        "Eat a comma or a semicolon, if present; a no-op at end of input."
+        t = self.get()
+        # At EOF t is None: it must not be compared to a Token or pushed back.
+        if t is not None and t != SYM_COMMA and t != SYM_SEMI:
+            self.pushback(t)
+
+    def eat_expected(self, token):
+        "Eat the expected token; raise ExpectedError otherwise, even at EOF."
+        t = self.get()
+        if t is None or t != token:
+            raise ExpectedError(token, t, self.lineno)
+
+
+class Token(object):
+    "A scanned token: a type tag (ttype) plus its text (tvalue), if any."
+    def __init__(self, ttype, tvalue=None):
+        self.ttype, self.tvalue = ttype, tvalue
+
+    def __str__(self):
+        if self.tvalue is None:
+            return 'T(%s)' % self.ttype
+        return 'T(%s, "%s")' % (self.ttype, self.tvalue)
+
+    def __eq__(self, other):  # a non-Token (e.g. None at EOF) is unequal
+        return (isinstance(other, Token) and self.ttype == other.ttype
+                and self.tvalue == other.tvalue)
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def __hash__(self):
+        return hash((self.ttype, self.tvalue))
+
+
+# Create and export an ID_<WORD> Token constant for each word listed here.
+for ident in (
+    'namespace',
+    'cpp_namespace',
+    'cpp_include',
+    'cpp_type',
+    'java_package',
+    'cocoa_prefix',
+    'csharp_namespace',
+    'php_namespace',
+    'py_module',
+    'perl_package',
+    'ruby_namespace',
+    'smalltalk_category',
+    'smalltalk_prefix',
+    'xsd_all',
+    'xsd_optional',
+    'xsd_nillable',
+    'xsd_namespace',
+    'xsd_attrs',
+    'include',
+    'void',
+    'bool',
+    'byte',
+    'i16', 'i32', 'i64',
+    'double',
+    'string',
+    'binary',
+    'slist',
+    'senum',
+    'map',
+    'list',
+    'set',
+    'async',
+    'typedef',
+    'struct',
+    'exception',
+    'extends',
+    'throws',
+    'service',
+    'enum',
+    'const',
+    'required',
+    'optional',
+):
+    name = 'ID_%s' % ident.upper()
+    __all__.append(name)
+    globals()[name] = Token(TYPE_ID, ident)
+
+
+# Create and export a SYM_<NAME> Token constant for each single-character
+# symbol below.
+for name, sym in (
+    ('COLON', ':'),
+    ('SEMI', ';'),
+    ('COMMA', ','),
+    ('LBRACE', '{'), ('RBRACE', '}'),
+    ('LPAREN', '('), ('RPAREN', ')'),
+    ('LBRKT', '['), ('RBRKT', ']'),
+    ('EQ', '='),
+    ('LT', '<'),
+    ('GT', '>'),
+):
+    __all__.append('SYM_' + name)
+    globals()['SYM_' + name] = Token(TYPE_SYM, sym)
+
+
+class ExpectedError(Exception):
+    """Raised as (expected token, found token, line) on a token mismatch."""
+
+class ExpectedType(Exception):
+    """Raised as (expected type, found type, line) on a type mismatch."""
+
+class UnexpectedEOF(Exception):
+    """Raised with the line number when input ends but a token is required."""
+
+class UnknownToken(Exception):
+    """Raised with the line number when no token pattern matches the input."""
+
+class IncorrectSyntax(Exception):
+    """Incorrect syntax encountered (exported, but not raised in this module)."""
+
+
+if __name__ == '__main__':  # debugging aid: dump the tokens of argv[1]
+    import sys
+    source = open(sys.argv[1])
+    s = Scanner(source.read())
+    source.close()  # release the handle explicitly instead of leaking it
+    token = s.get()
+    while token is not None:
+        print(token)  # call form: valid on both Python 2 and Python 3
+        token = s.get()