Snapshot (back up) my work-in-progress before I hop on a plane.


git-svn-id: https://svn.apache.org/repos/asf/incubator/thrift/branches/py-compiler@739520 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/compiler/py/src/scanner.py b/compiler/py/src/scanner.py
new file mode 100644
index 0000000..6524768
--- /dev/null
+++ b/compiler/py/src/scanner.py
@@ -0,0 +1,271 @@
+#
+# simple scanner for Thrift. emits tokens.
+#
+
+__all__ = ['Scanner', 'SimpleScanner', 'Token', 'TYPE_INT',
+           'ExpectedError', 'ExpectedType', 'UnexpectedEOF',
+           'UnknownToken', 'IncorrectSyntax',
+           ]  # the TYPE_*, ID_* and SYM_* names are appended further down
+
+import re
+
+re_int = re.compile('[+-]?[0-9]+$')  # not in `types`; marks a DUB as INT
+re_hex = re.compile('0x[0-9A-Fa-f]+')
+re_dub = re.compile(r'[+-]?[0-9]*(\.[0-9]+)?([eE][+-]?[0-9]+)?')
+# Whitespace and comments: discarded between tokens (see skip_re).
+re_white = re.compile('[ \t\r\n]+')
+re_silly = re.compile(r'/\*+\*/')
+re_multi = re.compile(r'/\*[^*]/*([^*/]|[^*]/|\*[^/])*\*+/')
+re_comment = re.compile('//[^\n]*')
+re_unix = re.compile('#[^\n]*')
+# Doc comment: not in skip_re, so it is scanned as a token of its own.
+re_doc = re.compile(r'/\*\*([^*/]|[^*]/|\*[^/])*\*+/')
+# Raw string: '\.' in a plain literal is an invalid escape sequence.
+re_ident = re.compile(r'[a-zA-Z_][\.a-zA-Z_0-9]*')
+re_symbol = re.compile(r'[:;,{}()=<>\[\]]')
+re_dliteral = re.compile('"[^"]*"')
+re_sliteral = re.compile("'[^']*'")
+re_st_ident = re.compile('[a-zA-Z-][.a-zA-Z_0-9-]*')
+
+skip_re = [re_white, re_silly, re_multi, re_comment, re_unix]
+
+types = [  # (type name, pattern) pairs, tried in order by SimpleScanner.get
+  ('HEX', re_hex),  # keep before re_dub
+  ('DUB', re_dub),  # reported as TYPE_INT when the text also matches re_int
+  ('DOC', re_doc),
+  ('ID', re_ident),
+  ('SYM', re_symbol),
+  ('LIT', re_dliteral),
+  ('LIT', re_sliteral),
+  ('STID', re_st_ident),
+  ]
+# Define a TYPE_<name> module constant for each entry above and export it.
+for key, pattern in types:
+  globals()['TYPE_' + key] = key
+  __all__.append('TYPE_' + key)
+TYPE_INT = 'INT'  # has no table entry of its own; derived from DUB matches
+
+
+class SimpleScanner(object):
+  "Turns a string into Tokens, one per get() call, tracking line numbers."
+  def __init__(self, contents):
+    self.lineno = 1  # line number at the front of the unread text
+    self.contents = contents  # text that has not been consumed yet
+
+  def get(self):
+    """Consume and return the next token, or None at end of input.
+
+    Whitespace and comments in front of the token are discarded first.
+    The patterns in `types` are then tried in order and the first one
+    with a non-empty match wins.
+
+    Raises UnknownToken (with the current line number) if none matches.
+    """
+    self._skip()
+    text = self.contents
+    if not text:
+      return None
+
+    for kind, regex in types:
+      found = regex.match(text)
+      if not found or found.end() == 0:
+        # no match, or an empty one (re_dub can match nothing): try next
+        continue
+      value = found.group()
+      if regex is re_dub and re_int.match(value):
+        # sign and digits only: report the number as an integer
+        kind = TYPE_INT
+      elif kind == TYPE_LIT:
+        value = value[1:-1]  # drop the surrounding quotes
+      ### fold TYPE_HEX into TYPE_INT? convert INT/DUB away from string?
+      self._chomp(found.end())
+      return Token(kind, value)
+
+    raise UnknownToken(self.lineno)
+
+  def _skip(self):
+    "Discard leading whitespace and comments (everything in skip_re)."
+    progressed = True
+    while progressed:
+      progressed = False
+      for regex in skip_re:
+        found = regex.match(self.contents)
+        if found:
+          # drop it, then rescan from the first pattern again
+          self._chomp(found.end())
+          progressed = True
+          break
+
+  def _chomp(self, amt):
+    "Drop AMT characters from the front of the contents, counting newlines."
+    eaten, self.contents = self.contents[:amt], self.contents[amt:]
+    self.lineno += eaten.count('\n')
+
+
+class Scanner(SimpleScanner):
+  "SimpleScanner with one token of pushback that sets DOC tokens aside."
+  def __init__(self, contents):
+    SimpleScanner.__init__(self, contents)
+    self.doc = None  # last DOC token that get() passed over, if any
+    self.pending = None  # token handed back through pushback(), if any
+
+  def get(self, eof_allowed=True):
+    """Return the next non-DOC token (a pushed-back one first); None at EOF.
+    DOC tokens are stashed in self.doc, which is reset on each fresh read.
+    Raises UnexpectedEOF at EOF when eof_allowed is false."""
+    if self.pending is not None:
+      token, self.pending = self.pending, None
+      return token
+    self.doc = None
+    while True:
+      t = SimpleScanner.get(self)
+      if t is None and not eof_allowed:
+        raise UnexpectedEOF(self.lineno)
+      if t is None or t.ttype != TYPE_DOC:
+        return t
+      self.doc = t  # remember the doc comment and keep looking
+
+  def get_type(self, ttype):
+    "Return the next token; raise ExpectedType unless its type is TTYPE."
+    t = self.get(eof_allowed=False)  # EOF here raises UnexpectedEOF
+    if t.ttype != ttype:
+      raise ExpectedType(ttype, t.ttype, self.lineno)
+    return t
+
+  def value_of(self, ttype):
+    "Get the next token's value, ensuring it is of the given type."
+    return self.get_type(ttype).tvalue
+
+  def pushback(self, token):
+    "Push a token back into the scanner; it was unused."
+    assert token is not None
+    assert self.pending is None  # only a single token can be held
+    self.pending = token
+
+  def eat_commasemi(self):
+    "Eat a comma or a semicolon, if present."
+    t = self.get()
+    # At EOF (None) there is nothing to eat and nothing to push back.
+    if t is not None and t != SYM_COMMA and t != SYM_SEMI:
+      self.pushback(t)
+
+  def eat_expected(self, token):
+    "Eat the expected token, or raise an ExpectedError."
+    t = self.get()
+    # EOF (None) counts as a mismatch rather than crashing the comparison.
+    if t is None or t != token:
+      raise ExpectedError(token, t, self.lineno)
+
+
+class Token(object):
+  "A scanned token: a type tag (ttype) and its text (tvalue, may be None)."
+  def __init__(self, ttype, tvalue=None):
+    self.ttype, self.tvalue = ttype, tvalue
+
+  def __str__(self):
+    if self.tvalue is None:
+      return 'T(%s)' % self.ttype
+    return 'T(%s, "%s")' % (self.ttype, self.tvalue)
+
+  def __eq__(self, other):  # a non-Token (e.g. None at EOF) is never equal
+    return (isinstance(other, Token) and
+            self.ttype == other.ttype and self.tvalue == other.tvalue)
+  def __ne__(self, other):
+    return not self.__eq__(other)
+
+  def __hash__(self):
+    return hash((self.ttype, self.tvalue))
+
+
+for ident in ['namespace',  # each word gets a predefined ID_<WORD> Token
+              'cpp_namespace',
+              'cpp_include',
+              'cpp_type',
+              'java_package',
+              'cocoa_prefix',
+              'csharp_namespace',
+              'php_namespace',
+              'py_module',
+              'perl_package',
+              'ruby_namespace',
+              'smalltalk_category',
+              'smalltalk_prefix',
+              'xsd_all',
+              'xsd_optional',
+              'xsd_nillable',
+              'xsd_namespace',
+              'xsd_attrs',
+              'include',
+              'void',
+              'bool',
+              'byte',
+              'i16',
+              'i32',
+              'i64',
+              'double',
+              'string',
+              'binary',
+              'slist',
+              'senum',
+              'map',
+              'list',
+              'set',
+              'async',
+              'typedef',
+              'struct',
+              'exception',
+              'extends',
+              'throws',
+              'service',
+              'enum',
+              'const',
+              'required',
+              'optional',
+              ]:
+  name = 'ID_' + ident.upper()  # e.g. 'cpp_type' -> 'ID_CPP_TYPE'
+  globals()[name] = Token(TYPE_ID, ident)  # compares equal to a scanned ID
+  __all__.append(name)  # export the constant
+
+
+for name, sym in [('COLON', ':'),  # (constant suffix, symbol text) pairs
+                  ('SEMI', ';'),
+                  ('COMMA', ','),
+                  ('LBRACE', '{'),
+                  ('RBRACE', '}'),
+                  ('LPAREN', '('),
+                  ('RPAREN', ')'),
+                  ('LBRKT', '['),
+                  ('RBRKT', ']'),
+                  ('EQ', '='),
+                  ('LT', '<'),
+                  ('GT', '>'),
+                  ]:
+  globals()['SYM_' + name] = Token(TYPE_SYM, sym)  # e.g. SYM_COLON
+  __all__.append('SYM_' + name)  # export the constant
+
+
+class ExpectedError(Exception):
+  "Expected token was not present; args: (expected, actual, lineno)."
+
+class ExpectedType(Exception):
+  "Expected token type was not present; args: (expected, actual, lineno)."
+
+class UnexpectedEOF(Exception):
+  "EOF reached unexpectedly; args: (lineno,)."
+
+class UnknownToken(Exception):
+  "Unknown token encountered; args: (lineno,)."
+
+class IncorrectSyntax(Exception):
+  "Incorrect syntax encountered (exported, but not raised in this module)."
+
+
+if __name__ == '__main__':
+  # Debug aid: print every token of the file named on the command line.
+  import sys
+  s = Scanner(open(sys.argv[1]).read())
+  while True:
+    token = s.get()
+    if token is None:
+      break
+    print(token)  # parenthesized form also parses on Python 3