#
# Simple scanner for Thrift. Emits tokens.
#
| 4 | |
# Names exported by ``from <module> import *``.  The loops later in this
# module that generate the TYPE_*, ID_* and SYM_* constants append their
# names to this list as well.
__all__ = [
    'Scanner',
    'SimpleScanner',
    'Token',
    'TYPE_INT',
    'ExpectedError',
    'ExpectedType',
    'UnexpectedEOF',
    'UnknownToken',
    'IncorrectSyntax',
]
| 9 | |
| 10 | import re |
| 11 | |
# Numeric literals.  re_int is not used to scan input directly:
# SimpleScanner.get() applies it to text already matched by re_dub in order
# to tell integers apart from doubles (hence the trailing '$').
re_int = re.compile('[+-]?[0-9]+$')  # special handling
re_hex = re.compile('0x[0-9A-Fa-f]+')
# NOTE: every component of re_dub is optional, so it can match the empty
# string; SimpleScanner.get() ignores zero-length matches.
re_dub = re.compile(r'[+-]?[0-9]*(\.[0-9]+)?([eE][+-]?[0-9]+)?')

# Input skipped between tokens: whitespace and non-doc comments.
re_white = re.compile('[ \t\r\n]+')
re_silly = re.compile(r'/\*+\*/')  # empty block comments such as /**/
re_multi = re.compile(r'/\*[^*]/*([^*/]|[^*]/|\*[^/])*\*+/')
re_comment = re.compile('//[^\n]*')
re_unix = re.compile('#[^\n]*')

# "/** ... */" comments are not skipped; they are scanned as DOC tokens.
re_doc = re.compile(r'/\*\*([^*/]|[^*]/|\*[^/])*\*+/')

# Raw string: '\.' is not a valid string escape, so the non-raw spelling
# triggers an invalid-escape-sequence warning on modern Python.  The compiled
# pattern text is unchanged.
re_ident = re.compile(r'[a-zA-Z_][\.a-zA-Z_0-9]*')
re_symbol = re.compile(r'[:;,{}()=<>\[\]]')
# String literals; there is no escape mechanism for the quote character.
re_dliteral = re.compile('"[^"]*"')
re_sliteral = re.compile("'[^']*'")
re_st_ident = re.compile('[a-zA-Z-][.a-zA-Z_0-9-]*')

# Patterns that SimpleScanner._skip() discards before each token.
skip_re = [re_white, re_silly, re_multi, re_comment, re_unix]
| 31 | |
# Token types in the order SimpleScanner.get() tries them; the first pattern
# producing a non-empty match determines the token.
types = [
    ('HEX', re_hex),  # keep before re_dub
    ('DUB', re_dub),
    ('DOC', re_doc),
    ('ID', re_ident),
    ('SYM', re_symbol),
    ('LIT', re_dliteral),
    ('LIT', re_sliteral),
    ('STID', re_st_ident),
]

# Define and export a TYPE_<KEY> constant for every token type.  Its value
# is the key string itself.
for key, pattern in types:
    globals()['TYPE_' + key] = key
    # 'LIT' is listed twice (double- and single-quoted literals); export
    # TYPE_LIT only once.
    if 'TYPE_' + key not in __all__:
        __all__.append('TYPE_' + key)
# INT has no pattern of its own (SimpleScanner.get() derives it from a DUB
# match), so it is defined by hand; it is already listed in __all__.
TYPE_INT = 'INT'
| 47 | |
| 48 | |
class SimpleScanner(object):
    """Tokenizer that consumes a string from the front.

    `contents` always holds the text that has not been scanned yet, and
    `lineno` is the (1-based) line number at which that text starts.
    """

    def __init__(self, contents):
        self.lineno = 1
        self.contents = contents

    def get(self):
        """Get the next token.

        Leading whitespace and comments are skipped, then the next token is
        consumed and returned.

        Returns None if there are no more tokens.  Raises UnknownToken,
        carrying the current line number, if the input matches no token
        pattern.
        """
        self._skip()

        remaining = self.contents
        if not remaining:
            return None

        for ttype, pattern in types:
            m = pattern.match(remaining)
            # An empty match consumes nothing, so treat it as no match.
            if not m or m.end() == 0:
                continue

            tvalue = m.group()
            if pattern is re_dub and re_int.match(tvalue):
                # The double pattern also accepts plain integers.
                ttype = TYPE_INT
            elif ttype == TYPE_LIT:
                # Drop the surrounding quote characters.
                tvalue = tvalue[1:-1]
            ### fold TYPE_HEX into TYPE_INT? convert INT/DUB away from string?
            self._chomp(m.end())
            return Token(ttype, tvalue)

        raise UnknownToken(self.lineno)

    def _skip(self):
        "Skip over leading whitespace and comments."
        skipped = True
        while skipped:
            skipped = False
            for pattern in skip_re:
                m = pattern.match(self.contents)
                if m:
                    self._chomp(m.end())
                    # Something was removed; rescan from the first pattern.
                    skipped = True
                    break

    def _chomp(self, amt):
        "Chomp AMT characters off the front of the contents. Count newlines."
        head, tail = self.contents[:amt], self.contents[amt:]
        self.lineno += head.count('\n')
        self.contents = tail
| 103 | |
| 104 | |
class Scanner(SimpleScanner):
    """Scanner with one token of pushback that hides DOC tokens.

    DOC tokens are never returned by get(); instead the most recent one seen
    while fetching a token is kept in `doc` (None if there was none).
    `pending` holds a pushed-back token, if any.
    """

    def __init__(self, contents):
        SimpleScanner.__init__(self, contents)

        self.doc = None
        self.pending = None

    def get(self, eof_allowed=True):
        """Return the next non-DOC token.

        A pushed-back token is returned first; in that case `doc` is left
        untouched.  Otherwise `doc` is reset and then set to the last DOC
        token that precedes the returned token.

        At end of input, returns None when EOF_ALLOWED is true and raises
        UnexpectedEOF otherwise.
        """
        if self.pending is not None:
            token = self.pending
            self.pending = None
            return token

        self.doc = None
        while True:
            t = SimpleScanner.get(self)
            if t is None:
                if eof_allowed:
                    return None
                raise UnexpectedEOF(self.lineno)
            if t.ttype != TYPE_DOC:
                return t
            self.doc = t

    def get_type(self, ttype):
        """Get the next token, ensuring it is of the given type.

        Raises ExpectedType on a type mismatch and UnexpectedEOF at end of
        input.
        """
        t = self.get(eof_allowed=False)
        if t.ttype != ttype:
            raise ExpectedType(ttype, t.ttype, self.lineno)
        return t

    def value_of(self, ttype):
        "Get the next token's value, ensuring it is of the given type."
        return self.get_type(ttype).tvalue

    def pushback(self, token):
        "Push a token back into the scanner; it was unused."
        assert token is not None
        assert self.pending is None
        self.pending = token

    def eat_commasemi(self):
        "Eat a comma or a semicolon, if present."
        t = self.get()
        if t is None:
            # End of input: nothing to eat, and None must not be pushed back
            # (comparing it against a Token used to raise AttributeError).
            return
        if t != SYM_COMMA and t != SYM_SEMI:
            self.pushback(t)

    def eat_expected(self, token):
        """Eat the expected token, or raise a ExpectedError.

        End of input counts as a mismatch; the error then carries None as
        the token actually found.
        """
        t = self.get()
        # Test for None first so EOF is reported as ExpectedError rather
        # than failing inside the token comparison.
        if t is None or t != token:
            raise ExpectedError(token, t, self.lineno)
| 158 | |
| 159 | |
class Token(object):
    """A scanned token: a type tag (`ttype`) and an optional value (`tvalue`).

    Tokens compare equal when both type and value match, and hash
    consistently with that equality.
    """

    def __init__(self, ttype, tvalue=None):
        self.ttype = ttype
        self.tvalue = tvalue

    def __str__(self):
        if self.tvalue is None:
            return 'T(%s)' % self.ttype
        return 'T(%s, "%s")' % (self.ttype, self.tvalue)

    def __eq__(self, other):
        # Non-Token operands (notably None, which Scanner.get() returns at
        # EOF) must not raise AttributeError; let Python fall back to its
        # default comparison instead.
        if not isinstance(other, Token):
            return NotImplemented
        return self.ttype == other.ttype and self.tvalue == other.tvalue

    def __ne__(self, other):
        if not isinstance(other, Token):
            return NotImplemented
        return self.ttype != other.ttype or self.tvalue != other.tvalue

    def __hash__(self):
        return hash((self.ttype, self.tvalue))
| 178 | |
| 179 | |
# For every identifier listed, create a module-level constant named
# ID_<IDENTIFIER IN UPPER CASE> holding the matching ID token, and export it.
for ident in [
        'namespace', 'cpp_namespace', 'cpp_include', 'cpp_type',
        'java_package', 'cocoa_prefix', 'csharp_namespace', 'php_namespace',
        'py_module', 'perl_package', 'ruby_namespace',
        'smalltalk_category', 'smalltalk_prefix',
        'xsd_all', 'xsd_optional', 'xsd_nillable', 'xsd_namespace',
        'xsd_attrs',
        'include',
        'void', 'bool', 'byte', 'i16', 'i32', 'i64', 'double', 'string',
        'binary', 'slist', 'senum',
        'map', 'list', 'set',
        'async', 'typedef', 'struct', 'exception', 'extends', 'throws',
        'service', 'enum', 'const', 'required', 'optional',
]:
    name = 'ID_%s' % ident.upper()
    __all__.append(name)
    globals()[name] = Token(TYPE_ID, ident)
| 228 | |
| 229 | |
# For every (NAME, character) pair, create a module-level constant SYM_<NAME>
# holding the matching SYM token, and export it.
for name, sym in [
        ('COLON', ':'), ('SEMI', ';'), ('COMMA', ','),
        ('LBRACE', '{'), ('RBRACE', '}'),
        ('LPAREN', '('), ('RPAREN', ')'),
        ('LBRKT', '['), ('RBRKT', ']'),
        ('EQ', '='), ('LT', '<'), ('GT', '>'),
]:
    __all__.append('SYM_%s' % name)
    globals()['SYM_%s' % name] = Token(TYPE_SYM, sym)
| 245 | |
| 246 | |
class ExpectedError(Exception):
    """Expected token was not present.

    Scanner.eat_expected() raises this with the arguments
    (expected token, token actually found, line number).
    """
| 249 | |
class ExpectedType(Exception):
    """Expected token type was not present.

    Scanner.get_type() raises this with the arguments
    (expected type, type actually found, line number).
    """
| 252 | |
class UnexpectedEOF(Exception):
    """EOF reached unexpectedly.

    Scanner.get() raises this, with the current line number as its only
    argument, when the input ends and EOF is not allowed.
    """
| 255 | |
class UnknownToken(Exception):
    """Unknown token encountered.

    SimpleScanner.get() raises this, with the current line number as its
    only argument, when the input matches no token pattern.
    """
| 258 | |
class IncorrectSyntax(Exception):
    """Incorrect syntax encountered.

    Not raised by the scanner in this module; it is exported for use by
    code built on top of it.
    """
| 261 | |
| 262 | |
if __name__ == '__main__':
    import sys

    # Debugging aid: print every token of the file named by the first
    # command-line argument, one token per line.
    # Read the whole file up front and close it promptly.
    with open(sys.argv[1]) as source:
        s = Scanner(source.read())
    while True:
        token = s.get()
        if token is None:
            break
        # The parenthesized form prints the same text under Python 2 and is
        # also valid Python 3.
        print(token)