# -----------------------------------------------------------------------------
# ply: lex.py
#
# Copyright (C) 2001-2009,
# David M. Beazley (Dabeaz LLC)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * Neither the name of the David Beazley or Dabeaz LLC may be used to
#   endorse or promote products derived from this software without
#   specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------------------------

__version__ = "3.3"
__tabversion__ = "3.2"       # Version of table file used

import re, sys, types, copy, os

# This tuple contains known string types
try:
    # Python 2.6
    StringTypes = (types.StringType, types.UnicodeType)
except AttributeError:
    # Python 3.0
    StringTypes = (str, bytes)

# Extract the code attribute of a function. Different implementations
# are for Python 2/3 compatibility.

if sys.version_info[0] < 3:
    def func_code(f):
        return f.func_code
else:
    def func_code(f):
        return f.__code__

# This regular expression is used to match valid token names
_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$')

# Exception thrown when invalid token encountered and no default error
# handler is defined.

class LexError(Exception):
    def __init__(self,message,s):
        self.args = (message,)
        self.text = s

# Token class.  This class is used to represent the tokens produced.
class LexToken(object):
    def __str__(self):
        return "LexToken(%s,%r,%d,%d)" % (self.type,self.value,self.lineno,self.lexpos)

    def __repr__(self):
        return str(self)

# This object is a stand-in for a logging object created by the
# logging module.

class PlyLogger(object):
    def __init__(self,f):
        self.f = f

    def critical(self,msg,*args,**kwargs):
        self.f.write((msg % args) + "\n")

    def warning(self,msg,*args,**kwargs):
        self.f.write("WARNING: " + (msg % args) + "\n")

    def error(self,msg,*args,**kwargs):
        self.f.write("ERROR: " + (msg % args) + "\n")

    info = critical
    debug = critical

# Null logger is used when no output is generated. Does nothing.
class NullLogger(object):
    def __getattribute__(self,name):
        return self

    def __call__(self,*args,**kwargs):
        return self

# -----------------------------------------------------------------------------
# === Lexing Engine ===
#
# The following Lexer class implements the lexer runtime. There are only
# a few public methods and attributes:
#
#    input()  - Store a new string in the lexer
#    token()  - Get the next token
#    clone()  - Clone the lexer
#
#    lineno   - Current line number
#    lexpos   - Current position in the input string
# -----------------------------------------------------------------------------

class Lexer:
    def __init__(self):
        self.lexre = None               # Master regular expression. This is a list of
                                        # tuples (re,findex) where re is a compiled
                                        # regular expression and findex is a list
                                        # mapping regex group numbers to rules
        self.lexretext = None           # Current regular expression strings
        self.lexstatere = {}            # Dictionary mapping lexer states to master regexes
        self.lexstateretext = {}        # Dictionary mapping lexer states to regex strings
        self.lexstaterenames = {}       # Dictionary mapping lexer states to symbol names
        self.lexstate = "INITIAL"       # Current lexer state
        self.lexstatestack = []         # Stack of lexer states
        self.lexstateinfo = None        # State information
        self.lexstateignore = {}        # Dictionary of ignored characters for each state
        self.lexstateerrorf = {}        # Dictionary of error functions for each state
        self.lexreflags = 0             # Optional re compile flags
        self.lexdata = None             # Actual input data (as a string)
        self.lexpos = 0                 # Current position in input text
        self.lexlen = 0                 # Length of the input text
        self.lexerrorf = None           # Error rule (if any)
        self.lextokens = None           # List of valid tokens
        self.lexignore = ""             # Ignored characters
        self.lexliterals = ""           # Literal characters that can be passed through
        self.lexmodule = None           # Module
        self.lineno = 1                 # Current line number
        self.lexoptimize = 0            # Optimized mode

    def clone(self,object=None):
        c = copy.copy(self)

        # If the object parameter has been supplied, it means we are attaching the
        # lexer to a new object. In this case, we have to rebind all methods in
        # the lexstatere and lexstateerrorf tables.

        if object:
            newtab = { }
            for key, ritem in self.lexstatere.items():
                newre = []
                for cre, findex in ritem:
                    newfindex = []
                    for f in findex:
                        if not f or not f[0]:
                            newfindex.append(f)
                            continue
                        newfindex.append((getattr(object,f[0].__name__),f[1]))
                    newre.append((cre,newfindex))
                newtab[key] = newre
            c.lexstatere = newtab
            c.lexstateerrorf = { }
            for key, ef in self.lexstateerrorf.items():
                c.lexstateerrorf[key] = getattr(object,ef.__name__)
            c.lexmodule = object
        return c
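
    # ------------------------------------------------------------
    # Illustrative sketch (not part of the runtime): a cloned lexer
    # carries its own input string and position, which makes it
    # convenient for scanning a second string while the original is
    # still in use.  The names 'lexer' and 'other_data' below are
    # assumed, not defined in this module:
    #
    #     sublexer = lexer.clone()
    #     sublexer.input(other_data)
    #     tok = sublexer.token()
    # ------------------------------------------------------------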

    # ------------------------------------------------------------
    # writetab() - Write lexer information to a table file
    # ------------------------------------------------------------
    def writetab(self,tabfile,outputdir=""):
        if isinstance(tabfile,types.ModuleType):
            return
        basetabfilename = tabfile.split(".")[-1]
        filename = os.path.join(outputdir,basetabfilename)+".py"
        tf = open(filename,"w")
        tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" % (tabfile,__version__))
        tf.write("_tabversion = %s\n" % repr(__version__))
        tf.write("_lextokens = %s\n" % repr(self.lextokens))
        tf.write("_lexreflags = %s\n" % repr(self.lexreflags))
        tf.write("_lexliterals = %s\n" % repr(self.lexliterals))
        tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo))

        tabre = { }
        # Collect all functions in the initial state
        initial = self.lexstatere["INITIAL"]
        initialfuncs = []
        for part in initial:
            for f in part[1]:
                if f and f[0]:
                    initialfuncs.append(f)

        for key, lre in self.lexstatere.items():
            titem = []
            for i in range(len(lre)):
                titem.append((self.lexstateretext[key][i],_funcs_to_names(lre[i][1],self.lexstaterenames[key][i])))
            tabre[key] = titem

        tf.write("_lexstatere = %s\n" % repr(tabre))
        tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore))

        taberr = { }
        for key, ef in self.lexstateerrorf.items():
            if ef:
                taberr[key] = ef.__name__
            else:
                taberr[key] = None
        tf.write("_lexstateerrorf = %s\n" % repr(taberr))
        tf.close()

    # ------------------------------------------------------------
    # readtab() - Read lexer information from a tab file
    # ------------------------------------------------------------
    def readtab(self,tabfile,fdict):
        if isinstance(tabfile,types.ModuleType):
            lextab = tabfile
        else:
            if sys.version_info[0] < 3:
                exec("import %s as lextab" % tabfile)
            else:
                env = { }
                exec("import %s as lextab" % tabfile, env,env)
                lextab = env['lextab']

        if getattr(lextab,"_tabversion","0.0") != __version__:
            raise ImportError("Inconsistent PLY version")

        self.lextokens = lextab._lextokens
        self.lexreflags = lextab._lexreflags
        self.lexliterals = lextab._lexliterals
        self.lexstateinfo = lextab._lexstateinfo
        self.lexstateignore = lextab._lexstateignore
        self.lexstatere = { }
        self.lexstateretext = { }
        for key,lre in lextab._lexstatere.items():
            titem = []
            txtitem = []
            for i in range(len(lre)):
                titem.append((re.compile(lre[i][0],lextab._lexreflags | re.VERBOSE),_names_to_funcs(lre[i][1],fdict)))
                txtitem.append(lre[i][0])
            self.lexstatere[key] = titem
            self.lexstateretext[key] = txtitem
        self.lexstateerrorf = { }
        for key,ef in lextab._lexstateerrorf.items():
            self.lexstateerrorf[key] = fdict[ef]
        self.begin('INITIAL')

    # ------------------------------------------------------------
    # input() - Push a new string into the lexer
    # ------------------------------------------------------------
    def input(self,s):
        # Pull off the first character to see if s looks like a string
        c = s[:1]
        if not isinstance(c,StringTypes):
            raise ValueError("Expected a string")
        self.lexdata = s
        self.lexpos = 0
        self.lexlen = len(s)

    # ------------------------------------------------------------
    # begin() - Changes the lexing state
    # ------------------------------------------------------------
    def begin(self,state):
        if not state in self.lexstatere:
            raise ValueError("Undefined state")
        self.lexre = self.lexstatere[state]
        self.lexretext = self.lexstateretext[state]
        self.lexignore = self.lexstateignore.get(state,"")
        self.lexerrorf = self.lexstateerrorf.get(state,None)
        self.lexstate = state
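
    # ------------------------------------------------------------
    # Illustrative sketch (not part of the runtime): state changes
    # are usually requested from inside a rule function through the
    # 'lexer' attribute that token() attaches to each token.  The
    # rule and state names below are hypothetical:
    #
    #     def t_COMMENT(t):
    #         r'/\*'
    #         t.lexer.begin('comment')    # or t.lexer.push_state('comment')
    # ------------------------------------------------------------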

    # ------------------------------------------------------------
    # push_state() - Changes the lexing state and saves old on stack
    # ------------------------------------------------------------
    def push_state(self,state):
        self.lexstatestack.append(self.lexstate)
        self.begin(state)

    # ------------------------------------------------------------
    # pop_state() - Restores the previous state
    # ------------------------------------------------------------
    def pop_state(self):
        self.begin(self.lexstatestack.pop())

    # ------------------------------------------------------------
    # current_state() - Returns the current lexing state
    # ------------------------------------------------------------
    def current_state(self):
        return self.lexstate

    # ------------------------------------------------------------
    # skip() - Skip ahead n characters
    # ------------------------------------------------------------
    def skip(self,n):
        self.lexpos += n
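
    # ------------------------------------------------------------
    # Illustrative sketch: skip() is typically called from a t_error
    # rule to discard the offending character and resume scanning
    # (otherwise token() raises LexError when the position does not
    # advance):
    #
    #     def t_error(t):
    #         print("Illegal character %r" % t.value[0])
    #         t.lexer.skip(1)
    # ------------------------------------------------------------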

    # ------------------------------------------------------------
    # token() - Return the next token from the Lexer
    #
    # Note: This function has been carefully implemented to be as fast
    # as possible. Don't make changes unless you really know what
    # you are doing
    # ------------------------------------------------------------
    def token(self):
        # Make local copies of frequently referenced attributes
        lexpos = self.lexpos
        lexlen = self.lexlen
        lexignore = self.lexignore
        lexdata = self.lexdata

        while lexpos < lexlen:
            # This code provides some short-circuit code for whitespace, tabs, and other ignored characters
            if lexdata[lexpos] in lexignore:
                lexpos += 1
                continue

            # Look for a regular expression match
            for lexre,lexindexfunc in self.lexre:
                m = lexre.match(lexdata,lexpos)
                if not m: continue

                # Create a token for return
                tok = LexToken()
                tok.value = m.group()
                tok.lineno = self.lineno
                tok.lexpos = lexpos

                i = m.lastindex
                func,tok.type = lexindexfunc[i]

                if not func:
                    # If no token type was set, it's an ignored token
                    if tok.type:
                        self.lexpos = m.end()
                        return tok
                    else:
                        lexpos = m.end()
                        break

                lexpos = m.end()

                # If token is processed by a function, call it

                tok.lexer = self      # Set additional attributes useful in token rules
                self.lexmatch = m
                self.lexpos = lexpos

                newtok = func(tok)

                # Every function must return a token; if it returns nothing, we just
                # move on to the next token
                if not newtok:
                    lexpos = self.lexpos        # This is here in case user has updated lexpos.
                    lexignore = self.lexignore  # This is here in case there was a state change
                    break

                # Verify type of the token. If not in the token map, raise an error
                if not self.lexoptimize:
                    if not newtok.type in self.lextokens:
                        raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
                            func_code(func).co_filename, func_code(func).co_firstlineno,
                            func.__name__, newtok.type),lexdata[lexpos:])

                return newtok
            else:
                # No match, see if in literals
                if lexdata[lexpos] in self.lexliterals:
                    tok = LexToken()
                    tok.value = lexdata[lexpos]
                    tok.lineno = self.lineno
                    tok.type = tok.value
                    tok.lexpos = lexpos
                    self.lexpos = lexpos + 1
                    return tok

                # No match. Call t_error() if defined.
                if self.lexerrorf:
                    tok = LexToken()
                    tok.value = self.lexdata[lexpos:]
                    tok.lineno = self.lineno
                    tok.type = "error"
                    tok.lexer = self
                    tok.lexpos = lexpos
                    self.lexpos = lexpos
                    newtok = self.lexerrorf(tok)
                    if lexpos == self.lexpos:
                        # Error method didn't change text position at all. This is an error.
                        raise LexError("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
                    lexpos = self.lexpos
                    if not newtok: continue
                    return newtok

                self.lexpos = lexpos
                raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos],lexpos), lexdata[lexpos:])

        self.lexpos = lexpos + 1
        if self.lexdata is None:
            raise RuntimeError("No input string given with input()")
        return None

    # Iterator interface
    def __iter__(self):
        return self

    def next(self):
        t = self.token()
        if t is None:
            raise StopIteration
        return t

    __next__ = next

# -----------------------------------------------------------------------------
# === Lex Builder ===
#
# The functions and classes below are used to collect lexing information
# and build a Lexer object from it.
# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
# get_caller_module_dict()
#
# This function returns a dictionary containing all of the symbols defined within
# a caller further down the call stack. This is used to get the environment
# associated with the lex() call if none was provided.
# -----------------------------------------------------------------------------

def get_caller_module_dict(levels):
    try:
        raise RuntimeError
    except RuntimeError:
        e,b,t = sys.exc_info()
        f = t.tb_frame
        while levels > 0:
            f = f.f_back
            levels -= 1
        ldict = f.f_globals.copy()
        if f.f_globals != f.f_locals:
            ldict.update(f.f_locals)

        return ldict

# -----------------------------------------------------------------------------
# _funcs_to_names()
#
# Given a list of regular expression functions, this converts it to a list
# suitable for output to a table file
# -----------------------------------------------------------------------------

def _funcs_to_names(funclist,namelist):
    result = []
    for f,name in zip(funclist,namelist):
        if f and f[0]:
            result.append((name, f[1]))
        else:
            result.append(f)
    return result

# -----------------------------------------------------------------------------
# _names_to_funcs()
#
# Given a list of regular expression function names, this converts it back to
# functions.
# -----------------------------------------------------------------------------

def _names_to_funcs(namelist,fdict):
    result = []
    for n in namelist:
        if n and n[0]:
            result.append((fdict[n[0]],n[1]))
        else:
            result.append(n)
    return result

# -----------------------------------------------------------------------------
# _form_master_re()
#
# This function takes a list of all of the regex components and attempts to
# form the master regular expression. Given limitations in the Python re
# module, it may be necessary to break the master regex into separate expressions.
# -----------------------------------------------------------------------------

def _form_master_re(relist,reflags,ldict,toknames):
    if not relist: return []
    regex = "|".join(relist)
    try:
        lexre = re.compile(regex,re.VERBOSE | reflags)

        # Build the index to function map for the matching engine
        lexindexfunc = [ None ] * (max(lexre.groupindex.values())+1)
        lexindexnames = lexindexfunc[:]

        for f,i in lexre.groupindex.items():
            handle = ldict.get(f,None)
            if type(handle) in (types.FunctionType, types.MethodType):
                lexindexfunc[i] = (handle,toknames[f])
                lexindexnames[i] = f
            elif handle is not None:
                lexindexnames[i] = f
                if f.find("ignore_") > 0:
                    lexindexfunc[i] = (None,None)
                else:
                    lexindexfunc[i] = (None, toknames[f])

        return [(lexre,lexindexfunc)],[regex],[lexindexnames]
    except Exception:
        m = int(len(relist)/2)
        if m == 0: m = 1
        llist, lre, lnames = _form_master_re(relist[:m],reflags,ldict,toknames)
        rlist, rre, rnames = _form_master_re(relist[m:],reflags,ldict,toknames)
        return llist+rlist, lre+rre, lnames+rnames

# -----------------------------------------------------------------------------
# def _statetoken(s,names)
#
# Given a declaration name s of the form "t_" and a dictionary whose keys are
# state names, this function returns a tuple (states,tokenname) where states
# is a tuple of state names and tokenname is the name of the token. For example,
# calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM')
# -----------------------------------------------------------------------------

def _statetoken(s,names):
    nonstate = 1
    parts = s.split("_")
    for i in range(1,len(parts)):
        if not parts[i] in names and parts[i] != 'ANY': break
    if i > 1:
        states = tuple(parts[1:i])
    else:
        states = ('INITIAL',)

    if 'ANY' in states:
        states = tuple(names)

    tokenname = "_".join(parts[i:])
    return (states,tokenname)


# -----------------------------------------------------------------------------
# LexerReflect()
#
# This class represents information needed to build a lexer as extracted from a
# user's input file.
# -----------------------------------------------------------------------------
class LexerReflect(object):
    def __init__(self,ldict,log=None,reflags=0):
        self.ldict = ldict
        self.error_func = None
        self.tokens = []
        self.reflags = reflags
        self.stateinfo = { 'INITIAL' : 'inclusive'}
        self.files = {}
        self.error = 0

        if log is None:
            self.log = PlyLogger(sys.stderr)
        else:
            self.log = log

    # Get all of the basic information
    def get_all(self):
        self.get_tokens()
        self.get_literals()
        self.get_states()
        self.get_rules()

    # Validate all of the information
    def validate_all(self):
        self.validate_tokens()
        self.validate_literals()
        self.validate_rules()
        return self.error

    # Get the tokens list
    def get_tokens(self):
        tokens = self.ldict.get("tokens",None)
        if not tokens:
            self.log.error("No token list is defined")
            self.error = 1
            return

        if not isinstance(tokens,(list, tuple)):
            self.log.error("tokens must be a list or tuple")
            self.error = 1
            return

        if not tokens:
            self.log.error("tokens is empty")
            self.error = 1
            return

        self.tokens = tokens

    # Validate the tokens
    def validate_tokens(self):
        terminals = {}
        for n in self.tokens:
            if not _is_identifier.match(n):
                self.log.error("Bad token name '%s'",n)
                self.error = 1
            if n in terminals:
                self.log.warning("Token '%s' multiply defined", n)
            terminals[n] = 1

    # Get the literals specifier
    def get_literals(self):
        self.literals = self.ldict.get("literals","")

    # Validate literals
    def validate_literals(self):
        try:
            for c in self.literals:
                if not isinstance(c,StringTypes) or len(c) > 1:
                    self.log.error("Invalid literal %s. Must be a single character", repr(c))
                    self.error = 1
                    continue

        except TypeError:
            self.log.error("Invalid literals specification. literals must be a sequence of characters")
            self.error = 1
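
    # For reference, the kind of 'states' declaration validated by
    # get_states() below looks like this in a specification module
    # (the state names are illustrative only):
    #
    #     states = (
    #         ('comment',  'exclusive'),
    #         ('indented', 'inclusive'),
    #     )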

    def get_states(self):
        self.states = self.ldict.get("states",None)
        # Build statemap
        if self.states:
            if not isinstance(self.states,(tuple,list)):
                self.log.error("states must be defined as a tuple or list")
                self.error = 1
            else:
                for s in self.states:
                    if not isinstance(s,tuple) or len(s) != 2:
                        self.log.error("Invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')",repr(s))
                        self.error = 1
                        continue
                    name, statetype = s
                    if not isinstance(name,StringTypes):
                        self.log.error("State name %s must be a string", repr(name))
                        self.error = 1
                        continue
                    if not (statetype == 'inclusive' or statetype == 'exclusive'):
                        self.log.error("State type for state %s must be 'inclusive' or 'exclusive'",name)
                        self.error = 1
                        continue
                    if name in self.stateinfo:
                        self.log.error("State '%s' already defined",name)
                        self.error = 1
                        continue
                    self.stateinfo[name] = statetype

    # Get all of the symbols with a t_ prefix and sort them into various
    # categories (functions, strings, error functions, and ignore characters)

    def get_rules(self):
        tsymbols = [f for f in self.ldict if f[:2] == 't_' ]

        # Now build up a list of functions and a list of strings

        self.toknames = { }     # Mapping of symbols to token names
        self.funcsym = { }      # Symbols defined as functions
        self.strsym = { }       # Symbols defined as strings
        self.ignore = { }       # Ignore strings by state
        self.errorf = { }       # Error functions by state

        for s in self.stateinfo:
            self.funcsym[s] = []
            self.strsym[s] = []

        if len(tsymbols) == 0:
            self.log.error("No rules of the form t_rulename are defined")
            self.error = 1
            return

        for f in tsymbols:
            t = self.ldict[f]
            states, tokname = _statetoken(f,self.stateinfo)
            self.toknames[f] = tokname

            if hasattr(t,"__call__"):
                if tokname == 'error':
                    for s in states:
                        self.errorf[s] = t
                elif tokname == 'ignore':
                    line = func_code(t).co_firstlineno
                    file = func_code(t).co_filename
                    self.log.error("%s:%d: Rule '%s' must be defined as a string",file,line,t.__name__)
                    self.error = 1
                else:
                    for s in states:
                        self.funcsym[s].append((f,t))
            elif isinstance(t, StringTypes):
                if tokname == 'ignore':
                    for s in states:
                        self.ignore[s] = t
                    if "\\" in t:
                        self.log.warning("%s contains a literal backslash '\\'",f)

                elif tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", f)
                    self.error = 1
                else:
                    for s in states:
                        self.strsym[s].append((f,t))
            else:
                self.log.error("%s not defined as a function or string", f)
                self.error = 1

        # Sort the functions by line number
        for f in self.funcsym.values():
            f.sort(key=lambda x: func_code(x[1]).co_firstlineno)

        # Sort the strings by regular expression length
        for s in self.strsym.values():
            if sys.version_info[0] < 3:
                s.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1])))
            else:
                # Python 3.0
                s.sort(key=lambda x: len(x[1]),reverse=True)
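
    # For reference, these are the declaration forms that get_rules()
    # sorts into the tables above (the names are illustrative, not taken
    # from any particular grammar):
    #
    #     t_PLUS = r'\+'             # string rule        -> strsym
    #
    #     def t_NUMBER(t):           # function rule      -> funcsym
    #         r'\d+'                 # (regex taken from the docstring)
    #         t.value = int(t.value)
    #         return t
    #
    #     t_ignore = ' \t'           # ignored characters -> ignore
    #
    #     def t_error(t):            # error handler      -> errorf
    #         t.lexer.skip(1)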

    # Validate all of the t_rules collected
    def validate_rules(self):
        for state in self.stateinfo:
            # Validate all rules defined by functions

            for fname, f in self.funcsym[state]:
                line = func_code(f).co_firstlineno
                file = func_code(f).co_filename
                self.files[file] = 1

                tokname = self.toknames[fname]
                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = func_code(f).co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments",file,line,f.__name__)
                    self.error = 1
                    continue

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file,line,f.__name__)
                    self.error = 1
                    continue

                if not f.__doc__:
                    self.log.error("%s:%d: No regular expression defined for rule '%s'",file,line,f.__name__)
                    self.error = 1
                    continue

                try:
                    c = re.compile("(?P<%s>%s)" % (fname,f.__doc__), re.VERBOSE | self.reflags)
                    if c.match(""):
                        self.log.error("%s:%d: Regular expression for rule '%s' matches empty string", file,line,f.__name__)
                        self.error = 1
                except re.error:
                    _etype, e, _etrace = sys.exc_info()
                    self.log.error("%s:%d: Invalid regular expression for rule '%s'. %s", file,line,f.__name__,e)
                    if '#' in f.__doc__:
                        self.log.error("%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'",file,line, f.__name__)
                    self.error = 1

            # Validate all rules defined by strings
            for name,r in self.strsym[state]:
                tokname = self.toknames[name]
                if tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", name)
                    self.error = 1
                    continue

                if not tokname in self.tokens and tokname.find("ignore_") < 0:
                    self.log.error("Rule '%s' defined for an unspecified token %s",name,tokname)
                    self.error = 1
                    continue

                try:
                    c = re.compile("(?P<%s>%s)" % (name,r),re.VERBOSE | self.reflags)
                    if (c.match("")):
                        self.log.error("Regular expression for rule '%s' matches empty string",name)
                        self.error = 1
                except re.error:
                    _etype, e, _etrace = sys.exc_info()
                    self.log.error("Invalid regular expression for rule '%s'. %s",name,e)
                    if '#' in r:
                        self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'",name)
                    self.error = 1

            if not self.funcsym[state] and not self.strsym[state]:
                self.log.error("No rules defined for state '%s'",state)
                self.error = 1

            # Validate the error function
            efunc = self.errorf.get(state,None)
            if efunc:
                f = efunc
                line = func_code(f).co_firstlineno
                file = func_code(f).co_filename
                self.files[file] = 1

                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = func_code(f).co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments",file,line,f.__name__)
                    self.error = 1

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file,line,f.__name__)
                    self.error = 1

        for f in self.files:
            self.validate_file(f)

    # -----------------------------------------------------------------------------
    # validate_file()
    #
    # This checks to see if there are duplicated t_rulename() functions or strings
    # in the lexer input file. This is done using a simple regular expression
    # match on each line in the given file.
    # -----------------------------------------------------------------------------

    def validate_file(self,filename):
        import os.path
        base,ext = os.path.splitext(filename)
        if ext != '.py': return       # No idea what the file is. Return OK

        try:
            f = open(filename)
            lines = f.readlines()
            f.close()
        except IOError:
            return                    # Couldn't find the file. Don't worry about it

        fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
        sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')

        counthash = { }
        linen = 1
        for l in lines:
            m = fre.match(l)
            if not m:
                m = sre.match(l)
            if m:
                name = m.group(1)
                prev = counthash.get(name)
                if not prev:
                    counthash[name] = linen
                else:
                    self.log.error("%s:%d: Rule %s redefined. Previously defined on line %d",filename,linen,name,prev)
                    self.error = 1
            linen += 1

# -----------------------------------------------------------------------------
# lex(module)
#
# Build all of the regular expression rules from definitions in the supplied module
# -----------------------------------------------------------------------------
def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0,nowarn=0,outputdir="", debuglog=None, errorlog=None):
    global lexer
    ldict = None
    stateinfo = { 'INITIAL' : 'inclusive'}
    lexobj = Lexer()
    lexobj.lexoptimize = optimize
    global token,input

    if errorlog is None:
        errorlog = PlyLogger(sys.stderr)

    if debug:
        if debuglog is None:
            debuglog = PlyLogger(sys.stderr)

    # Get the module dictionary used for the lexer
    if object: module = object

    if module:
        _items = [(k,getattr(module,k)) for k in dir(module)]
        ldict = dict(_items)
    else:
        ldict = get_caller_module_dict(2)

    # Collect lexer information from the dictionary
    linfo = LexerReflect(ldict,log=errorlog,reflags=reflags)
    linfo.get_all()
    if not optimize:
        if linfo.validate_all():
            raise SyntaxError("Can't build lexer")

    if optimize and lextab:
        try:
            lexobj.readtab(lextab,ldict)
            token = lexobj.token
            input = lexobj.input
            lexer = lexobj
            return lexobj

        except ImportError:
            pass

    # Dump some basic debugging information
    if debug:
        debuglog.info("lex: tokens = %r", linfo.tokens)
        debuglog.info("lex: literals = %r", linfo.literals)
        debuglog.info("lex: states = %r", linfo.stateinfo)

    # Build a dictionary of valid token names
    lexobj.lextokens = { }
    for n in linfo.tokens:
        lexobj.lextokens[n] = 1

    # Get literals specification
    if isinstance(linfo.literals,(list,tuple)):
        lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals)
    else:
        lexobj.lexliterals = linfo.literals

    # Get the stateinfo dictionary
    stateinfo = linfo.stateinfo

    regexs = { }
    # Build the master regular expressions
    for state in stateinfo:
        regex_list = []

        # Add rules defined by functions first
        for fname, f in linfo.funcsym[state]:
            line = func_code(f).co_firstlineno
            file = func_code(f).co_filename
            regex_list.append("(?P<%s>%s)" % (fname,f.__doc__))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')",fname,f.__doc__, state)

        # Now add all of the simple rules
        for name,r in linfo.strsym[state]:
            regex_list.append("(?P<%s>%s)" % (name,r))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')",name,r, state)

        regexs[state] = regex_list

    # Build the master regular expressions

    if debug:
        debuglog.info("lex: ==== MASTER REGEXS FOLLOW ====")

    for state in regexs:
        lexre, re_text, re_names = _form_master_re(regexs[state],reflags,ldict,linfo.toknames)
        lexobj.lexstatere[state] = lexre
        lexobj.lexstateretext[state] = re_text
        lexobj.lexstaterenames[state] = re_names
        if debug:
            for i in range(len(re_text)):
                debuglog.info("lex: state '%s' : regex[%d] = '%s'",state, i, re_text[i])

    # For inclusive states, we need to add the regular expressions from the INITIAL state
    for state,stype in stateinfo.items():
        if state != "INITIAL" and stype == 'inclusive':
            lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
            lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])
            lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIAL'])

    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere["INITIAL"]
    lexobj.lexretext = lexobj.lexstateretext["INITIAL"]
    lexobj.lexreflags = reflags

    # Set up ignore variables
    lexobj.lexstateignore = linfo.ignore
    lexobj.lexignore = lexobj.lexstateignore.get("INITIAL","")

    # Set up error functions
    lexobj.lexstateerrorf = linfo.errorf
    lexobj.lexerrorf = linfo.errorf.get("INITIAL",None)
    if not lexobj.lexerrorf:
        errorlog.warning("No t_error rule is defined")

    # Check state information for ignore and error rules
    for s,stype in stateinfo.items():
        if stype == 'exclusive':
            if not s in linfo.errorf:
                errorlog.warning("No error rule is defined for exclusive state '%s'", s)
            if not s in linfo.ignore and lexobj.lexignore:
                errorlog.warning("No ignore rule is defined for exclusive state '%s'", s)
        elif stype == 'inclusive':
            if not s in linfo.errorf:
                linfo.errorf[s] = linfo.errorf.get("INITIAL",None)
            if not s in linfo.ignore:
                linfo.ignore[s] = linfo.ignore.get("INITIAL","")

    # Create global versions of the token() and input() functions
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # If in optimize mode, we write the lextab
    if lextab and optimize:
        lexobj.writetab(lextab,outputdir)

    return lexobj

# -----------------------------------------------------------------------------
# runmain()
#
# This runs the lexer as a main program
# -----------------------------------------------------------------------------

def runmain(lexer=None,data=None):
    if not data:
        try:
            filename = sys.argv[1]
            f = open(filename)
            data = f.read()
            f.close()
        except IndexError:
            sys.stdout.write("Reading from standard input (type EOF to end):\n")
            data = sys.stdin.read()

    if lexer:
        _input = lexer.input
    else:
        _input = input
    _input(data)
    if lexer:
        _token = lexer.token
    else:
        _token = token

    while 1:
        tok = _token()
        if not tok: break
        sys.stdout.write("(%s,%r,%d,%d)\n" % (tok.type, tok.value, tok.lineno,tok.lexpos))

# -----------------------------------------------------------------------------
# @TOKEN(regex)
#
# This decorator function can be used to set the regular expression on a function
# when its docstring needs to be set in some other way
# -----------------------------------------------------------------------------

def TOKEN(r):
    def set_doc(f):
        if hasattr(r,"__call__"):
            f.__doc__ = r.__doc__
        else:
            f.__doc__ = r
        return f
    return set_doc

# Alternative spelling of the TOKEN decorator
Token = TOKEN
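
# -----------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not executed here).  A lexer
# specification module defines 'tokens', optional 'literals' and 'states',
# and a set of t_ rules, then builds and drives the lexer roughly like this
# (the token names and input string below are made up):
#
#     import ply.lex as lex
#
#     tokens = ('NUMBER','PLUS')
#
#     t_PLUS   = r'\+'
#     t_ignore = ' \t'
#
#     def t_NUMBER(t):
#         r'\d+'
#         t.value = int(t.value)
#         return t
#
#     def t_error(t):
#         t.lexer.skip(1)
#
#     lexer = lex.lex()
#     lexer.input("3 + 4")
#     for tok in lexer:
#         print(tok.type, tok.value, tok.lineno, tok.lexpos)
# -----------------------------------------------------------------------------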