# -----------------------------------------------------------------------------
# ply: lex.py
#
# Copyright (C) 2001-2009,
# David M. Beazley (Dabeaz LLC)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * Neither the name of the David Beazley or Dabeaz LLC may be used to
#   endorse or promote products derived from this software without
#   specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------------------------

__version__ = "3.3"
__tabversion__ = "3.2"       # Version of table file used

import re, sys, types, copy, os

# This tuple contains known string types
try:
    # Python 2.6
    StringTypes = (types.StringType, types.UnicodeType)
except AttributeError:
    # Python 3.0
    StringTypes = (str, bytes)

# Extract the code attribute of a function. Different implementations
# are for Python 2/3 compatibility.

if sys.version_info[0] < 3:
    def func_code(f):
        return f.func_code
else:
    def func_code(f):
        return f.__code__

# This regular expression is used to match valid token names
_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$')

# Exception thrown when an invalid token is encountered and no default error
# handler is defined.

class LexError(Exception):
    def __init__(self,message,s):
        self.args = (message,)
        self.text = s

# Token class.  This class is used to represent the tokens produced.
class LexToken(object):
    def __str__(self):
        return "LexToken(%s,%r,%d,%d)" % (self.type,self.value,self.lineno,self.lexpos)
    def __repr__(self):
        return str(self)

# This object is a stand-in for a logging object created by the
# logging module.

class PlyLogger(object):
    def __init__(self,f):
        self.f = f
    def critical(self,msg,*args,**kwargs):
        self.f.write((msg % args) + "\n")

    def warning(self,msg,*args,**kwargs):
        self.f.write("WARNING: " + (msg % args) + "\n")

    def error(self,msg,*args,**kwargs):
        self.f.write("ERROR: " + (msg % args) + "\n")

    info = critical
    debug = critical

# Null logger is used when no output is generated. Does nothing.
class NullLogger(object):
    def __getattribute__(self,name):
        return self
    def __call__(self,*args,**kwargs):
        return self

# -----------------------------------------------------------------------------
# === Lexing Engine ===
#
# The following Lexer class implements the lexer runtime.  There are only
# a few public methods and attributes:
#
#    input()  -  Store a new string in the lexer
#    token()  -  Get the next token
#    clone()  -  Clone the lexer
#
#    lineno   -  Current line number
#    lexpos   -  Current position in the input string
# -----------------------------------------------------------------------------
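
# A minimal usage sketch of this public interface (not part of the original
# file; the token names and input text are hypothetical). Given a module that
# defines a `tokens` list and `t_...` rules, and `import ply.lex as lex`:
#
#     lexer = lex.lex()                  # build a lexer from the caller's rules
#     lexer.input("x = 3 + 42")
#     for tok in lexer:                  # uses the iterator interface below
#         print(tok.type, tok.value, tok.lineno, tok.lexpos)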

class Lexer:
    def __init__(self):
        self.lexre = None           # Master regular expression. This is a list of
                                    # tuples (re,findex) where re is a compiled
                                    # regular expression and findex is a list
                                    # mapping regex group numbers to rules
        self.lexretext = None       # Current regular expression strings
        self.lexstatere = {}        # Dictionary mapping lexer states to master regexs
        self.lexstateretext = {}    # Dictionary mapping lexer states to regex strings
        self.lexstaterenames = {}   # Dictionary mapping lexer states to symbol names
        self.lexstate = "INITIAL"   # Current lexer state
        self.lexstatestack = []     # Stack of lexer states
        self.lexstateinfo = None    # State information
        self.lexstateignore = {}    # Dictionary of ignored characters for each state
        self.lexstateerrorf = {}    # Dictionary of error functions for each state
        self.lexreflags = 0         # Optional re compile flags
        self.lexdata = None         # Actual input data (as a string)
        self.lexpos = 0             # Current position in input text
        self.lexlen = 0             # Length of the input text
        self.lexerrorf = None       # Error rule (if any)
        self.lextokens = None       # List of valid tokens
        self.lexignore = ""         # Ignored characters
        self.lexliterals = ""       # Literal characters that can be passed through
        self.lexmodule = None       # Module
        self.lineno = 1             # Current line number
        self.lexoptimize = 0        # Optimized mode

    def clone(self,object=None):
        c = copy.copy(self)

        # If the object parameter has been supplied, it means we are attaching the
        # lexer to a new object.  In this case, we have to rebind all methods in
        # the lexstatere and lexstateerrorf tables.

        if object:
            newtab = { }
            for key, ritem in self.lexstatere.items():
                newre = []
                for cre, findex in ritem:
                    newfindex = []
                    for f in findex:
                        if not f or not f[0]:
                            newfindex.append(f)
                            continue
                        newfindex.append((getattr(object,f[0].__name__),f[1]))
                    newre.append((cre,newfindex))
                newtab[key] = newre
            c.lexstatere = newtab
            c.lexstateerrorf = { }
            for key, ef in self.lexstateerrorf.items():
                c.lexstateerrorf[key] = getattr(object,ef.__name__)
            c.lexmodule = object
        return c

    # ------------------------------------------------------------
    # writetab() - Write lexer information to a table file
    # ------------------------------------------------------------
    def writetab(self,tabfile,outputdir=""):
        if isinstance(tabfile,types.ModuleType):
            return
        basetabfilename = tabfile.split(".")[-1]
        filename = os.path.join(outputdir,basetabfilename)+".py"
        tf = open(filename,"w")
        tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" % (tabfile,__version__))
        tf.write("_tabversion = %s\n" % repr(__version__))
        tf.write("_lextokens = %s\n" % repr(self.lextokens))
        tf.write("_lexreflags = %s\n" % repr(self.lexreflags))
        tf.write("_lexliterals = %s\n" % repr(self.lexliterals))
        tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo))

        tabre = { }
        # Collect all functions in the initial state
        initial = self.lexstatere["INITIAL"]
        initialfuncs = []
        for part in initial:
            for f in part[1]:
                if f and f[0]:
                    initialfuncs.append(f)

        for key, lre in self.lexstatere.items():
            titem = []
            for i in range(len(lre)):
                titem.append((self.lexstateretext[key][i],_funcs_to_names(lre[i][1],self.lexstaterenames[key][i])))
            tabre[key] = titem

        tf.write("_lexstatere = %s\n" % repr(tabre))
        tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore))

        taberr = { }
        for key, ef in self.lexstateerrorf.items():
            if ef:
                taberr[key] = ef.__name__
            else:
                taberr[key] = None
        tf.write("_lexstateerrorf = %s\n" % repr(taberr))
        tf.close()
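
    # For reference (not in the original file): the table file written above is
    # a plain Python module of assignments, along these lines (token names and
    # values abbreviated and hypothetical):
    #
    #     _tabversion = '3.3'
    #     _lextokens = {'NUMBER': 1, 'ID': 1}
    #     _lexreflags = 0
    #     _lexliterals = '+-*/'
    #     _lexstateinfo = {'INITIAL': 'inclusive'}
    #     _lexstatere = {'INITIAL': [('(?P<t_NUMBER>\\d+)|(?P<t_ID>[a-zA-Z_]\\w*)',
    #                                 [None, ('t_NUMBER', 'NUMBER'), (None, 'ID')])]}
    #     _lexstateignore = {'INITIAL': ' \t'}
    #     _lexstateerrorf = {'INITIAL': 't_error'}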

    # ------------------------------------------------------------
    # readtab() - Read lexer information from a tab file
    # ------------------------------------------------------------
    def readtab(self,tabfile,fdict):
        if isinstance(tabfile,types.ModuleType):
            lextab = tabfile
        else:
            if sys.version_info[0] < 3:
                exec("import %s as lextab" % tabfile)
            else:
                env = { }
                exec("import %s as lextab" % tabfile, env,env)
                lextab = env['lextab']

        if getattr(lextab,"_tabversion","0.0") != __version__:
            raise ImportError("Inconsistent PLY version")

        self.lextokens = lextab._lextokens
        self.lexreflags = lextab._lexreflags
        self.lexliterals = lextab._lexliterals
        self.lexstateinfo = lextab._lexstateinfo
        self.lexstateignore = lextab._lexstateignore
        self.lexstatere = { }
        self.lexstateretext = { }
        for key,lre in lextab._lexstatere.items():
            titem = []
            txtitem = []
            for i in range(len(lre)):
                titem.append((re.compile(lre[i][0],lextab._lexreflags | re.VERBOSE),_names_to_funcs(lre[i][1],fdict)))
                txtitem.append(lre[i][0])
            self.lexstatere[key] = titem
            self.lexstateretext[key] = txtitem
        self.lexstateerrorf = { }
        for key,ef in lextab._lexstateerrorf.items():
            self.lexstateerrorf[key] = fdict[ef]
        self.begin('INITIAL')

    # ------------------------------------------------------------
    # input() - Push a new string into the lexer
    # ------------------------------------------------------------
    def input(self,s):
        # Pull off the first character to see if s looks like a string
        c = s[:1]
        if not isinstance(c,StringTypes):
            raise ValueError("Expected a string")
        self.lexdata = s
        self.lexpos = 0
        self.lexlen = len(s)

    # ------------------------------------------------------------
    # begin() - Changes the lexing state
    # ------------------------------------------------------------
    def begin(self,state):
        if not state in self.lexstatere:
            raise ValueError("Undefined state")
        self.lexre = self.lexstatere[state]
        self.lexretext = self.lexstateretext[state]
        self.lexignore = self.lexstateignore.get(state,"")
        self.lexerrorf = self.lexstateerrorf.get(state,None)
        self.lexstate = state

    # ------------------------------------------------------------
    # push_state() - Changes the lexing state and saves old on stack
    # ------------------------------------------------------------
    def push_state(self,state):
        self.lexstatestack.append(self.lexstate)
        self.begin(state)

    # ------------------------------------------------------------
    # pop_state() - Restores the previous state
    # ------------------------------------------------------------
    def pop_state(self):
        self.begin(self.lexstatestack.pop())

    # ------------------------------------------------------------
    # current_state() - Returns the current lexing state
    # ------------------------------------------------------------
    def current_state(self):
        return self.lexstate

    # ------------------------------------------------------------
    # skip() - Skip ahead n characters
    # ------------------------------------------------------------
    def skip(self,n):
        self.lexpos += n
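
    # A usage sketch for the state machinery above (not part of the original
    # file; the 'ccode' state and rules are hypothetical). A rule module
    # declares its extra states and switches between them from rule functions:
    #
    #     states = (('ccode', 'exclusive'),)
    #
    #     def t_LBRACE(t):
    #         r'\{'
    #         t.lexer.push_state('ccode')   # enter 'ccode'; old state saved on the stack
    #
    #     def t_ccode_RBRACE(t):
    #         r'\}'
    #         t.lexer.pop_state()           # return to the previous state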

    # ------------------------------------------------------------
    # token() - Return the next token from the Lexer
    #
    # Note: This function has been carefully implemented to be as fast
    # as possible.  Don't make changes unless you really know what
    # you are doing
    # ------------------------------------------------------------
    def token(self):
        # Make local copies of frequently referenced attributes
        lexpos = self.lexpos
        lexlen = self.lexlen
        lexignore = self.lexignore
        lexdata = self.lexdata

        while lexpos < lexlen:
            # This code provides some short-circuit code for whitespace, tabs, and other ignored characters
            if lexdata[lexpos] in lexignore:
                lexpos += 1
                continue

            # Look for a regular expression match
            for lexre,lexindexfunc in self.lexre:
                m = lexre.match(lexdata,lexpos)
                if not m: continue

                # Create a token for return
                tok = LexToken()
                tok.value = m.group()
                tok.lineno = self.lineno
                tok.lexpos = lexpos

                i = m.lastindex
                func,tok.type = lexindexfunc[i]

                if not func:
                    # If no token type was set, it's an ignored token
                    if tok.type:
                        self.lexpos = m.end()
                        return tok
                    else:
                        lexpos = m.end()
                        break

                lexpos = m.end()

                # If token is processed by a function, call it

                tok.lexer = self      # Set additional attributes useful in token rules
                self.lexmatch = m
                self.lexpos = lexpos

                newtok = func(tok)

                # Every function must return a token; if it returns nothing, we just move on to the next token
                if not newtok:
                    lexpos = self.lexpos        # This is here in case user has updated lexpos.
                    lexignore = self.lexignore  # This is here in case there was a state change
                    break

                # Verify type of the token.  If not in the token map, raise an error
                if not self.lexoptimize:
                    if not newtok.type in self.lextokens:
                        raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
                            func_code(func).co_filename, func_code(func).co_firstlineno,
                            func.__name__, newtok.type),lexdata[lexpos:])

                return newtok
            else:
                # No match, see if in literals
                if lexdata[lexpos] in self.lexliterals:
                    tok = LexToken()
                    tok.value = lexdata[lexpos]
                    tok.lineno = self.lineno
                    tok.type = tok.value
                    tok.lexpos = lexpos
                    self.lexpos = lexpos + 1
                    return tok

                # No match. Call t_error() if defined.
                if self.lexerrorf:
                    tok = LexToken()
                    tok.value = self.lexdata[lexpos:]
                    tok.lineno = self.lineno
                    tok.type = "error"
                    tok.lexer = self
                    tok.lexpos = lexpos
                    self.lexpos = lexpos
                    newtok = self.lexerrorf(tok)
                    if lexpos == self.lexpos:
                        # Error method didn't change text position at all. This is an error.
                        raise LexError("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
                    lexpos = self.lexpos
                    if not newtok: continue
                    return newtok

                self.lexpos = lexpos
                raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos],lexpos), lexdata[lexpos:])

        self.lexpos = lexpos + 1
        if self.lexdata is None:
            raise RuntimeError("No input string given with input()")
        return None

    # Iterator interface
    def __iter__(self):
        return self

    def next(self):
        t = self.token()
        if t is None:
            raise StopIteration
        return t

    __next__ = next
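
# A sketch of the t_error() hook that token() falls back on above (not part of
# the original file). The error rule receives a token whose value is the
# remaining input; it must advance the lexer or a LexError is raised:
#
#     def t_error(t):
#         sys.stderr.write("Illegal character '%s'\n" % t.value[0])
#         t.lexer.skip(1)    # move past the offending character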

# -----------------------------------------------------------------------------
# === Lex Builder ===
#
# The functions and classes below are used to collect lexing information
# and build a Lexer object from it.
# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
# get_caller_module_dict()
#
# This function returns a dictionary containing all of the symbols defined within
# a caller further down the call stack.  This is used to get the environment
# associated with the lex() call if none was provided.
# -----------------------------------------------------------------------------

def get_caller_module_dict(levels):
    try:
        raise RuntimeError
    except RuntimeError:
        e,b,t = sys.exc_info()
        f = t.tb_frame
        while levels > 0:
            f = f.f_back
            levels -= 1
        ldict = f.f_globals.copy()
        if f.f_globals != f.f_locals:
            ldict.update(f.f_locals)

        return ldict

# -----------------------------------------------------------------------------
# _funcs_to_names()
#
# Given a list of regular expression functions, this converts it to a list
# suitable for output to a table file
# -----------------------------------------------------------------------------

def _funcs_to_names(funclist,namelist):
    result = []
    for f,name in zip(funclist,namelist):
        if f and f[0]:
            result.append((name, f[1]))
        else:
            result.append(f)
    return result

# -----------------------------------------------------------------------------
# _names_to_funcs()
#
# Given a list of regular expression function names, this converts it back to
# functions.
# -----------------------------------------------------------------------------

def _names_to_funcs(namelist,fdict):
    result = []
    for n in namelist:
        if n and n[0]:
            result.append((fdict[n[0]],n[1]))
        else:
            result.append(n)
    return result

# -----------------------------------------------------------------------------
# _form_master_re()
#
# This function takes a list of all of the regex components and attempts to
# form the master regular expression.  Given limitations in the Python re
# module, it may be necessary to break the master regex into separate expressions.
# -----------------------------------------------------------------------------

def _form_master_re(relist,reflags,ldict,toknames):
    if not relist: return []
    regex = "|".join(relist)
    try:
        lexre = re.compile(regex,re.VERBOSE | reflags)

        # Build the index to function map for the matching engine
        lexindexfunc = [ None ] * (max(lexre.groupindex.values())+1)
        lexindexnames = lexindexfunc[:]

        for f,i in lexre.groupindex.items():
            handle = ldict.get(f,None)
            if type(handle) in (types.FunctionType, types.MethodType):
                lexindexfunc[i] = (handle,toknames[f])
                lexindexnames[i] = f
            elif handle is not None:
                lexindexnames[i] = f
                if f.find("ignore_") > 0:
                    lexindexfunc[i] = (None,None)
                else:
                    lexindexfunc[i] = (None, toknames[f])

        return [(lexre,lexindexfunc)],[regex],[lexindexnames]
    except Exception:
        m = int(len(relist)/2)
        if m == 0: m = 1
        llist, lre, lnames = _form_master_re(relist[:m],reflags,ldict,toknames)
        rlist, rre, rnames = _form_master_re(relist[m:],reflags,ldict,toknames)
        return llist+rlist, lre+rre, lnames+rnames
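
# To illustrate (not in the original file): for hypothetical rules t_NUMBER and
# t_PLUS, the master regex formed above is an alternation of named groups, e.g.
#
#     (?P<t_NUMBER>\d+)|(?P<t_PLUS>\+)
#
# and m.lastindex in token() identifies which group (and hence which rule) matched.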

# -----------------------------------------------------------------------------
# def _statetoken(s,names)
#
# Given a declaration name s of the form "t_" and a dictionary whose keys are
# state names, this function returns a tuple (states,tokenname) where states
# is a tuple of state names and tokenname is the name of the token.  For example,
# calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM')
# -----------------------------------------------------------------------------

def _statetoken(s,names):
    nonstate = 1
    parts = s.split("_")
    for i in range(1,len(parts)):
        if not parts[i] in names and parts[i] != 'ANY': break
    if i > 1:
        states = tuple(parts[1:i])
    else:
        states = ('INITIAL',)

    if 'ANY' in states:
        states = tuple(names)

    tokenname = "_".join(parts[i:])
    return (states,tokenname)


# -----------------------------------------------------------------------------
# LexerReflect()
#
# This class represents information needed to build a lexer as extracted from a
# user's input file.
# -----------------------------------------------------------------------------
class LexerReflect(object):
    def __init__(self,ldict,log=None,reflags=0):
        self.ldict = ldict
        self.error_func = None
        self.tokens = []
        self.reflags = reflags
        self.stateinfo = { 'INITIAL' : 'inclusive'}
        self.files = {}
        self.error = 0

        if log is None:
            self.log = PlyLogger(sys.stderr)
        else:
            self.log = log

    # Get all of the basic information
    def get_all(self):
        self.get_tokens()
        self.get_literals()
        self.get_states()
        self.get_rules()

    # Validate all of the information
    def validate_all(self):
        self.validate_tokens()
        self.validate_literals()
        self.validate_rules()
        return self.error

    # Get the tokens map
    def get_tokens(self):
        tokens = self.ldict.get("tokens",None)
        if not tokens:
            self.log.error("No token list is defined")
            self.error = 1
            return

        if not isinstance(tokens,(list, tuple)):
            self.log.error("tokens must be a list or tuple")
            self.error = 1
            return

        if not tokens:
            self.log.error("tokens is empty")
            self.error = 1
            return

        self.tokens = tokens

    # Validate the tokens
    def validate_tokens(self):
        terminals = {}
        for n in self.tokens:
            if not _is_identifier.match(n):
                self.log.error("Bad token name '%s'",n)
                self.error = 1
            if n in terminals:
                self.log.warning("Token '%s' multiply defined", n)
            terminals[n] = 1

    # Get the literals specifier
    def get_literals(self):
        self.literals = self.ldict.get("literals","")

    # Validate literals
    def validate_literals(self):
        try:
            for c in self.literals:
                if not isinstance(c,StringTypes) or len(c) > 1:
                    self.log.error("Invalid literal %s. Must be a single character", repr(c))
                    self.error = 1
                    continue

        except TypeError:
            self.log.error("Invalid literals specification. literals must be a sequence of characters")
            self.error = 1
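
    # For context (not in the original file): the `tokens` and `literals`
    # values collected above come from the user's rule module, e.g.
    #
    #     tokens = ('NUMBER', 'ID')      # token names, all hypothetical
    #     literals = "+-*/()"            # single-character tokens passed through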

    def get_states(self):
        self.states = self.ldict.get("states",None)
        # Build statemap
        if self.states:
            if not isinstance(self.states,(tuple,list)):
                self.log.error("states must be defined as a tuple or list")
                self.error = 1
            else:
                for s in self.states:
                    if not isinstance(s,tuple) or len(s) != 2:
                        self.log.error("Invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')",repr(s))
                        self.error = 1
                        continue
                    name, statetype = s
                    if not isinstance(name,StringTypes):
                        self.log.error("State name %s must be a string", repr(name))
                        self.error = 1
                        continue
                    if not (statetype == 'inclusive' or statetype == 'exclusive'):
                        self.log.error("State type for state %s must be 'inclusive' or 'exclusive'",name)
                        self.error = 1
                        continue
                    if name in self.stateinfo:
                        self.log.error("State '%s' already defined",name)
                        self.error = 1
                        continue
                    self.stateinfo[name] = statetype

    # Get all of the symbols with a t_ prefix and sort them into various
    # categories (functions, strings, error functions, and ignore characters)

    def get_rules(self):
        tsymbols = [f for f in self.ldict if f[:2] == 't_' ]

        # Now build up a list of functions and a list of strings
        self.toknames = { }        # Mapping of symbols to token names
        self.funcsym = { }         # Symbols defined as functions
        self.strsym = { }          # Symbols defined as strings
        self.ignore = { }          # Ignore strings by state
        self.errorf = { }          # Error functions by state

        for s in self.stateinfo:
            self.funcsym[s] = []
            self.strsym[s] = []

        if len(tsymbols) == 0:
            self.log.error("No rules of the form t_rulename are defined")
            self.error = 1
            return

        for f in tsymbols:
            t = self.ldict[f]
            states, tokname = _statetoken(f,self.stateinfo)
            self.toknames[f] = tokname

            if hasattr(t,"__call__"):
                if tokname == 'error':
                    for s in states:
                        self.errorf[s] = t
                elif tokname == 'ignore':
                    line = func_code(t).co_firstlineno
                    file = func_code(t).co_filename
                    self.log.error("%s:%d: Rule '%s' must be defined as a string",file,line,t.__name__)
                    self.error = 1
                else:
                    for s in states:
                        self.funcsym[s].append((f,t))
            elif isinstance(t, StringTypes):
                if tokname == 'ignore':
                    for s in states:
                        self.ignore[s] = t
                    if "\\" in t:
                        self.log.warning("%s contains a literal backslash '\\'",f)

                elif tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", f)
                    self.error = 1
                else:
                    for s in states:
                        self.strsym[s].append((f,t))
            else:
                self.log.error("%s not defined as a function or string", f)
                self.error = 1

        # Sort the functions by line number
        for f in self.funcsym.values():
            f.sort(key=lambda x: func_code(x[1]).co_firstlineno)

        # Sort the strings by regular expression length
        for s in self.strsym.values():
            if sys.version_info[0] < 3:
                s.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1])))
            else:
                # Python 3.0
                s.sort(key=lambda x: len(x[1]),reverse=True)
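
    # A sketch of the rule forms sorted above (not in the original file; names
    # are hypothetical). Functions carry their regex in the docstring; strings
    # are the regex itself; a state prefix ties a rule to a declared state:
    #
    #     def t_NUMBER(t):
    #         r'\d+'
    #         t.value = int(t.value)
    #         return t
    #
    #     t_PLUS = r'\+'                 # string rule
    #     t_ignore = ' \t'               # ignored characters
    #     t_ccode_STRING = r'"[^"]*"'    # rule active only in state 'ccode'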

    # Validate all of the t_rules collected
    def validate_rules(self):
        for state in self.stateinfo:
            # Validate all rules defined by functions

            for fname, f in self.funcsym[state]:
                line = func_code(f).co_firstlineno
                file = func_code(f).co_filename
                self.files[file] = 1

                tokname = self.toknames[fname]
                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = func_code(f).co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments",file,line,f.__name__)
                    self.error = 1
                    continue

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file,line,f.__name__)
                    self.error = 1
                    continue

                if not f.__doc__:
                    self.log.error("%s:%d: No regular expression defined for rule '%s'",file,line,f.__name__)
                    self.error = 1
                    continue

                try:
                    c = re.compile("(?P<%s>%s)" % (fname,f.__doc__), re.VERBOSE | self.reflags)
                    if c.match(""):
                        self.log.error("%s:%d: Regular expression for rule '%s' matches empty string", file,line,f.__name__)
                        self.error = 1
                except re.error:
                    _etype, e, _etrace = sys.exc_info()
                    self.log.error("%s:%d: Invalid regular expression for rule '%s'. %s", file,line,f.__name__,e)
                    if '#' in f.__doc__:
                        self.log.error("%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'",file,line, f.__name__)
                    self.error = 1

            # Validate all rules defined by strings
            for name,r in self.strsym[state]:
                tokname = self.toknames[name]
                if tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", name)
                    self.error = 1
                    continue

                if not tokname in self.tokens and tokname.find("ignore_") < 0:
                    self.log.error("Rule '%s' defined for an unspecified token %s",name,tokname)
                    self.error = 1
                    continue

                try:
                    c = re.compile("(?P<%s>%s)" % (name,r),re.VERBOSE | self.reflags)
                    if c.match(""):
                        self.log.error("Regular expression for rule '%s' matches empty string",name)
                        self.error = 1
                except re.error:
                    _etype, e, _etrace = sys.exc_info()
                    self.log.error("Invalid regular expression for rule '%s'. %s",name,e)
                    if '#' in r:
                        self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'",name)
                    self.error = 1

            if not self.funcsym[state] and not self.strsym[state]:
                self.log.error("No rules defined for state '%s'",state)
                self.error = 1

            # Validate the error function
            efunc = self.errorf.get(state,None)
            if efunc:
                f = efunc
                line = func_code(f).co_firstlineno
                file = func_code(f).co_filename
                self.files[file] = 1

                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = func_code(f).co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments",file,line,f.__name__)
                    self.error = 1

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file,line,f.__name__)
                    self.error = 1

        for f in self.files:
            self.validate_file(f)

    # -----------------------------------------------------------------------------
    # validate_file()
    #
    # This checks to see if there are duplicated t_rulename() functions or strings
    # in the lexer input file.  This is done using a simple regular expression
    # match on each line in the given file.
    # -----------------------------------------------------------------------------

    def validate_file(self,filename):
        import os.path
        base,ext = os.path.splitext(filename)
        if ext != '.py': return         # No idea what the file is. Return OK

        try:
            f = open(filename)
            lines = f.readlines()
            f.close()
        except IOError:
            return                      # Couldn't find the file.  Don't worry about it

        fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
        sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')

        counthash = { }
        linen = 1
        for l in lines:
            m = fre.match(l)
            if not m:
                m = sre.match(l)
            if m:
                name = m.group(1)
                prev = counthash.get(name)
                if not prev:
                    counthash[name] = linen
                else:
                    self.log.error("%s:%d: Rule %s redefined. Previously defined on line %d",filename,linen,name,prev)
                    self.error = 1
            linen += 1

# -----------------------------------------------------------------------------
# lex(module)
#
# Build all of the regular expression rules from definitions in the supplied module
# -----------------------------------------------------------------------------
def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0,nowarn=0,outputdir="", debuglog=None, errorlog=None):
    global lexer
    ldict = None
    stateinfo = { 'INITIAL' : 'inclusive'}
    lexobj = Lexer()
    lexobj.lexoptimize = optimize
    global token,input

    if errorlog is None:
        errorlog = PlyLogger(sys.stderr)

    if debug:
        if debuglog is None:
            debuglog = PlyLogger(sys.stderr)

    # Get the module dictionary used for the lexer
    if object: module = object

    if module:
        _items = [(k,getattr(module,k)) for k in dir(module)]
        ldict = dict(_items)
    else:
        ldict = get_caller_module_dict(2)

    # Collect lexer information from the dictionary
    linfo = LexerReflect(ldict,log=errorlog,reflags=reflags)
    linfo.get_all()
    if not optimize:
        if linfo.validate_all():
            raise SyntaxError("Can't build lexer")
lexer") 891*eb8dc403SDave Cobbley 892*eb8dc403SDave Cobbley if optimize and lextab: 893*eb8dc403SDave Cobbley try: 894*eb8dc403SDave Cobbley lexobj.readtab(lextab,ldict) 895*eb8dc403SDave Cobbley token = lexobj.token 896*eb8dc403SDave Cobbley input = lexobj.input 897*eb8dc403SDave Cobbley lexer = lexobj 898*eb8dc403SDave Cobbley return lexobj 899*eb8dc403SDave Cobbley 900*eb8dc403SDave Cobbley except ImportError: 901*eb8dc403SDave Cobbley pass 902*eb8dc403SDave Cobbley 903*eb8dc403SDave Cobbley # Dump some basic debugging information 904*eb8dc403SDave Cobbley if debug: 905*eb8dc403SDave Cobbley debuglog.info("lex: tokens = %r", linfo.tokens) 906*eb8dc403SDave Cobbley debuglog.info("lex: literals = %r", linfo.literals) 907*eb8dc403SDave Cobbley debuglog.info("lex: states = %r", linfo.stateinfo) 908*eb8dc403SDave Cobbley 909*eb8dc403SDave Cobbley # Build a dictionary of valid token names 910*eb8dc403SDave Cobbley lexobj.lextokens = { } 911*eb8dc403SDave Cobbley for n in linfo.tokens: 912*eb8dc403SDave Cobbley lexobj.lextokens[n] = 1 913*eb8dc403SDave Cobbley 914*eb8dc403SDave Cobbley # Get literals specification 915*eb8dc403SDave Cobbley if isinstance(linfo.literals,(list,tuple)): 916*eb8dc403SDave Cobbley lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals) 917*eb8dc403SDave Cobbley else: 918*eb8dc403SDave Cobbley lexobj.lexliterals = linfo.literals 919*eb8dc403SDave Cobbley 920*eb8dc403SDave Cobbley # Get the stateinfo dictionary 921*eb8dc403SDave Cobbley stateinfo = linfo.stateinfo 922*eb8dc403SDave Cobbley 923*eb8dc403SDave Cobbley regexs = { } 924*eb8dc403SDave Cobbley # Build the master regular expressions 925*eb8dc403SDave Cobbley for state in stateinfo: 926*eb8dc403SDave Cobbley regex_list = [] 927*eb8dc403SDave Cobbley 928*eb8dc403SDave Cobbley # Add rules defined by functions first 929*eb8dc403SDave Cobbley for fname, f in linfo.funcsym[state]: 930*eb8dc403SDave Cobbley line = func_code(f).co_firstlineno 931*eb8dc403SDave Cobbley file = func_code(f).co_filename 932*eb8dc403SDave Cobbley regex_list.append("(?P<%s>%s)" % (fname,f.__doc__)) 933*eb8dc403SDave Cobbley if debug: 934*eb8dc403SDave Cobbley debuglog.info("lex: Adding rule %s -> '%s' (state '%s')",fname,f.__doc__, state) 935*eb8dc403SDave Cobbley 936*eb8dc403SDave Cobbley # Now add all of the simple rules 937*eb8dc403SDave Cobbley for name,r in linfo.strsym[state]: 938*eb8dc403SDave Cobbley regex_list.append("(?P<%s>%s)" % (name,r)) 939*eb8dc403SDave Cobbley if debug: 940*eb8dc403SDave Cobbley debuglog.info("lex: Adding rule %s -> '%s' (state '%s')",name,r, state) 941*eb8dc403SDave Cobbley 942*eb8dc403SDave Cobbley regexs[state] = regex_list 943*eb8dc403SDave Cobbley 944*eb8dc403SDave Cobbley # Build the master regular expressions 945*eb8dc403SDave Cobbley 946*eb8dc403SDave Cobbley if debug: 947*eb8dc403SDave Cobbley debuglog.info("lex: ==== MASTER REGEXS FOLLOW ====") 948*eb8dc403SDave Cobbley 949*eb8dc403SDave Cobbley for state in regexs: 950*eb8dc403SDave Cobbley lexre, re_text, re_names = _form_master_re(regexs[state],reflags,ldict,linfo.toknames) 951*eb8dc403SDave Cobbley lexobj.lexstatere[state] = lexre 952*eb8dc403SDave Cobbley lexobj.lexstateretext[state] = re_text 953*eb8dc403SDave Cobbley lexobj.lexstaterenames[state] = re_names 954*eb8dc403SDave Cobbley if debug: 955*eb8dc403SDave Cobbley for i in range(len(re_text)): 956*eb8dc403SDave Cobbley debuglog.info("lex: state '%s' : regex[%d] = '%s'",state, i, re_text[i]) 957*eb8dc403SDave Cobbley 958*eb8dc403SDave Cobbley # For inclusive states, we need 

    # For inclusive states, we need to add the regular expressions from the INITIAL state
    for state,stype in stateinfo.items():
        if state != "INITIAL" and stype == 'inclusive':
            lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
            lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])
            lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIAL'])

    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere["INITIAL"]
    lexobj.lexretext = lexobj.lexstateretext["INITIAL"]
    lexobj.lexreflags = reflags

    # Set up ignore variables
    lexobj.lexstateignore = linfo.ignore
    lexobj.lexignore = lexobj.lexstateignore.get("INITIAL","")

    # Set up error functions
    lexobj.lexstateerrorf = linfo.errorf
    lexobj.lexerrorf = linfo.errorf.get("INITIAL",None)
    if not lexobj.lexerrorf:
        errorlog.warning("No t_error rule is defined")

    # Check state information for ignore and error rules
    for s,stype in stateinfo.items():
        if stype == 'exclusive':
            if not s in linfo.errorf:
                errorlog.warning("No error rule is defined for exclusive state '%s'", s)
            if not s in linfo.ignore and lexobj.lexignore:
                errorlog.warning("No ignore rule is defined for exclusive state '%s'", s)
        elif stype == 'inclusive':
            if not s in linfo.errorf:
                linfo.errorf[s] = linfo.errorf.get("INITIAL",None)
            if not s in linfo.ignore:
                linfo.ignore[s] = linfo.ignore.get("INITIAL","")

    # Create global versions of the token() and input() functions
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # If in optimize mode, we write the lextab
    if lextab and optimize:
        lexobj.writetab(lextab,outputdir)

    return lexobj

# -----------------------------------------------------------------------------
# runmain()
#
# This runs the lexer as a main program
# -----------------------------------------------------------------------------

def runmain(lexer=None,data=None):
    if not data:
        try:
            filename = sys.argv[1]
            f = open(filename)
            data = f.read()
            f.close()
        except IndexError:
            sys.stdout.write("Reading from standard input (type EOF to end):\n")
            data = sys.stdin.read()

    if lexer:
        _input = lexer.input
    else:
        _input = input
    _input(data)
    if lexer:
        _token = lexer.token
    else:
        _token = token

    while 1:
        tok = _token()
        if not tok: break
        sys.stdout.write("(%s,%r,%d,%d)\n" % (tok.type, tok.value, tok.lineno,tok.lexpos))

# -----------------------------------------------------------------------------
# @TOKEN(regex)
#
# This decorator function can be used to set the regex expression on a function
# when its docstring might need to be set in an alternative way
# -----------------------------------------------------------------------------

def TOKEN(r):
    def set_doc(f):
        if hasattr(r,"__call__"):
            f.__doc__ = r.__doc__
        else:
            f.__doc__ = r
        return f
    return set_doc

# Alternative spelling of the TOKEN decorator
Token = TOKEN
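
# A usage sketch for @TOKEN (not part of the original file; the `identifier`
# pattern and t_ID rule are hypothetical). It attaches a regex built outside
# the function as the rule's docstring:
#
#     identifier = r'[a-zA-Z_][a-zA-Z0-9_]*'
#
#     @TOKEN(identifier)
#     def t_ID(t):
#         return t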