1# pyshlex.py - PLY compatible lexer for pysh. 2# 3# Copyright 2007 Patrick Mezard 4# 5# This software may be used and distributed according to the terms 6# of the GNU General Public License, incorporated herein by reference. 7 8# TODO: 9# - review all "char in 'abc'" snippets: the empty string can be matched 10# - test line continuations within quoted/expansion strings 11# - eof is buggy wrt sublexers 12# - the lexer cannot really work in pull mode as it would be required to run 13# PLY in pull mode. It was designed to work incrementally and it would not be 14# that hard to enable pull mode. 15import re 16 17from ply import lex 18from bb.pysh.sherrors import * 19 20class NeedMore(Exception): 21 pass 22 23def is_blank(c): 24 return c in (' ', '\t') 25 26_RE_DIGITS = re.compile(r'^\d+$') 27 28def are_digits(s): 29 return _RE_DIGITS.search(s) is not None 30 31_OPERATORS = dict([ 32 ('&&', 'AND_IF'), 33 ('||', 'OR_IF'), 34 (';;', 'DSEMI'), 35 ('<<', 'DLESS'), 36 ('>>', 'DGREAT'), 37 ('<&', 'LESSAND'), 38 ('>&', 'GREATAND'), 39 ('<>', 'LESSGREAT'), 40 ('<<-', 'DLESSDASH'), 41 ('>|', 'CLOBBER'), 42 ('&', 'AMP'), 43 (';', 'COMMA'), 44 ('<', 'LESS'), 45 ('>', 'GREATER'), 46 ('(', 'LPARENS'), 47 (')', 'RPARENS'), 48]) 49 50#Make a function to silence pychecker "Local variable shadows global" 51def make_partial_ops(): 52 partials = {} 53 for k in _OPERATORS: 54 for i in range(1, len(k)+1): 55 partials[k[:i]] = None 56 return partials 57 58_PARTIAL_OPERATORS = make_partial_ops() 59 60def is_partial_op(s): 61 """Return True if s matches a non-empty subpart of an operator starting 62 at its first character. 63 """ 64 return s in _PARTIAL_OPERATORS 65 66def is_op(s): 67 """If s matches an operator, returns the operator identifier. Return None 68 otherwise. 69 """ 70 return _OPERATORS.get(s) 71 72_RESERVEDS = dict([ 73 ('if', 'If'), 74 ('then', 'Then'), 75 ('else', 'Else'), 76 ('elif', 'Elif'), 77 ('fi', 'Fi'), 78 ('do', 'Do'), 79 ('done', 'Done'), 80 ('case', 'Case'), 81 ('esac', 'Esac'), 82 ('while', 'While'), 83 ('until', 'Until'), 84 ('for', 'For'), 85 ('{', 'Lbrace'), 86 ('}', 'Rbrace'), 87 ('!', 'Bang'), 88 ('in', 'In'), 89 ('|', 'PIPE'), 90]) 91 92def get_reserved(s): 93 return _RESERVEDS.get(s) 94 95_RE_NAME = re.compile(r'^[0-9a-zA-Z_]+$') 96 97def is_name(s): 98 return _RE_NAME.search(s) is not None 99 100def find_chars(seq, chars): 101 for i,v in enumerate(seq): 102 if v in chars: 103 return i,v 104 return -1, None 105 106class WordLexer: 107 """WordLexer parse quoted or expansion expressions and return an expression 108 tree. The input string can be any well formed sequence beginning with quoting 109 or expansion character. Embedded expressions are handled recursively. The 110 resulting tree is made of lists and strings. Lists represent quoted or 111 expansion expressions. Each list first element is the opening separator, 112 the last one the closing separator. In-between can be any number of strings 113 or lists for sub-expressions. Non quoted/expansion expression can written as 114 strings or as lists with empty strings as starting and ending delimiters. 115 """ 116 117 NAME_CHARSET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_' 118 NAME_CHARSET = dict(zip(NAME_CHARSET, NAME_CHARSET)) 119 120 SPECIAL_CHARSET = '@*#?-$!0' 121 122 #Characters which can be escaped depends on the current delimiters 123 ESCAPABLE = { 124 '`': set(['$', '\\', '`']), 125 '"': set(['$', '\\', '`', '"']), 126 "'": set(), 127 } 128 129 def __init__(self, heredoc = False): 130 # _buffer is the unprocessed input characters buffer 131 self._buffer = [] 132 # _stack is empty or contains a quoted list being processed 133 # (this is the DFS path to the quoted expression being evaluated). 134 self._stack = [] 135 self._escapable = None 136 # True when parsing unquoted here documents 137 self._heredoc = heredoc 138 139 def add(self, data, eof=False): 140 """Feed the lexer with more data. If the quoted expression can be 141 delimited, return a tuple (expr, remaining) containing the expression 142 tree and the unconsumed data. 143 Otherwise, raise NeedMore. 144 """ 145 self._buffer += list(data) 146 self._parse(eof) 147 148 result = self._stack[0] 149 remaining = ''.join(self._buffer) 150 self._stack = [] 151 self._buffer = [] 152 return result, remaining 153 154 def _is_escapable(self, c, delim=None): 155 if delim is None: 156 if self._heredoc: 157 # Backslashes works as if they were double quoted in unquoted 158 # here-documents 159 delim = '"' 160 else: 161 if len(self._stack)<=1: 162 return True 163 delim = self._stack[-2][0] 164 165 escapables = self.ESCAPABLE.get(delim, None) 166 return escapables is None or c in escapables 167 168 def _parse_squote(self, buf, result, eof): 169 if not buf: 170 raise NeedMore() 171 try: 172 pos = buf.index("'") 173 except ValueError: 174 raise NeedMore() 175 result[-1] += ''.join(buf[:pos]) 176 result += ["'"] 177 return pos+1, True 178 179 def _parse_bquote(self, buf, result, eof): 180 if not buf: 181 raise NeedMore() 182 183 if buf[0]=='\n': 184 #Remove line continuations 185 result[:] = ['', '', ''] 186 elif self._is_escapable(buf[0]): 187 result[-1] += buf[0] 188 result += [''] 189 else: 190 #Keep as such 191 result[:] = ['', '\\'+buf[0], ''] 192 193 return 1, True 194 195 def _parse_dquote(self, buf, result, eof): 196 if not buf: 197 raise NeedMore() 198 pos, sep = find_chars(buf, '$\\`"') 199 if pos==-1: 200 raise NeedMore() 201 202 result[-1] += ''.join(buf[:pos]) 203 if sep=='"': 204 result += ['"'] 205 return pos+1, True 206 else: 207 #Keep everything until the separator and defer processing 208 return pos, False 209 210 def _parse_command(self, buf, result, eof): 211 if not buf: 212 raise NeedMore() 213 214 chars = '$\\`"\'' 215 if result[0] == '$(': 216 chars += ')' 217 pos, sep = find_chars(buf, chars) 218 if pos == -1: 219 raise NeedMore() 220 221 result[-1] += ''.join(buf[:pos]) 222 if (result[0]=='$(' and sep==')') or (result[0]=='`' and sep=='`'): 223 result += [sep] 224 return pos+1, True 225 else: 226 return pos, False 227 228 def _parse_parameter(self, buf, result, eof): 229 if not buf: 230 raise NeedMore() 231 232 pos, sep = find_chars(buf, '$\\`"\'}') 233 if pos==-1: 234 raise NeedMore() 235 236 result[-1] += ''.join(buf[:pos]) 237 if sep=='}': 238 result += [sep] 239 return pos+1, True 240 else: 241 return pos, False 242 243 def _parse_dollar(self, buf, result, eof): 244 sep = result[0] 245 if sep=='$': 246 if not buf: 247 #TODO: handle empty $ 248 raise NeedMore() 249 if buf[0]=='(': 250 if len(buf)==1: 251 raise NeedMore() 252 253 if buf[1]=='(': 254 result[0] = '$((' 255 buf[:2] = [] 256 else: 257 result[0] = '$(' 258 buf[:1] = [] 259 260 elif buf[0]=='{': 261 result[0] = '${' 262 buf[:1] = [] 263 else: 264 if buf[0] in self.SPECIAL_CHARSET: 265 result[-1] = buf[0] 266 read = 1 267 else: 268 for read,c in enumerate(buf): 269 if c not in self.NAME_CHARSET: 270 break 271 else: 272 if not eof: 273 raise NeedMore() 274 read += 1 275 276 result[-1] += ''.join(buf[0:read]) 277 278 if not result[-1]: 279 result[:] = ['', result[0], ''] 280 else: 281 result += [''] 282 return read,True 283 284 sep = result[0] 285 if sep=='$(': 286 parsefunc = self._parse_command 287 elif sep=='${': 288 parsefunc = self._parse_parameter 289 else: 290 raise NotImplementedError(sep) 291 292 pos, closed = parsefunc(buf, result, eof) 293 return pos, closed 294 295 def _parse(self, eof): 296 buf = self._buffer 297 stack = self._stack 298 recurse = False 299 300 while 1: 301 if not stack or recurse: 302 if not buf: 303 raise NeedMore() 304 if buf[0] not in ('"\\`$\''): 305 raise ShellSyntaxError('Invalid quoted string sequence') 306 stack.append([buf[0], '']) 307 buf[:1] = [] 308 recurse = False 309 310 result = stack[-1] 311 if result[0]=="'": 312 parsefunc = self._parse_squote 313 elif result[0]=='\\': 314 parsefunc = self._parse_bquote 315 elif result[0]=='"': 316 parsefunc = self._parse_dquote 317 elif result[0]=='`': 318 parsefunc = self._parse_command 319 elif result[0][0]=='$': 320 parsefunc = self._parse_dollar 321 else: 322 raise NotImplementedError() 323 324 read, closed = parsefunc(buf, result, eof) 325 326 buf[:read] = [] 327 if closed: 328 if len(stack)>1: 329 #Merge in parent expression 330 parsed = stack.pop() 331 stack[-1] += [parsed] 332 stack[-1] += [''] 333 else: 334 break 335 else: 336 recurse = True 337 338def normalize_wordtree(wtree): 339 """Fold back every literal sequence (delimited with empty strings) into 340 parent sequence. 341 """ 342 def normalize(wtree): 343 result = [] 344 for part in wtree[1:-1]: 345 if isinstance(part, list): 346 part = normalize(part) 347 if part[0]=='': 348 #Move the part content back at current level 349 result += part[1:-1] 350 continue 351 elif not part: 352 #Remove empty strings 353 continue 354 result.append(part) 355 if not result: 356 result = [''] 357 return [wtree[0]] + result + [wtree[-1]] 358 359 return normalize(wtree) 360 361 362def make_wordtree(token, here_document=False): 363 """Parse a delimited token and return a tree similar to the ones returned by 364 WordLexer. token may contain any combinations of expansion/quoted fields and 365 non-ones. 366 """ 367 tree = [''] 368 remaining = token 369 delimiters = '\\$`' 370 if not here_document: 371 delimiters += '\'"' 372 373 while 1: 374 pos, sep = find_chars(remaining, delimiters) 375 if pos==-1: 376 tree += [remaining, ''] 377 return normalize_wordtree(tree) 378 tree.append(remaining[:pos]) 379 remaining = remaining[pos:] 380 381 try: 382 result, remaining = WordLexer(heredoc = here_document).add(remaining, True) 383 except NeedMore: 384 raise ShellSyntaxError('Invalid token "%s"') 385 tree.append(result) 386 387 388def wordtree_as_string(wtree): 389 """Rewrite an expression tree generated by make_wordtree as string.""" 390 def visit(node, output): 391 for child in node: 392 if isinstance(child, list): 393 visit(child, output) 394 else: 395 output.append(child) 396 397 output = [] 398 visit(wtree, output) 399 return ''.join(output) 400 401 402def unquote_wordtree(wtree): 403 """Fold the word tree while removing quotes everywhere. Other expansion 404 sequences are joined as such. 405 """ 406 def unquote(wtree): 407 unquoted = [] 408 if wtree[0] in ('', "'", '"', '\\'): 409 wtree = wtree[1:-1] 410 411 for part in wtree: 412 if isinstance(part, list): 413 part = unquote(part) 414 unquoted.append(part) 415 return ''.join(unquoted) 416 417 return unquote(wtree) 418 419 420class HereDocLexer: 421 """HereDocLexer delimits whatever comes from the here-document starting newline 422 not included to the closing delimiter line included. 423 """ 424 def __init__(self, op, delim): 425 assert op in ('<<', '<<-') 426 if not delim: 427 raise ShellSyntaxError('invalid here document delimiter %s' % str(delim)) 428 429 self._op = op 430 self._delim = delim 431 self._buffer = [] 432 self._token = [] 433 434 def add(self, data, eof): 435 """If the here-document was delimited, return a tuple (content, remaining). 436 Raise NeedMore() otherwise. 437 """ 438 self._buffer += list(data) 439 self._parse(eof) 440 token = ''.join(self._token) 441 remaining = ''.join(self._buffer) 442 self._token, self._remaining = [], [] 443 return token, remaining 444 445 def _parse(self, eof): 446 while 1: 447 #Look for first unescaped newline. Quotes may be ignored 448 escaped = False 449 for i,c in enumerate(self._buffer): 450 if escaped: 451 escaped = False 452 elif c=='\\': 453 escaped = True 454 elif c=='\n': 455 break 456 else: 457 i = -1 458 459 if i==-1 or self._buffer[i]!='\n': 460 if not eof: 461 raise NeedMore() 462 #No more data, maybe the last line is closing delimiter 463 line = ''.join(self._buffer) 464 eol = '' 465 self._buffer[:] = [] 466 else: 467 line = ''.join(self._buffer[:i]) 468 eol = self._buffer[i] 469 self._buffer[:i+1] = [] 470 471 if self._op=='<<-': 472 line = line.lstrip('\t') 473 474 if line==self._delim: 475 break 476 477 self._token += [line, eol] 478 if i==-1: 479 break 480 481class Token: 482 #TODO: check this is still in use 483 OPERATOR = 'OPERATOR' 484 WORD = 'WORD' 485 486 def __init__(self): 487 self.value = '' 488 self.type = None 489 490 def __getitem__(self, key): 491 #Behave like a two elements tuple 492 if key==0: 493 return self.type 494 if key==1: 495 return self.value 496 raise IndexError(key) 497 498 499class HereDoc: 500 def __init__(self, op, name=None): 501 self.op = op 502 self.name = name 503 self.pendings = [] 504 505TK_COMMA = 'COMMA' 506TK_AMPERSAND = 'AMP' 507TK_OP = 'OP' 508TK_TOKEN = 'TOKEN' 509TK_COMMENT = 'COMMENT' 510TK_NEWLINE = 'NEWLINE' 511TK_IONUMBER = 'IO_NUMBER' 512TK_ASSIGNMENT = 'ASSIGNMENT_WORD' 513TK_HERENAME = 'HERENAME' 514 515class Lexer: 516 """Main lexer. 517 518 Call add() until the script AST is returned. 519 """ 520 # Here-document handling makes the whole thing more complex because they basically 521 # force tokens to be reordered: here-content must come right after the operator 522 # and the here-document name, while some other tokens might be following the 523 # here-document expression on the same line. 524 # 525 # So, here-doc states are basically: 526 # *self._state==ST_NORMAL 527 # - self._heredoc.op is None: no here-document 528 # - self._heredoc.op is not None but name is: here-document operator matched, 529 # waiting for the document name/delimiter 530 # - self._heredoc.op and name are not None: here-document is ready, following 531 # tokens are being stored and will be pushed again when the document is 532 # completely parsed. 533 # *self._state==ST_HEREDOC 534 # - The here-document is being delimited by self._herelexer. Once it is done 535 # the content is pushed in front of the pending token list then all these 536 # tokens are pushed once again. 537 ST_NORMAL = 'ST_NORMAL' 538 ST_OP = 'ST_OP' 539 ST_BACKSLASH = 'ST_BACKSLASH' 540 ST_QUOTED = 'ST_QUOTED' 541 ST_COMMENT = 'ST_COMMENT' 542 ST_HEREDOC = 'ST_HEREDOC' 543 544 #Match end of backquote strings 545 RE_BACKQUOTE_END = re.compile(r'(?<!\\)(`)') 546 547 def __init__(self, parent_state = None): 548 self._input = [] 549 self._pos = 0 550 551 self._token = '' 552 self._type = TK_TOKEN 553 554 self._state = self.ST_NORMAL 555 self._parent_state = parent_state 556 self._wordlexer = None 557 558 self._heredoc = HereDoc(None) 559 self._herelexer = None 560 561 ### Following attributes are not used for delimiting token and can safely 562 ### be changed after here-document detection (see _push_toke) 563 564 # Count the number of tokens following a 'For' reserved word. Needed to 565 # return an 'In' reserved word if it comes in third place. 566 self._for_count = None 567 568 def add(self, data, eof=False): 569 """Feed the lexer with data. 570 571 When eof is set to True, returns unconsumed data or raise if the lexer 572 is in the middle of a delimiting operation. 573 Raise NeedMore otherwise. 574 """ 575 self._input += list(data) 576 self._parse(eof) 577 self._input[:self._pos] = [] 578 return ''.join(self._input) 579 580 def _parse(self, eof): 581 while self._state: 582 if self._pos>=len(self._input): 583 if not eof: 584 raise NeedMore() 585 elif self._state not in (self.ST_OP, self.ST_QUOTED, self.ST_HEREDOC): 586 #Delimit the current token and leave cleanly 587 self._push_token('') 588 break 589 else: 590 #Let the sublexer handle the eof themselves 591 pass 592 593 if self._state==self.ST_NORMAL: 594 self._parse_normal() 595 elif self._state==self.ST_COMMENT: 596 self._parse_comment() 597 elif self._state==self.ST_OP: 598 self._parse_op(eof) 599 elif self._state==self.ST_QUOTED: 600 self._parse_quoted(eof) 601 elif self._state==self.ST_HEREDOC: 602 self._parse_heredoc(eof) 603 else: 604 assert False, "Unknown state " + str(self._state) 605 606 if self._heredoc.op is not None: 607 raise ShellSyntaxError('missing here-document delimiter') 608 609 def _parse_normal(self): 610 c = self._input[self._pos] 611 if c=='\n': 612 self._push_token(c) 613 self._token = c 614 self._type = TK_NEWLINE 615 self._push_token('') 616 self._pos += 1 617 elif c in ('\\', '\'', '"', '`', '$'): 618 self._state = self.ST_QUOTED 619 elif is_partial_op(c): 620 self._push_token(c) 621 622 self._type = TK_OP 623 self._token += c 624 self._pos += 1 625 self._state = self.ST_OP 626 elif is_blank(c): 627 self._push_token(c) 628 629 #Discard blanks 630 self._pos += 1 631 elif self._token: 632 self._token += c 633 self._pos += 1 634 elif c=='#': 635 self._state = self.ST_COMMENT 636 self._type = TK_COMMENT 637 self._pos += 1 638 else: 639 self._pos += 1 640 self._token += c 641 642 def _parse_op(self, eof): 643 assert self._token 644 645 while 1: 646 if self._pos>=len(self._input): 647 if not eof: 648 raise NeedMore() 649 c = '' 650 else: 651 c = self._input[self._pos] 652 653 op = self._token + c 654 if c and is_partial_op(op): 655 #Still parsing an operator 656 self._token = op 657 self._pos += 1 658 else: 659 #End of operator 660 self._push_token(c) 661 self._state = self.ST_NORMAL 662 break 663 664 def _parse_comment(self): 665 while 1: 666 if self._pos>=len(self._input): 667 raise NeedMore() 668 669 c = self._input[self._pos] 670 if c=='\n': 671 #End of comment, do not consume the end of line 672 self._state = self.ST_NORMAL 673 break 674 else: 675 self._token += c 676 self._pos += 1 677 678 def _parse_quoted(self, eof): 679 """Precondition: the starting backquote/dollar is still in the input queue.""" 680 if not self._wordlexer: 681 self._wordlexer = WordLexer() 682 683 if self._pos<len(self._input): 684 #Transfer input queue character into the subparser 685 input = self._input[self._pos:] 686 self._pos += len(input) 687 688 wtree, remaining = self._wordlexer.add(input, eof) 689 self._wordlexer = None 690 self._token += wordtree_as_string(wtree) 691 692 #Put unparsed character back in the input queue 693 if remaining: 694 self._input[self._pos:self._pos] = list(remaining) 695 self._state = self.ST_NORMAL 696 697 def _parse_heredoc(self, eof): 698 assert not self._token 699 700 if self._herelexer is None: 701 self._herelexer = HereDocLexer(self._heredoc.op, self._heredoc.name) 702 703 if self._pos<len(self._input): 704 #Transfer input queue character into the subparser 705 input = self._input[self._pos:] 706 self._pos += len(input) 707 708 self._token, remaining = self._herelexer.add(input, eof) 709 710 #Reset here-document state 711 self._herelexer = None 712 heredoc, self._heredoc = self._heredoc, HereDoc(None) 713 if remaining: 714 self._input[self._pos:self._pos] = list(remaining) 715 self._state = self.ST_NORMAL 716 717 #Push pending tokens 718 heredoc.pendings[:0] = [(self._token, self._type, heredoc.name)] 719 for token, type, delim in heredoc.pendings: 720 self._token = token 721 self._type = type 722 self._push_token(delim) 723 724 def _push_token(self, delim): 725 if not self._token: 726 return 0 727 728 if self._heredoc.op is not None: 729 if self._heredoc.name is None: 730 #Here-document name 731 if self._type!=TK_TOKEN: 732 raise ShellSyntaxError("expecting here-document name, got '%s'" % self._token) 733 self._heredoc.name = unquote_wordtree(make_wordtree(self._token)) 734 self._type = TK_HERENAME 735 else: 736 #Capture all tokens until the newline starting the here-document 737 if self._type==TK_NEWLINE: 738 assert self._state==self.ST_NORMAL 739 self._state = self.ST_HEREDOC 740 741 self._heredoc.pendings.append((self._token, self._type, delim)) 742 self._token = '' 743 self._type = TK_TOKEN 744 return 1 745 746 # BEWARE: do not change parser state from here to the end of the function: 747 # when parsing between an here-document operator to the end of the line 748 # tokens are stored in self._heredoc.pendings. Therefore, they will not 749 # reach the section below. 750 751 #Check operators 752 if self._type==TK_OP: 753 #False positive because of partial op matching 754 op = is_op(self._token) 755 if not op: 756 self._type = TK_TOKEN 757 else: 758 #Map to the specific operator 759 self._type = op 760 if self._token in ('<<', '<<-'): 761 #Done here rather than in _parse_op because there is no need 762 #to change the parser state since we are still waiting for 763 #the here-document name 764 if self._heredoc.op is not None: 765 raise ShellSyntaxError("syntax error near token '%s'" % self._token) 766 assert self._heredoc.op is None 767 self._heredoc.op = self._token 768 769 if self._type==TK_TOKEN: 770 if '=' in self._token and not delim: 771 if self._token.startswith('='): 772 #Token is a WORD... a TOKEN that is. 773 pass 774 else: 775 prev = self._token[:self._token.find('=')] 776 if is_name(prev): 777 self._type = TK_ASSIGNMENT 778 else: 779 #Just a token (unspecified) 780 pass 781 else: 782 reserved = get_reserved(self._token) 783 if reserved is not None: 784 if reserved=='In' and self._for_count!=2: 785 #Sorry, not a reserved word after all 786 pass 787 else: 788 self._type = reserved 789 if reserved in ('For', 'Case'): 790 self._for_count = 0 791 elif are_digits(self._token) and delim in ('<', '>'): 792 #Detect IO_NUMBER 793 self._type = TK_IONUMBER 794 elif self._token==';': 795 self._type = TK_COMMA 796 elif self._token=='&': 797 self._type = TK_AMPERSAND 798 elif self._type==TK_COMMENT: 799 #Comments are not part of sh grammar, ignore them 800 self._token = '' 801 self._type = TK_TOKEN 802 return 0 803 804 if self._for_count is not None: 805 #Track token count in 'For' expression to detect 'In' reserved words. 806 #Can only be in third position, no need to go beyond 807 self._for_count += 1 808 if self._for_count==3: 809 self._for_count = None 810 811 self.on_token((self._token, self._type)) 812 self._token = '' 813 self._type = TK_TOKEN 814 return 1 815 816 def on_token(self, token): 817 raise NotImplementedError 818 819 820tokens = [ 821 TK_TOKEN, 822# To silence yacc unused token warnings 823# TK_COMMENT, 824 TK_NEWLINE, 825 TK_IONUMBER, 826 TK_ASSIGNMENT, 827 TK_HERENAME, 828] 829 830#Add specific operators 831tokens += _OPERATORS.values() 832#Add reserved words 833tokens += _RESERVEDS.values() 834 835class PLYLexer(Lexer): 836 """Bridge Lexer and PLY lexer interface.""" 837 def __init__(self): 838 Lexer.__init__(self) 839 self._tokens = [] 840 self._current = 0 841 self.lineno = 0 842 843 def on_token(self, token): 844 value, type = token 845 846 self.lineno = 0 847 t = lex.LexToken() 848 t.value = value 849 t.type = type 850 t.lexer = self 851 t.lexpos = 0 852 t.lineno = 0 853 854 self._tokens.append(t) 855 856 def is_empty(self): 857 return not bool(self._tokens) 858 859 #PLY compliant interface 860 def token(self): 861 if self._current>=len(self._tokens): 862 return None 863 t = self._tokens[self._current] 864 self._current += 1 865 return t 866 867 868def get_tokens(s): 869 """Parse the input string and return a tuple (tokens, unprocessed) where 870 tokens is a list of parsed tokens and unprocessed is the part of the input 871 string left untouched by the lexer. 872 """ 873 lexer = PLYLexer() 874 untouched = lexer.add(s, True) 875 tokens = [] 876 while 1: 877 token = lexer.token() 878 if token is None: 879 break 880 tokens.append(token) 881 882 tokens = [(t.value, t.type) for t in tokens] 883 return tokens, untouched 884