# pyshlex.py - PLY compatible lexer for pysh.
#
# Copyright 2007 Patrick Mezard
#
# This software may be used and distributed according to the terms
# of the GNU General Public License, incorporated herein by reference.

# TODO:
# - review all "char in 'abc'" snippets: the empty string can be matched
# - test line continuations within quoted/expansion strings
# - eof is buggy wrt sublexers
# - the lexer cannot really work in pull mode as it would be required to run
# PLY in pull mode. It was designed to work incrementally and it would not be
# that hard to enable pull mode.
import re

from ply import lex
from bb.pysh.sherrors import *

class NeedMore(Exception):
    pass

def is_blank(c):
    return c in (' ', '\t')

_RE_DIGITS = re.compile(r'^\d+$')

def are_digits(s):
    return _RE_DIGITS.search(s) is not None

_OPERATORS = dict([
    ('&&', 'AND_IF'),
    ('||', 'OR_IF'),
    (';;', 'DSEMI'),
    ('<<', 'DLESS'),
    ('>>', 'DGREAT'),
    ('<&', 'LESSAND'),
    ('>&', 'GREATAND'),
    ('<>', 'LESSGREAT'),
    ('<<-', 'DLESSDASH'),
    ('>|', 'CLOBBER'),
    ('&', 'AMP'),
    (';', 'COMMA'),
    ('<', 'LESS'),
    ('>', 'GREATER'),
    ('(', 'LPARENS'),
    (')', 'RPARENS'),
])

#Make a function to silence pychecker "Local variable shadows global"
def make_partial_ops():
    partials = {}
    for k in _OPERATORS:
        for i in range(1, len(k)+1):
            partials[k[:i]] = None
    return partials

_PARTIAL_OPERATORS = make_partial_ops()

def is_partial_op(s):
    """Return True if s matches a non-empty subpart of an operator starting
    at its first character.
    """
    return s in _PARTIAL_OPERATORS

def is_op(s):
    """If s matches an operator, returns the operator identifier. Return None
    otherwise.
    """
    return _OPERATORS.get(s)
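
# For instance, is_partial_op() accepts any non-empty operator prefix while
# is_op() only matches complete operators (illustrative values, checked
# against the tables above):
#
#   >>> is_partial_op('<<')
#   True
#   >>> is_op('<<')
#   'DLESS'
#   >>> is_op('<<-')
#   'DLESSDASH'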

_RESERVEDS = dict([
    ('if', 'If'),
    ('then', 'Then'),
    ('else', 'Else'),
    ('elif', 'Elif'),
    ('fi', 'Fi'),
    ('do', 'Do'),
    ('done', 'Done'),
    ('case', 'Case'),
    ('esac', 'Esac'),
    ('while', 'While'),
    ('until', 'Until'),
    ('for', 'For'),
    ('{', 'Lbrace'),
    ('}', 'Rbrace'),
    ('!', 'Bang'),
    ('in', 'In'),
    ('|', 'PIPE'),
])

def get_reserved(s):
    return _RESERVEDS.get(s)

_RE_NAME = re.compile(r'^[0-9a-zA-Z_]+$')

def is_name(s):
    return _RE_NAME.search(s) is not None

def find_chars(seq, chars):
    for i,v in enumerate(seq):
        if v in chars:
            return i,v
    return -1, None
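
# find_chars() returns the first occurrence of any of the given characters,
# or (-1, None) when none is found (illustrative):
#
#   >>> find_chars('ab$cd', '$`')
#   (2, '$')
#   >>> find_chars('abcd', '$`')
#   (-1, None)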

class WordLexer:
    """WordLexer parses quoted or expansion expressions and returns an
    expression tree. The input string can be any well-formed sequence
    beginning with a quoting or expansion character. Embedded expressions are
    handled recursively. The resulting tree is made of lists and strings.
    Lists represent quoted or expansion expressions. Each list's first element
    is the opening separator, the last one the closing separator. In between
    can be any number of strings or lists for sub-expressions. Non
    quoted/expansion expressions can be written as strings or as lists with
    empty strings as starting and ending delimiters.
    """

    NAME_CHARSET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
    NAME_CHARSET = dict(zip(NAME_CHARSET, NAME_CHARSET))

    SPECIAL_CHARSET = '@*#?-$!0'

    #Characters which can be escaped depend on the current delimiters
    ESCAPABLE = {
        '`': set(['$', '\\', '`']),
        '"': set(['$', '\\', '`', '"']),
        "'": set(),
    }

    def __init__(self, heredoc = False):
        # _buffer is the unprocessed input characters buffer
        self._buffer = []
        # _stack is empty or contains a quoted list being processed
        # (this is the DFS path to the quoted expression being evaluated).
        self._stack = []
        self._escapable = None
        # True when parsing unquoted here documents
        self._heredoc = heredoc

    def add(self, data, eof=False):
        """Feed the lexer with more data. If the quoted expression can be
        delimited, return a tuple (expr, remaining) containing the expression
        tree and the unconsumed data.
        Otherwise, raise NeedMore.
        """
        self._buffer += list(data)
        self._parse(eof)

        result = self._stack[0]
        remaining = ''.join(self._buffer)
        self._stack = []
        self._buffer = []
        return result, remaining
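
    # For instance, feeding a complete double-quoted expression returns its
    # tree plus the unconsumed tail (illustrative, hand-traced from the
    # parsing methods below):
    #
    #   >>> WordLexer().add('"hello $USER" tail', eof=True)
    #   (['"', 'hello ', ['$', 'USER', ''], '', '"'], ' tail')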

    def _is_escapable(self, c, delim=None):
        if delim is None:
            if self._heredoc:
                # Backslashes work as if they were double quoted in unquoted
                # here-documents
                delim = '"'
            else:
                if len(self._stack)<=1:
                    return True
                delim = self._stack[-2][0]

        escapables = self.ESCAPABLE.get(delim, None)
        return escapables is None or c in escapables

    def _parse_squote(self, buf, result, eof):
        if not buf:
            raise NeedMore()
        try:
            pos = buf.index("'")
        except ValueError:
            raise NeedMore()
        result[-1] += ''.join(buf[:pos])
        result += ["'"]
        return pos+1, True

    def _parse_bquote(self, buf, result, eof):
        if not buf:
            raise NeedMore()

        if buf[0]=='\n':
            #Remove line continuations
            result[:] = ['', '', '']
        elif self._is_escapable(buf[0]):
            result[-1] += buf[0]
            result += ['']
        else:
            #Keep as such
            result[:] = ['', '\\'+buf[0], '']

        return 1, True

    def _parse_dquote(self, buf, result, eof):
        if not buf:
            raise NeedMore()
        pos, sep = find_chars(buf, '$\\`"')
        if pos==-1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if sep=='"':
            result += ['"']
            return pos+1, True
        else:
            #Keep everything until the separator and defer processing
            return pos, False

    def _parse_command(self, buf, result, eof):
        if not buf:
            raise NeedMore()

        chars = '$\\`"\''
        if result[0] == '$(':
            chars += ')'
        pos, sep = find_chars(buf, chars)
        if pos == -1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if (result[0]=='$(' and sep==')') or (result[0]=='`' and sep=='`'):
            result += [sep]
            return pos+1, True
        else:
            return pos, False

    def _parse_parameter(self, buf, result, eof):
        if not buf:
            raise NeedMore()

        pos, sep = find_chars(buf, '$\\`"\'}')
        if pos==-1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if sep=='}':
            result += [sep]
            return pos+1, True
        else:
            return pos, False

    def _parse_dollar(self, buf, result, eof):
        sep = result[0]
        if sep=='$':
            if not buf:
                #TODO: handle empty $
                raise NeedMore()
            if buf[0]=='(':
                if len(buf)==1:
                    raise NeedMore()

                if buf[1]=='(':
                    result[0] = '$(('
                    buf[:2] = []
                else:
                    result[0] = '$('
                    buf[:1] = []

            elif buf[0]=='{':
                result[0] = '${'
                buf[:1] = []
            else:
                if buf[0] in self.SPECIAL_CHARSET:
                    result[-1] = buf[0]
                    read = 1
                else:
                    for read,c in enumerate(buf):
                        if c not in self.NAME_CHARSET:
                            break
                    else:
                        if not eof:
                            raise NeedMore()
                        read += 1

                    result[-1] += ''.join(buf[0:read])

                if not result[-1]:
                    result[:] = ['', result[0], '']
                else:
                    result += ['']
                return read,True

        sep = result[0]
        if sep=='$(':
            parsefunc = self._parse_command
        elif sep=='${':
            parsefunc = self._parse_parameter
        else:
            raise NotImplementedError(sep)

        pos, closed = parsefunc(buf, result, eof)
        return pos, closed

    def _parse(self, eof):
        buf = self._buffer
        stack = self._stack
        recurse = False

        while 1:
            if not stack or recurse:
                if not buf:
                    raise NeedMore()
                if buf[0] not in ('"\\`$\''):
                    raise ShellSyntaxError('Invalid quoted string sequence')
                stack.append([buf[0], ''])
                buf[:1] = []
                recurse = False

            result = stack[-1]
            if result[0]=="'":
                parsefunc = self._parse_squote
            elif result[0]=='\\':
                parsefunc = self._parse_bquote
            elif result[0]=='"':
                parsefunc = self._parse_dquote
            elif result[0]=='`':
                parsefunc = self._parse_command
            elif result[0][0]=='$':
                parsefunc = self._parse_dollar
            else:
                raise NotImplementedError()

            read, closed = parsefunc(buf, result, eof)

            buf[:read] = []
            if closed:
                if len(stack)>1:
                    #Merge in parent expression
                    parsed = stack.pop()
                    stack[-1] += [parsed]
                    stack[-1] += ['']
                else:
                    break
            else:
                recurse = True

def normalize_wordtree(wtree):
    """Fold back every literal sequence (delimited with empty strings) into
    the parent sequence.
    """
    def normalize(wtree):
        result = []
        for part in wtree[1:-1]:
            if isinstance(part, list):
                part = normalize(part)
                if part[0]=='':
                    #Move the part content back at current level
                    result += part[1:-1]
                    continue
            elif not part:
                #Remove empty strings
                continue
            result.append(part)
        if not result:
            result = ['']
        return [wtree[0]] + result + [wtree[-1]]

    return normalize(wtree)
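
# For instance, folding removes the empty-string delimited wrappers produced
# for literal runs (illustrative):
#
#   >>> normalize_wordtree(['', 'a', ['', 'b', ''], 'c', ''])
#   ['', 'a', 'b', 'c', '']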


def make_wordtree(token, here_document=False):
    """Parse a delimited token and return a tree similar to the ones returned
    by WordLexer. token may contain any combination of expansion/quoted fields
    and plain ones.
    """
    tree = ['']
    remaining = token
    delimiters = '\\$`'
    if not here_document:
        delimiters += '\'"'

    while 1:
        pos, sep = find_chars(remaining, delimiters)
        if pos==-1:
            tree += [remaining, '']
            return normalize_wordtree(tree)
        tree.append(remaining[:pos])
        remaining = remaining[pos:]

        try:
            result, remaining = WordLexer(heredoc = here_document).add(remaining, True)
        except NeedMore:
            raise ShellSyntaxError('Invalid token "%s"' % token)
        tree.append(result)
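
# For instance, a token mixing literal and quoted text (illustrative):
#
#   >>> make_wordtree('a"b"c')
#   ['', 'a', ['"', 'b', '"'], 'c', '']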


def wordtree_as_string(wtree):
    """Rewrite an expression tree generated by make_wordtree as a string."""
    def visit(node, output):
        for child in node:
            if isinstance(child, list):
                visit(child, output)
            else:
                output.append(child)

    output = []
    visit(wtree, output)
    return ''.join(output)
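
# wordtree_as_string() is the inverse of make_wordtree() for well-formed
# tokens (illustrative):
#
#   >>> wordtree_as_string(make_wordtree('a"b"c'))
#   'a"b"c'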


def unquote_wordtree(wtree):
    """Fold the word tree while removing quotes everywhere. Other expansion
    sequences are joined as such.
    """
    def unquote(wtree):
        unquoted = []
        if wtree[0] in ('', "'", '"', '\\'):
            wtree = wtree[1:-1]

        for part in wtree:
            if isinstance(part, list):
                part = unquote(part)
            unquoted.append(part)
        return ''.join(unquoted)

    return unquote(wtree)
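
# For instance, quote removal on the token used above (illustrative):
#
#   >>> unquote_wordtree(make_wordtree('a"b"c'))
#   'abc'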


class HereDocLexer:
    """HereDocLexer delimits the here-document content, from just after the
    starting newline (not included) up to the closing delimiter line
    (included).
    """
    def __init__(self, op, delim):
        assert op in ('<<', '<<-')
        if not delim:
            raise ShellSyntaxError('invalid here document delimiter %s' % str(delim))

        self._op = op
        self._delim = delim
        self._buffer = []
        self._token = []

    def add(self, data, eof):
        """If the here-document was delimited, return a tuple (content, remaining).
        Raise NeedMore() otherwise.
        """
        self._buffer += list(data)
        self._parse(eof)
        token = ''.join(self._token)
        remaining = ''.join(self._buffer)
        self._token, self._buffer = [], []
        return token, remaining

    def _parse(self, eof):
        while 1:
            #Look for first unescaped newline. Quotes may be ignored
            escaped = False
            for i,c in enumerate(self._buffer):
                if escaped:
                    escaped = False
                elif c=='\\':
                    escaped = True
                elif c=='\n':
                    break
            else:
                i = -1

            if i==-1 or self._buffer[i]!='\n':
                if not eof:
                    raise NeedMore()
                #No more data, maybe the last line is closing delimiter
                line = ''.join(self._buffer)
                eol = ''
                self._buffer[:] = []
            else:
                line = ''.join(self._buffer[:i])
                eol = self._buffer[i]
                self._buffer[:i+1] = []

            if self._op=='<<-':
                line = line.lstrip('\t')

            if line==self._delim:
                break

            self._token += [line, eol]
            if i==-1:
                break
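
# For instance, the delimiter line is consumed but excluded from the returned
# content, and trailing data is handed back (illustrative):
#
#   >>> HereDocLexer('<<', 'EOF').add('line1\nEOF\nrest', True)
#   ('line1\n', 'rest')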

class Token:
    #TODO: check this is still in use
    OPERATOR = 'OPERATOR'
    WORD = 'WORD'

    def __init__(self):
        self.value = ''
        self.type = None

    def __getitem__(self, key):
        #Behave like a two-element tuple
        if key==0:
            return self.type
        if key==1:
            return self.value
        raise IndexError(key)


class HereDoc:
    def __init__(self, op, name=None):
        self.op = op
        self.name = name
        self.pendings = []

TK_COMMA        = 'COMMA'
TK_AMPERSAND    = 'AMP'
TK_OP           = 'OP'
TK_TOKEN        = 'TOKEN'
TK_COMMENT      = 'COMMENT'
TK_NEWLINE      = 'NEWLINE'
TK_IONUMBER     = 'IO_NUMBER'
TK_ASSIGNMENT   = 'ASSIGNMENT_WORD'
TK_HERENAME     = 'HERENAME'

class Lexer:
    """Main lexer.

    Feed it data with add(); delimited tokens are delivered through the
    on_token() callback.
    """
    # Here-document handling makes the whole thing more complex because
    # here-documents basically force tokens to be reordered: here-content must
    # come right after the operator and the here-document name, while some
    # other tokens might be following the here-document expression on the same
    # line.
    #
    # So, here-doc states are basically:
    #   *self._state==ST_NORMAL
    #       - self._heredoc.op is None: no here-document
    #       - self._heredoc.op is not None but name is: here-document operator matched,
    #           waiting for the document name/delimiter
    #       - self._heredoc.op and name are not None: here-document is ready, following
    #           tokens are being stored and will be pushed again when the document is
    #           completely parsed.
    #   *self._state==ST_HEREDOC
    #       - The here-document is being delimited by self._herelexer. Once it is done
    #           the content is pushed in front of the pending token list then all these
    #           tokens are pushed once again.
    ST_NORMAL       = 'ST_NORMAL'
    ST_OP           = 'ST_OP'
    ST_BACKSLASH    = 'ST_BACKSLASH'
    ST_QUOTED       = 'ST_QUOTED'
    ST_COMMENT      = 'ST_COMMENT'
    ST_HEREDOC      = 'ST_HEREDOC'

    #Match end of backquote strings
    RE_BACKQUOTE_END = re.compile(r'(?<!\\)(`)')

    def __init__(self, parent_state = None):
        self._input = []
        self._pos = 0

        self._token = ''
        self._type = TK_TOKEN

        self._state = self.ST_NORMAL
        self._parent_state = parent_state
        self._wordlexer = None

        self._heredoc = HereDoc(None)
        self._herelexer = None

        ### Following attributes are not used for delimiting tokens and can safely
        ### be changed after here-document detection (see _push_token)

        # Count the number of tokens following a 'For' reserved word. Needed to
        # return an 'In' reserved word if it comes in third place.
        self._for_count = None

    def add(self, data, eof=False):
        """Feed the lexer with data.

        When eof is set to True, return unconsumed data, or raise if the lexer
        is in the middle of a delimiting operation.
        Raise NeedMore otherwise.
        """
        self._input += list(data)
        self._parse(eof)
        self._input[:self._pos] = []
        return ''.join(self._input)

    def _parse(self, eof):
        while self._state:
            if self._pos>=len(self._input):
                if not eof:
                    raise NeedMore()
                elif self._state not in (self.ST_OP, self.ST_QUOTED, self.ST_HEREDOC):
                    #Delimit the current token and leave cleanly
                    self._push_token('')
                    break
                else:
                    #Let the sublexer handle the eof itself
                    pass

            if self._state==self.ST_NORMAL:
                self._parse_normal()
            elif self._state==self.ST_COMMENT:
                self._parse_comment()
            elif self._state==self.ST_OP:
                self._parse_op(eof)
            elif self._state==self.ST_QUOTED:
                self._parse_quoted(eof)
            elif self._state==self.ST_HEREDOC:
                self._parse_heredoc(eof)
            else:
                assert False, "Unknown state " + str(self._state)

        if self._heredoc.op is not None:
            raise ShellSyntaxError('missing here-document delimiter')

    def _parse_normal(self):
        c = self._input[self._pos]
        if c=='\n':
            self._push_token(c)
            self._token = c
            self._type = TK_NEWLINE
            self._push_token('')
            self._pos += 1
        elif c in ('\\', '\'', '"', '`', '$'):
            self._state = self.ST_QUOTED
        elif is_partial_op(c):
            self._push_token(c)

            self._type = TK_OP
            self._token += c
            self._pos += 1
            self._state = self.ST_OP
        elif is_blank(c):
            self._push_token(c)

            #Discard blanks
            self._pos += 1
        elif self._token:
            self._token += c
            self._pos += 1
        elif c=='#':
            self._state = self.ST_COMMENT
            self._type = TK_COMMENT
            self._pos += 1
        else:
            self._pos += 1
            self._token += c

    def _parse_op(self, eof):
        assert self._token

        while 1:
            if self._pos>=len(self._input):
                if not eof:
                    raise NeedMore()
                c = ''
            else:
                c = self._input[self._pos]

            op = self._token + c
            if c and is_partial_op(op):
                #Still parsing an operator
                self._token = op
                self._pos += 1
            else:
                #End of operator
                self._push_token(c)
                self._state = self.ST_NORMAL
                break

    def _parse_comment(self):
        while 1:
            if self._pos>=len(self._input):
                raise NeedMore()

            c = self._input[self._pos]
            if c=='\n':
                #End of comment, do not consume the end of line
                self._state = self.ST_NORMAL
                break
            else:
                self._token += c
                self._pos += 1

    def _parse_quoted(self, eof):
        """Precondition: the starting backquote/dollar is still in the input queue."""
        if not self._wordlexer:
            self._wordlexer = WordLexer()

        if self._pos<len(self._input):
            #Transfer input queue characters into the subparser
            data = self._input[self._pos:]
            self._pos += len(data)
        else:
            #At eof, let the sublexer decide whether the expression is complete
            data = []

        wtree, remaining = self._wordlexer.add(data, eof)
        self._wordlexer = None
        self._token += wordtree_as_string(wtree)

        #Put unparsed characters back in the input queue
        if remaining:
            self._input[self._pos:self._pos] = list(remaining)
        self._state = self.ST_NORMAL

    def _parse_heredoc(self, eof):
        assert not self._token

        if self._herelexer is None:
            self._herelexer = HereDocLexer(self._heredoc.op, self._heredoc.name)

        if self._pos<len(self._input):
            #Transfer input queue characters into the subparser
            data = self._input[self._pos:]
            self._pos += len(data)
        else:
            data = []

        self._token, remaining = self._herelexer.add(data, eof)

        #Reset here-document state
        self._herelexer = None
        heredoc, self._heredoc = self._heredoc, HereDoc(None)
        if remaining:
            self._input[self._pos:self._pos] = list(remaining)
        self._state = self.ST_NORMAL

        #Push pending tokens
        heredoc.pendings[:0] = [(self._token, self._type, heredoc.name)]
        for token, type, delim in heredoc.pendings:
            self._token = token
            self._type = type
            self._push_token(delim)

    def _push_token(self, delim):
        if not self._token:
            return 0

        if self._heredoc.op is not None:
            if self._heredoc.name is None:
                #Here-document name
                if self._type!=TK_TOKEN:
                    raise ShellSyntaxError("expecting here-document name, got '%s'" % self._token)
                self._heredoc.name = unquote_wordtree(make_wordtree(self._token))
                self._type = TK_HERENAME
            else:
                #Capture all tokens until the newline starting the here-document
                if self._type==TK_NEWLINE:
                    assert self._state==self.ST_NORMAL
                    self._state = self.ST_HEREDOC

                self._heredoc.pendings.append((self._token, self._type, delim))
                self._token = ''
                self._type = TK_TOKEN
                return 1

        # BEWARE: do not change parser state from here to the end of the function:
        # when parsing between a here-document operator and the end of the line,
        # tokens are stored in self._heredoc.pendings. Therefore, they will not
        # reach the section below.

        #Check operators
        if self._type==TK_OP:
            #False positive because of partial op matching
            op = is_op(self._token)
            if not op:
                self._type = TK_TOKEN
            else:
                #Map to the specific operator
                self._type = op
                if self._token in ('<<', '<<-'):
                    #Done here rather than in _parse_op because there is no need
                    #to change the parser state since we are still waiting for
                    #the here-document name
                    if self._heredoc.op is not None:
                        raise ShellSyntaxError("syntax error near token '%s'" % self._token)
                    assert self._heredoc.op is None
                    self._heredoc.op = self._token

        if self._type==TK_TOKEN:
            if '=' in self._token and not delim:
                if self._token.startswith('='):
                    #Token is a WORD... a TOKEN that is.
                    pass
                else:
                    prev = self._token[:self._token.find('=')]
                    if is_name(prev):
                        self._type = TK_ASSIGNMENT
                    else:
                        #Just a token (unspecified)
                        pass
            else:
                reserved = get_reserved(self._token)
                if reserved is not None:
                    if reserved=='In' and self._for_count!=2:
                        #Sorry, not a reserved word after all
                        pass
                    else:
                        self._type = reserved
                        if reserved in ('For', 'Case'):
                            self._for_count = 0
                elif are_digits(self._token) and delim in ('<', '>'):
                    #Detect IO_NUMBER
                    self._type = TK_IONUMBER
                elif self._token==';':
                    self._type = TK_COMMA
                elif self._token=='&':
                    self._type = TK_AMPERSAND
        elif self._type==TK_COMMENT:
            #Comments are not part of sh grammar, ignore them
            self._token = ''
            self._type = TK_TOKEN
            return 0

        if self._for_count is not None:
            #Track token count in 'For' expression to detect 'In' reserved words.
            #Can only be in third position, no need to go beyond
            self._for_count += 1
            if self._for_count==3:
                self._for_count = None

        self.on_token((self._token, self._type))
        self._token = ''
        self._type = TK_TOKEN
        return 1

    def on_token(self, token):
        raise NotImplementedError
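
# Lexer is meant to be subclassed: on_token() receives each (value, type)
# pair as soon as it is delimited. A minimal sketch, using a hypothetical
# collecting subclass (illustrative, hand-traced from the code above):
#
#   class CollectingLexer(Lexer):
#       def __init__(self):
#           Lexer.__init__(self)
#           self.collected = []
#
#       def on_token(self, token):
#           self.collected.append(token)
#
#   >>> lexer = CollectingLexer()
#   >>> lexer.add('echo hi\n', eof=True)
#   ''
#   >>> lexer.collected
#   [('echo', 'TOKEN'), ('hi', 'TOKEN'), ('\n', 'NEWLINE')]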


tokens = [
    TK_TOKEN,
# To silence yacc unused token warnings
#    TK_COMMENT,
    TK_NEWLINE,
    TK_IONUMBER,
    TK_ASSIGNMENT,
    TK_HERENAME,
]

#Add specific operators
tokens += _OPERATORS.values()
#Add reserved words
tokens += _RESERVEDS.values()

class PLYLexer(Lexer):
    """Bridge Lexer and PLY lexer interface."""
    def __init__(self):
        Lexer.__init__(self)
        self._tokens = []
        self._current = 0
        self.lineno = 0

    def on_token(self, token):
        value, type = token

        self.lineno = 0
        t = lex.LexToken()
        t.value = value
        t.type = type
        t.lexer = self
        t.lexpos = 0
        t.lineno = 0

        self._tokens.append(t)

    def is_empty(self):
        return not bool(self._tokens)

    #PLY compliant interface
    def token(self):
        if self._current>=len(self._tokens):
            return None
        t = self._tokens[self._current]
        self._current += 1
        return t


def get_tokens(s):
    """Parse the input string and return a tuple (tokens, unprocessed) where
    tokens is a list of parsed tokens and unprocessed is the part of the input
    string left untouched by the lexer.
    """
    lexer = PLYLexer()
    untouched = lexer.add(s, True)
    tokens = []
    while 1:
        token = lexer.token()
        if token is None:
            break
        tokens.append(token)

    tokens = [(t.value, t.type) for t in tokens]
    return tokens, untouched
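
# For instance, operators are mapped to their grammar identifiers while other
# words stay plain TOKENs (illustrative, hand-traced from the code above):
#
#   >>> get_tokens('echo hello; ls')
#   ([('echo', 'TOKEN'), ('hello', 'TOKEN'), (';', 'COMMA'), ('ls', 'TOKEN')], '')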
884