Source code for penman.lexer

# -*- coding: utf-8 -*-

"""
Classes and functions for lexing PENMAN strings.
"""

from typing import Union, Iterable, Iterator, NamedTuple, Pattern
import re
import logging

from penman.exceptions import DecodeError


logger = logging.getLogger(__name__)


# These are the regex patterns for lexing. They must not contain any
# capturing groups of their own, as _compile() wraps each one in a
# named group and token types are later checked by group name.
PATTERNS = {
    'COMMENT':    r'\#.*$',
    'STRING':     r'"[^"\\]*(?:\\.[^"\\]*)*"',
    'FLOAT':      r'''
      [-+]?
      (?:
        (?:(?:\d+\.\d*|\.\d+)  # .1   | 1.2
           (?:[eE][-+]?\d+)?)  # .1e2 | 1.2e3
       |\d+[eE][-+]?\d+        # 1e2
      )''',
    'INTEGER':    r'[-+]?\d+(?=[ )/:])',
    # ROLE cannot be made up of COLON + SYMBOL because it then becomes
    # difficult to detect anonymous roles: (a : b) vs (a :b c)
    'ROLE':       r':[^\s()\/,:~]*',
    'SYMBOL':     r'[^\s()\/,:~]+',
    'ALIGNMENT':  r'~(?:[a-zA-Z]\.?)?\d+(?:,\d+)*',
    'LPAREN':     r'\(',
    'RPAREN':     r'\)',
    'SLASH':      r'\/',  # concept (node label) role
    'COMMA':      r',',   # used in triple conjunctions
    'CARET':      r'\^',  # used in triple conjunctions
    'UNEXPECTED': r'[^\s]'
}
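
# An illustrative sketch of the trickier patterns above (not part of the
# original module): INTEGER uses a lookahead so a number only lexes as an
# INTEGER when a delimiter follows; otherwise it falls through to SYMBOL.
#
#     >>> re.match(PATTERNS['INTEGER'], '42)', flags=re.VERBOSE).group()
#     '42'
#     >>> re.match(PATTERNS['INTEGER'], '42x', flags=re.VERBOSE) is None
#     True
#     >>> re.fullmatch(PATTERNS['ALIGNMENT'], '~e.1,2', flags=re.VERBOSE).group()
#     '~e.1,2'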


def _compile(*names: str) -> Pattern[str]:
    pat = '\n|'.join('(?P<{}>{})'.format(name, PATTERNS[name])
                     for name in names)
    return re.compile(pat, flags=re.VERBOSE)
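
# A quick check of what _compile() builds (illustrative, not part of the
# original module): a single alternation of named groups, so the token
# type of a match is recoverable from ``lastgroup``.
#
#     >>> _compile('ROLE', 'SYMBOL').match(':op1 x').lastgroup
#     'ROLE'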


# The order matters in these pattern lists as more permissive patterns
# can short-circuit stricter patterns.
PENMAN_RE = _compile('COMMENT',
                     'STRING', 'FLOAT', 'INTEGER',
                     'LPAREN', 'RPAREN', 'SLASH',
                     'ALIGNMENT', 'ROLE', 'SYMBOL',
                     'UNEXPECTED')
TRIPLE_RE = _compile('COMMENT',
                     'STRING', 'FLOAT', 'INTEGER',
                     'LPAREN', 'RPAREN', 'COMMA', 'CARET',
                     'SYMBOL',
                     'UNEXPECTED')
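
# For instance (an illustrative check, not part of the original module),
# FLOAT is listed before SYMBOL because the more permissive SYMBOL
# pattern would otherwise consume numbers like '1.5':
#
#     >>> m = PENMAN_RE.match('1.5')
#     >>> (m.lastgroup, m.group())
#     ('FLOAT', '1.5')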


class Token(NamedTuple):
    """
    A lexed token.
    """
    type: str    #: The token type.
    text: str    #: The matched string for the token.
    lineno: int  #: The line number the token appears on.
    offset: int  #: The character offset of the token.
    line: str    #: The line the token appears in.

    @property
    def value(self):
        if self.type == 'INTEGER':
            return int(self.text)
        elif self.type == 'FLOAT':
            return float(self.text)
        else:
            return self.text
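
# An illustrative check of Token.value (a sketch, not part of the
# original module): INTEGER and FLOAT tokens are converted to Python
# numbers; anything else is returned as the matched text.
#
#     >>> Token('INTEGER', '42', 1, 0, '42 ').value
#     42
#     >>> Token('SYMBOL', 'alpha', 1, 0, 'alpha').value
#     'alpha'
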

class TokenIterator(Iterator[Token]):
    """
    An iterator of Tokens with L1 lookahead.
    """

    def __init__(self, iterator):
        try:
            self._next = next(iterator)
        except StopIteration:
            self._next = None
        self._last = None
        self.iterator = iterator

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def __bool__(self):
        return self._next is not None

    def peek(self) -> Token:
        """
        Return the next token but do not advance the iterator.

        If the iterator is exhausted then a :exc:`DecodeError` is
        raised.
        """
        if self._next is None:
            raise self.error('Unexpected end of input')
        return self._next

    def next(self) -> Token:
        """
        Advance the iterator and return the next token.

        Raises:
            :exc:`StopIteration`: if the iterator is already exhausted.
        """
        current = self._next
        try:
            self._next = next(self.iterator)
        except StopIteration:
            if current is None:
                raise
            self._next = None
        self._last = current
        return current

    def expect(self, *choices):
        """
        Return the next token if its type is in *choices*.

        The iterator is advanced if successful.

        Raises:
            :exc:`DecodeError`: if the next token type is not in
                *choices*
        """
        try:
            token = self.next()
        except StopIteration:
            raise self.error('Unexpected end of input')
        if token.type not in choices:
            raise self.error('Expected: {}'.format(', '.join(choices)),
                             token=token)
        return token

    def accept(self, *choices):
        """
        Return the next token if its type is in *choices*.

        The iterator is advanced if successful. If unsuccessful, `None`
        is returned.
        """
        if self._next is not None and self._next.type in choices:
            return self.next()
        return None

    def error(self, message: str, token=None) -> DecodeError:
        if token is None:
            type = line = None
            if self._last is not None:
                lineno = self._last.lineno
                offset = self._last.offset + len(self._last.text)
                line = self._last.line
            else:
                lineno = offset = 0
        else:
            type, _, lineno, offset, line = token
        return DecodeError(message, lineno=lineno, offset=offset, text=line)
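
# A minimal sketch of the lookahead API (illustrative, not part of the
# original module), using a hand-built token stream:
#
#     >>> ts = TokenIterator(iter([Token('SYMBOL', 'a', 1, 0, 'a')]))
#     >>> ts.peek().text             # inspect without consuming
#     'a'
#     >>> ts.accept('ROLE') is None  # wrong type: no advance, no error
#     True
#     >>> ts.expect('SYMBOL').text   # right type: consume and return
#     'a'
#     >>> bool(ts)                   # now exhausted
#     False
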

def lex(lines: Union[Iterable[str], str],
        pattern: Union[Pattern[str], str, None] = None) -> TokenIterator:
    """
    Yield PENMAN tokens matched in *lines*.

    By default, this lexes strings in *lines* using the basic pattern
    for PENMAN graphs. If *pattern* is given, it is used for lexing
    instead.

    Args:
        lines: iterable of lines to lex
        pattern: pattern to use for lexing instead of the default ones
    Returns:
        A :class:`TokenIterator` object
    """
    if isinstance(lines, str):
        lines = lines.splitlines()
    if pattern is not None:
        if isinstance(pattern, str):
            regex = re.compile(pattern, flags=re.VERBOSE)
        else:
            regex = pattern
    else:
        regex = PENMAN_RE
    tokens = _lex(lines, regex)
    return TokenIterator(tokens)
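
# Example usage (a sketch, not part of the original module):
#
#     >>> [(t.type, t.text) for t in lex('(a / alpha~1)')]
#     [('LPAREN', '('), ('SYMBOL', 'a'), ('SLASH', '/'),
#      ('SYMBOL', 'alpha'), ('ALIGNMENT', '~1'), ('RPAREN', ')')]
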

def _lex(lines: Iterable[str], regex: Pattern[str]) -> Iterator[Token]:
    for i, line in enumerate(lines, 1):
        logger.debug('Line %d: %r', i, line)
        matches = list(regex.finditer(line))
        tokens = []
        for m in matches:
            if m.lastgroup is None:
                raise ValueError(
                    'Lexer pattern generated a match without a named '
                    'capturing group:\n{}'.format(regex.pattern))
            tokens.append(Token(m.lastgroup, m.group(), i, m.start(), line))
        if logger.isEnabledFor(logging.DEBUG):
            for token in tokens:
                logger.debug(token)
        yield from tokens
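
# And with the alternative triple pattern, via lex()'s *pattern*
# argument (illustrative, not part of the original module):
#
#     >>> [t.text for t in lex('instance(a, alpha)', pattern=TRIPLE_RE)]
#     ['instance', '(', 'a', ',', 'alpha', ')']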