# -*- coding: utf-8 -*-
"""
Classes and functions for lexing PENMAN strings.
"""
from typing import Union, Iterable, Iterator, NamedTuple, Pattern
import re
import logging
from penman.exceptions import DecodeError
# Module-level logger for lexing debug output.
logger = logging.getLogger(__name__)
# These are the regex patterns for parsing. They must not have any
# capturing groups. They are used during lexing and will be
# checked by name during parsing.
PATTERNS = {
'COMMENT': r'\#.*$',
'STRING': r'"[^"\\]*(?:\\.[^"\\]*)*"',
'FLOAT': r'''
[-+]?
(?:
(?:(?:\d+\.\d*|\.\d+) # .1 | 1.2
(?:[eE][-+]?\d+)?) # .1e2 | 1.2e3
|\d+[eE][-+]?\d+ # 1e2
)''',
'INTEGER': r'[-+]?\d+(?=[ )/:])',
# ROLE cannot be made up of COLON + SYMBOL because it then becomes
# difficult to detect anonymous roles: (a : b) vs (a :b c)
'ROLE': r':[^\s()\/,:~]*',
'SYMBOL': r'[^\s()\/,:~]+',
'ALIGNMENT': r'~(?:[a-zA-Z]\.?)?\d+(?:,\d+)*',
'LPAREN': r'\(',
'RPAREN': r'\)',
'SLASH': r'\/', # concept (node label) role
'COMMA': r',', # used in triple conjunctions
'CARET': r'\^', # used in triple conjunctions
'UNEXPECTED': r'[^\s]'
}
def _compile(*names: str) -> Pattern[str]:
    """Compile a verbose-mode regex of alternated named groups.

    Each name in *names* becomes a named capturing group wrapping the
    corresponding entry in :data:`PATTERNS`; match order follows the
    argument order.
    """
    groups = ['(?P<{}>{})'.format(name, PATTERNS[name]) for name in names]
    return re.compile('\n|'.join(groups), flags=re.VERBOSE)
# The order matters in these pattern lists as more permissive patterns
# can short-circuit stricter patterns.
# Lexer pattern for PENMAN graph notation.
PENMAN_RE = _compile('COMMENT',
                     'STRING', 'FLOAT', 'INTEGER',
                     'LPAREN', 'RPAREN', 'SLASH',
                     'ALIGNMENT', 'ROLE', 'SYMBOL',
                     'UNEXPECTED')
# Lexer pattern for triple conjunctions (uses COMMA and CARET, no roles).
TRIPLE_RE = _compile('COMMENT',
                     'STRING', 'FLOAT', 'INTEGER',
                     'LPAREN', 'RPAREN', 'COMMA', 'CARET',
                     'SYMBOL',
                     'UNEXPECTED')
class Token(NamedTuple):
    """
    A lexed token.

    Position information (line number, offset, and the full line text)
    is kept so that errors can point at exactly where a token occurred.
    """
    type: str    #: The token type (a key of PATTERNS, e.g. 'SYMBOL').
    text: str    #: The matched string for the token.
    lineno: int  #: The line number the token appears on.
    offset: int  #: The character offset of the token within its line.
    line: str    #: The line the token appears in.

    @property
    def value(self) -> Union[str, int, float]:
        """Return the token text cast to its Python value.

        INTEGER and FLOAT tokens are converted with :func:`int` and
        :func:`float`, respectively; all other types return the raw
        matched text.
        """
        if self.type == 'INTEGER':
            return int(self.text)
        elif self.type == 'FLOAT':
            return float(self.text)
        else:
            return self.text
class TokenIterator(Iterator[Token]):
    """
    An iterator of Tokens with 1-token lookahead.

    The lookahead slot (``_next``) is primed at construction; a value
    of `None` there means the underlying iterator is exhausted. The
    last token returned is kept (``_last``) for error positioning.
    """

    def __init__(self, iterator):
        # Prime the lookahead; an empty input leaves _next as None.
        try:
            self._next = next(iterator)
        except StopIteration:
            self._next = None
        self._last = None
        self.iterator = iterator

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def __bool__(self):
        # Truthy while at least one more token is available.
        return self._next is not None

    def peek(self) -> Token:
        """
        Return the next token but do not advance the iterator.

        If the iterator is exhausted then a :exc:`DecodeError` is
        raised.
        """
        if self._next is None:
            raise self.error('Unexpected end of input')
        return self._next

    def next(self) -> Token:
        """
        Advance the iterator and return the next token.

        Raises:
            :exc:`StopIteration`: if the iterator is already
                exhausted.
        """
        current = self._next
        try:
            self._next = next(self.iterator)
        except StopIteration:
            # Only propagate StopIteration when nothing is buffered;
            # otherwise return the buffered token one last time.
            if current is None:
                raise
            self._next = None
        self._last = current
        return current

    def expect(self, *choices):
        """
        Return the next token if its type is in *choices*.

        The iterator is advanced if successful.

        Raises:
            :exc:`DecodeError`: if the next token type is not in
                *choices*
        """
        try:
            token = self.next()
        except StopIteration:
            raise self.error('Unexpected end of input')
        if token.type not in choices:
            raise self.error('Expected: {}'.format(', '.join(choices)),
                             token=token)
        return token

    def accept(self, *choices):
        """
        Return the next token if its type is in *choices*.

        The iterator is advanced if successful. If unsuccessful,
        `None` is returned.
        """
        if self._next is not None and self._next.type in choices:
            return self.next()
        return None

    def error(self, message: str, token=None) -> DecodeError:
        """Build (but do not raise) a :exc:`DecodeError` for *message*.

        When *token* is given, the error points at that token's
        position; otherwise it points just past the last token
        returned, or at position (0, 0) if nothing was returned yet.
        """
        if token is None:
            line = None
            if self._last is not None:
                lineno = self._last.lineno
                offset = self._last.offset + len(self._last.text)
                line = self._last.line
            else:
                lineno = offset = 0
        else:
            # Unpack the Token fields; type and text are not needed.
            _, _, lineno, offset, line = token
        return DecodeError(message, lineno=lineno, offset=offset, text=line)
def lex(lines: Union[Iterable[str], str],
        pattern: Union[Pattern[str], str, None] = None) -> TokenIterator:
    """
    Yield PENMAN tokens matched in *lines*.

    By default, this lexes strings in *lines* using the basic pattern
    for PENMAN graphs. If *pattern* is given, it is used for lexing
    instead.

    Args:
        lines: iterable of lines to lex, or a single string which is
            split into lines
        pattern: compiled pattern or pattern string to use for lexing
            instead of the default ones
    Returns:
        A :class:`TokenIterator` object
    """
    if isinstance(lines, str):
        lines = lines.splitlines()
    if pattern is not None:
        if isinstance(pattern, str):
            # Compile string patterns in verbose mode, matching how the
            # module's own patterns are written.
            regex = re.compile(pattern, flags=re.VERBOSE)
        else:
            regex = pattern
    else:
        regex = PENMAN_RE
    tokens = _lex(lines, regex)
    return TokenIterator(tokens)
def _lex(lines: Iterable[str], regex: Pattern[str]) -> Iterator[Token]:
    """Yield a Token for each match of *regex* in each line of *lines*.

    Line numbers start at 1. Raises :exc:`ValueError` if *regex*
    produces a match outside of a named group, since token types are
    taken from group names.
    """
    for lineno, line in enumerate(lines, 1):
        logger.debug('Line %d: %r', lineno, line)
        line_tokens = []
        for match in regex.finditer(line):
            if match.lastgroup is None:
                raise ValueError(
                    'Lexer pattern generated a match without a named '
                    'capturing group:\n{}'.format(regex.pattern))
            line_tokens.append(
                Token(match.lastgroup, match.group(), lineno,
                      match.start(), line))
        if logger.isEnabledFor(logging.DEBUG):
            for tok in line_tokens:
                logger.debug(tok)
        yield from line_tokens