Source code for vermouth.parser_utils

# -*- coding: utf-8 -*-
# Copyright 2018 University of Groningen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Helper functions for parsers
"""
from collections import deque

# This file contains helper methods and infrastructure for parsers. The class
# SectionLineParser in particular is a powerful tool that is intended to make
# parsing section-based files easier. There's a fair chance it turned out to be
# a multi-tentacled Lovecraftian horror that's in charge of a slightly magical
# switchboard. If it ever breaks I'm very sorry for you. Pray to your deity of
# choice and prepare your sacrificial chicken.



[docs]
class SectionParser(type):
    """
    Metaclass that fills in the `METH_DICT` attribute of every class it
    creates.

    Each attribute of the new class (inherited ones included) is inspected for
    a `_section_names` mapping. For every ``names -> kwargs`` entry found,
    ``METH_DICT[names]`` is set to ``(attribute, kwargs)``. The
    :meth:`section_parser` decorator is the convenient way of attaching
    `_section_names` to a method.
    """
    def __new__(mcs, name, bases, attrs, **kwargs):
        cls = super().__new__(mcs, name, bases, attrs, **kwargs)
        # Every class gets its own, freshly built registry; it is never shared
        # with a parent class.
        registry = {}
        cls.METH_DICT = registry

        # dir() also lists inherited attributes, so parsers registered on base
        # classes are picked up here as well.
        for member_name in dir(cls):
            member = getattr(cls, member_name)
            if not hasattr(member, '_section_names'):
                continue
            for section, call_kwargs in member._section_names.items():
                registry[section] = (member, call_kwargs)
        return cls


[docs]
    @staticmethod
    def section_parser(*names, **kwargs):
        """
        Parameters
        ----------
        names: tuple[collections.abc.Hashable]
            The section names that should be associated with the decorated
            function.
        kwargs: dict[str]
            The keyword arguments with which the decorated function should be
            called.
        """
        def wrapper(method):
            if not hasattr(method, '_section_names'):
                method._section_names = {}
            method._section_names[names] = kwargs
            return method
        return wrapper





[docs]
class LineParser:
    """
    Class that describes a parser object that parses a file line by line.
    Subclasses will probably want to override the methods :meth:`dispatch`,
    :meth:`parse_line`, and/or :meth:`finalize`:

      - :meth:`dispatch` is called for every line that is not empty once its
        comment has been stripped, and should return the function that should
        be used to parse that line.
      - :meth:`parse_line` is the function returned by the default
        implementation of :meth:`dispatch`, so by default it is called for
        every such line.
      - :meth:`finalize` is called at the end of the file.
    """
    # Character that starts a comment. :meth:`parse` hands it to
    # ``split_comments`` to strip the comment from each line before parsing.
    COMMENT_CHAR = '#'


[docs]
    def parse(self, file_handle):
        """
        Parse the lines produced by `file_handle`.

        Every line is stripped of its comment and surrounding whitespace;
        lines that end up empty are skipped. For the remaining lines,
        :meth:`dispatch` selects the parsing method, which is then called with
        the stripped line and its 1-based line number. Results that are not
        `None` are yielded. Once the input is exhausted, :meth:`finalize` is
        called with the last line number seen (0 for empty input) and its
        result is yielded too, iff it is not `None`.

        Parameters
        ----------
        file_handle: collections.abc.Iterable[str]
            The data to parse. Should produce lines of data.

        Yields
        ------
        object
            The results of dispatching to parsing methods, and of
            :meth:`finalize`.
        """
        last_lineno = 0
        for last_lineno, raw_line in enumerate(file_handle, start=1):
            content, _comment = split_comments(raw_line, self.COMMENT_CHAR)
            if content:
                handler = self.dispatch(content)
                parsed = handler(content, last_lineno)
                if parsed is not None:
                    yield parsed

        final = self.finalize(last_lineno)
        if final is not None:
            yield final



[docs]
    def finalize(self, lineno=0):
        """
        Wraps up. Is called at the end of the file.
        """
        return



[docs]
    def dispatch(self, line):
        """
        Finds the correct method to parse `line`. Always returns
        :meth:`parse_line`.
        """
        return self.parse_line



[docs]
    def parse_line(self, line, lineno):
        """
        Does nothing and should be overridden by subclasses.
        """
        return





[docs]
class SectionLineParser(LineParser, metaclass=SectionParser):
    """
    Baseclass for all parsers that have to parse file formats that are based on
    sections. Parses the `macros` section.
    Subclasses will probably want to override :meth:`finalize` and/or
    :meth:`finalize_section`.

    :meth:`finalize_section` is called with the previous section whenever a
    section ends.

    Attributes
    ----------
    section: list[str]
        The current section. Set to ``None`` once :meth:`finalize` has run.
    macros: dict[str, str]
        A set of substitution rules as parsed from a `macros` section.
    """
    # Placeholder only: the SectionParser metaclass assigns a fresh dict to
    # every class it creates, keyed by tuples of section names.
    METH_DICT = {}
    """
    A dict of all known parser methods, mapping section names to the
    function to be called and the associated keyword arguments.
    """

    def __init__(self, *args, **kwargs):
        self.macros = {}
        self.section = []
        super().__init__(*args, **kwargs)


[docs]
    def dispatch(self, line):
        """
        Looks at `line` to see what kind of line it is, and returns either
        :meth:`parse_header` if `line` is a section header or
        :meth:`parse_section` otherwise. Calls :meth:`is_section_header` to see
        whether `line` is a section header or not.

        Parameters
        ----------
        line: str

        Returns
        -------
        collections.abc.Callable
            The method that should be used to parse `line`.
        """
        if self.is_section_header(line):
            return self.parse_header
        else:
            return self.parse_section



[docs]
    def finalize(self, lineno=0):
        """
        Called after the last line has been parsed to wrap up. Resets
        the instance and calls :meth:`finalize_section`.

        Arguments
        ---------
        lineno: int
            The line number.
        """
        prev_section = self.section
        self.section = []
        result = self.finalize_section(prev_section, prev_section)
        self.macros = {}
        self.section = None
        return result



[docs]
    def finalize_section(self, previous_section, ended_section):
        """
        Called once a section is finished. Currently does nothing.

        Arguments
        ---------
        previous_section: list[str]
            The last parsed section.
        ended_section: list[str]
            The sections that have been ended.
        """
        return



[docs]
    def parse_section(self, line, lineno):
        """
        Parse `line`, found at line number `lineno`, with the method that is
        registered in :attr:`METH_DICT` for the current section. Macros are
        substituted in `line` before the lookup is done.

        Parameters
        ----------
        line: str
        lineno: int

        Returns
        -------
        object
            The result returned by calling the registered method.

        Raises
        ------
        IOError
            If no method is registered for the current section, or if the
            registered method raises an exception (which is chained as the
            cause).
        """
        line = _substitute_macros(line, self.macros)
        section_key = tuple(self.section)
        if section_key not in self.METH_DICT:
            unknown_msg = ("Can't parse line {} in section '{}' because the "
                           "section is unknown")
            raise IOError(unknown_msg.format(lineno, self.section))
        try:
            method, kwargs = self.METH_DICT[section_key]
            # METH_DICT holds plain functions, hence the explicit `self`.
            return method(self, line, lineno, **kwargs)
        except Exception as error:
            failed_msg = ("Problems parsing line {}. I think it should be a "
                          "'{}' line, but I can't parse it as such.")
            raise IOError(failed_msg.format(lineno, self.section)) from error



[docs]
    def parse_header(self, line, lineno=0):
        """
        Parses a section header with line number `lineno`. Sets :attr:`section`
        when applicable. Does not check whether `line` is a valid section
        header.

        Parameters
        ----------
        line: str
        lineno: str

        Returns
        -------
        object
            The result of calling :meth:`finalize_section`, which is called
            if a section ends.

        Raises
        ------
        KeyError
            If the section header is unknown.
        """
        prev_section = self.section

        section = self.section + [line.strip('[ ]').casefold()]

        ended = []
        while tuple(section) not in self.METH_DICT and len(section) > 1:
            ended.append(section.pop(-2))  # [a, b, c, d] -> [a, b, d]

        self.section = section
        if prev_section:
            result = self.finalize_section(prev_section, ended)
            return result



[docs]
    @staticmethod
    def is_section_header(line):
        """
        Parameters
        ----------
        line: str
            A line of text.

        Returns
        -------
        bool
            ``True`` iff `line` is a section header.

        Raises
        ------
        IOError
            The line starts like a section header but looks misformatted.
        """
        if line.startswith('['):
            if line.endswith(']'):
                return True
            else:
                raise IOError('Section header looks misformatted.')
        return False


    @SectionParser.section_parser('macros')
    def _macros(self, line, lineno=0):
        """
        Parse one line of a "macros" section: tokenize it and store the
        resulting definition in :attr:`macros`.

        Parameters
        ----------
        line: str
        lineno: int
            The line number. Unused here.
        """
        _parse_macro(deque(_tokenize(line)), self.macros)




[docs]
def split_comments(line, comment_char=';'):
    """
    Split `line` at the first occurrence of `comment_char`.

    Parameters
    ----------
    line: str
    comment_char: str

    Returns
    -------
    tuple[str, str]
        The parts of `line` before and after `comment_char`, respectively,
        both stripped of surrounding whitespace. If `line` does not contain
        `comment_char`, the second element will be an empty string.
    """
    data, *remainder = line.split(comment_char, 1)
    comment = remainder[0].strip() if remainder else ''
    return data.strip(), comment



def _tokenize(line):
    """
    Split an interaction line into its elementary components.

    An interaction line is any uncommented and non empty line that follows a
    section header about an interaction type. Such a line is composed of the
    following parts:

    * a list of atoms involved in the interaction,
    * an optional delimiter that indicates the end of the atom list,
    * a list of parameters for the interaction.

    The list of atoms is *a minima* a list of atom references. In blocks, these
    references can be atom 1-based indices referring to the order of the atoms
    in the "[ atoms ]" section. It is however more readable, and more robust,
    to refer to atoms by their name. Only the reference by name is allowed in
    links, as links may not have a full "[ atoms ]" section. In links, each
    atom reference can be complemented by atom attributes to specify the  scope
    of the link. These attribute follow the atom reference and are formatted
    like a python dictionary.

    The end-of-atoms delimiter is useful for interaction types that are not
    explicitly encoded in the parser. It allows to indicate when the list of
    atoms ends, and where the list of parameters starts. Two dashes ("--") are
    used as the delimiter. The delimiter is optional for the interaction types
    that are explicitly encoded in the parser and that refer to a fixed number
    of atoms.

    The list of parameters will be copied as-is in an ITP file.

    In its simplest form, an interaction line is what is used in an ITP file.
    Here is an example for a bond:

        2  3  1  0.2  1000

    The two first numbers refer to the second and third atoms of the block,
    respectively. The next three values are the parameters for a bond (*i.e.*
    the function type, the equilibrium distance, and the force constant).

    The two first numbers could be replaced by the corresponding atom names:

        PO4  GL1  1 0.2  1000

    where "PO4" and "GL1" are the names of the second and third atoms of the
    block.

    Optionally, the "--" delimiter can be used after the list of atoms:

        PO4  GL1  --  1 0.2  1000

    If the line is part of a link, then the atom selection may be limited in
    scope. Atom attributes is how to implement such scope limitation:

        BB {'resname': 'ALA', 'secstruc': 'H'} BB {'resname': 'LYS', 'secstruc': 'H', 'order': +1} 1 0.2 1000

    Here, we add a bond to the current link. At one end of the bond is the atom
    named "BB" and annotated as part of an alpha helix ('secstruc': 'H') of a
    residue called "ALA". On the other end of the link is an other
    atom named "BB" that is part of an alpha helix, but that is part of the
    next residue ('order': +1) if this next residue is named "LYS".

    The order parameter has a shortcut in the form of a + or - prefix to the
    atom reference name. Then, "+ATOM" refers to "ATOM" in the next residue,
    and is equivalent to "ATOM {'order': +1}"; "-ATOM" refers to the previous
    residue. There can be multiple + or -, "++ATOM" is equivalent to "ATOM
    {'order': +2}".

    When using attributes, the optional delimiter can increase the readability:

        BB {'resname': 'ALA', 'secstruc': 'H'} +BB {'resname': 'LYS', 'secstruc': 'H'} -- 1 0.2 1000

    Tokens on an interaction line are its different elements. These elements
    are considered as one token each: am atom reference, a set of atom
    attributes, the optional delimiter, each space-separated element of the
    parameter list. The line above splits into the following tokens:

    * ``BB``
    * ``{'resname': 'ALA', 'secstruc': 'H'}``
    * ``+BB``
    * ``{'resname': 'LYS', 'secstruc': 'H'}``
    * ``--``
    * ``1``
    * ``0.2``
    * ``1000``

    Atom attributes can be written next to the previous or the next token
    without an explicit separator. The two following lines yield the same three
    tokens:

        ATOM1{attributes}ATOM2
        ATOM1 {attributes} ATOM2

    Parameters
    ----------
    line: str

    Returns
    -------
    list of str
    """
    separators = ' \t\n'
    tokens = []
    start = 0
    end = -1
    # Find the first non-separator character
    for start, char in enumerate(line):
        if char not in separators:
            break

    # Find the tokens. This has to be a while-loop because we cannot predict
    # what will be the next value of start.
    while start < len(line):
        end = start
        # We count the brackets because if a token starts with an opening
        # bracket, we want to end it with the *matching* closing bracket.
        # Note also that we do not yet implement a way to escape a bracket, nor
        # do we check if the bracket is not part of a string.
        brackets = 0
        for end, end_char in enumerate(line[start:], start=start):
            if end_char == '{':
                # We reached an opening bracket. If it is the first character
                # of the token or if we are already engaged in a bracketized
                # token, then we go on. But if the current token was not
                # a bracketized token, it means we are at the beginning of
                # a new token, so we treat the opening bracket as a separator.
                if not brackets and end != start:
                    end -= 1
                    break
                brackets += 1
            elif end_char == '}':
                brackets -= 1
                if not brackets:
                    break
            elif end_char in separators:
                if not brackets:
                    # We reached a separator. We do not want the separator to
                    # be included in the token, so we push the end by one
                    # character to the left.
                    end -= 1
                    break
        if brackets > 0:
            msg = 'Unexpected end of line. A closing bracket is missing.'
            raise IOError(msg)
        elif brackets < 0:
            msg = 'An opening bracket is missing.'
            raise IOError(msg)

        token = line[start:end + 1]
        if token:
            tokens.append(token)

        # Find the beginning of the next token.
        start = end + 1
        while start < len(line) and line[start] in separators:
            start += 1
    return tokens


def _substitute_macros(line, macros):
    r"""
    Substitute macros by their content.

    A macro starts with a '$' and ends with one amongst ' ${}\n\t"'.

    Parameters
    ----------
    line: str
        The line to fix.
    macros: dict[str, str]
        Keys are macro names, values are the replacement content.

    Returns
    -------
    str
    """
    start = None
    while True:  # stops when start < 0
        start = line.find('$', start)
        if start < 0:
            break
        for end, char in enumerate(line[start + 1:], start=start + 1):
            if char in ' \t\n{}$"':
                break
        else: # no break
            end += 1
        macro_name = line[start + 1:end]
        macro_value = macros[macro_name]
        line = line[:start] + macro_value + line[end:]
        end = start + len(macro_value)
    return line


def _parse_macro(tokens, macros):
    if len(tokens) > 2:
        raise IOError('Unexpected column in macro definition.')
    elif len(tokens) < 2:
        raise IOError('Missing column in macro definition.')
    macro_name = tokens.popleft()
    macro_value = tokens.popleft()
    macros[macro_name] = macro_value
Source code for vermouth.parser_utils

VerMoUTH

Navigation

Related Topics