Source code for vermouth.pdb.pdb

# -*- coding: utf-8 -*-
# Copyright 2018 University of Groningen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Provides functions for reading and writing PDB files.
"""

import numpy as np
import networkx as nx

from ..file_writer import deferred_open
from ..molecule import Molecule
from ..utils import first_alpha, distance, format_atom_string
from ..parser_utils import LineParser
from ..truncating_formatter import TruncFormatter
from ..log_helpers import StyleAdapter, get_logger

LOGGER = StyleAdapter(get_logger(__name__))



[docs]
class PDBParser(LineParser):
    """
    Parser for PDB files

    Attributes
    ----------
    active_molecule: vermouth.molecule.Molecule
        The molecule/model currently being read.
    molecules: list[vermouth.molecule.Molecule]
        All complete molecules read so far.
    modelidx: int
        Which model to take.

    Parameters
    ----------
    exclude: collections.abc.Container[str]
        Container of residue names. Any atom that has a residue name that is in
        `exclude` will be skipped.
    ignh: bool
        Whether all hydrogen atoms should be skipped
    modelidx: int
        Which model to take.
    """

    def __init__(self, exclude=('SOL',), ignh=False, modelidx=1):
        self.active_molecule = Molecule()
        self.molecules = []
        self._conects = []
        self.exclude = exclude
        self.ignh = ignh
        self.modelidx = modelidx
        self._skipahead = False
        self.cryst = {}


[docs]
    def dispatch(self, line):
        """
        Returns the appropriate method for parsing `line`. This is determined
        based on the first 6 characters of `line`.

        Parameters
        ----------
        line: str

        Returns
        -------
        collections.abc.Callable[str, int]
            The method to call with the line, and the line number.
        """
        record = line[:6].strip().lower()
        return getattr(self, record, self._unknown_line)



[docs]
    def parse(self, file_handle):
        # Only PDBParser.finalize should produce a result, namely a list of
        # molecules. This means that mols is a list containing a single list of
        # molecules, which is a little silly.
        outcome = list(super().parse(file_handle))
        assert len(outcome) == 1
        yield from outcome[0]


    @staticmethod
    def _unknown_line(line, lineno):
        """
        Called when a line is unknown. Raises a KeyError with a helpful message.
        """
        raise KeyError("Line {} can not be parsed, since we don't recognize the"
                       " first 6 characters. The line is: {}"
                       "".format(lineno, line))

    @staticmethod
    def _skip(line, lineno=0):
        """
        Does nothing.
        """


    # TODO: Parse some of these, and either do something useful with it, or
    #       propagate it to e.g. the ITP
    # pylint: disable=bad-whitespace
    # TITLE SECTION
    header = _skip
    obslte = _skip
    title  = _skip
    splt   = _skip
    caveat = _skip
    compnd = _skip
    source = _skip
    keywds = _skip
    expdta = _skip
    nummdl = _skip
    mdltyp = _skip
    author = _skip
    revdat = _skip
    sprsde = _skip
    jrnl   = _skip
    remark = _skip

    # PRIMARY STRUCTURE SECTION
    dbref  = _skip
    dbref1 = _skip
    dbref2 = _skip
    seqadv = _skip
    seqres = _skip
    modres = _skip

    # HETEROGEN SECTION
    het    = _skip
    formul = _skip
    hetnam = _skip
    hetsyn = _skip

    # SECONDARY STRUCTURE SECTION
    helix  = _skip
    sheet  = _skip

    # CONNECTIVITY ANNOTATION SECTION
    ssbond = _skip
    link   = _skip
    cispep = _skip

    # MISCELLANEOUS FEATURES SECTION
    site   = _skip

    # CRYSTALLOGRAPHIC AND COORDINATE TRANSFORMATION SECTION
    origx1 = _skip
    origx2 = _skip
    origx3 = _skip
    scale1 = _skip
    scale2 = _skip
    scale3 = _skip
    mtrix1 = _skip
    mtrix2 = _skip
    mtrix3 = _skip

    # COORDINATE SECTION
    # model  = _skip  # Used
    # atom   = _skip  # Used
    anisou = _skip
    # ter    = _skip  # Used
    # hetatm = _skip  # Used
    # endmdl = _skip  # Used

    # CONNECTIVITY SECTION
    # conect = _skip  # Used

    # BOOKKEEPING SECTION
    master = _skip
    # end   = _skip  # Used
    # pylint: enable=bad-whitespace

    def _atom(self, line, lineno=0):
        """
        Parse an ATOM or HETATM record.

        Parameters
        ----------
        line: str
            The line to parse. We do not check whether it starts with either
            "ATOM  " or "HETATM".
        lineno: int
            The line number (not used).
        """
        if self._skipahead:
            return

        fields = [
            ('', str, 6),
            ('atomid', int, 5),
            ('', str, 1),
            ('atomname', str, 4),
            ('altloc', str, 1),
            ('resname', str, 4),
            ('chain', str, 1),
            ('resid', int, 4),
            ('insertion_code', str, 1),
            ('', str, 3),
            ('x', float, 8),
            ('y', float, 8),
            ('z', float, 8),
            ('occupancy', float, 6),
            ('temp_factor', float, 6),
            ('', str, 10),
            ('element', str, 2),
            ('charge', str, 2),
        ]

        start = 0
        field_slices = []
        for name, type_, width in fields:
            if name:
                field_slices.append((name, type_, slice(start, start + width)))
            start += width

        properties = {}
        for name, type_, slice_ in field_slices:
            value = line[slice_].strip()
            if value:
                properties[name] = type_(value)
            else:
                properties[name] = type_()

        # Charge is special, since it's "2-" or "1+", rather than -2 etc. And
        # let's turn it into a number
        charge = properties['charge']
        if charge:
            try:
                charge = float(charge)
            except ValueError:
                charge = float(charge[::-1])
        else:
            charge = 0
        properties['charge'] = charge

        pos = (properties.pop('x'), properties.pop('y'), properties.pop('z'))
        # Coordinates are read in Angstrom, but we want them in nm
        properties['position'] = np.array(pos, dtype=float) / 10

        if not properties['element']:
            atomname = properties['atomname']
            properties['element'] = first_alpha(atomname)
        if properties['altloc'] not in ['', 'A']:
            # TODO: allow selecting alternative conformation for specific
            #       residues.
            LOGGER.warning("There is an alternative conformation for atom {}. "
                           "We use conformation A exclusively",
                           format_atom_string(properties),
                           type='pdb-alternate')
            return
        if (properties['resname'] in self.exclude or
                (self.ignh and properties['element'] == 'H')):
            return
        idx = max(self.active_molecule) + 1 if self.active_molecule else 0
        self.active_molecule.add_node(idx, **properties)

    atom = _atom
    hetatm = _atom


[docs]
    def cryst1(self, line, lineno=0):
        """
        Parse the CRYST1 record. Crystal structure information are stored with
        the parser object and may be extracted later.
        """
        fields = [
            ('', str, 6),
            ('a', float, 9),
            ('b', float, 9),
            ('c', float, 9),
            ('alpha', float, 7),
            ('beta', float, 7),
            ('gamma', float, 7),
            ('space_group', str, 11),
            ('', str, 1),
            ('z_value', int, 4),
            ]
        start = 0
        field_slices = []
        for name, type_, width in fields:
            if name:
                field_slices.append((name, type_, slice(start, start + width)))
            start += width

        for name, type_, slice_ in field_slices:
            value = line[slice_].strip()
            if value:
                self.cryst[name] = type_(value)
            else:
                LOGGER.warning(f"CRYST1 directive incomplete. Missing entry for {name}.")



[docs]
    def model(self, line, lineno=0):
        """
        Parse a MODEL record. If the model is not the same as :attr:`modelidx`,
        this model will not be parsed.

        Parameters
        ----------
        line: str
            The line to parse. Should start with "MODEL ", but this is not
            checked.
        lineno: int
            The line number (not used).
        """
        try:
            modelnr = int(line[10:14])
        except ValueError:
            return
        else:
            self._skipahead = modelnr != self.modelidx



[docs]
    def conect(self, line, lineno=0):
        """
        Parse a CONECT record. The line is stored for later processing.

        Parameters
        ----------
        line: str
            The line to parse. Should start with CONECT, but this is not checked
        lineno: int
            The line number (not used).
        """
        # We can't add edges immediately, since the molecule might not be parsed
        # yet (does the PDB file format mandate anything on the order of
        # records?). Instead, just store the lines for later use.
        self._conects.append(line)


    def _finish_molecule(self, line="", lineno=0):
        """
        Finish parsing the molecule. :attr:`active_molecule` will be appended to
        :attr:`molecules`, and a new :attr:`active_molecule` will be made.
        """
        # We kind of *want* to yield self.active_molecule here, but we can't
        # since there's a very good chance it's CONECT records have not been
        # parsed yet, and the molecule won't have any edges.
        if self.active_molecule:
            if {"a", "b", "c"}.issubset(set(self.cryst.keys())):
                self.active_molecule.box = np.array([self.cryst['a']/10.,
                                                     self.cryst['b']/10.,
                                                     self.cryst['c']/10.])
            self.molecules.append(self.active_molecule)
        self.active_molecule = Molecule()

    endmdl = _finish_molecule
    ter = _finish_molecule
    end = _finish_molecule


[docs]
    def finalize(self, lineno=0):
        """
        Finish parsing the file. Process all CONECT records found, and returns
        a list of molecules.

        Parameters
        ----------
        lineno: int
            The line number (not used).

        Returns
        -------
        list[vermouth.molecule.Molecule]
            All molecules parsed from this file.
        """
        # TODO: cross reference number of molecules with CMPND records
        self._finish_molecule()
        self.do_conect()
        return self.molecules



[docs]
    def do_conect(self):
        """
        Apply connections to molecule based on CONECT records read from PDB file
        """
        id2idxs = [{mol.nodes[idx]['atomid']: idx for idx in mol}
                   for mol in self.molecules]
        for line in self._conects:
            start = 6
            width = 5
            atids = []
            for num in range(start, len(line.rstrip()), width):
                atom = int(line[num:num + width])
                atids.append(atom)
            self._do_single_conect(atids, id2idxs)


    def _do_single_conect(self, conect_record, id2idxs):
        """
        Process a single CONECT line. Adds edges to the molecules in
        :attr:`molecules`.

        Parameters
        ----------
        conect_record: list[int]
        id2idxs: list[dict[int, int]]
            A list of dicts mapping atomids to node keys for every molecule in
            :attr:`molecules`
        """
        atomid0 = conect_record[0]
        # Find the appropriate molecule:
        mol = None
        for mol, id2idx in zip(self.molecules, id2idxs):
            if atomid0 in id2idx:
                atomidx0 = id2idx[atomid0]
                break
        else:  # no break
            # No molecule found with an atom containing atomid atid0.
            # This could be a hydrogen if ignh, or an excluded residue.
            return
        for atomid in conect_record[1:]:
            # Find the second molecule...
            mol2 = None
            for mol2, id2idx in zip(self.molecules, id2idxs):
                if atomid in id2idx:
                    atomidx = id2idx[atomid]
                    break
            else:
                # Two options: a skipped atom
                continue
            if mol is not mol2:
                assert mol is not None and mol2 is not None
                LOGGER.info('Merging two molecules/chains because there is a '
                            'CONECT record between atoms {} and {}',
                            format_atom_string(mol.nodes[atomidx0]),
                            format_atom_string(mol2.nodes[atomidx]))
                # Conect record between two molecules! It's probably a *good*
                # idea to cross reference this with e.g. SSBOND and LINK records
                molidx = self.molecules.index(mol)
                molidx2 = self.molecules.index(mol2)
                del id2idxs[max(molidx, molidx2)]
                del id2idxs[min(molidx, molidx2)]
                self.molecules.remove(mol)
                self.molecules.remove(mol2)
                mol = nx.disjoint_union(mol, mol2)
                mol2 = mol
                self.molecules.append(mol)
                id2idxs.append({mol.nodes[idx]['atomid']: idx for idx in mol})

            dist = distance(mol.nodes[atomidx0]['position'],
                            mol2.nodes[atomidx]['position'])
            mol.add_edge(atomidx0, atomidx, distance=dist)




[docs]
def read_pdb(file_name, exclude=('SOL',), ignh=False, modelidx=1):
    """
    Parse a PDB file to create a molecule.

    Parameters
    ----------
    filename: str
        The file to read.
    exclude: collections.abc.Container[str]
        Atoms that have one of these residue names will not be included.
    ignh: bool
        Whether hydrogen atoms should be ignored.
    model: int
        If the PDB file contains multiple models, which one to select.

    Returns
    -------
    list[vermouth.molecule.Molecule]
        The parsed molecules. Will only contain edges if the PDB file has
        CONECT records. Either way, the molecules might be disconnected. Entries
        separated by TER, ENDMDL, and END records will result in separate
        molecules.
    """
    parser = PDBParser(exclude, ignh, modelidx)
    with open(str(file_name), encoding='utf-8') as file_handle:
        mols = list(parser.parse(file_handle))
    LOGGER.info('Read {} molecules from PDB file {}', len(mols), file_name)
    return mols




[docs]
def get_not_none(node, attr, default):
    """
    Returns ``node[attr]``. If it doesn't exists or is ``None``, return
    `default`.

    Parameters
    ----------
    node: collections.abc.Mapping
    attr: collections.abc.Hashable
    default
        The value to return if ``node[attr]`` is either ``None``, or does not
        exist.

    Returns
    -------
    object
        The value of ``node[attr]`` if it exists and is not ``None``, else
        `default`.
    """
    value = node.get(attr)
    if value is None:
        value = default
    return value




[docs]
def write_pdb_string(system, conect=True, omit_charges=True, nan_missing_pos=False):
    """
    Describes `system` as a PDB formatted string. Will create CONECT records
    from the edges in the molecules in `system` iff `conect` is True.

    Parameters
    ----------
    system: vermouth.system.System
        The system to write.
    conect: bool
        Whether to write CONECT records for the edges.
    omit_charges: bool
        Whether charges should be omitted. This is usually a good idea since
        the PDB format can only deal with integer charges.
    nan_missing_pos: bool
        Wether the writing should fail if an atom does not have a position.
        When set to `True`, atoms without coordinates will be written
        with 'nan' as coordinates; this will cause the output file to be
        *invalid* for most uses.

    Returns
    -------
    str
        The system as PDB formatted string.
    """
    out = []

    formatter = TruncFormatter()
#    format_string = 'ATOM  {: >5.5d} {:4.4s}{:1.1s}{:3.3s} {:1.1s}{:4.4d}{:1.1s}   {:8.3f}{:8.3f}{:8.3f}{:6.2f}{:6.2f}          {:2.2s}{:2.2s}'
    format_string = 'ATOM  {: >5dt} {:4st}{:1st}{:3st} {:1st}{:>4dt}{:1st}   {:8.3ft}{:8.3ft}{:8.3ft}{:6.2ft}{:6.2ft}          {:2st}{:2st}'

    nodeidx2atomid = {}
    atomid = 1
    for mol_idx, molecule in enumerate(system.molecules):
        for node_idx in molecule.sorted_nodes:
            # Node indices do not have to be unique across molecules. So store
            # them as (mol_idx, node_idx)
            nodeidx2atomid[(mol_idx, node_idx)] = atomid
            node = molecule.nodes[node_idx]
            atomname = get_not_none(node, 'atomname', '')
            altloc = get_not_none(node, 'altloc', '')
            resname = get_not_none(node, 'resname', '')
            chain = get_not_none(node, 'chain', '')
            resid = get_not_none(node, 'resid', 1)
            insertion_code = get_not_none(node, 'insertion_code', '')
            try:
                # converting from nm to A
                x, y, z = node['position'] * 10  # pylint: disable=invalid-name
            except KeyError:
                if nan_missing_pos:
                    x = y = z = float('nan')  # pylint: disable=invalid-name
                else:
                    raise
            occupancy = get_not_none(node, 'occupancy', 1)
            temp_factor = get_not_none(node, 'temp_factor', 0)
            element = get_not_none(node, 'element', '')
            charge = get_not_none(node, 'charge', 0)
            if charge and not omit_charges:
                charge = '{:+2d}'.format(int(charge))[::-1]
            else:
                charge = ''
            line = formatter.format(format_string, atomid, atomname, altloc,
                                    resname, chain, resid, insertion_code, x,
                                    y, z, occupancy, temp_factor, element,
                                    charge)
            atomid += 1
            out.append(line)
        terline = formatter.format('TER   {: >5dt}      {:3st} {:1st}{: >4dt}{:1st}',
                                   atomid, resname, chain, resid, insertion_code)
        atomid += 1
        out.append(terline)
    if conect:
        number_fmt = '{:>4dt}'
        format_string = 'CONECT '
        for mol_idx, molecule in enumerate(system.molecules):
            for node_idx in molecule:
                todo = sorted(nodeidx2atomid[(mol_idx, n_idx)]
                              for n_idx in molecule[node_idx] if n_idx > node_idx)
                while todo:
                    current, todo = todo[:4], todo[4:]
                    fmt = ['CONECT'] + [number_fmt]*(len(current) + 1)
                    fmt = ' '.join(fmt)
                    line = formatter.format(fmt, nodeidx2atomid[(mol_idx, node_idx)], *current)
                    out.append(line)
    out.append('END   ')
    return '\n'.join(out)




[docs]
def write_pdb(system, path, conect=True, omit_charges=True, nan_missing_pos=False, defer_writing=True):
    """
    Writes `system` to `path` as a PDB formatted string.

    Parameters
    ----------
    system: vermouth.system.System
        The system to write.
    path: str
        The file to write to.
    conect: bool
        Whether to write CONECT records for the edges.
    omit_charges: bool
        Whether charges should be omitted. This is usually a good idea since
        the PDB format can only deal with integer charges.
    nan_missing_pos: bool
        Whether the writing should fail if an atom does not have a position.
        When set to `True`, atoms without coordinates will be written
        with 'nan' as coordinates; this will cause the output file to be
        *invalid* for most uses.
        for most use.
    defer_writing: bool
        Whether to use :class:`~vermouth.file_writer.DeferredFileWriter` for writing data

    See Also
    --------
    :func:write_pdb_string
    """
    if defer_writing:
        open = deferred_open
    else:
        # This is needed since the variable assignment above declares `open` as
        # a local variable, which means it won't be looked up from the global
        # namespace any more.
        from builtins import open
    with open(path, 'w', encoding='utf-8') as out:
        out.write(write_pdb_string(system, conect, omit_charges, nan_missing_pos))
Source code for vermouth.pdb.pdb

VerMoUTH

Navigation

Related Topics