Source code for vermouth.pdb.pdb

# -*- coding: utf-8 -*-
# Copyright 2018 University of Groningen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Provides functions for reading and writing PDB files.
"""

import numpy as np
import networkx as nx

from ..file_writer import deferred_open
from ..molecule import Molecule
from ..utils import first_alpha, distance, format_atom_string
from ..parser_utils import LineParser
from ..truncating_formatter import TruncFormatter
from ..log_helpers import StyleAdapter, get_logger

LOGGER = StyleAdapter(get_logger(__name__))


[docs] class PDBParser(LineParser): """ Parser for PDB files Attributes ---------- active_molecule: vermouth.molecule.Molecule The molecule/model currently being read. molecules: list[vermouth.molecule.Molecule] All complete molecules read so far. modelidx: int Which model to take. Parameters ---------- exclude: collections.abc.Container[str] Container of residue names. Any atom that has a residue name that is in `exclude` will be skipped. ignh: bool Whether all hydrogen atoms should be skipped modelidx: int Which model to take. """ def __init__(self, exclude=('SOL',), ignh=False, modelidx=1): self.active_molecule = Molecule() self.molecules = [] self._conects = [] self.exclude = exclude self.ignh = ignh self.modelidx = modelidx self._skipahead = False self.cryst = {}
[docs] def dispatch(self, line): """ Returns the appropriate method for parsing `line`. This is determined based on the first 6 characters of `line`. Parameters ---------- line: str Returns ------- collections.abc.Callable[str, int] The method to call with the line, and the line number. """ record = line[:6].strip().lower() return getattr(self, record, self._unknown_line)
[docs] def parse(self, file_handle): # Only PDBParser.finalize should produce a result, namely a list of # molecules. This means that mols is a list containing a single list of # molecules, which is a little silly. outcome = list(super().parse(file_handle)) assert len(outcome) == 1 yield from outcome[0]
@staticmethod def _unknown_line(line, lineno): """ Called when a line is unknown. Raises a KeyError with a helpful message. """ raise KeyError("Line {} can not be parsed, since we don't recognize the" " first 6 characters. The line is: {}" "".format(lineno, line)) @staticmethod def _skip(line, lineno=0): """ Does nothing. """ # TODO: Parse some of these, and either do something useful with it, or # propagate it to e.g. the ITP # pylint: disable=bad-whitespace # TITLE SECTION header = _skip obslte = _skip title = _skip splt = _skip caveat = _skip compnd = _skip source = _skip keywds = _skip expdta = _skip nummdl = _skip mdltyp = _skip author = _skip revdat = _skip sprsde = _skip jrnl = _skip remark = _skip # PRIMARY STRUCTURE SECTION dbref = _skip dbref1 = _skip dbref2 = _skip seqadv = _skip seqres = _skip modres = _skip # HETEROGEN SECTION het = _skip formul = _skip hetnam = _skip hetsyn = _skip # SECONDARY STRUCTURE SECTION helix = _skip sheet = _skip # CONNECTIVITY ANNOTATION SECTION ssbond = _skip link = _skip cispep = _skip # MISCELLANEOUS FEATURES SECTION site = _skip # CRYSTALLOGRAPHIC AND COORDINATE TRANSFORMATION SECTION origx1 = _skip origx2 = _skip origx3 = _skip scale1 = _skip scale2 = _skip scale3 = _skip mtrix1 = _skip mtrix2 = _skip mtrix3 = _skip # COORDINATE SECTION # model = _skip # Used # atom = _skip # Used anisou = _skip # ter = _skip # Used # hetatm = _skip # Used # endmdl = _skip # Used # CONNECTIVITY SECTION # conect = _skip # Used # BOOKKEEPING SECTION master = _skip # end = _skip # Used # pylint: enable=bad-whitespace def _atom(self, line, lineno=0): """ Parse an ATOM or HETATM record. Parameters ---------- line: str The line to parse. We do not check whether it starts with either "ATOM " or "HETATM". lineno: int The line number (not used). """ if self._skipahead: return fields = [ ('', str, 6), ('atomid', int, 5), ('', str, 1), ('atomname', str, 4), ('altloc', str, 1), ('resname', str, 4), ('chain', str, 1), ('resid', int, 4), ('insertion_code', str, 1), ('', str, 3), ('x', float, 8), ('y', float, 8), ('z', float, 8), ('occupancy', float, 6), ('temp_factor', float, 6), ('', str, 10), ('element', str, 2), ('charge', str, 2), ] start = 0 field_slices = [] for name, type_, width in fields: if name: field_slices.append((name, type_, slice(start, start + width))) start += width properties = {} for name, type_, slice_ in field_slices: value = line[slice_].strip() if value: properties[name] = type_(value) else: properties[name] = type_() # Charge is special, since it's "2-" or "1+", rather than -2 etc. And # let's turn it into a number charge = properties['charge'] if charge: try: charge = float(charge) except ValueError: charge = float(charge[::-1]) else: charge = 0 properties['charge'] = charge pos = (properties.pop('x'), properties.pop('y'), properties.pop('z')) # Coordinates are read in Angstrom, but we want them in nm properties['position'] = np.array(pos, dtype=float) / 10 if not properties['element']: atomname = properties['atomname'] properties['element'] = first_alpha(atomname) if properties['altloc'] not in ['', 'A']: # TODO: allow selecting alternative conformation for specific # residues. LOGGER.warning("There is an alternative conformation for atom {}. " "We use conformation A exclusively", format_atom_string(properties), type='pdb-alternate') return if (properties['resname'] in self.exclude or (self.ignh and properties['element'] == 'H')): return idx = max(self.active_molecule) + 1 if self.active_molecule else 0 self.active_molecule.add_node(idx, **properties) atom = _atom hetatm = _atom
[docs] def cryst1(self, line, lineno=0): """ Parse the CRYST1 record. Crystal structure information are stored with the parser object and may be extracted later. """ fields = [ ('', str, 6), ('a', float, 9), ('b', float, 9), ('c', float, 9), ('alpha', float, 7), ('beta', float, 7), ('gamma', float, 7), ('space_group', str, 11), ('', str, 1), ('z_value', int, 4), ] start = 0 field_slices = [] for name, type_, width in fields: if name: field_slices.append((name, type_, slice(start, start + width))) start += width for name, type_, slice_ in field_slices: value = line[slice_].strip() if value: self.cryst[name] = type_(value) else: LOGGER.warning(f"CRYST1 directive incomplete. Missing entry for {name}.")
[docs] def model(self, line, lineno=0): """ Parse a MODEL record. If the model is not the same as :attr:`modelidx`, this model will not be parsed. Parameters ---------- line: str The line to parse. Should start with "MODEL ", but this is not checked. lineno: int The line number (not used). """ try: modelnr = int(line[10:14]) except ValueError: return else: self._skipahead = modelnr != self.modelidx
[docs] def conect(self, line, lineno=0): """ Parse a CONECT record. The line is stored for later processing. Parameters ---------- line: str The line to parse. Should start with CONECT, but this is not checked lineno: int The line number (not used). """ # We can't add edges immediately, since the molecule might not be parsed # yet (does the PDB file format mandate anything on the order of # records?). Instead, just store the lines for later use. self._conects.append(line)
def _finish_molecule(self, line="", lineno=0): """ Finish parsing the molecule. :attr:`active_molecule` will be appended to :attr:`molecules`, and a new :attr:`active_molecule` will be made. """ # We kind of *want* to yield self.active_molecule here, but we can't # since there's a very good chance it's CONECT records have not been # parsed yet, and the molecule won't have any edges. if self.active_molecule: if {"a", "b", "c"}.issubset(set(self.cryst.keys())): self.active_molecule.box = np.array([self.cryst['a']/10., self.cryst['b']/10., self.cryst['c']/10.]) self.molecules.append(self.active_molecule) self.active_molecule = Molecule() endmdl = _finish_molecule ter = _finish_molecule end = _finish_molecule
[docs] def finalize(self, lineno=0): """ Finish parsing the file. Process all CONECT records found, and returns a list of molecules. Parameters ---------- lineno: int The line number (not used). Returns ------- list[vermouth.molecule.Molecule] All molecules parsed from this file. """ # TODO: cross reference number of molecules with CMPND records self._finish_molecule() self.do_conect() return self.molecules
[docs] def do_conect(self): """ Apply connections to molecule based on CONECT records read from PDB file """ id2idxs = [{mol.nodes[idx]['atomid']: idx for idx in mol} for mol in self.molecules] for line in self._conects: start = 6 width = 5 atids = [] for num in range(start, len(line.rstrip()), width): atom = int(line[num:num + width]) atids.append(atom) self._do_single_conect(atids, id2idxs)
def _do_single_conect(self, conect_record, id2idxs): """ Process a single CONECT line. Adds edges to the molecules in :attr:`molecules`. Parameters ---------- conect_record: list[int] id2idxs: list[dict[int, int]] A list of dicts mapping atomids to node keys for every molecule in :attr:`molecules` """ atomid0 = conect_record[0] # Find the appropriate molecule: mol = None for mol, id2idx in zip(self.molecules, id2idxs): if atomid0 in id2idx: atomidx0 = id2idx[atomid0] break else: # no break # No molecule found with an atom containing atomid atid0. # This could be a hydrogen if ignh, or an excluded residue. return for atomid in conect_record[1:]: # Find the second molecule... mol2 = None for mol2, id2idx in zip(self.molecules, id2idxs): if atomid in id2idx: atomidx = id2idx[atomid] break else: # Two options: a skipped atom continue if mol is not mol2: assert mol is not None and mol2 is not None LOGGER.info('Merging two molecules/chains because there is a ' 'CONECT record between atoms {} and {}', format_atom_string(mol.nodes[atomidx0]), format_atom_string(mol2.nodes[atomidx])) # Conect record between two molecules! It's probably a *good* # idea to cross reference this with e.g. SSBOND and LINK records molidx = self.molecules.index(mol) molidx2 = self.molecules.index(mol2) del id2idxs[max(molidx, molidx2)] del id2idxs[min(molidx, molidx2)] self.molecules.remove(mol) self.molecules.remove(mol2) mol = nx.disjoint_union(mol, mol2) mol2 = mol self.molecules.append(mol) id2idxs.append({mol.nodes[idx]['atomid']: idx for idx in mol}) dist = distance(mol.nodes[atomidx0]['position'], mol2.nodes[atomidx]['position']) mol.add_edge(atomidx0, atomidx, distance=dist)
[docs] def read_pdb(file_name, exclude=('SOL',), ignh=False, modelidx=1): """ Parse a PDB file to create a molecule. Parameters ---------- filename: str The file to read. exclude: collections.abc.Container[str] Atoms that have one of these residue names will not be included. ignh: bool Whether hydrogen atoms should be ignored. model: int If the PDB file contains multiple models, which one to select. Returns ------- list[vermouth.molecule.Molecule] The parsed molecules. Will only contain edges if the PDB file has CONECT records. Either way, the molecules might be disconnected. Entries separated by TER, ENDMDL, and END records will result in separate molecules. """ parser = PDBParser(exclude, ignh, modelidx) with open(str(file_name)) as file_handle: mols = list(parser.parse(file_handle)) LOGGER.info('Read {} molecules from PDB file {}', len(mols), file_name) return mols
[docs] def get_not_none(node, attr, default): """ Returns ``node[attr]``. If it doesn't exists or is ``None``, return `default`. Parameters ---------- node: collections.abc.Mapping attr: collections.abc.Hashable default The value to return if ``node[attr]`` is either ``None``, or does not exist. Returns ------- object The value of ``node[attr]`` if it exists and is not ``None``, else `default`. """ value = node.get(attr) if value is None: value = default return value
[docs] def write_pdb_string(system, conect=True, omit_charges=True, nan_missing_pos=False): """ Describes `system` as a PDB formatted string. Will create CONECT records from the edges in the molecules in `system` iff `conect` is True. Parameters ---------- system: vermouth.system.System The system to write. conect: bool Whether to write CONECT records for the edges. omit_charges: bool Whether charges should be omitted. This is usually a good idea since the PDB format can only deal with integer charges. nan_missing_pos: bool Wether the writing should fail if an atom does not have a position. When set to `True`, atoms without coordinates will be written with 'nan' as coordinates; this will cause the output file to be *invalid* for most uses. Returns ------- str The system as PDB formatted string. """ out = [] formatter = TruncFormatter() # format_string = 'ATOM {: >5.5d} {:4.4s}{:1.1s}{:3.3s} {:1.1s}{:4.4d}{:1.1s} {:8.3f}{:8.3f}{:8.3f}{:6.2f}{:6.2f} {:2.2s}{:2.2s}' format_string = 'ATOM {: >5dt} {:4st}{:1st}{:3st} {:1st}{:>4dt}{:1st} {:8.3ft}{:8.3ft}{:8.3ft}{:6.2ft}{:6.2ft} {:2st}{:2st}' nodeidx2atomid = {} atomid = 1 for mol_idx, molecule in enumerate(system.molecules): for node_idx in molecule.sorted_nodes: # Node indices do not have to be unique across molecules. So store # them as (mol_idx, node_idx) nodeidx2atomid[(mol_idx, node_idx)] = atomid node = molecule.nodes[node_idx] atomname = get_not_none(node, 'atomname', '') altloc = get_not_none(node, 'altloc', '') resname = get_not_none(node, 'resname', '') chain = get_not_none(node, 'chain', '') resid = get_not_none(node, 'resid', 1) insertion_code = get_not_none(node, 'insertion_code', '') try: # converting from nm to A x, y, z = node['position'] * 10 # pylint: disable=invalid-name except KeyError: if nan_missing_pos: x = y = z = float('nan') # pylint: disable=invalid-name else: raise occupancy = get_not_none(node, 'occupancy', 1) temp_factor = get_not_none(node, 'temp_factor', 0) element = get_not_none(node, 'element', '') charge = get_not_none(node, 'charge', 0) if charge and not omit_charges: charge = '{:+2d}'.format(int(charge))[::-1] else: charge = '' line = formatter.format(format_string, atomid, atomname, altloc, resname, chain, resid, insertion_code, x, y, z, occupancy, temp_factor, element, charge) atomid += 1 out.append(line) terline = formatter.format('TER {: >5dt} {:3st} {:1st}{: >4dt}{:1st}', atomid, resname, chain, resid, insertion_code) atomid += 1 out.append(terline) if conect: number_fmt = '{:>4dt}' format_string = 'CONECT ' for mol_idx, molecule in enumerate(system.molecules): for node_idx in molecule: todo = sorted(nodeidx2atomid[(mol_idx, n_idx)] for n_idx in molecule[node_idx] if n_idx > node_idx) while todo: current, todo = todo[:4], todo[4:] fmt = ['CONECT'] + [number_fmt]*(len(current) + 1) fmt = ' '.join(fmt) line = formatter.format(fmt, nodeidx2atomid[(mol_idx, node_idx)], *current) out.append(line) out.append('END ') return '\n'.join(out)
[docs] def write_pdb(system, path, conect=True, omit_charges=True, nan_missing_pos=False, defer_writing=True): """ Writes `system` to `path` as a PDB formatted string. Parameters ---------- system: vermouth.system.System The system to write. path: str The file to write to. conect: bool Whether to write CONECT records for the edges. omit_charges: bool Whether charges should be omitted. This is usually a good idea since the PDB format can only deal with integer charges. nan_missing_pos: bool Whether the writing should fail if an atom does not have a position. When set to `True`, atoms without coordinates will be written with 'nan' as coordinates; this will cause the output file to be *invalid* for most uses. for most use. defer_writing: bool Whether to use :class:`~vermouth.file_writer.DeferredFileWriter` for writing data See Also -------- :func:write_pdb_string """ if defer_writing: open = deferred_open else: # This is needed since the variable assignment above declares `open` as # a local variable, which means it won't be looked up from the global # namespace any more. from builtins import open with open(path, 'w') as out: out.write(write_pdb_string(system, conect, omit_charges, nan_missing_pos))