Source code for vermouth.pdb.cif

# -*- coding: utf-8 -*-
# Copyright 2025 University of Groningen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Provides functionality to read CIF files.
"""

try:
    from CifFile import ReadCif
except ImportError:
    HAVE_READCIF = False
else:
    HAVE_READCIF= True

import numpy as np

from ..molecule import Molecule
from ..log_helpers import StyleAdapter, get_logger

LOGGER = StyleAdapter(get_logger(__name__))


[docs] def casting(value, typeto): """ Cast a value to its correct type with an exception value: value to cast type: the expected output type of the value """ try: return typeto(value) # except a ValueError. The two known ones are resid = '.' or charge = '?' which must be a str. # may cause an error if more are added except ValueError: return None
def _cell(cf, entry): """ get the cell dimensions from a cif file as a list Parameters ---------- cf: CifFile.CifFile_module.CifFile CIF file object parsed by PyCifRW entry: str Key of the cf dict Returns ------- dims: np.array array of len(6) containing the dimensions of the cell """ try: a = cf[entry]['_cell.length_a'] b = cf[entry]['_cell.length_b'] c = cf[entry]['_cell.length_c'] alpha = cf[entry]['_cell.angle_alpha'] beta = cf[entry]['_cell.angle_beta'] gamma = cf[entry]['_cell.angle_gamma'] dims = np.array([a, b, c, alpha, beta, gamma], dtype=float) except KeyError: LOGGER.info("_cell information missing from .cif file. Will write default dimensions") dims = np.array([1, 1, 1, 90, 90, 90]) return dims
[docs] def cif_entry_reader(cf, entry, cif_categories_all, cif_category_names, cif_category_types, essential_properties, modelidx, exclude, ignh): """ Read a single entry of a parsed cif file Parameters ---------- cf: :func: `<CifFile.CifFile_module.CifFile>` CIF file object parsed by PyCifRW entry: str Key of the cf dict cif_categories_all: list List of valid category keys contained within a cif file entry cif_category_names: list Common name for category entries cif_category_types: list Type to cast CIF data to essential_properties: list List of properties which must be contained modelidx: int If the cif file contains multiple models, which one to select. exclude: collections.abc.Container[str] Atoms that have one of these residue names will not be included. ignh: bool Whether hydrogen atoms should be ignored. Returns ------- vermouth.molecule.Molecule A parsed molecule. Will not contain edges """ # first filter the data by which categories are present # make list of the category names which are present names = [] # make a list of the lists of the data all_data = [] for category, name, cattype in zip(cif_categories_all, cif_category_names, cif_category_types): if category in cf[entry]: # cast the data to its correct type data = [casting(i, cattype) for i in cf[entry][category]] all_data.append(data) names.append(name) # if we're missing one of the essential categories, raise a warning elif name in essential_properties: LOGGER.warning(f"{name} data is missing from the input file, and is a required field.") # if we don't have atomnames but do have element information, copy it. if ('atomname' not in names) and ('element' in names): LOGGER.warning("atomname data missing from input file. Will attempt to continue using element data.") names.append('atomname') all_data.append([casting(i, str) for i in cf[entry]['_atom_site.type_symbol']]) # for each atom, make a dictionary with its associated name properties_dict_list = [dict(zip(names, v)) for v in zip(*all_data)] molecule = Molecule() # find the indices in the data which are actually from the model that we're after. model_atom_indices = [index for index, value in enumerate([int(model) == modelidx for model in cf[entry]['_atom_site.pdbx_PDB_model_num']]) if value] idx = 0 # add nodes by separate index in case we skip some atoms for index in model_atom_indices: properties = properties_dict_list[index] if properties.get('resname', None) in exclude or (ignh and properties['element'] == 'H'): continue properties['position'] = np.array([properties['x'], properties['y'], properties['z'], ], dtype=float) / 10 molecule.add_node(idx, **properties) idx += 1 molecule.box = _cell(cf, entry) return molecule
[docs] def read_cif_file(file_name, exclude=('SOL', 'HOH'), ignh=False, modelidx=1): """ Parse a CIF file to create a list of molecules using the PyCIFRW library Parameters ---------- file_name: str The file to read. exclude: collections.abc.Container[str] Atoms that have one of these residue names will not be included. ignh: bool Whether hydrogen atoms should be ignored. modelidx: int If the cif file contains multiple models, which one to select. Returns ------- list[vermouth.molecule.Molecule] The parsed molecules. Will not contain edges """ # list of categories from the read cif file to read into a molecule cif_categories_all = [ '_atom_site.id', # atomid '_atom_site.label_atom_id', # atomname '_atom_site.label_comp_id', # resname '_atom_site.label_asym_id', # chain '_atom_site.label_seq_id', # resid '_atom_site.pdbx_pdb_ins_code', # insertion_code '_atom_site.cartn_x', # x '_atom_site.cartn_y', # y '_atom_site.cartn_z', # z '_atom_site.occupancy', # occupancy '_atom_site.b_iso_or_equiv', # temp_factor '_atom_site.type_symbol', # element '_atom_site.pdbx_formal_charge', # charge (nb not added by AF3) '_atom_site.pdbx_PDB_model_num' # model ] cif_category_names = ['atomid', 'atomname', 'resname', 'chain', 'resid', 'insertion_code', 'x', 'y', 'z', 'occupancy', 'temp_factor', 'element', 'charge', 'model'] cif_category_types = [int, str, str, str, int, str, float, float, float, float, float, str, float, int] essential_properties = ['resname', 'resid'] cf = ReadCif(str(file_name)) # PyCifRW seems to store everything from the file it reads in a top level key from _entry.id # which ~ corresponds to the pdb code. From a single file we should only have one key. if len(cf.keys()) > 1: LOGGER.info("This cif file contains multiple entries. Check output carefully.") molecules = [] for entry in cf.keys(): molecule = cif_entry_reader(cf, entry, cif_categories_all, cif_category_names, cif_category_types, essential_properties, modelidx, exclude, ignh) molecules.append(molecule) return molecules
[docs] class CIFReader(): def __init__(self, file, exclude, ignh, modelidx=1): self.input_file = file self.exclude = exclude self.ignh = ignh self.modelidx = modelidx
[docs] def reader(self): if HAVE_READCIF: molecules = read_cif_file(self.input_file, self.exclude, self.ignh, self.modelidx) return molecules else: raise ImportError("The PyCifRW library must be installed to read .cif files with Vermouth.")