Source code for vermouth.pdb.cif

# -*- coding: utf-8 -*-
# Copyright 2025 University of Groningen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Provides functionality to read CIF files.
"""

try:
    from CifFile import ReadCif
except ImportError:
    HAVE_READCIF = False
else:
    HAVE_READCIF= True

import numpy as np

from ..molecule import Molecule
from ..log_helpers import StyleAdapter, get_logger

LOGGER = StyleAdapter(get_logger(__name__))



[docs]
def casting(value, typeto):
    """
    Cast a value to its correct type with an exception
    value: value to cast
    type: the expected output type of the value
    """
    try:
        return typeto(value)
    # except a ValueError. The two known ones are resid = '.' or charge = '?' which must be a str.
    # may cause an error if more are added
    except ValueError:
        return None


def _cell(cf, entry):
    """
    get the cell dimensions from a cif file as a list

    Parameters
    ----------
    cf: CifFile.CifFile_module.CifFile
        CIF file object parsed by PyCifRW
    entry: str
        Key of the cf dict

    Returns
    -------
    dims: np.array
        array of len(6) containing the dimensions of the cell
    """
    try:
        a = cf[entry]['_cell.length_a']
        b = cf[entry]['_cell.length_b']
        c = cf[entry]['_cell.length_c']
        alpha = cf[entry]['_cell.angle_alpha']
        beta = cf[entry]['_cell.angle_beta']
        gamma = cf[entry]['_cell.angle_gamma']
        dims = np.array([a, b, c, alpha, beta, gamma], dtype=float)
    except KeyError:
        LOGGER.info("_cell information missing from .cif file. Will write default dimensions")
        dims = np.array([1, 1, 1, 90, 90, 90])

    return dims



[docs]
def cif_entry_reader(cf, entry,
                     cif_categories_all, cif_category_names, cif_category_types, essential_properties,
                     modelidx, exclude, ignh):
    """
    Read a single entry of a parsed cif file

    Parameters
    ----------
    cf: :func: `<CifFile.CifFile_module.CifFile>`
        CIF file object parsed by PyCifRW
    entry: str
        Key of the cf dict
    cif_categories_all: list
        List of valid category keys contained within a cif file entry
    cif_category_names: list
        Common name for category entries
    cif_category_types: list
        Type to cast CIF data to
    essential_properties: list
        List of properties which must be contained
    modelidx: int
        If the cif file contains multiple models, which one to select.
    exclude: collections.abc.Container[str]
        Atoms that have one of these residue names will not be included.
    ignh: bool
        Whether hydrogen atoms should be ignored.

    Returns
    -------
    vermouth.molecule.Molecule
        A parsed molecule. Will not contain edges

    """
    # first filter the data by which categories are present
    # make list of the category names which are present
    names = []
    # make a list of the lists of the data
    all_data = []
    for category, name, cattype in zip(cif_categories_all, cif_category_names, cif_category_types):
        if category in cf[entry]:
            # cast the data to its correct type
            data = [casting(i, cattype) for i in cf[entry][category]]
            all_data.append(data)
            names.append(name)
        # if we're missing one of the essential categories, raise a warning
        elif name in essential_properties:
            LOGGER.warning(f"{name} data is missing from the input file, and is a required field.")

    # if we don't have atomnames but do have element information, copy it.
    if ('atomname' not in names) and ('element' in names):
        LOGGER.warning("atomname data missing from input file. Will attempt to continue using element data.")
        names.append('atomname')
        all_data.append([casting(i, str) for i in cf[entry]['_atom_site.type_symbol']])

    # for each atom, make a dictionary with its associated name
    properties_dict_list = [dict(zip(names, v)) for v in zip(*all_data)]

    molecule = Molecule()

    # find the indices in the data which are actually from the model that we're after.
    model_atom_indices = [index for index, value in
                          enumerate([int(model) == modelidx for
                                     model in cf[entry]['_atom_site.pdbx_PDB_model_num']])
                          if value]

    idx = 0  # add nodes by separate index in case we skip some atoms
    for index in model_atom_indices:
        properties = properties_dict_list[index]

        if properties.get('resname', None) in exclude or (ignh and properties['element'] == 'H'):
            continue

        properties['position'] = np.array([properties['x'],
                                           properties['y'],
                                           properties['z'],
                                           ], dtype=float) / 10

        molecule.add_node(idx, **properties)
        idx += 1

    molecule.box = _cell(cf, entry)

    return molecule



[docs]
def read_cif_file(file_name, exclude=('SOL', 'HOH'), ignh=False, modelidx=1):
    """
    Parse a CIF file to create a list of molecules using the PyCIFRW library

    Parameters
    ----------
    file_name: str
        The file to read.
    exclude: collections.abc.Container[str]
        Atoms that have one of these residue names will not be included.
    ignh: bool
        Whether hydrogen atoms should be ignored.
    modelidx: int
        If the cif file contains multiple models, which one to select.

    Returns
    -------
    list[vermouth.molecule.Molecule]
        The parsed molecules. Will not contain edges
    """

    # list of categories from the read cif file to read into a molecule
    cif_categories_all = [
        '_atom_site.id',  # atomid
        '_atom_site.label_atom_id',  # atomname
        '_atom_site.label_comp_id',  # resname
        '_atom_site.label_asym_id',  # chain
        '_atom_site.label_seq_id',  # resid
        '_atom_site.pdbx_pdb_ins_code',  # insertion_code
        '_atom_site.cartn_x',  # x
        '_atom_site.cartn_y',  # y
        '_atom_site.cartn_z',  # z
        '_atom_site.occupancy',  # occupancy
        '_atom_site.b_iso_or_equiv',  # temp_factor
        '_atom_site.type_symbol',  # element
        '_atom_site.pdbx_formal_charge',  # charge (nb not added by AF3)
        '_atom_site.pdbx_PDB_model_num'  # model
    ]
    cif_category_names = ['atomid', 'atomname', 'resname', 'chain', 'resid', 'insertion_code',
                          'x', 'y', 'z',
                          'occupancy', 'temp_factor',
                          'element', 'charge', 'model']
    cif_category_types = [int, str, str, str, int, str,
                          float, float, float,
                          float, float,
                          str, float, int]
    essential_properties = ['resname', 'resid']

    cf = ReadCif(str(file_name))
    # PyCifRW seems to store everything from the file it reads in a top level key from _entry.id
    # which ~ corresponds to the pdb code. From a single file we should only have one key.
    if len(cf.keys()) > 1:
        LOGGER.info("This cif file contains multiple entries. Check output carefully.")

    molecules = []
    for entry in cf.keys():
        molecule = cif_entry_reader(cf, entry, cif_categories_all,
                                    cif_category_names, cif_category_types, essential_properties,
                                    modelidx, exclude, ignh)
        molecules.append(molecule)

    return molecules



[docs]
class CIFReader():
    def __init__(self, file, exclude, ignh, modelidx=1):
        self.input_file = file
        self.exclude = exclude
        self.ignh = ignh
        self.modelidx = modelidx


[docs]
    def reader(self):
        if HAVE_READCIF:
            molecules = read_cif_file(self.input_file, self.exclude, self.ignh, self.modelidx)
            return molecules
        else:
            raise ImportError("The PyCifRW library must be installed to read .cif files with Vermouth.")
Source code for vermouth.pdb.cif

VerMoUTH

Navigation

Related Topics