from __future__ import division, print_function, absolute_import
import os
from collections import OrderedDict
from itertools import combinations, chain
import sys
from six.moves import urllib
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from rdkit import RDLogger
from rdkit import Chem
from rdkit.Chem.AllChem import ConstrainedEmbed
from rdkit.Chem.rdForceFieldHelpers import UFFGetMoleculeForceField
from . import AtomListToSubMol
[docs]class SanitizeError(Exception):
pass
[docs]class SubstructureMatchError(Exception):
pass
[docs]class AddAtomsError(Exception):
pass
[docs]class FixerError(Exception):
pass
[docs]def MolToTemplates(mol):
"""Prepare set of templates for a given PDB residue."""
if mol.HasProp('_Name') and mol.GetProp('_Name') in ['DA', 'DG', 'DT', 'DC',
'A', 'G', 'T', 'C', 'U']:
backbone = 'OP(=O)(O)OC'
else:
backbone = 'OC(=O)CN'
match = mol.GetSubstructMatch(Chem.MolFromSmiles(backbone))
mol2 = Chem.RWMol(mol)
if match:
mol2.RemoveAtom(match[0])
Chem.SanitizeMol(mol2)
mol2 = mol2.GetMol()
return (mol, mol2)
[docs]def ReadTemplates(filename, resnames):
"""Load templates from file for specified residues"""
template_mols = {}
with open(filename) as f:
for line in f:
data = line.split()
# TODO: skip all residues that have 1 heavy atom
if data[1] in resnames and data[1] != 'HOH': # skip waters
res = Chem.MolFromSmiles(data[0])
res.SetProp('_Name', data[1]) # Needed for residue type lookup
template_mols[data[1]] = MolToTemplates(res)
return template_mols
[docs]def SimplifyMol(mol):
"""Change all bonds to single and discharge/dearomatize all atoms.
The molecule is modified in-place (no copy is made).
"""
for b in mol.GetBonds():
b.SetBondType(Chem.BondType.SINGLE)
b.SetIsAromatic(False)
for a in mol.GetAtoms():
a.SetFormalCharge(0)
a.SetIsAromatic(False)
return mol
[docs]def UFFConstrainedOptimize(mol, moving_atoms=None, fixed_atoms=None,
cutoff=5., verbose=False):
"""Minimize a molecule using UFF forcefield with a set of moving/fixed
atoms. If both moving and fixed atoms are provided, fixed_atoms parameter
will be ignored. The minimization is done in-place (without copying
molecule).
Parameters
----------
mol: rdkit.Chem.rdchem.Mol
Molecule to be minimized.
moving_atoms: array-like (default=None)
Indices of freely moving atoms. If None, fixed atoms are assigned
based on `fixed_atoms`. These two arguments are mutually exclusive.
fixed_atoms: array-like (default=None)
Indices of fixed atoms. If None, fixed atoms are assigned based on
`moving_atoms`. These two arguments are mutually exclusive.
cutoff: float (default=10.)
Distance cutoff for the UFF minimization
Returns
-------
mol: rdkit.Chem.rdchem.Mol
Molecule with mimimized `moving_atoms`
"""
logger = RDLogger.logger()
if not verbose:
logger.setLevel(RDLogger.CRITICAL)
if moving_atoms is None and fixed_atoms is None:
raise ValueError('You must supply at least one set of moving/fixed '
'atoms.')
all_atoms = set(range(mol.GetNumAtoms()))
if moving_atoms is None:
moving_atoms = list(all_atoms.difference(fixed_atoms))
else:
fixed_atoms = list(all_atoms.difference(moving_atoms))
# extract submolecules containing atoms within cutoff
mol_conf = mol.GetConformer(-1)
pos = np.array([mol_conf.GetAtomPosition(i)
for i in range(mol_conf.GetNumAtoms())])
mask = (cdist(pos, pos[moving_atoms]) <= cutoff).any(axis=1)
amap = np.where(mask)[0].tolist()
# expand to whole residues
pocket_residues = OrderedDict()
protein_residues = GetResidues(mol)
for res_id in protein_residues.keys():
if any(1 for res_aix in protein_residues[res_id]
if res_aix in amap):
pocket_residues[res_id] = protein_residues[res_id]
amap = list(chain(*pocket_residues.values()))
# TODO: above certain threshold its making a submolis redundant
submol = AtomListToSubMol(mol, amap, includeConformer=True)
# initialize ring info
Chem.GetSSSR(submol)
ff = UFFGetMoleculeForceField(submol, vdwThresh=cutoff,
ignoreInterfragInteractions=False)
for submol_id, atom_id in enumerate(amap):
if atom_id not in moving_atoms:
ff.AddFixedPoint(submol_id)
ff.Initialize()
ff.Minimize(energyTol=1e-4, forceTol=1e-3, maxIts=2000)
# get the positions backbone
conf = mol.GetConformer(-1)
submol_conf = submol.GetConformer(-1)
for submol_idx, mol_idx in enumerate(amap,):
conf.SetAtomPosition(mol_idx, submol_conf.GetAtomPosition(submol_idx))
# FIXME: there's no getLevel method, so we set to default level
if not verbose:
logger.setLevel(RDLogger.INFO)
return mol
[docs]def ExtractPocketAndLigand(mol, cutoff=12., expandResidues=True,
ligand_residue=None, ligand_residue_blacklist=None,
append_residues=None):
"""Function extracting a ligand (the largest HETATM residue) and the protein
pocket within certain cutoff. The selection of pocket atoms can be expanded
to contain whole residues. The single atom HETATM residues are attributed
to pocket (metals and waters)
Parameters
----------
mol: rdkit.Chem.rdchem.Mol
Molecule with a protein ligand complex
cutoff: float (default=12.)
Distance cutoff for the pocket atoms
expandResidues: bool (default=True)
Expand selection to whole residues within cutoff.
ligand_residue: string (default None)
Residue name which explicitly pint to a ligand(s).
ligand_residue_blacklist: array-like, optional (default None)
List of residues to ignore during ligand lookup.
append_residues: array-like, optional (default None)
List of residues to append to pocket, even if they are HETATM, such
as MSE, ATP, AMP, ADP, etc.
Returns
-------
pocket: rdkit.Chem.rdchem.RWMol
Pocket constructed of protein residues/atoms around ligand
ligand: rdkit.Chem.rdchem.RWMol
Largest HETATM residue contained in input molecule
"""
# Get heteroatom residues - connectivity still might be wrong, so GetFrags will fail
# Use OrderDict, so that A chain is prefered first over B if ligands are equal
hetatm_residues = OrderedDict()
protein_residues = OrderedDict()
for atom in mol.GetAtoms():
info = atom.GetPDBResidueInfo()
res_id = GetAtomResidueId(atom)
if info.GetIsHeteroAtom():
if res_id not in hetatm_residues:
hetatm_residues[res_id] = []
hetatm_residues[res_id].append(atom.GetIdx())
else:
if res_id not in protein_residues:
protein_residues[res_id] = []
protein_residues[res_id].append(atom.GetIdx())
# check if desired ligand residue is present
if ligand_residue is not None and ligand_residue not in hetatm_residues:
ValueError('Threre is no residue named "%s" in the protein file' %
ligand_residue)
for res_id in list(hetatm_residues.keys()): # exhaust keys since we modify
# Treat single atom residues (waters + metals) as pocket residues
# Also append listed residues to protein
if (len(hetatm_residues[res_id]) == 1 and res_id[1] != ligand_residue or
append_residues is not None and res_id[1] in append_residues):
protein_residues[res_id] = hetatm_residues[res_id]
del hetatm_residues[res_id]
# leave only the desired residues
elif ligand_residue is not None and res_id[1] != ligand_residue:
del hetatm_residues[res_id]
# remove blacklisted residues
elif (ligand_residue_blacklist is not None and
res_id[1] in ligand_residue_blacklist):
del hetatm_residues[res_id]
if len(hetatm_residues) == 0:
raise ValueError('No ligands')
# Take largest ligand
ligand_key = sorted(hetatm_residues, key=lambda x: len(hetatm_residues[x]),
reverse=True)[0]
ligand_amap = hetatm_residues[ligand_key]
ligand = AtomListToSubMol(mol, ligand_amap, includeConformer=True)
# we should use GetPositions() here, but it often leads to segfault (RDKit)
conf = ligand.GetConformer()
ligand_coords = np.array([conf.GetAtomPosition(i)
for i in range(ligand.GetNumAtoms())])
# Get protein and waters
blacklist_ids = list(chain(*hetatm_residues.values()))
protein_amap = np.array([i for i in range(mol.GetNumAtoms())
if i not in blacklist_ids])
# we should use GetPositions() here, but it often leads to segfault (RDKit)
conf = mol.GetConformer()
protein_coords = np.array([conf.GetAtomPosition(i)
for i in protein_amap.tolist()])
# Pocket selection based on cutoff
mask = (cdist(protein_coords, ligand_coords) <= cutoff).any(axis=1)
# IDs of atoms within cutoff
pocket_amap = protein_amap[np.where(mask)[0]].tolist()
# Expand pocket's residues
if expandResidues:
pocket_residues = OrderedDict()
for res_id in protein_residues.keys():
if any(1 for res_aix in protein_residues[res_id]
if res_aix in pocket_amap):
pocket_residues[res_id] = protein_residues[res_id]
pocket_amap = list(chain(*pocket_residues.values()))
# Create pocket mol, pocket_amap needs to be mapped to mol Idxs
pocket = AtomListToSubMol(mol, pocket_amap, includeConformer=True)
return pocket, ligand
[docs]def GetAtomResidueId(atom):
"""Return (residue number, residue name, chain id) for a given atom"""
info = atom.GetPDBResidueInfo()
res_id = (info.GetResidueNumber(), info.GetResidueName().strip(),
info.GetChainId())
return res_id
[docs]def GetResidues(mol, atom_list=None):
"""Create dictrionary that maps residues to atom IDs:
(res number, res name, chain id) --> [atom1 idx, atom2 idx, ...]
"""
residues = OrderedDict()
if atom_list is None:
atom_list = range(mol.GetNumAtoms())
for aid in atom_list:
res_id = GetAtomResidueId(mol.GetAtomWithIdx(aid))
if res_id not in residues:
residues[res_id] = []
residues[res_id].append(aid)
return residues
[docs]def PreparePDBResidue(protein, residue, amap, template):
"""
Parameters
----------
protein: rdkit.Chem.rdchem.RWMol
Mol with whole protein. Note that it is modified in place.
residue:
Mol with residue only
amap: list
List mapping atom IDs in residue to atom IDs in whole protein
(amap[i] = j means that i'th atom in residue corresponds to j'th
atom in protein)
template:
Residue template
Returns
-------
protein: rdkit.Chem.rdchem.RWMol
Modified protein
visited_bonds: list
Bonds that match the template
is_complete: bool
Indicates whether all atoms in template were found in residue
"""
visited_bonds = []
is_complete = False
# Catch residues which have less than 4 atoms (i.e. cannot have complete
# backbone), and template has more atoms than that, or residues with
# many missing atoms, which lead to low number of bonds (less than 3)
if ((len(amap) < 4 or residue.GetNumBonds() < 3) and
template.GetNumAtoms() > 4):
raise SubstructureMatchError('Residue has too few atoms (%i) to '
'properly assignbond orders.' % len(amap))
# modify copies instead of original molecules
template2 = Chem.Mol(template)
residue2 = Chem.Mol(residue)
# do the molecules match already?
match = residue2.GetSubstructMatch(template2)
if not match: # no, they don't match
residue2 = SimplifyMol(residue2)
template2 = SimplifyMol(template2)
# match is either tuple (if match was complete) or dict (if match
# was partial)
match = residue2.GetSubstructMatch(template2)
# try inverse match
if not match:
inverse_match = template.GetSubstructMatch(residue)
# if it failed try to match modified molecules (single bonds,
# no charges, no aromatic atoms)
if not inverse_match:
inverse_match = template2.GetSubstructMatch(residue2)
if inverse_match:
match = (dict(zip(inverse_match, range(len(inverse_match)))))
# do the molecules match now?
if match:
assert len(match) <= len(amap), \
'matching is bigger than amap for %s' \
'(%s / %s vs %s; %s atoms vs %s atoms)' % (
template.GetProp('_Name'),
Chem.MolToSmiles(template),
Chem.MolToSmiles(template2),
Chem.MolToSmiles(residue),
residue.GetNumAtoms(),
template.GetNumAtoms(),
)
# Convert matches to dict to support partial match, where keys
# are not complete sequence, as in full match.
if isinstance(match, (tuple, list)):
match = dict(zip(range(len(match)), match))
# apply matching: set bond properties
for (atom1, atom2), (refatom1, refatom2) in \
zip(combinations(match.values(), 2),
combinations(match.keys(), 2)):
b = template.GetBondBetweenAtoms(refatom1, refatom2)
b2 = protein.GetBondBetweenAtoms(amap[atom1], amap[atom2])
# remove extra bonds
if b is None:
if b2: # this bond is not there
protein.RemoveBond(amap[atom1], amap[atom2])
continue
# add missing bonds
if b2 is None:
protein.AddBond(amap[atom1], amap[atom2])
b2 = protein.GetBondBetweenAtoms(amap[atom1], amap[atom2])
# set bond properties
b2.SetBondType(b.GetBondType())
b2.SetIsAromatic(b.GetIsAromatic())
visited_bonds.append((amap[atom1], amap[atom2]))
# apply matching: set atom properties
for a in template.GetAtoms():
if a.GetIdx() not in match:
continue
a2 = protein.GetAtomWithIdx(amap[match[a.GetIdx()]])
a2.SetHybridization(a.GetHybridization())
# partial match may not close ring, so set aromacity only if
# atom is in ring
if a2.IsInRing():
a2.SetIsAromatic(a.GetIsAromatic())
# TODO: check for connected Hs
# n_hs = sum(n.GetAtomicNum() == 1 for n in a2.GetNeighbors())
a2.SetNumExplicitHs(a.GetNumExplicitHs())
a2.SetFormalCharge(a.GetFormalCharge())
# Update computed properties for an atom
a2.UpdatePropertyCache(strict=False)
if len(match) < template.GetNumAtoms():
# TODO: replace following with warning/logging
# Get atom map of fixed fragment
amap_frag = [amap[match[a.GetIdx()]]
for a in template.GetAtoms()
if a.GetIdx() in match]
info = protein.GetAtomWithIdx(amap_frag[0]).GetPDBResidueInfo()
print('Partial match. Probably incomplete sidechain.',
template.GetProp('_Name'),
Chem.MolToSmiles(template),
Chem.MolToSmiles(template2),
Chem.MolToSmiles(residue),
Chem.MolToSmiles(AtomListToSubMol(protein, amap_frag)),
info.GetResidueName(),
info.GetResidueNumber(),
info.GetChainId(),
sep='\t', file=sys.stderr)
else:
is_complete = True
else:
# most common missing sidechain AA
msg = 'No matching found'
raise SubstructureMatchError(msg,
template.GetProp('_Name'),
Chem.MolToSmiles(template),
Chem.MolToSmiles(template2),
Chem.MolToSmiles(residue))
return protein, visited_bonds, is_complete
[docs]def AddMissingAtoms(protein, residue, amap, template):
"""Add missing atoms to protein molecule only at the residue according to
template.
Parameters
----------
protein: rdkit.Chem.rdchem.RWMol
Mol with whole protein. Note that it is modified in place.
residue:
Mol with residue only
amap: list
List mapping atom IDs in residue to atom IDs in whole protein
(amap[i] = j means that i'th atom in residue corresponds to j'th
atom in protein)
template:
Residue template
Returns
-------
protein: rdkit.Chem.rdchem.RWMol
Modified protein
visited_bonds: list
Bonds that match the template
is_complete: bool
Indicates whether all atoms in template were found in residue
"""
# TODO: try to better guess the types of atoms (if possible)
# Catch residues which have less than 4 atoms (i.e. cannot have complete
# backbone), and template has more atoms than that, or residues with
# many missing atoms, which lead to low number of bonds (less than 3)
if ((len(amap) < 4 or residue.GetNumBonds() < 3) and
template.GetNumAtoms() > 4):
raise AddAtomsError('Residue has too few atoms (%i) to properly embed '
'residue conformer.' % len(amap))
# we need the match anyway and ConstrainedEmbed does not outputs it
matched_atoms = template.GetSubstructMatch(residue)
if matched_atoms: # instead of catching ValueError
try:
fixed_residue = ConstrainedEmbed(template, residue)
except ValueError:
raise AddAtomsError('Could not embed residue')
else:
residue2 = SimplifyMol(Chem.Mol(residue))
template2 = SimplifyMol(Chem.Mol(template))
matched_atoms = template2.GetSubstructMatch(residue2)
if matched_atoms:
try:
fixed_residue = ConstrainedEmbed(template2, residue2)
except ValueError:
raise AddAtomsError('Could not embed residue')
# copy coordinates to molecule with appropriate bond orders
fixed_residue2 = Chem.Mol(template)
fixed_residue2.RemoveAllConformers()
fixed_residue2.AddConformer(fixed_residue.GetConformer(-1))
fixed_residue = fixed_residue2
else:
raise SubstructureMatchError(
'No matching found at missing atom stage.',
template.GetProp('_Name'),
Chem.MolToSmiles(template),
Chem.MolToSmiles(residue))
new_atoms = []
new_amap = []
info = residue.GetAtomWithIdx(0).GetPDBResidueInfo()
protein_conformer = protein.GetConformer()
fixed_conformer = fixed_residue.GetConformer()
for i in range(fixed_residue.GetNumAtoms()):
if i not in matched_atoms:
atom = fixed_residue.GetAtomWithIdx(i)
# we need to generate atom names like 'H123', these are
# "wrapped around" below when setting 'atomName' to '3H12'
atom_symbol = atom.GetSymbol()
name = (atom_symbol + str(i)[:4-len(atom_symbol)]).ljust(4)
new_info = Chem.AtomPDBResidueInfo(
atomName=name[-1:] + name[:-1], # wrap around
residueName=info.GetResidueName(),
residueNumber=info.GetResidueNumber(),
chainId=info.GetChainId(),
insertionCode=info.GetInsertionCode(),
isHeteroAtom=info.GetIsHeteroAtom()
)
atom.SetMonomerInfo(new_info)
new_id = protein.AddAtom(atom)
new_atoms.append(new_id)
pos = fixed_conformer.GetAtomPosition(i)
protein_conformer.SetAtomPosition(new_id, pos)
new_amap.append(new_id)
else:
new_amap.append(amap[matched_atoms.index(i)])
# add bonds in separate loop (we need all atoms added before that)
for i in range(fixed_residue.GetNumAtoms()):
if i not in matched_atoms:
atom = fixed_residue.GetAtomWithIdx(i)
for n in atom.GetNeighbors():
ni = n.GetIdx()
bond = fixed_residue.GetBondBetweenAtoms(i, ni)
# for multiple missing atoms we may hit bonds multiple times
new_bond = protein.GetBondBetweenAtoms(new_amap[i],
new_amap[ni])
if new_bond is None:
protein.AddBond(new_amap[i], new_amap[ni])
new_bond = protein.GetBondBetweenAtoms(new_amap[i],
new_amap[ni])
new_bond.SetBondType(bond.GetBondType())
# if there are no new atoms raise an exception and dont go further
if len(new_atoms) == 0:
raise AddAtomsError
backbone_definitions = [
# Phosphodiester Bond
{'smarts': Chem.MolFromSmiles('O=P(O)OCC1OC(CC1O)'),
'atom_types': {0: 'OP1', 1: 'P', 2: 'OP2', 3: 'O5\'', 4: 'C5\'',
5: 'C4\'', 9: 'C3\'', 10: 'O3\''},
'bond_pair': ('O3\'', 'P')},
# Peptide Bond
{'smarts': Chem.MolFromSmiles('C(=O)CN'),
'atom_types': {0: 'C', 1: 'O', 2: 'CA', 3: 'N'},
'bond_pair': ('C', 'N')},
]
info = residue.GetAtomWithIdx(0).GetPDBResidueInfo()
res_num = info.GetResidueNumber()
res_chain = info.GetChainId()
for bond_def in backbone_definitions:
backbone_match = fixed_residue.GetSubstructMatch(bond_def['smarts'])
if backbone_match:
for i in new_atoms:
if new_amap.index(i) in backbone_match:
atom = protein.GetAtomWithIdx(i)
match_idx = backbone_match.index(new_amap.index(i))
if match_idx not in bond_def['atom_types']:
# if atom type is not defined we can skip that atom
continue
# Set atom label if present in backbone definition
match_type = bond_def['atom_types'][match_idx]
atom.GetPDBResidueInfo().SetName(' ' + match_type.ljust(3))
# define upstream and downstream bonds
bonds = zip([bond_def['bond_pair'],
reversed(bond_def['bond_pair'])],
[1, -1])
for (a1, a2), diff in bonds:
if match_type == a1:
limit = max(-1, protein.GetNumAtoms() * diff)
for j in range(amap[0], limit, diff):
info = (protein.GetAtomWithIdx(j)
.GetPDBResidueInfo())
res2_num = info.GetResidueNumber()
res2_chain = info.GetChainId()
if (res2_num == res_num + diff
and res_chain == res2_chain):
if info.GetName().strip() == a2:
protein.AddBond(i, j, Chem.BondType.SINGLE)
break
elif (abs(res2_num - res_num) > 1
or res_chain != res2_chain):
break
# run minimization just for this residue
protein = UFFConstrainedOptimize(protein, moving_atoms=new_atoms)
# run PreparePDBResidue to fix atom properies
out = PreparePDBResidue(protein, fixed_residue, new_amap, template)
return out + (new_atoms,)
[docs]def PreparePDBMol(mol,
removeHs=True,
removeHOHs=True,
residue_whitelist=None,
residue_blacklist=None,
remove_incomplete=False,
add_missing_atoms=False,
custom_templates=None,
replace_default_templates=False,
):
"""Prepares protein molecule by:
- Removing Hs by hard using atomic number [default=True]
- Removes HOH [default=True]
- Assign bond orders from smiles of PDB residues (over 24k templates)
- Removes bonds to metals
Parameters
----------
mol: rdkit.Chem.rdchem.Mol
Mol with whole protein.
removeHs: bool, optional (default True)
If True, hydrogens will be forcefully removed
removeHOHs: bool, optional (default True)
If True, remove waters using residue name
residue_whitelist: array-like, optional (default None)
List of residues to clean. If not specified, all residues
present in the structure will be used.
residue_blacklist: array-like, optional (default None)
List of residues to ignore during cleaning. If not specified,
all residues present in the structure will be cleaned.
remove_incomplete: bool, optional (default False)
If True, remove residues that do not fully match the template
add_missing_atoms: bool (default=False)
Switch to add missing atoms accordingly to template SMILES structure.
custom_templates: str or dict, optional (default None)
Custom templates for residues. Can be either path to SMILES file,
or dictionary mapping names to SMILES or Mol objects
replace_default_templates: bool, optional (default False)
Indicates whether default default templates should be replaced by
cusom ones. If False, default templates will be updated with custom
ones. This argument is ignored if custom_templates is None.
Returns
-------
new_mol: rdkit.Chem.rdchem.RWMol
Modified protein
"""
new_mol = Chem.RWMol(mol)
if removeHs:
for i in reversed(range(new_mol.GetNumAtoms())):
atom = new_mol.GetAtomWithIdx(i)
if atom.GetAtomicNum() == 1:
new_mol.RemoveAtom(i)
if removeHOHs:
for i in reversed(range(new_mol.GetNumAtoms())):
atom = new_mol.GetAtomWithIdx(i)
if atom.GetPDBResidueInfo().GetResidueName() == 'HOH':
new_mol.RemoveAtom(i)
# list of unique residues and their atom indices
unique_resname = set()
residues_atom_map = GetResidues(new_mol)
# create a list of residue mols with atom maps
residues = []
# residue_id == (res number, res name, chain id)
for residue_id, amap in residues_atom_map.items():
unique_resname.add(residue_id[1].strip())
# skip waters
if residue_id[1] != 'HOH':
res = AtomListToSubMol(new_mol, amap, includeConformer=True)
residues.append((residue_id, res, amap))
# load cutom templates
if custom_templates is not None:
if isinstance(custom_templates, str):
custom_mols = ReadTemplates(custom_templates, unique_resname)
elif isinstance(custom_templates, dict):
custom_mols = {}
for resname, structure in custom_templates.items():
if isinstance(structure, str):
structure = Chem.MolFromSmiles(structure)
structure.SetProp('_Name', resname)
custom_mols[resname] = MolToTemplates(structure)
else:
raise TypeError('custom_templates should be file name on dict,'
' %s was given' % type(custom_templates))
if custom_templates is None or not replace_default_templates:
filename = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'..', 'pdb_residue_templates.smi')
template_mols = ReadTemplates(filename, unique_resname)
else:
template_mols = {}
if custom_templates is not None:
if replace_default_templates:
template_mols = custom_mols
else:
template_mols.update(custom_mols)
# Deal with residue lists
if residue_whitelist is not None:
unique_resname = set(residue_whitelist)
if residue_blacklist is not None:
unique_resname = unique_resname.difference(set(residue_blacklist))
unique_resname = tuple(map(lambda x: x.strip().upper(), unique_resname))
# reset B.O. using templates
visited_bonds = []
new_atoms = []
atoms_to_del = []
for ((resnum, resname, chainid), residue, amap) in residues:
if resname not in unique_resname:
continue
if resname not in template_mols:
raise ValueError('There is no template for residue "%s"' % resname)
template_raw, template_chain = template_mols[resname]
if residue.GetNumAtoms() > template_chain.GetNumAtoms():
template = template_raw
else:
template = template_chain
bonds = []
atoms = []
complete_match = False
try:
new_mol, bonds, complete_match = PreparePDBResidue(new_mol,
residue,
amap,
template)
if add_missing_atoms and not complete_match:
new_mol, bonds, complete_match, atoms = AddMissingAtoms(new_mol,
residue,
amap,
template)
if atoms:
print('Added %i atoms on residue' % len(atoms), resnum,
resname, chainid, file=sys.stderr)
except SubstructureMatchError as e:
print(resnum, resname, chainid, e, file=sys.stderr)
except AddAtomsError as e:
print(resnum, resname, chainid, e, file=sys.stderr)
finally:
visited_bonds.extend(bonds)
if remove_incomplete and not complete_match:
atoms_to_del.extend(amap)
else:
new_atoms.extend(atoms)
# HACK: remove not-visited bonds
if visited_bonds: # probably we dont want to delete all
new_mol = Chem.RWMol(new_mol)
visited_bonds = set(visited_bonds)
bonds_queue = []
backbone_bonds = [] # a list of backbone bonds to re-check
for bond in new_mol.GetBonds():
a1 = bond.GetBeginAtomIdx()
a2 = bond.GetEndAtomIdx()
if (a1, a2) not in visited_bonds and (a2, a1) not in visited_bonds:
bonds_queue.append((a1, a2))
for a1_ix, a2_ix in bonds_queue:
a1 = new_mol.GetAtomWithIdx(a1_ix)
a2 = new_mol.GetAtomWithIdx(a2_ix)
# get residue number
a1_num = a1.GetPDBResidueInfo().GetResidueNumber()
a2_num = a2.GetPDBResidueInfo().GetResidueNumber()
# get PDB atom names
a1_name = a1.GetPDBResidueInfo().GetName().strip()
a2_name = a2.GetPDBResidueInfo().GetName().strip()
if a1.GetAtomicNum() > 1 and a2.GetAtomicNum() > 1:
# don't remove bonds between residues in backbone
# and sulphur bridges
if (((a1_name, a2_name) in {('C', 'N'), ('N', 'C'),
('P', 'O3\''), ('O3\'', 'P')} and
abs(a1_num - a2_num) == 1) or # peptide or DNA bond
(a1_name == 'SG' and a2_name == 'SG')): # sulphur bridge
backbone_bonds.append((a1_ix, a2_ix))
else:
new_mol.RemoveBond(a1_ix, a2_ix)
else:
pass
# minimize new atoms
if new_atoms:
old_new_mol = Chem.RWMol(new_mol)
Chem.GetSSSR(new_mol) # we need to update ring info
new_mol = UFFConstrainedOptimize(new_mol, moving_atoms=new_atoms)
print('RMS after minimization of added atoms (%i):' % len(new_atoms),
Chem.rdMolAlign.AlignMol(new_mol, old_new_mol),
file=sys.stderr)
# remove all peptide, phosphodiester and sulfur bonds which are to long (<4A)
if visited_bonds and bonds_queue:
conf = new_mol.GetConformer(-1)
for a1_ix, a2_ix in backbone_bonds:
if np.linalg.norm(conf.GetAtomPosition(a1_ix) -
conf.GetAtomPosition(a2_ix)) > 4: # np.array
new_mol.RemoveBond(a1_ix, a2_ix)
# check if new bonds have reasonable lengths
new_bonds = set(chain(*(new_mol.GetAtomWithIdx(a).GetBonds()
for a in new_atoms)))
conformer = new_mol.GetConformer()
for bond in new_bonds:
a1 = bond.GetBeginAtomIdx()
a2 = bond.GetEndAtomIdx()
bond_length = np.linalg.norm(conformer.GetAtomPosition(a1) -
conformer.GetAtomPosition(a2))
if bond_length > 3.0:
res1 = '{1}{0}.{2}'.format(*GetAtomResidueId(new_mol.GetAtomWithIdx(a1)))
res2 = '{1}{0}.{2}'.format(*GetAtomResidueId(new_mol.GetAtomWithIdx(a2)))
raise FixerError('Cannot fix the structure. Bond between atoms '
'%s (%s) and %s (%s) is too long.'
% (a1, res1, a2, res2))
# index change here
if atoms_to_del:
new_mol = Chem.RWMol(new_mol)
for idx in sorted(atoms_to_del, reverse=True):
new_mol.RemoveAtom(idx)
# if missing atoms were added we need to renumber them
if add_missing_atoms and new_atoms:
def atom_reorder_repr(i):
"""Generate keys for each atom during sort"""
atom = new_mol.GetAtomWithIdx(i)
info = atom.GetPDBResidueInfo()
return (info.GetChainId(), info.GetResidueNumber(), i)
order = list(range(new_mol.GetNumAtoms()))
new_order = sorted(order, key=atom_reorder_repr)
Chem.GetSSSR(new_mol)
new_mol = Chem.RenumberAtoms(new_mol, new_order)
# highlight added atoms, but need to get their new idx first
new_mol.__sssAtoms = [new_i for new_i, i in enumerate(new_order)
if i in new_atoms]
return new_mol
[docs]def FetchAffinityTable(pdbids, affinity_types):
"""Fetch affinity data from RCSB PDB server.
Parameters
----------
pdbids: array-like
List of PDB IDs of structres with protein-ligand complexes.
affinity_types: array-like
List of types of affinity data to retrieve. Available types are:
Ki, Kd, EC50, IC50, deltaG, deltaH, deltaS, Ka.
Returns
-------
ligand_affinity: pd.DataFrame
Table with protein-ligand binding affinities. Table contains
following columns: structureId, ligandId, ligandFormula,
ligandMolecularWeight + columns named after affinity types
specified byt the user.
"""
ids_string = ','.join(pdbids)
pdb_report_url = ('https://www.rcsb.org/pdb/rest/customReport.csv?'
'pdbids=%s&reportName=%s&service=wsfile&format=csv')
# get table with ligands
ligands = pd.read_csv(pdb_report_url % (ids_string, 'Ligands'))
ligands = ligands.dropna(subset=['structureId', 'ligandId'])
# get table with binding affinites
affinity = pd.read_csv(pdb_report_url % (ids_string, 'BindingAffinity'))
affinity = affinity.rename(columns={'hetId': 'ligandId'})
# inner join of two tables - all ligands with known affinities
ligand_affinity = (
pd.merge(ligands, affinity, sort=False)
.drop_duplicates(subset=['structureId', 'ligandId'])
.dropna(subset=affinity_types, how='all')
.fillna('')
)
# remove comments from columns with affinity data
for affinity_type in affinity_types:
ligand_affinity[affinity_type] = (
ligand_affinity[affinity_type]
.str
.split(' ', expand=True)[0]
)
columns = ['structureId', 'ligandId', 'ligandFormula',
'ligandMolecularWeight'] + affinity_types
return ligand_affinity[columns]
[docs]def FetchStructure(pdbid, sanitize=False, removeHs=True, cache_dir=None):
"""Fetch the structure in PDB format from RCSB PDB server and read it with
rdkit.
Parameters
----------
pdbid: str
PDB IDs of the structre
sanitize: bool, optional (default False)
Toggles molecule sanitation
removeHs: bool, optional (default False)
Indicates wheter Hs should be removed during reading
Returns
-------
mol: Chem.rdchem.Mol
Retrieved molecule
"""
if cache_dir is not None:
structure_dir = os.path.join(cache_dir, pdbid)
structure_path = os.path.join(structure_dir, '%s.pdb' % pdbid)
if not os.path.isdir(cache_dir):
os.makedirs(cache_dir)
if not os.path.isdir(structure_dir):
os.makedirs(structure_dir)
if os.path.isfile(structure_path):
mol = Chem.MolFromPDBFile(structure_path, sanitize=sanitize,
removeHs=removeHs)
return mol
req = urllib.request.Request('https://files.rcsb.org/view/%s.pdb' % pdbid)
response = urllib.request.urlopen(req)
pdb_block = response.read().decode('utf-8')
mol = Chem.MolFromPDBBlock(pdb_block, sanitize=sanitize, removeHs=removeHs)
if cache_dir is not None:
with open(structure_path, 'w') as f:
f.write(pdb_block)
return mol
[docs]def IsResidueConnected(mol, atom_ids):
"""Check if residue with given atom IDs is connected to other residues
in the molecule.
"""
residues = set(GetResidues(mol, atom_ids))
if len(residues) > 1:
raise ValueError('Atoms belong to multiple residues:' + str(residues))
residue = residues.pop()
to_check = set(atom_ids)
visited_atoms = set()
while len(to_check) > 0:
aid = to_check.pop()
visited_atoms.add(aid)
atom = mol.GetAtomWithIdx(aid)
for atom2 in atom.GetNeighbors():
if atom2.GetIdx() in visited_atoms:
continue
if residue != GetAtomResidueId(atom2):
# we got to different residue so it is connected
return True
else:
to_check.add(atom2.GetIdx())
return False
[docs]def PrepareComplexes(pdbids, pocket_dist_cutoff=12., affinity_types=None,
cache_dir=None):
"""Fetch structures and affinity data from RCSB PDB server and prepare
ligand-pocket pairs for small molecules with known activites.
Parameters
----------
pdbids: array-like
List of PDB IDs of structres with protein-ligand complexes.
pocket_dist_cutoff: float, optional (default 12.)
Distance cutoff for the pocket atoms
affinity_types: array-like, optional (default None)
List of types of affinity data to retrieve. Available types are:
Ki, Kd, EC50, IC50, deltaG, deltaH, deltaS, Ka. If not specified
Ki, Kd, EC50, and IC50 are used.
Returns
-------
complexes: dict
Dictionary with pocket-ligand paris, structured as follows:
{'pdbid': {'ligid': (pocket_mol, ligand_mol)}. Ligands have
binding affinity data stored as properties.
"""
if affinity_types is None:
affinity_types = ['Ki', 'Kd', 'EC50', 'IC50']
affinity_table = FetchAffinityTable(pdbids, affinity_types)
complexes = {}
for pdbid, tab in affinity_table.groupby('structureId'):
complexes[pdbid] = {}
complex_mol = FetchStructure(pdbid, cache_dir=cache_dir)
# we need to use fixer with rdkit < 2018
complex_mol = PreparePDBMol(complex_mol)
ligand_atoms = {res_name: {} for res_name in tab['ligandId']}
for atom in complex_mol.GetAtoms():
info = atom.GetPDBResidueInfo()
res_name = info.GetResidueName().strip()
if res_name not in ligand_atoms:
continue
res_id = (info.GetResidueNumber(), info.GetChainId())
if res_id not in ligand_atoms[res_name]:
ligand_atoms[res_name][res_id] = []
ligand_atoms[res_name][res_id].append(atom.GetIdx())
proper_ligands = []
for res_name, atoms_ids in ligand_atoms.items():
# ligand shouldn't be connected to other residues
if not any(IsResidueConnected(complex_mol, atom_list)
for atom_list in atoms_ids.values()):
proper_ligands.append(res_name)
for res_name in proper_ligands:
try:
pocket, ligand = ExtractPocketAndLigand(
complex_mol,
cutoff=pocket_dist_cutoff,
ligand_residue=res_name)
except Exception:
print('Cant get pocket and ligand for %s and %s'
% (pdbid, res_name))
continue
# prepare the pocket
# TODO: add missing atoms
pocket = PreparePDBMol(pocket)
flag = Chem.SanitizeMol(pocket)
if flag != Chem.SanitizeFlags.SANITIZE_NONE:
raise SanitizeError('Cannot sanitize pocket for %s and %s'
% (pdbid, res_name))
flag = Chem.SanitizeMol(ligand)
if flag != Chem.SanitizeFlags.SANITIZE_NONE:
raise SanitizeError('Cannot sanitize ligand for %s and %s'
% (pdbid, res_name))
affinity_values = (
tab
[tab['ligandId'] == res_name]
[affinity_types]
.iloc[0]
)
for affinity_type, value in zip(affinity_types, affinity_values):
if len(value) == 0:
continue
# parse values like ">1000" or "0.5-0.8"
value = [float(v.strip('<>~')) for v in value.split('-')]
if len(value) == 1:
value = value[0]
else:
# it's range, use its middle
assert len(value) == 2
value = sum(value) / 2
ligand.SetProp(affinity_type, str(value))
complexes[pdbid][res_name] = (pocket, ligand)
return complexes