Source code for oddt.scoring.descriptors.binana

"""Internal implementation of binana software
(http://nbcr.ucsd.edu/data/sw/hosted/binana/)

"""

import numpy as np
from oddt.scoring.descriptors import (atoms_by_type,
                                      close_contacts_descriptor,
                                      oddt_vina_descriptor)
from oddt.interactions import (close_contacts, hbonds, hydrophobic_contacts,
                               pi_cation, pi_stacking, salt_bridges)


[docs]class binana_descriptor(object): def __init__(self, protein=None): """ Descriptor build from binana script (as used in NNScore 2.0 Parameters ---------- protein: oddt.toolkit.Molecule object (default=None) Protein object to be used while generating descriptors. """ self.protein = protein self.titles = [] self.vina = oddt_vina_descriptor(protein, vina_scores=['vina_gauss1', 'vina_gauss2', 'vina_repulsion', 'vina_hydrophobic', 'vina_hydrogen']) self.titles += self.vina.titles # Close contacts descriptor generators cc_4_types = (('A', 'A'), ('A', 'C'), ('A', 'CL'), ('A', 'F'), ('A', 'FE'), ('A', 'HD'), ('A', 'MG'), ('A', 'MN'), ('A', 'N'), ('A', 'NA'), ('A', 'OA'), ('A', 'SA'), ('A', 'ZN'), ('BR', 'C'), ('BR', 'HD'), ('BR', 'OA'), ('C', 'C'), ('C', 'CL'), ('C', 'F'), ('C', 'HD'), ('C', 'MG'), ('C', 'MN'), ('C', 'N'), ('C', 'NA'), ('C', 'OA'), ('C', 'SA'), ('C', 'ZN'), ('CL', 'FE'), ('CL', 'HD'), ('CL', 'MG'), ('CL', 'N'), ('CL', 'OA'), ('CL', 'ZN'), ('F', 'HD'), ('F', 'N'), ('F', 'OA'), ('F', 'SA'), ('FE', 'HD'), ('FE', 'N'), ('FE', 'OA'), ('HD', 'HD'), ('HD', 'I'), ('HD', 'MG'), ('HD', 'MN'), ('HD', 'N'), ('HD', 'NA'), ('HD', 'OA'), ('HD', 'P'), ('HD', 'S'), ('HD', 'SA'), ('HD', 'ZN'), ('MG', 'NA'), ('MG', 'OA'), ('MN', 'N'), ('MN', 'OA'), ('N', 'N'), ('N', 'NA'), ('N', 'OA'), ('N', 'SA'), ('N', 'ZN'), ('NA', 'OA'), ('NA', 'SA'), ('NA', 'ZN'), ('OA', 'OA'), ('OA', 'SA'), ('OA', 'ZN'), ('S', 'ZN'), ('SA', 'ZN'), ('A', 'BR'), ('A', 'I'), ('A', 'P'), ('A', 'S'), ('BR', 'N'), ('BR', 'SA'), ('C', 'FE'), ('C', 'I'), ('C', 'P'), ('C', 'S'), ('CL', 'MN'), ('CL', 'NA'), ('CL', 'P'), ('CL', 'S'), ('CL', 'SA'), ('CU', 'HD'), ('CU', 'N'), ('FE', 'NA'), ('FE', 'SA'), ('I', 'N'), ('I', 'OA'), ('MG', 'N'), ('MG', 'P'), ('MG', 'S'), ('MG', 'SA'), ('MN', 'NA'), ('MN', 'P'), ('MN', 'S'), ('MN', 'SA'), ('N', 'P'), ('N', 'S'), ('NA', 'P'), ('NA', 'S'), ('OA', 'P'), ('OA', 'S'), ('P', 'S'), ('P', 'SA'), ('P', 'ZN'), ('S', 'SA'), ('SA', 'SA'), ('A', 'CU'), ('C', 'CD')) cc_4_rec_types, cc_4_lig_types = zip(*cc_4_types) self.titles += ['cc_%s.%s_4' % (t1, t2) for t1, t2 in cc_4_types] self.cc_4 = close_contacts_descriptor(protein, cutoff=4, protein_types=cc_4_rec_types, ligand_types=cc_4_lig_types, mode='atom_types_ad4', aligned_pairs=True) self.ele_types = (('A', 'A'), ('A', 'C'), ('A', 'CL'), ('A', 'F'), ('A', 'FE'), ('A', 'HD'), ('A', 'MG'), ('A', 'MN'), ('A', 'N'), ('A', 'NA'), ('A', 'OA'), ('A', 'SA'), ('A', 'ZN'), ('BR', 'C'), ('BR', 'HD'), ('BR', 'OA'), ('C', 'C'), ('C', 'CL'), ('C', 'F'), ('C', 'HD'), ('C', 'MG'), ('C', 'MN'), ('C', 'N'), ('C', 'NA'), ('C', 'OA'), ('C', 'SA'), ('C', 'ZN'), ('CL', 'FE'), ('CL', 'HD'), ('CL', 'MG'), ('CL', 'N'), ('CL', 'OA'), ('CL', 'ZN'), ('F', 'HD'), ('F', 'N'), ('F', 'OA'), ('F', 'SA'), ('F', 'ZN'), ('FE', 'HD'), ('FE', 'N'), ('FE', 'OA'), ('HD', 'HD'), ('HD', 'I'), ('HD', 'MG'), ('HD', 'MN'), ('HD', 'N'), ('HD', 'NA'), ('HD', 'OA'), ('HD', 'P'), ('HD', 'S'), ('HD', 'SA'), ('HD', 'ZN'), ('MG', 'NA'), ('MG', 'OA'), ('MN', 'N'), ('MN', 'OA'), ('N', 'N'), ('N', 'NA'), ('N', 'OA'), ('N', 'SA'), ('N', 'ZN'), ('NA', 'OA'), ('NA', 'SA'), ('NA', 'ZN'), ('OA', 'OA'), ('OA', 'SA'), ('OA', 'ZN'), ('S', 'ZN'), ('SA', 'ZN'), ('A', 'BR'), ('A', 'I'), ('A', 'P'), ('A', 'S'), ('BR', 'N'), ('BR', 'SA'), ('C', 'FE'), ('C', 'I'), ('C', 'P'), ('C', 'S'), ('CL', 'MN'), ('CL', 'NA'), ('CL', 'P'), ('CL', 'S'), ('CL', 'SA'), ('CU', 'HD'), ('CU', 'N'), ('FE', 'NA'), ('FE', 'SA'), ('I', 'N'), ('I', 'OA'), ('MG', 'N'), ('MG', 'P'), ('MG', 'S'), ('MG', 'SA'), ('MN', 'NA'), ('MN', 'P'), ('MN', 'S'), ('MN', 'SA'), ('N', 'P'), ('N', 'S'), ('NA', 'P'), ('NA', 'S'), ('OA', 'P'), ('OA', 'S'), ('P', 'S'), ('P', 'SA'), ('P', 'ZN'), ('S', 'SA'), ('SA', 'SA')) self.titles += ['ele_%s.%s_4' % (t1, t2) for t1, t2 in self.ele_types] self.ligand_atom_types = ['A', 'BR', 'C', 'CL', 'F', 'HD', 'I', 'N', 'NA', 'OA', 'P', 'S', 'SA'] self.titles += ['lig_%s' % t1 for t1 in self.ligand_atom_types] cc_25_types = [('A', 'A'), ('A', 'C'), ('A', 'CL'), ('A', 'F'), ('A', 'FE'), ('A', 'HD'), ('A', 'MG'), ('A', 'MN'), ('A', 'N'), ('A', 'NA'), ('A', 'OA'), ('A', 'SA'), ('A', 'ZN'), ('BR', 'C'), ('BR', 'HD'), ('BR', 'OA'), ('C', 'C'), ('C', 'CL'), ('C', 'F'), ('C', 'HD'), ('C', 'MG'), ('C', 'MN'), ('C', 'N'), ('C', 'NA'), ('C', 'OA'), ('C', 'SA'), ('C', 'ZN'), ('CD', 'OA'), ('CL', 'FE'), ('CL', 'HD'), ('CL', 'MG'), ('CL', 'N'), ('CL', 'OA'), ('CL', 'ZN'), ('F', 'HD'), ('F', 'N'), ('F', 'OA'), ('F', 'SA'), ('F', 'ZN'), ('FE', 'HD'), ('FE', 'N'), ('FE', 'OA'), ('HD', 'HD'), ('HD', 'I'), ('HD', 'MG'), ('HD', 'MN'), ('HD', 'N'), ('HD', 'NA'), ('HD', 'OA'), ('HD', 'P'), ('HD', 'S'), ('HD', 'SA'), ('HD', 'ZN'), ('MG', 'NA'), ('MG', 'OA'), ('MN', 'N'), ('MN', 'OA'), ('N', 'N'), ('N', 'NA'), ('N', 'OA'), ('N', 'SA'), ('N', 'ZN'), ('NA', 'OA'), ('NA', 'SA'), ('NA', 'ZN'), ('OA', 'OA'), ('OA', 'SA'), ('OA', 'ZN'), ('S', 'ZN'), ('SA', 'ZN')] cc_25_rec_types, cc_25_lig_types = zip(*cc_25_types) self.cc_25 = close_contacts_descriptor(protein, cutoff=2.5, protein_types=cc_25_rec_types, ligand_types=cc_25_lig_types, mode='atom_types_ad4', aligned_pairs=True) self.titles += ['cc_%s.%s_2.5' % (t1, t2) for t1, t2 in cc_25_types] # H-Bonds (<4A) self.titles += ['hb_4_mol_backbone_alpha', 'hb_4_mol_backbone_beta', 'hb_4_mol_backbone_other', 'hb_4_mol_sidechain_alpha', 'hb_4_mol_sidechain_beta', 'hb_4_mol_sidechain_other', 'hb_4_rec_backbone_alpha', 'hb_4_rec_backbone_beta', 'hb_4_rec_backbone_other', 'hb_4_rec_sidechain_alpha', 'hb_4_rec_sidechain_beta', 'hb_4_rec_sidechain_other'] # Hydrophobic Contact <4A self.titles += ['hyd_4_backbone_alpha', 'hyd_4_backbone_beta', 'hyd_4_backbone_other', 'hyd_4_sidechain_alpha', 'hyd_4_sidechain_beta', 'hyd_4_sidechain_other', 'hyd_4_all'] # Pi-stacking (<7.5A) self.titles += ['pi_stack_7.5_alpha', 'pi_stack_7.5_beta', 'pi_stack_7.5_other'] # T-shaped Pi-Pi interaction self.titles += ['pi_t_7.5_alpha', 'pi_t_7.5_beta', 'pi_t_7.5_other'] # Pi-cation (<6A) self.titles += ['pi_cat_mol_6_alpha', 'pi_cat_mol_6_beta', 'pi_cat_mol_6_other', 'pi_cat_rec_6_alpha', 'pi_cat_rec_6_beta', 'pi_cat_rec_6_other'] # Active site flexibility (<4A) self.titles += ['as_flex_backbone_alpha', 'as_flex_backbone_beta', 'as_flex_backbone_other', 'as_flex_sidechain_alpha', 'as_flex_sidechain_beta', 'as_flex_sidechain_other', 'as_flex_all'] # Salt bridges (<5.5) self.titles += ['salt_bridge_5.5_alpha', 'salt_bridge_5.5_beta', 'salt_bridge_5.5_other', 'salt_bridge_5.5_all'] # Rotatable bonds self.titles += ['num_rotors'] assert len(self.titles) == len(self)
[docs] def set_protein(self, protein): """ One function to change all relevant proteins Parameters ---------- protein: oddt.toolkit.Molecule object Protein object to be used while generating descriptors. Protein becomes new global and default protein. """ self.protein = protein self.vina.set_protein(protein) self.cc_4.protein = protein self.cc_25.protein = protein
[docs] def build(self, ligands, protein=None): """ Descriptor building method Parameters ---------- ligands: array-like An array of generator of oddt.toolkit.Molecule objects for which the descriptor is computed protein: oddt.toolkit.Molecule object (default=None) Protein object to be used while generating descriptors. If none, then the default protein (from constructor) is used. Otherwise, protein becomes new global and default protein. Returns ------- descs: numpy array, shape=[n_samples, 351] An array of binana descriptors, aligned with input ligands """ if protein: self.set_protein(protein) else: protein = self.protein protein_dict = protein.atom_dict desc = None for mol in ligands: mol_dict = mol.atom_dict vec = np.array([], dtype=float) vec = tuple() # Vina # TODO: Asynchronous output from vina, push command to score and retrieve at the end? # TODO: Check if ligand has vina scores vec += tuple(self.vina.build(mol).flatten()) # Close Contacts (<4A) vec += tuple(self.cc_4.build(mol).flatten()) # Electrostatics (<4A) ele_rec_types, ele_lig_types = zip(*self.ele_types) ele_mol_atoms = atoms_by_type(mol_dict, ele_lig_types, 'atom_types_ad4') ele_rec_atoms = atoms_by_type(protein_dict, ele_rec_types, 'atom_types_ad4') ele = tuple() for r_t, m_t in self.ele_types: mol_ele_dict, rec_ele_dict = close_contacts(ele_mol_atoms[m_t], ele_rec_atoms[r_t], 4) if len(mol_ele_dict) and len(rec_ele_dict): ele += (mol_ele_dict['charge'] * rec_ele_dict['charge'] / np.sqrt((mol_ele_dict['coords'] - rec_ele_dict['coords'])**2).sum(axis=-1) * 138.94238460104697e4).sum(), # convert to J/mol else: ele += 0, vec += tuple(np.nan_to_num(ele)) # Ligand Atom Types atoms = atoms_by_type(mol_dict, self.ligand_atom_types, 'atom_types_ad4') vec += tuple([len(atoms[t]) for t in self.ligand_atom_types]) # Close Contacts (<2.5A) vec += tuple(self.cc_25.build(mol).flatten()) # H-Bonds (<4A) hbond_mol, hbond_rec, strict = hbonds(mol, protein, 4) # Retain only strict hbonds hbond_mol = hbond_mol[strict] hbond_rec = hbond_rec[strict] backbone = hbond_rec['isbackbone'] alpha = hbond_rec['isalpha'] beta = hbond_rec['isbeta'] other = ~alpha & ~beta donor_mol = hbond_mol['isdonor'] donor_rec = hbond_rec['isdonor'] hbond_vec = ((donor_mol & backbone & alpha).sum(), (donor_mol & backbone & beta).sum(), (donor_mol & backbone & other).sum(), (donor_mol & ~backbone & alpha).sum(), (donor_mol & ~backbone & beta).sum(), (donor_mol & ~backbone & other).sum(), (donor_rec & backbone & alpha).sum(), (donor_rec & backbone & beta).sum(), (donor_rec & backbone & other).sum(), (donor_rec & ~backbone & alpha).sum(), (donor_rec & ~backbone & beta).sum(), (donor_rec & ~backbone & other).sum()) vec += tuple(hbond_vec) # Hydrophobic contacts (<4A) hydrophobic = hydrophobic_contacts(mol, protein, 4)[1] backbone = hydrophobic['isbackbone'] alpha = hydrophobic['isalpha'] beta = hydrophobic['isbeta'] other = ~alpha & ~beta hyd_vec = ((backbone & alpha).sum(), (backbone & beta).sum(), (backbone & other).sum(), (~backbone & alpha).sum(), (~backbone & beta).sum(), (~backbone & other).sum(), len(hydrophobic)) vec += tuple(hyd_vec) # Pi-stacking (<7.5A) pi_mol, pi_rec, pi_paralel, pi_tshaped = pi_stacking(mol, protein, 7.5) alpha = pi_rec['isalpha'] & pi_paralel beta = pi_rec['isbeta'] & pi_paralel other = ~alpha & ~beta & pi_paralel pi_vec = (alpha.sum(), beta.sum(), other.sum()) vec += tuple(pi_vec) # T-shaped Pi-Pi interaction alpha = pi_rec['isalpha'] & pi_tshaped beta = pi_rec['isbeta'] & pi_tshaped other = ~alpha & ~beta & pi_tshaped pi_t_vec = (alpha.sum(), beta.sum(), other.sum()) # Pi-cation (<6A) pi_rec, cat_mol, strict = pi_cation(protein, mol, 6) alpha = pi_rec['isalpha'] & strict beta = pi_rec['isbeta'] & strict other = ~alpha & ~beta & strict pi_cat_vec = (alpha.sum(), beta.sum(), other.sum()) pi_mol, cat_rec, strict = pi_cation(mol, protein, 6) alpha = cat_rec['isalpha'] & strict beta = cat_rec['isbeta'] & strict other = ~alpha & ~beta & strict pi_cat_vec += (alpha.sum(), beta.sum(), other.sum()) vec += tuple(pi_cat_vec) # T-shape (perpendicular Pi's) (<7.5A) vec += tuple(pi_t_vec) # Active site flexibility (<4A) acitve_site = close_contacts(mol_dict[mol_dict['atomicnum'] != 1], protein_dict[protein_dict['atomicnum'] != 1], cutoff=4)[1] backbone = acitve_site['isbackbone'] alpha = acitve_site['isalpha'] beta = acitve_site['isbeta'] other = ~alpha & ~beta as_flex = ((backbone & alpha).sum(), (backbone & beta).sum(), (backbone & other).sum(), (~backbone & alpha).sum(), (~backbone & beta).sum(), (~backbone & other).sum(), len(acitve_site)) vec += tuple(as_flex) # Salt bridges (<5.5) salt_bridge_dict = salt_bridges(mol, protein, 5.5)[1] vec += (salt_bridge_dict['isalpha'].sum(), salt_bridge_dict['isbeta'].sum(), (~salt_bridge_dict['isalpha'] & ~salt_bridge_dict['isbeta']).sum(), len(salt_bridge_dict)) # Rotatable bonds vec += mol.num_rotors, if desc is None: desc = np.zeros(len(vec), dtype=float) desc = np.vstack((desc, np.array(vec, dtype=float))) return desc[1:]
def __len__(self): """ Returns the dimensions of descriptors """ return 350 def __reduce__(self): return binana_descriptor, (self.protein,)