Source code for oddt.shape

from __future__ import print_function
import sys
import numpy as np
from numpy.linalg import norm
from scipy.stats import moment
from scipy.special import cbrt


[docs]def common_usr(molecule, ctd=None, cst=None, fct=None, ftf=None, atoms_type=None): """Function used in USR and USRCAT function Parameters ---------- molecule : oddt.toolkit.Molecule Molecule to compute USR shape descriptor ctd : numpy array or None (default = None) Coordinates of the molecular centroid If 'None', the point is calculated cst : numpy array or None (default = None) Coordinates of the closest atom to the molecular centroid If 'None', the point is calculated fct : numpy array or None (default = None) Coordinates of the farthest atom to the molecular centroid If 'None', the point is calculated ftf : numpy array or None (default = None) Coordinates of the farthest atom to the farthest atom to the molecular centroid If 'None', the point is calculated atoms_type : str or None (default None) Type of atoms to be selected from atom_dict If 'None', all atoms are used to calculate shape descriptor Returns ------- shape_descriptor : numpy array, shape = (12) Array describing shape of molecule """ if atoms_type is None: atoms = molecule.atom_dict['coords'] else: if atoms_type == 'ishydrophobe': mask = (molecule.atom_dict['ishalogen'] | molecule.atom_dict['ishydrophobe'] | (molecule.atom_dict['atomicnum'] == 16)) else: mask = molecule.atom_dict[atoms_type] atoms = molecule.atom_dict[mask]['coords'] if len(atoms) == 0: return np.zeros(12), ((0., 0., 0.),) * 4 if ctd is None: ctd = atoms.mean(0) distances_ctd = norm(atoms - ctd, axis=1) if cst is None: cst = atoms[distances_ctd.argmin()] distances_cst = norm(atoms - cst, axis=1) if fct is None: fct = atoms[distances_ctd.argmax()] distances_fct = norm(atoms - fct, axis=1) if ftf is None: ftf = atoms[distances_fct.argmax()] distances_ftf = norm(atoms - ftf, axis=1) distances_list = [distances_ctd, distances_cst, distances_fct, distances_ftf] shape_descriptor = np.zeros(12) for i, distances in enumerate(distances_list): shape_descriptor[i * 3 + 0] = np.mean(distances) shape_descriptor[i * 3 + 1] = np.var(distances) shape_descriptor[i * 3 + 2] = moment(distances, moment=3) return shape_descriptor, (ctd, cst, fct, ftf)
[docs]def usr(molecule): """Computes USR shape descriptor based on Ballester PJ, Richards WG (2007). Ultrafast shape recognition to search compound databases for similar molecular shapes. Journal of computational chemistry, 28(10):1711-23. http://dx.doi.org/10.1002/jcc.20681 Parameters ---------- molecule : oddt.toolkit.Molecule Molecule to compute USR shape descriptor Returns ------- shape_descriptor : numpy array, shape = (12) Array describing shape of molecule """ return common_usr(molecule)[0]
[docs]def usr_cat(molecule): """Computes USRCAT shape descriptor based on Adrian M Schreyer, Tom Blundell (2012). USRCAT: real-time ultrafast shape recognition with pharmacophoric constraints. Journal of Cheminformatics, 2012 4:27. http://dx.doi.org/10.1186/1758-2946-4-27 Parameters ---------- molecule : oddt.toolkit.Molecule Molecule to compute USRCAT shape descriptor Returns ------- shape_descriptor : numpy array, shape = (60) Array describing shape of molecule """ all_atoms_shape, points = common_usr(molecule) ctd, cst, fct, ftf = points hydrophobic_shape = common_usr( molecule, ctd, cst, fct, ftf, 'ishydrophobe')[0] aromatic_shape = common_usr(molecule, ctd, cst, fct, ftf, 'isaromatic')[0] acceptor_shape = common_usr(molecule, ctd, cst, fct, ftf, 'isacceptor')[0] donor_shape = common_usr(molecule, ctd, cst, fct, ftf, 'isdonor')[0] cat_shape = np.hstack((all_atoms_shape, hydrophobic_shape, aromatic_shape, acceptor_shape, donor_shape)) return np.nan_to_num(cat_shape)
[docs]def electroshape(mol): """Computes shape descriptor based on Armstrong, M. S. et al. ElectroShape: fast molecular similarity calculations incorporating shape, chirality and electrostatics. J Comput Aided Mol Des 24, 789-801 (2010). http://dx.doi.org/doi:10.1007/s10822-010-9374-0 Aside from spatial coordinates, atoms' charges are also used as the fourth dimension to describe shape of the molecule. Parameters ---------- mol : oddt.toolkit.Molecule Returns ------- shape_descriptor : numpy array, shape = (15) Array describing shape of molecule """ if (mol.atom_dict['coords'] == 0).all(): raise Exception('Molecule needs 3D coordinates') if np.isnan(mol.atom_dict['charge']).any(): print('Nan values in charge values of molecule ' + mol.title, file=sys.stderr) charge = np.nan_to_num(mol.atom_dict['charge']) mi = 25 # scaling factor converting electron charges to Angstroms four_dimensions = np.column_stack((mol.atom_dict['coords'], charge * mi)) c1 = four_dimensions.mean(0) # geometric centre of the molecule distances_c1 = norm(four_dimensions - c1, axis=1) c2 = four_dimensions[distances_c1.argmax()] # atom position furthest from c1 distances_c2 = norm(four_dimensions - c2, axis=1) c3 = four_dimensions[distances_c2.argmax()] # atom position furthest from c2 distances_c3 = norm(four_dimensions - c3, axis=1) vector_a = c2 - c1 vector_b = c3 - c1 vector_as = vector_a[:3] # spatial parts of these vectors - vector_bs = vector_b[:3] # the first three coordinates vector_c = ((norm(vector_a) / (2 * norm(np.cross(vector_as, vector_bs)))) * np.cross(vector_as, vector_bs)) vector_c1s = c1[:3] max_charge = np.array(np.amax(charge) * mi) min_charge = np.array(np.amin(charge) * mi) c4 = np.append(vector_c1s + vector_c, max_charge) c5 = np.append(vector_c1s + vector_c, min_charge) distances_c4 = norm(four_dimensions - c4, axis=1) distances_c5 = norm(four_dimensions - c5, axis=1) distances_list = [distances_c1, distances_c2, distances_c3, distances_c4, distances_c5] shape_descriptor = np.zeros(15) i = 0 for distances in distances_list: mean = np.mean(distances) shape_descriptor[0 + i] = mean shape_descriptor[1 + i] = np.std(distances) shape_descriptor[2 + i] = cbrt(np.sum(((distances - mean) ** 3) / distances.size)) i += 3 return shape_descriptor
[docs]def usr_similarity(mol1_shape, mol2_shape, ow=1., hw=1., rw=1., aw=1., dw=1.): """Computes similarity between molecules Parameters ---------- mol1_shape : numpy array USR shape descriptor mol2_shape : numpy array USR shape descriptor ow : float (default = 1.) Scaling factor for all atoms Only used for USRCAT, ignored for other types hw : float (default = 1.) Scaling factor for hydrophobic atoms Only used for USRCAT, ignored for other types rw : float (default = 1.) Scaling factor for aromatic atoms Only used for USRCAT, ignored for other types aw : float (default = 1.) Scaling factor for acceptors Only used for USRCAT, ignored for other types dw : float (default = 1.) Scaling factor for donors Only used for USRCAT, ignored for other types Returns ------- similarity : float from 0 to 1 Similarity between shapes of molecules, 1 indicates identical molecules """ if mol1_shape.shape[0] == 12 and mol2_shape.shape[0] == 12: sim = 1. / (1. + (1. / 12) * np.sum(np.fabs(mol1_shape - mol2_shape))) elif mol1_shape.shape[0] == 60 and mol2_shape.shape[0] == 60: w = np.array([ow, hw, rw, aw, dw]) # Normalize weights w = w / w.sum() shape_diff = np.abs(mol1_shape - mol2_shape).reshape(-1, 12) sim = 1. / (1 + (w * (1. / 12) * shape_diff.sum(axis=1)).sum()) elif mol1_shape.shape[0] == 15 and mol2_shape.shape[0] == 15: sim = 1. / (1 + (1. / 15) * np.sum(np.fabs(mol1_shape - mol2_shape))) else: raise Exception('Given vectors are not valid USR shape descriptors ' 'or come from different methods. Correct vector lengths' 'are: 12 for USR, 60 for USRCAT, 15 for Electroshape') return sim