Source code for oddt.scoring.descriptors

from functools import partial

import numpy as np
from scipy.spatial.distance import cdist as distance
from scipy.sparse import vstack as sparse_vstack

from oddt.utils import is_molecule
from oddt.docking import autodock_vina
from oddt.docking.internal import vina_docking
from oddt.fingerprints import sparse_to_csr_matrix

__all__ = ['close_contacts_descriptor',
           'fingerprints',
           'autodock_vina_descriptor',
           'oddt_vina_descriptor']


def atoms_by_type(atom_dict, types, mode='atomic_nums'):
    """Returns atom dictionaries based on given criteria.
    Currently we have 3 types of atom selection criteria:
        * atomic numbers ['atomic_nums']
        * Sybyl Atom Types ['atom_types_sybyl']
        * AutoDock4 atom types ['atom_types_ad4'] (http://autodock.scripps.edu/faqs-help/faq/where-do-i-set-the-autodock-4-force-field-parameters)

    Parameters
    ----------
    atom_dict: oddt.toolkit.Molecule.atom_dict
        Atom dictionary as implemeted in oddt.toolkit.Molecule class

    types: array-like
        List of atom types/numbers wanted.

    Returns
    -------
    out: dictionary of shape=[len(types)]
        A dictionary of queried atom types (types are keys of the dictionary).
        Values are of oddt.toolkit.Molecule.atom_dict type.
    """

    ad4_to_atomicnum = {
        'HD': 1, 'C': 6, 'CD': 6, 'A': 6, 'N': 7, 'NA': 7, 'OA': 8, 'F': 9,
        'MG': 12, 'P': 15, 'SA': 16, 'S': 16, 'CL': 17, 'CA': 20, 'MN': 25,
        'FE': 26, 'CU': 29, 'ZN': 30, 'BR': 35, 'I': 53
    }

    if mode == 'atomic_nums':
        return {num: atom_dict[atom_dict['atomicnum'] == num]
                for num in set(types)}
    elif mode == 'atom_types_sybyl':
        return {t: atom_dict[atom_dict['atomtype'] == t]
                for t in set(types)}
    elif mode == 'atom_types_ad4':
        # all AD4 atom types are capitalized
        types = [t.upper() for t in types]
        out = {}
        for t in set(types):
            if t in ad4_to_atomicnum:
                constraints = (atom_dict['atomicnum'] == ad4_to_atomicnum[t])
                # additoinal constraints for more specific atom types (donors,
                # acceptors, aromatic etc)
                if t == 'HD':
                    constraints &= atom_dict['isdonorh']
                elif t == 'C':
                    constraints &= ~atom_dict['isaromatic']
                elif t == 'CD':
                    # not canonical AD4 type, although used by NNscore, with no
                    # description
                    constraints &= ~atom_dict['isdonor']
                elif t == 'A':
                    constraints &= atom_dict['isaromatic']
                elif t in ('N', 'S'):
                    constraints &= ~atom_dict['isacceptor']
                elif t in ('NA', 'OA', 'SA'):
                    constraints &= atom_dict['isacceptor']

                out[t] = atom_dict[constraints]

            else:
                raise ValueError('Unsopported atom type: %s' % t)
    else:
        raise ValueError('Unsopported mode: %s' % mode)
    return out


[docs]class close_contacts_descriptor(object): def __init__(self, protein=None, cutoff=4, mode='atomic_nums', ligand_types=None, protein_types=None, aligned_pairs=False): """Close contacts descriptor which tallies atoms of type X in certain cutoff from atoms of type Y. Parameters ---------- protein: oddt.toolkit.Molecule or None (default=None) Default protein to use as reference cutoff: int or list, shape=[n,] or shape=[n,2] (default=4) Cutoff for atoms in Angstroms given as an integer or a list of ranges, eg. [0, 4, 8, 12] or [[0,4],[4,8],[8,12]]. Upper bound is always inclusive, lower exclusive. mode: string (default='atomic_nums') Method of atoms selection, as used in `atoms_by_type` ligand_types: array List of ligand atom types to use protein_types: array List of protein atom types to use aligned_pairs: bool (default=False) Flag indicating should permutation of types should be done, otherwise the atoms are treated as aligned pairs. """ self.cutoff = np.atleast_1d(cutoff) # Cutoffs in fomr of continuous intervals (0,2,4,6,...) if len(self.cutoff) > 1 and self.cutoff.ndim == 1: self.cutoff = np.vstack((self.cutoff[:-1], self.cutoff[1:])).T elif self.cutoff.ndim > 2: raise ValueError('Unsupported shape of cutoff: %s' % self.cutoff.shape) # for pickle save original value self.original_cutoff = cutoff self.protein = protein self.ligand_types = ligand_types self.protein_types = protein_types if protein_types else ligand_types self.aligned_pairs = aligned_pairs self.mode = mode # setup titles if len(self.cutoff) == 1: self.titles = ['%s.%s' % (str(p), str(l)) for p in self.protein_types for l in self.ligand_types ] else: self.titles = ['%s.%s_%s-%s' % (str(p), str(l), str(c1), str(c2)) for p in self.protein_types for l in self.ligand_types for c1, c2 in self.cutoff ]
[docs] def build(self, ligands, protein=None): """Builds descriptors for series of ligands Parameters ---------- ligands: iterable of oddt.toolkit.Molecules or oddt.toolkit.Molecule A list or iterable of ligands to build the descriptor or a single molecule. protein: oddt.toolkit.Molecule or None (default=None) Default protein to use as reference """ if protein: self.protein = protein if is_molecule(ligands): ligands = [ligands] out = [] for mol in ligands: mol_dict = atoms_by_type(mol.atom_dict, self.ligand_types, self.mode) if self.aligned_pairs: pairs = zip(self.ligand_types, self.protein_types) else: pairs = [(mol_type, prot_type) for mol_type in self.ligand_types for prot_type in self.protein_types] dist = distance(self.protein.atom_dict['coords'], mol.atom_dict['coords']) within_cutoff = (dist <= self.cutoff.max()).any(axis=1) local_protein_dict = self.protein.atom_dict[within_cutoff] prot_dict = atoms_by_type(local_protein_dict, self.protein_types, self.mode) desc = [] for mol_type, prot_type in pairs: d = distance(prot_dict[prot_type]['coords'], mol_dict[mol_type]['coords'])[..., np.newaxis] if len(self.cutoff) > 1: count = ((d > self.cutoff[..., 0]) & (d <= self.cutoff[..., 1])).sum(axis=(0, 1)) else: count = (d <= self.cutoff).sum() desc.append(count) desc = np.array(desc, dtype=int).flatten() out.append(desc) return np.vstack(out)
def __len__(self): """ Returns the dimensions of descriptors """ if self.aligned_pairs: return len(self.ligand_types) * self.cutoff.shape[0] else: return len(self.ligand_types) * len(self.protein_types) * len(self.cutoff) def __reduce__(self): return close_contacts_descriptor, (self.protein, self.original_cutoff, self.mode, self.ligand_types, self.protein_types, self.aligned_pairs)
class universal_descriptor(object): def __init__(self, func, protein=None, shape=None, sparse=False): """An universal descriptor which converts a callable object (function) to a descriptor generator which can be used in scoring methods. .. versionadded:: 0.6 Parameters ---------- func: object A function to be mapped accross all ligands. Can be any callable object, which takes ligand as first argument and optionally protein key word argument. Additional arguments should be set using `functools.partial`. protein: oddt.toolkit.Molecule or None (default=None) Default protein to use as reference shape: int or tuple (default=None) The final shape of output sparse: bool (default=True) Flag to return sparse matrix """ self.func = func self.protein = protein self.shape = shape self.sparse = sparse if isinstance(func, partial): self.titles = self.func.func.__name__ else: self.titles = self.func.__name__ def build(self, ligands, protein=None): """Builds descriptors for series of ligands Parameters ---------- ligands: iterable of oddt.toolkit.Molecules or oddt.toolkit.Molecule A list or iterable of ligands to build the descriptor or a single molecule. protein: oddt.toolkit.Molecule or None (default=None) Default protein to use as reference """ if protein: self.protein = protein if is_molecule(ligands): ligands = [ligands] out = [] for mol in ligands: if self.protein is None: out.append(self.func(mol)) else: out.append(self.func(mol, protein=self.protein)) if self.sparse: # out = list(map(partial(sparse_to_csr_matrix, size=self.shape), out)) return sparse_vstack(map(partial(sparse_to_csr_matrix, size=self.shape), out), format='csr') else: return np.vstack(out) def __len__(self): """ Returns the dimensions of descriptors """ if self.shape is None: raise NotImplementedError('The length of descriptor is not defined') else: return self.shape def __reduce__(self): return universal_descriptor, (self.func, self.protein, self.shape, self.sparse) # TODO: we don't use toolkit. should we?
[docs]class fingerprints(object): def __init__(self, fp='fp2', toolkit='ob'): self.fp = fp self.exchange = False # if toolkit == oddt.toolkit.backend: # self.exchange = False # else: # self.exchange = True # self.target_toolkit = __import__('toolkits.'+toolkit) def _get_fingerprint(self, mol): if self.exchange: mol = self.target_toolkit.Molecule(mol) return mol.calcfp(self.fp).raw
[docs] def build(self, mols): if is_molecule(mols): mols = [mols] out = [] for mol in mols: fp = self._get_fingerprint(mol) out.append(fp) return np.vstack(out)
def __reduce__(self): return fingerprints, ()
[docs]class autodock_vina_descriptor(object): def __init__(self, protein=None, vina_scores=None): self.protein = protein self.vina = autodock_vina(protein) self.vina_scores = vina_scores or ['vina_affinity', 'vina_gauss1', 'vina_gauss2', 'vina_repulsion', 'vina_hydrophobic', 'vina_hydrogen'] self.titles = self.vina_scores
[docs] def set_protein(self, protein): self.protein = protein self.vina.set_protein(protein)
[docs] def build(self, ligands, protein=None): if protein: self.set_protein(protein) else: protein = self.protein if is_molecule(ligands): ligands = [ligands] desc = None for mol in ligands: # Vina # TODO: Asynchronous output from vina, push command to score and retrieve at the end? # TODO: Check if ligand has vina scores scored_mol = self.vina.score(mol)[0].data vec = np.array(([scored_mol[key] for key in self.vina_scores]), dtype=np.float32).flatten() if desc is None: desc = vec else: desc = np.vstack((desc, vec)) return np.atleast_2d(desc)
def __len__(self): """ Returns the dimensions of descriptors """ return len(self.vina_scores) def __reduce__(self): return autodock_vina_descriptor, (self.protein, self.vina_scores)
[docs]class oddt_vina_descriptor(object): def __init__(self, protein=None, vina_scores=None): self.protein = protein self.vina = vina_docking(protein) self.all_vina_scores = ['vina_affinity', # inter-molecular interactions 'vina_gauss1', 'vina_gauss2', 'vina_repulsion', 'vina_hydrophobic', 'vina_hydrogen', # intra-molecular interactions 'vina_intra_gauss1', 'vina_intra_gauss2', 'vina_intra_repulsion', 'vina_intra_hydrophobic', 'vina_intra_hydrogen', 'vina_num_rotors'] self.vina_scores = vina_scores or self.all_vina_scores self.titles = self.vina_scores
[docs] def set_protein(self, protein): self.protein = protein self.vina.set_protein(protein)
[docs] def build(self, ligands, protein=None): if protein: self.set_protein(protein) else: protein = self.protein if is_molecule(ligands): ligands = [ligands] desc = None for mol in ligands: mol_keys = mol.data.keys() if any(x not in mol_keys for x in self.vina_scores): self.vina.set_ligand(mol) inter = self.vina.score_inter() intra = self.vina.score_intra() num_rotors = self.vina.num_rotors # could use self.vina.score(), but better to reuse variables affinity = ((inter * self.vina.weights[:5]).sum() / (1 + self.vina.weights[5] * num_rotors)) assert len(self.all_vina_scores) == len(inter) + len(intra) + 2 score = dict(zip( self.all_vina_scores, np.hstack((affinity, inter, intra, num_rotors)).flatten() )) mol.data.update(score) else: score = mol.data.to_dict() try: vec = np.array([score[s] for s in self.vina_scores], dtype=np.float32).flatten() except Exception as e: print(score, affinity, inter, intra, num_rotors) print(mol.title) raise e if desc is None: desc = vec else: desc = np.vstack((desc, vec)) return np.atleast_2d(desc)
def __len__(self): """ Returns the dimensions of descriptors """ return len(self.vina_scores) def __reduce__(self): return oddt_vina_descriptor, (self.protein, self.vina_scores)