Source code for oddt.virtualscreening

"""ODDT pipeline framework for virtual screening"""
from __future__ import print_function
import sys
import csv
from os.path import dirname, isfile, join
from multiprocessing import Pool
from itertools import chain
from functools import partial
import warnings

import six
from six.moves import filter
# from joblib import Parallel, delayed

import oddt
from oddt.utils import is_molecule, compose_iter, chunker, method_caller
from oddt.scoring import scorer
from oddt.fingerprints import (InteractionFingerprint,
                               SimpleInteractionFingerprint,
                               dice)
from oddt.shape import usr, usr_cat, electroshape, usr_similarity


def _filter_smarts(mols, smarts, soft_fail=0):
    """Filter out molecule list (exhaustive) by smarts occurances. Allow molecules
    to pass if them match up to `soft_fail` matches.
    """
    out = []
    for mol in mols:
        if isinstance(smarts, six.string_types):
            compiled_smarts = oddt.toolkit.Smarts(smarts)
            if len(compiled_smarts.findall(mol)) == 0:
                out.append(mol)
        else:
            compiled_smarts = [oddt.toolkit.Smarts(s) for s in smarts]
            fail = 0
            for s in compiled_smarts:
                if len(s.findall(mol)) > 0:
                    fail += 1
                if fail > soft_fail:
                    break
            if fail <= soft_fail:
                out.append(mol)
    return out
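

# Illustrative sketch, not part of the original module: calling `_filter_smarts`
# directly to drop molecules that match a nitro-group SMARTS. The example
# function name and the SMILES strings are hypothetical.
def _example_filter_smarts():
    mols = [oddt.toolkit.readstring('smi', smi)
            for smi in ('c1ccccc1O', 'c1ccccc1[N+](=O)[O-]')]
    # with a single SMARTS string only molecules with zero matches pass
    return _filter_smarts(mols, smarts='[N+](=O)[O-]', soft_fail=0)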


def _filter(mols, expression, soft_fail=0):
    """Filter molecule by a generic expression, such as `mol.logp > 1`."""
    out = []
    for mol in mols:
        if isinstance(expression, list):
            fail = 0
            for e in expression:
                if not eval(e):
                    fail += 1
                if fail > soft_fail:
                    break
            if fail <= soft_fail:
                out.append(mol)
        else:
            if eval(expression):
                out.append(mol)
    return out
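

# Illustrative sketch, not part of the original module: `_filter` evaluates each
# expression string with `mol` in scope, so molecule attributes can be combined
# into rules; with soft_fail=1 a molecule may break one rule and still pass.
# The example function name and the SMILES string are hypothetical.
def _example_filter_expressions():
    mols = [oddt.toolkit.readstring('smi', 'CCO')]
    return _filter(mols,
                   expression=['mol.molwt < 500', 'mol.HBD <= 5'],
                   soft_fail=1)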


def _filter_similarity(mols, distance, generator, query_fps, cutoff):
    """Filter molecules by a certain distance to the reference fingerprints.
    User must supply distance funtion, FP generator, query FPs and cutoff."""
    return list(filter(
        lambda q: any(distance(generator(q), q_fp) >= float(cutoff)
                      for q_fp in query_fps), mols))
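

# Illustrative sketch, not part of the original module: filtering by USR shape
# similarity against a single query, mirroring what
# `virtualscreening.similarity('usr', ...)` below wires into the pipeline.
# The file names are hypothetical; USR descriptors require 3D coordinates.
def _example_filter_similarity():
    query_fps = [usr(next(oddt.toolkit.readfile('sdf', 'query_3d.sdf')))]
    mols = oddt.toolkit.readfile('sdf', 'ligands_3d.sdf')
    return _filter_similarity(mols, distance=usr_similarity, generator=usr,
                              query_fps=query_fps, cutoff=0.7)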


class virtualscreening:
    def __init__(self, n_cpu=-1, verbose=False, chunksize=100):
        """Virtual Screening pipeline stack

        Parameters
        ----------
        n_cpu: int (default=-1)
            The number of parallel processors to use

        verbose: bool (default=False)
            Verbosity flag for some methods

        chunksize: int (default=100)
            The number of molecules processed per chunk
        """
        self._pipe = []
        self._mol_feed = []
        self.n_cpu = n_cpu if n_cpu else -1
        self.num_input = 0
        self.num_output = 0
        self.verbose = verbose
        self.chunksize = chunksize
    def load_ligands(self, fmt, ligands_file, **kwargs):
        """Loads a file with ligands.

        Parameters
        ----------
        fmt: string
            Type of molecular file

        ligands_file: string
            Path to a file which is loaded into the pipeline
        """
        if fmt == 'mol2' and oddt.toolkit.backend == 'ob':
            if 'opt' in kwargs:
                kwargs['opt']['c'] = None
            else:
                kwargs['opt'] = {'c': None}
        self._mol_feed = chain(self._mol_feed,
                               oddt.toolkit.readfile(fmt, ligands_file, **kwargs))
    def apply_filter(self, expression, soft_fail=0):
        """Filtering method; accepts raw expressions (strings evaluated in an
        ``if`` statement, which may use oddt.toolkit.Molecule attributes, e.g.
        `mol.molwt < 500`) as well as presets. Currently supported presets:

            * Lipinski Rule of 5 ('ro5' or 'l5')
            * Fragment Rule of 3 ('ro3')
            * PAINS filter ('pains')

        Parameters
        ----------
        expression: string or list of strings
            Expression(s) to be used while filtering.

        soft_fail: int (default=0)
            The number of failures a molecule may have and still pass the
            filter, aka. soft-fails.
        """
        if expression in ['l5', 'ro5', 'ro3', 'pains']:
            # define presets
            # TODO: move presets to another config file
            # Lipinski rule of 5's
            if expression.lower() in ['l5', 'ro5']:
                self._pipe.append(partial(_filter,
                                          expression=['mol.molwt < 500',
                                                      'mol.HBA1 <= 10',
                                                      'mol.HBD <= 5',
                                                      'mol.logP <= 5'],
                                          soft_fail=soft_fail))
            # Rule of three
            elif expression.lower() == 'ro3':
                self._pipe.append(partial(_filter,
                                          expression=['mol.molwt < 300',
                                                      'mol.HBA1 <= 3',
                                                      'mol.HBD <= 3',
                                                      'mol.logP <= 3'],
                                          soft_fail=soft_fail))
            # PAINS filter
            elif expression.lower() == 'pains':
                pains_smarts = {}
                with open(join(dirname(__file__),
                               'filter', 'pains.smarts')) as pains_file:
                    csv_reader = csv.reader(pains_file, delimiter="\t")
                    for line in csv_reader:
                        if len(line) > 1:
                            pains_smarts[line[1][8:-2]] = line[0]
                self._pipe.append(partial(_filter_smarts,
                                          smarts=list(pains_smarts.values()),
                                          soft_fail=soft_fail))
        else:
            self._pipe.append(partial(_filter,
                                      expression=expression,
                                      soft_fail=soft_fail))
    def similarity(self, method, query, cutoff=0.9, protein=None):
        """Similarity filter. Supported structural methods:

            * ifp: interaction fingerprints
            * sifp: simple interaction fingerprints
            * usr: Ultrafast Shape Recognition
            * usr_cat: Ultrafast Shape Recognition, CREDO atom types
            * electroshape: Electroshape, a USR method including partial charges

        Parameters
        ----------
        method: string
            Similarity method used to compare molecules. Available methods:

            * `ifp` - interaction fingerprint (requires a receptor)
            * `sifp` - simple interaction fingerprint (requires a receptor)
            * `usr` - Ultrafast Shape Recognition
            * `usr_cat` - USR, with CREDO atom types
            * `electroshape` - Electroshape, USR with moments representing
              partial charge

        query: oddt.toolkit.Molecule or list of oddt.toolkit.Molecule
            Query molecules to compare the pipeline to.

        cutoff: float (default=0.9)
            Similarity cutoff for filtering molecules. Any similarity lower
            than it will be filtered out.

        protein: oddt.toolkit.Molecule (default=None)
            Protein for the underlying method. It is empty by default, but
            structural fingerprints require one.
        """
        if is_molecule(query):
            query = [query]

        # choose fp/usr and appropriate distance
        if method.lower() == 'ifp':
            gen = partial(InteractionFingerprint, protein=protein)
            dist = dice
        elif method.lower() == 'sifp':
            gen = partial(SimpleInteractionFingerprint, protein=protein)
            dist = dice
        elif method.lower() == 'usr':
            gen = usr
            dist = usr_similarity
        elif method.lower() == 'usr_cat':
            gen = usr_cat
            dist = usr_similarity
        elif method.lower() == 'electroshape':
            gen = electroshape
            dist = usr_similarity
        else:
            raise ValueError('Similarity filter "%s" is not supported.' % method)
        # generate FPs for query molecules once
        query_fps = [gen(q) for q in query]
        self._pipe.append(partial(_filter_similarity,
                                  distance=dist,
                                  generator=gen,  # same generator for pipe mols
                                  query_fps=query_fps,
                                  cutoff=cutoff))
    def dock(self, engine, protein, *args, **kwargs):
        """Docking procedure.

        Parameters
        ----------
        engine: string
            Which docking engine to use.

        protein: oddt.toolkit.Molecule
            Protein target for docking (passed to the engine).

        Notes
        -----
        Additional parameters are passed directly to the engine.
        The following docking engines are supported:

        1. AutoDock Vina (``engine="autodock_vina"``), see
           :class:`oddt.docking.autodock_vina`.
        """
        if engine.lower() == 'autodock_vina':
            from oddt.docking import autodock_vina
            engine = autodock_vina(protein, *args, **kwargs)
        else:
            raise ValueError('Docking engine %s was not implemented in ODDT'
                             % engine)
        self._pipe.append(partial(method_caller, engine, 'dock'))
    def score(self, function, protein=None, *args, **kwargs):
        """Scoring procedure compatible with any scoring function implemented
        in ODDT and with other pickled scoring functions which are subclasses
        of `oddt.scoring.scorer`.

        Parameters
        ----------
        function: string
            Which scoring function to use.

        protein: oddt.toolkit.Molecule
            Default protein to use as reference

        Notes
        -----
        Additional parameters are passed directly to the scoring function.
        """
        if isinstance(protein, six.string_types):
            extension = protein.split('.')[-1]
            protein = next(oddt.toolkit.readfile(extension, protein))
            protein.protein = True
        elif protein is None:
            raise ValueError('Protein needs to be set for structure based '
                             'scoring')
        # trigger cache
        protein.atom_dict

        if isinstance(function, six.string_types):
            if isfile(function):
                sf = scorer.load(function)
                sf.set_protein(protein)
            elif function.lower().startswith('rfscore'):
                from oddt.scoring.functions.RFScore import rfscore
                new_kwargs = {}
                for bit in function.lower().split('_'):
                    if bit.startswith('pdbbind'):
                        new_kwargs['pdbbind_version'] = int(bit.replace('pdbbind', ''))
                    elif bit.startswith('v'):
                        new_kwargs['version'] = int(bit.replace('v', ''))
                sf = rfscore.load(**new_kwargs)
                sf.set_protein(protein)
            elif function.lower().startswith('nnscore'):
                from oddt.scoring.functions.NNScore import nnscore
                new_kwargs = {}
                for bit in function.lower().split('_'):
                    if bit.startswith('pdbbind'):
                        new_kwargs['pdbbind_version'] = int(bit.replace('pdbbind', ''))
                sf = nnscore.load(**new_kwargs)
                sf.set_protein(protein)
            elif function.lower().startswith('plec'):
                from oddt.scoring.functions.PLECscore import PLECscore
                new_kwargs = {}
                for bit in function.lower().split('_'):
                    if bit.startswith('pdbbind'):
                        new_kwargs['pdbbind_version'] = int(bit.replace('pdbbind', ''))
                    elif bit.startswith('plec'):
                        new_kwargs['version'] = bit.replace('plec', '')
                    elif bit.startswith('p'):
                        new_kwargs['depth_protein'] = int(bit.replace('p', ''))
                    elif bit.startswith('l'):
                        new_kwargs['depth_ligand'] = int(bit.replace('l', ''))
                    elif bit.startswith('s'):
                        new_kwargs['size'] = int(bit.replace('s', ''))
                sf = PLECscore.load(**new_kwargs)
                sf.set_protein(protein)
            elif function.lower() == 'autodock_vina':
                from oddt.docking import autodock_vina
                sf = autodock_vina(protein, *args, **kwargs)
                sf.set_protein(protein)
            else:
                raise ValueError('Scoring Function %s was not implemented in '
                                 'ODDT' % function)
        else:
            if isinstance(function, scorer):
                sf = function
                sf.set_protein(protein)
            else:
                raise ValueError('Supplied object "%s" is not an ODDT scoring '
                                 'function' % function.__name__)
        self._pipe.append(partial(method_caller, sf, 'predict_ligands'))
    def fetch(self):
        """A method to exhaust the pipeline. The result itself is lazy (a generator)."""
        chunk_feed = chunker(self._mol_feed, chunksize=self.chunksize)

        # get first chunk and check if it is saturated
        try:
            first_chunk = next(chunk_feed)
        except StopIteration:
            raise StopIteration('There are no molecules loaded to the pipeline.')

        if len(first_chunk) == 0:
            warnings.warn('There are **zero** molecules at the output of the VS'
                          ' pipeline. Output file will be empty.')
        elif len(first_chunk) < self.chunksize and self.n_cpu > 1:
            warnings.warn('The chunksize (%i) seems to be too large.'
                          % self.chunksize)
            # use methods multithreading when we have less molecules than cores
            if len(first_chunk) < self.n_cpu:
                warnings.warn('Falling back to sub-methods multithreading as '
                              'the number of molecules is less than cores '
                              '(%i < %i)' % (len(first_chunk), self.n_cpu))
                for func in self._pipe:
                    if hasattr(func, 'n_cpu'):
                        func.n_cpu = self.n_cpu
                    elif hasattr(func, 'n_jobs'):
                        func.n_jobs = self.n_cpu
                    elif isinstance(func, partial):
                        for func2 in func.args:
                            if hasattr(func2, 'n_cpu'):
                                func2.n_cpu = self.n_cpu
                            elif hasattr(func2, 'n_jobs'):
                                func2.n_jobs = self.n_cpu
                # turn off VS multiprocessing
                self.n_cpu = 1

        # TODO add some verbosity or progress bar
        if self.n_cpu != 1:
            out = (Pool(self.n_cpu if self.n_cpu > 0 else None)
                   .imap(partial(compose_iter, funcs=self._pipe),
                         (chunk for chunk in chain([first_chunk], chunk_feed))))
        else:
            out = (compose_iter(chunk, self._pipe)
                   for chunk in chain([first_chunk], chunk_feed))
        # FIXME use joblib version as soon as it gets return_generator merged
        # out = Parallel(n_jobs=self.n_cpu)(
        #     delayed(compose_iter)(chunk, self._pipe)
        #     for chunk in chain([first_chunk], chunk_feed))

        # merge chunks into one iterable
        return chain.from_iterable(out)
    def write(self, fmt, filename, csv_filename=None, **kwargs):
        """Outputs molecules to a file.

        Parameters
        ----------
        fmt: string
            Type of molecular file

        filename: string
            Path to an output file

        csv_filename: string
            Optional path to a CSV file
        """
        if fmt == 'mol2' and oddt.toolkit.backend == 'ob':
            if 'opt' in kwargs:
                kwargs['opt']['c'] = None
            else:
                kwargs['opt'] = {'c': None}
        output_mol_file = oddt.toolkit.Outputfile(fmt, filename,
                                                  overwrite=True, **kwargs)
        if csv_filename:
            f = open(csv_filename, 'w')
        csv_file = None
        for mol in self.fetch():
            if csv_filename:
                data = mol.data.to_dict()

                # filter some internal data
                blacklist_keys = ['OpenBabel Symmetry Classes',
                                  'MOL Chiral Flag', 'PartialCharges',
                                  'TORSDO', 'REMARK']
                for b in blacklist_keys:
                    if b in data:
                        del data[b]
                if len(data) > 0:
                    data['name'] = mol.title
                else:
                    print("There is no data to write in CSV file",
                          file=sys.stderr)
                    return False

                if csv_file is None:
                    csv_file = csv.DictWriter(f, data.keys(), **kwargs)
                    csv_file.writeheader()
                csv_file.writerow(data)
            # write ligand
            output_mol_file.write(mol)
        output_mol_file.close()
        if csv_filename:
            f.close()
        # TODO keep_pipe using hdf5 to store molecules
        if isfile(filename):
            kwargs.pop('overwrite', None)  # this argument is unsupported
            self.load_ligands(fmt, filename, **kwargs)
    def write_csv(self, csv_filename, fields=None, keep_pipe=False, **kwargs):
        """Outputs molecules to a CSV file.

        Parameters
        ----------
        csv_filename: string
            Path to an output CSV file, or a file-like object

        fields: list (default=None)
            List of fields to save in the CSV file

        keep_pipe: bool (default=False)
            If set to True, the ligand pipe is sustained.
        """
        if hasattr(csv_filename, 'write'):
            f = csv_filename
        else:
            f = open(csv_filename, 'w')
        csv_file = None
        for mol in self.fetch():
            data = mol.data.to_dict()

            # filter some internal data
            blacklist_keys = ['OpenBabel Symmetry Classes', 'MOL Chiral Flag',
                              'PartialCharges', 'TORSDO', 'REMARK']
            for b in blacklist_keys:
                if b in data:
                    del data[b]
            if len(data) > 0:
                data['name'] = mol.title
            else:
                print("There is no data to write in CSV file", file=sys.stderr)
                return False

            if csv_file is None:
                csv_file = csv.DictWriter(f, fields or data.keys(),
                                          extrasaction='ignore', **kwargs)
                csv_file.writeheader()
            csv_file.writerow(data)
        # TODO keep_pipe using hdf5 to store molecules
        f.close()
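

# Usage sketch, illustrative and not part of the original module: a typical
# ligand-based run of the `virtualscreening` class above. The input/output file
# names are hypothetical; 'ro5' and 'usr' are the preset and method names
# handled by `apply_filter` and `similarity`.
def _example_ligand_based_pipeline():
    query = next(oddt.toolkit.readfile('sdf', 'active_ligand.sdf'))  # hypothetical file
    vs = virtualscreening(n_cpu=2, chunksize=100)
    vs.load_ligands('sdf', 'library.sdf')    # hypothetical file
    vs.apply_filter('ro5', soft_fail=1)      # Lipinski preset, one violation allowed
    vs.similarity('usr', query, cutoff=0.7)  # keep molecules shaped like the query
    vs.write('sdf', 'hits.sdf', csv_filename='hits.csv')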
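

# Usage sketch, illustrative and not part of the original module: structure-based
# rescoring. `score` parses 'rfscore_v2_pdbbind2016' into RFScore version 2
# trained on PDBbind 2016; the pre-trained model must be available to
# `rfscore.load`. A docking step such as vs.dock('autodock_vina', protein) could
# precede scoring, provided the Vina executable is installed. File names are
# hypothetical.
def _example_structure_based_pipeline():
    protein = next(oddt.toolkit.readfile('pdb', 'receptor.pdb'))  # hypothetical file
    protein.protein = True
    vs = virtualscreening(n_cpu=4)
    vs.load_ligands('sdf', 'docked_poses.sdf')  # hypothetical file
    vs.score('rfscore_v2_pdbbind2016', protein=protein)
    vs.write_csv('scores.csv')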