Source code for oddt.virtualscreening

"""ODDT pipeline framework for virtual screening"""
import csv
from os.path import isfile
from multiprocessing.dummy import Pool
from itertools import chain

from oddt import toolkit

def _parallel_helper(args):
    """Private helper to workaround Python 2 pickle limitations to paralelize methods"""
    obj, methodname, arg = args
    return getattr(obj, methodname)(**arg)

[docs]class virtualscreening:
    def __init__(self, n_cpu=-1, verbose=False):
        """Virtual Screening pipeline stack

        Parameters
        ----------
            n_cpu: int (default=-1)
                The number of parallel procesors to use

            verbose: bool (default=False)
                Verbosity flag for some methods
        """
        self._pipe = None
        self.n_cpu = n_cpu
        self.num_input = 0
        self.num_output = 0
        self.verbose = verbose
        # setup pool
        self._pool = Pool(n_cpu if n_cpu > 0 else None)

[docs]    def load_ligands(self, fmt, ligands_file, *args, **kwargs):
        """Loads file with ligands.

        Parameters
        ----------
            file_type: string
                Type of molecular file

            ligands_file: string
                Path to a file, which is loaded to pipeline

        """
        if fmt == 'mol2' and toolkit.backend == 'ob':
            if 'opt' in kwargs:
                kwargs['opt']['c'] = None
            else:
                kwargs['opt'] = {'c': None}
        new_pipe = self._ligand_pipe(toolkit.readfile(fmt, ligands_file, *args, **kwargs))
        self._pipe = chain(self._pipe, new_pipe) if self._pipe else new_pipe

    def _ligand_pipe(self, ligands):
        for mol in ligands:
            self.num_input += 1
            yield mol

[docs]    def apply_filter(self, expression, filter_type='expression', soft_fail = 0):
        """Filtering method, can use raw expressions (strings to be evaled in if statement, can use oddt.toolkit.Molecule methods, eg. 'mol.molwt < 500')
        Currently supported presets:
            * Lipinski Rule of 5 ('r5' or 'l5')
            * Fragment Rule of 3 ('r3')

        Parameters
        ----------
            expression: string or list of strings
                Expresion(s) to be used while filtering.

            filter_type: 'expression' or 'preset' (default='expression')
                Specify filter type: 'expression' or 'preset'. Default strings are treated as expressions.

            soft_fail: int (default=0)
                The number of faulures molecule can have to pass filter, aka. soft-fails.
        """
        if filter_type == 'expression':
            self._pipe = self._filter(self._pipe, expression, soft_fail = soft_fail)
        elif filter_type == 'preset':
            # define presets
            # TODO: move presets to another config file
            # Lipinski rule of 5's
            if expression.lower() in ['l5', 'ro5']:
                self._pipe = self._filter(self._pipe, ['mol.molwt < 500', 'mol.calcdesc(["HBA1"])["HBA1"] <= 10', 'mol.calcdesc(["HBD"])["HBD"] <= 5', 'mol.calcdesc(["logP"])["logP"] <= 5'], soft_fail = soft_fail)
            # Rule of three
            elif expression.lower() in ['ro3']:
                self._pipe = self._filter(self._pipe, ['mol.molwt < 300', 'mol.calcdesc(["HBA1"])["HBA1"] <= 3', 'mol.calcdesc(["HBD"])["HBD"] <= 3', 'mol.calcdesc(["logP"])["logP"] <= 3'], soft_fail = soft_fail)
            # PAINS filter
            elif expression.lower() in ['pains']:
                pains_smarts = {}
                with open(dirname(__file__)+'filter/pains.smarts') as pains_file:
                    csv_reader = csv.reader(pains_file, delimiter="\t")
                    for line in csv_reader:
                        if len(line) > 1:
                            pains_smarts[line[1][8:-2]] = line[0]
                self._pipe = self._filter_smarts(self._pipe, pains_smarts.values(), soft_fail = soft_fail)

    def _filter_smarts(self, pipe, smarts, soft_fail = 0):
        for mol in pipe:
            if type(smarts) is list:
                compiled_smarts = [toolkit.Smarts(s) for s in smarts]
                fail = 0
                for s in compiled_smarts:
                    if len(s.findall(mol)) > 0:
                        fail += 1
                    if fail > soft_fail:
                        break
                if fail <= soft_fail:
                    yield mol
            else:
                compiled_smarts = toolkit.Smarts(smarts)
                if len(compiled_smiles.findall(mol)) == 0:
                    yield mol

    def _filter(self, pipe, expression, soft_fail = 0):
        for mol in pipe:
            if type(expression) is list:
                fail = 0
                for e in expression:
                    if not eval(e):
                        fail += 1
                    if fail > soft_fail:
                        break
                if fail <= soft_fail:
                    yield mol
            else:
                if eval(expression):
                    yield mol

[docs]    def dock(self, engine, protein, *args, **kwargs):
        """Docking procedure.

        Parameters
        ----------
            engine: string
                Which docking engine to use.

        Note
        ----
            Additional parameters are passed directly to the engine.
        """
        if engine.lower() == 'autodock_vina':
            from oddt.docking import autodock_vina
            engine = autodock_vina(protein, *args, **kwargs)
        else:
            raise ValueError('Docking engine %s was not implemented in ODDT' % engine)
        def _iter_conf(results):
            """ Generator to go through docking results, and put them to pipe """
            for confs in results:
                for conf in confs:
                    yield conf
        if self.n_cpu != 1:
            docking_results = self._pool.imap(_parallel_helper, ((engine, "dock", {'ligands':lig, 'single': True}) for lig in self._pipe))
        else:
            docking_results = (engine.dock(lig, single=True) for lig in self._pipe)
        self._pipe = _iter_conf(docking_results)

[docs]    def score(self, function, protein = None, *args, **kwargs):
        """Scoring procedure.

        Parameters
        ----------
            function: string
                Which scoring function to use.

            protein: oddt.toolkit.Molecule
                Default protein to use as reference

        Note
        ----
            Additional parameters are passed directly to the scoring function.
        """
        if type(protein) is str:
            extension = protein.split('.')[-1]
            protein = toolkit.readfile(extension, protein).next()
            protein.protein = True

        if type(function) is str:
            if function.lower() == 'rfscore':
                from .scoring.functions.RFScore import rfscore
                sf = rfscore.load()
                sf.set_protein(protein)
            elif function.lower() == 'nnscore':
                from .scoring.functions.NNScore import nnscore
                sf = nnscore.load()
                sf.set_protein(protein)
            else:
                raise ValueError('Scoring Function %s was not implemented in ODDT' % function)
        else:
            if hasattr(function, 'set_protein') and hasattr(function, 'predict_ligands') and hasattr(function, 'predict_ligand'):
                sf = function
                sf.set_protein(protein)
            else:
                raise ValueError('Supplied object "%s" is not an ODDT scoring funtion' % function.__name__)
        if self.n_cpu != 1:
            self._pipe = self._pool.imap(_parallel_helper, ((sf, 'predict_ligand', {'ligand': lig}) for lig in self._pipe))
        else:
            self._pipe = sf.predict_ligands(self._pipe)

[docs]    def fetch(self):
        for n, mol in enumerate(self._pipe):
            self.num_output = n+1
            if self.verbose and self.num_input % 100 == 0:
                print "\rPassed: %i (%.2f%%)\tTotal: %i" % (self.num_output, float(self.num_output)/float(self.num_input)*100, self.num_input),
            yield mol
        if self.verbose:
            print ""

    # Consume the pipe
[docs]    def write(self, fmt, filename, csv_filename = None, **kwargs):
        """Outputs molecules to a file

        Parameters
        ----------
            file_type: string
                Type of molecular file

            ligands_file: string
                Path to a output file

            csv_filename: string
                Optional path to a CSV file
        """
        if fmt == 'mol2' and toolkit.backend == 'ob':
            if 'opt' in kwargs:
                kwargs['opt']['c'] = None
            else:
                kwargs['opt'] = {'c': None}
        output_mol_file = toolkit.Outputfile(fmt, filename, **kwargs)
        if csv_filename:
            f = open(csv_filename, 'w')
            csv_file = None
        for mol in self.fetch():
            if csv_filename:
                data = dict(mol.data)
                #filter some internal data
                blacklist_keys = ['OpenBabel Symmetry Classes', 'MOL Chiral Flag', 'PartialCharges', 'TORSDO', 'REMARK']
                for b in blacklist_keys:
                    if data.has_key(b):
                        del data[b]
                if len(data) > 0:
                    data['name'] = mol.title
                else:
                    print "There is no data to write in CSV file"
                    return False
                if csv_file is None:
                    csv_file = csv.DictWriter(f, data.keys(), **kwargs)
                    csv_file.writeheader()
                csv_file.writerow(data)
            # write ligand
            output_mol_file.write(mol)
        output_mol_file.close()
        if csv_filename:
            f.close()
#        if kwargs.has_key('keep_pipe') and kwargs['keep_pipe']:
        if isfile(filename):
            kwargs.pop('overwrite') # this argument is unsupported in readfile
            self._pipe = toolkit.readfile(fmt, filename, **kwargs)

[docs]    def write_csv(self, csv_filename, keep_pipe = False, **kwargs):
        """Outputs molecules to a csv file

        Parameters
        ----------
            csv_filename: string
                Optional path to a CSV file

            keep_pipe: bool (default=False)
                If set to True, the ligand pipe is sustained.
        """
        f = open(csv_filename, 'w')
        csv_file = None
        for mol in self.fetch():
            data = dict(mol.data)
            #filter some internal data
            blacklist_keys = ['OpenBabel Symmetry Classes', 'MOL Chiral Flag', 'PartialCharges', 'TORSDO', 'REMARK']
            for b in blacklist_keys:
                if data.has_key(b):
                    del data[b]
            if len(data) > 0:
                data['name'] = mol.title
            else:
                print "There is no data to write in CSV file"
                return False
            if csv_file is None:
                csv_file = csv.DictWriter(f, data.keys(), **kwargs)
                csv_file.writeheader()
            csv_file.writerow(data)
            if keep_pipe:
                #write ligand using pickle
                pass
        f.close()