# Source code for oddt.virtualscreening

"""ODDT pipeline framework for virtual screening"""
from __future__ import print_function
import sys
import csv
import six
from six.moves import filter
from os.path import dirname, isfile
# from multiprocessing.dummy import Pool # threading
from multiprocessing import Pool  # process
from itertools import chain
from functools import partial

from oddt import toolkit
from oddt.scoring import scorer
from oddt.fingerprints import (InteractionFingerprint,
                               SimpleInteractionFingerprint,
                               dice)
from oddt.shape import usr, usr_cat, electroshape


def _parallel_helper(obj, methodname, kwargs):
    """Private helper to workaround Python 2 pickle limitations to paralelize methods"""
    return getattr(obj, methodname)(**kwargs)


class virtualscreening:
    """Lazy virtual screening pipeline.

    Ligands are loaded into a pipe (``self._pipe``); every filtering, docking
    or scoring step wraps that pipe in another lazy iterator. Nothing is
    computed until the pipe is consumed by `fetch`, `write` or `write_csv`.
    """

    # Molecule data fields treated as internal and left out of CSV output.
    _CSV_BLACKLIST_KEYS = ('OpenBabel Symmetry Classes',
                           'MOL Chiral Flag',
                           'PartialCharges',
                           'TORSDO',
                           'REMARK')

    # Keyword arguments of `write` that are meant for the molecule file
    # writer only; csv.writer rejects keyword arguments it does not know.
    _MOLFILE_ONLY_KWARGS = ('opt', 'overwrite')

    def __init__(self, n_cpu=-1, verbose=False):
        """Virtual Screening pipeline stack

        Parameters
        ----------
        n_cpu: int (default=-1)
            The number of parallel processes to use. 1 disables
            multiprocessing. Any falsy value (0, None) is stored as -1 and
            every value below 1 lets ``multiprocessing.Pool`` choose the
            number of workers.

        verbose: bool (default=False)
            Verbosity flag for some methods
        """
        self._pipe = None
        self.n_cpu = n_cpu if n_cpu else -1
        self.num_input = 0
        self.num_output = 0
        self.verbose = verbose

    @staticmethod
    def _with_ob_mol2_opt(fmt, kwargs):
        """Return `kwargs` with ``opt['c'] = None`` set for mol2 files when
        the toolkit backend is 'ob'; otherwise return `kwargs` untouched."""
        if fmt == 'mol2' and toolkit.backend == 'ob':
            # Copy so that the caller's own `opt` dict is never mutated.
            opt = dict(kwargs.get('opt') or {})
            opt['c'] = None
            kwargs['opt'] = opt
        return kwargs

    def load_ligands(self, fmt, ligands_file, *args, **kwargs):
        """Loads file with ligands.

        Can be called repeatedly; molecules of each newly loaded file are
        appended after the ones already in the pipeline.

        Parameters
        ----------
        fmt: string
            Type of molecular file

        ligands_file: string
            Path to a file, which is loaded to pipeline

        Note
        ----
        Additional parameters are passed directly to ``toolkit.readfile``.
        """
        kwargs = self._with_ob_mol2_opt(fmt, kwargs)
        new_pipe = self._ligand_pipe(
            toolkit.readfile(fmt, ligands_file, *args, **kwargs))
        self._pipe = chain(self._pipe, new_pipe) if self._pipe else new_pipe

    def _ligand_pipe(self, ligands):
        """Yield every truthy molecule of `ligands`, counting each one in
        ``self.num_input``."""
        for mol in ligands:
            if mol:
                self.num_input += 1
                yield mol

    def apply_filter(self, expression, soft_fail=0):
        """Filtering method, can use raw expressions (strings to be evaled
        in if statement, can use oddt.toolkit.Molecule methods, eg.
        'mol.molwt < 500')

        Currently supported presets (matched case-insensitively):
            * Lipinski Rule of 5 ('ro5' or 'l5')
            * Fragment Rule of 3 ('ro3')
            * PAINS filter ('pains')

        Parameters
        ----------
        expression: string or list of strings
            Expression(s) to be used while filtering.

        soft_fail: int (default=0)
            The number of failures molecule can have to pass filter, aka.
            soft-fails.
        """
        # Lists of expressions have no `lower`, so they never match a preset.
        preset = expression.lower() if hasattr(expression, 'lower') else None
        # TODO: move presets to another config file
        if preset in ('l5', 'ro5'):
            # Lipinski rule of 5's
            self._pipe = self._filter(self._pipe,
                                      ['mol.molwt < 500',
                                       'mol.HBA1 <= 10',
                                       'mol.HBD <= 5',
                                       'mol.logP <= 5'],
                                      soft_fail=soft_fail)
        elif preset == 'ro3':
            # Rule of three
            self._pipe = self._filter(self._pipe,
                                      ['mol.molwt < 300',
                                       'mol.HBA1 <= 3',
                                       'mol.HBD <= 3',
                                       'mol.logP <= 3'],
                                      soft_fail=soft_fail)
        elif preset == 'pains':
            # PAINS filter: tab-separated file shipped next to this module.
            # The first column is used as the SMARTS pattern; a slice of the
            # second column is the dict key.
            pains_smarts = {}
            with open(dirname(__file__)+'/filter/pains.smarts') as pains_file:
                for line in csv.reader(pains_file, delimiter="\t"):
                    if len(line) > 1:
                        pains_smarts[line[1][8:-2]] = line[0]
            self._pipe = self._filter_smarts(self._pipe,
                                             pains_smarts.values(),
                                             soft_fail=soft_fail)
        else:
            self._pipe = self._filter(self._pipe,
                                      expression,
                                      soft_fail=soft_fail)

    def _filter_smarts(self, pipe, smarts, soft_fail=0):
        """Yield molecules of `pipe` that do not match the SMARTS pattern(s).

        A single pattern (string) rejects every matching molecule and ignores
        `soft_fail`. With an iterable of patterns a molecule is kept as long
        as it matches at most `soft_fail` of them.
        """
        # Patterns are compiled once, not once per molecule.
        if isinstance(smarts, six.string_types):
            pattern = toolkit.Smarts(smarts)
            for mol in pipe:
                if len(pattern.findall(mol)) == 0:
                    yield mol
        else:
            patterns = [toolkit.Smarts(s) for s in smarts]
            for mol in pipe:
                fail = 0
                for pattern in patterns:
                    if len(pattern.findall(mol)) > 0:
                        fail += 1
                        if fail > soft_fail:
                            break
                if fail <= soft_fail:
                    yield mol

    def _filter(self, pipe, expression, soft_fail=0):
        """Yield molecules of `pipe` that satisfy the expression(s).

        Each expression is evaluated with the current molecule bound to the
        name ``mol``. With a list/tuple of expressions a molecule is kept as
        long as at most `soft_fail` of them evaluate to a falsy value; a
        single expression must simply evaluate to a truthy value.

        SECURITY NOTE: expressions go through ``eval`` and can run arbitrary
        code - never pass expressions that come from an untrusted source.
        """
        multiple = isinstance(expression, (list, tuple))
        # The loop variable must stay named `mol`: the evaluated expressions
        # look it up in this function's local scope.
        for mol in pipe:
            if multiple:
                fail = 0
                for e in expression:
                    if not eval(e):
                        fail += 1
                        if fail > soft_fail:
                            break
                if fail <= soft_fail:
                    yield mol
            else:
                if eval(expression):
                    yield mol

    def similarity(self, method, query, cutoff=0.9, protein=None):
        """Similarity filter. Supported structural methods:
            * ifp: interaction fingerprints
            * sifp: simple interaction fingerprints
            * usr: Ultrafast Shape recognition
            * usr_cat: Ultrafast Shape recognition, Credo Atom Types
            * electroshape: Electroshape, an USR method including partial
              charges

        Parameters
        ----------
        method: string, one of ['ifp', 'sifp', 'usr', 'usr_cat',
                'electroshape'] (case-insensitive)
            Similarity method used to compare molecules

        query: oddt.toolkit.Molecule or list of oddt.toolkit.Molecule
            Query molecules to compare the pipeline to.

        cutoff: float
            Similarity cutoff for filtering molecules. Any similarity lower
            than it will be filtered out.

        protein: oddt.toolkit.Molecule (default = None)
            Protein for underlying method. By default it's empty, but
            structural fingerprints need one.

        Raises
        ------
        ValueError
            If `method` is not one of the supported methods.
        """
        if isinstance(query, toolkit.Molecule):
            query = [query]

        # choose fp/usr and appropriate distance
        method_key = method.lower()
        if method_key in ('ifp', 'sifp'):
            gen = (InteractionFingerprint if method_key == 'ifp'
                   else SimpleInteractionFingerprint)
            dist = dice
        elif method_key in ('usr', 'usr_cat', 'electroshape'):
            # usr_similarity is not among the module-level imports, so it is
            # imported here, where it is needed.
            from oddt.shape import usr_similarity
            gen = {'usr': usr,
                   'usr_cat': usr_cat,
                   'electroshape': electroshape}[method_key]
            dist = usr_similarity
        else:
            raise ValueError('Similarity filter "%s" is not supported.'
                             % method)

        def fingerprint(mol):
            return gen(mol) if protein is None else gen(mol, protein)

        query_fps = [fingerprint(q) for q in query]
        cutoff = float(cutoff)

        def is_similar(mol):
            # One fingerprint per ligand, compared against all queries.
            mol_fp = fingerprint(mol)
            return any(dist(mol_fp, q_fp) >= cutoff for q_fp in query_fps)

        self._pipe = filter(is_similar, self._pipe)

    def _pool_imap(self, func, iterable, **imap_kwargs):
        """Lazily map `func` over `iterable` in a process pool.

        The pool is created when the first result is requested and is
        terminated once the results are exhausted or the generator is
        closed, so worker processes never outlive the pipeline step.
        """
        pool = Pool(self.n_cpu if self.n_cpu > 0 else None)
        try:
            for result in pool.imap(func, iterable, **imap_kwargs):
                yield result
        finally:
            pool.terminate()

    def dock(self, engine, protein, *args, **kwargs):
        """Docking procedure.

        Parameters
        ----------
        engine: string
            Which docking engine to use. Only 'autodock_vina'
            (case-insensitive) is implemented.

        protein: passed as the first argument to the docking engine.

        Raises
        ------
        ValueError
            If `engine` is not an implemented docking engine.

        Note
        ----
        Additional parameters are passed directly to the engine.
        """
        if engine.lower() == 'autodock_vina':
            from oddt.docking import autodock_vina
            engine = autodock_vina(protein, *args, **kwargs)
        else:
            raise ValueError('Docking engine %s was not implemented in ODDT'
                             % engine)
        if self.n_cpu != 1:
            dock_ligand = partial(_parallel_helper, engine, 'dock')
            docking_results = self._pool_imap(
                dock_ligand,
                ({'ligands': lig, 'single': True} for lig in self._pipe))
        else:
            docking_results = (engine.dock(lig, single=True)
                               for lig in self._pipe)
        # Every docking call returns an iterable of results; flatten them.
        self._pipe = chain.from_iterable(docking_results)

    def score(self, function, protein=None, *args, **kwargs):
        """Scoring procedure.

        Parameters
        ----------
        function: string or oddt.scoring.scorer
            Which scoring function to use. Strings starting with 'rfscore'
            or 'nnscore' may carry '_'-separated options ('pdbbindNNNN' for
            both, 'vN' for rfscore), 'autodock_vina' selects Vina and any
            other string must be a path to a saved scorer.

        protein: oddt.toolkit.Molecule or string
            Default protein to use as reference. A string is treated as a
            path to a file whose extension is used as its format.

        Raises
        ------
        ValueError
            If `function` is neither a known scoring function name, a path
            to an existing file nor a scorer instance.

        Note
        ----
        Additional parameters are passed directly to the scoring function.
        """
        if isinstance(protein, six.string_types):
            extension = protein.split('.')[-1]
            protein = six.next(toolkit.readfile(extension, protein))
            protein.protein = True
        if protein is not None:
            # trigger cache
            protein.atom_dict

        if isinstance(function, six.string_types):
            name = function.lower()
            if name.startswith('rfscore'):
                from oddt.scoring.functions.RFScore import rfscore
                new_kwargs = {}
                for bit in name.split('_'):
                    if bit.startswith('pdbbind'):
                        new_kwargs['pdbbind_version'] = \
                            int(bit.replace('pdbbind', ''))
                    elif bit.startswith('v'):
                        new_kwargs['version'] = int(bit.replace('v', ''))
                sf = rfscore.load(**new_kwargs)
            elif name.startswith('nnscore'):
                from oddt.scoring.functions.NNScore import nnscore
                new_kwargs = {}
                for bit in name.split('_'):
                    if bit.startswith('pdbbind'):
                        new_kwargs['pdbbind_version'] = \
                            int(bit.replace('pdbbind', ''))
                sf = nnscore.load(**new_kwargs)
            elif name == 'autodock_vina':
                from oddt.docking import autodock_vina
                sf = autodock_vina(protein, *args, **kwargs)
            elif isfile(function):
                sf = scorer.load(function)
            else:
                raise ValueError('Scoring Function %s was not implemented '
                                 'in ODDT' % function)
        elif isinstance(function, scorer):
            sf = function
        else:
            # Arbitrary objects need not have __name__; fall back to the
            # type name instead of failing with AttributeError here.
            raise ValueError('Supplied object "%s" is not an ODDT scoring '
                             'funtion' % getattr(function, '__name__',
                                                 type(function).__name__))
        sf.set_protein(protein)

        if self.n_cpu != 1:
            score_ligand = partial(_parallel_helper, sf, 'predict_ligand')
            self._pipe = self._pool_imap(
                score_ligand,
                ({'ligand': lig} for lig in self._pipe),
                chunksize=100)
        else:
            self._pipe = sf.predict_ligands(self._pipe)

    def fetch(self):
        """Consume the pipeline lazily, yielding its molecules one by one.

        ``self.num_output`` is updated for every yielded molecule. With
        `verbose` set, progress is printed to stderr whenever
        ``self.num_input`` is a multiple of 100.
        """
        for n, mol in enumerate(self._pipe):
            self.num_output = n+1
            if self.verbose and self.num_input % 100 == 0:
                # num_input is 0 when the pipe was not filled by
                # load_ligands; avoid dividing by zero in that case.
                if self.num_input:
                    percent = (float(self.num_output) /
                               float(self.num_input) * 100)
                else:
                    percent = 0.
                print("Passed: %i (%.2f%%)\tTotal: %i\r"
                      % (self.num_output, percent, self.num_input),
                      file=sys.stderr, end=" ")
            yield mol
        if self.verbose:
            print('', file=sys.stderr)

    @classmethod
    def _csv_data(cls, mol):
        """Return the CSV row (dict) for `mol`: its data without the
        blacklisted fields plus a 'name' entry holding the title.
        Return None when no data is left after removing those fields."""
        data = mol.data.to_dict()
        # filter some internal data
        for key in cls._CSV_BLACKLIST_KEYS:
            if key in data:
                del data[key]
        if len(data) == 0:
            return None
        data['name'] = mol.title
        return data

    # Consume the pipe
    def write(self, fmt, filename, csv_filename=None, **kwargs):
        """Outputs molecules to a file

        Afterwards, if `filename` exists, the pipeline is re-pointed at the
        written file so that it can be consumed again.

        Parameters
        ----------
        fmt: string
            Type of molecular file

        filename: string
            Path to a output file

        csv_filename: string
            Optional path to a CSV file

        Returns
        -------
        False if a molecule without any data is met while writing the CSV
        file (writing stops at that molecule), None otherwise.

        Note
        ----
        Additional parameters are passed to ``toolkit.Outputfile`` and,
        except for 'opt' and 'overwrite', to ``csv.DictWriter``.
        """
        kwargs = self._with_ob_mol2_opt(fmt, kwargs)
        output_mol_file = toolkit.Outputfile(fmt, filename, **kwargs)
        f = None
        csv_file = None
        try:
            if csv_filename:
                f = open(csv_filename, 'w')
            for mol in self.fetch():
                if csv_filename:
                    data = self._csv_data(mol)
                    if data is None:
                        print("There is no data to write in CSV file",
                              file=sys.stderr)
                        return False
                    if csv_file is None:
                        # The header comes from the first molecule's data.
                        csv_kwargs = dict(
                            (key, value) for key, value in kwargs.items()
                            if key not in self._MOLFILE_ONLY_KWARGS)
                        csv_file = csv.DictWriter(f, data.keys(),
                                                  **csv_kwargs)
                        csv_file.writeheader()
                    csv_file.writerow(data)
                # write ligand
                output_mol_file.write(mol)
        finally:
            # Close both files on every exit path, including the early
            # return above and exceptions.
            output_mol_file.close()
            if f is not None:
                f.close()
        # if 'keep_pipe' in kwargs and kwargs['keep_pipe']:
        if isfile(filename):
            # this argument is unsupported in readfile; it is optional, so
            # pop it with a default instead of raising KeyError
            kwargs.pop('overwrite', None)
            self._pipe = toolkit.readfile(fmt, filename, **kwargs)

    def write_csv(self, csv_filename, fields=None, keep_pipe=False, **kwargs):
        """Outputs molecules to a csv file

        Parameters
        ----------
        csv_filename: string or file-like object
            Path to a CSV file, or an object with a ``write`` method. A
            file-like object is closed once all molecules are written.

        fields: list (default None)
            List of fields to save in CSV file. Defaults to the data fields
            of the first molecule; fields missing from this list are
            ignored.

        keep_pipe: bool (default=False)
            If set to True, the ligand pipe is sustained.
            Currently not implemented: the flag has no effect.

        Returns
        -------
        False if a molecule without any data is met (writing stops at that
        molecule), None otherwise.

        Note
        ----
        Additional parameters are passed directly to ``csv.DictWriter``.
        """
        opened_here = not hasattr(csv_filename, 'write')
        f = open(csv_filename, 'w') if opened_here else csv_filename
        csv_file = None
        try:
            for mol in self.fetch():
                data = self._csv_data(mol)
                if data is None:
                    print("There is no data to write in CSV file",
                          file=sys.stderr)
                    return False
                if csv_file is None:
                    csv_file = csv.DictWriter(f, fields or data.keys(),
                                              extrasaction='ignore',
                                              **kwargs)
                    csv_file.writeheader()
                csv_file.writerow(data)
                if keep_pipe:
                    # write ligand using pickle
                    pass
        finally:
            # A file opened here must not leak on early return or error.
            if opened_here:
                f.close()
        if not opened_here:
            # Backward compatibility: a caller-supplied handle has always
            # been closed after a complete run (but not on early return).
            f.close()