Source code for oddt.datasets

""" Datasets wrapped in convenient models """
from __future__ import print_function
import sys
import os
import six
import pandas as pd
from os.path import isfile, isdir
from os import listdir

from oddt import toolkit


[docs]class pdbbind(object): def __init__(self, home, version=None, default_set=None, opt=None): if version is None: raise ValueError('PDBbind version not specified') else: version = int(version) self.home = home if default_set: self.default_set = default_set else: if version == 2007: self.default_set = 'general' else: self.default_set = 'general_PL' self.opt = opt or {} self.sets = {} self._set_ids = {} self._set_act = {} if version == 2007: self.pdbind_sets = ['core', 'refined', 'general'] else: self.pdbind_sets = ['core', 'refined', 'general_PL'] for pdbind_set in self.pdbind_sets: if version == 2007: csv_file = os.path.join(self.home, 'INDEX.%i.%s.data' % (version, pdbind_set)) elif version == 2016: csv_file = os.path.join(self.home, 'index', 'INDEX_%s_data.%i' % (pdbind_set, version)) else: csv_file = os.path.join(self.home, 'INDEX_%s_data.%i' % (pdbind_set, version)) if os.path.isfile(csv_file): data = pd.read_csv(csv_file, sep='\s+', usecols=[0, 1, 2, 3], names=['pdbid', 'resolution', 'release_year', 'act'], comment='#') self._set_ids[pdbind_set] = data['pdbid'].tolist() self._set_act[pdbind_set] = data['act'].tolist() self.sets[pdbind_set] = dict(zip(self._set_ids[pdbind_set], self._set_act[pdbind_set])) if len(self.sets) == 0: raise Exception('There is no PDBbind set availabe') @property def ids(self): # return sorted(self.sets[self.default_set].keys()) return self._set_ids[self.default_set] @property def activities(self): return self._set_act[self.default_set] def __iter__(self): for pdbid in self.ids: yield _pdbbind_id(self.home, pdbid, opt=self.opt) def __getitem__(self, pdbid): if pdbid in self.ids: return _pdbbind_id(self.home, pdbid, opt=self.opt) elif (isinstance(pdbid, int) and pdbid < len(self.ids) and pdbid >= -len(self.ids)): return _pdbbind_id(self.home + '', self.ids[pdbid], opt=self.opt) else: raise KeyError('There is no such target ("%s")' % pdbid)
class _pdbbind_id(object): def __init__(self, home, pdbid, opt=None): self.home = home self.id = pdbid self.opt = opt or {} @property def protein(self): f = os.path.join(self.home, self.id, '%s_protein.pdb' % self.id) if os.path.isfile(f): return next(toolkit.readfile('pdb', f, lazy=True, opt=self.opt)) else: return None @property def pocket(self): f = os.path.join(self.home, self.id, '%s_pocket.pdb' % self.id) if os.path.isfile(f): return next(toolkit.readfile('pdb', f, lazy=True, opt=self.opt)) else: return None @property def ligand(self): f = os.path.join(self.home, self.id, '%s_ligand.sdf' % self.id) if os.path.isfile(f): return next(toolkit.readfile('sdf', f, lazy=True, opt=self.opt)) else: return None
[docs]class dude(object): def __init__(self, home): """A wrapper for DUD-E (A Database of Useful Decoys: Enhanced) http://dude.docking.org/ Parameters ---------- home : str Path to files from dud-e """ self.home = home if not os.path.isdir(self.home): raise Exception('Directory %s doesn\'t exist' % self.home) self.ids = [] files = ['receptor.pdb', 'crystal_ligand.mol2', 'actives_final.mol2.gz', 'decoys_final.mol2.gz'] # ids sorted by size of protein all_ids = [ 'fnta', 'dpp4', 'mmp13', 'hivpr', 'ada17', 'mk14', 'egfr', 'src', 'drd3', 'aa2ar', 'cah2', 'parp1', 'cdk2', 'lck', 'pde5a', 'thrb', 'aces', 'try1', 'pparg', 'vgfr2', 'pgh2', 'esr1', 'fa10', 'esr2', 'ppara', 'dhi1', 'hivrt', 'bace1', 'ace', 'dyr', 'akt1', 'adrb1', 'prgr', 'gcr', 'adrb2', 'andr', 'ppard', 'csf1r', 'gria2', 'cp3a4', 'met', 'pgh1', 'abl1', 'casp3', 'kit', 'hdac8', 'hdac2', 'braf', 'urok', 'lkha4', 'igf1r', 'aldr', 'fpps', 'hmdh', 'kpcb', 'tgfr1', 'ital', 'mp2k1', 'nos1', 'tryb1', 'rxra', 'thb', 'cp2c9', 'ptn1', 'reni', 'pnph', 'tysy', 'akt2', 'kif11', 'aofb', 'plk1', 'hivint', 'mk10', 'pyrd', 'grik1', 'jak2', 'rock1', 'fa7', 'mapk2', 'nram', 'wee1', 'fkb1a', 'def', 'ada', 'fak1', 'mcr', 'pa2ga', 'xiap', 'hs90a', 'hxk4', 'mk01', 'pygm', 'glcm', 'comt', 'sahh', 'cxcr4', 'kith', 'ampc', 'pur2', 'fabp4', 'inha', 'fgfr1' ] for i in all_ids: if os.path.isdir(os.path.join(self.home, i)): self.ids.append(i) for fname in files: f = os.path.join(self.home, i, fname) if not (os.path.isfile(f) or (fname[-3:] == '.gz' and os.path.isfile(f[:-3]))): print('Target %s doesn\'t have file %s' % (i, fname), file=sys.stderr) if not self.ids: print('No targets in directory %s' % (self.home), file=sys.stderr) def __iter__(self): for dude_id in self.ids: yield _dude_target(self.home, dude_id) def __getitem__(self, dude_id): if dude_id in self.ids: return _dude_target(self.home, dude_id) else: raise KeyError('There is no such target ("%s")' % dude_id)
class _dude_target(object): def __init__(self, home, dude_id): """Allows to read files of the dude target Parameters ---------- home : str Directory to files from dud-e dude_id : str Target id """ self.home = home self.dude_id = dude_id @property def protein(self): """Read a protein file""" f = os.path.join(self.home, self.dude_id, 'receptor.pdb') if os.path.isfile(f): return next(toolkit.readfile('pdb', f)) else: return None @property def ligand(self): """Read a ligand file""" f = os.path.join(self.home, self.dude_id, 'crystal_ligand.mol2') if os.path.isfile(f): return next(toolkit.readfile('mol2', f)) else: return None @property def actives(self): """Read an actives file""" f = os.path.join(self.home, self.dude_id, 'actives_final.mol2.gz') if os.path.isfile(f): return toolkit.readfile('mol2', f) # check if file is unpacked elif os.path.isfile(f[:-3]): return toolkit.readfile('mol2', f[:-3]) else: return None @property def decoys(self): """Read a decoys file""" f = os.path.join(self.home, self.dude_id, 'decoys_final.mol2.gz') if os.path.isfile(f): return toolkit.readfile('mol2', f) # check if file is unpacked elif os.path.isfile(f[:-3]): return toolkit.readfile('mol2', f[:-3]) else: return None
[docs]class CASF: """Load CASF dataset as described in Li, Y. et al. Comparative Assessment of Scoring Functions on an Updated Benchmark: 2. Evaluation Methods and General Results. J. Chem. Inf. Model. 54, 1717-1736. (2014) http://dx.doi.org/10.1021/ci500081m Parameters ---------- home: string Path to CASF dataset main directory """ def __init__(self, home): self.home = home self.index = '%s/coreset/index/' % self.home if isdir(self.index): filepath = '%s/2013_core_data.lst' % self.index self.index_data = pd.read_csv(filepath, sep=r'\s+', comment='#', header=None, names=['pdbid', 'act', 'cluster'], usecols=[0, 1, 5]) self.pdbids = self.index_data['pdbid'] def __iter__(self): for pdbid in self.pdbids: yield _CASFTarget(self.home, pdbid) def __getitem__(self, item): if item in self.pdbids: return _CASFTarget(self.home, item) elif isinstance(int, item) and item < len(self.pdbids): return _CASFTarget(self.home, self.pdbids[item]) else: raise KeyError
[docs] def precomputed_score(self, scoring_function=None): """Load precomputed results of scoring power test for various scoring functions. Parameters ---------- scoring_function: string (default=None) Name of the scoring function to get results If None, all results are returned. """ examples_dir = '%s/power_scoring/examples' % self.home if scoring_function is not None: functions = [scoring_function] else: functions = listdir(examples_dir) functions.remove('README') frames = [] for fun in functions: file_score = '%s/%s' % (examples_dir, fun) if not isfile(file_score): raise FileNotFoundError('Invalid scoring function name') score = pd.read_csv(file_score, comment='#', sep=r'\s+', header=None, names=['pdbid', 'score_crystal', 'score_opt']) act = self.index_data[['pdbid', 'act']] scores = pd.merge(score, act) scores['scoring_function'] = pd.Series([fun] * 195, name='Scoring function') frames.append(scores) return pd.concat(frames)
[docs] def precomputed_screening(self, scoring_function=None, cluster_id=None): """Load precomputed results of screening power test for various scoring functions Parameters ---------- scoring_function: string (default=None) Name of the scoring function to get results If None, all results are returned cluster_id: int (default=None) Number of the protein cluster to get results If None, all results are returned """ screening_dir = '%s/power_screening' % self.home examples_dir = '%s/examples' % screening_dir if scoring_function is not None: functions = [scoring_function] else: functions = listdir(examples_dir) cluster_frame = pd.DataFrame(columns=['cluster_id', 'protein_structure', 'cluster_proteins']) data_file = open('%s/TargetInfo.dat' % screening_dir) for cluster, line in enumerate(filter(lambda x: not x.startswith('#'), data_file.readlines())): line = line.split() protein_structure = line[0] cluster_proteins = line[1:] cluster_frame.loc[cluster] = [cluster + 1, protein_structure, cluster_proteins] frames = [] for fun in functions: file_dir = '%s/%s' % (examples_dir, fun) if not isdir(file_dir): raise FileNotFoundError('Invalid scoring function name') if cluster_id: protein = cluster_frame.iloc[cluster_id - 1]['protein_structure'] frame = pd.read_csv('%s/%s_score.dat' % (file_dir, protein), sep=r'\s+', header=None, names=['name', 'score']) frame['pdbid'] = [name[:4] for name in frame['name']] frame['scoring_function'] = [fun] * len(frame) frame = frame.merge(self.index_data[['pdbid', 'act']]) frames.append(frame) else: for row in cluster_frame.itertuples(): protein = row[2] frame = pd.read_csv('%s/%s_score.dat' % (file_dir, protein), sep=r'\s+', header=None, names=['name', 'score']) x = row[1] frame['cluster_id'] = [x] * len(frame) frame['protein_structure'] = [protein] * len(frame) frame['cluster_proteins'] = [row[3]] * len(frame) frame['pdbid'] = [name[:4] for name in frame['name']] frame['scoring_function'] = [fun] * len(frame) frame = frame.merge(self.index_data[['pdbid', 'act']]) frames.append(frame) return pd.concat(frames, ignore_index=True)
class _CASFTarget: """ Used by CASF class. Load CASF target (protein and ligand) with given ID. Parameters ---------- home: string Path to CASF dataset main directory pdbid: string ID of target protein """ def __init__(self, home, pdbid): self.home = home self.pdbid = pdbid @property def protein(self): """Load target protein from mol2 file as ob.Molecule object""" filepath = '%s/coreset/%s/%s_protein.mol2' % ( self.home, self.pdbid, self.pdbid) if isfile(filepath): protein = six.next(toolkit.readfile('mol2', filepath)) return protein return None @property def ligand(self): """Load target ligand from mol2 file as ob.Molecule object""" filepath = '%s/coreset/%s/%s_ligand.mol2' % ( self.home, self.pdbid, self.pdbid) if isfile(filepath): ligand = six.next(toolkit.readfile('mol2', filepath)) return ligand return None @property def decoys_docking(self): """Load decoys used for docking from mol2 file as list of ob.Molecule objects""" filepath = '%s/decoys_docking/%s_decoys.mol2' % (self.home, self.pdbid) if isfile(filepath): decoys = list(toolkit.readfile('mol2', filepath)) return decoys return None @property def decoys_screening(self): """Load decoys used for screening from mol2 files as list of ob.Molecule objects""" dirpath = '%s/decoys_screening/%s' % (self.home, self.pdbid) if isdir(dirpath): decoys = [] for file in listdir(dirpath): decoys.append(six.next( toolkit.readfile('mol2', dirpath + '/' + file))) return decoys return None