""" Datasets wrapped in convenient models """
from __future__ import print_function
import sys
import os
import six
import pandas as pd
from os.path import isfile, isdir
from os import listdir
from oddt import toolkit
class pdbbind(object):
    """Wrapper around a local copy of the PDBbind dataset.

    The index files found under `home` are parsed into per-set lists of
    PDB ids and activities. Single targets are handed out as `_pdbbind_id`
    objects through iteration and indexing.

    Parameters
    ----------
    home : str
        Path to the PDBbind directory.
    version : int or str
        PDBbind release; it selects the index file naming scheme:
        2007 -> ``<home>/INDEX.<version>.<set>.data``,
        2016 -> ``<home>/index/INDEX_<set>_data.<version>``,
        other -> ``<home>/INDEX_<set>_data.<version>``.
        Required, despite the ``None`` default.
    default_set : str, optional
        Name of the set used by `ids`, `activities`, iteration and
        indexing. Defaults to 'general' for version 2007 and to
        'general_PL' otherwise.
    opt : dict, optional
        Options forwarded to `toolkit.readfile` when a target is read.

    Raises
    ------
    ValueError
        If `version` is not given.
    Exception
        If none of the expected index files exists.
    """

    def __init__(self,
                 home,
                 version=None,
                 default_set=None,
                 opt=None):
        if version is None:
            raise ValueError('PDBbind version not specified')
        version = int(version)

        self.home = home
        # The 2007 release names its largest set differently.
        general_set = 'general' if version == 2007 else 'general_PL'
        self.default_set = default_set if default_set else general_set
        self.opt = opt or {}
        self.sets = {}
        self._set_ids = {}
        self._set_act = {}
        self.pdbind_sets = ['core', 'refined', general_set]

        for pdbind_set in self.pdbind_sets:
            if version == 2007:
                csv_file = os.path.join(self.home, 'INDEX.%i.%s.data'
                                        % (version, pdbind_set))
            elif version == 2016:
                csv_file = os.path.join(self.home, 'index', 'INDEX_%s_data.%i'
                                        % (pdbind_set, version))
            else:
                csv_file = os.path.join(self.home, 'INDEX_%s_data.%i'
                                        % (pdbind_set, version))
            # Sets whose index file is absent are silently skipped.
            if not os.path.isfile(csv_file):
                continue
            # Raw string: '\s' is not a valid escape in a plain literal.
            data = pd.read_csv(csv_file,
                               sep=r'\s+',
                               usecols=[0, 1, 2, 3],
                               names=['pdbid',
                                      'resolution',
                                      'release_year',
                                      'act'],
                               comment='#')
            set_ids = data['pdbid'].tolist()
            set_act = data['act'].tolist()
            self._set_ids[pdbind_set] = set_ids
            self._set_act[pdbind_set] = set_act
            self.sets[pdbind_set] = dict(zip(set_ids, set_act))

        if not self.sets:
            raise Exception('There is no PDBbind set availabe')

    @property
    def ids(self):
        """List of PDB ids of the default set, in index-file order."""
        return self._set_ids[self.default_set]

    @property
    def activities(self):
        """List of activities of the default set, aligned with `ids`."""
        return self._set_act[self.default_set]

    def __iter__(self):
        """Yield a `_pdbbind_id` for every id of the default set."""
        for pdbid in self.ids:
            yield _pdbbind_id(self.home, pdbid, opt=self.opt)

    def __getitem__(self, pdbid):
        """Return a target by PDB id or by integer position.

        Membership in `ids` is tested first; an integer that is not itself
        an id is treated as a (possibly negative) position in `ids`.

        Raises
        ------
        KeyError
            If `pdbid` is neither a known id nor a valid position.
        """
        ids = self.ids
        if pdbid in ids:
            return _pdbbind_id(self.home, pdbid, opt=self.opt)
        if isinstance(pdbid, int) and -len(ids) <= pdbid < len(ids):
            return _pdbbind_id(self.home, ids[pdbid], opt=self.opt)
        raise KeyError('There is no such target ("%s")' % pdbid)
class _pdbbind_id(object):
def __init__(self, home, pdbid, opt=None):
self.home = home
self.id = pdbid
self.opt = opt or {}
@property
def protein(self):
f = os.path.join(self.home, self.id, '%s_protein.pdb' % self.id)
if os.path.isfile(f):
return next(toolkit.readfile('pdb', f, lazy=True, opt=self.opt))
else:
return None
@property
def pocket(self):
f = os.path.join(self.home, self.id, '%s_pocket.pdb' % self.id)
if os.path.isfile(f):
return next(toolkit.readfile('pdb', f, lazy=True, opt=self.opt))
else:
return None
@property
def ligand(self):
f = os.path.join(self.home, self.id, '%s_ligand.sdf' % self.id)
if os.path.isfile(f):
return next(toolkit.readfile('sdf', f, lazy=True, opt=self.opt))
else:
return None
class dude(object):
    """Collection of DUD-E targets found in a local directory."""

    def __init__(self, home):
        """Wrap a local copy of DUD-E (A Database of Useful Decoys: Enhanced)

        http://dude.docking.org/

        Every known target id that has a sub-directory in `home` is
        registered in `ids`; a warning is printed to stderr for each
        expected file the target lacks.

        Parameters
        ----------
        home : str
            Path to files from dud-e

        Raises
        ------
        Exception
            If `home` is not an existing directory.
        """
        self.home = home
        if not os.path.isdir(self.home):
            raise Exception('Directory %s doesn\'t exist' % self.home)

        expected_files = ['receptor.pdb', 'crystal_ligand.mol2',
                          'actives_final.mol2.gz', 'decoys_final.mol2.gz']

        # ids sorted by size of protein
        known_targets = [
            'fnta', 'dpp4', 'mmp13', 'hivpr', 'ada17', 'mk14', 'egfr', 'src',
            'drd3', 'aa2ar', 'cah2', 'parp1', 'cdk2', 'lck', 'pde5a', 'thrb',
            'aces', 'try1', 'pparg', 'vgfr2', 'pgh2', 'esr1', 'fa10', 'esr2',
            'ppara', 'dhi1', 'hivrt', 'bace1', 'ace', 'dyr', 'akt1', 'adrb1',
            'prgr', 'gcr', 'adrb2', 'andr', 'ppard', 'csf1r', 'gria2', 'cp3a4',
            'met', 'pgh1', 'abl1', 'casp3', 'kit', 'hdac8', 'hdac2', 'braf',
            'urok', 'lkha4', 'igf1r', 'aldr', 'fpps', 'hmdh', 'kpcb', 'tgfr1',
            'ital', 'mp2k1', 'nos1', 'tryb1', 'rxra', 'thb', 'cp2c9', 'ptn1',
            'reni', 'pnph', 'tysy', 'akt2', 'kif11', 'aofb', 'plk1', 'hivint',
            'mk10', 'pyrd', 'grik1', 'jak2', 'rock1', 'fa7', 'mapk2', 'nram',
            'wee1', 'fkb1a', 'def', 'ada', 'fak1', 'mcr', 'pa2ga', 'xiap',
            'hs90a', 'hxk4', 'mk01', 'pygm', 'glcm', 'comt', 'sahh', 'cxcr4',
            'kith', 'ampc', 'pur2', 'fabp4', 'inha', 'fgfr1'
        ]

        self.ids = []
        for target in known_targets:
            target_dir = os.path.join(self.home, target)
            if not os.path.isdir(target_dir):
                continue
            self.ids.append(target)
            for fname in expected_files:
                path = os.path.join(target_dir, fname)
                if os.path.isfile(path):
                    continue
                # A gzipped file may have been unpacked next to its name.
                if fname.endswith('.gz') and os.path.isfile(path[:-3]):
                    continue
                print('Target %s doesn\'t have file %s' % (target, fname),
                      file=sys.stderr)

        if len(self.ids) == 0:
            print('No targets in directory %s' % (self.home), file=sys.stderr)

    def __iter__(self):
        """Yield a `_dude_target` for every registered target id."""
        for target in self.ids:
            yield _dude_target(self.home, target)

    def __getitem__(self, dude_id):
        """Return the `_dude_target` of `dude_id`; KeyError if unknown."""
        if dude_id not in self.ids:
            raise KeyError('There is no such target ("%s")' % dude_id)
        return _dude_target(self.home, dude_id)
class _dude_target(object):
def __init__(self, home, dude_id):
"""Allows to read files of the dude target
Parameters
----------
home : str
Directory to files from dud-e
dude_id : str
Target id
"""
self.home = home
self.dude_id = dude_id
@property
def protein(self):
"""Read a protein file"""
f = os.path.join(self.home, self.dude_id, 'receptor.pdb')
if os.path.isfile(f):
return next(toolkit.readfile('pdb', f))
else:
return None
@property
def ligand(self):
"""Read a ligand file"""
f = os.path.join(self.home, self.dude_id, 'crystal_ligand.mol2')
if os.path.isfile(f):
return next(toolkit.readfile('mol2', f))
else:
return None
@property
def actives(self):
"""Read an actives file"""
f = os.path.join(self.home, self.dude_id, 'actives_final.mol2.gz')
if os.path.isfile(f):
return toolkit.readfile('mol2', f)
# check if file is unpacked
elif os.path.isfile(f[:-3]):
return toolkit.readfile('mol2', f[:-3])
else:
return None
@property
def decoys(self):
"""Read a decoys file"""
f = os.path.join(self.home, self.dude_id, 'decoys_final.mol2.gz')
if os.path.isfile(f):
return toolkit.readfile('mol2', f)
# check if file is unpacked
elif os.path.isfile(f[:-3]):
return toolkit.readfile('mol2', f[:-3])
else:
return None
class CASF:
    """Load CASF dataset as described in
    Li, Y. et al. Comparative Assessment of Scoring Functions
    on an Updated Benchmark: 2. Evaluation Methods and General
    Results. J. Chem. Inf. Model. 54, 1717-1736. (2014)
    http://dx.doi.org/10.1021/ci500081m

    Parameters
    ----------
    home: string
        Path to CASF dataset main directory

    Attributes
    ----------
    index_data: pandas.DataFrame
        Columns 'pdbid', 'act' and 'cluster' read from
        ``<home>/coreset/index/2013_core_data.lst``; empty when the index
        directory does not exist.
    pdbids: pandas.Series
        The 'pdbid' column of `index_data`.
    """

    def __init__(self, home):
        self.home = home
        self.index = '%s/coreset/index/' % self.home
        columns = ['pdbid', 'act', 'cluster']
        if isdir(self.index):
            filepath = '%s/2013_core_data.lst' % self.index
            self.index_data = pd.read_csv(filepath,
                                          sep=r'\s+',
                                          comment='#',
                                          header=None,
                                          names=columns,
                                          usecols=[0, 1, 5])
        else:
            # Keep the attributes defined so that later calls do not fail
            # with AttributeError when the index directory is missing.
            self.index_data = pd.DataFrame(columns=columns)
        self.pdbids = self.index_data['pdbid']

    def __iter__(self):
        """Yield a `_CASFTarget` for every PDB id of the index."""
        for pdbid in self.pdbids:
            yield _CASFTarget(self.home, pdbid)

    def __getitem__(self, item):
        """Return a target by PDB id or by integer position.

        Raises
        ------
        KeyError
            If `item` is neither a known id nor a valid position.
        """
        # `in` on a Series tests its index labels, so compare against the
        # values explicitly.
        pdbids = list(self.pdbids)
        if item in pdbids:
            return _CASFTarget(self.home, item)
        if isinstance(item, int) and -len(pdbids) <= item < len(pdbids):
            return _CASFTarget(self.home, pdbids[item])
        raise KeyError('There is no such target ("%s")' % item)

    def precomputed_score(self, scoring_function=None):
        """Load precomputed results of scoring power
        test for various scoring functions.

        Parameters
        ----------
        scoring_function: string (default=None)
            Name of the scoring function to get results
            If None, all results are returned.

        Returns
        -------
        pandas.DataFrame
            Columns 'pdbid', 'score_crystal', 'score_opt', 'act' and
            'scoring_function'.

        Raises
        ------
        FileNotFoundError
            If there is no result file for a requested scoring function.
        """
        examples_dir = '%s/power_scoring/examples' % self.home
        if scoring_function is not None:
            functions = [scoring_function]
        else:
            # Sorted for a reproducible row order; README is not a result.
            functions = sorted(name for name in listdir(examples_dir)
                               if name != 'README')

        act = self.index_data[['pdbid', 'act']]
        frames = []
        for fun in functions:
            file_score = '%s/%s' % (examples_dir, fun)
            if not isfile(file_score):
                raise FileNotFoundError('Invalid scoring function name')
            score = pd.read_csv(file_score, comment='#',
                                sep=r'\s+', header=None,
                                names=['pdbid', 'score_crystal', 'score_opt'])
            scores = pd.merge(score, act)
            # Scalar assignment labels every row, whatever their number.
            scores['scoring_function'] = fun
            frames.append(scores)
        return pd.concat(frames)

    def precomputed_screening(self, scoring_function=None, cluster_id=None):
        """Load precomputed results of screening power
        test for various scoring functions

        Parameters
        ----------
        scoring_function: string (default=None)
            Name of the scoring function to get results
            If None, all results are returned
        cluster_id: int (default=None)
            Number of the protein cluster to get results (1-based line
            position in ``TargetInfo.dat``, comments excluded)
            If None, all results are returned

        Returns
        -------
        pandas.DataFrame
            Columns 'name', 'score', 'pdbid', 'scoring_function' and 'act';
            when no cluster is selected also 'cluster_id',
            'protein_structure' and 'cluster_proteins'.

        Raises
        ------
        FileNotFoundError
            If there is no result directory for a requested scoring function.
        """
        screening_dir = '%s/power_screening' % self.home
        examples_dir = '%s/examples' % screening_dir
        if scoring_function is not None:
            functions = [scoring_function]
        else:
            # Results live in one directory per scoring function; stray
            # files next to them are not scoring functions.
            functions = sorted(name for name in listdir(examples_dir)
                               if isdir('%s/%s' % (examples_dir, name)))

        # (cluster number, representative structure, member proteins)
        clusters = []
        with open('%s/TargetInfo.dat' % screening_dir) as data_file:
            for line in data_file:
                if line.startswith('#'):
                    continue
                fields = line.split()
                if not fields:
                    continue
                clusters.append((len(clusters) + 1, fields[0], fields[1:]))

        # A falsy cluster_id (None or 0) selects every cluster.
        if cluster_id:
            selected = [clusters[cluster_id - 1]]
        else:
            selected = clusters

        act = self.index_data[['pdbid', 'act']]
        frames = []
        for fun in functions:
            file_dir = '%s/%s' % (examples_dir, fun)
            if not isdir(file_dir):
                raise FileNotFoundError('Invalid scoring function name')
            for number, protein, proteins in selected:
                frame = pd.read_csv('%s/%s_score.dat' % (file_dir, protein),
                                    sep=r'\s+', header=None,
                                    names=['name', 'score'])
                if not cluster_id:
                    # Cluster description is attached only to full results.
                    frame['cluster_id'] = number
                    frame['protein_structure'] = protein
                    frame['cluster_proteins'] = [proteins] * len(frame)
                frame['pdbid'] = [name[:4] for name in frame['name']]
                frame['scoring_function'] = fun
                frames.append(frame.merge(act))
        return pd.concat(frames, ignore_index=True)
class _CASFTarget:
"""
Used by CASF class.
Load CASF target (protein and ligand) with given ID.
Parameters
----------
home: string
Path to CASF dataset main directory
pdbid: string
ID of target protein
"""
def __init__(self, home, pdbid):
self.home = home
self.pdbid = pdbid
@property
def protein(self):
"""Load target protein from mol2 file as ob.Molecule object"""
filepath = '%s/coreset/%s/%s_protein.mol2' % (
self.home, self.pdbid, self.pdbid)
if isfile(filepath):
protein = six.next(toolkit.readfile('mol2', filepath))
return protein
return None
@property
def ligand(self):
"""Load target ligand from mol2 file as ob.Molecule object"""
filepath = '%s/coreset/%s/%s_ligand.mol2' % (
self.home, self.pdbid, self.pdbid)
if isfile(filepath):
ligand = six.next(toolkit.readfile('mol2', filepath))
return ligand
return None
@property
def decoys_docking(self):
"""Load decoys used for docking from mol2
file as list of ob.Molecule objects"""
filepath = '%s/decoys_docking/%s_decoys.mol2' % (self.home, self.pdbid)
if isfile(filepath):
decoys = list(toolkit.readfile('mol2', filepath))
return decoys
return None
@property
def decoys_screening(self):
"""Load decoys used for screening from mol2
files as list of ob.Molecule objects"""
dirpath = '%s/decoys_screening/%s' % (self.home, self.pdbid)
if isdir(dirpath):
decoys = []
for file in listdir(dirpath):
decoys.append(six.next(
toolkit.readfile('mol2', dirpath + '/' + file)))
return decoys
return None