""" Datasets wrapped in convenient models """
from __future__ import print_function
import sys
import os
import six
import pandas as pd
from os.path import isfile, isdir
from os import listdir
import warnings
from oddt import toolkit
class pdbbind(object):
    """Wrapper for a local copy of the PDBbind database.

    Index files found under ``home`` are parsed on construction; every set
    whose index file exists is exposed through ``sets`` (a dict mapping
    PDB id to activity). Individual complexes are returned as
    ``_pdbbind_id`` objects by iteration or indexing.

    Parameters
    ----------
    home : str
        Path to the PDBbind directory.
    version : int or str
        PDBbind release (e.g. 2007, 2015, 2016). Required; it selects the
        index file naming scheme and location.
    default_set : str (default=None)
        Set used by ``ids``, ``activities``, iteration and indexing.
        If not given, 'general' is used for version 2007 and
        'general_PL' for every other version.
    opt : dict (default=None)
        Options forwarded to ``toolkit.readfile`` when reading structures.

    Raises
    ------
    ValueError
        If ``version`` is None.
    Exception
        If none of the expected index files exists.
    """

    def __init__(self,
                 home,
                 version=None,
                 default_set=None,
                 opt=None):
        if version is None:
            raise ValueError('PDBbind version not specified')
        version = int(version)
        self.home = home

        # The "general" set was renamed after the 2007 release.
        general_set = 'general' if version == 2007 else 'general_PL'
        self.default_set = default_set if default_set else general_set
        self.opt = opt or {}
        self.sets = {}
        self._set_ids = {}
        self._set_act = {}

        # list of protein ids that are known to segfault toolkits
        self.protein_blacklist = {
            'ob': {'1e8h', '1ntk', '1nu1', '1rbo', '1sqb', '1sqp', '1sqq',
                   '2f2h', '2wig', '2wij', '2wik', '3axk', '3axm', '3cf1',
                   # Following segfault on systems with smaller RAM
                   '1px4', '1pyg', '1zyr', '3a2c', '3dxj', '3dyo', '3eql',
                   '3f33', '3f34', '3f35', '3f36', '3f37', '3f38', '3f39',
                   '3i3b', '3i3d', '3k1j', '3muz', '3mv0', '3n75', '3t08',
                   '3t09', '3t0b', '3t0d', '3t2p', '3t2q', '3vd4', '3vd7',
                   '3vd9', '3vdb', '3vdc', '3wi6', '4kmu', '4kn4', '4kn7',
                   '7gpb',
                   # extended use segfaults (not only reading problem)
                   '1l7x',
                   },
            # an empty *set* (``{}`` would be a dict), matching 'ob'
            'rdk': set(),
        }

        self.pdbind_sets = ['core', 'refined', general_set]
        for pdbind_set in self.pdbind_sets:
            if version == 2007:
                csv_file = os.path.join(self.home, 'INDEX.%i.%s.data'
                                        % (version, pdbind_set))
            elif version >= 2016:
                csv_file = os.path.join(self.home, 'index',
                                        'INDEX_%s_data.%i'
                                        % (pdbind_set, version))
            else:
                csv_file = os.path.join(self.home, 'INDEX_%s_data.%i'
                                        % (pdbind_set, version))

            if not os.path.isfile(csv_file):
                continue

            data = pd.read_csv(csv_file,
                               sep=r'\s+',
                               usecols=[0, 1, 2, 3],
                               names=['pdbid',
                                      'resolution',
                                      'release_year',
                                      'act'],
                               # ids such as "1e66" look like floats; keep
                               # them as text
                               dtype={'pdbid': str},
                               comment='#')
            self._set_ids[pdbind_set] = data['pdbid'].tolist()
            self._set_act[pdbind_set] = data['act'].tolist()
            self.sets[pdbind_set] = dict(zip(self._set_ids[pdbind_set],
                                             self._set_act[pdbind_set]))

        if len(self.sets) == 0:
            raise Exception('There is no PDBbind set availabe')

    @property
    def ids(self):
        """List of PDB ids of the default set, in index-file order."""
        return self._set_ids[self.default_set]

    @property
    def activities(self):
        """List of activities of the default set, in index-file order."""
        return self._set_act[self.default_set]

    def __iter__(self):
        """Yield a ``_pdbbind_id`` for every id of the default set."""
        for pdbid in self.ids:
            yield _pdbbind_id(self.home, pdbid, opt=self.opt)

    def __getitem__(self, pdbid):
        """Return the complex with the given PDB id or integer position.

        Parameters
        ----------
        pdbid : str or int
            PDB id from the default set, or a (possibly negative) position
            in ``ids``.

        Raises
        ------
        KeyError
            If ``pdbid`` is neither a known id nor a valid position.
        """
        ids = self.ids
        if pdbid in ids:
            target_id = pdbid
        elif isinstance(pdbid, int) and -len(ids) <= pdbid < len(ids):
            target_id = ids[pdbid]
        else:
            raise KeyError('There is no such target ("%s")' % pdbid)

        # A backend without a blacklist entry has nothing blacklisted.
        if target_id in self.protein_blacklist.get(toolkit.backend, ()):
            # Report the resolved id, not the integer position.
            warnings.warn('A protein "%s" is blacklisted (known to segfault) '
                          'for current toolkit. Proceed at your own risk.'
                          % target_id, UserWarning)
        return _pdbbind_id(self.home, target_id, opt=self.opt)
class _pdbbind_id(object):
    """Lazy accessor for the files of a single PDBbind complex.

    Parameters
    ----------
    home : str
        Path to the PDBbind directory.
    pdbid : str
        PDB id; files are looked up in ``<home>/<pdbid>/``.
    opt : dict (default=None)
        Options forwarded to ``toolkit.readfile``.
    """

    def __init__(self, home, pdbid, opt=None):
        self.home = home
        self.id = pdbid
        self.opt = opt or {}

    def _read_first(self, suffix, fmt, mark_protein=False):
        """Return the first molecule of ``<id>_<suffix>`` or None.

        None is returned when the file does not exist or yields no
        molecule; ``next`` gets a default so that ``StopIteration`` never
        escapes from a property getter.
        """
        path = os.path.join(self.home, self.id,
                            '%s_%s' % (self.id, suffix))
        if not os.path.isfile(path):
            return None
        mol = next(toolkit.readfile(fmt, path, lazy=True, opt=self.opt),
                   None)
        if mark_protein and mol is not None:
            mol.protein = True
        return mol

    @property
    def protein(self):
        """Molecule read from ``<id>_protein.pdb`` (flagged as protein),
        or None if unavailable."""
        return self._read_first('protein.pdb', 'pdb', mark_protein=True)

    @property
    def pocket(self):
        """Molecule read from ``<id>_pocket.pdb`` (flagged as protein),
        or None if unavailable."""
        return self._read_first('pocket.pdb', 'pdb', mark_protein=True)

    @property
    def ligand(self):
        """Molecule read from ``<id>_ligand.sdf``, or None if
        unavailable."""
        return self._read_first('ligand.sdf', 'sdf')
class dude(object):
    """Wrapper for DUD-E (A Database of Useful Decoys: Enhanced)."""

    def __init__(self, home):
        """A wrapper for DUD-E (A Database of Useful Decoys: Enhanced)
        http://dude.docking.org/

        Known targets that have a sub-directory in ``home`` are collected
        in ``ids``; a message is printed to stderr for every expected file
        a target is missing, and when no target is found at all.

        Parameters
        ----------
        home : str
            Path to files from dud-e

        Raises
        ------
        Exception
            If ``home`` is not an existing directory.
        """
        self.home = home
        if not os.path.isdir(home):
            raise Exception("Directory %s doesn't exist" % home)

        required_files = ('receptor.pdb', 'crystal_ligand.mol2',
                          'actives_final.mol2.gz', 'decoys_final.mol2.gz')
        # ids sorted by size of protein
        known_targets = [
            'fnta', 'dpp4', 'mmp13', 'hivpr', 'ada17', 'mk14', 'egfr', 'src',
            'drd3', 'aa2ar', 'cah2', 'parp1', 'cdk2', 'lck', 'pde5a', 'thrb',
            'aces', 'try1', 'pparg', 'vgfr2', 'pgh2', 'esr1', 'fa10', 'esr2',
            'ppara', 'dhi1', 'hivrt', 'bace1', 'ace', 'dyr', 'akt1', 'adrb1',
            'prgr', 'gcr', 'adrb2', 'andr', 'ppard', 'csf1r', 'gria2', 'cp3a4',
            'met', 'pgh1', 'abl1', 'casp3', 'kit', 'hdac8', 'hdac2', 'braf',
            'urok', 'lkha4', 'igf1r', 'aldr', 'fpps', 'hmdh', 'kpcb', 'tgfr1',
            'ital', 'mp2k1', 'nos1', 'tryb1', 'rxra', 'thb', 'cp2c9', 'ptn1',
            'reni', 'pnph', 'tysy', 'akt2', 'kif11', 'aofb', 'plk1', 'hivint',
            'mk10', 'pyrd', 'grik1', 'jak2', 'rock1', 'fa7', 'mapk2', 'nram',
            'wee1', 'fkb1a', 'def', 'ada', 'fak1', 'mcr', 'pa2ga', 'xiap',
            'hs90a', 'hxk4', 'mk01', 'pygm', 'glcm', 'comt', 'sahh', 'cxcr4',
            'kith', 'ampc', 'pur2', 'fabp4', 'inha', 'fgfr1',
        ]

        self.ids = [target for target in known_targets
                    if os.path.isdir(os.path.join(home, target))]

        for target in self.ids:
            for fname in required_files:
                path = os.path.join(home, target, fname)
                present = os.path.isfile(path)
                # a gzipped file may also be present in unpacked form
                if not present and fname.endswith('.gz'):
                    present = os.path.isfile(path[:-3])
                if not present:
                    print("Target %s doesn't have file %s" % (target, fname),
                          file=sys.stderr)

        if not self.ids:
            print('No targets in directory %s' % (self.home), file=sys.stderr)

    def __iter__(self):
        """Yield a ``_dude_target`` for every available target id."""
        for target in self.ids:
            yield _dude_target(self.home, target)

    def __getitem__(self, dude_id):
        """Return the ``_dude_target`` with the given id.

        Raises
        ------
        KeyError
            If ``dude_id`` is not among the available targets.
        """
        if dude_id not in self.ids:
            raise KeyError('There is no such target ("%s")' % dude_id)
        return _dude_target(self.home, dude_id)
class _dude_target(object):
    """Lazy accessor for the files of a single DUD-E target."""

    def __init__(self, home, dude_id):
        """Allows to read files of the dude target

        Parameters
        ----------
        home : str
            Directory to files from dud-e
        dude_id : str
            Target id
        """
        self.home = home
        self.dude_id = dude_id

    def _path(self, fname):
        """Return the path of ``fname`` inside the target directory."""
        return os.path.join(self.home, self.dude_id, fname)

    def _read_first(self, fmt, fname):
        """Return the first molecule of ``fname`` or None.

        None is returned when the file is missing or yields no molecule;
        ``next`` gets a default so that ``StopIteration`` never escapes
        from a property getter.
        """
        path = self._path(fname)
        if not os.path.isfile(path):
            return None
        return next(toolkit.readfile('%s' % fmt, path), None)

    def _read_all(self, gz_name):
        """Return a molecule reader for a gzipped mol2 file, falling back
        to its unpacked sibling; None if neither exists."""
        packed = self._path(gz_name)
        if os.path.isfile(packed):
            return toolkit.readfile('mol2', packed)
        # check if file is unpacked (same name without ".gz")
        unpacked = packed[:-3]
        if os.path.isfile(unpacked):
            return toolkit.readfile('mol2', unpacked)
        return None

    @property
    def protein(self):
        """Read a protein file"""
        return self._read_first('pdb', 'receptor.pdb')

    @property
    def ligand(self):
        """Read a ligand file"""
        return self._read_first('mol2', 'crystal_ligand.mol2')

    @property
    def actives(self):
        """Read an actives file"""
        return self._read_all('actives_final.mol2.gz')

    @property
    def decoys(self):
        """Read a decoys file"""
        return self._read_all('decoys_final.mol2.gz')
class CASF:
    """Load CASF dataset as described in
    Li, Y. et al. Comparative Assessment of Scoring Functions
    on an Updated Benchmark: 2. Evaluation Methods and General
    Results. J. Chem. Inf. Model. 54, 1717-1736. (2014)
    http://dx.doi.org/10.1021/ci500081m

    Parameters
    ----------
    home: string
        Path to CASF dataset main directory

    Notes
    -----
    ``index_data`` and ``pdbids`` are only set when the directory
    ``<home>/coreset/index/`` exists.
    """

    def __init__(self, home):
        self.home = home
        self.index = '%s/coreset/index/' % self.home

        if isdir(self.index):
            filepath = '%s/2013_core_data.lst' % self.index
            self.index_data = pd.read_csv(filepath,
                                          sep=r'\s+',
                                          comment='#',
                                          header=None,
                                          names=['pdbid', 'act', 'cluster'],
                                          usecols=[0, 1, 5],
                                          # ids such as "1e66" look like
                                          # floats; keep them as text
                                          dtype={'pdbid': str})
            self.pdbids = self.index_data['pdbid']

    def __iter__(self):
        """Yield a ``_CASFTarget`` for every PDB id in the index."""
        for pdbid in self.pdbids:
            yield _CASFTarget(self.home, pdbid)

    def __getitem__(self, item):
        """Return the target with the given PDB id or integer position.

        Parameters
        ----------
        item: string or int
            PDB id from the index, or a (possibly negative) position
            in ``pdbids``.

        Raises
        ------
        KeyError
            If ``item`` is neither a known id nor a valid position.
        """
        # ``in`` on a pandas Series tests the index labels, not the values,
        # so membership is checked on a plain list of ids.
        ids = self.pdbids.tolist()
        if isinstance(item, int):
            if -len(ids) <= item < len(ids):
                return _CASFTarget(self.home, ids[item])
        elif item in ids:
            return _CASFTarget(self.home, item)
        raise KeyError('There is no such target ("%s")' % item)

    def precomputed_score(self, scoring_function=None):
        """Load precomputed results of scoring power
        test for various scoring functions.

        Parameters
        ----------
        scoring_function: string (default=None)
            Name of the scoring function to get results
            If None, all results are returned.

        Returns
        -------
        pandas.DataFrame
            Columns 'pdbid', 'score_crystal', 'score_opt', 'act' and
            'scoring_function'; frames of the individual scoring
            functions are concatenated with their own row index.

        Raises
        ------
        FileNotFoundError
            If there is no result file for a scoring function.
        """
        examples_dir = '%s/power_scoring/examples' % self.home

        if scoring_function is not None:
            functions = [scoring_function]
        else:
            # README (if present) is not a result file; sorting makes the
            # row order independent of the file system.
            functions = sorted(name for name in listdir(examples_dir)
                               if name != 'README')

        act = self.index_data[['pdbid', 'act']]
        frames = []
        for fun in functions:
            file_score = '%s/%s' % (examples_dir, fun)
            if not isfile(file_score):
                raise FileNotFoundError('Invalid scoring function name')
            score = pd.read_csv(file_score, comment='#',
                                sep=r'\s+', header=None,
                                names=['pdbid', 'score_crystal', 'score_opt'],
                                dtype={'pdbid': str})
            scores = pd.merge(score, act, on='pdbid')
            # scalar broadcast works for any number of rows
            scores['scoring_function'] = fun
            frames.append(scores)
        return pd.concat(frames)

    def _screening_frame(self, file_dir, fun, protein, cluster=None):
        """Load the screening scores computed with function ``fun`` for one
        target ``protein`` and annotate them with activities.

        ``cluster`` is an optional ``(cluster_id, protein, proteins)``
        tuple; when given, the three cluster columns are added as well.
        """
        frame = pd.read_csv('%s/%s_score.dat' % (file_dir, protein),
                            sep=r'\s+', header=None,
                            names=['name', 'score'])
        if cluster is not None:
            frame['cluster_id'] = [cluster[0]] * len(frame)
            frame['protein_structure'] = [protein] * len(frame)
            frame['cluster_proteins'] = [cluster[2]] * len(frame)
        # the first four characters of the name are used as the PDB id
        frame['pdbid'] = [name[:4] for name in frame['name']]
        frame['scoring_function'] = [fun] * len(frame)
        return frame.merge(self.index_data[['pdbid', 'act']])

    def precomputed_screening(self, scoring_function=None, cluster_id=None):
        """Load precomputed results of screening power
        test for various scoring functions

        Parameters
        ----------
        scoring_function: string (default=None)
            Name of the scoring function to get results
            If None, all results are returned
        cluster_id: int (default=None)
            Number of the protein cluster to get results
            If None, all results are returned

        Returns
        -------
        pandas.DataFrame
            Columns 'name', 'score', 'pdbid', 'scoring_function' and
            'act'; when ``cluster_id`` is not given also 'cluster_id',
            'protein_structure' and 'cluster_proteins'.

        Raises
        ------
        FileNotFoundError
            If there is no result directory for a scoring function.
        """
        screening_dir = '%s/power_screening' % self.home
        examples_dir = '%s/examples' % screening_dir

        if scoring_function is not None:
            functions = [scoring_function]
        else:
            # results live in one directory per scoring function; skip
            # stray files when listing them
            functions = [name for name in sorted(listdir(examples_dir))
                         if isdir('%s/%s' % (examples_dir, name))]

        # (cluster number, representative structure, member proteins);
        # clusters are numbered from 1 in file order
        clusters = []
        with open('%s/TargetInfo.dat' % screening_dir) as data_file:
            for line in data_file:
                if line.startswith('#'):
                    continue
                fields = line.split()
                if not fields:
                    continue
                clusters.append((len(clusters) + 1, fields[0], fields[1:]))

        frames = []
        for fun in functions:
            file_dir = '%s/%s' % (examples_dir, fun)
            if not isdir(file_dir):
                raise FileNotFoundError('Invalid scoring function name')

            if cluster_id:
                protein = clusters[cluster_id - 1][1]
                frames.append(self._screening_frame(file_dir, fun, protein))
            else:
                for cluster in clusters:
                    frames.append(self._screening_frame(file_dir, fun,
                                                        cluster[1],
                                                        cluster=cluster))
        return pd.concat(frames, ignore_index=True)
class _CASFTarget:
    """
    Used by CASF class.
    Load CASF target (protein and ligand) with given ID.

    Parameters
    ----------
    home: string
        Path to CASF dataset main directory
    pdbid: string
        ID of target protein
    """

    def __init__(self, home, pdbid):
        self.home = home
        self.pdbid = pdbid

    @staticmethod
    def _first_molecule(filepath):
        """Return the first molecule of a mol2 file, or None when the file
        is missing or yields no molecule.

        ``next`` gets a default so that ``StopIteration`` never escapes
        from a property getter.
        """
        if not isfile(filepath):
            return None
        return next(toolkit.readfile('mol2', filepath), None)

    @property
    def protein(self):
        """Load target protein from mol2 file as a toolkit molecule
        (None if unavailable)"""
        return self._first_molecule('%s/coreset/%s/%s_protein.mol2' % (
            self.home, self.pdbid, self.pdbid))

    @property
    def ligand(self):
        """Load target ligand from mol2 file as a toolkit molecule
        (None if unavailable)"""
        return self._first_molecule('%s/coreset/%s/%s_ligand.mol2' % (
            self.home, self.pdbid, self.pdbid))

    @property
    def decoys_docking(self):
        """Load decoys used for docking from mol2
        file as list of toolkit molecules (None if the file is missing)"""
        filepath = '%s/decoys_docking/%s_decoys.mol2' % (self.home, self.pdbid)
        if not isfile(filepath):
            return None
        return list(toolkit.readfile('mol2', filepath))

    @property
    def decoys_screening(self):
        """Load decoys used for screening from mol2
        files as list of toolkit molecules (None if the directory is
        missing)"""
        dirpath = '%s/decoys_screening/%s' % (self.home, self.pdbid)
        if not isdir(dirpath):
            return None
        decoys = []
        # listdir order is arbitrary; sort for a reproducible result
        for fname in sorted(listdir(dirpath)):
            decoy = next(toolkit.readfile('mol2', dirpath + '/' + fname),
                         None)
            # files that yield no molecule are skipped
            if decoy is not None:
                decoys.append(decoy)
        return decoys