# Source code for oddt.datasets

""" Datasets wrapped in convenient models """
from __future__ import print_function
import sys
import os
import six
import pandas as pd
from os.path import isfile, isdir
from os import listdir
import warnings

from oddt import toolkit


class pdbbind(object):
    """Wrapper for a local copy of the PDBbind dataset.

    On construction the index files of the 'core', 'refined' and 'general'
    ('general_PL' for every version other than 2007) sets are looked up
    under `home`; each one that exists is parsed and exposed through
    `sets`, `ids` and `activities`. Single complexes are obtained by
    iterating over the object or by indexing it with a PDB id or an integer
    position.

    Parameters
    ----------
    home : str
        Path to the PDBbind directory.

    version : int or str
        PDBbind version (release year). It is converted with ``int`` and
        selects the name and location of the index files. Required.

    default_set : str (default=None)
        Name of the set used by `ids`, `activities`, iteration and indexing.
        If empty, 'general' is used for version 2007 and 'general_PL' for
        any other version.

    opt : dict (default=None)
        Options forwarded to ``toolkit.readfile`` by the returned targets.

    Raises
    ------
    ValueError
        If `version` is None.
    Exception
        If none of the expected index files exists.
    """

    def __init__(self,
                 home,
                 version=None,
                 default_set=None,
                 opt=None):

        if version is None:
            raise ValueError('PDBbind version not specified')
        version = int(version)

        self.home = home
        # Version 2007 names its broadest set 'general'; every other
        # version handled here uses 'general_PL'.
        general_set = 'general' if version == 2007 else 'general_PL'
        self.default_set = default_set or general_set
        self.opt = opt or {}
        self.sets = {}
        self._set_ids = {}
        self._set_act = {}

        # list of protein ids that are known to segfault toolkits
        self.protein_blacklist = {
            'ob': {'1e8h', '1ntk', '1nu1', '1rbo', '1sqb', '1sqp', '1sqq',
                   '2f2h', '2wig', '2wij', '2wik', '3axk', '3axm', '3cf1',
                   # Following segfault on systems with smaller RAM
                   '1px4', '1pyg', '1zyr', '3a2c', '3dxj', '3dyo', '3eql',
                   '3f33', '3f34', '3f35', '3f36', '3f37', '3f38', '3f39',
                   '3i3b', '3i3d', '3k1j', '3muz', '3mv0', '3n75', '3t08',
                   '3t09', '3t0b', '3t0d', '3t2p', '3t2q', '3vd4', '3vd7',
                   '3vd9', '3vdb', '3vdc', '3wi6', '4kmu', '4kn4', '4kn7',
                   '7gpb',
                   # extended use segfaults (not only reading problem)
                   '1l7x',
                   },
            # an empty *set* (``{}`` would be a dict), same type as 'ob'
            'rdk': set(),
        }

        self.pdbind_sets = ['core', 'refined', general_set]
        for pdbind_set in self.pdbind_sets:
            # The index file name and location depend on the version.
            if version == 2007:
                csv_file = os.path.join(self.home, 'INDEX.%i.%s.data'
                                        % (version, pdbind_set))
            elif version >= 2016:
                csv_file = os.path.join(self.home, 'index', 'INDEX_%s_data.%i'
                                        % (pdbind_set, version))
            else:
                csv_file = os.path.join(self.home, 'INDEX_%s_data.%i'
                                        % (pdbind_set, version))

            # Sets without an index file are skipped silently.
            if not os.path.isfile(csv_file):
                continue

            data = pd.read_csv(csv_file,
                               sep=r'\s+',
                               usecols=[0, 1, 2, 3],
                               names=['pdbid',
                                      'resolution',
                                      'release_year',
                                      'act'],
                               comment='#')
            set_ids = data['pdbid'].tolist()
            set_act = data['act'].tolist()
            self._set_ids[pdbind_set] = set_ids
            self._set_act[pdbind_set] = set_act
            self.sets[pdbind_set] = dict(zip(set_ids, set_act))

        if not self.sets:
            raise Exception('There is no PDBbind set availabe')

    @property
    def ids(self):
        """PDB ids of the default set, in index-file order."""
        return self._set_ids[self.default_set]

    @property
    def activities(self):
        """Values of the 'act' column of the default set, aligned with `ids`."""
        return self._set_act[self.default_set]

    def __iter__(self):
        """Yield a target object for every id of the default set."""
        for pdbid in self.ids:
            yield _pdbbind_id(self.home, pdbid, opt=self.opt)

    def __getitem__(self, pdbid):
        """Return the target for a PDB id or an integer position in `ids`.

        A ``UserWarning`` is issued when the selected id is blacklisted for
        the current toolkit backend.

        Raises
        ------
        KeyError
            If `pdbid` is neither a known id nor a valid integer position.
        """
        ids = self.ids
        if pdbid in ids:
            target_id = pdbid
        elif isinstance(pdbid, int) and -len(ids) <= pdbid < len(ids):
            target_id = ids[pdbid]
        else:
            raise KeyError('There is no such target ("%s")' % pdbid)

        # Backends without a blacklist entry are treated as having none.
        if target_id in self.protein_blacklist.get(toolkit.backend, ()):
            # Report the resolved PDB id, also when an index was given.
            warnings.warn('A protein "%s" is blacklisted (known to segfault) '
                          'for current toolkit. Proceed at your own risk.'
                          % target_id, UserWarning)
        return _pdbbind_id(self.home, target_id, opt=self.opt)


class _pdbbind_id(object):
    def __init__(self, home, pdbid, opt=None):
        self.home = home
        self.id = pdbid
        self.opt = opt or {}

    @property
    def protein(self):
        f = os.path.join(self.home, self.id, '%s_protein.pdb' % self.id)
        if os.path.isfile(f):
            protein = next(toolkit.readfile('pdb', f, lazy=True, opt=self.opt))
            if protein is not None:
                protein.protein = True
            return protein
        else:
            return None

    @property
    def pocket(self):
        f = os.path.join(self.home, self.id, '%s_pocket.pdb' % self.id)
        if os.path.isfile(f):
            pocket = next(toolkit.readfile('pdb', f, lazy=True, opt=self.opt))
            if pocket is not None:
                pocket.protein = True
            return pocket
        else:
            return None

    @property
    def ligand(self):
        f = os.path.join(self.home, self.id, '%s_ligand.sdf' % self.id)
        if os.path.isfile(f):
            return next(toolkit.readfile('sdf', f, lazy=True, opt=self.opt))
        else:
            return None


class dude(object):
    """Collection of the DUD-E targets found in a local directory.

    Iterating yields one target object per detected target; indexing with
    a target id returns the target object for that id.
    """

    def __init__(self, home):
        """A wrapper for DUD-E (A Database of Useful Decoys: Enhanced)
        http://dude.docking.org/

        Every known target id that has a sub-directory in `home` is added
        to `ids`. Missing target files are reported on stderr but do not
        prevent the target from being listed.

        Parameters
        ----------
        home : str
            Path to files from dud-e

        Raises
        ------
        Exception
            If `home` is not an existing directory.
        """
        self.home = home
        if not os.path.isdir(self.home):
            raise Exception('Directory %s doesn\'t exist' % self.home)

        self.ids = []
        files = ['receptor.pdb', 'crystal_ligand.mol2',
                 'actives_final.mol2.gz', 'decoys_final.mol2.gz']
        # ids sorted by size of protein
        all_ids = [
            'fnta', 'dpp4', 'mmp13', 'hivpr', 'ada17', 'mk14', 'egfr', 'src',
            'drd3', 'aa2ar', 'cah2', 'parp1', 'cdk2', 'lck', 'pde5a', 'thrb',
            'aces', 'try1', 'pparg', 'vgfr2', 'pgh2', 'esr1', 'fa10', 'esr2',
            'ppara', 'dhi1', 'hivrt', 'bace1', 'ace', 'dyr', 'akt1', 'adrb1',
            'prgr', 'gcr', 'adrb2', 'andr', 'ppard', 'csf1r', 'gria2', 'cp3a4',
            'met', 'pgh1', 'abl1', 'casp3', 'kit', 'hdac8', 'hdac2', 'braf',
            'urok', 'lkha4', 'igf1r', 'aldr', 'fpps', 'hmdh', 'kpcb', 'tgfr1',
            'ital', 'mp2k1', 'nos1', 'tryb1', 'rxra', 'thb', 'cp2c9', 'ptn1',
            'reni', 'pnph', 'tysy', 'akt2', 'kif11', 'aofb', 'plk1', 'hivint',
            'mk10', 'pyrd', 'grik1', 'jak2', 'rock1', 'fa7', 'mapk2', 'nram',
            'wee1', 'fkb1a', 'def', 'ada', 'fak1', 'mcr', 'pa2ga', 'xiap',
            'hs90a', 'hxk4', 'mk01', 'pygm', 'glcm', 'comt', 'sahh', 'cxcr4',
            'kith', 'ampc', 'pur2', 'fabp4', 'inha', 'fgfr1',
        ]

        for target_id in all_ids:
            if not os.path.isdir(os.path.join(self.home, target_id)):
                continue
            self.ids.append(target_id)
            for fname in files:
                f = os.path.join(self.home, target_id, fname)
                # a gzipped file may also be present in unpacked form
                unpacked = fname.endswith('.gz') and os.path.isfile(f[:-3])
                if not (os.path.isfile(f) or unpacked):
                    print('Target %s doesn\'t have file %s'
                          % (target_id, fname), file=sys.stderr)
        if not self.ids:
            print('No targets in directory %s' % (self.home), file=sys.stderr)

    def __iter__(self):
        """Yield a target object for every detected target id."""
        for dude_id in self.ids:
            yield _dude_target(self.home, dude_id)

    def __getitem__(self, dude_id):
        """Return the target object for `dude_id`.

        Raises
        ------
        KeyError
            If `dude_id` is not among the detected targets.
        """
        if dude_id in self.ids:
            return _dude_target(self.home, dude_id)
        raise KeyError('There is no such target ("%s")' % dude_id)


class _dude_target(object):

    def __init__(self, home, dude_id):
        """Allows to read files of the dude target

        Parameters
        ----------
        home : str
            Directory to files from dud-e

        dude_id : str
            Target id
        """
        self.home = home
        self.dude_id = dude_id

    @property
    def protein(self):
        """Read a protein file"""
        f = os.path.join(self.home, self.dude_id, 'receptor.pdb')
        if os.path.isfile(f):
            return next(toolkit.readfile('pdb', f))
        else:
            return None

    @property
    def ligand(self):
        """Read a ligand file"""
        f = os.path.join(self.home, self.dude_id, 'crystal_ligand.mol2')
        if os.path.isfile(f):
            return next(toolkit.readfile('mol2', f))
        else:
            return None

    @property
    def actives(self):
        """Read an actives file"""
        f = os.path.join(self.home, self.dude_id, 'actives_final.mol2.gz')
        if os.path.isfile(f):
            return toolkit.readfile('mol2', f)
        # check if file is unpacked
        elif os.path.isfile(f[:-3]):
            return toolkit.readfile('mol2', f[:-3])
        else:
            return None

    @property
    def decoys(self):
        """Read a decoys file"""
        f = os.path.join(self.home, self.dude_id, 'decoys_final.mol2.gz')
        if os.path.isfile(f):
            return toolkit.readfile('mol2', f)
        # check if file is unpacked
        elif os.path.isfile(f[:-3]):
            return toolkit.readfile('mol2', f[:-3])
        else:
            return None


class CASF:
    """Load CASF dataset as described in
    Li, Y. et al. Comparative Assessment of Scoring Functions
    on an Updated Benchmark: 2. Evaluation Methods and General
    Results. J. Chem. Inf. Model. 54, 1717-1736. (2014)
    http://dx.doi.org/10.1021/ci500081m

    The index file ``coreset/index/2013_core_data.lst`` is read into
    `index_data` (columns 'pdbid', 'act', 'cluster') and its 'pdbid' column
    is exposed as `pdbids`. When the index directory does not exist both
    are left empty.

    Parameters
    ----------
    home: string
        Path to CASF dataset main directory
    """

    def __init__(self, home):
        self.home = home
        self.index = '%s/coreset/index/' % self.home

        if isdir(self.index):
            filepath = '%s/2013_core_data.lst' % self.index
            self.index_data = pd.read_csv(filepath,
                                          sep=r'\s+',
                                          comment='#',
                                          header=None,
                                          names=['pdbid', 'act', 'cluster'],
                                          usecols=[0, 1, 5])
        else:
            # Keep the attributes defined so that iteration and indexing
            # do not fail with AttributeError on an incomplete dataset.
            self.index_data = pd.DataFrame(columns=['pdbid', 'act', 'cluster'])
        self.pdbids = self.index_data['pdbid']

    def __iter__(self):
        """Yield a target object for every id of the index."""
        for pdbid in self.pdbids:
            yield _CASFTarget(self.home, pdbid)

    def __getitem__(self, item):
        """Return the target for a PDB id or an integer position.

        Raises
        ------
        KeyError
            If `item` is neither a known id nor a valid integer position.
        """
        # ``in`` on a pandas Series tests the index labels, so membership
        # has to be checked against the values.
        ids = self.pdbids.tolist()
        if item in ids:
            return _CASFTarget(self.home, item)
        if isinstance(item, int) and -len(ids) <= item < len(ids):
            return _CASFTarget(self.home, ids[item])
        raise KeyError('There is no such target ("%s")' % item)

    def precomputed_score(self, scoring_function=None):
        """Load precomputed results of scoring power
        test for various scoring functions.

        Parameters
        ----------
        scoring_function: string (default=None)
            Name of the scoring function to get results
            If None, all results are returned.

        Returns
        -------
        pandas.DataFrame
            Columns 'pdbid', 'score_crystal', 'score_opt', 'act' and
            'scoring_function'.

        Raises
        ------
        FileNotFoundError
            If there is no results file for a scoring function.
        """
        examples_dir = '%s/power_scoring/examples' % self.home
        if scoring_function is not None:
            functions = [scoring_function]
        else:
            # every file but the README holds results of one function
            functions = sorted(name for name in listdir(examples_dir)
                               if name != 'README')

        act = self.index_data[['pdbid', 'act']]
        frames = []

        for fun in functions:
            file_score = '%s/%s' % (examples_dir, fun)
            if not isfile(file_score):
                raise FileNotFoundError('Invalid scoring function name')

            score = pd.read_csv(file_score, comment='#',
                                sep=r'\s+', header=None,
                                names=['pdbid', 'score_crystal', 'score_opt'])

            scores = pd.merge(score, act)
            # a scalar is broadcast to whatever number of rows was merged
            scores['scoring_function'] = fun
            frames.append(scores)

        return pd.concat(frames)

    def precomputed_screening(self, scoring_function=None, cluster_id=None):
        """Load precomputed results of screening power
        test for various scoring functions

        Parameters
        ----------
        scoring_function: string (default=None)
            Name of the scoring function to get results
            If None, all results are returned

        cluster_id: int (default=None)
            Number of the protein cluster to get results
            If None, all results are returned

        Returns
        -------
        pandas.DataFrame
            Columns 'name', 'score', 'pdbid', 'scoring_function' and 'act';
            when no `cluster_id` is given also 'cluster_id',
            'protein_structure' and 'cluster_proteins'.

        Raises
        ------
        FileNotFoundError
            If there is no results directory for a scoring function.
        """
        screening_dir = '%s/power_screening' % self.home
        examples_dir = '%s/examples' % screening_dir
        if scoring_function is not None:
            functions = [scoring_function]
        else:
            # results of each function are kept in a separate directory
            functions = sorted(name for name in listdir(examples_dir)
                               if isdir('%s/%s' % (examples_dir, name)))

        # (cluster number, protein structure, proteins of the cluster);
        # clusters are numbered from 1 in file order
        clusters = []
        with open('%s/TargetInfo.dat' % screening_dir) as data_file:
            for line in data_file:
                if line.startswith('#'):
                    continue
                fields = line.split()
                if not fields:
                    continue
                clusters.append((len(clusters) + 1, fields[0], fields[1:]))

        if cluster_id:
            selected = [clusters[cluster_id - 1]]
        else:
            selected = clusters

        act = self.index_data[['pdbid', 'act']]
        frames = []
        for fun in functions:
            file_dir = '%s/%s' % (examples_dir, fun)
            if not isdir(file_dir):
                raise FileNotFoundError('Invalid scoring function name')

            for number, protein, proteins in selected:
                frame = pd.read_csv('%s/%s_score.dat' % (file_dir, protein),
                                    sep=r'\s+', header=None,
                                    names=['name', 'score'])
                # cluster details are only attached to the full listing
                if not cluster_id:
                    frame['cluster_id'] = number
                    frame['protein_structure'] = protein
                    frame['cluster_proteins'] = [proteins] * len(frame)
                # the first four characters of the name are used as PDB id
                frame['pdbid'] = [name[:4] for name in frame['name']]
                frame['scoring_function'] = fun
                frames.append(frame.merge(act))

        return pd.concat(frames, ignore_index=True)


class _CASFTarget:
    """
    Used by CASF class.
    Load CASF target (protein and ligand) with given ID.

    Parameters
    ----------
    home: string
        Path to CASF dataset main directory
    pdbid: string
        ID of target protein
    """
    def __init__(self, home, pdbid):
        self.home = home
        self.pdbid = pdbid

    @property
    def protein(self):
        """Load target protein from mol2 file as ob.Molecule object"""
        filepath = '%s/coreset/%s/%s_protein.mol2' % (
            self.home, self.pdbid, self.pdbid)
        if isfile(filepath):
            protein = six.next(toolkit.readfile('mol2', filepath))
            return protein
        return None

    @property
    def ligand(self):
        """Load target ligand from mol2 file as ob.Molecule object"""
        filepath = '%s/coreset/%s/%s_ligand.mol2' % (
            self.home, self.pdbid, self.pdbid)
        if isfile(filepath):
            ligand = six.next(toolkit.readfile('mol2', filepath))
            return ligand
        return None

    @property
    def decoys_docking(self):
        """Load decoys used for docking from mol2
        file as list of ob.Molecule objects"""
        filepath = '%s/decoys_docking/%s_decoys.mol2' % (self.home, self.pdbid)
        if isfile(filepath):
            decoys = list(toolkit.readfile('mol2', filepath))
            return decoys
        return None

    @property
    def decoys_screening(self):
        """Load decoys used for screening from mol2
        files as list of ob.Molecule objects"""
        dirpath = '%s/decoys_screening/%s' % (self.home, self.pdbid)
        if isdir(dirpath):
            decoys = []
            for file in listdir(dirpath):
                decoys.append(six.next(
                    toolkit.readfile('mol2', dirpath + '/' + file)))
            return decoys
        return None