Source code for oddt.scoring

from os.path import dirname, join as path_join
import gzip
from itertools import chain
from functools import partial

import six
from six.moves import cPickle as pickle

import numpy as np
from scipy.sparse import vstack as sparse_vstack
import pandas as pd

from joblib import Parallel, delayed
from sklearn.model_selection import cross_val_score, KFold
from sklearn.base import is_classifier, is_regressor
from sklearn.metrics import accuracy_score, r2_score

import oddt
from oddt.utils import method_caller
from oddt.datasets import pdbbind
from oddt.fingerprints import sparse_to_csr_matrix, csr_matrix_to_sparse, fold


def cross_validate(model, cv_set, cv_target, n=10, shuffle=True, n_jobs=1):
    """Perform cross validation of a model using provided data.

    Parameters
    ----------
    model: object
        Model to be tested

    cv_set: array-like of shape = [n_samples, n_features]
        Features used for cross validation.

    cv_target: array-like of shape = [n_samples] or [n_samples, n_outputs]
        Ground truth (correct) target values.

    n: integer (default = 10)
        Number of folds to create from the dataset

    shuffle: bool (default = True)
        Should data be shuffled before folding.

    n_jobs: integer (default = 1)
        How many CPUs to use during cross validation

    Returns
    -------
    r2: array of shape = [n]
        R^2 score for each of the generated folds
    """
    if shuffle:
        cv = KFold(n_splits=n, shuffle=True)
    else:
        cv = n
    return cross_val_score(model, cv_set, cv_target, cv=cv, n_jobs=n_jobs)
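# Usage sketch for cross_validate (hypothetical data; assumes `descs` and
# `affinities` were built beforehand, e.g. via a descriptor generator):
#
#     from sklearn.ensemble import RandomForestRegressor
#     model = RandomForestRegressor(n_estimators=100)
#     scores = cross_validate(model, descs, affinities, n=5, n_jobs=2)
#     print(scores.mean(), scores.std())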
# FIXME: if possible make the ensemble scorer lazy; for now it consumes all ligands
class scorer(object):
    def __init__(self, model_instance, descriptor_generator_instance,
                 score_title='score'):
        """Scorer class is a parent class for scoring functions.

        Parameters
        ----------
        model_instance: model
            Model compatible with the sklearn API (fit, predict and score
            methods)

        descriptor_generator_instance: array of descriptors
            Descriptor generator object

        score_title: string
            Title of the score to be used.
        """
        self.model = model_instance
        self.descriptor_generator = descriptor_generator_instance
        self.score_title = score_title

    def _gen_pdbbind_desc(self,
                          pdbbind_dir,
                          pdbbind_versions=(2007, 2012, 2013, 2014, 2015,
                                            2016),
                          desc_path=None,
                          include_general_set=False,
                          use_proteins=False,
                          **kwargs):
        pdbbind_versions = sorted(pdbbind_versions)
        opt = kwargs.get('opt', {})

        # generate metadata
        df = None
        for pdbbind_version in pdbbind_versions:
            p = pdbbind('%s/v%i/' % (pdbbind_dir, pdbbind_version),
                        version=pdbbind_version, opt=opt)
            # Core set
            for set_name in p.pdbind_sets:
                if set_name == 'general_PL':
                    dataset_key = '%i_general' % pdbbind_version
                else:
                    dataset_key = '%i_%s' % (pdbbind_version, set_name)
                tmp_df = pd.DataFrame({
                    'pdbid': list(p.sets[set_name].keys()),
                    dataset_key: list(p.sets[set_name].values())
                })
                if df is not None:
                    df = pd.merge(tmp_df, df, how='outer', on='pdbid')
                else:
                    df = tmp_df
        df.sort_values('pdbid', inplace=True)
        tmp_act = df['%i_general' % pdbbind_versions[-1]].values
        df = df.set_index('pdbid').notnull()
        df['act'] = tmp_act

        # take non-empty entries from the core + refined sets
        df = df[df['act'].notnull() &
                (df.filter(regex='.*_(refined|core)').any(axis=1) |
                 include_general_set)]

        # build descriptors
        pdbbind_db = pdbbind('%s/v%i/' % (pdbbind_dir, pdbbind_versions[-1]),
                             version=pdbbind_versions[-1])
        if not desc_path:
            desc_path = path_join(dirname(__file__), 'descs.csv')

        if self.n_jobs is None:
            n_jobs = -1
        else:
            n_jobs = self.n_jobs

        blacklist = []
        if use_proteins:
            # list of protein files that segfault OB 2.4.1
            blacklist = pdbbind_db.protein_blacklist[oddt.toolkit.backend]

        # check if PDBID exists and is not blacklisted
        desc_idx = [pid for pid in df.index.values
                    if (pid not in blacklist and
                        getattr(pdbbind_db[pid],
                                'protein' if use_proteins else 'pocket')
                        is not None)]

        result = Parallel(n_jobs=n_jobs, verbose=1)(
            delayed(method_caller)(
                self.descriptor_generator,
                'build',
                [pdbbind_db[pid].ligand],
                protein=getattr(pdbbind_db[pid],
                                'protein' if use_proteins else 'pocket'))
            for pid in desc_idx)

        # sparse descs may have different shapes, dense are stored as np.array
        sparse = (hasattr(self.descriptor_generator, 'sparse') and
                  self.descriptor_generator.sparse)
        if not sparse:
            result = np.vstack(result)

        # create dataframe with descriptors, with pdbids as index
        df_desc = pd.DataFrame(result, index=desc_idx)
        df_desc.index.rename('pdbid', inplace=True)

        # for sparse features leave one column and cast explicitly to list
        if sparse:
            if len(df_desc.columns) > 1:
                raise Exception('There is more than one column in the '
                                'sparse descriptor table.')
            df_desc.columns = ['sparse']
            df_desc['sparse'] = df_desc['sparse'].map(
                lambda x: csr_matrix_to_sparse(x).tolist())

        compression = None
        if desc_path[-3:] == '.gz':
            compression = 'gzip'
        # DFs are joined by index (pdbid) since some entries might be missing
        df.join(df_desc, how='inner').to_csv(desc_path,
                                             float_format='%.5g',
                                             compression=compression)

    def _load_pdbbind_desc(self, desc_path, pdbbind_version=2016,
                           train_set='refined', test_set='core',
                           train_blacklist=None, fold_size=None):
        """TODO: write the docs"""
        df = pd.read_csv(desc_path, index_col='pdbid')

        # generate dense representation of the sparse descriptor in CSV
        cols = list(map(str, range(len(self.descriptor_generator))))
        if 'sparse' in df.columns:
            # convert strings to np.arrays
            df['sparse'] = df['sparse'].map(
                lambda x: np.fromstring(x[1:-1], dtype=np.uint64, sep=','))
            cols = 'sparse'  # sparse array will have one column
            # fold only if necessary
            if fold_size:
                df['sparse'] = df['sparse'].map(lambda x: fold(x, fold_size))
            # convert to sparse csr_matrix
            df['sparse'] = df['sparse'].map(
                partial(sparse_to_csr_matrix,
                        size=len(self.descriptor_generator)))

        if isinstance(train_set, six.string_types):
            train_idx = df['%i_%s' % (pdbbind_version, train_set)]
        else:
            train_idx = df[['%i_%s' % (pdbbind_version, s)
                            for s in train_set]].any(axis=1)
        if train_blacklist:
            train_idx &= ~df.index.isin(train_blacklist)
        train_idx &= ~df['%i_%s' % (pdbbind_version, test_set)]

        # load sparse matrices, as training is usually faster on them
        if 'sparse' in df.columns:
            self.train_descs = sparse_vstack(df.loc[train_idx, cols].values,
                                             format='csr')
        else:
            self.train_descs = df.loc[train_idx, cols].values
        self.train_target = df.loc[train_idx, 'act'].values

        test_idx = df['%i_%s' % (pdbbind_version, test_set)]
        if 'sparse' in df.columns:
            self.test_descs = sparse_vstack(df.loc[test_idx, cols].values,
                                            format='csr')
        else:
            self.test_descs = df.loc[test_idx, cols].values
        self.test_target = df.loc[test_idx, 'act'].values
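    # Workflow sketch for the two helpers above (paths and the subclass are
    # hypothetical; assumes a local PDBbind copy laid out as <dir>/v<year>/):
    #
    #     sf = some_scorer_subclass()             # must define self.n_jobs
    #     sf._gen_pdbbind_desc('/data/pdbbind', desc_path='descs.csv.gz')
    #     sf._load_pdbbind_desc('descs.csv.gz', pdbbind_version=2016,
    #                           train_set='refined', test_set='core')
    #     # sf.train_descs / sf.test_descs are now ready for model fitting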
    def fit(self, ligands, target, *args, **kwargs):
        """Trains the model on supplied ligands and target values.

        Parameters
        ----------
        ligands: array-like of ligands
            Molecules to featurize and feed into the model

        target: array-like of shape = [n_samples] or [n_samples, n_outputs]
            Ground truth (correct) target values.
        """
        self.train_descs = self.descriptor_generator.build(ligands)
        return self.model.fit(self.train_descs, target, *args, **kwargs)
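    # Sketch (hypothetical objects): fitting on a list of oddt molecules,
    # assuming the descriptor generator already has its protein set:
    #
    #     sf.fit(training_ligands, training_affinities)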
    def predict(self, ligands, *args, **kwargs):
        """Predicts values (e.g. affinity) for supplied ligands.

        Parameters
        ----------
        ligands: array-like of ligands
            Molecules to featurize and feed into the model

        Returns
        -------
        predicted: np.array or array of np.arrays of shape = [n_ligands]
            Predicted scores for ligands
        """
        descs = self.descriptor_generator.build(ligands)
        return self.model.predict(descs)
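    # Sketch (hypothetical input): predicting affinities for docked poses,
    # assuming `mols` is a list of oddt.toolkit.Molecule objects:
    #
    #     predicted = sf.predict(mols)    # np.array of shape [len(mols)]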
    def score(self, ligands, target, *args, **kwargs):
        """Estimates the quality of prediction using the model's default
        score (accuracy for classification or R^2 for regression).

        Parameters
        ----------
        ligands: array-like of ligands
            Molecules to featurize and feed into the model

        target: array-like of shape = [n_samples] or [n_samples, n_outputs]
            Ground truth (correct) target values.

        Returns
        -------
        s: float
            Quality score (accuracy or R^2) for prediction
        """
        descs = self.descriptor_generator.build(ligands)
        return self.model.score(descs, target, *args, **kwargs)
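    # Sketch (hypothetical held-out data): the metric mirrors the wrapped
    # sklearn model, so this is R^2 for regressors, accuracy for classifiers:
    #
    #     r2 = sf.score(test_ligands, test_affinities)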
    def predict_ligand(self, ligand):
        """Local method to score one ligand and update its scores.

        Parameters
        ----------
        ligand: oddt.toolkit.Molecule object
            Ligand to be scored

        Returns
        -------
        ligand: oddt.toolkit.Molecule object
            Scored ligand with updated scores
        """
        score = self.predict([ligand])[0]
        ligand.data.update({self.score_title: score})
        return ligand
    def predict_ligands(self, ligands):
        """Method to score ligands in a lazy fashion.

        Parameters
        ----------
        ligands: iterable of oddt.toolkit.Molecule objects
            Ligands to be scored

        Returns
        -------
        ligands: iterator of oddt.toolkit.Molecule objects
            Scored ligands with updated scores
        """
        return (self.predict_ligand(lig) for lig in ligands)
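    # Sketch of lazy scoring (filename hypothetical): predict_ligands returns
    # a generator, so molecules are featurized and scored one by one:
    #
    #     mols = oddt.toolkit.readfile('sdf', 'docked_poses.sdf')
    #     for mol in sf.predict_ligands(mols):
    #         print(mol.data[sf.score_title])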
    def set_protein(self, protein):
        """Proxy method to update the protein in all relevant places.

        Parameters
        ----------
        protein: oddt.toolkit.Molecule object
            New default protein
        """
        self.protein = protein
        if hasattr(self.descriptor_generator, 'set_protein'):
            self.descriptor_generator.set_protein(protein)
        else:
            self.descriptor_generator.protein = protein
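    # Sketch (hypothetical path): swap in a new receptor before rescoring;
    # the descriptor generator is updated through the same call:
    #
    #     rec = next(oddt.toolkit.readfile('pdb', 'receptor.pdb'))
    #     rec.protein = True
    #     sf.set_protein(rec)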
    def save(self, filename):
        """Saves the scoring function to a pickle file.

        Parameters
        ----------
        filename: string
            Pickle filename
        """
        # FIXME: re-set protein after pickling
        self.set_protein(None)
        # return joblib.dump(self, filename, compress=9)[0]
        with gzip.open(filename, 'w+b', compresslevel=9) as f:
            pickle.dump(self, f, protocol=2)
        return filename
    @classmethod
    def load(cls, filename):
        """Loads the scoring function from a pickle file.

        Parameters
        ----------
        filename: string
            Pickle filename

        Returns
        -------
        sf: scorer-like object
            Scoring function object loaded from a pickle
        """
        # return joblib.load(filename)
        kwargs = {'encoding': 'latin1'} if six.PY3 else {}
        with gzip.open(filename, 'rb') as f:
            out = pickle.load(f, **kwargs)
        return out
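    # Round-trip sketch (hypothetical filename): save() clears the protein
    # before pickling, so it must be set again after load():
    #
    #     sf.save('sf.pickle')
    #     sf2 = scorer.load('sf.pickle')
    #     sf2.set_protein(rec)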
class ensemble_model(object):
    def __init__(self, models):
        """Proxy class to build an ensemble of models with the API of a
        single model.

        Parameters
        ----------
        models: array
            An array of models
        """
        self._models = models if len(models) else None
        if self._models is not None:
            if is_classifier(self._models[0]):
                check_type = is_classifier
                self._scoring_fun = accuracy_score
            elif is_regressor(self._models[0]):
                check_type = is_regressor
                self._scoring_fun = r2_score
            else:
                raise ValueError('Expected regressors or classifiers,'
                                 ' got %s instead' % type(self._models[0]))
            for model in self._models:
                if not check_type(model):
                    raise ValueError('Different types of models found, '
                                     'provide either regressors or '
                                     'classifiers.')
    def fit(self, X, y, *args, **kwargs):
        for model in self._models:
            model.fit(X, y, *args, **kwargs)
        return self

    def predict(self, X, *args, **kwargs):
        return np.array([model.predict(X, *args, **kwargs)
                         for model in self._models]).mean(axis=0)

    def score(self, X, y, *args, **kwargs):
        return self._scoring_fun(y.flatten(),
                                 self.predict(X, *args, **kwargs).flatten())
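# Sketch (hypothetical models and data): averaging the predictions of several
# regressors; mixing regressors and classifiers raises ValueError:
#
#     from sklearn.ensemble import RandomForestRegressor
#     from sklearn.linear_model import LinearRegression
#     ens = ensemble_model([RandomForestRegressor(), LinearRegression()])
#     ens.fit(X_train, y_train)
#     y_pred = ens.predict(X_test)    # mean over the two models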
class ensemble_descriptor(object):
    def __init__(self, descriptor_generators):
        """Proxy class to build an ensemble of descriptors with the API of a
        single descriptor generator.

        Parameters
        ----------
        descriptor_generators: array
            An array of descriptor generators
        """
        self._desc_gens = (descriptor_generators
                           if len(descriptor_generators) else None)
        self.titles = list(chain(*(desc_gen.titles
                                   for desc_gen in self._desc_gens)))
    def build(self, mols, *args, **kwargs):
        desc = np.hstack(tuple(desc_gen.build(mols, *args, **kwargs)
                               for desc_gen in self._desc_gens))
        return desc
    def set_protein(self, protein):
        for desc in self._desc_gens:
            if hasattr(desc, 'set_protein'):
                desc.set_protein(protein)
            else:
                desc.protein = protein
    def __len__(self):
        """Returns the total dimensionality of the ensembled descriptors."""
        return sum(len(desc) for desc in self._desc_gens)

    def __reduce__(self):
        return ensemble_descriptor, (self._desc_gens,)
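# Sketch (assumed names from oddt.scoring.descriptors; treat the class and
# its parameters as assumptions): concatenating two descriptor sets so they
# act as a single generator, e.g. for use inside a scorer:
#
#     from oddt.scoring.descriptors import close_contacts_descriptor
#     cc1 = close_contacts_descriptor(protein, cutoff=4)
#     cc2 = close_contacts_descriptor(protein, cutoff=8)
#     ens_desc = ensemble_descriptor([cc1, cc2])
#     descs = ens_desc.build(mols)    # shape: [len(mols), len(ens_desc)]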