Source code for oddt.scoring.functions.PLECscore

from __future__ import print_function
import sys
from os.path import dirname, isfile, join as path_join
from functools import partial
import json
import warnings

import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.metrics import r2_score

from sklearn import __version__ as sklearn_version
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

from oddt.metrics import rmse, standard_deviation_error
from oddt.scoring import scorer
from oddt.fingerprints import PLEC, MAX_HASH_VALUE
from oddt.scoring.descriptors import universal_descriptor


[docs]class PLECscore(scorer): def __init__(self, protein=None, n_jobs=-1, version='linear', depth_protein=5, depth_ligand=1, size=65536): """PLECscore - a novel scoring function based on PLEC fingerprints. The underlying model can be one of: * linear regression * neural network (dense, 200x200x200) * random forest (100 trees) The scoring function is trained on PDBbind v2016 database and even with linear model outperforms other machine-learning ones in terms of Pearson correlation coefficient on "core set". For details see PLEC publication. PLECscore predicts binding affinity (pKi/d). .. versionadded:: 0.6 Parameters ---------- protein : oddt.toolkit.Molecule object Receptor for the scored ligands n_jobs: int (default=-1) Number of cores to use for scoring and training. By default (-1) all cores are allocated. version: str (default='linear') A version of scoring function ('linear', 'nn' or 'rf') - which model should be used for the scoring function. depth_protein: int (default=5) The depth of ECFP environments generated on the protein side of interaction. By default 6 (0 to 5) environments are generated. depth_ligand: int (default=1) The depth of ECFP environments generated on the ligand side of interaction. By default 2 (0 to 1) environments are generated. size: int (default=65536) The final size of a folded PLEC fingerprint. This setting is not used to limit the data encoded in PLEC fingerprint (for that tune the depths), but only the final lenght. Setting it to too low value will lead to many collisions. """ self.protein = protein self.n_jobs = n_jobs self.version = version self.depth_protein = depth_protein self.depth_ligand = depth_ligand self.size = size plec_func = partial(PLEC, depth_ligand=depth_ligand, depth_protein=depth_protein, size=size, count_bits=True, sparse=True, ignore_hoh=True) descriptors = universal_descriptor(plec_func, protein=protein, shape=size, sparse=True) if version == 'linear': # avoid deprecation warnings kwargs = {'fit_intercept': False, 'loss': 'huber', 'penalty': 'elasticnet', 'random_state': 0, 'verbose': 0, 'alpha': 1e-4, 'epsilon': 1e-1, } if sklearn_version >= '0.19': kwargs['max_iter'] = 100 else: kwargs['n_iter'] = 100 model = SGDRegressor(**kwargs) elif version == 'nn': model = MLPRegressor((200, 200, 200), batch_size=10, random_state=0, verbose=0, solver='lbfgs') elif version == 'rf': model = RandomForestRegressor(n_estimators=100, n_jobs=n_jobs, verbose=0, random_state=0) else: raise ValueError('The version "%s" is not supported by PLECscore' % version) super(PLECscore, self).__init__(model, descriptors, score_title='PLEC%s_p%i_l%i_s%i' % (version, depth_protein, depth_ligand, size))
[docs] def gen_training_data(self, pdbbind_dir, pdbbind_versions=(2016,), home_dir=None, use_proteins=True): if home_dir is None: home_dir = path_join(dirname(__file__), 'PLECscore') filename = path_join(home_dir, 'plecscore_descs_p%i_l%i.csv.gz' % (self.depth_protein, self.depth_ligand)) # The CSV will contain unfolded FP self.descriptor_generator.func.keywords['size'] = MAX_HASH_VALUE self.descriptor_generator.shape = MAX_HASH_VALUE super(PLECscore, self)._gen_pdbbind_desc( pdbbind_dir=pdbbind_dir, pdbbind_versions=pdbbind_versions, desc_path=filename, include_general_set=True, use_proteins=use_proteins, ) # reset to the original size self.descriptor_generator.func.keywords['size'] = self.size self.descriptor_generator.shape = self.size
[docs] def gen_json(self, home_dir=None, pdbbind_version=2016): if not home_dir: home_dir = path_join(dirname(__file__), 'PLECscore') if isinstance(self.model, SGDRegressor): attributes = ['coef_', 'intercept_', 't_'] elif isinstance(self.model, MLPRegressor): attributes = ['loss_', 'coefs_', 'intercepts_', 'n_iter_', 'n_layers_', 'n_outputs_', 'out_activation_'] out = {} for attr_name in attributes: attr = getattr(self.model, attr_name) # convert numpy arrays to list for json if isinstance(attr, np.ndarray): attr = attr.tolist() elif (isinstance(attr, (list, tuple)) and isinstance(attr[0], np.ndarray)): attr = [x.tolist() for x in attr] out[attr_name] = attr json_path = path_join(home_dir, 'plecscore_%s_p%i_l%i_s%i_pdbbind%i.json' % (self.version, self.depth_protein, self.depth_ligand, self.size, pdbbind_version)) with open(json_path, 'w') as json_f: json.dump(out, json_f, indent=2) return json_path
[docs] def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016, ignore_json=False): if not home_dir: home_dir = path_join(dirname(__file__), 'PLECscore') desc_path = path_join(home_dir, 'plecscore_descs_p%i_l%i.csv.gz' % (self.depth_protein, self.depth_ligand)) json_path = path_join( home_dir, 'plecscore_%s_p%i_l%i_s%i_pdbbind%i.json' % (self.version, self.depth_protein, self.depth_ligand, self.size, pdbbind_version)) if (self.version in ['linear'] and # TODO: support other models isfile(json_path) and not ignore_json): print('Loading pretrained PLECscore %s with depths P%i L%i on ' 'PDBBind v%i' % (self.version, self.depth_protein, self.depth_ligand, pdbbind_version), file=sys.stderr) with open(json_path) as json_f: json_data = json.load(json_f) for k, v in json_data.items(): if isinstance(v, list): if isinstance(v[0], list): v = [np.array(x) for x in v] else: v = np.array(v) setattr(self.model, k, v) else: # blacklist core set 2013 and astex pdbids_blacklist = [ '3ao4', '3i3b', '1uto', '1ps3', '1qi0', '3g2z', '3dxg', '3l7b', '3mfv', '3b3s', '3kgp', '3fk1', '3fcq', '3lka', '3udh', '4gqq', '3imc', '2xdl', '2ymd', '1lbk', '1bcu', '3zsx', '1f8d', '3muz', '2v00', '1loq', '3n7a', '2r23', '3nq3', '2hb1', '2w66', '1n2v', '3kwa', '3g2n', '4de2', '3ozt', '3b3w', '3cft', '3f3a', '2qmj', '3f80', '1a30', '1w3k', '3ivg', '2jdy', '3u9q', '3pxf', '2wbg', '1u33', '2x0y', '3mss', '1vso', '1q8t', '3acw', '3bpc', '3vd4', '3cj2', '2brb', '1p1q', '2vo5', '3d4z', '2gss', '2yge', '3gy4', '3zso', '3ov1', '1w4o', '1zea', '2zxd', '3ueu', '2qft', '1gpk', '1f8b', '2jdm', '3su5', '2wca', '3n86', '2x97', '1n1m', '1o5b', '2y5h', '3ehy', '4des', '3ebp', '1q8u', '4de1', '3huc', '3l4w', '2vl4', '3coy', '3f3c', '1os0', '3owj', '3bkk', '1yc1', '1hnn', '3vh9', '3bfu', '1w3l', '3k5v', '2qbr', '1lol', '10gs', '2j78', '1r5y', '2weg', '3uo4', '3jvs', '2yfe', '1sln', '2iwx', '2jdu', '4djv', '2xhm', '2xnb', '3s8o', '2zcr', '3oe5', '3gbb', '2d3u', '3uex', '4dew', '1xd0', '1z95', '2vot', '1oyt', '2ole', '3gcs', '1kel', '2vvn', '3kv2', '3pww', '3su2', '1f8c', '2xys', '3l4u', '2xb8', '2d1o', '2zjw', '3f3e', '2g70', '2zwz', '1u1b', '4g8m', '1o3f', '2x8z', '3cyx', '2cet', '3ag9', '2pq9', '3l3n', '1nvq', '2cbj', '2v7a', '1h23', '2qbp', '3b68', '2xbv', '2fvd', '2vw5', '3ejr', '3f17', '3nox', '1hfs', '1jyq', '2pcp', '3ge7', '2wtv', '2zcq', '2obf', '3e93', '2p4y', '3dd0', '3nw9', '3uri', '3gnw', '3su3', '2xy9', '1sqa', '3fv1', '2yki', '3g0w', '3pe2', '1e66', '1igj', '4tmn', '2zx6', '3myg', '4gid', '3utu', '1lor', '1mq6', '2x00', '2j62', '4djr', '1gm8', '1gpk', '1hnn', '1hp0', '1hq2', '1hvy', '1hwi', '1hww', '1ia1', '1j3j', '1jd0', '1jje', '1ke5', '1kzk', '1l2s', '1l7f', '1lpz', '1m2z', '1mmv', '1mzc', '1n1m', '1n2v', '1n46', '1nav', '1of1', '1of6', '1opk', '1oq5', '1owe', '1oyt', '1p2y', '1p62', '1pmn', '1q1g', '1q41', '1q4g', '1r1h', '1r55', '1r58', '1r9o', '1s19', '1s3v', '1sg0', '1sj0', '1sq5', '1sqn', '1t40', '1t46', '1t9b', '1tow', '1tt1', '1u1c', '1uml', '1unl', '1uou', '1v0p', '1v48', '1v4s', '1vcj', '1w1p', '1w2g', '1xm6', '1xoq', '1xoz', '1y6b', '1ygc', '1yqy', '1yv3', '1yvf', '1ywr', '1z95', '2bm2', '2br1', '2bsm'] # use remote csv if it's not present if not isfile(desc_path): branch = 'master' # define branch/commit desc_url = ('https://raw.githubusercontent.com/oddt/oddt/%s' '/oddt/scoring/functions/PLECscore/' 'plecscore_descs_p%i_l%i.csv.gz' % (branch, self.depth_protein, self.depth_ligand)) warnings.warn('The CSV for PLEC P%i L%i is missing. Trying to ' 'get it from ODDT GitHub.' % (self.depth_protein, self.depth_ligand)) # download and save CSV pd.read_csv(desc_url, index_col='pdbid').to_csv( desc_path, compression='gzip') # set PLEC size to unfolded super(PLECscore, self)._load_pdbbind_desc( desc_path, train_set=('general', 'refined'), pdbbind_version=pdbbind_version, train_blacklist=pdbids_blacklist, fold_size=self.size, ) print('Training PLECscore %s with depths P%i L%i on PDBBind v%i' % (self.version, self.depth_protein, self.depth_ligand, pdbbind_version), file=sys.stderr) self.model.fit(self.train_descs, self.train_target) sets = [ ('Test', self.model.predict(self.test_descs), self.test_target), ('Train', self.model.predict(self.train_descs), self.train_target)] for name, pred, target in sets: if len(target) < 3: print('There are less than 3 values to predict, skipping.', file=sys.stderr) continue print('%s set:' % name, 'R2_score: %.4f' % r2_score(target, pred), 'Rp: %.4f' % pearsonr(target, pred)[0], 'RMSE: %.4f' % rmse(target, pred), 'SD: %.4f' % standard_deviation_error(target, pred), sep='\t', file=sys.stderr) if sf_pickle is None: return self.save('PLEC%s_p%i_l%i_pdbbind%i_s%i.pickle' % (self.version, self.depth_protein, self.depth_ligand, pdbbind_version, self.size)) else: return self.save(sf_pickle)
[docs] @classmethod def load(self, filename=None, version='linear', pdbbind_version=2016, depth_protein=5, depth_ligand=1, size=65536): if filename is None: # FIXME: it would be cool to have templates of names for a class fname = ('PLEC%s_p%i_l%i_pdbbind%i_s%i.pickle' % (version, depth_protein, depth_ligand, pdbbind_version, size)) for f in [fname, path_join(dirname(__file__), fname)]: if isfile(f): filename = f break else: print('No pickle, training new scoring function.', file=sys.stderr) sf = PLECscore(version=version) filename = sf.train(sf_pickle=filename, pdbbind_version=pdbbind_version) return scorer.load(filename)