""" Pandas extension for chemical analysis """
from __future__ import absolute_import
from collections import deque
from six import BytesIO, StringIO
import pandas as pd
import oddt
pd.set_option("display.max_colwidth", 999999)
image_backend = 'png' # png or svg
image_size = (200, 200)
try:
if get_ipython().config:
ipython_notebook = True
else:
ipython_notebook = False
except NameError:
ipython_notebook = False
def _mol_reader(fmt='sdf',
filepath_or_buffer=None,
usecols=None,
molecule_column='mol',
molecule_name_column='mol_name',
smiles_column=None,
skip_bad_mols=False,
chunksize=None,
**kwargs):
"""Universal reading function for private use.
.. versionadded:: 0.3
Parameters
----------
fmt : string
The format of molecular file
filepath_or_buffer : string or None
File path
usecols : list or None, optional (default=None)
A list of columns to read from file. If None then all available
fields are read.
molecule_column : string or None, optional (default='mol')
Name of molecule column. If None the molecules will be skipped and
the reading will be speed up significantly.
molecule_name_column : string or None, optional (default='mol_name')
Column name which will contain molecules' title/name. Column is
skipped when set to None.
smiles_column : string or None, optional (default=None)
Column name containg molecules' SMILES, by default it is disabled.
skip_bad_mols : bool, optional (default=False)
Switch to skip empty (bad) molecules. Useful for RDKit, which Returns
None if molecule can not sanitize.
chunksize : int or None, optional (default=None)
Size of chunk to return. If set to None whole set is returned.
Returns
-------
chunk :
A `ChemDataFrame` containg `chunksize` molecules.
"""
# capture options for reader
reader_kwargs = {}
if 'opt' in kwargs:
reader_kwargs['opt'] = kwargs.pop('opt')
if 'sanitize' in kwargs:
reader_kwargs['sanitize'] = kwargs.pop('sanitize')
# when you dont read molecules you can skip parsing them
if molecule_column is None:
if oddt.toolkit.backend == 'ob' and fmt == 'sdf':
if 'opt' in reader_kwargs:
reader_kwargs['opt']['P'] = None
else:
reader_kwargs['opt'] = {'P': None}
elif oddt.toolkit.backend == 'rdk':
reader_kwargs['sanitize'] = False
chunk = []
for n, mol in enumerate(oddt.toolkit.readfile(fmt, filepath_or_buffer, **reader_kwargs)):
if skip_bad_mols and mol is None:
continue # add warning with number of skipped molecules
if usecols is None:
mol_data = mol.data.to_dict()
else:
mol_data = dict((k, mol.data[k]) for k in usecols)
if molecule_column:
mol_data[molecule_column] = mol
if molecule_name_column:
mol_data[molecule_name_column] = mol.title
if smiles_column:
mol_data[smiles_column] = mol.smiles
chunk.append(mol_data)
if chunksize and (n + 1) % chunksize == 0:
chunk_frm = ChemDataFrame(chunk, **kwargs)
chunk_frm._molecule_column = molecule_column
yield chunk_frm
chunk = []
if chunk or chunksize is None:
chunk_frm = ChemDataFrame(chunk, **kwargs)
chunk_frm._molecule_column = molecule_column
yield chunk_frm
def _mol_writer(data,
fmt='sdf',
filepath_or_buffer=None,
update_properties=True,
molecule_column=None,
columns=None):
"""Universal writing function for private use.
.. versionadded:: 0.3
Parameters
----------
fmt : string
The format of molecular file
filepath_or_buffer : string or None
File path
update_properties : bool, optional (default=True)
Switch to update properties from the DataFrames to the molecules
while writting.
molecule_column : string or None, optional (default='mol')
Name of molecule column. If None the molecules will be skipped.
columns : list or None, optional (default=None)
A list of columns to write to file. If None then all available
fields are written.
"""
if filepath_or_buffer is None:
out = StringIO()
elif hasattr(filepath_or_buffer, 'write'):
out = filepath_or_buffer
else:
out = oddt.toolkit.Outputfile(fmt, filepath_or_buffer, overwrite=True)
if isinstance(data, pd.DataFrame):
molecule_column = molecule_column or data._molecule_column
for ix, row in data.iterrows():
mol = row[molecule_column].clone
if update_properties:
new_data = row.to_dict()
del new_data[molecule_column]
mol.data.update(new_data)
if columns:
for k in mol.data.keys():
if k not in columns:
del mol.data[k]
if filepath_or_buffer is None or hasattr(filepath_or_buffer, 'write'):
out.write(mol.write(fmt))
else:
out.write(mol)
elif isinstance(data, pd.Series):
for mol in data:
if filepath_or_buffer is None or hasattr(filepath_or_buffer, 'write'):
out.write(mol.write(fmt))
else:
out.write(mol)
if filepath_or_buffer is None:
return out.getvalue()
elif not hasattr(filepath_or_buffer, 'write'): # dont close foreign buffer
out.close()
[docs]def read_csv(*args, **kwargs):
""" TODO: Support Chunks """
smiles_to_molecule = kwargs.pop('smiles_to_molecule', None)
molecule_column = kwargs.pop('molecule_column', 'mol')
data = pd.read_csv(*args, **kwargs)
if smiles_to_molecule is not None:
data[molecule_column] = data[smiles_to_molecule].map(lambda x: oddt.toolkit.readstring('smi', x))
return data
[docs]def read_sdf(filepath_or_buffer=None,
usecols=None,
molecule_column='mol',
molecule_name_column='mol_name',
smiles_column=None,
skip_bad_mols=False,
chunksize=None,
**kwargs):
"""Read SDF/MDL multi molecular file to ChemDataFrame
.. versionadded:: 0.3
Parameters
----------
filepath_or_buffer : string or None
File path
usecols : list or None, optional (default=None)
A list of columns to read from file. If None then all available
fields are read.
molecule_column : string or None, optional (default='mol')
Name of molecule column. If None the molecules will be skipped and
the reading will be speed up significantly.
molecule_name_column : string or None, optional (default='mol_name')
Column name which will contain molecules' title/name. Column is
skipped when set to None.
smiles_column : string or None, optional (default=None)
Column name containg molecules' SMILES, by default it is disabled.
skip_bad_mols : bool, optional (default=False)
Switch to skip empty (bad) molecules. Useful for RDKit, which Returns
None if molecule can not sanitize.
chunksize : int or None, optional (default=None)
Size of chunk to return. If set to None whole set is returned.
Returns
-------
result :
A `ChemDataFrame` containg all molecules if `chunksize` is None
or genrerator of `ChemDataFrame` with `chunksize` molecules.
"""
result = _mol_reader(fmt='sdf',
filepath_or_buffer=filepath_or_buffer,
usecols=usecols,
molecule_column=molecule_column,
molecule_name_column=molecule_name_column,
smiles_column=smiles_column,
skip_bad_mols=skip_bad_mols,
chunksize=chunksize,
**kwargs)
if chunksize:
return result
else:
return deque(result, maxlen=1).pop()
[docs]def read_mol2(filepath_or_buffer=None,
usecols=None,
molecule_column='mol',
molecule_name_column='mol_name',
smiles_column=None,
skip_bad_mols=False,
chunksize=None,
**kwargs):
"""Read Mol2 multi molecular file to ChemDataFrame. UCSF Dock 6 comments
style is supported, i.e. `#### var_name: value` before molecular block.
.. versionadded:: 0.3
Parameters
----------
filepath_or_buffer : string or None
File path
usecols : list or None, optional (default=None)
A list of columns to read from file. If None then all available
fields are read.
molecule_column : string or None, optional (default='mol')
Name of molecule column. If None the molecules will be skipped and
the reading will be speed up significantly.
molecule_name_column : string or None, optional (default='mol_name')
Column name which will contain molecules' title/name. Column is
skipped when set to None.
smiles_column : string or None, optional (default=None)
Column name containg molecules' SMILES, by default it is disabled.
skip_bad_mols : bool, optional (default=False)
Switch to skip empty (bad) molecules. Useful for RDKit, which Returns
None if molecule can not sanitize.
chunksize : int or None, optional (default=None)
Size of chunk to return. If set to None whole set is returned.
Returns
-------
result :
A `ChemDataFrame` containg all molecules if `chunksize` is None
or genrerator of `ChemDataFrame` with `chunksize` molecules.
"""
result = _mol_reader(fmt='mol2',
filepath_or_buffer=filepath_or_buffer,
usecols=usecols,
molecule_column=molecule_column,
molecule_name_column=molecule_name_column,
smiles_column=smiles_column,
skip_bad_mols=skip_bad_mols,
chunksize=chunksize,
**kwargs)
if chunksize:
return result
else:
return deque(result, maxlen=1).pop()
[docs]class ChemSeries(pd.Series):
"""Pandas Series modified to adapt `oddt.toolkit.Molecule` objects and apply
molecular methods easily.
.. versionadded:: 0.3
"""
def __le__(self, other):
""" Substructure searching.
`chemseries < mol`: are molecules in series substructures of a `mol`
"""
if (isinstance(other, oddt.toolkit.Molecule) and
isinstance(self[0], oddt.toolkit.Molecule)):
return self.map(lambda x: oddt.toolkit.Smarts(x.smiles).match(other))
else:
return super(ChemSeries, self).__le__(other)
def __ge__(self, other):
""" Substructure searching.
`chemseries > mol`: is `mol` a substructure of molecules in series
"""
if (isinstance(other, oddt.toolkit.Molecule) and
isinstance(self[0], oddt.toolkit.Molecule)):
smarts = oddt.toolkit.Smarts(other.smiles)
return self.map(lambda x: smarts.match(x))
else:
return super(ChemSeries, self).__ge__(other)
def __or__(self, other):
""" Tanimoto coefficient """
if (isinstance(self[0], oddt.toolkit.Fingerprint) and
isinstance(other, oddt.toolkit.Fingerprint)):
return self.map(lambda x: x | other)
else:
return super(ChemSeries, self).__or__(other)
[docs] def calcfp(self, *args, **kwargs):
"""Helper function to map FP calculation throuugh the series"""
assert(isinstance(self[0], oddt.toolkit.Molecule))
return self.map(lambda x: x.calcfp(*args, **kwargs))
[docs] def to_smiles(self, filepath_or_buffer=None):
return _mol_writer(self, fmt='smi', filepath_or_buffer=filepath_or_buffer)
[docs] def to_sdf(self, filepath_or_buffer=None):
return _mol_writer(self, fmt='sdf', filepath_or_buffer=filepath_or_buffer)
[docs] def to_mol2(self, filepath_or_buffer=None):
return _mol_writer(self, fmt='mol2', filepath_or_buffer=filepath_or_buffer)
@property
def _constructor(self):
""" Force new class to be usead as constructor """
return ChemSeries
@property
def _constructor_expanddim(self):
""" Force new class to be usead as constructor when expandig dims """
return ChemDataFrame
[docs]class ChemDataFrame(pd.DataFrame):
"""Chemical DataFrame object, which contains molecules column of
`oddt.toolkit.Molecule` objects. Rich display of moleucles (2D) is available
in iPython Notebook. Additional `to_sdf` and `to_mol2` methods make writing
to molecular formats easy.
.. versionadded:: 0.3
Note:
Thanks to: http://blog.snapdragon.cc/2015/05/05/subclass-pandas-dataframe-to-save-custom-attributes/
"""
_metadata = ['_molecule_column']
_molecule_column = None
[docs] def to_sdf(self,
filepath_or_buffer=None,
update_properties=True,
molecule_column=None,
columns=None):
"""Write DataFrame to SDF file.
.. versionadded:: 0.3
Parameters
----------
filepath_or_buffer : string or None
File path
update_properties : bool, optional (default=True)
Switch to update properties from the DataFrames to the molecules
while writting.
molecule_column : string or None, optional (default='mol')
Name of molecule column. If None the molecules will be skipped.
columns : list or None, optional (default=None)
A list of columns to write to file. If None then all available
fields are written.
"""
molecule_column = molecule_column or self._molecule_column
return _mol_writer(self,
filepath_or_buffer=filepath_or_buffer,
update_properties=update_properties,
fmt='sdf',
molecule_column=molecule_column,
columns=columns)
[docs] def to_mol2(self,
filepath_or_buffer=None,
update_properties=True,
molecule_column='mol',
columns=None):
"""Write DataFrame to Mol2 file.
.. versionadded:: 0.3
Parameters
----------
filepath_or_buffer : string or None
File path
update_properties : bool, optional (default=True)
Switch to update properties from the DataFrames to the molecules
while writting.
molecule_column : string or None, optional (default='mol')
Name of molecule column. If None the molecules will be skipped.
columns : list or None, optional (default=None)
A list of columns to write to file. If None then all available
fields are written.
"""
molecule_column = molecule_column or self._molecule_column
return _mol_writer(self,
fmt='mol2',
filepath_or_buffer=filepath_or_buffer,
update_properties=update_properties,
molecule_column=molecule_column,
columns=columns)
[docs] def to_html(self, *args, **kwargs):
"""Patched rendering in HTML - don't escape HTML inside the cells.
Docs are copied from parent
"""
kwargs['escape'] = False
return super(ChemDataFrame, self).to_html(*args, **kwargs)
[docs] def to_csv(self, *args, **kwargs):
""" Docs are copied from parent """
if self._molecule_column and ('columns' not in kwargs or
kwargs['columns'] is None or
self._molecule_column in kwargs['columns']):
frm_copy = self.copy(deep=False)
frm_copy[self._molecule_column] = frm_copy[self._molecule_column].map(lambda x: x.smiles).values
return super(ChemDataFrame, frm_copy).to_csv(*args, **kwargs)
else:
return super(ChemDataFrame, self).to_csv(*args, **kwargs)
[docs] def to_excel(self, *args, **kwargs):
""" Docs are copied from parent """
columns = kwargs['columns'] if 'columns' in kwargs else self.columns.tolist()
if 'molecule_column' in kwargs:
molecule_column = kwargs['molecule_column']
else:
molecule_column = self._molecule_column
molecule_column_idx = columns.index(molecule_column)
size = kwargs.pop('size') if 'size' in kwargs else (200, 200)
excel_writer = pd.ExcelWriter(args[0], engine='xlsxwriter')
super(ChemDataFrame, self).to_excel(excel_writer, *args[1:], **kwargs)
sheet = excel_writer.sheets['Sheet1'] # TODO: Get appropriate sheet name
sheet.set_column(molecule_column_idx + 1, molecule_column_idx + 1, width=size[1] / 6.)
for i, mol in enumerate(self[molecule_column]):
img = BytesIO()
png = mol.clone.write('png', size=size)
if type(png) is str:
png = png.encode('utf-8', errors='surrogateescape')
img.write(png)
sheet.write_string(i + 1, molecule_column_idx + 1, "")
sheet.insert_image(i + 1,
molecule_column_idx + 1,
'dummy',
{'image_data': img,
'positioning': 2,
'x_offset': 1,
'y_offset': 1})
sheet.set_row(i + 1, height=size[0])
excel_writer.save()
@property
def _constructor(self):
""" Force new class to be usead as constructor """
return ChemDataFrame
@property
def _constructor_sliced(self):
""" Force new class to be usead as constructor when slicing """
return ChemSeries
@property
def _constructor_expanddim(self):
""" Force new class to be usead as constructor when expandig dims """
return ChemPanel
# Copy some docscrings from upstream classes
for method in ['to_html', 'to_csv', 'to_excel']:
try:
getattr(ChemDataFrame, method).__doc__ = getattr(pd.DataFrame, method).__doc__
except AttributeError: # Python 2 compatible
getattr(ChemDataFrame, method).__func__.__doc__ = getattr(pd.DataFrame, method).__func__.__doc__
[docs]class ChemPanel(pd.Panel):
"""Modified `pandas.Panel` to adopt higher dimension data than
`ChemDataFrame`. Main purpose is to store molecular fingerprints in one
column and keep 2D numpy array underneath.
.. versionadded:: 0.3
"""
_metadata = ['_molecule_column']
_molecule_column = None
@property
def _constructor(self):
""" Force new class to be usead as constructor """
return ChemPanel
@property
def _constructor_sliced(self):
""" Force new class to be usead as constructor when slicing """
return ChemDataFrame