# Source code for oddt.pandas

""" Pandas extension for chemical analysis """
from __future__ import absolute_import
from collections import deque
from six import BytesIO, StringIO
import pandas as pd

import oddt

# Raise pandas' per-cell display width limit so long cell contents are not
# truncated when a frame is rendered (presumably needed for the embedded
# molecule depictions -- TODO confirm). Note this is a process-wide pandas
# option changed as an import side effect.
pd.set_option("display.max_colwidth", 999999)


def _mol_reader(fmt='sdf',
                filepath_or_buffer=None,
                usecols=None,
                molecule_column='mol',
                molecule_name_column='mol_name',
                smiles_column=None,
                skip_bad_mols=False,
                chunksize=None,
                **kwargs):
    """Universal reading function for private use.

    This is a generator: it lazily reads molecules with
    `oddt.toolkit.readfile` and yields them packed into `ChemDataFrame`
    objects.

    .. versionadded:: 0.3

    Parameters
    ----------
        fmt : string
            The format of molecular file

        filepath_or_buffer : string or None
            File path

        usecols : list or None, optional (default=None)
            A list of columns to read from file. If None then all available
            fields are read.

        molecule_column : string or None, optional (default='mol')
            Name of molecule column. If None the molecules will be skipped and
            the reading will be sped up significantly.

        molecule_name_column : string or None, optional (default='mol_name')
            Column name which will contain molecules' title/name. Column is
            skipped when set to None.

        smiles_column  : string or None, optional (default=None)
            Column name containing molecules' SMILES, by default it is
            disabled.

        skip_bad_mols : bool, optional (default=False)
            Switch to skip empty (bad) molecules, i.e. entries for which the
            toolkit reader returned None. Useful for RDKit, which returns
            None if a molecule can not be sanitized.

        chunksize : int or None, optional (default=None)
            Number of molecules per yielded frame. If set to None the whole
            set is yielded as a single frame.

        **kwargs
            `opt` and `sanitize` are forwarded to `oddt.toolkit.readfile`;
            every other keyword is forwarded to the `ChemDataFrame`
            constructor.

    Yields
    ------
        chunk :
            A `ChemDataFrame` containing up to `chunksize` molecules (the last
            frame may be shorter). When `chunksize` is None exactly one frame
            is yielded, even if the file contains no molecules.

    """
    # capture options for reader; the rest of kwargs goes to ChemDataFrame
    reader_kwargs = {}
    for key in ('opt', 'sanitize'):
        if key in kwargs:
            reader_kwargs[key] = kwargs.pop(key)

    # when you dont read molecules you can skip parsing them
    if molecule_column is None:
        if oddt.toolkit.backend == 'ob' and fmt == 'sdf':
            # Work on a copy so the caller's `opt` dict is not mutated.
            opt = dict(reader_kwargs.get('opt') or {})
            opt['P'] = None
            reader_kwargs['opt'] = opt
        elif oddt.toolkit.backend == 'rdk':
            reader_kwargs['sanitize'] = False

    def _to_frame(records):
        """Wrap collected records in a frame tagged with the molecule column."""
        frame = ChemDataFrame(records, **kwargs)
        frame._molecule_column = molecule_column
        return frame

    chunk = []
    for mol in oddt.toolkit.readfile(fmt, filepath_or_buffer, **reader_kwargs):
        if skip_bad_mols and mol is None:
            continue  # add warning with number of skipped molecules
        if usecols is None:
            mol_data = mol.data.to_dict()
        else:
            mol_data = dict((k, mol.data[k]) for k in usecols)

        if molecule_column:
            mol_data[molecule_column] = mol
        if molecule_name_column:
            mol_data[molecule_name_column] = mol.title
        if smiles_column:
            mol_data[smiles_column] = mol.smiles
        chunk.append(mol_data)
        # Count the records actually collected rather than the position in
        # the file: skipped molecules must not shift the chunk boundaries.
        if chunksize and len(chunk) % chunksize == 0:
            yield _to_frame(chunk)
            chunk = []
    # Flush the remainder; with chunksize=None always yield one (possibly
    # empty) frame so callers can rely on getting a result.
    if chunk or chunksize is None:
        yield _to_frame(chunk)


def _mol_writer(data,
                fmt='sdf',
                filepath_or_buffer=None,
                update_properties=True,
                molecule_column=None,
                columns=None):
    """Universal writing function for private use.

    .. versionadded:: 0.3

    Parameters
    ----------
        data : pandas.DataFrame or pandas.Series
            Molecules to write. For a DataFrame the molecules are taken from
            `molecule_column`; a Series is expected to hold the molecules
            directly. Any other type writes nothing.

        fmt : string
            The format of molecular file

        filepath_or_buffer : string, file-like or None
            File path, or an object with a `write` method. If None the output
            is collected in memory and returned as a string.

        update_properties : bool, optional (default=True)
            Switch to update properties from the DataFrames to the molecules
            while writing. Only used for DataFrame input.

        molecule_column : string or None, optional (default=None)
            Name of molecule column. If None, `data._molecule_column` is used.
            Only used for DataFrame input.

        columns : list or None, optional (default=None)
            A list of columns to write to file. If None then all available
            fields are written. Only used for DataFrame input.

    Returns
    -------
        output : string or None
            The serialized molecules when `filepath_or_buffer` is None,
            otherwise None.

    """
    # In-memory and caller-supplied buffers receive serialized text, while
    # the toolkit Outputfile receives the molecule objects themselves.
    text_sink = (filepath_or_buffer is None or
                 hasattr(filepath_or_buffer, 'write'))
    if filepath_or_buffer is None:
        out = StringIO()
    elif text_sink:
        out = filepath_or_buffer
    else:
        out = oddt.toolkit.Outputfile(fmt, filepath_or_buffer, overwrite=True)

    def _emit(mol):
        """Write a single molecule in the form the sink expects."""
        out.write(mol.write(fmt) if text_sink else mol)

    try:
        if isinstance(data, pd.DataFrame):
            molecule_column = molecule_column or data._molecule_column
            for _, row in data.iterrows():
                # Work on a clone so the molecules stored in the frame are
                # not modified by the property updates below.
                mol = row[molecule_column].clone
                if update_properties:
                    new_data = row.to_dict()
                    del new_data[molecule_column]
                    mol.data.update(new_data)
                if columns:
                    # Snapshot the keys: deleting while iterating a live
                    # view raises RuntimeError.
                    for k in list(mol.data.keys()):
                        if k not in columns:
                            del mol.data[k]
                _emit(mol)
        elif isinstance(data, pd.Series):
            for mol in data:
                _emit(mol)
    finally:
        # Close only the file we opened ourselves (dont close foreign
        # buffer), and do so even when writing fails.
        if not text_sink:
            out.close()
    if filepath_or_buffer is None:
        return out.getvalue()


def read_csv(*args, **kwargs):
    """Read a CSV file with `pandas.read_csv`, optionally building molecules.

    TODO: Support Chunks

    Parameters
    ----------
        smiles_to_molecule : string or None, optional (default=None)
            Name of a column holding SMILES strings. When given, each value
            is parsed with `oddt.toolkit.readstring('smi', ...)` and the
            results are stored in `molecule_column`.

        molecule_column : string, optional (default='mol')
            Name of the column receiving the parsed molecules.

        *args, **kwargs
            Everything else is passed to `pandas.read_csv` unchanged.

    Returns
    -------
        data :
            The frame returned by `pandas.read_csv`, with the molecule column
            added when `smiles_to_molecule` was given.
    """
    smiles_source = kwargs.pop('smiles_to_molecule', None)
    target_column = kwargs.pop('molecule_column', 'mol')

    frame = pd.read_csv(*args, **kwargs)
    if smiles_source is None:
        return frame

    def _parse_smiles(smiles):
        return oddt.toolkit.readstring('smi', smiles)

    frame[target_column] = frame[smiles_source].map(_parse_smiles)
    return frame
def read_sdf(filepath_or_buffer=None,
             usecols=None,
             molecule_column='mol',
             molecule_name_column='mol_name',
             smiles_column=None,
             skip_bad_mols=False,
             chunksize=None,
             **kwargs):
    """Read SDF/MDL multi molecular file to ChemDataFrame

    .. versionadded:: 0.3

    Parameters
    ----------
        filepath_or_buffer : string or None
            File path

        usecols : list or None, optional (default=None)
            A list of columns to read from file. If None then all available
            fields are read.

        molecule_column : string or None, optional (default='mol')
            Name of molecule column. If None the molecules will be skipped and
            the reading will be sped up significantly.

        molecule_name_column : string or None, optional (default='mol_name')
            Column name which will contain molecules' title/name. Column is
            skipped when set to None.

        smiles_column  : string or None, optional (default=None)
            Column name containing molecules' SMILES, by default it is
            disabled.

        skip_bad_mols : bool, optional (default=False)
            Switch to skip empty (bad) molecules. Useful for RDKit, which
            returns None if a molecule can not be sanitized.

        chunksize : int or None, optional (default=None)
            Size of chunk to return. If set to None whole set is returned.

    Returns
    -------
        result :
            A `ChemDataFrame` containing all molecules if `chunksize` is None
            or a generator of `ChemDataFrame` with `chunksize` molecules.

    """
    frames = _mol_reader(fmt='sdf',
                         filepath_or_buffer=filepath_or_buffer,
                         usecols=usecols,
                         molecule_column=molecule_column,
                         molecule_name_column=molecule_name_column,
                         smiles_column=smiles_column,
                         skip_bad_mols=skip_bad_mols,
                         chunksize=chunksize,
                         **kwargs)
    if not chunksize:
        # Exhaust the generator, keeping only the last frame it produced.
        last_frame = deque(frames, maxlen=1)
        return last_frame.pop()
    return frames
def read_mol2(filepath_or_buffer=None,
              usecols=None,
              molecule_column='mol',
              molecule_name_column='mol_name',
              smiles_column=None,
              skip_bad_mols=False,
              chunksize=None,
              **kwargs):
    """Read Mol2 multi molecular file to ChemDataFrame. UCSF Dock 6 comments
    style is supported, i.e. `#### var_name: value` before molecular block.

    .. versionadded:: 0.3

    Parameters
    ----------
        filepath_or_buffer : string or None
            File path

        usecols : list or None, optional (default=None)
            A list of columns to read from file. If None then all available
            fields are read.

        molecule_column : string or None, optional (default='mol')
            Name of molecule column. If None the molecules will be skipped and
            the reading will be sped up significantly.

        molecule_name_column : string or None, optional (default='mol_name')
            Column name which will contain molecules' title/name. Column is
            skipped when set to None.

        smiles_column  : string or None, optional (default=None)
            Column name containing molecules' SMILES, by default it is
            disabled.

        skip_bad_mols : bool, optional (default=False)
            Switch to skip empty (bad) molecules. Useful for RDKit, which
            returns None if a molecule can not be sanitized.

        chunksize : int or None, optional (default=None)
            Size of chunk to return. If set to None whole set is returned.

    Returns
    -------
        result :
            A `ChemDataFrame` containing all molecules if `chunksize` is None
            or a generator of `ChemDataFrame` with `chunksize` molecules.

    """
    frames = _mol_reader(fmt='mol2',
                         filepath_or_buffer=filepath_or_buffer,
                         usecols=usecols,
                         molecule_column=molecule_column,
                         molecule_name_column=molecule_name_column,
                         smiles_column=smiles_column,
                         skip_bad_mols=skip_bad_mols,
                         chunksize=chunksize,
                         **kwargs)
    if not chunksize:
        # Exhaust the generator, keeping only the last frame it produced.
        last_frame = deque(frames, maxlen=1)
        return last_frame.pop()
    return frames
[docs]class ChemSeries(pd.Series): """Pandas Series modified to adapt `oddt.toolkit.Molecule` objects and apply molecular methods easily. .. versionadded:: 0.3 """ def __le__(self, other): """ Substructure searching. `chemseries < mol`: are molecules in series substructures of a `mol` """ if (isinstance(other, oddt.toolkit.Molecule) and isinstance(self[0], oddt.toolkit.Molecule)): return self.map(lambda x: oddt.toolkit.Smarts(x.smiles).match(other)) else: return super(ChemSeries, self).__le__(other) def __ge__(self, other): """ Substructure searching. `chemseries > mol`: is `mol` a substructure of molecules in series """ if (isinstance(other, oddt.toolkit.Molecule) and isinstance(self[0], oddt.toolkit.Molecule)): smarts = oddt.toolkit.Smarts(other.smiles) return self.map(lambda x: smarts.match(x)) else: return super(ChemSeries, self).__ge__(other) def __or__(self, other): """ Tanimoto coefficient """ if (isinstance(self[0], oddt.toolkit.Fingerprint) and isinstance(other, oddt.toolkit.Fingerprint)): return self.map(lambda x: x | other) else: return super(ChemSeries, self).__or__(other)
[docs] def calcfp(self, *args, **kwargs): """Helper function to map FP calculation throuugh the series""" assert(isinstance(self[0], oddt.toolkit.Molecule)) return self.map(lambda x: x.calcfp(*args, **kwargs))
[docs] def to_smiles(self, filepath_or_buffer=None): return _mol_writer(self, fmt='smi', filepath_or_buffer=filepath_or_buffer)
[docs] def to_sdf(self, filepath_or_buffer=None): return _mol_writer(self, fmt='sdf', filepath_or_buffer=filepath_or_buffer)
[docs] def to_mol2(self, filepath_or_buffer=None): return _mol_writer(self, fmt='mol2', filepath_or_buffer=filepath_or_buffer)
@property def _constructor(self): """ Force new class to be usead as constructor """ return ChemSeries @property def _constructor_expanddim(self): """ Force new class to be usead as constructor when expandig dims """ return ChemDataFrame
class ChemDataFrame(pd.DataFrame):
    """Chemical DataFrame object, which contains molecules column of
    `oddt.toolkit.Molecule` objects. Rich display of molecules (2D) is
    available in iPython Notebook. Additional `to_sdf` and `to_mol2` methods
    make writing to molecular formats easy.

    .. versionadded:: 0.3

    Note:
    Thanks to: http://blog.snapdragon.cc/2015/05/05/subclass-pandas-dataframe-to-save-custom-attributes/
    """
    # Custom attributes listed in `_metadata` are propagated by pandas to
    # frames derived from this one.
    _metadata = ['_molecule_column']
    # Name of the column holding the molecules; None until it is assigned.
    _molecule_column = None

    def to_sdf(self,
               filepath_or_buffer=None,
               update_properties=True,
               molecule_column=None,
               columns=None):
        """Write DataFrame to SDF file.

        .. versionadded:: 0.3

        Parameters
        ----------
            filepath_or_buffer : string or None
                File path

            update_properties : bool, optional (default=True)
                Switch to update properties from the DataFrames to the
                molecules while writing.

            molecule_column : string or None, optional (default=None)
                Name of molecule column. If None the frame's own
                `_molecule_column` is used.

            columns : list or None, optional (default=None)
                A list of columns to write to file. If None then all
                available fields are written.

        """
        molecule_column = molecule_column or self._molecule_column
        return _mol_writer(self,
                           filepath_or_buffer=filepath_or_buffer,
                           update_properties=update_properties,
                           fmt='sdf',
                           molecule_column=molecule_column,
                           columns=columns)

    def to_mol2(self,
                filepath_or_buffer=None,
                update_properties=True,
                molecule_column='mol',
                columns=None):
        """Write DataFrame to Mol2 file.

        .. versionadded:: 0.3

        Parameters
        ----------
            filepath_or_buffer : string or None
                File path

            update_properties : bool, optional (default=True)
                Switch to update properties from the DataFrames to the
                molecules while writing.

            molecule_column : string or None, optional (default='mol')
                Name of molecule column. The frame's own `_molecule_column`
                is used only when this is explicitly set to None (note the
                default differs from `to_sdf`).

            columns : list or None, optional (default=None)
                A list of columns to write to file. If None then all
                available fields are written.

        """
        molecule_column = molecule_column or self._molecule_column
        return _mol_writer(self,
                           fmt='mol2',
                           filepath_or_buffer=filepath_or_buffer,
                           update_properties=update_properties,
                           molecule_column=molecule_column,
                           columns=columns)

    def to_html(self, *args, **kwargs):
        """Patched rendering in HTML - don't escape HTML inside the cells.

        Docs are copied from parent
        """
        # Always overrides a caller-supplied `escape` value.
        kwargs['escape'] = False
        return super(ChemDataFrame, self).to_html(*args, **kwargs)

    def to_csv(self, *args, **kwargs):
        """ Docs are copied from parent """
        columns = kwargs.get('columns')
        if self._molecule_column and (columns is None or
                                      self._molecule_column in columns):
            # Replace molecules with their SMILES in a shallow copy so the
            # frame itself keeps the molecule objects.
            frm_copy = self.copy(deep=False)
            smiles = frm_copy[self._molecule_column].map(lambda x: x.smiles)
            frm_copy[self._molecule_column] = smiles.values
            return super(ChemDataFrame, frm_copy).to_csv(*args, **kwargs)
        return super(ChemDataFrame, self).to_csv(*args, **kwargs)

    def to_excel(self, *args, **kwargs):
        """ Docs are copied from parent """
        columns = kwargs.get('columns')
        if columns is None:
            columns = self.columns.tolist()

        # `molecule_column` and `size` are extensions of this class; they
        # must be removed before kwargs are handed to pandas, which would
        # reject unknown keywords.
        molecule_column = kwargs.pop('molecule_column', self._molecule_column)
        size = kwargs.pop('size', (200, 200))

        molecule_column_idx = list(columns).index(molecule_column)
        if kwargs.get('index', True):
            # The index is written as the first spreadsheet column.
            molecule_column_idx += 1

        # The sheet name is the second positional argument of
        # `DataFrame.to_excel`, or the `sheet_name` keyword.
        if len(args) > 1:
            sheet_name = args[1]
        else:
            sheet_name = kwargs.get('sheet_name', 'Sheet1')

        excel_writer = pd.ExcelWriter(args[0], engine='xlsxwriter')
        super(ChemDataFrame, self).to_excel(excel_writer, *args[1:], **kwargs)

        sheet = excel_writer.sheets[sheet_name]
        sheet.set_column(molecule_column_idx, molecule_column_idx,
                         width=size[1] / 6.)
        for i, mol in enumerate(self[molecule_column]):
            if mol is None:
                continue
            img = BytesIO()
            png = mol.clone.write('png', size=size)
            if not isinstance(png, bytes):
                # Text output has to be turned back into raw bytes.
                png = png.encode('utf-8', errors='surrogateescape')
            img.write(png)
            # Row 0 holds the header, hence the `i + 1` offset; the cell
            # text is blanked and the depiction placed on top of it.
            sheet.write_string(i + 1, molecule_column_idx, "")
            sheet.insert_image(i + 1,
                               molecule_column_idx,
                               'dummy',
                               {'image_data': img,
                                'positioning': 2,
                                'x_offset': 1,
                                'y_offset': 1})
            sheet.set_row(i + 1, height=size[0])
        # `close()` writes the workbook to disk; `ExcelWriter.save()` is not
        # available in pandas >= 2.0.
        excel_writer.close()

    @property
    def _constructor(self):
        """ Force new class to be used as constructor """
        return ChemDataFrame

    @property
    def _constructor_sliced(self):
        """ Force new class to be used as constructor when slicing """
        return ChemSeries

    @property
    def _constructor_expanddim(self):
        """ Force new class to be used as constructor when expanding dims """
        return ChemPanel


# Copy some docstrings from upstream classes
for method in ['to_html', 'to_csv', 'to_excel']:
    try:
        getattr(ChemDataFrame, method).__doc__ = getattr(pd.DataFrame, method).__doc__
    except AttributeError:  # Python 2 compatible
        getattr(ChemDataFrame, method).__func__.__doc__ = getattr(pd.DataFrame, method).__func__.__doc__
# `pandas.Panel` is not available in recent pandas releases; fall back to a
# plain base class there so that importing this module does not fail.
class ChemPanel(getattr(pd, 'Panel', object)):
    """Modified `pandas.Panel` to adopt higher dimension data than
    `ChemDataFrame`. Main purpose is to store molecular fingerprints in one
    column and keep 2D numpy array underneath.

    When the installed pandas does not provide `Panel`, this class is only a
    placeholder without any panel functionality.

    .. versionadded:: 0.3
    """
    _metadata = ['_molecule_column']
    # Name of the column holding the molecules; None until it is assigned.
    _molecule_column = None

    @property
    def _constructor(self):
        """ Force new class to be used as constructor """
        return ChemPanel

    @property
    def _constructor_sliced(self):
        """ Force new class to be used as constructor when slicing """
        return ChemDataFrame