Source code for oddt.toolkits.ob

import gzip
import pybel
from pybel import *
import copy_reg
import numpy as np
import openbabel as ob
from openbabel import OBAtomAtomIter,OBTypeTable
from oddt.spatial import angle, angle_2v, dihedral

backend = 'ob'
# setup typetable to translate atom types
typetable = OBTypeTable()
typetable.SetFromType('INT')
typetable.SetToType('SYB')

# setup ElementTable
elementtable = ob.OBElementTable()

# hash OB!
pybel.ob.obErrorLog.StopLogging()

def _filereader_mol2(filename, opt = None):
    block = ''
    data = ''
    n = 0
    with gzip.open(filename) if filename.split('.')[-1] == 'gz' else open(filename) as f:
        for line in f:
            if line[:1] == '#':
                data += line
            elif line[:17] == '@<TRIPOS>MOLECULE':
                if n>0: #skip `zero` molecule (any preciding comments and spaces)
                    yield Molecule(source={'fmt': 'mol2', 'string': block, 'opt': opt})
                n += 1
                block = data
                data = ''
            block += line
        # open last molecule
        if block:
            yield Molecule(source={'fmt': 'mol2', 'string': block, 'opt': opt})

def _filereader_sdf(filename, opt = None):
    block = ''
    n = 0
    with gzip.open(filename) if filename.split('.')[-1] == 'gz' else open(filename) as f:
        for line in f:
            block += line
            if line[:4] == '$$$$':
                yield Molecule(source={'fmt': 'sdf', 'string': block, 'opt': opt})
                n += 1
                block = ''
        if block: # open last molecule if any
            yield Molecule(source={'fmt': 'sdf', 'string': block, 'opt': opt})

def _filereader_pdb(filename, opt = None):
    block = ''
    n = 0
    with gzip.open(filename) if filename.split('.')[-1] == 'gz' else open(filename) as f:
        for line in f:
            block += line
            if line[:4] == 'ENDMDL':
                yield Molecule(source={'fmt': 'pdb', 'string': block, 'opt': opt})
                n += 1
                block = ''
        if block: # open last molecule if any
            yield Molecule(source={'fmt': 'pdb', 'string': block, 'opt': opt})

[docs]def readfile(format, filename, opt=None, lazy=False): if lazy and format == 'mol2': return _filereader_mol2(filename, opt=opt) elif lazy and format == 'sdf': return _filereader_sdf(filename, opt=opt) elif lazy and format == 'pdb': return _filereader_pdb(filename, opt=opt) else: return pybel.readfile(format, filename, opt=opt)
[docs]class Molecule(pybel.Molecule): def __init__(self, OBMol = None, source = None, protein = False): # lazy self._source = source # dict with keys: n, fmt, string, filename # call parent constructor super(Molecule,self).__init__(OBMol) self.protein = protein #ob.DeterminePeptideBackbone(molecule.OBMol) # percieve chains in residues #if len(res_dict) > 1 and not molecule.OBMol.HasChainsPerceived(): # print "Dirty HACK" # molecule = pybel.readstring('pdb', molecule.write('pdb')) self._atom_dict = None self._res_dict = None self._ring_dict = None self._coords = None self._charges = None # lazy Molecule parsing requires masked OBMol @property def OBMol(self): if not self._OBMol and self._source: self._OBMol = readstring(self._source['fmt'], self._source['string'], opt=self._source['opt'] if 'opt' in self._source else {}).OBMol self._source = None return self._OBMol @OBMol.setter def OBMol(self, value): self._OBMol = value @property def atoms(self): return AtomStack(self.OBMol) @property def bonds(self): return BondStack(self.OBMol) # cache frequently used properties and cache them in prefixed [_] variables @property def coords(self): if self._coords is None: self._coords = np.array([atom.coords for atom in self.atoms]) return self._coords @coords.setter def coords(self, new): [a.OBAtom.SetVector(v[0],v[1],v[2]) for v, a in zip(new, self.atoms)] # clear cache self._cache = None @property def charges(self): if self._charges is None: self._charges = np.array([atom.partialcharge for atom in self.atoms]) return self._charges
[docs] def write(self, format="smi", filename=None, overwrite=False, opt=None): format = format.lower() # Use lazy molecule if possible if self._source and 'fmt' in self._source and self._source['fmt'] == format and self._source['string']: return self._source['string'] else: return super(Molecule,self).write(format=format, filename=filename, overwrite=overwrite, opt=opt) ### Backport code implementing resudues (by me) to support older versions of OB (aka 'stable')
@property def residues(self): return [Residue(res) for res in ob.OBResidueIter(self.OBMol)] #### Custom ODDT properties #### def __getattr__(self, attr): for desc in pybel._descdict.keys(): if attr.lower() == desc.lower(): return self.calcdesc([desc])[desc] raise AttributeError('Molecule has no such property: %s' % attr) @property def num_rotors(self): return self.OBMol.NumRotors() def _repr_svg_(self): return self.write('svg', opt={'d':None}).replace('\n', '') @property def canonic_order(self): """ Returns np.array with canonic order of heavy atoms in the molecule """ tmp = self.clone tmp.write('can') return np.array(tmp.data['SMILES Atom Order'].split(), dtype=int)-1 @property def atom_dict(self): # check cache and generate dicts if self._atom_dict is None: self._dicts() return self._atom_dict @property def res_dict(self): # check cache and generate dicts if self._res_dict is None: self._dicts() return self._res_dict @property def ring_dict(self): # check cache and generate dicts if self._ring_dict is None: self._dicts() return self._ring_dict @property def clone(self): return Molecule(ob.OBMol(self.OBMol))
[docs] def clone_coords(self, source): self.OBMol.SetCoordinates(source.OBMol.GetCoordinates()) return self
def _dicts(self): # Atoms atom_dtype = [('id', 'int16'), # atom info ('coords', 'float32', 3), ('radius', 'float32'), ('charge', 'float32'), ('atomicnum', 'int8'), ('atomtype','a4'), ('hybridization', 'int8'), ('neighbors', 'float32', (4,3)), # max of 4 neighbors should be enough # residue info ('resid', 'int16'), ('resname', 'a3'), ('isbackbone', 'bool'), # atom properties ('isacceptor', 'bool'), ('isdonor', 'bool'), ('isdonorh', 'bool'), ('ismetal', 'bool'), ('ishydrophobe', 'bool'), ('isaromatic', 'bool'), ('isminus', 'bool'), ('isplus', 'bool'), ('ishalogen', 'bool'), # secondary structure ('isalpha', 'bool'), ('isbeta', 'bool') ] a = [] atom_dict = np.empty(self.OBMol.NumAtoms(), dtype=atom_dtype) metals = [3,4,11,12,13,19,20,21,22,23,24,25,26,27,28,29,30,31,37,38,39,40,41,42,43,44,45,46,47,48,49,50,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,87,88,89,90,91, 92,93,94,95,96,97,98,99,100,101,102,103] for i, atom in enumerate(self.atoms): atomicnum = atom.atomicnum # skip non-polar hydrogens for performance # if atomicnum == 1 and atom.OBAtom.IsNonPolarHydrogen(): # continue atomtype = typetable.Translate(atom.type) # sybyl atom type partialcharge = atom.partialcharge coords = atom.coords if self.protein: residue = Residue(atom.OBAtom.GetResidue()) else: residue = False # get neighbors, but only for those atoms which realy need them neighbors = np.zeros(4, dtype=[('coords', 'float32', 3),('atomicnum', 'int8')]) neighbors['coords'].fill(np.nan) for n, nbr_atom in enumerate(atom.neighbors): # concider raising neighbors list to 6, but must do some benchmarks if n > 3: break nbr_atomicnum = nbr_atom.atomicnum neighbors[n] = (nbr_atom.coords, nbr_atomicnum) atom_dict[i] = (atom.idx, coords, elementtable.GetVdwRad(atomicnum), partialcharge, atomicnum, atomtype, atom.OBAtom.GetHyb(), neighbors['coords'], #n_coords, # residue info residue.idx if residue else 0, residue.name if residue else '', residue.OBResidue.GetAtomProperty(atom.OBAtom, 2) if residue else False, # is backbone # atom properties atom.OBAtom.IsHbondAcceptor(), atom.OBAtom.IsHbondDonor(), atom.OBAtom.IsHbondDonorH(), atomicnum in metals, atomicnum == 6 and np.in1d(neighbors['atomicnum'], [6,1,0]).all(), #hydrophobe atom.OBAtom.IsAromatic(), atomtype in ['O3-', '02-' 'O-'] or atom.formalcharge < 0, # is charged (minus) atomtype in ['N3+', 'N2+', 'Ng+'] or atom.formalcharge > 0, # is charged (plus) atomicnum in [9,17,35,53], # is halogen? False, # alpha False # beta ) if self.protein: # Protein Residues (alpha helix and beta sheet) res_dtype = [('id', 'int16'), ('resname', 'a3'), ('N', 'float32', 3), ('CA', 'float32', 3), ('C', 'float32', 3), ('isalpha', 'bool'), ('isbeta', 'bool') ] # N, CA, C b = [] for residue in self.residues: backbone = {} for atom in residue: if residue.OBResidue.GetAtomProperty(atom.OBAtom,1): if atom.atomicnum == 7: backbone['N'] = atom.coords elif atom.atomicnum == 6: if atom.type == 'C3': backbone['CA'] = atom.coords else: backbone['C'] = atom.coords if len(backbone.keys()) == 3: b.append((residue.idx, residue.name, backbone['N'], backbone['CA'], backbone['C'], False, False)) res_dict = np.array(b, dtype=res_dtype) # detect secondary structure by phi and psi angles first = res_dict[:-1] second = res_dict[1:] psi = dihedral(first['N'], first['CA'], first['C'], second['N']) phi = dihedral(first['C'], second['N'], second['CA'], second['C']) # mark atoms belonging to alpha and beta res_mask_alpha = np.where(((phi > -145) & (phi < -35) & (psi > -70) & (psi < 50))) # alpha res_dict['isalpha'][res_mask_alpha] = True for i in res_dict[res_mask_alpha]['id']: atom_dict['isalpha'][atom_dict['resid'] == i] = True res_mask_beta = np.where(((phi >= -180) & (phi < -40) & (psi <= 180) & (psi > 90)) | ((phi >= -180) & (phi < -70) & (psi <= -165))) # beta res_dict['isbeta'][res_mask_beta] = True atom_dict['isbeta'][np.in1d(atom_dict['resid'], res_dict[res_mask_beta]['id'])] = True # Aromatic Rings r = [] for ring in self.sssr: if ring.IsAromatic(): path = ring._path atom = atom_dict[atom_dict['id'] == path[0]] coords = atom_dict[np.in1d(atom_dict['id'], path)]['coords'] centroid = coords.mean(axis=0) # get vector perpendicular to ring vector = np.cross(coords - np.vstack((coords[1:],coords[:1])), np.vstack((coords[1:],coords[:1])) - np.vstack((coords[2:],coords[:2]))).mean(axis=0) - centroid r.append((centroid, vector, atom['isalpha'], atom['isbeta'])) ring_dict = np.array(r, dtype=[('centroid', 'float32', 3),('vector', 'float32', 3),('isalpha', 'bool'),('isbeta', 'bool'),]) self._atom_dict = atom_dict self._ring_dict = ring_dict if self.protein: self._res_dict = res_dict ### Extend pybel.Molecule
pybel.Molecule = Molecule
[docs]class AtomStack(object): def __init__(self,OBMol): self.OBMol = OBMol def __iter__(self): for i in range(self.OBMol.NumAtoms()): yield Atom(self.OBMol.GetAtom(i+1)) def __len__(self): return self.OBMol.NumAtoms() def __getitem__(self, i): if 0 <= i < self.OBMol.NumAtoms(): return Atom(self.OBMol.GetAtom(i+1)) else: raise AttributeError("There is no atom with Idx %i" % i)
[docs]class Atom(pybel.Atom): @property def neighbors(self): return [Atom(a) for a in OBAtomAtomIter(self.OBAtom)] @property def residue(self): return Residue(self.OBAtom.GetResidue())
pybel.Atom = Atom
[docs]class BondStack(object): def __init__(self,OBMol): self.OBMol = OBMol def __iter__(self): for i in range(self.OBMol.NumBonds()): yield Bond(self.OBMol.GetBond(i)) def __len__(self): return self.OBMol.NumBonds() def __getitem__(self, i): if 0 <= i < self.OBMol.NumBonds(): return Bond(self.OBMol.GetBond(i)) else: raise AttributeError("There is no bond with Idx %i" % i)
[docs]class Bond(object): def __init__(self, OBBond): self.OBBond = OBBond @property def order(self): return self.OBBond.GetBondOrder() @property def atoms(self): return (Atom(self.OBBond.GetBeginAtom()), Atom(self.OBBond.GetEndAtom())) @property def isrotor(self): return self.OBBond.IsRotor()
[docs]class Residue(object): """Represent a Pybel residue. Required parameter: OBResidue -- an Open Babel OBResidue Attributes: atoms, idx, name. (refer to the Open Babel library documentation for more info). The original Open Babel atom can be accessed using the attribute: OBResidue """ def __init__(self, OBResidue): self.OBResidue = OBResidue @property def atoms(self): return [Atom(atom) for atom in ob.OBResidueAtomIter(self.OBResidue)] @property def idx(self): return self.OBResidue.GetIdx() @property def name(self): return self.OBResidue.GetName() def __iter__(self): """Iterate over the Atoms of the Residue. This allows constructions such as the following: for atom in residue: print atom """ return iter(self.atoms)
[docs]class Fingerprint(pybel.Fingerprint): @property def raw(self): return _unrollbits(self.fp, pybel.ob.OBFingerprint.Getbitsperint())
def _unrollbits(fp, bitsperint): """ Unroll unsigned int fingerprint to bool """ ans = np.zeros(len(fp)*bitsperint) start = 1 for x in fp: i = start while x > 0: ans[i] = x % 2 x >>= 1 i += 1 start += bitsperint return ans pybel.Fingerprint = Fingerprint ### Monkeypatch pybel objects pickling pickle_format = 'mol2'
[docs]def pickle_mol(self): if self._source: return unpickle_mol, (self._source,) else: return unpickle_mol, ({'fmt': pickle_format, 'string': self.write(pickle_format), 'data': dict(self.data.items())},)
[docs]def unpickle_mol(source): mol = Molecule(source=source) if 'data' in source: mol.data.update(source['data']) return mol
copy_reg.pickle(Molecule, pickle_mol, unpickle_mol)