Looking at the number of bits set by different fingerprints¶
I've done a number of posts looking at Morgan fingerprint statistics before, including:
- The number of collisions in Morgan fingerprints.
- Morgan fingerprint stats
- Collisions in Morgan fingerprints revisited
I have done similar analysis for other fingerprint types, but it looks like I didn't post that (at least I can't find it if I did). It's useful to do this because, as we'll see, the different fingerprint types have very different numbers of bits set for typical molecules.
Here's the summary of the mean and standard deviation of the number of bits set, from an analysis of 5 million molecules with less than 50 heavy atoms extracted from ZINC:
Fingerprint | Type | Mean num_bits | SD num_bits |
---|---|---|---|
Morgan1 | sparse | 29.4 | 5.6 |
Morgan2 | sparse | 48.7 | 9.6 |
Morgan3 | sparse | 66.8 | 13.8 |
FeatMorgan1 | sparse | 20.1 | 3.9 |
FeatMorgan2 | sparse | 38.1 | 7.7 |
FeatMorgan3 | sparse | 56.0 | 11.8 |
RDKit5 | bitvect | 363 | 122 |
RDKit6 | bitvect | 621 | 233 |
RDKit7 | bitvect | 993 | 406 |
pattern | bitvect | 446 | 122 |
avalon | bitvect | 280 | 130 |
atom pairs | sparse | 167 | 56 |
TT | sparse | 33.4 | 9.8 |
atom pairs | bitvect | 267 | 90 |
TT | bitvect | 47.2 | 12.0 |
The bit vector fingerprints were all 4096 bits long.
from rdkit import Chem,DataStructs
import time,random,gzip,pickle,copy
import numpy as np
from collections import Counter,defaultdict
from rdkit.Chem import Draw
from rdkit.Chem import rdMolDescriptors
from rdkit.Avalon import pyAvalonTools
from rdkit.Chem.Draw import IPythonConsole
from rdkit import DataStructs
from rdkit import rdBase
%pylab inline
print(rdBase.rdkitVersion)
import time
print(time.asctime())
try:
import ipyparallel as ipp
rc = ipp.Client()
dview = rc[:]
dview.execute('from rdkit import Chem')
dview.execute('from rdkit import Descriptors')
dview.execute('from rdkit.Chem import rdMolDescriptors')
dview.execute('from rdkit.Avalon import pyAvalonTools')
except:
print("could not use ipyparallel")
dview = None
For test data I'll use the same 16 million ZINC compounds I used in the bit statistics post.
filen='/scratch/RDKit_git/LocalData/Zinc/zinc_all_clean.pkl.gz'
Loop over the molecules, skip anything with more than 50 atoms, and build fingerprints for all the others.
The fingerprints I generate for this analysis are:
- Sparse Morgan with radii 1, 2, and 3
- Sparse FeatureMorgan with radii 1, 2, and 3
- RDKit BitVect with maxPath 5, 6, and 7
- Pattern BitVect
- Avalon BitVect
- Sparse Atom Pairs
- Sparse Topological Torsions
- Atom Pair BitVect
- Topological Torsion BitVect
All of the BitVect fingerprints are 4096 bits long
import copy
historyf = gzip.open('../data/fp_bit_counts.history.pkl.gz','wb+')
counts=defaultdict(Counter)
t1 = time.time()
with gzip.open(filen,'rb') as inf:
i = 0
ms = []
while 1:
try:
m,nm = pickle.load(inf)
except EOFError:
break
if not m or m.GetNumHeavyAtoms()>50: continue
ms.append(m)
i+=1
if len(ms)>=10000:
for v in 1,2,3:
cnts = dview.map_sync(lambda x,v=v:len(rdMolDescriptors.GetMorganFingerprint(x,v).GetNonzeroElements()),
ms)
for obc in cnts:
counts[('Morgan',v)][obc]+=1
for v in 1,2,3:
cnts = dview.map_sync(lambda x,v=v:len(rdMolDescriptors.GetMorganFingerprint(x,v,useFeatures=True).GetNonzeroElements()),
ms)
for obc in cnts:
counts[('FeatMorgan',v)][obc]+=1
for v in 5,6,7:
cnts = dview.map_sync(lambda x,v=v:Chem.RDKFingerprint(x,maxPath=v,fpSize=4096).GetNumOnBits(),
ms)
for obc in cnts:
counts[('RDKit',v)][obc]+=1
cnts = dview.map_sync(lambda x:Chem.PatternFingerprint(x,fpSize=4096).GetNumOnBits(),
ms)
for obc in cnts:
counts[('pattern',-1)][obc]+=1
cnts = dview.map_sync(lambda x:pyAvalonTools.GetAvalonFP(x,nBits=4096).GetNumOnBits(),
ms)
for obc in cnts:
counts[('avalon',-1)][obc]+=1
cnts = dview.map_sync(lambda x:len(rdMolDescriptors.GetAtomPairFingerprint(x).GetNonzeroElements()),
ms)
for obc in cnts:
counts[('ap-counts',-1)][obc]+=1
cnts = dview.map_sync(lambda x:len(rdMolDescriptors.GetTopologicalTorsionFingerprint(x).GetNonzeroElements()),
ms)
for obc in cnts:
counts[('tt-counts',-1)][obc]+=1
cnts = dview.map_sync(lambda x:rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(x,nBits=4096).GetNumOnBits(),
ms)
for obc in cnts:
counts[('ap-bv',-1)][obc]+=1
cnts = dview.map_sync(lambda x:rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(x,nBits=4096).GetNumOnBits(),
ms)
for obc in cnts:
counts[('tt-bv',-1)][obc]+=1
ms = []
if not i%50000:
t2 = time.time()
print("Done %d in %.2f sec"%(i,t2-t1))
if not i%500000:
pickle.dump(dict(counts),historyf)
if i>=5000000:
break
pickle.dump(dict(counts),gzip.open('../data/fp_bit_counts.pkl.gz','wb+'))
Now plot the distributions of the number of bits set
morgan_ks = [x for x in counts.keys() if x[0] =='Morgan']
featmorgan_ks = [x for x in counts.keys() if x[0] =='FeatMorgan']
rdkit_ks = [x for x in counts.keys() if x[0] == 'RDKit']
figure(figsize=(15,15))
pidx=1
subplot(3,3,pidx)
for n,r in morgan_ks:
cnts = sorted(counts[(n,r)].items())
plot([x for x,y in cnts],[y for x,y in cnts],label=
f"r={r}")
_=title("Morgan")
_=xlabel("num bits set")
_=ylabel("count")
_=legend()
pidx=2
subplot(3,3,pidx)
for n,r in featmorgan_ks:
cnts = sorted(counts[(n,r)].items())
plot([x for x,y in cnts],[y for x,y in cnts],label=
f"r={r}")
_=title("FeatMorgan")
_=xlabel("num bits set")
_=ylabel("count")
_=legend()
pidx=3
subplot(3,3,pidx)
for n,r in rdkit_ks:
cnts = sorted(counts[(n,r)].items())
plot([x for x,y in cnts],[y for x,y in cnts],label=
f"r={r}")
_=title("RDKit")
_=xlabel("num bits set")
_=ylabel("count")
_=legend()
for k in counts.keys():
if k[0] in ('Morgan','FeatMorgan','RDKit'):
continue
pidx+=1
subplot(3,3,pidx)
cnts = sorted(counts[k].items())
plot([x for x,y in cnts],[y for x,y in cnts])
_=title(k[0])
_=xlabel("num bits set")
_=ylabel("count")
The avalon FP curve has an interesting shape
for k,cnts in counts.items():
accum = 0
denom = 0
for cnt,num in cnts.items():
accum += cnt*num
denom += num
mean = accum/denom
dev = 0
for cnt,num in cnts.items():
dev += num*(cnt-mean)**2
dev /= (denom-1)
dev = dev**0.5
label = k[0]
if k[1]!=-1:
label += str(k[1])
print(label,'\t%.1f'%mean,'%.1f'%dev)
Convergence¶
I did 5 million examples, which took a while (about 1.5 hours with 6 worker processes on my PC). Could I have analyzed less and gotten to the same results? Did the means converge? If so, how quickly?
historyf = gzip.open('../data/fp_bit_counts.history.pkl.gz','rb')
means = defaultdict(list)
devs = defaultdict(list)
nmols = []
while 1:
try:
lcounts = pickle.load(historyf)
except EOFError:
break
for k,cnts in lcounts.items():
accum = 0
denom = 0
for cnt,num in cnts.items():
accum += cnt*num
denom += num
mean = accum/denom
dev = 0
for cnt,num in cnts.items():
dev += num*(cnt-mean)**2
dev /= (denom-1)
dev = dev**0.5
if denom not in nmols:
nmols.append(denom)
means[k].append(mean)
devs[k].append(dev)
label = k[0]
if k[1]!=-1:
label += str(k[1])
print(denom,label,'\t%.1f'%mean,'%.1f'%dev)
Let's look at those graphically:
morgan_ks = [x for x in counts.keys() if x[0] =='Morgan']
featmorgan_ks = [x for x in counts.keys() if x[0] =='FeatMorgan']
rdkit_ks = [x for x in counts.keys() if x[0] == 'RDKit']
figure(figsize=(15,15))
nmols2 = [x/1000000 for x in nmols]
pidx=1
subplot(3,3,pidx)
for n,r in morgan_ks:
lmeans = means[(n,r)]
ldevs = devs[(n,r)]
errorbar(nmols2,lmeans,yerr=ldevs,capsize=3)
_=title("Morgan")
_=xlabel("num mols (millions)")
_=ylabel("count")
#_=legend()
pidx=2
subplot(3,3,pidx)
for n,r in featmorgan_ks:
lmeans = means[(n,r)]
ldevs = devs[(n,r)]
errorbar(nmols2,lmeans,yerr=ldevs,capsize=3)
_=title("FeatMorgan")
_=xlabel("num mols (millions)")
_=ylabel("count")
#_=legend()
pidx=3
subplot(3,3,pidx)
for n,r in rdkit_ks:
lmeans = means[(n,r)]
ldevs = devs[(n,r)]
errorbar(nmols2,lmeans,yerr=ldevs,capsize=3)
_=title("RDKit")
_=xlabel("num mols (millions)")
_=ylabel("count")
#_=legend()
for k in counts.keys():
if k[0] in ('Morgan','FeatMorgan','RDKit'):
continue
pidx+=1
subplot(3,3,pidx)
lmeans = means[k]
ldevs = devs[k]
errorbar(nmols2,lmeans,yerr=ldevs,capsize=3)
_=title(k[0])
_=xlabel("num mols (millions)")
_=ylabel("count")
Looks like we would have been fine with 3 million molecules.