Comparing fingerprints to each other. Part 1
Goal: Look at the differences between different similarity methods.
This uses a set of pairs of molecules that have a baseline similarity: a Tanimoto similarity using count-based Morgan0 fingerprints of at least 0.7. The construction of this set was presented in an earlier post: http://rdkit.blogspot.com/2013/10/building-similarity-comparison-set-goal.html.
Note: this notebook and the data it uses/generates can be found in the github repo: https://github.com/greglandrum/rdkit_blog
Set up
Do the usual imports, read in the molecules, set up the fingerprints we'll compare, and calculate the similarities between the pairs of molecules using those fingerprints.
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from rdkit.Avalon import pyAvalonTools
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit import rdBase
from rdkit import DataStructs
from collections import defaultdict
import cPickle,random,gzip
import scipy as sp
import pandas
from scipy import stats
from IPython.core.display import display,HTML,Javascript
# Record the RDKit version used for this run (single-arg print() behaves
# identically under Python 2 and Python 3, unlike the py2-only statement form).
print(rdBase.rdkitVersion)
# Read the pre-built comparison set: each whitespace-separated line holds
# two (id, SMILES) records -- fields 0/1 for the first molecule of the pair,
# fields 2/3 for the second.
# NOTE(review): Python 2 code -- gzip.open yields text here; under Python 3
# this would need mode 'rt'. Confirm if porting.
ind = [x.split() for x in gzip.open('../data/chembl16_25K.pairs.txt.gz')]
ms1 = []
ms2 = []
for row in ind:  # the enumerate() index used previously was never read
    m1 = Chem.MolFromSmiles(row[1])
    ms1.append((row[0], m1))
    m2 = Chem.MolFromSmiles(row[3])
    ms2.append((row[2], m2))
# Table of fingerprints to compare: each entry is a (generator, label) pair.
# The generator maps an RDKit Mol to a fingerprint object; the label becomes
# the key into scoredLists below.
methods = [
    # RDKit subgraph/path fingerprints with increasing maximum path length
    (lambda x:Chem.RDKFingerprint(x,maxPath=4),'RDKit4'),
    (lambda x:Chem.RDKFingerprint(x,maxPath=5),'RDKit5'),
    (lambda x:Chem.RDKFingerprint(x,maxPath=6),'RDKit6'),
    (lambda x:Chem.RDKFingerprint(x,maxPath=7),'RDKit7'),
    # the same, restricted to linear (unbranched) paths
    (lambda x:Chem.RDKFingerprint(x,maxPath=4,branchedPaths=False),'RDKit4-linear'),
    (lambda x:Chem.RDKFingerprint(x,maxPath=5,branchedPaths=False),'RDKit5-linear'),
    (lambda x:Chem.RDKFingerprint(x,maxPath=6,branchedPaths=False),'RDKit6-linear'),
    (lambda x:Chem.RDKFingerprint(x,maxPath=7,branchedPaths=False),'RDKit7-linear'),
    # count-based Morgan fingerprints, radius 0-3
    (lambda x:rdMolDescriptors.GetMorganFingerprint(x,0),'MFP0'),
    (lambda x:rdMolDescriptors.GetMorganFingerprint(x,1),'MFP1'),
    (lambda x:rdMolDescriptors.GetMorganFingerprint(x,2),'MFP2'),
    (lambda x:rdMolDescriptors.GetMorganFingerprint(x,3),'MFP3'),
    # feature-based (pharmacophoric invariants) Morgan fingerprints
    (lambda x:rdMolDescriptors.GetMorganFingerprint(x,0,useFeatures=True),'FeatMFP0'),
    (lambda x:rdMolDescriptors.GetMorganFingerprint(x,1,useFeatures=True),'FeatMFP1'),
    (lambda x:rdMolDescriptors.GetMorganFingerprint(x,2,useFeatures=True),'FeatMFP2'),
    (lambda x:rdMolDescriptors.GetMorganFingerprint(x,3,useFeatures=True),'FeatMFP3'),
    # hashed (fixed-length) Morgan fingerprints
    (lambda x:rdMolDescriptors.GetHashedMorganFingerprint(x,0),'MFP0-bits'),
    (lambda x:rdMolDescriptors.GetHashedMorganFingerprint(x,1),'MFP1-bits'),
    (lambda x:rdMolDescriptors.GetHashedMorganFingerprint(x,2),'MFP2-bits'),
    (lambda x:rdMolDescriptors.GetHashedMorganFingerprint(x,3),'MFP3-bits'),
    # atom pairs and topological torsions, count-based and bit-vector forms
    (lambda x:rdMolDescriptors.GetAtomPairFingerprint(x),'AP'),
    (lambda x:rdMolDescriptors.GetTopologicalTorsionFingerprint(x),'TT'),
    (lambda x:rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(x),'AP-bits'),
    (lambda x:rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(x),'TT-bits'),
    # MACCS keys and Avalon fingerprints at two bit sizes
    (lambda x:rdMolDescriptors.GetMACCSKeysFingerprint(x),'MACCS'),
    (lambda x:pyAvalonTools.GetAvalonFP(x,512),'Avalon-512'),
    (lambda x:pyAvalonTools.GetAvalonFP(x,1024),'Avalon-1024'),
    ]
# For every fingerprint method, compute the Tanimoto similarity of each
# molecule pair.  scoredLists maps label -> list of (similarity, pair index).
scoredLists = {}
for method, nm in methods:
    if nm not in scoredLists:  # has_key() is gone in py3; 'in' works everywhere
        print('Doing: %s' % nm)
        rl = []
        for i, (m1, m2) in enumerate(zip(ms1, ms2)):
            fp1 = method(m1[-1])  # m1/m2 are (id, Mol) tuples; [-1] is the Mol
            fp2 = method(m2[-1])
            sim = DataStructs.TanimotoSimilarity(fp1, fp2)
            rl.append((sim, i))
        scoredLists[nm] = rl
# Close the gzip stream explicitly so the pickle is fully flushed to disk;
# the previous bare gzip.open(...) handle was never closed.
with gzip.open('../data/chembl16_25K.pairs.sims.pkl.gz', 'wb') as outf:
    cPickle.dump(scoredLists, outf)
Set up the comparison code
# Reload the similarity lists computed above, so the analysis below can be
# rerun without recomputing all the fingerprints.
scoredLists = cPickle.load(gzip.open('../data/chembl16_25K.pairs.sims.pkl.gz','rb'))
def directCompare(scoredLists, fp1, fp2, plotIt=True, silent=False):
    """Compare the pair-similarity rankings produced by two fingerprints.

    Arguments:
      scoredLists: dict mapping fingerprint label -> list of
        (similarity, pair index) tuples, all over the same pairs
      fp1, fp2: labels of the two fingerprints to compare
      plotIt: if True, draw a scatter plot of the two similarity sets
        with an x=y reference line (uses the notebook's pylab globals)
      silent: if False, print the statistics and their p-values

    Returns (Kendall tau, Spearman rho, Pearson r).
    """
    # similarity values only; the unused rank-index lists computed by the
    # previous version have been dropped
    vl1 = [x[0] for x in scoredLists[fp1]]
    vl2 = [x[0] for x in scoredLists[fp2]]
    if plotIt:
        _ = scatter(vl1, vl2, edgecolors='none')
        maxv = max(max(vl1), max(vl2))
        minv = min(min(vl1), min(vl2))
        _ = plot((minv, maxv), (minv, maxv), color='k', linestyle='-')
        xlabel(fp1)
        ylabel(fp2)
    tau, tau_p = stats.kendalltau(vl1, vl2)
    spearman_rho, spearman_p = stats.spearmanr(vl1, vl2)
    pearson_r, pearson_p = stats.pearsonr(vl1, vl2)
    if not silent:
        print('%s %s %s %s %s %s %s %s' % (fp1, fp2, tau, tau_p,
                                           spearman_rho, spearman_p,
                                           pearson_r, pearson_p))
    return tau, spearman_rho, pearson_r
And now compare a few methods to each other
Start with two very closely related fingerprints:
# Count-based vs. hashed bit-vector form of the same radius-0 Morgan
# fingerprint: the rankings should agree very closely.
_=directCompare(scoredLists,'MFP0','MFP0-bits')
What about two different Morgan fingerprint radii?
# Morgan fingerprints of radius 1 vs. radius 2.
_=directCompare(scoredLists,'MFP1','MFP2')
And a couple of RDKit fingerprint sizes (maximum path lengths)
# RDKit path fingerprints with maximum path lengths 4 vs. 6.
_=directCompare(scoredLists,'RDKit4','RDKit6')
Do all the comparisons so that we can do some statistics on them
# Run directCompare for every unordered pair of fingerprints (no plots, no
# printing) and pickle the results.
# NOTE: despite the dict name, the first statistic is Kendall's tau, not
# Cohen's kappa; the dict name is kept because later cells unpickle into it.
ks = sorted(scoredLists.keys())
kappas = {}
spearmans = {}
pearsons = {}
for i, ki in enumerate(ks):
    for kj in ks[i + 1:]:
        tau, spearman, pearson = directCompare(scoredLists, ki, kj,
                                               plotIt=False, silent=True)
        kappas[(ki, kj)] = tau
        spearmans[(ki, kj)] = spearman
        pearsons[(ki, kj)] = pearson
# close the gzip stream explicitly so the pickle is fully flushed to disk
with gzip.open('../data/chembl16_25K.pairs.sim_workup.pkl.gz', 'wb') as outf:
    cPickle.dump((ks, kappas, spearmans, pearsons), outf)
# Reload the pairwise-comparison workup so the analysis below can start here.
(ks,kappas,spearmans,pearsons)=cPickle.load(gzip.open('../data/chembl16_25K.pairs.sim_workup.pkl.gz','rb'))
Load the data into a Pandas dataframe
# Flatten the three result dicts (all keyed by (Sim1, Sim2) label pairs)
# into a tidy DataFrame with one row per fingerprint pair.
rows = []
for (s1, s2), tau in kappas.items():
    rows.append([s1, s2, tau, spearmans[(s1, s2)], pearsons[(s1, s2)]])
df = pandas.DataFrame(data=rows, columns=['Sim1', 'Sim2', 'Tau', 'Spearman', 'Pearson'])
# DataFrame.sort(columns=...) was removed in pandas 0.20;
# sort_values(by=...) is the equivalent modern call.
df.sort_values(by='Sim1', inplace=True)
df.head()
Let's get a feeling for what the correlations look like for various tau values.
# Example scatter plots spanning a range of tau values, from strongly
# correlated fingerprint pairs down to weakly correlated ones.
# figure/subplot/title come from the notebook's pylab environment.
figure(figsize=(24,4))
subplot(1,5,1)
tau,s,p=directCompare(scoredLists,'MFP1','MFP1-bits',silent=True)
title('tau=%.2f'%tau)
subplot(1,5,2)
tau,s,p=directCompare(scoredLists,'FeatMFP1','FeatMFP3',silent=True)
title('tau=%.2f'%tau)
subplot(1,5,3)
tau,s,p=directCompare(scoredLists,'Avalon-1024','RDKit6',silent=True)
title('tau=%.2f'%tau)
subplot(1,5,4)
tau,s,p=directCompare(scoredLists,'RDKit7','TT',silent=True)
title('tau=%.2f'%tau)
subplot(1,5,5)
tau,s,p=directCompare(scoredLists,'MFP0','RDKit7',silent=True)
_=title('tau=%.2f'%tau)
That last one is somewhat artificial due to the lower bound on MFP0 enforced by the data set.
There's not much correlation left at tau=0.25.
Similar Fingerprints: all the pairs of fingerprints where tau>0.85
# Fingerprint pairs whose rankings agree closely (tau > 0.85), best first.
# DataFrame.sort(columns=...) was removed in pandas 0.20;
# sort_values(by=...) is the equivalent call.
HTML(df[df.Tau > 0.85].sort_values(by='Tau', ascending=False).to_html(
        float_format=lambda x: '%4.3f' % x,
        classes="table display"))
Nothing terribly surprising there.
Different Fingerprints: all the pairs of fingerprints where tau<0.3
We also exclude all of the MFP0 variants.
# Pairs with low rank correlation (tau < 0.3), excluding the MFP0 variants.
# A single combined boolean mask replaces the original chained
# df[...][~df...][~df...] indexing, which applied full-frame masks to a
# slice (pandas reindexes them with a UserWarning); one mask is equivalent
# and explicit.  sort(columns=...) was removed in pandas 0.20.
excluded = ('MFP0', 'MFP0-bits', 'FeatMFP0')
subset = df[(df.Tau < 0.3) &
            ~df.Sim1.isin(excluded) &
            ~df.Sim2.isin(excluded)]
HTML(subset.sort_values(by='Tau', ascending=False).to_html(
        float_format=lambda x: '%4.3f' % x,
        classes="table display"))
What about methods that work well for similarity-based virtual screening?
Look at the methods that we found to be "best" as measured by AUC for similarity-based virtual screening in our benchmarking paper (http://www.jcheminf.com/content/5/1/26 ). The table itself is here: http://www.jcheminf.com/content/5/1/26/table/T1
I've got best in quotes here because there wasn't a statistically significant difference in performance.
# Restrict to the four fingerprints that did "best" in the virtual-screening
# benchmark.  A single combined mask replaces the warning-prone chained
# boolean indexing of the original.
best = ('AP', 'Avalon-1024', 'TT', 'RDKit5')
subset = df[df.Sim1.isin(best) & df.Sim2.isin(best)]
HTML(subset.to_html(float_format=lambda x: '%4.3f' % x,
                    classes="table display"))
That is the correlation over the entire range of similarities. What about if we just look at the top pairs for each fingerprint?
# Keep the top nToDo pairs (by similarity) for each of the four fingerprints,
# then build reduced scored lists restricted to the union of those picks.
nToDo = 200
apl = sorted(scoredLists['AP'], reverse=True)[:nToDo]
ttl = sorted(scoredLists['TT'], reverse=True)[:nToDo]
avl = sorted(scoredLists['Avalon-1024'], reverse=True)[:nToDo]
rdkl = sorted(scoredLists['RDKit5'], reverse=True)[:nToDo]
idsToKeep = set()
for topList in (apl, ttl, avl, rdkl):
    idsToKeep.update(x[1] for x in topList)  # x is (similarity, pair index)
print(len(idsToKeep))
limitedLists = {}
for fp in ('AP', 'TT', 'Avalon-1024', 'RDKit5'):
    limitedLists[fp] = [scoredLists[fp][x] for x in idsToKeep]
# Pairwise rank correlations between the four fingerprints, restricted to
# the union of their top picks (limitedLists built in the previous cell).
figure(figsize=(30,4))
subplot(1,6,1)
tau,s,p=directCompare(limitedLists,'AP','TT',silent=True)
title('tau=%.2f'%tau)
subplot(1,6,2)
tau,s,p=directCompare(limitedLists,'AP','Avalon-1024',silent=True)
title('tau=%.2f'%tau)
subplot(1,6,3)
tau,s,p=directCompare(limitedLists,'AP','RDKit5',silent=True)
title('tau=%.2f'%tau)
subplot(1,6,4)
tau,s,p=directCompare(limitedLists,'TT','Avalon-1024',silent=True)
title('tau=%.2f'%tau)
subplot(1,6,5)
tau,s,p=directCompare(limitedLists,'TT','RDKit5',silent=True)
title('tau=%.2f'%tau)
subplot(1,6,6)
tau,s,p=directCompare(limitedLists,'Avalon-1024','RDKit5',silent=True)
_=title('tau=%.2f'%tau)
The Tau values are still pretty low. The rankings from these fingerprints tend to have a low correlation with each other.
The comparison in the benchmarking paper showed, on the other hand, that across a broad range of data sets the fingerprints perform at about the same level when it comes to enrichment. It seems like there's either a contradiction or this set of pairs isn't particularly representative of what we used for that paper.
Even more concrete: look at the number of overlapping pairs in that pick
Look at the overlap between the top picks of those fingerprints.
# Top-200 picks again, this time counting how much the four pick sets overlap.
nToDo = 200
apl = sorted(scoredLists['AP'], reverse=True)[:nToDo]
ttl = sorted(scoredLists['TT'], reverse=True)[:nToDo]
avl = sorted(scoredLists['Avalon-1024'], reverse=True)[:nToDo]
rdkl = sorted(scoredLists['RDKit5'], reverse=True)[:nToDo]
idsToKeep = set()
for topList in (apl, ttl, avl, rdkl):
    idsToKeep.update(x[1] for x in topList)  # x is (similarity, pair index)
print('Overall number: %d' % len(idsToKeep))
ids = {}
ids['AP'] = set(x[1] for x in apl)
ids['TT'] = set(x[1] for x in ttl)
ids['Avalon-1024'] = set(x[1] for x in avl)
ids['RDKit5'] = set(x[1] for x in rdkl)
ks = sorted(ids.keys())
for i, k1 in enumerate(ks):
    for k2 in ks[i + 1:]:
        overlap = len(ids[k1].intersection(ids[k2]))
        # fraction relative to the pick-list size (len(apl) == nToDo here)
        print('%s %s %d %.2f' % (k1, k2, overlap, float(overlap) / len(apl)))
So each of those sets of picks has a good fraction (>40%) of different compounds. Nice!
Repeat that for fewer picks:
# Same overlap analysis with a smaller pick list (top 100 per fingerprint).
nToDo = 100
apl = sorted(scoredLists['AP'], reverse=True)[:nToDo]
ttl = sorted(scoredLists['TT'], reverse=True)[:nToDo]
avl = sorted(scoredLists['Avalon-1024'], reverse=True)[:nToDo]
rdkl = sorted(scoredLists['RDKit5'], reverse=True)[:nToDo]
idsToKeep = set()
for topList in (apl, ttl, avl, rdkl):
    idsToKeep.update(x[1] for x in topList)  # x is (similarity, pair index)
print('Overall number: %d' % len(idsToKeep))
ids = {}
ids['AP'] = set(x[1] for x in apl)
ids['TT'] = set(x[1] for x in ttl)
ids['Avalon-1024'] = set(x[1] for x in avl)
ids['RDKit5'] = set(x[1] for x in rdkl)
ks = sorted(ids.keys())
for i, k1 in enumerate(ks):
    for k2 in ks[i + 1:]:
        overlap = len(ids[k1].intersection(ids[k2]))
        # fraction relative to the pick-list size (len(apl) == nToDo here)
        print('%s %s %d %.2f' % (k1, k2, overlap, float(overlap) / len(apl)))
Still a significant number of unique compounds when considering pairwise overlaps. AP--TT is, of course, something of an exception.
Idle Curiosity: Difference between the correlation coefficients
def _plotCorrelationPair(df, xk, yk):
    """Scatter df[yk] vs df[xk] with an x=y reference line.

    Used to eyeball how the three correlation measures computed above
    (Tau, Spearman, Pearson) relate to each other.
    """
    tplt = df.plot(x=xk, y=yk, style='o')
    minV = min(min(df[xk]), min(df[yk]))
    maxV = max(max(df[xk]), max(df[yk]))
    tplt.plot((minV, maxV), (minV, maxV))
    xlabel(xk)  # pylab globals from the notebook environment
    ylabel(yk)

# The original repeated the same eight lines three times; the helper keeps
# the three plots identical by construction.
_plotCorrelationPair(df, 'Tau', 'Spearman')
_plotCorrelationPair(df, 'Tau', 'Pearson')
_plotCorrelationPair(df, 'Spearman', 'Pearson')
Here's the full set of results... there are a lot
# Full table of all pairwise comparisons, best-correlated first.
# (display/HTML/Javascript were already imported at the top of the notebook;
# the re-import keeps this cell runnable on its own.)
from IPython.core.display import display, HTML, Javascript
# DataFrame.sort(columns=...) was removed in pandas 0.20;
# sort_values(by=...) is the equivalent call.
HTML(df.sort_values(by='Tau', ascending=False).to_html(
        float_format=lambda x: '%4.3f' % x,
        classes="table display"))