More on finding related documents in ChEMBL.
This is a followup to this post, the original notebook is here.
This time I will use Andrew Dalke's very nice, and super fast, chemfp to do the similarity comparisons. This allows much larger data sets to be run, so I'll look at documents from 2010-2012 and, in a second step, use a lower similarity threshold to define neighbors.
from rdkit import Chem,DataStructs
import time,random
from collections import defaultdict
import psycopg2
from rdkit.Chem import Draw,PandasTools,rdMolDescriptors
from rdkit.Chem.Draw import IPythonConsole
from rdkit import rdBase
from __future__ import print_function
import requests
from xml.etree import ElementTree
import pandas as pd
%load_ext sql
print(rdBase.rdkitVersion)
Preparation
Let's start by getting all molecules from papers from the years 2010-2012 from ChEMBL_16.
data = %sql postgresql://localhost/chembl_16 \
select molregno,canonical_smiles smiles from \
(select distinct molregno from \
(select doc_id from docs where year>=2010 and year<=2012) t1 \
join (select doc_id,molregno from activities) t2 \
using (doc_id)) tbl \
join compound_structures using (molregno) ;
outf=file('../data/chembl16_2010-2012.smi','w+')
for i,(molregno,smi) in enumerate(data):
outf.write('%s\t%d\n'%(smi,molregno))
outf=None
outf=file('../data/chembl16_2010-2012.mfp2.fps','w+')
outf.write("""#FPS1
#num_bits=2048
#software=RDKit/%s
"""%rdBase.rdkitVersion)
for i,(molregno,smi) in enumerate(data):
mol = Chem.MolFromSmiles(str(smi))
if not mol:
continue
fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol,2,nBits=2048)
outf.write("%s\t%d\n"%(DataStructs.BitVectToFPSText(fp),molregno))
if i and not i%(5000):
print("Done: %s"%i)
outf=None
data=None
t1=time.time()
rows = !simsearch --NxN -t 0.8 ../data/chembl16_2010-2012.mfp2.fps
t2=time.time()
print("That took %.2f seconds"%(t2-t1))
len(rows)
To be clear on exactly how amazing chemfp is: I just found all the pairs in a 230Kx230K set of fingerprints in about two minutes on my three year old desktop linux box.
Limit the data to include only rows for molecules that have at least one neighbor:
rowsWithData=[x for x in rows if x[0] not in '#0']
len(rowsWithData)
rowsWithData[0]
Grab doc_ids for each compound:
data = %sql \
select molregno,doc_id from \
(select doc_id from docs where year>=2010 and year<=2012) t1 \
join (select doc_id,molregno from activities) t2 \
using (doc_id) ;
molDocs=defaultdict(list)
for regno,doc_id in data:
if doc_id not in molDocs[regno]:
molDocs[regno].append(doc_id)
data=None
another data structure to hold the compounds in each document:
docMols=defaultdict(list)
for regno,docs in molDocs.iteritems():
for docId in docs:
docMols[docId].append(regno)
And now figure out the document pairs based on the compound pairs:
docPairs=defaultdict(list)
molSims={}
for row in rowsWithData:
row = row.split('\t')
nNbrs = int(row[0])
bRegno = int(row[1])
for i in range(nNbrs):
iRegno=int(row[2*i+2])
if bRegno==iRegno:
continue
sim=float(row[2*i+3])
if bRegno<iRegno:
tpl=(bRegno,iRegno)
else:
tpl=(iRegno,bRegno)
molSims[tpl]=sim
for dId1 in molDocs[bRegno]:
for dId2 in molDocs[iRegno]:
if dId1==dId2:
continue
if dId1<dId2:
dtpl = (dId1,dId2)
else:
dtpl = (dId2,dId1)
if tpl not in docPairs[dtpl]:
docPairs[dtpl].append(tpl)
Put the data into a DataFrame so that it's easier to look at:
keys=['docid1','docid2','nDoc1','nDoc2','nInCommon','pair_count']
rows=[]
for k,v in docPairs.iteritems():
row=[]
row.append(k[0])
row.append(k[1])
s0=set(docMols[k[0]])
s1=set(docMols[k[1]])
row.append(len(s0))
row.append(len(s1))
row.append(len(s0.intersection(s1)))
row.append(len(v))
rows.append(row)
df = pd.DataFrame(data=rows,columns=keys)
Add our computed summary properties:
df['compound_id_tani']=df.apply(lambda row: float(row['nInCommon'])/(row['nDoc1']+row['nDoc2']-row['nInCommon']),axis=1)
df['frac_high_sim']=df.apply(lambda row: float(row['pair_count'])/(row['nDoc1']*row['nDoc2']),axis=1)
df[(df.nDoc1>10)&(df.nDoc2>10)].sort('frac_high_sim',ascending=False).head(10)
Take a look at the document titles from pubmed:
# turn off row count printing
%config SqlMagic.feedback = False
titlePairs=[]
for id_,row in df[(df.nDoc1>10)&(df.nDoc2>10)].sort('frac_high_sim',ascending=False).head(10).iterrows():
did1=row['docid1']
did2=row['docid2']
pmids=%sql select doc_id,pubmed_id,title from docs where doc_id in (:did1,:did2)
tpl = pmids[0][1],pmids[1][1]
if tpl[0] is None:
print('no pmid found for item: %s'%(str(pmids[0])))
continue
if tpl[1] is None:
print('no pmid found for item: %s'%(str(pmids[1])))
continue
txt=requests.get('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=%d,%d'%tpl).text
et = ElementTree.fromstring(txt.encode('utf-8'))
ts=[x.text for x in et.findall(".//*[@Name='Title']")]
titlePairs.append(ts)
print(str(tpl))
print(' '+ts[0])
print(' '+ts[1])
And then inspect the results:
for id_,row in df[(df.nDoc1>10)&(df.nDoc2>10)].sort('frac_high_sim',ascending=False).head(10).iterrows():
did1=row['docid1']
did2=row['docid2']
pmids=%sql select doc_id,pubmed_id,chembl_id,year from docs where doc_id in (:did1,:did2)
print('https://www.ebi.ac.uk/chembl/doc/inspect/%s (%d )- https://www.ebi.ac.uk/chembl/doc/inspect/%s (%d)'%(pmids[0][2],pmids[0][3],pmids[1][2],pmids[1][3]))
Once again, many of these don't have ChEMBL matches identified. This is at least partially due to the fact that papers have to cite each other to show up (thanks to George for pointing that out), but another part is clearly going to be the low absolute overlap between papers.
The exception is CHEMBL1255554 - CHEMBL1777727, which is a pair in the current ChEMBL scheme.
look at some molecules
d1,d2=50562,59446
mPairs=docPairs[(d1,d2)][:]
random.seed(d1)
random.shuffle(mPairs)
ms = []
for i,j in mPairs[:3]:
d = %sql select m from rdk.mols where molregno=:i
ms.append(Chem.MolFromSmiles(d[0][0]))
d = %sql select m from rdk.mols where molregno=:j
ms.append(Chem.MolFromSmiles(d[0][0]))
Draw.MolsToGridImage(ms,molsPerRow=2,subImgSize=(300,300))
The molecules do certainly seem similar to each other, and they definitely highlight some nice opportunities for improvement of the depiction code (positive attitude! positive attitude!).
d1,d2=52860, 66739
mPairs=docPairs[(d1,d2)][:]
random.seed(d1)
random.shuffle(mPairs)
ms = []
for i,j in mPairs[:3]:
d = %sql select m from rdk.mols where molregno=:i
ms.append(Chem.MolFromSmiles(d[0][0]))
d = %sql select m from rdk.mols where molregno=:j
ms.append(Chem.MolFromSmiles(d[0][0]))
Draw.MolsToGridImage(ms,molsPerRow=2,subImgSize=(400,200))
Try a lower similarity threshold
t1=time.time()
rows = !simsearch --NxN -t 0.6 ../data/chembl16_2010-2012.mfp2.fps
t2=time.time()
print("That took %.2f seconds"%(t2-t1))
It takes a bit longer with the lower threshold, but chemfp is still excellently fast.
len(rows)
rowsWithData=[x for x in rows if x[0] not in '#0']
len(rowsWithData)
data = %sql \
select molregno,doc_id from \
(select doc_id from docs where year>=2010 and year<=2012) t1 \
join (select doc_id,molregno from activities) t2 \
using (doc_id) ;
molDocs=defaultdict(list)
for regno,doc_id in data:
if doc_id not in molDocs[regno]:
molDocs[regno].append(doc_id)
data=None
docMols=defaultdict(list)
for regno,docs in molDocs.iteritems():
for docId in docs:
docMols[docId].append(regno)
docPairs=defaultdict(list)
molSims={}
for row in rowsWithData:
row = row.split('\t')
nNbrs = int(row[0])
bRegno = int(row[1])
for i in range(nNbrs):
iRegno=int(row[2*i+2])
if bRegno==iRegno:
continue
sim=float(row[2*i+3])
if bRegno<iRegno:
tpl=(bRegno,iRegno)
else:
tpl=(iRegno,bRegno)
molSims[tpl]=sim
for dId1 in molDocs[bRegno]:
for dId2 in molDocs[iRegno]:
if dId1==dId2:
continue
if dId1<dId2:
dtpl = (dId1,dId2)
else:
dtpl = (dId2,dId1)
if tpl not in docPairs[dtpl]:
docPairs[dtpl].append(tpl)
keys=['docid1','docid2','nDoc1','nDoc2','nInCommon','pair_count']
rows=[]
for k,v in docPairs.iteritems():
row=[]
row.append(k[0])
row.append(k[1])
s0=set(docMols[k[0]])
s1=set(docMols[k[1]])
row.append(len(s0))
row.append(len(s1))
row.append(len(s0.intersection(s1)))
row.append(len(v))
rows.append(row)
df = pd.DataFrame(data=rows,columns=keys)
df['compound_id_tani']=df.apply(lambda row: float(row['nInCommon'])/(row['nDoc1']+row['nDoc2']-row['nInCommon']),axis=1)
df['frac_high_sim']=df.apply(lambda row: float(row['pair_count'])/(row['nDoc1']*row['nDoc2']),axis=1)
df[(df.nDoc1>10)&(df.nDoc2>10)].sort('frac_high_sim',ascending=False).head(10)
titlePairs=[]
for id_,row in df[(df.nDoc1>10)&(df.nDoc2>10)].sort('frac_high_sim',ascending=False).head(10).iterrows():
did1=row['docid1']
did2=row['docid2']
pmids=%sql select doc_id,pubmed_id,title from docs where doc_id in (:did1,:did2)
tpl = pmids[0][1],pmids[1][1]
if tpl[0] is None:
print('no pmid found for item: %s'%(str(pmids[0])))
continue
if tpl[1] is None:
print('no pmid found for item: %s'%(str(pmids[1])))
continue
txt=requests.get('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=%d,%d'%tpl).text
et = ElementTree.fromstring(txt.encode('utf-8'))
ts=[x.text for x in et.findall(".//*[@Name='Title']")]
titlePairs.append(ts)
print(str(tpl))
print(' '+ts[0])
print(' '+ts[1])
for id_,row in df[(df.nDoc1>10)&(df.nDoc2>10)].sort('frac_high_sim',ascending=False).head(10).iterrows():
did1=row['docid1']
did2=row['docid2']
pmids=%sql select doc_id,pubmed_id,chembl_id,year from docs where doc_id in (:did1,:did2)
print('https://www.ebi.ac.uk/chembl/doc/inspect/%s (%d )- https://www.ebi.ac.uk/chembl/doc/inspect/%s (%d)'%(pmids[0][2],pmids[0][3],pmids[1][2],pmids[1][3]))
d1,d2=65507,66530
mPairs=docPairs[(d1,d2)][:]
random.seed(d1)
random.shuffle(mPairs)
ms = []
for i,j in mPairs[:3]:
d = %sql select m from rdk.mols where molregno=:i
ms.append(Chem.MolFromSmiles(d[0][0]))
d = %sql select m from rdk.mols where molregno=:j
ms.append(Chem.MolFromSmiles(d[0][0]))
Draw.MolsToGridImage(ms,molsPerRow=2,subImgSize=(300,200))
d1,d2=61045,66790
mPairs=docPairs[(d1,d2)][:]
random.seed(d1)
random.shuffle(mPairs)
ms = []
for i,j in mPairs[:3]:
d = %sql select m from rdk.mols where molregno=:i
ms.append(Chem.MolFromSmiles(d[0][0]))
d = %sql select m from rdk.mols where molregno=:j
ms.append(Chem.MolFromSmiles(d[0][0]))
Draw.MolsToGridImage(ms,molsPerRow=2,subImgSize=(300,200))
1 comment:
Great to see the chemfps in action!
What do you think about using defaultdict(set) in snippets [8] and [21]?
molDocs=defaultdict(set)
for regno,doc_id in data:
molDocs[regno].add(doc_id)
That way you could skip the look-up check.
Post a Comment