---
title: Title
keywords: fastai
sidebar: home_sidebar
nb_path: "nbs/05_ADFam.ipynb"
---
import gzip

import numpy as np
import pandas as pd

from SEQLinkage.Plot import plotped
def get_vcf_names(vcf_path):
    """Return the column names from the ``#CHROM`` header line of a gzipped VCF.

    Args:
        vcf_path: Path to a gzip-compressed VCF file.

    Returns:
        list[str]: The tab-separated fields of the first line that starts
        with ``#CHROM``, each stripped of surrounding whitespace (the first
        field keeps its leading ``#``).

    Raises:
        ValueError: If the file contains no line starting with ``#CHROM``.
    """
    # The ``with`` block closes the file, so no explicit close() is needed.
    with gzip.open(vcf_path, "rt") as ifile:
        for line in ifile:
            if line.startswith("#CHROM"):
                return [x.strip() for x in line.split('\t')]
    # Previously this fell through to an unbound local variable.
    raise ValueError(f"no '#CHROM' header line found in {vcf_path!r}")
def read_vcf_chunk(fn, chunksize=10):
    """Read the first ``chunksize`` data rows of a VCF file into a DataFrame.

    Column names come from ``get_vcf_names(fn)``. Lines (or line remainders)
    starting at ``#`` are dropped by the parser, and fields are split on runs
    of whitespace.

    Args:
        fn: Path to the VCF file.
        chunksize: Number of rows to read.

    Returns:
        pandas.DataFrame: The first chunk of rows.
    """
    names = get_vcf_names(fn)
    # sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True.
    read_kwargs = dict(comment='#', chunksize=chunksize, sep=r'\s+', header=None, names=names)
    try:
        reader = pd.read_csv(fn, compression='gzip', **read_kwargs)
    except Exception:
        # Best-effort fallback kept from the original: retry without forcing
        # gzip. Unlike a bare ``except:``, this no longer swallows
        # KeyboardInterrupt / SystemExit.
        reader = pd.read_csv(fn, **read_kwargs)
    try:
        return pd.DataFrame(reader.get_chunk(chunksize))
    finally:
        # Release the underlying file handle; only one chunk is ever consumed.
        reader.close()
# Load the first 10 rows of the full-sample VCF; the bare `vcf` expression displays it as notebook output.
vcf = read_vcf_chunk('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/full_sample/vcf/full_sample.vcf.gz', chunksize=10)
vcf
# Directory prefix that the phenotype / pedigree file names below are appended to.
pheno_full_sample_path = '/mnt/mfs/statgen/alzheimers-family/linkage_files/pheno/full_sample/'
# Files in the phenotype directory (pasted listing): efiga_pedigree.txt full_sample_efi_nia.fam full_sample_fam_id.txt full_sample_fam_pop.txt full_sample_id_list.txt full_sample_pheno.txt niaload_pedigree.txt
# Exploratory notebook cells: load the phenotype / pedigree tables and inspect them.
# Bare expressions (e.g. `efiga_ped`) only display a value in the notebook.
# Tab-separated AD phenotype table; the first row is the header.
pheno_df = pd.read_csv('/mnt/mfs/statgen/alzheimers-family/pheno/pheno_modified/AD.txt',header=0,sep='\t')
# Whitespace-delimited pedigree tables.
efiga_ped = pd.read_csv(pheno_full_sample_path+'efiga_pedigree.txt',delim_whitespace=True)
efiga_ped
nia_ped = pd.read_csv(pheno_full_sample_path+'niaload_pedigree.txt',delim_whitespace=True)
nia_ped[nia_ped.FID =='27_25']
# Cross-tabulations: every column is cast to str first, then rows are counted per pair of values.
nia_ped.astype(str).groupby(['prob.AD','APOE']).size().unstack()
nia_ped.astype(str).groupby(['prob.AD','APOE4NUM']).size().unstack()
efiga_ped.astype(str).groupby(['AD','APOE']).size().unstack()
pheno_df.astype(str).groupby(['AD','APOE']).size().unstack()
# NOTE(review): `all_fam` is only assigned further down in this block, so this line
# fails when the file is run top to bottom — presumably the cells were executed out of order.
all_fam.ad.value_counts()
pheno_df
pheno_df[['AD','APOE']].sort_values(['AD','APOE']).astype(str).value_counts()
# Headerless whitespace-delimited tables; column names are supplied here.
efi_nia_fam = pd.read_csv(pheno_full_sample_path+'full_sample_efi_nia.fam',delim_whitespace=True,header=None,names = ['fid','iid','father','mother','gender','trait'])
efi_nia_txt = pd.read_csv(pheno_full_sample_path+'full_sample_fam_id.txt',delim_whitespace=True,header=None,names = ['fid','iid','father','mother','gender','trait','id'])
efi_nia_txt
sample_id_list = pd.read_csv(pheno_full_sample_path+'full_sample_id_list.txt',delim_whitespace=True,header=None,names = ['id'])
sample_id_list
# Number of positions where the two id columns agree.
# NOTE(review): this element-wise comparison presumably needs both Series to be identically labelled — confirm.
sum(pheno_df.IID == sample_id_list.id)
sample_pheno = pd.read_csv(pheno_full_sample_path+'full_sample_pheno.txt',sep='\t')
sample_pheno
sample_pheno[sample_pheno.ID == '167_DCH23.54']
sample_pheno.describe(include='all')
pheno_df
# Headerless pedigree (.fam) table read from the working directory.
all_fam = pd.read_csv('all_sample.fam',delim_whitespace=True,header=None,names=['fid','iid','fathid','mothid','sex','ad'])
# Flag whether each individual id appears among the column names of the VCF preview.
all_fam['vcf'] = list(all_fam.iid.isin(vcf.columns))
all_fam[all_fam.iid.isin(['10R_R99_8','215_59','27_25_','4_595_18','4_595_69','4_603_43'])]
# Number of rows per family id.
famid = all_fam.fid.value_counts()
# Families with exactly one row, and how many of those rows are flagged in the VCF.
one_fam = all_fam[all_fam.fid.isin(famid[famid==1].keys())]
sum(one_fam.vcf)
# All remaining rows (families with more than one row); written without header or index.
twom_fam = all_fam[all_fam.fid.isin(famid[famid==1].keys())==False]
twom_fam.to_csv('twoormore_member_fam.csv',header=False,index=False)
# Distinct father ids followed by distinct mother ids; not referenced again in the visible source.
fmid=list(set(twom_fam.fathid))+list(set(twom_fam.mothid))
def create_founder(fam, foid='fathid'):
    """Build founder rows for parents that are referenced but have no row of their own.

    For every row of ``fam`` whose ``foid`` value is not ``'0'`` and does not
    occur in ``fam.iid``, a copy of that row is turned into a record for the
    parent: ``iid`` becomes the parent id, both parent ids become ``'0'``,
    ``ad`` becomes ``-9`` and ``vcf`` becomes ``False``. ``sex`` is 1 for
    ``'fathid'`` and 2 for ``'mothid'``. All other columns (e.g. ``fid``) are
    inherited from the child row. One row is produced per referencing child,
    so a parent shared by several children appears several times.

    Args:
        fam: Pedigree DataFrame with columns ``iid``, ``fathid``, ``mothid``
            (plus ``sex``, ``ad``, ``vcf``). It is not modified.
        foid: Which parent column to scan, ``'fathid'`` or ``'mothid'``.

    Returns:
        pandas.DataFrame: The new founder rows (possibly empty).

    Raises:
        ValueError: If ``foid`` is neither ``'fathid'`` nor ``'mothid'``.
    """
    sex_by_column = {'fathid': 1, 'mothid': 2}
    if foid not in sex_by_column:
        # The original `raise print(...)` printed a message and then failed with
        # an unrelated TypeError; raise a meaningful exception instead.
        raise ValueError('id error, do not match')
    parent_ids = fam[foid]
    tmp = fam[~parent_ids.isin(fam.iid) & (parent_ids != '0')].copy()
    # Item assignment always targets the column, unlike attribute assignment.
    tmp['iid'] = tmp[foid]
    tmp['sex'] = sex_by_column[foid]
    tmp['fathid'] = '0'
    tmp['mothid'] = '0'
    tmp['ad'] = -9
    tmp['vcf'] = False
    return tmp
# Append founder rows for fathers and mothers that are referenced but have no row of their own.
new_twom_fam = pd.concat([twom_fam,create_founder(twom_fam,'fathid'),create_founder(twom_fam,'mothid')])
new_twom_fam
new_twom_fam.to_csv('data/new_twoormore_member_fam.csv',header=False,index=False)
# Inspect repeated ids in the two source pedigrees (display only).
efiga_ped[efiga_ped.ID.duplicated(keep=False)]
nia_ped[nia_ped.Sample_ID.duplicated(keep=False)].sort_values('Sample_ID')
# Every row whose iid occurs more than once, sorted by iid and renumbered 0..n-1.
dufam = new_twom_fam[new_twom_fam.iid.duplicated(keep=False)].copy().sort_values('iid').reset_index(drop=True)
# Keep rows with a unique iid plus selected rows of `dufam`.
# NOTE(review): the iloc positions are hard-coded and only valid for the exact
# input data and sort order they were chosen for — presumably picked by hand.
nodp_fam = pd.concat([new_twom_fam[~new_twom_fam.iid.duplicated(keep=False)],dufam.iloc[[0,4,5,11,13,24,26,28]]])
def _set_fields(fam, iid, **fields):
    """Overwrite the named columns, in place, on every row of ``fam`` whose ``iid`` matches."""
    # A positional boolean array avoids label alignment on a non-unique index.
    rows = (fam.iid == iid).to_numpy()
    for column, new_value in fields.items():
        fam.loc[rows, column] = new_value


# All edits below use `.loc[rows, column] = value`. The original chained form
# `frame.column[rows] = value` is not guaranteed to write back to the frame and
# is a no-op under pandas Copy-on-Write.

# Rows with exactly one recorded parent get a placeholder id for the missing
# one: the known parent's id with a 'c' suffix.
nof_id = (nodp_fam.fathid == '0') & (nodp_fam.mothid != '0')
nodp_fam.loc[nof_id.to_numpy(), 'fathid'] = [x + 'c' for x in nodp_fam.loc[nof_id.to_numpy(), 'mothid']]
# Computed after the father fix, so rows repaired above are not selected again.
nom_id = (nodp_fam.fathid != '0') & (nodp_fam.mothid == '0')
nodp_fam.loc[nom_id.to_numpy(), 'mothid'] = [x + 'c' for x in nodp_fam.loc[nom_id.to_numpy(), 'fathid']]
# Add founder rows for the placeholder parents, then keep the first row per iid.
nodp_fam = pd.concat([nodp_fam, create_founder(nodp_fam, 'fathid'), create_founder(nodp_fam, 'mothid')])
nodp_fam = nodp_fam[~nodp_fam.iid.duplicated()].copy()
nodp_fam.to_csv('data/nodp_fam.csv', header=False, index=False)

# Manual corrections for individual records. The bare filtered expressions
# only display the affected family in the notebook.
_set_fields(nodp_fam, '27_122_16055', fathid='27_122_84953', mothid='27_122_84952')
_set_fields(nodp_fam, '10R_R47_62', fathid='10R_R47_43', mothid='10R_R47_1')
nodp_fam[nodp_fam.fid == '27_126'].sort_values('fathid')
_set_fields(nodp_fam, '27_126_86559', fathid='27_126_86560', mothid='27_126_85004')
nodp_fam[nodp_fam.fid == '10R_R78'].sort_values(['fathid', 'iid'])
_set_fields(nodp_fam, '10R_R78_31', fathid='10R_R78_51')
nodp_fam[nodp_fam.fid == '27_152'].sort_values('fathid')
_set_fields(nodp_fam, '27_152_85910', fathid='27_152_85911', mothid='27_152_85912')
nodp_fam[nodp_fam.fid == '26_TCC'].sort_values('fathid')
_set_fields(nodp_fam, '26_TCC_TCC65609', fathid='26_TCC_TCC65610', mothid='26_TCC_TCC65606')
nodp_fam[nodp_fam.fid == '10R_R99'].sort_values('fathid')
_set_fields(nodp_fam, '10R_R99_8', fathid='0', mothid='0', sex=1)
nodp_fam[nodp_fam.fid == '27_192'].sort_values('fathid')
_set_fields(nodp_fam, '27_192_86076', sex=1)
nodp_fam[nodp_fam.fid == '4_715'].sort_values('fathid')
_set_fields(nodp_fam, '4_715_6', sex=1)
nodp_fam[nodp_fam.fid == '170'].sort_values('fathid')
_set_fields(nodp_fam, '170_38', fathid='170_31', mothid='170_18')
# NOTE(review): the meaning of the constant 250 is not visible in this file — confirm.
sum(nodp_fam.vcf)+250
# Overwrites the file written above, now including the manual corrections.
nodp_fam.to_csv('data/nodp_fam.csv', header=False, index=False)
# Working copy of the pedigree, sorted by family and father id.
ped = nodp_fam.sort_values(['fid','fathid']).copy()
# Index rows by individual id so that parents can be looked up by label (see get_depth).
ped.index = list(ped.iid)
ped
def get_depth(ped, i, value):
    """Return the generation depth of the individual at row position ``i``.

    Depth is 1 for an individual with no resolvable parent, otherwise one
    more than the larger depth of its parents.

    Args:
        ped: Pedigree DataFrame indexed by individual id, whose columns at
            positions 2 and 3 hold the two parent ids (``'0'`` = no parent).
        i: Row position of the individual in ``ped``.
        value: Sequence parallel to the rows of ``ped`` holding depths that
            are already known; ``0`` means "not computed yet". It is only
            read here, and the caller stores the returned depth.

    Returns:
        int: The depth of row ``i``.

    Note:
        A parent that cannot be resolved to a single row of ``ped`` (id absent
        from the index, or a non-unique index) counts as depth 1, as before.
        Unlike the previous bare ``except:``, a RecursionError caused by a
        cyclic pedigree is no longer swallowed.
    """
    row = ped.iloc[i]
    deepest = 1
    # Positional access via .iloc: integer keys on a label-indexed row are deprecated.
    for parent_id in (row.iloc[2], row.iloc[3]):
        if parent_id == '0':
            continue
        try:
            ind = ped.index.get_loc(parent_id)
            known = value[ind]
            parent_depth = known if known != 0 else get_depth(ped, ind, value)
            candidate = 1 + parent_depth
        except (KeyError, TypeError):
            # KeyError: parent id not in the index. TypeError: get_loc returned
            # a slice/mask because the index is not unique.
            candidate = 1
        deepest = max(deepest, candidate)
    return deepest
# Depth of every individual. The list is filled in place because get_depth
# reads already-computed entries of this same list as its cache.
depth = [0] * len(ped)
for i in range(len(ped)):
    depth[i] = get_depth(ped, i, depth)
max(depth)
ped['depth'] = depth
# `ped` is indexed by iid strings, so the first row must be taken by position.
# An integer key on a label-indexed Series is deprecated positional fallback.
ped.depth.iloc[0]
ped.ad.value_counts()
ped
def trim_trees(ped, depth_cut=3):
    """Run ``trim_tree`` on each family and concatenate the resulting flags.

    Families are visited in order of first appearance of their ``fid`` in
    ``ped``. The returned list therefore lines up with the rows of ``ped``
    only when each family's rows are contiguous (the original note asks for
    ``ped`` to be sorted by family and depth).
    """
    flags = []
    for family_id in ped.fid.unique():
        family = ped[ped.fid == family_id]
        flags.extend(trim_tree(family, depth_cut))
    return flags
# Flag rows of one family for trimming; returns a list of booleans, one per row of `fi`.
# NOTE(review): indentation was lost in this copy of the file, so it cannot be
# determined from here which `if` the `else:` below pairs with — confirm against the notebook.
def trim_tree(fi,depth_cut=3):
# Flags live in a Series with a default 0..n-1 index and are addressed by row position.
trim = pd.Series([False]*len(fi))
for i,r in enumerate(fi.iterrows()):
# iterrows() yields (label, row) pairs; keep only the row.
r=r[1]
if r.depth>depth_cut:
if r.vcf==False and r.ad!=2:
# Leaf node: trim it. Otherwise trim it only once all of its children are trimmed.
# `ch_r` marks the rows whose father or mother id is this individual.
ch_r = (fi.fathid==r.iid) | (fi.mothid==r.iid)
if not ch_r.any():
trim[i] = True
elif trim[list(ch_r)].all():
trim[i] = True
else:
# Trim founders (both parent ids '0') that have no children, or whose children are all trimmed.
if r.fathid=='0' and r.mothid=='0':
ch_r = (fi.fathid==r.iid) | (fi.mothid==r.iid)
if not ch_r.any():
trim[i] = True
elif trim[list(ch_r)].all():
trim[i] = True
return list(trim)
# Sort by family and depth, both descending, so deeper rows of a family come first.
ped = ped.sort_values(['fid','depth'],ascending=False)
trim = trim_trees(ped,depth_cut=3)
# Number of flagged rows, and total number of flags (display only).
sum(trim)
len(trim)
# The flag list is assigned positionally, so it relies on the row order established by the sort above.
ped['trim'] = trim
ped.to_csv('data/nodp_ped_with_depth_trim.csv',header=True,index=False)
def trim_trees(ped):
    """Run ``trim_tree_all`` on each family and concatenate the resulting flags.

    ``ped`` is first sorted by ``fid`` and ``depth``, both descending, and the
    flags are returned in that sorted row order, not in the order of the
    rows passed in. Defining this function rebinds the name ``trim_trees``,
    replacing any earlier definition with the same name.
    """
    ordered = ped.sort_values(['fid', 'depth'], ascending=False)  # deepest rows first
    flags = []
    for family_id in ordered.fid.unique():
        flags.extend(trim_tree_all(ordered[ordered.fid == family_id]))
    return flags
# Decide which members of one family to trim, in two passes over `fi`;
# returns a list of booleans, one per row of `fi`.
# NOTE(review): indentation was lost in this copy of the file, so the nesting of the
# `if`/`elif` chain in the second loop cannot be recovered from here — confirm against the notebook.
def trim_tree_all(fi):
n = len(fi)
# Flags live in a Series with a default 0..n-1 index and are addressed by row position.
trim = pd.Series([False]*n)
# Pass 1: rows in the order given.
for i in range(n):
r=fi.iloc[i]
# Rows whose father or mother id is this individual, i.e. its children within `fi`.
ch_r = (fi.fathid==r.iid) | (fi.mothid==r.iid)
if not ch_r.any() or trim[list(ch_r)].all(): #leaf
if r.vcf==False and r.ad!=2:
trim[i] = True
# Pass 2: rows in reverse order.
for i in range(n-1, -1, -1):
r=fi.iloc[i]
ch_r = (fi.fathid==r.iid) | (fi.mothid==r.iid)
# `and` binds tighter than `or`: either both parent ids are '0', or every row
# whose iid equals one of this row's parent ids is already flagged
# (`.all()` on an empty selection is True).
if r.fathid=='0' and r.mothid=='0' or trim[list(fi.iid.isin([r.fathid,r.mothid]))].all():
if not ch_r.any():
trim[i] = True
elif trim[list(ch_r)].all():
trim[i] = True
elif not fi.vcf[ch_r].any(): # no vcf in any children
trim[i] = True
elif (fi.ad[ch_r]!=2).all() and r.ad!=2: # no ad == 2 in this row or any of its children
trim[i] = True
return list(trim)
# Re-run trimming with the sort-then-trim variant of trim_trees. `new_ped` is
# sorted the same way trim_trees sorts internally, so the returned flags line
# up with its rows.
new_ped = ped.sort_values(['fid', 'depth'], ascending=False).copy()
trim = trim_trees(new_ped)
new_ped['trim'] = trim
new_ped.to_csv('data/nodp_ped_with_trim_bottomup_topdown.csv', header=True, index=False)

# Keep the untrimmed rows as an independent frame. Without .copy() the column
# assignments below act on a filtered selection and trigger pandas'
# SettingWithCopy machinery.
trim_ped = new_ped[new_ped.trim == False].copy()

# Recompute depths within the trimmed pedigree. The list is filled in place
# because get_depth reads already-computed entries of this same list.
depth = [0] * len(trim_ped)
for i in range(len(trim_ped)):
    depth[i] = get_depth(trim_ped, i, depth)
max(depth)
trim_ped['depth'] = depth
trim_ped.depth.value_counts()

# Rows at depth 1 become founders. `.loc` replaces the chained
# `trim_ped.fathid[mask] = '0'`, which is a no-op under pandas Copy-on-Write.
top_rows = (trim_ped.depth == 1).to_numpy()
trim_ped.loc[top_rows, 'fathid'] = '0'
trim_ped.loc[top_rows, 'mothid'] = '0'
# Per-family maximum depth before trimming (from new_ped) and after trimming (from trim_ped).
trim_smry = pd.DataFrame([[new_ped.depth[new_ped.fid == i].max() for i in trim_ped.fid.unique()],[trim_ped.depth[trim_ped.fid == i].max() for i in trim_ped.fid.unique()]])
# Built as two rows; transpose so that each family is a row.
trim_smry = trim_smry.T
trim_smry.index = trim_ped.fid.unique()
trim_smry.columns = ['before','after']
trim_smry.hist()
# Counts (display only): families brought from depth > 3 down to < 4, families deeper
# than 3 before trimming, and families still deeper than 3 afterwards.
len(trim_smry.index[(trim_smry.before>3) & (trim_smry.after<4)])
sum(trim_smry.before>3)
sum(trim_smry.after>3)
# Call plotped once per family of new_ped (the untrimmed rows), grouped into three
# folders by depth before/after trimming.
# NOTE(review): plotped comes from SEQLinkage.Plot; presumably output=True writes a file into `folder` — confirm.
dots1 = [plotped(new_ped[new_ped.fid ==i],output=True,folder='data/new_ADfam_depthless4') for i in trim_smry.index[trim_smry.before<4]]
dots2 = [plotped(new_ped[new_ped.fid ==i],output=True,folder='data/new_ADfam_trimless4') for i in trim_smry.index[(trim_smry.before>3) & (trim_smry.after<4)]]
dots3 = [plotped(new_ped[new_ped.fid ==i],output=True,folder='data/new_ADfam_trimmore4') for i in trim_smry.index[trim_smry.after>3]]
# IPython shell escapes (not Python).
# NOTE(review): each `!` line presumably runs in its own subshell, so this `cd` would not
# affect the `tar` command on the next line — confirm (IPython's `%cd` persists).
!cd data/
!tar -zcvf morethan4_aft_trim.tar.gz morethan4_aft_trim
# Per-family row counts before and after trimming.
trim_smry1 = pd.DataFrame([[len(new_ped[new_ped.fid == i]) for i in trim_ped.fid.unique()],[len(trim_ped[trim_ped.fid == i]) for i in trim_ped.fid.unique()]])
trim_smry1 = trim_smry1.T
trim_smry1.index = trim_ped.fid.unique()
trim_smry1.columns = ['before','after']
# The ten families with the most rows before trimming.
trim_smry1.sort_values('before',ascending=False)[:10]
# Rebinds `dots3`, discarding the list assigned above.
dots3 = [plotped(new_ped[new_ped.fid ==i],output=True,folder='data/bigest_families') for i in trim_smry1.sort_values('before',ascending=False)[:10].index]
!tar -zcvf data/bigest_families.tar.gz data/bigest_families/*.svg
# Every non-'0' father or mother id referenced in the trimmed pedigree.
all_parents = set(trim_ped.fathid[trim_ped.fathid!='0']).union(set(trim_ped.mothid[trim_ped.mothid!='0']))
# Referenced parents that have no row of their own in trim_ped; their rows are taken from new_ped.
missing_parents_ped = new_ped[new_ped.iid.isin(all_parents.difference(set(trim_ped.iid)))].copy()
# Re-add them with both parent ids set to '0'.
missing_parents_ped.fathid = '0'
missing_parents_ped.mothid = '0'
trim_ped = pd.concat([trim_ped,missing_parents_ped])
trim_ped = trim_ped.sort_values('fid')
# Recompute depths for the extended pedigree; the list doubles as get_depth's cache while it is filled.
depth = [0]*len(trim_ped)
for i in range(len(trim_ped)):
depth[i] = get_depth(trim_ped,i,depth)
trim_ped['depth'] = depth
# Spot-check individual families (display only).
plotped(new_ped[new_ped.fid =='4_558'])
plotped(new_ped[new_ped.fid =='3798'])
plotped(new_ped[new_ped.fid =='348'])
# Value counts for the re-added parents (the `ad` count is repeated).
missing_parents_ped.ad.value_counts()
missing_parents_ped.vcf.value_counts()
missing_parents_ped.ad.value_counts()
# Leftover commented-out scratch code:
# trees = None
# for i in ped.fid.unique():
def merge_trees(trees):
    """Merge node collections that share at least one node.

    Starting from the first collection not yet absorbed, every remaining
    collection that intersects the growing group is folded into it, and the
    scan repeats until the group stops growing. Then the next group is
    started from the first collection still left over.

    Args:
        trees: Sequence of iterables of hashable node ids.

    Returns:
        list: One ``set`` per merged group, in order of each group's first
        collection. A finished group that is empty is dropped unless it is
        the last one, and an empty ``trees`` yields ``[[]]``.
    """
    merged = []
    current = []
    pending = list(range(len(trees)))  # positions not yet absorbed, ascending
    while pending:
        # Close the previous group (skipped while empty) and seed the next one.
        if current:
            merged.append(current)
        current = set(trees[pending.pop(0)])
        grew = True
        while grew:
            grew = False
            left_over = []
            for idx in pending:
                if current.isdisjoint(trees[idx]):
                    left_over.append(idx)
                else:
                    current.update(trees[idx])
                    grew = True
            pending = left_over
    # The last group is always emitted, even when it is empty.
    merged.append(current)
    return merged
def get_trees(fi):
    """Return one node list per depth-1 row of ``fi``.

    Each list is whatever ``get_desnodes`` collects when started from that
    row with a fresh, empty accumulator. The lists come in row order.
    """
    top_rows = np.where(fi.depth == 1)[0]  # row positions with depth 1
    return [get_desnodes(fi, row_pos, []) for row_pos in top_rows]
def get_desnodes(fi, i, value):
    """Collect the ids of the individual at row ``i`` and all of its descendants.

    A row is a child of an individual when its ``fathid`` or ``mothid``
    equals that individual's ``iid``. The walk is iterative and visits each
    row at most once, so it terminates even on cyclic parent links and does
    not re-append the accumulator to itself as the recursive version did.

    Args:
        fi: Pedigree rows of one family with ``iid``, ``fathid``, ``mothid``.
        i: Row position to start from.
        value: List of ids collected so far. Newly found ids are appended to
            it (once each), and its existing entries are part of the result.

    Returns:
        list: Unique ids from ``value`` plus the start individual and its
        descendants, in no particular order.
    """
    iids = fi.iid.tolist()
    already = set(value)
    found = set()
    visited_rows = set()
    pending = [int(i)]
    while pending:
        pos = pending.pop()
        if pos in visited_rows:
            continue
        visited_rows.add(pos)
        node = iids[pos]
        found.add(node)
        ch_r = (fi.fathid == node) | (fi.mothid == node)
        pending.extend(int(j) for j in np.where(ch_r)[0])
    # Keep the accumulator contract: the caller's list also receives the new ids.
    value.extend(node for node in found if node not in already)
    return list(already | found)
# Print every family id for which get_trees returns no tree (i.e. no row with depth 1).
for i in trim_ped.fid.unique():
if len(get_trees(trim_ped[trim_ped.fid ==i]))==0:
print(i)
# For each family, in order of first appearance of its fid: merged groups of node ids.
tree_nodes = [merge_trees(get_trees(trim_ped[trim_ped.fid ==i])) for i in trim_ped.fid.unique()]
tree_nodes[0]
def label_subtree(trim_ped, tree_nodes):
    """Assign each row the index of the subtree that contains its individual.

    Args:
        trim_ped: Pedigree DataFrame with ``fid`` and ``iid`` columns.
        tree_nodes: One entry per family, in the order of
            ``trim_ped.fid.unique()``. Each entry is a list of node-id
            collections (the subtrees of that family).

    Returns:
        list: One tag per row of ``trim_ped``, in row order. The tag is the
        position of the subtree containing the row's ``iid`` within its
        family's entry (the last match wins), or ``None`` if no subtree
        contains it.

    Note:
        Tags are written by row position. The earlier version concatenated
        per-family results, which only lined up with the rows when each
        family's rows were contiguous. For such input the result is unchanged.
    """
    fids = trim_ped.fid.tolist()
    iids = trim_ped.iid.tolist()
    # Row positions of each family, in row order.
    rows_by_fid = {}
    for pos, fid in enumerate(fids):
        rows_by_fid.setdefault(fid, []).append(pos)
    tag = [None] * len(fids)
    for fid, node_list in zip(trim_ped.fid.unique(), tree_nodes):
        rows = rows_by_fid.get(fid, ())
        for i, ns in enumerate(node_list):
            members = set(ns)
            for pos in rows:
                if iids[pos] in members:
                    tag[pos] = i
    return tag
# Subtree index per row, then a combined '<fid>:<subtree index>' label per row.
tag = label_subtree(trim_ped,tree_nodes)
tmp = [i+':'+str(j) for i,j in zip(trim_ped.fid,tag)]
trim_ped['tag'] = tmp
# Tags whose rows include at most one individual flagged in the VCF.
one_vcf_list = []
for i in set(trim_ped.tag):
if sum(trim_ped.vcf[trim_ped.tag==i])<=1:
one_vcf_list.append(i)
# Drop those subtrees and use the tag as the family id from here on.
new_trim_ped = trim_ped[~trim_ped.tag.isin(one_vcf_list)].copy()
new_trim_ped.fid = new_trim_ped.tag
new_trim_ped.to_csv('data/new_trim_ped.csv',header=True,index=False)
# Tab-separated export without the last four columns, header or index.
# NOTE(review): presumably the dropped columns are vcf/depth/trim/tag, leaving the six .fam columns — confirm.
new_trim_ped.iloc[:,:-4].sort_values('fid').to_csv('data/new_trim_ped_fam.fam',header=False,index=False,sep='\t')
import pandas as pd
# Reload the trimmed pedigree.
# NOTE(review): this reads from '../data/' whereas the file was written to 'data/' —
# presumably this cell was run from a different working directory; confirm.
new_trim_ped = pd.read_csv('../data/new_trim_ped.csv')
# Rows per family id; keep families with fewer than 17 rows.
tmp = new_trim_ped.fid.value_counts()
famless17 = new_trim_ped[new_trim_ped.fid.isin(tmp[tmp<17].index)]
famless17.iloc[:,:-4].sort_values('fid').to_csv('../data/new_trim_ped_famless17.fam',header=False,index=False,sep='\t')
# Rows whose tag ends in the character '0' (this also matches tags ending in e.g. '10').
tmp = [True if x[-1]=='0' else False for x in famless17.tag]
tmp = famless17[tmp]
# Strip the last two characters of each fid.
tmp.fid = [x[:-2] for x in tmp.fid]
tmp.iloc[:,:-4].sort_values('fid').to_csv('../data/new_trim_ped_famless17_no:xx.fam',header=False,index=False,sep='\t')
# NOTE(review): `tmp` is a DataFrame at this point (rebound two statements above), not the
# boolean list — this indexing is presumably a leftover from an earlier cell state; confirm.
famless17.fid[tmp].value_counts()
tmp = new_trim_ped.fathid.value_counts()
tmp
new_trim_ped.mothid.value_counts()
new_trim_ped[new_trim_ped.fid.isin(['10R_R114:0'])]
# Copy of trim_ped whose `trim` column is overwritten with the subtree index of each row.
tmp = trim_ped.copy()
tmp.trim = tag
# Number of families having a row with subtree index greater than 1.
len(set(tmp.fid[tmp.trim>1]))
import shutil
from pathlib import Path
# Copy the SVG plot of every family that has a row with subtree index > 1 into
# data/multiple_subtrees/; the list comprehension is used only for its side effect.
[shutil.copy2('data/trim_ped_plot/ADfam_'+i+'.svg', 'data/multiple_subtrees/') for i in set(tmp.fid[tmp.trim>1])] # target filename is /dst/dir/file.ext
# IPython shell escape (not Python): archive the copied plots.
!tar -zcvf data/multiple_subtrees.tar.gz data/multiple_subtrees
def copyfile(file_path, dest='../data/multiple_subtrees/'):
    """Copy ``file_path`` to ``dest`` if it is an existing regular file.

    Paths that do not point to a file are skipped silently.

    Args:
        file_path: Path of the file to copy.
        dest: Destination passed to ``shutil.copy2`` (a directory or a target
            file name). Defaults to the previously hard-coded location.

    Returns:
        None
    """
    if Path(file_path).is_file():
        shutil.copy2(file_path, dest)
# For each listed family id, try the PNG plot in each of the three plot folders;
# copyfile skips paths that are not existing files.
for i in [223,597,508,3324,197,1317,'4_680',3761,'4_393',546,359,216,591,'4_162',215]:
copyfile('../data/new_ADfam_depthless4/ADfam_'+str(i)+'.png')
copyfile('../data/new_ADfam_trimmore4/ADfam_'+str(i)+'.png')
copyfile('../data/new_ADfam_trimless4/ADfam_'+str(i)+'.png')
# The 20 rows with the largest `trim` values, and the number of distinct families (display only).
tmp.sort_values('trim')[-20:]
len(set(tmp.fid))
# NOTE(review): plotped1 is defined further down in this file, so these calls only work
# once that definition has been executed.
dots4 = [plotped1(tmp[tmp.fid ==i],output=True,folder='data/trim_ped_plot') for i in set(tmp.fid)]
plotped1(tmp[tmp.fid =='3761'])
from graphviz import Digraph
def update_attributes(r):
    """Build the node attribute dict for one pedigree row.

    Args:
        r: Object exposing ``sex``, ``ad`` and ``vcf`` attributes (e.g. a
            DataFrame row) and optionally ``trim``.

    Returns:
        dict: Attributes with ``shape`` chosen from ``r.sex``, ``fillcolor``
        from ``r.ad`` and ``style`` from ``r.vcf``. ``fontcolor`` is chosen
        from ``r.trim`` when that value has a colour assigned, and stays
        ``'black'`` otherwise.

    Raises:
        KeyError: If ``r.sex``, ``r.ad`` or ``r.vcf`` has no mapping.
    """
    attributes = {'shape': 'polygon', 'height': '0.3', 'width': '0.5', 'regular': '0',
                  'style': "filled,setlinewidth(4)", 'fontcolor': 'black'}
    gender = {"m": "box", "1": "box", 1: "box", "f": "ellipse", "2": "ellipse", 2: "ellipse", 0: "polygon"}
    trait = {1: 'white', 2: 'dimgrey', -9: 'aquamarine3'}
    vcf = {True: "filled,setlinewidth(4)", False: "filled"}
    trim = {0: "aqua", 1: "red", 2: "blue", 3: "orange", 4: "yellow", 5: "green"}
    attributes['shape'] = gender[r.sex]
    attributes['fillcolor'] = trait[r.ad]
    attributes['style'] = vcf[r.vcf]
    try:
        attributes['fontcolor'] = trim[r.trim]
    except (AttributeError, KeyError, TypeError):
        # No `trim` field, a value without an assigned colour, or an unhashable
        # value: keep the default font colour. Narrower than a bare `except:`.
        pass
    return attributes
# Draw one family as a graphviz Digraph and return it; when `output` is true the graph is
# also rendered to '<folder>/ADfam_<first fid>.<format>'.
# NOTE(review): indentation was lost in this copy of the file, so the exact nesting of the
# statements inside the row loop cannot be confirmed from here.
def plotped1(fi,output=False,folder='',format='svg'):
# Plan:
#nodes
#create iid nodes
#create parent nodes
#edges
#parent to iid
#fathid,mothid to parent
# Create Digraph object
dot = Digraph()
# Add nodes
# Ids of the '<fathid>x<mothid>' couple nodes seen so far.
parents = []
for i,r in fi.iterrows():
# One node per individual, styled by update_attributes.
dot.node(r.iid,shape='box',_attributes=update_attributes(r))
# Individuals with both parents recorded hang off a couple node.
if r.fathid !='0' and r.mothid !='0':
parents.append(r.fathid+'x'+r.mothid)
dot.edge(r.fathid+'x'+r.mothid,r.iid)
#edge to children
# Rows whose father or mother id is this individual.
ch_r = (fi.fathid==r.iid) | (fi.mothid==r.iid)
if ch_r.any():
# One edge from this individual to each distinct '<fathid>x<mothid>' id among its children.
for p in fi[ch_r][['fathid','mothid']].agg('x'.join, axis=1).unique():
dot.edge(r.iid,p)
# Couple nodes are drawn as small unlabeled diamonds.
for p in set(parents):
dot.node(p,shape='diamond',label='',height='.1',width='.1')
if output:
dot.render(outfile=folder+'/ADfam_'+str(list(fi.fid)[0])+'.'+format,format=format,overwrite_source=True)
return dot
# Scratch cells: spot-check plots of individual families (display only).
plotped(trim_ped[trim_ped.fid =='359'])
plotped(trim_ped[trim_ped.fid =='359'])
# NOTE(review): `fi` and `ch_r` are not assigned at module level anywhere in the visible
# source — presumably left over from interactive debugging; confirm.
trees = get_trees(fi)
plotped(new_ped[new_ped.fid =='4_364'])
np.where(ch_r)[0][0]
plotped(new_ped[new_ped.fid =='1005'])
plotped(new_ped[new_ped.fid =='336'])
plotped(new_ped[new_ped.fid =='4_364'])
plotped(new_ped[new_ped.fid =='4_501'])
plotped(new_ped[new_ped.fid =='3761'])
plotped(new_ped[new_ped.fid =='930'])
# NOTE(review): `new_ped1` is not defined anywhere in the visible source.
new_ped1[new_ped1.fid=='930']
dot1 = [plotped(new_ped[new_ped.fid ==i],output=True,folder='data/ADfam_new_trim_4') for i in new_ped1[list(pd.Series(depth)>3)].fid.unique()]
# Compare per-family maximum depth among the untrimmed rows (column 0) with all rows (column 1).
trim_ped = ped[ped.trim==False]
tmp = pd.DataFrame([[trim_ped.depth[trim_ped.fid == i].max() for i in trim_ped.fid.unique()],[ped.depth[ped.fid == i].max() for i in trim_ped.fid.unique()]])
tmp = tmp.T
tmp.index = trim_ped.fid.unique()
tmp.hist()
# Number of families with maximum depth below 4 in each column.
sum(tmp[0]<4)
sum(tmp[1]<4)