--- title: Title keywords: fastai sidebar: home_sidebar nb_path: "nbs/05_ADFam-Copy1.ipynb" ---
import dask as da
import dask.dataframe as dd
import pandas as pd
import gzip
def get_vcf_names(vcf_path):
    """Return the column names from the '#CHROM' header line of a gzipped VCF.

    Parameters
    ----------
    vcf_path : str
        Path to a gzip/bgzip-compressed VCF file.

    Returns
    -------
    list of str
        The tab-separated fields of the '#CHROM' header line (first nine are
        the fixed VCF columns; the rest are sample IDs).

    Raises
    ------
    ValueError
        If the file contains no '#CHROM' header line (the original code
        raised an UnboundLocalError in that case).
    """
    # 'rt' decodes the gzip stream to text; the with-block closes the file,
    # so the explicit ifile.close() of the original is unnecessary.
    with gzip.open(vcf_path, "rt") as ifile:
        for line in ifile:
            if line.startswith("#CHROM"):
                return [field.strip() for field in line.split('\t')]
    raise ValueError(f"no '#CHROM' header line found in {vcf_path}")
def read_vcf_chunk(fn, chunksize=10):
    """Read the first ``chunksize`` data rows of a VCF file into a DataFrame.

    Parameters
    ----------
    fn : str
        Path to the VCF file (expected gzip-compressed; see note below).
    chunksize : int, default 10
        Number of rows to read.

    Returns
    -------
    pandas.DataFrame
        The first ``chunksize`` rows, with columns taken from the VCF's
        '#CHROM' header line.
    """
    names = get_vcf_names(fn)
    try:
        # Try gzip first; get_chunk() is inside the try because pandas may
        # not touch the stream (and hence surface a decompression error)
        # until the first chunk is actually read.
        reader = pd.read_csv(fn, compression='gzip', comment='#',
                             chunksize=chunksize, delim_whitespace=True,
                             header=None, names=names)
        return reader.get_chunk(chunksize)
    except (OSError, ValueError):
        # Narrowed from the original bare `except:` (which also swallowed
        # KeyboardInterrupt etc.). NOTE(review): this plain-text fallback can
        # probably never succeed, since get_vcf_names() above already requires
        # the file to be gzip-readable — confirm before relying on it.
        reader = pd.read_csv(fn, comment='#', chunksize=chunksize,
                             delim_whitespace=True, header=None, names=names)
        # get_chunk() already returns a DataFrame; the original's extra
        # pd.DataFrame(...) wrap was redundant.
        return reader.get_chunk(chunksize)
# --- Notebook-export residue: exploratory cells flattened into a script. ---
# NOTE(review): many lines below are bare expressions (notebook cell output),
# a few are raw markdown/prose, and two are syntax errors; this section is
# not runnable as-is and is kept verbatim for reference.
vcf = read_vcf_chunk('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/full_sample/vcf/full_sample.vcf.gz', chunksize=10)
# NOTE(review): incomplete assignment below — syntax error left from an unfinished cell.
chrX =
fam = pd.read_csv('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/full_sample/bfiles/full_sample.fam',delim_whitespace=True, header=None,names = ['fid','iid','father','mother','gender','trait'])
# Count how many .fam sample IDs line up positionally with the VCF sample columns
# (VCF genotype columns start at index 9, after the fixed fields).
sum(fam.iid == pd.Series(vcf.columns[9:]))
fam
# NOTE(review): the '?...' lines are markdown questions, not Python.
?Questions for Chong: why do the last rows use characters? what is the meaning of last colnum.
fam.shape
anno = pd.read_csv('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/full_sample/rare_positions/full_sample_coding.hg38_multianno.txt',delim_whitespace=True)
anno
anno.AF.value_counts()
?Question for Chong. all of the AF are ., which means there is something wrong with the annotation.
[anno[[i]].value_counts() for i in anno.columns[5:]]
pheno_full_sample_path = '/mnt/mfs/statgen/alzheimers-family/linkage_files/pheno/full_sample/'
# NOTE(review): the next line is a directory listing pasted as text, not code.
efiga_pedigree.txt full_sample_efi_nia.fam full_sample_fam_id.txt full_sample_fam_pop.txt full_sample_id_list.txt full_sample_pheno.txt niaload_pedigree.txt
efiga = pd.read_csv(pheno_full_sample_path+'efiga_pedigree.txt',delim_whitespace=True)
efiga
efi_nia_fam = pd.read_csv(pheno_full_sample_path+'full_sample_efi_nia.fam',delim_whitespace=True,header=None,names = ['fid','iid','father','mother','gender','trait'])
efi_nia_fam
# Spot-check one individual and its family across the pedigree files.
efi_nia_fam[efi_nia_fam.iid == '167_DCH23.54']
efi_nia_fam[efi_nia_fam.fid == '167']
efi_nia_txt = pd.read_csv(pheno_full_sample_path+'full_sample_fam_id.txt',delim_whitespace=True,header=None,names = ['fid','iid','father','mother','gender','trait','id'])
efi_nia_txt
efi_nia_txt[efi_nia_txt.iid == '167_DCH23.54']
fam_pop = pd.read_csv(pheno_full_sample_path+'full_sample_fam_pop.txt',delim_whitespace=True,header=None,names = ['fid','pop'])
fam_pop
sample_id_list = pd.read_csv(pheno_full_sample_path+'full_sample_id_list.txt',delim_whitespace=True,header=None,names = ['id'])
sample_id_list
sample_pheno = pd.read_csv(pheno_full_sample_path+'full_sample_pheno.txt',sep='\t')
sample_pheno
sample_pheno[sample_pheno.ID == '167_DCH23.54']
sample_pheno.describe(include='all')
nia_ped = pd.read_csv(pheno_full_sample_path+'niaload_pedigree.txt',delim_whitespace=True)
nia_ped
nia_ped[nia_ped.Sample_ID == '167_DCH23.54']
s1_vcf = read_vcf_chunk('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/sample_i/rare_positions/sample_i_coding.hg38_multianno.vcf.gz')
s1_vcf
s1_fam = pd.read_csv('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/sample_i/rare_positions/sample_i_coding.hg38_multianno.fam',delim_whitespace=True, header=None,names = ['fid','iid','father','mother','gender','trait'])
s1_fam
s1_anno = pd.read_csv('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/sample_i/rare_positions/sample_i_coding.hg38_multianno.txt',delim_whitespace=True)
s1_anno
s2_vcf = read_vcf_chunk('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/sample_ii/rare_positions/sample_ii_coding.hg38_multianno.vcf.gz')
s2_vcf
# NOTE(review): s2_fam is used here but defined on the next line — the
# notebook cells were executed out of order.
s2_vcf.columns[9:]==s2_fam.iid[:222]
s2_fam = pd.read_csv('/mnt/mfs/statgen/alzheimers-family/linkage_files/pheno/sample_ii/small_sample_ii.fam',delim_whitespace=True, header=None,names = ['fid','iid','father','mother','gender','trait'])
s2_fam
# Compare per-family member counts between rows 222+ and the first 222 rows.
tmp1=s2_fam.fid[222:].value_counts()
tmp2 = s2_fam.fid[:222].value_counts()
pd.DataFrame([[i,tmp1[i],tmp2[i]] for i in tmp1.keys()],columns=['fid','all','vcf'])
# NOTE(review): empty subscript below — syntax error left from an unfinished cell.
tmp2[]
s2_fam[s2_fam.fid == '3761']
s2_anno = pd.read_csv('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/sample_ii/rare_positions/sample_ii_coding.hg38_multianno.txt',delim_whitespace=True)
s2_anno
s2_fam[s2_fam.duplicated()]
s2_vcf.columns[9:]
len(s2_vcf.columns[9:])
s2_fam[s2_fam.iid == '167_28']
s2_fam[s2_fam.iid == '1819_10']
s2_fam[s2_fam.iid == '167_DCH23.54']
s2_fam[210:230]
# Write the de-duplicated sample_ii pedigree for the MWE dataset.
s2_fam.drop_duplicates().to_csv('/mnt/mfs/statgen/alzheimers-family/yhseqlink/data/MWE/sample2_uniq.fam',header=False,index=False,sep='\t')
# NOTE(review): the next three lines are markdown notes/questions, not code.
coding_region_rare_variant_positions.txt
why do you choose these snps?
vcf hg19? hg38?
hg19 = pd.read_csv('/home/yh3455/.SEQLinkage/genemap.hg19.txt',header=None,sep='\t')
hg38 = pd.read_csv('/home/yh3455/.SEQLinkage/genemap.hg38.txt',header=None,sep='\t')
hg19
hg38
anno = pd.read_csv('./MWE/annotation/EFIGA_NIALOAD_chr18.hg38.hg38_multianno.csv')
# Keep variant coordinates plus the population allele-frequency columns.
maf = anno[['Chr','Start','End','Ref','Alt','AF','AF_afr','AF_ami','AF_amr','AF_asj','AF_eas','AF_fin','AF_nfe','AF_oth','AF_sas']]
maf.iloc[:,5:]
fam_anc = pd.read_csv(pheno_full_sample_path+'full_sample_fam_pop.txt',delim_whitespace=True,header=None,index_col=0)
# NOTE(review): fam_anc is loaded with header=None, so it has no 'anc' column
# label — this cell presumably ran against a differently-loaded frame; verify.
sum(fam_anc.anc.isin(['AF','AF_raw','AF_male','AF_female','AF_afr','AF_ami','AF_amr','AF_asj','AF_eas','AF_fin','AF_nfe','AF_oth','AF_sas']))
maf_gene = maf.iloc[:,5:]
# NOTE(review): np is used here but `import numpy as np` only appears below
# (out-of-order cells).
maf_gene = maf_gene.replace('.',np.nan)
sum(maf_gene.isna().all(axis=1))
sum(maf_gene.AF.isna())
maf_gene = maf_gene.astype(np.float64)
# Floor exact-zero allele frequencies at 1e-5.
maf_gene = maf_gene.replace(0,10**-5)
maf_gene[maf_gene.sum(axis=1)==0] = 10**-5
maf_gene[maf_gene.AF.isna() & (maf_gene.isna().all(axis=1)==False)]
for i in ['AF','AF_afr','AF_ami','AF_amr','AF_asj','AF_eas','AF_fin','AF_nfe','AF_oth','AF_sas']:
print(i,sum(maf_gene[i].isna()))
import numpy as np
# Hierarchical clustering of the population-AF correlation matrix.
X = maf_gene.corr()
# Zero the diagonal so squareform() accepts X as a distance-like matrix.
X.values[[np.arange(X.shape[0])]*2] = 0
import scipy
from scipy.spatial.distance import squareform
v = squareform(X)
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt
X.columns
Z = linkage(v, 'ward')
fig = plt.figure(figsize=(20, 10))
dn = dendrogram(Z,labels=list(X.columns))
plt.show()
len(v)
X.shape
maf_gene.sum(axis=1)
maf_gene.corr()
vcf
# NOTE(review): unfinished function — the body below is a stub plus a stray
# pasted fragment; it never uses its parameters and will not run as written.
def get_fam_mafs(geno,anno,famid):
fam_mafs = {}
# NOTE(review): the next line looks like two fragments fused together
# ('fam_mafs' + 'fam_anc.loc[...]'); the original intent is unrecoverable here.
fam_mafsfam_anc.loc['4_44'].values[0]
# NOTE(review): `tmp` is undefined at this point — leftover cell output.
tmp.values[0]
bim = pd.read_csv('../sample_i/bfiles/small_sample_i.bim',header=None,sep='\t')
# PLINK .bim columns: chromosome, SNP id, genetic distance, position, allele 0, allele 1.
bim.columns = ['chrom','snp','i','pos','a0','a1']
sum(bim['chrom']==8)
bim['chrom'].value_counts()
geno = pd.read_csv('data/genemap.hg38.txt',header=None,sep ='\t')
# Keep only genemap rows whose gene symbol (column 3) is in the VIP gene list.
vipgeno = geno[list(geno[[3]].isin(['CACNG7','LOC101928105','TNFRSF21','DMAP1','SPRED1','RAP2B'])[3])]
vipgeno
vipgeno.to_csv('data/vipgenemap.hg38.txt',header=False,index=False,sep='\t')
pheno_full_sample_path = '/mnt/mfs/statgen/alzheimers-family/linkage_files/pheno/full_sample/'
# NOTE(review): directory listing pasted as text, not code.
efiga_pedigree.txt full_sample_efi_nia.fam full_sample_fam_id.txt full_sample_fam_pop.txt full_sample_id_list.txt full_sample_pheno.txt niaload_pedigree.txt
pheno_df = pd.read_csv('/mnt/mfs/statgen/alzheimers-family/pheno/pheno_modified/AD.txt',header=0,sep='\t')
efiga_ped = pd.read_csv(pheno_full_sample_path+'efiga_pedigree.txt',delim_whitespace=True)
efiga_ped
nia_ped = pd.read_csv(pheno_full_sample_path+'niaload_pedigree.txt',delim_whitespace=True)
nia_ped[nia_ped.FID =='27_25']
pheno_df
efi_nia_fam = pd.read_csv(pheno_full_sample_path+'full_sample_efi_nia.fam',delim_whitespace=True,header=None,names = ['fid','iid','father','mother','gender','trait'])
efi_nia_txt = pd.read_csv(pheno_full_sample_path+'full_sample_fam_id.txt',delim_whitespace=True,header=None,names = ['fid','iid','father','mother','gender','trait','id'])
efi_nia_txt
sample_id_list = pd.read_csv(pheno_full_sample_path+'full_sample_id_list.txt',delim_whitespace=True,header=None,names = ['id'])
sample_id_list
sum(pheno_df.IID == sample_id_list.id)
sample_pheno = pd.read_csv(pheno_full_sample_path+'full_sample_pheno.txt',sep='\t')
sample_pheno
sample_pheno[sample_pheno.ID == '167_DCH23.54']
sample_pheno.describe(include='all')
pheno_df
all_fam = pd.read_csv('all_sample.fam',delim_whitespace=True,header=None,names=['fid','iid','fathid','mothid','sex','ad'])
# Flag which individuals have genotype columns in the loaded VCF chunk.
all_fam['vcf'] = list(all_fam.iid.isin(vcf.columns))
all_fam[all_fam.iid.isin(['10R_R99_8','215_59','27_25_','4_595_18','4_595_69','4_603_43'])]
famid = all_fam.fid.value_counts()
# Split singleton families from families with two or more members.
one_fam = all_fam[all_fam.fid.isin(famid[famid==1].keys())]
sum(one_fam.vcf)
twom_fam = all_fam[all_fam.fid.isin(famid[famid==1].keys())==False]
twom_fam.to_csv('twoormore_member_fam.csv',header=False,index=False)
fmid=list(set(twom_fam.fathid))+list(set(twom_fam.mothid))
def create_founder(fam, foid='fathid'):
    """Build placeholder founder rows for parents referenced in ``fam``
    but absent from its ``iid`` column.

    Parameters
    ----------
    fam : pandas.DataFrame
        Pedigree table with columns 'fid','iid','fathid','mothid','sex','ad'
        and (as used here) a boolean 'vcf' column.
    foid : {'fathid', 'mothid'}, default 'fathid'
        Which parent column to scan for missing founders.

    Returns
    -------
    pandas.DataFrame
        One row per *child row* whose parent is missing ('0' parents are
        skipped), so a parent shared by several children appears multiple
        times — callers de-duplicate afterwards. Founders get unknown
        parents ('0'), missing phenotype (-9) and vcf=False.

    Raises
    ------
    ValueError
        If ``foid`` is neither 'fathid' nor 'mothid'.
    """
    if foid == 'fathid':
        # Fathers referenced by some row but not present as individuals.
        tmp = fam[~fam.fathid.isin(fam.iid) & (fam.fathid != '0')].copy()
        tmp.iid = tmp.fathid
        tmp.sex = 1
    elif foid == 'mothid':
        tmp = fam[~fam.mothid.isin(fam.iid) & (fam.mothid != '0')].copy()
        tmp.iid = tmp.mothid
        tmp.sex = 2
    else:
        # Bug fix: the original did `raise print(...)`, which prints and then
        # fails with "exceptions must derive from BaseException".
        raise ValueError("foid must be 'fathid' or 'mothid', got %r" % (foid,))
    # Founders have unknown parents, missing phenotype, and no genotype data.
    tmp.fathid = '0'
    tmp.mothid = '0'
    tmp.ad = -9
    tmp.vcf = False
    return tmp
# Append placeholder founder rows for both missing fathers and missing mothers.
new_twom_fam = pd.concat([twom_fam,create_founder(twom_fam,'fathid'),create_founder(twom_fam,'mothid')])
new_twom_fam
new_twom_fam.to_csv('data/new_twoormore_member_fam.csv',header=False,index=False)
efiga_ped[efiga_ped.ID.duplicated(keep=False)]
nia_ped[nia_ped.Sample_ID.duplicated(keep=False)].sort_values('Sample_ID')
# Resolve duplicated individual IDs: keep one hand-picked row per duplicate group.
dufam = new_twom_fam[new_twom_fam.iid.duplicated(keep=False)].copy().sort_values('iid').reset_index(drop=True)
# NOTE(review): the hard-coded positions [0,4,5,11,13,24,26,28] depend on the
# exact sort order above — fragile if the input data ever changes.
nodp_fam = pd.concat([new_twom_fam[~new_twom_fam.iid.duplicated(keep=False)],dufam.iloc[[0,4,5,11,13,24,26,28]]])
# For individuals with only one known parent, synthesize the other parent's
# ID by appending 'c' to the known parent's ID.
nof_id = (nodp_fam.fathid == '0') & (nodp_fam.mothid != '0')
# NOTE(review): chained assignment (df.col[mask] = ...) triggers pandas'
# SettingWithCopyWarning; .loc[mask, 'col'] is the safe form. The same
# applies to all the manual fixes below.
nodp_fam.fathid[nof_id] = [x+'c' for x in nodp_fam.mothid[nof_id]]
nom_id = (nodp_fam.fathid != '0') & (nodp_fam.mothid == '0')
nodp_fam.mothid[nom_id] = [x+'c' for x in nodp_fam.fathid[nom_id]]
nodp_fam = pd.concat([nodp_fam,create_founder(nodp_fam,'fathid'),create_founder(nodp_fam,'mothid')])
nodp_fam = nodp_fam[~nodp_fam.iid.duplicated()]
nodp_fam.to_csv('data/nodp_fam.csv',header=False,index=False)
# Manual pedigree corrections for specific individuals (IDs chosen by hand
# after inspecting each family below).
nodp_fam.fathid[nodp_fam.iid=='27_122_16055'] = '27_122_84953'
nodp_fam.mothid[nodp_fam.iid=='27_122_16055'] = '27_122_84952'
nodp_fam.fathid[nodp_fam.iid=='10R_R47_62'] = '10R_R47_43'
nodp_fam.mothid[nodp_fam.iid=='10R_R47_62'] = '10R_R47_1'
nodp_fam[nodp_fam.fid == '27_126'].sort_values('fathid')
nodp_fam.fathid[nodp_fam.iid=='27_126_86559'] = '27_126_86560'
nodp_fam.mothid[nodp_fam.iid=='27_126_86559'] = '27_126_85004'
nodp_fam[nodp_fam.fid == '10R_R78'].sort_values(['fathid','iid'])
nodp_fam.fathid[nodp_fam.iid=='10R_R78_31'] = '10R_R78_51'
nodp_fam[nodp_fam.fid == '27_152'].sort_values('fathid')
nodp_fam.fathid[nodp_fam.iid=='27_152_85910'] = '27_152_85911'
nodp_fam.mothid[nodp_fam.iid=='27_152_85910'] = '27_152_85912'
nodp_fam[nodp_fam.fid == '26_TCC'].sort_values('fathid')
nodp_fam.fathid[nodp_fam.iid=='26_TCC_TCC65609'] = '26_TCC_TCC65610'
nodp_fam.mothid[nodp_fam.iid=='26_TCC_TCC65609'] = '26_TCC_TCC65606'
nodp_fam[nodp_fam.fid == '10R_R99'].sort_values('fathid')
nodp_fam[nodp_fam.fid == '27_192'].sort_values('fathid')
nodp_fam[nodp_fam.fid == '4_715'].sort_values('fathid')
nodp_fam.sex[nodp_fam.iid=='4_715_6'] = 1
nodp_fam[nodp_fam.fid == '170'].sort_values('fathid')
nodp_fam.fathid[nodp_fam.iid=='170_38'] = '170_31'
nodp_fam.mothid[nodp_fam.iid=='170_38'] = '170_18'
def rowsofid(fam, i):
    """Return the rows of ``fam`` in which ``i`` appears as the individual
    ('iid'), father ('fathid'), or mother ('mothid') identifier."""
    match = fam.iid.isin([i])
    for parent_col in ('fathid', 'mothid'):
        match = match | fam[parent_col].isin([i])
    return fam[match]
rowsofid(nodp_fam,'10R_R78_21')
# Cross-check individual '215_20' across every pedigree/phenotype source.
print(efiga_ped[efiga_ped.ID == '215_20'])
print(nia_ped[nia_ped.Sample_ID =='215_20'])
print(efi_nia_fam[efi_nia_fam.iid =='215_20'])
print(pheno_df[pheno_df.ID =='215_20'])
all_fam[all_fam.fid=='215']
# NOTE(review): drop() returns a new frame; without assignment (or
# inplace=True) this line has no lasting effect.
all_fam.drop([1898])
nia_ped[nia_ped.Sample_ID.isin(['10R_R99_8','215_59','27_25_','4_595_18','4_595_69','4_603_43'])]
# How many family IDs have an ancestry/population assignment.
sum(pd.Series(famid.keys()).isin(fam_pop.fid))
all_fam
pheno_df.FID.value_counts().value_counts()
fam_pop
famid.value_counts()
one_fam
# Sanity check: after the synthetic-parent fixes above, no row should have
# exactly one unknown parent; both queries should come back empty.
nodp_fam[(nodp_fam.fathid == '0') & (nodp_fam.mothid != '0')]
nodp_fam[(nodp_fam.fathid != '0') & (nodp_fam.mothid == '0')]