---
title: 1.read fam
keywords: fastai
sidebar: home_sidebar
nb_path: "nbs/06_Create_MWE.ipynb"
---
import pandas as pd
import gzip
def get_vcf_names(vcf_path):
    """Return the column names found on the ``#CHROM`` header line of a VCF.

    The file is first opened as gzip; if that fails with an ``OSError``
    (which includes "not a gzipped file"), it is re-opened as plain text so
    that uncompressed VCFs are accepted as well.

    Parameters
    ----------
    vcf_path : path to a gzip-compressed or plain-text VCF file.

    Returns
    -------
    list of str
        The tab-separated fields of the first line starting with ``#CHROM``,
        each stripped of surrounding whitespace (the first entry keeps its
        leading ``#``).

    Raises
    ------
    ValueError
        If no line starting with ``#CHROM`` is present.
    """

    def _header_names(handle):
        # Stop at the first header line; the records after it are not needed.
        for line in handle:
            if line.startswith("#CHROM"):
                return [x.strip() for x in line.split('\t')]
        raise ValueError(
            "no '#CHROM' header line found in {!r}".format(vcf_path)
        )

    try:
        with gzip.open(vcf_path, "rt") as ifile:
            return _header_names(ifile)
    except OSError:
        # gzip reports a non-gzip payload as an OSError subclass on the first
        # read; retry as plain text. Any other OSError (e.g. missing file)
        # is simply raised again by this second open.
        with open(vcf_path, "rt") as ifile:
            return _header_names(ifile)
def read_vcf_chunk(fn, chunksize=10):
    """Read the first ``chunksize`` records of a VCF file into a DataFrame.

    Column names are taken from the file's ``#CHROM`` header line (via
    ``get_vcf_names``); every line starting with ``#`` is skipped as a
    comment and the remaining lines are split on runs of whitespace.

    Parameters
    ----------
    fn : path to the VCF file. It is read as gzip first and, if that fails,
        as an uncompressed file.
    chunksize : number of records to read from the top of the file.

    Returns
    -------
    pandas.DataFrame
        At most ``chunksize`` rows, one per VCF record.
    """
    names = get_vcf_names(fn)
    # `nrows` reads just the leading rows and lets pandas close the file,
    # unlike a chunked reader that is opened and then abandoned.
    # `sep=r'\s+'` is the documented equivalent of `delim_whitespace=True`,
    # which newer pandas releases deprecate.
    options = dict(
        comment='#',
        nrows=chunksize,
        sep=r'\s+',
        header=None,
        names=names,
    )
    try:
        return pd.read_csv(fn, compression='gzip', **options)
    except (OSError, EOFError, ValueError):
        # Not gzip data (or unreadable as such): retry with pandas' default
        # compression handling. Real failures surface from this second call.
        return pd.read_csv(fn, **options)
def read_anno_chunk(fn, chunksize=10):
    """Read the first ``chunksize`` rows of a CSV annotation table.

    Parameters
    ----------
    fn : path to a comma-separated file whose first line is the header. It
        is read as gzip first and, if that fails, as an uncompressed file.
    chunksize : number of data rows to read from the top of the file.

    Returns
    -------
    pandas.DataFrame
        At most ``chunksize`` rows.
    """
    # `nrows` reads just the leading rows and lets pandas close the file,
    # unlike a chunked reader that is opened and then abandoned.
    try:
        return pd.read_csv(fn, compression='gzip', nrows=chunksize)
    except (OSError, EOFError, ValueError):
        # Not gzip data (or unreadable as such): retry with pandas' default
        # compression handling. Real failures surface from this second call.
        return pd.read_csv(fn, nrows=chunksize)
# Family table: one row per individual, no header line in the CSV.
fam = pd.read_csv(
    'data/nodp_fam.csv',
    names=['fid', 'iid', 'fathid', 'mothid', 'sex', 'ad', 'vcf'],
    header=None,
)


def _write_tsv(frame, path):
    """Write `frame` to `path` tab-separated, without header or index."""
    frame.to_csv(path, header=False, index=False, sep='\t')


def _families(fids):
    """Rows of `fam` whose `fid` is in `fids`, minus the last column, sorted by `fid`."""
    return fam[fam.fid.isin(fids)].iloc[:, :-1].sort_values('fid')


# Full table without its last column (`vcf`).
_write_tsv(fam.iloc[:, :-1].sort_values('fid'), 'data/nodp_fam.fam')
_write_tsv(_families(['22_1', '1036', '28_9']), 'data/mwe_normal_fam.csv')

# Per-family tally: sum of the `vcf` column, member count, and their ratio.
# NOTE(review): `vcf` is presumably a 0/1 "has genotype data" flag - confirm.
family_ids = list(set(fam.fid))
vcf_totals = [sum(fam[fam.fid == family].vcf) for family in family_ids]
member_counts = [sum(fam.fid == family) for family in family_ids]
tmp = pd.DataFrame(
    [pd.Series(family_ids), pd.Series(vcf_totals), pd.Series(member_counts)]
).T
tmp.columns = ['fid', 'vcf', 'tol']
tmp['per'] = tmp.vcf / tmp.tol
tmp.sort_values('per')  # notebook display only; the result is not stored

# Single-family example pedigrees.
_write_tsv(_families(['27_104']), 'data/mwe_multi_group_fam.csv')
_write_tsv(_families(['4_461']), 'data/mwe_in_law_fam.csv')
_write_tsv(_families(['1005']), 'data/mwe_the_1005_fam.csv')

# Families named by the first ten entries of column `x` in ok_plot.csv.
ok_plot = pd.read_csv('data/ok_plot.csv', index_col=0)
_write_tsv(_families(ok_plot.x[:10]), 'data/mwe_ok_plot_fam.csv')

# Family 4_364: complete (unsorted), then with individuals 14-19 and 17-19 removed.
f4_364 = fam[fam.fid.isin(['4_364'])].iloc[:, :-1]
_write_tsv(f4_364, 'data/mwe_the_4_364_all_fam.csv')
for first_removed, out_path in (
    (14, 'data/mwe_the_4_364_rm_4-5layers_fam.csv'),
    (17, 'data/mwe_the_4_364_rm_5layers_fam.csv'),
):
    removed_iids = ['4_364_' + str(i) for i in range(first_removed, 20)]
    _write_tsv(f4_364[~f4_364.iid.isin(removed_iids)], out_path)
# Pedigree table; the code below relies on its `fid`, `depth` and `trim` columns.
ped = pd.read_csv('data/nodp_ped_with_depth_trim.csv', header=0)

# Individuals whose `trim` value is False. Kept as an explicit comparison
# (rather than `~ped.trim`) because the column's dtype is not visible here.
trim_ped = ped[ped.trim == False]  # noqa: E712

# Per-family maximum `depth` over all individuals ("before") and over the
# untrimmed individuals only ("after"), for every family left in `trim_ped`.
# The summary must exist before any of the exports below, which all use it.
# NOTE(review): `depth` is presumably the pedigree generation/layer - confirm.
families = trim_ped.fid.unique()
trim_smry = pd.DataFrame({
    'before': ped.groupby('fid', sort=False)['depth'].max().reindex(families),
    'after': trim_ped.groupby('fid', sort=False)['depth'].max().reindex(families),
})
trim_smry.hist()

# Families whose untrimmed maximum depth is below 4; the last three columns
# are dropped from every exported table.
ped[ped.fid.isin(trim_smry.index[trim_smry.before < 4])].iloc[:, :-3].sort_values('fid').to_csv(
    'data/mwe_3layers.csv', header=False, index=False, sep='\t')

# Same selection on the trimmed pedigree, using the post-trim maximum depth.
trim_lay3 = trim_ped[trim_ped.fid.isin(trim_smry.index[trim_smry.after < 4])]
trim_lay3.iloc[:, :-3].sort_values('fid').to_csv(
    'data/mwe_trimed_3layers.csv', header=False, index=False, sep='\t')

# Of those, keep only families with fewer than 20 remaining members.
trim_faml20_fid = trim_lay3.fid.value_counts() < 20
trim_ped[trim_ped.fid.isin(trim_faml20_fid[trim_faml20_fid].index)].iloc[:, :-3].sort_values('fid').to_csv(
    'data/mwe_trimed_3layers_faml20.csv', header=False, index=False, sep='\t')
import numpy as np

# First 1000 records of the full-sample VCF.
vcf = read_vcf_chunk(
    '/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/full_sample/vcf/full_sample.vcf.gz',
    chunksize=1000,
)
vcf  # notebook display only

# First 1000 rows of the chr1 annotation table.
anno = read_anno_chunk(
    '/home/yl4604/project/alzheimers-family/SMMAT/20210802/annotation/EFIGA_NIALOAD_chr1.hg38.hg38_multianno.csv',
    chunksize=1000,
)

# '.' entries in the AF column are replaced by 0.00001 so the column can be
# cast to float. NOTE(review): '.' is presumably a missing-value marker - confirm.
af = anno.AF.replace('.', 0.00001).astype(np.float64)
af.hist()

# Overwrite INFO with an "AF=<value>" string per record; rows are paired
# purely by position, so both tables must have the same number of rows.
info_values = []
for value in af:
    info_values.append('AF=' + str(value))
vcf.INFO = info_values

vcf.to_csv(
    'data/first1000snp_full_samples.vcf',
    index=False,
    header=True,
    sep='\t',
)