---
title: Title
keywords: fastai
sidebar: home_sidebar
nb_path: "nbs/debug_info.ipynb"
---
import dask as da
import dask.dataframe as dd
import pandas as pd
import gzip
def get_vcf_names(vcf_path):
    """Return the column names from the ``#CHROM`` header line of a VCF file.

    Args:
        vcf_path: Path to a VCF file, either gzip-compressed or plain text.

    Returns:
        list of str: The tab-separated fields of the first line that starts
        with ``#CHROM``, each stripped of surrounding whitespace. The first
        entry keeps its leading ``#``.

    Raises:
        ValueError: If the file contains no line starting with ``#CHROM``.
    """
    # Sniff the two-byte gzip magic number so that uncompressed VCFs are
    # readable as well, instead of failing inside gzip on the first read.
    with open(vcf_path, "rb") as probe:
        is_gzipped = probe.read(2) == b"\x1f\x8b"
    opener = gzip.open if is_gzipped else open
    # The context manager closes the file; no explicit close() is needed.
    with opener(vcf_path, "rt") as ifile:
        for line in ifile:
            if line.startswith("#CHROM"):
                return [x.strip() for x in line.split('\t')]
    # Fail with a clear message rather than an UnboundLocalError.
    raise ValueError("no '#CHROM' header line found in {!r}".format(vcf_path))
def read_vcf_chunk(fn, chunksize=10):
    """Read the first ``chunksize`` data rows of a VCF file into a DataFrame.

    Column names are taken from the file's ``#CHROM`` header line via
    ``get_vcf_names``. Lines starting with ``#`` are skipped by the parser.

    Args:
        fn: Path to the VCF file. A gzip read is attempted first, then a
            read with pandas' default compression handling.
        chunksize: Maximum number of data rows to return.

    Returns:
        pandas.DataFrame: At most ``chunksize`` rows, one column per header
        field.
    """
    names = get_vcf_names(fn)
    # Options shared by both attempts.
    #  * sep=r'\s+' is the documented replacement for the deprecated
    #    delim_whitespace=True.
    #  * nrows returns the same leading rows as get_chunk(chunksize) on a
    #    chunked reader, but lets pandas close the file handle itself.
    # NOTE(review): comment='#' also truncates any data line at an embedded
    # '#' character - confirm the data never contains one.
    read_opts = dict(comment='#', sep=r'\s+', header=None, names=names,
                     nrows=chunksize)
    try:
        return pd.read_csv(fn, compression='gzip', **read_opts)
    except OSError:
        # Not a gzip stream (gzip.BadGzipFile is an OSError): retry with
        # pandas' default compression inference.
        return pd.read_csv(fn, **read_opts)
# Load the first 10,000 data rows of the full-sample VCF.
vcf = read_vcf_chunk('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/full_sample/vcf/full_sample.vcf.gz', chunksize=10000)
# Six-column, whitespace-separated .fam file without a header row.
# sep=r'\s+' replaces the deprecated delim_whitespace=True.
fam = pd.read_csv('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/full_sample/bfiles/full_sample.fam', sep=r'\s+', header=None, names=['fid', 'iid', 'father', 'mother', 'gender', 'trait'])
# Count positions where the .fam individual id equals the VCF column label
# (VCF columns from index 9 onward). The header names are local to
# read_vcf_chunk, so they are taken from vcf.columns here instead of an
# undefined module-level `names`.
# NOTE(review): the comparison requires both sides to have the same length.
sum(fam.iid == pd.Series(vcf.columns[9:]))
fam
Questions for Chong: Why do the last rows use characters? What is the meaning of the last column?
fam.shape
# Whitespace-separated annotation table; the first line supplies the header.
# sep=r'\s+' replaces the deprecated delim_whitespace=True.
anno = pd.read_csv('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/full_sample/rare_positions/full_sample_coding.hg38_multianno.txt', sep=r'\s+')
anno
# Frequency of each distinct value in the AF column.
anno.AF.value_counts()
Question for Chong: all of the AF values are ".", which means there is something wrong with the annotation.
# Value tallies for each annotation column from index 5 onward, one
# single-column frame at a time.
[anno[[column]].value_counts() for column in anno.columns[5:]]
# Directory prefix for the full-sample phenotype and pedigree files.
pheno_full_sample_path = '/mnt/mfs/statgen/alzheimers-family/linkage_files/pheno/full_sample/'
Files in the full-sample phenotype directory: efiga_pedigree.txt, full_sample_efi_nia.fam, full_sample_fam_id.txt, full_sample_fam_pop.txt, full_sample_id_list.txt, full_sample_pheno.txt, niaload_pedigree.txt
# Load each file under pheno_full_sample_path and display it.
# sep=r'\s+' replaces the deprecated delim_whitespace=True throughout.

# Pedigree table with a header row.
efiga = pd.read_csv(pheno_full_sample_path+'efiga_pedigree.txt', sep=r'\s+')
efiga
# Headerless six-column .fam file.
efi_nia_fam = pd.read_csv(pheno_full_sample_path+'full_sample_efi_nia.fam', sep=r'\s+', header=None, names=['fid', 'iid', 'father', 'mother', 'gender', 'trait'])
efi_nia_fam
# Same six columns plus a trailing 'id' column.
efi_nia_txt = pd.read_csv(pheno_full_sample_path+'full_sample_fam_id.txt', sep=r'\s+', header=None, names=['fid', 'iid', 'father', 'mother', 'gender', 'trait', 'id'])
efi_nia_txt
# Headerless two-column table read as ('fid', 'pop').
fam_pop = pd.read_csv(pheno_full_sample_path+'full_sample_fam_pop.txt', sep=r'\s+', header=None, names=['fid', 'pop'])
fam_pop
# Headerless single-column list of ids.
sample_id_list = pd.read_csv(pheno_full_sample_path+'full_sample_id_list.txt', sep=r'\s+', header=None, names=['id'])
sample_id_list
# Tab-separated phenotype table with a header row.
sample_pheno = pd.read_csv(pheno_full_sample_path+'full_sample_pheno.txt',sep='\t')
sample_pheno
# Summary statistics over every column, including non-numeric ones.
sample_pheno.describe(include='all')
# Pedigree table with a header row.
nia_ped = pd.read_csv(pheno_full_sample_path+'niaload_pedigree.txt', sep=r'\s+')
nia_ped
# sample_i: first rows of the VCF (read_vcf_chunk's default chunk size).
s1_vcf = read_vcf_chunk('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/sample_i/rare_positions/sample_i_coding.hg38_multianno.vcf.gz')
s1_vcf
# Headerless six-column .fam file.
# sep=r'\s+' replaces the deprecated delim_whitespace=True.
s1_fam = pd.read_csv('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/sample_i/rare_positions/sample_i_coding.hg38_multianno.fam', sep=r'\s+', header=None, names=['fid', 'iid', 'father', 'mother', 'gender', 'trait'])
s1_fam
# Annotation table with a header row.
s1_anno = pd.read_csv('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/sample_i/rare_positions/sample_i_coding.hg38_multianno.txt', sep=r'\s+')
s1_anno
# sample_ii: first rows of the VCF (read_vcf_chunk's default chunk size).
s2_vcf = read_vcf_chunk('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/sample_ii/rare_positions/sample_ii_coding.hg38_multianno.vcf.gz')
s2_vcf
# Headerless six-column .fam file.
# sep=r'\s+' replaces the deprecated delim_whitespace=True.
s2_fam = pd.read_csv('/mnt/mfs/statgen/alzheimers-family/linkage_files/pheno/sample_ii/small_sample_ii.fam', sep=r'\s+', header=None, names=['fid', 'iid', 'father', 'mother', 'gender', 'trait'])
s2_fam
# Annotation table with a header row.
s2_anno = pd.read_csv('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/sample_ii/rare_positions/sample_ii_coding.hg38_multianno.txt', sep=r'\s+')
s2_anno
# Rows that exactly repeat an earlier row of the .fam table.
s2_fam[s2_fam.duplicated()]
# VCF column labels from index 9 onward, and how many there are.
s2_vcf.columns[9:]
len(s2_vcf.columns[9:])
s2_fam[210:230]
# Write the de-duplicated table. The original cell ended in a truncated
# `.drop_duplicates().to` and then exported the frame with its duplicate
# rows still present, despite the 'uniq' output name.
# NOTE(review): confirm that dropping exact duplicate rows is intended.
s2_fam.drop_duplicates().to_csv('/mnt/mfs/statgen/alzheimers-family/yhseqlink/data/MWE/sample2_uniq.fam',header=False,index=False,sep='\t')
coding_region_rare_variant_positions.txt
Why were these SNPs chosen?
Is the VCF on hg19 or hg38?