--- title: preprocess keywords: fastai sidebar: home_sidebar nb_path: "06_preprocess.ipynb" ---
fc_count2tpm
[source]
fc_count2tpm
(df
,samples
)
fc_count2fpkm
[source]
fc_count2fpkm
(df
,samples
)
featurecount_rename
[source]
featurecount_rename
(featurecounts
,clinical
,prefix
,sample_title
='sample'
,bam_title
='bam'
,count_title
='Geneid'
)
featureCounts uses the BAM file name as the sample name for each count column. This function renames those columns to the sample names given in the clinical table.
:param str featurecounts: featureCounts table file :param str clinical: clinical table file with a header row :param str prefix: output prefix; four files are written: the count matrix ({prefix}_counts.txt), the renamed featureCounts table ({prefix}_featurecounts.txt), and its FPKM and TPM versions ({prefix}_featurecounts_FPKM.txt and {prefix}_featurecounts_TPM.txt) :param str sample_title: name of the clinical-table column that holds the sample names :param str bam_title: name of the clinical-table column that holds the BAM file names :param str count_title: name of the featureCounts column used as the row identifier in the count output
featurecounts='tests/fc.txt'
clinical='tests/clinical.txt'
prefix='tests/rename'
sample_title='sample'
bam_title='starbam'
count_title='gene_name'
featurecount_rename(featurecounts,clinical,prefix,sample_title=sample_title,bam_title=bam_title,count_title=count_title)
!head tests/fc.txt tests/clinical.txt tests/rename*
==> tests/fc.txt <== #Program:featureCounts v2.0.1; Command:"featureCounts" "-T" "12" "-p" "-t" "exon" "-g" "gene_id" "-M" "--ignoreDup" "--extraAttributes" "gene_name,gene_biotype" "-a" "/aegis/database/mus_musculus/99/transcriptome.gtf" "-o" "pair-end/results/RNA/merged/star_featurecounts.txt" "--tmpDir" "pair-end/results/RNA/tmp" "pair-end/results/RNA/SRR12678525/align_data/SRR12678525_STAR_Aligned.sortedByCoord.out.bam" "pair-end/results/RNA/SRR12678526/align_data/SRR12678526_STAR_Aligned.sortedByCoord.out.bam" "pair-end/results/RNA/SRR12678527/align_data/SRR12678527_STAR_Aligned.sortedByCoord.out.bam" "pair-end/results/RNA/SRR12678528/align_data/SRR12678528_STAR_Aligned.sortedByCoord.out.bam" "pair-end/results/RNA/SRR12678529/align_data/SRR12678529_STAR_Aligned.sortedByCoord.out.bam" "pair-end/results/RNA/SRR12678530/align_data/SRR12678530_STAR_Aligned.sortedByCoord.out.bam" Geneid Chr Start End Strand Length gene_name gene_biotype pair-end/results/RNA/SRR12678525/align_data/SRR12678525_STAR_Aligned.sortedByCoord.out.bam pair-end/results/RNA/SRR12678526/align_data/SRR12678526_STAR_Aligned.sortedByCoord.out.bam pair-end/results/RNA/SRR12678527/align_data/SRR12678527_STAR_Aligned.sortedByCoord.out.bam pair-end/results/RNA/SRR12678528/align_data/SRR12678528_STAR_Aligned.sortedByCoord.out.bam pair-end/results/RNA/SRR12678529/align_data/SRR12678529_STAR_Aligned.sortedByCoord.out.bam pair-end/results/RNA/SRR12678530/align_data/SRR12678530_STAR_Aligned.sortedByCoord.out.bam ENSMUSG00000102693 1 3073253 3074322 + 1070 4933401J01Rik TEC 0 0 0 0 0 0 ENSMUSG00000064842 1 3102016 3102125 + 110 Gm26206 snRNA 0 0 0 0 0 0 ENSMUSG00000051951 1;1;1;1;1;1;1 3205901;3206523;3213439;3213609;3214482;3421702;3670552 3207317;3207317;3215632;3216344;3216968;3421901;3671498 -;-;-;-;-;-;- 6094 Xkr4 protein_coding 6 9 6 13 12 11 ENSMUSG00000102851 1 3252757 3253236 + 480 Gm18956 processed_pseudogene 0 0 0 0 0 0 ENSMUSG00000103377 1 3365731 3368549 - 2819 Gm37180 TEC 0 0 0 0 0 0 
ENSMUSG00000104017 1 3375556 3377788 - 2233 Gm37363 TEC 0 0 0 0 0 0 ==> tests/clinical.txt <== sample,condition,batch,starbam,hisat2bam SRR12678525,Control,b1,pair-end/results/RNA/SRR12678525/align_data/SRR12678525_STAR_Aligned.sortedByCoord.out.bam,pair-end/results/RNA/SRR12678525/align_data/SRR12678525_hisat2.bam SRR12678526,Control,b1,pair-end/results/RNA/SRR12678526/align_data/SRR12678526_STAR_Aligned.sortedByCoord.out.bam,pair-end/results/RNA/SRR12678526/align_data/SRR12678526_hisat2.bam SRR12678527,Control,b1,pair-end/results/RNA/SRR12678527/align_data/SRR12678527_STAR_Aligned.sortedByCoord.out.bam,pair-end/results/RNA/SRR12678527/align_data/SRR12678527_hisat2.bam SRR12678528,Case,b1,pair-end/results/RNA/SRR12678528/align_data/SRR12678528_STAR_Aligned.sortedByCoord.out.bam,pair-end/results/RNA/SRR12678528/align_data/SRR12678528_hisat2.bam SRR12678529,Case,b1,pair-end/results/RNA/SRR12678529/align_data/SRR12678529_STAR_Aligned.sortedByCoord.out.bam,pair-end/results/RNA/SRR12678529/align_data/SRR12678529_hisat2.bam SRR12678530,Case,b1,pair-end/results/RNA/SRR12678530/align_data/SRR12678530_STAR_Aligned.sortedByCoord.out.bam,pair-end/results/RNA/SRR12678530/align_data/SRR12678530_hisat2.bam ==> tests/rename_counts.txt <== gene_name SRR12678525 SRR12678526 SRR12678527 SRR12678528 SRR12678529 SRR12678530 4933401J01Rik 0 0 0 0 0 0 Gm26206 0 0 0 0 0 0 Xkr4 6 9 6 13 12 11 Gm18956 0 0 0 0 0 0 Gm37180 0 0 0 0 0 0 Gm37363 0 0 0 0 0 0 ==> tests/rename_fc.txt <== Geneid Chr Start End Strand Length gene_name gene_biotype SRR12678525 SRR12678526 SRR12678527 SRR12678528 SRR12678529 SRR12678530 ENSMUSG00000102693 1 3073253 3074322 + 1070 4933401J01Rik TEC 0 0 0 0 0 0 ENSMUSG00000064842 1 3102016 3102125 + 110 Gm26206 snRNA 0 0 0 0 0 0 ENSMUSG00000051951 1;1;1;1;1;1;1 3205901;3206523;3213439;3213609;3214482;3421702;3670552 3207317;3207317;3215632;3216344;3216968;3421901;3671498 -;-;-;-;-;-;- 6094 Xkr4 protein_coding 6 9 6 13 12 11 ENSMUSG00000102851 1 3252757 3253236 + 480 
Gm18956 processed_pseudogene 0 0 0 0 0 0 ENSMUSG00000103377 1 3365731 3368549 - 2819 Gm37180 TEC 0 0 0 0 0 0 ENSMUSG00000104017 1 3375556 3377788 - 2233 Gm37363 TEC 0 0 0 0 0 0 ==> tests/rename_featurecounts.txt <== Geneid Chr Start End Strand Length gene_name gene_biotype SRR12678525 SRR12678526 SRR12678527 SRR12678528 SRR12678529 SRR12678530 ENSMUSG00000102693 1 3073253 3074322 + 1070 4933401J01Rik TEC 0 0 0 0 0 0 ENSMUSG00000064842 1 3102016 3102125 + 110 Gm26206 snRNA 0 0 0 0 0 0 ENSMUSG00000051951 1;1;1;1;1;1;1 3205901;3206523;3213439;3213609;3214482;3421702;3670552 3207317;3207317;3215632;3216344;3216968;3421901;3671498 -;-;-;-;-;-;- 6094 Xkr4 protein_coding 6 9 6 13 12 11 ENSMUSG00000102851 1 3252757 3253236 + 480 Gm18956 processed_pseudogene 0 0 0 0 0 0 ENSMUSG00000103377 1 3365731 3368549 - 2819 Gm37180 TEC 0 0 0 0 0 0 ENSMUSG00000104017 1 3375556 3377788 - 2233 Gm37363 TEC 0 0 0 0 0 0 ==> tests/rename_featurecounts_FPKM.txt <== Geneid Chr Start End Strand Length gene_name gene_biotype SRR12678525 SRR12678526 SRR12678527 SRR12678528 SRR12678529 SRR12678530 ENSMUSG00000102693 1 3073253 3074322 + 1070 4933401J01Rik TEC 0.0 0.0 0.0 0.0 0.0 0.0 ENSMUSG00000064842 1 3102016 3102125 + 110 Gm26206 snRNA 0.0 0.0 0.0 0.0 0.0 0.0 ENSMUSG00000051951 1;1;1;1;1;1;1 3205901;3206523;3213439;3213609;3214482;3421702;3670552 3207317;3207317;3215632;3216344;3216968;3421901;3671498 -;-;-;-;-;-;- 6094 Xkr4 protein_coding 164.0958319658681 164.09583196586806 164.0958319658681 164.0958319658681 164.0958319658681 164.0958319658681 ENSMUSG00000102851 1 3252757 3253236 + 480 Gm18956 processed_pseudogene 0.0 0.0 0.0 0.0 0.0 0.0 ENSMUSG00000103377 1 3365731 3368549 - 2819 Gm37180 TEC 0.0 0.0 0.0 0.0 0.0 0.0 ENSMUSG00000104017 1 3375556 3377788 - 2233 Gm37363 TEC 0.0 0.0 0.0 0.0 0.0 0.0 ==> tests/rename_featurecounts_TPM.txt <== Geneid Chr Start End Strand Length gene_name gene_biotype SRR12678525 SRR12678526 SRR12678527 SRR12678528 SRR12678529 SRR12678530 ENSMUSG00000102693 1 3073253 
3074322 + 1070 4933401J01Rik TEC 0.0 0.0 0.0 0.0 0.0 0.0 ENSMUSG00000064842 1 3102016 3102125 + 110 Gm26206 snRNA 0.0 0.0 0.0 0.0 0.0 0.0 ENSMUSG00000051951 1;1;1;1;1;1;1 3205901;3206523;3213439;3213609;3214482;3421702;3670552 3207317;3207317;3215632;3216344;3216968;3421901;3671498 -;-;-;-;-;-;- 6094 Xkr4 protein_coding 1000000.0 1000000.0 1000000.0 1000000.0 1000000.0 1000000.0 ENSMUSG00000102851 1 3252757 3253236 + 480 Gm18956 processed_pseudogene 0.0 0.0 0.0 0.0 0.0 0.0 ENSMUSG00000103377 1 3365731 3368549 - 2819 Gm37180 TEC 0.0 0.0 0.0 0.0 0.0 0.0 ENSMUSG00000104017 1 3375556 3377788 - 2233 Gm37363 TEC 0.0 0.0 0.0 0.0 0.0 0.0
df=pd.read_csv(prefix+'_featurecounts.txt',sep='\t')
samples=['SRR12678525', 'SRR12678526', 'SRR12678527', 'SRR12678528', 'SRR12678529', 'SRR12678530']
fc_count2tpm(df,samples)
Geneid | Chr | Start | End | Strand | Length | gene_name | gene_biotype | SRR12678525 | SRR12678526 | SRR12678527 | SRR12678528 | SRR12678529 | SRR12678530 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ENSMUSG00000102693 | 1 | 3073253 | 3074322 | + | 1070 | 4933401J01Rik | TEC | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | ENSMUSG00000064842 | 1 | 3102016 | 3102125 | + | 110 | Gm26206 | snRNA | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | ENSMUSG00000051951 | 1;1;1;1;1;1;1 | 3205901;3206523;3213439;3213609;3214482;342170... | 3207317;3207317;3215632;3216344;3216968;342190... | -;-;-;-;-;-;- | 6094 | Xkr4 | protein_coding | 1000000.0 | 1000000.0 | 1000000.0 | 1000000.0 | 1000000.0 | 1000000.0 |
3 | ENSMUSG00000102851 | 1 | 3252757 | 3253236 | + | 480 | Gm18956 | processed_pseudogene | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | ENSMUSG00000103377 | 1 | 3365731 | 3368549 | - | 2819 | Gm37180 | TEC | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 | ENSMUSG00000104017 | 1 | 3375556 | 3377788 | - | 2233 | Gm37363 | TEC | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
fc_count2fpkm(df,samples)
Geneid | Chr | Start | End | Strand | Length | gene_name | gene_biotype | SRR12678525 | SRR12678526 | SRR12678527 | SRR12678528 | SRR12678529 | SRR12678530 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ENSMUSG00000102693 | 1 | 3073253 | 3074322 | + | 1070 | 4933401J01Rik | TEC | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
1 | ENSMUSG00000064842 | 1 | 3102016 | 3102125 | + | 110 | Gm26206 | snRNA | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
2 | ENSMUSG00000051951 | 1;1;1;1;1;1;1 | 3205901;3206523;3213439;3213609;3214482;342170... | 3207317;3207317;3215632;3216344;3216968;342190... | -;-;-;-;-;-;- | 6094 | Xkr4 | protein_coding | 164.095832 | 164.095832 | 164.095832 | 164.095832 | 164.095832 | 164.095832 |
3 | ENSMUSG00000102851 | 1 | 3252757 | 3253236 | + | 480 | Gm18956 | processed_pseudogene | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
4 | ENSMUSG00000103377 | 1 | 3365731 | 3368549 | - | 2819 | Gm37180 | TEC | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
5 | ENSMUSG00000104017 | 1 | 3375556 | 3377788 | - | 2233 | Gm37363 | TEC | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |