--- title: preprocess keywords: fastai sidebar: home_sidebar nb_path: "06_preprocess.ipynb" ---
{% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}

fc_count2tpm[source]

fc_count2tpm(df, samples)

{% endraw %} {% raw %}

fc_count2fpkm[source]

fc_count2fpkm(df, samples)

{% endraw %} {% raw %}
{% endraw %} {% raw %}

featurecount_rename[source]

featurecount_rename(featurecounts, clinical, prefix, sample_title='sample', bam_title='bam', count_title='Geneid')

featureCounts uses the BAM file name as the sample name. This function renames the sample columns of a featureCounts table according to the given clinical table.

:param str featurecounts: featureCounts table file :param str clinical: clinical table file with a header row :param str prefix: output prefix; four files are written: the count table ({prefix}_counts.txt), the renamed featureCounts table ({prefix}_featurecounts.txt), and its FPKM and TPM versions ({prefix}_featurecounts_FPKM.txt and {prefix}_featurecounts_TPM.txt) :param str sample_title: name of the clinical-table column that holds the sample names :param str bam_title: name of the clinical-table column that holds the BAM file names :param str count_title: name of the column used as the row identifier in the count output

{% endraw %} {% raw %}
{% endraw %} {% raw %}
featurecounts='tests/fc.txt'
clinical='tests/clinical.txt'
prefix='tests/rename'
sample_title='sample'
bam_title='starbam'
count_title='gene_name'
featurecount_rename(featurecounts,clinical,prefix,sample_title=sample_title,bam_title=bam_title,count_title=count_title)
{% endraw %} {% raw %}
!head tests/fc.txt tests/clinical.txt tests/rename*
==> tests/fc.txt <==
#Program:featureCounts	v2.0.1;	Command:"featureCounts"	"-T"	"12"	"-p"	"-t"	"exon"	"-g"	"gene_id"	"-M"	"--ignoreDup"	"--extraAttributes"	"gene_name,gene_biotype"	"-a"	"/aegis/database/mus_musculus/99/transcriptome.gtf"	"-o"	"pair-end/results/RNA/merged/star_featurecounts.txt"	"--tmpDir"	"pair-end/results/RNA/tmp"	"pair-end/results/RNA/SRR12678525/align_data/SRR12678525_STAR_Aligned.sortedByCoord.out.bam"	"pair-end/results/RNA/SRR12678526/align_data/SRR12678526_STAR_Aligned.sortedByCoord.out.bam"	"pair-end/results/RNA/SRR12678527/align_data/SRR12678527_STAR_Aligned.sortedByCoord.out.bam"	"pair-end/results/RNA/SRR12678528/align_data/SRR12678528_STAR_Aligned.sortedByCoord.out.bam"	"pair-end/results/RNA/SRR12678529/align_data/SRR12678529_STAR_Aligned.sortedByCoord.out.bam"	"pair-end/results/RNA/SRR12678530/align_data/SRR12678530_STAR_Aligned.sortedByCoord.out.bam"	
Geneid	Chr	Start	End	Strand	Length	gene_name	gene_biotype	pair-end/results/RNA/SRR12678525/align_data/SRR12678525_STAR_Aligned.sortedByCoord.out.bam	pair-end/results/RNA/SRR12678526/align_data/SRR12678526_STAR_Aligned.sortedByCoord.out.bam	pair-end/results/RNA/SRR12678527/align_data/SRR12678527_STAR_Aligned.sortedByCoord.out.bam	pair-end/results/RNA/SRR12678528/align_data/SRR12678528_STAR_Aligned.sortedByCoord.out.bam	pair-end/results/RNA/SRR12678529/align_data/SRR12678529_STAR_Aligned.sortedByCoord.out.bam	pair-end/results/RNA/SRR12678530/align_data/SRR12678530_STAR_Aligned.sortedByCoord.out.bam
ENSMUSG00000102693	1	3073253	3074322	+	1070	4933401J01Rik	TEC	0	0	0	0	0	0
ENSMUSG00000064842	1	3102016	3102125	+	110	Gm26206	snRNA	0	0	0	0	0	0
ENSMUSG00000051951	1;1;1;1;1;1;1	3205901;3206523;3213439;3213609;3214482;3421702;3670552	3207317;3207317;3215632;3216344;3216968;3421901;3671498	-;-;-;-;-;-;-	6094	Xkr4	protein_coding	6	9	6	13	12	11
ENSMUSG00000102851	1	3252757	3253236	+	480	Gm18956	processed_pseudogene	0	0	0	0	0	0
ENSMUSG00000103377	1	3365731	3368549	-	2819	Gm37180	TEC	0	0	0	0	0	0
ENSMUSG00000104017	1	3375556	3377788	-	2233	Gm37363	TEC	0	0	0	0	0	0
==> tests/clinical.txt <==
sample,condition,batch,starbam,hisat2bam
SRR12678525,Control,b1,pair-end/results/RNA/SRR12678525/align_data/SRR12678525_STAR_Aligned.sortedByCoord.out.bam,pair-end/results/RNA/SRR12678525/align_data/SRR12678525_hisat2.bam
SRR12678526,Control,b1,pair-end/results/RNA/SRR12678526/align_data/SRR12678526_STAR_Aligned.sortedByCoord.out.bam,pair-end/results/RNA/SRR12678526/align_data/SRR12678526_hisat2.bam
SRR12678527,Control,b1,pair-end/results/RNA/SRR12678527/align_data/SRR12678527_STAR_Aligned.sortedByCoord.out.bam,pair-end/results/RNA/SRR12678527/align_data/SRR12678527_hisat2.bam
SRR12678528,Case,b1,pair-end/results/RNA/SRR12678528/align_data/SRR12678528_STAR_Aligned.sortedByCoord.out.bam,pair-end/results/RNA/SRR12678528/align_data/SRR12678528_hisat2.bam
SRR12678529,Case,b1,pair-end/results/RNA/SRR12678529/align_data/SRR12678529_STAR_Aligned.sortedByCoord.out.bam,pair-end/results/RNA/SRR12678529/align_data/SRR12678529_hisat2.bam
SRR12678530,Case,b1,pair-end/results/RNA/SRR12678530/align_data/SRR12678530_STAR_Aligned.sortedByCoord.out.bam,pair-end/results/RNA/SRR12678530/align_data/SRR12678530_hisat2.bam
==> tests/rename_counts.txt <==
gene_name	SRR12678525	SRR12678526	SRR12678527	SRR12678528	SRR12678529	SRR12678530
4933401J01Rik	0	0	0	0	0	0
Gm26206	0	0	0	0	0	0
Xkr4	6	9	6	13	12	11
Gm18956	0	0	0	0	0	0
Gm37180	0	0	0	0	0	0
Gm37363	0	0	0	0	0	0

==> tests/rename_fc.txt <==
Geneid	Chr	Start	End	Strand	Length	gene_name	gene_biotype	SRR12678525	SRR12678526	SRR12678527	SRR12678528	SRR12678529	SRR12678530
ENSMUSG00000102693	1	3073253	3074322	+	1070	4933401J01Rik	TEC	0	0	0	0	0	0
ENSMUSG00000064842	1	3102016	3102125	+	110	Gm26206	snRNA	0	0	0	0	0	0
ENSMUSG00000051951	1;1;1;1;1;1;1	3205901;3206523;3213439;3213609;3214482;3421702;3670552	3207317;3207317;3215632;3216344;3216968;3421901;3671498	-;-;-;-;-;-;-	6094	Xkr4	protein_coding	6	9	6	13	12	11
ENSMUSG00000102851	1	3252757	3253236	+	480	Gm18956	processed_pseudogene	0	0	0	0	0	0
ENSMUSG00000103377	1	3365731	3368549	-	2819	Gm37180	TEC	0	0	0	0	0	0
ENSMUSG00000104017	1	3375556	3377788	-	2233	Gm37363	TEC	0	0	0	0	0	0

==> tests/rename_featurecounts.txt <==
Geneid	Chr	Start	End	Strand	Length	gene_name	gene_biotype	SRR12678525	SRR12678526	SRR12678527	SRR12678528	SRR12678529	SRR12678530
ENSMUSG00000102693	1	3073253	3074322	+	1070	4933401J01Rik	TEC	0	0	0	0	0	0
ENSMUSG00000064842	1	3102016	3102125	+	110	Gm26206	snRNA	0	0	0	0	0	0
ENSMUSG00000051951	1;1;1;1;1;1;1	3205901;3206523;3213439;3213609;3214482;3421702;3670552	3207317;3207317;3215632;3216344;3216968;3421901;3671498	-;-;-;-;-;-;-	6094	Xkr4	protein_coding	6	9	6	13	12	11
ENSMUSG00000102851	1	3252757	3253236	+	480	Gm18956	processed_pseudogene	0	0	0	0	0	0
ENSMUSG00000103377	1	3365731	3368549	-	2819	Gm37180	TEC	0	0	0	0	0	0
ENSMUSG00000104017	1	3375556	3377788	-	2233	Gm37363	TEC	0	0	0	0	0	0

==> tests/rename_featurecounts_FPKM.txt <==
Geneid	Chr	Start	End	Strand	Length	gene_name	gene_biotype	SRR12678525	SRR12678526	SRR12678527	SRR12678528	SRR12678529	SRR12678530
ENSMUSG00000102693	1	3073253	3074322	+	1070	4933401J01Rik	TEC	0.0	0.0	0.0	0.0	0.0	0.0
ENSMUSG00000064842	1	3102016	3102125	+	110	Gm26206	snRNA	0.0	0.0	0.0	0.0	0.0	0.0
ENSMUSG00000051951	1;1;1;1;1;1;1	3205901;3206523;3213439;3213609;3214482;3421702;3670552	3207317;3207317;3215632;3216344;3216968;3421901;3671498	-;-;-;-;-;-;-	6094	Xkr4	protein_coding	164.0958319658681	164.09583196586806	164.0958319658681	164.0958319658681	164.0958319658681	164.0958319658681
ENSMUSG00000102851	1	3252757	3253236	+	480	Gm18956	processed_pseudogene	0.0	0.0	0.0	0.0	0.0	0.0
ENSMUSG00000103377	1	3365731	3368549	-	2819	Gm37180	TEC	0.0	0.0	0.0	0.0	0.0	0.0
ENSMUSG00000104017	1	3375556	3377788	-	2233	Gm37363	TEC	0.0	0.0	0.0	0.0	0.0	0.0

==> tests/rename_featurecounts_TPM.txt <==
Geneid	Chr	Start	End	Strand	Length	gene_name	gene_biotype	SRR12678525	SRR12678526	SRR12678527	SRR12678528	SRR12678529	SRR12678530
ENSMUSG00000102693	1	3073253	3074322	+	1070	4933401J01Rik	TEC	0.0	0.0	0.0	0.0	0.0	0.0
ENSMUSG00000064842	1	3102016	3102125	+	110	Gm26206	snRNA	0.0	0.0	0.0	0.0	0.0	0.0
ENSMUSG00000051951	1;1;1;1;1;1;1	3205901;3206523;3213439;3213609;3214482;3421702;3670552	3207317;3207317;3215632;3216344;3216968;3421901;3671498	-;-;-;-;-;-;-	6094	Xkr4	protein_coding	1000000.0	1000000.0	1000000.0	1000000.0	1000000.0	1000000.0
ENSMUSG00000102851	1	3252757	3253236	+	480	Gm18956	processed_pseudogene	0.0	0.0	0.0	0.0	0.0	0.0
ENSMUSG00000103377	1	3365731	3368549	-	2819	Gm37180	TEC	0.0	0.0	0.0	0.0	0.0	0.0
ENSMUSG00000104017	1	3375556	3377788	-	2233	Gm37363	TEC	0.0	0.0	0.0	0.0	0.0	0.0
{% endraw %} {% raw %}
df=pd.read_csv(prefix+'_featurecounts.txt',sep='\t')
samples=['SRR12678525', 'SRR12678526', 'SRR12678527', 'SRR12678528', 'SRR12678529', 'SRR12678530']
{% endraw %} {% raw %}
fc_count2tpm(df,samples)
Geneid Chr Start End Strand Length gene_name gene_biotype SRR12678525 SRR12678526 SRR12678527 SRR12678528 SRR12678529 SRR12678530
0 ENSMUSG00000102693 1 3073253 3074322 + 1070 4933401J01Rik TEC 0.0 0.0 0.0 0.0 0.0 0.0
1 ENSMUSG00000064842 1 3102016 3102125 + 110 Gm26206 snRNA 0.0 0.0 0.0 0.0 0.0 0.0
2 ENSMUSG00000051951 1;1;1;1;1;1;1 3205901;3206523;3213439;3213609;3214482;342170... 3207317;3207317;3215632;3216344;3216968;342190... -;-;-;-;-;-;- 6094 Xkr4 protein_coding 1000000.0 1000000.0 1000000.0 1000000.0 1000000.0 1000000.0
3 ENSMUSG00000102851 1 3252757 3253236 + 480 Gm18956 processed_pseudogene 0.0 0.0 0.0 0.0 0.0 0.0
4 ENSMUSG00000103377 1 3365731 3368549 - 2819 Gm37180 TEC 0.0 0.0 0.0 0.0 0.0 0.0
5 ENSMUSG00000104017 1 3375556 3377788 - 2233 Gm37363 TEC 0.0 0.0 0.0 0.0 0.0 0.0
{% endraw %} {% raw %}
fc_count2fpkm(df,samples)
Geneid Chr Start End Strand Length gene_name gene_biotype SRR12678525 SRR12678526 SRR12678527 SRR12678528 SRR12678529 SRR12678530
0 ENSMUSG00000102693 1 3073253 3074322 + 1070 4933401J01Rik TEC 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1 ENSMUSG00000064842 1 3102016 3102125 + 110 Gm26206 snRNA 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
2 ENSMUSG00000051951 1;1;1;1;1;1;1 3205901;3206523;3213439;3213609;3214482;342170... 3207317;3207317;3215632;3216344;3216968;342190... -;-;-;-;-;-;- 6094 Xkr4 protein_coding 164.095832 164.095832 164.095832 164.095832 164.095832 164.095832
3 ENSMUSG00000102851 1 3252757 3253236 + 480 Gm18956 processed_pseudogene 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
4 ENSMUSG00000103377 1 3365731 3368549 - 2819 Gm37180 TEC 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
5 ENSMUSG00000104017 1 3375556 3377788 - 2233 Gm37363 TEC 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
{% endraw %}