import os

import anndata as ad
import episcanpy as epi
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
/Users/anna.danese/anaconda3/envs/new_snapatac/lib/python3.6/site-packages/anndata/core/anndata.py:17: FutureWarning: pandas.core.index is deprecated and will be removed in a future version. The public classes are available in the top-level namespace. from pandas.core.index import RangeIndex /Users/anna.danese/anaconda3/envs/new_snapatac/lib/python3.6/site-packages/scanpy/api/__init__.py:6: FutureWarning: In a future version of Scanpy, `scanpy.api` will be removed. Simply use `import scanpy as sc` and `import scanpy.external as sce` instead. FutureWarning
# figure settings (applied through scanpy for the plots produced below)
sc.set_figure_params(
    scanpy=True,
    dpi=80,
    dpi_save=250,
    frameon=True,
    vector_friendly=True,
    color_map=None,
    format='pdf',
    transparent=False,
    ipython_format='png2x',
)
Example data to play with are available at: https://www.dropbox.com/sh/zwmtsj6c85woyqp/AADSCz0QVgVtammgMjasK3R4a?dl=0
# paths of the input and output directories
DATADIR = ''
OUT_DATADIR = './paper_data_methylation_episcanpy/'

# make sure the output directory exists before anything is written into it
os.makedirs(OUT_DATADIR, exist_ok=True)

# if you want to load the raw count matrix and impute it at the same time
#adata = epi.pp.readandimputematrix(DATADIR+'enhancer_c1_CG_paper.txt', min_coverage=500)
# to load the data without imputation
adata = epi.pp.load_met_noimput(os.path.join(DATADIR, 'enhancer_c1_CG_paper.txt'))

# store the data in AnnData's .h5ad format instead of the tab separated text
# file, then reload it from that file
h5ad_path = os.path.join(OUT_DATADIR, 'enhancer_c1_CG_paper.h5ad')
adata.write(h5ad_path)
adata = ad.read_h5ad(h5ad_path)  # `ad.read` is a deprecated alias of `read_h5ad`
adata
AnnData object with n_obs × n_vars = 3379 × 55017 obs: 'cell_type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Total reads', 'Mapped reads', 'Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate\n', 'binary' uns: 'Estimated mCG', 'Estimated mCH', 'mCCC', 'mCG', 'mCH', 'omic'
# adding QC values
# adata.X is walked row by row (cells) and, transposed, column by column
# (features); a NaN entry counts as "not covered".
n_cells, n_features = adata.X.shape
feature_columns = adata.X.T

# coverage = number of non-NaN entries
adata.obs['coverage_cells'] = [
    n_features - np.isnan(cell_values).sum() for cell_values in adata.X
]
adata.var['coverage_feature'] = [
    n_cells - np.isnan(feature_values).sum() for feature_values in feature_columns
]

# mean = NaN-ignoring sum divided by the TOTAL number of entries, so missing
# values effectively count as zero.
# NOTE(review): this ties the "mean" to coverage; np.nanmean would give the
# mean over covered entries only -- confirm which one is intended.
adata.obs['mean_cell_methylation'] = [
    np.nansum(cell_values) / n_features for cell_values in adata.X
]
adata.var['mean_feature_methylation'] = [
    np.nansum(feature_values) / n_cells for feature_values in feature_columns
]
adata.obs
cell_type | Library pool | Index i5 | Index i7 | i5 sequence | i7 sequence | random primer index | random primer index sequence | Total reads | Mapped reads | ... | mCCC/CCC | mCG/CG | mCH/CH | Estimated mCG/CG | Estimated mCH/CH | Neuron type | tSNE x coordinate | tSNE y coordinate\n | binary | coverage_cells | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | |||||||||||||||||||||
allc_Pool_1000_AD002_indexed_qual30.tsv | mL5-1 | K1K2 | D508 | D711 | GTACTGAC | TCTCGCGC | AD002 | CGATGT | 3749296 | 1970400 | ... | 278 | 1388 | 1138 | 148 | 1022 | mL5-1 | -2,79539 | -0,105851\n | inhibitory | 21715 |
allc_nuclei_600_S147_L006_qual30.tsv | mL5-1 | N/A | D508 | D712 | GTACTGAC | AGCGATAG | AD012 | CTTGTA | 7069454 | 4358703 | ... | 208 | 238 | 1174 | 23 | 1114 | mL5-1 | -0,352829 | 0,932857\n | inhibitory | 9976 |
allc_nuclei_599_S146_L006_qual30.tsv | mL6-2 | N/A | D507 | D712 | CAGGACGT | AGCGATAG | AD011 | GGCTAC | 18242158 | 11386231 | ... | 88 | 1644 | 273 | 179 | 296 | mL6-2 | 10,0546 | -9,78996\n | inhibitory | 20162 |
allc_nuclei_598_S145_L006_qual30.tsv | mL6-1 | N/A | D506 | D712 | TAATCTTA | AGCGATAG | AD010 | TAGCTT | 7045252 | 4197889 | ... | 18 | 2750 | 887 | 2765 | 0 | mL6-1 | 9,37005 | 2,71283\n | inhibitory | 10374 |
allc_nuclei_597_S144_L006_qual30.tsv | mL4 | N/A | D505 | D712 | AGGCGAAG | AGCGATAG | AD008 | ACTTGA | 15116650 | 9483039 | ... | 149 | 2213 | 702 | 238 | 70 | mL4 | -4,17269 | -2,94523\n | inhibitory | 17226 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
allc_Pool_839_AD006_indexed_qual30.tsv | mPv | I1I2 | D507 | D712 | CAGGACGT | AGCGATAG | AD006 | GCCAAT | 6760981 | 2875103 | ... | 205 | 244 | 891 | 2264 | 821 | mPv | -0,198246 | 19,7879\n | inhibitory | 28767 |
allc_Pool_83_AD002_indexed_qual30.tsv | mL2/3 | A1A2 | D503 | D711 | CCTATCCT | TCTCGCGC | AD002 | CGATGT | 5254487 | 2706493 | ... | 141 | 528 | 494 | 541 | 484 | mL2/3 | -13,1049 | 0,642965\n | inhibitory | 27409 |
allc_Pool_83_AD006_indexed_qual30.tsv | mL4 | A1A2 | D503 | D711 | CCTATCCT | TCTCGCGC | AD006 | GCCAAT | 5332867 | 1913346 | ... | 93 | 1060 | 228 | 1071 | 230 | mL4 | -6,94662 | -9,61211\n | inhibitory | 26023 |
allc_Pool_995_AD002_indexed_qual30.tsv | mL6-2 | K1K2 | D503 | D711 | CCTATCCT | TCTCGCGC | AD002 | CGATGT | 3508294 | 1813927 | ... | 273 | 2144 | 1038 | 2156 | 93 | mL6-2 | 14,0262 | -4,28728\n | inhibitory | 20658 |
allc_nuclei-319_S59_L002_qual30_1.tsv | mL4 | 20160217 pool B | D503 | D708 | CCTATCCT | TAATGCGC | 3480402 | 1625695 | ... | 15 | 2158 | 597 | 226 | 582 | mL4 | -4,58041 | -5,07954\n | inhibitory | 1486 |
3379 rows × 21 columns
# number of features covered in each cell
plt.axvline(x=10000, color='r')  # minimum number of features covered per cell
plt.hist(adata.obs['coverage_cells'], bins=50)
plt.show()

# number of cells covering each feature
plt.axvline(x=500, color='r')  # minimum number of cells covered to keep a feature
# NOTE(review): the style is set after the axes already exist (created by
# axvline), so it may only take effect for later figures -- confirm.
sns.set_style('whitegrid')
# NOTE(review): `bw` appears to have been superseded by `bw_method`/`bw_adjust`
# in newer seaborn releases -- check against the installed version.
sns.kdeplot(np.array(adata.var['coverage_feature']), bw=0.5)
plt.show()
<matplotlib.axes._subplots.AxesSubplot at 0x1354389e8>
# quick look at the different cell metadata available
# (one row per cell; by now it also holds the QC columns 'coverage_cells' and
# 'mean_cell_methylation' added earlier in this file)
adata.obs
cell_type | Library pool | Index i5 | Index i7 | i5 sequence | i7 sequence | random primer index | random primer index sequence | Total reads | Mapped reads | ... | mCG/CG | mCH/CH | Estimated mCG/CG | Estimated mCH/CH | Neuron type | tSNE x coordinate | tSNE y coordinate\n | binary | coverage_cells | mean_cell_methylation | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | |||||||||||||||||||||
allc_Pool_1000_AD002_indexed_qual30.tsv | mL5-1 | K1K2 | D508 | D711 | GTACTGAC | TCTCGCGC | AD002 | CGATGT | 3749296 | 1970400 | ... | 1388 | 1138 | 148 | 1022 | mL5-1 | -2,79539 | -0,105851\n | inhibitory | 21715 | 0.243341 |
allc_nuclei_600_S147_L006_qual30.tsv | mL5-1 | N/A | D508 | D712 | GTACTGAC | AGCGATAG | AD012 | CTTGTA | 7069454 | 4358703 | ... | 238 | 1174 | 23 | 1114 | mL5-1 | -0,352829 | 0,932857\n | inhibitory | 9976 | 0.110672 |
allc_nuclei_599_S146_L006_qual30.tsv | mL6-2 | N/A | D507 | D712 | CAGGACGT | AGCGATAG | AD011 | GGCTAC | 18242158 | 11386231 | ... | 1644 | 273 | 179 | 296 | mL6-2 | 10,0546 | -9,78996\n | inhibitory | 20162 | 0.216064 |
allc_nuclei_598_S145_L006_qual30.tsv | mL6-1 | N/A | D506 | D712 | TAATCTTA | AGCGATAG | AD010 | TAGCTT | 7045252 | 4197889 | ... | 2750 | 887 | 2765 | 0 | mL6-1 | 9,37005 | 2,71283\n | inhibitory | 10374 | 0.148192 |
allc_nuclei_597_S144_L006_qual30.tsv | mL4 | N/A | D505 | D712 | AGGCGAAG | AGCGATAG | AD008 | ACTTGA | 15116650 | 9483039 | ... | 2213 | 702 | 238 | 70 | mL4 | -4,17269 | -2,94523\n | inhibitory | 17226 | 0.199148 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
allc_Pool_839_AD006_indexed_qual30.tsv | mPv | I1I2 | D507 | D712 | CAGGACGT | AGCGATAG | AD006 | GCCAAT | 6760981 | 2875103 | ... | 244 | 891 | 2264 | 821 | mPv | -0,198246 | 19,7879\n | inhibitory | 28767 | 0.393699 |
allc_Pool_83_AD002_indexed_qual30.tsv | mL2/3 | A1A2 | D503 | D711 | CCTATCCT | TCTCGCGC | AD002 | CGATGT | 5254487 | 2706493 | ... | 528 | 494 | 541 | 484 | mL2/3 | -13,1049 | 0,642965\n | inhibitory | 27409 | 0.275200 |
allc_Pool_83_AD006_indexed_qual30.tsv | mL4 | A1A2 | D503 | D711 | CCTATCCT | TCTCGCGC | AD006 | GCCAAT | 5332867 | 1913346 | ... | 1060 | 228 | 1071 | 230 | mL4 | -6,94662 | -9,61211\n | inhibitory | 26023 | 0.279163 |
allc_Pool_995_AD002_indexed_qual30.tsv | mL6-2 | K1K2 | D503 | D711 | CCTATCCT | TCTCGCGC | AD002 | CGATGT | 3508294 | 1813927 | ... | 2144 | 1038 | 2156 | 93 | mL6-2 | 14,0262 | -4,28728\n | inhibitory | 20658 | 0.248668 |
allc_nuclei-319_S59_L002_qual30_1.tsv | mL4 | 20160217 pool B | D503 | D708 | CCTATCCT | TAATGCGC | 3480402 | 1625695 | ... | 2158 | 597 | 226 | 582 | mL4 | -4,58041 | -5,07954\n | inhibitory | 1486 | 0.017435 |
3379 rows × 22 columns
# summary of the AnnData object: dimensions plus the obs / var / uns keys
adata
AnnData object with n_obs × n_vars = 3379 × 55017 obs: 'cell_type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Total reads', 'Mapped reads', 'Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate\n', 'binary', 'coverage_cells', 'mean_cell_methylation' var: 'coverage_feature', 'mean_feature_methylation' uns: 'Estimated mCG', 'Estimated mCH', 'mCCC', 'mCG', 'mCH', 'omic'
# Plot coverage VS mean methylation level in cells
sc.pl.scatter(adata, 'coverage_cells', 'mean_cell_methylation', color='Neuron type')

# 'Mapped reads' is stored as a string annotation: convert it into int values
# BEFORE plotting, so the plot is drawn once, on a numeric axis
adata.obs['Mapped reads'] = [int(n_reads) for n_reads in adata.obs['Mapped reads']]
sc.pl.scatter(adata, 'coverage_cells', 'Mapped reads', color='Neuron type')

# Plot coverage VS mean methylation level in features
sc.pl.scatter(adata, 'coverage_feature', 'mean_feature_methylation')

# keep only the cells with more than 10000 enhancers covered
adata = adata[adata.obs['coverage_cells'] > 10000, :].copy()
adata
AnnData object with n_obs × n_vars = 3346 × 55017 obs: 'cell_type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Total reads', 'Mapped reads', 'Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate\n', 'binary', 'coverage_cells', 'mean_cell_methylation' var: 'coverage_feature', 'mean_feature_methylation' uns: 'Estimated mCG', 'Estimated mCH', 'mCCC', 'mCG', 'mCH', 'omic', 'Neuron type_colors'
# impute the missing methylation values (imputation_value='mean') and work on
# the returned copy.
# In the recorded output the number of features goes from 55017 to 54710 here;
# presumably features covered in fewer than `number_cell_covered` cells are
# dropped -- confirm against the episcanpy documentation.
adata = epi.pp.imputation_met(
    adata,
    number_cell_covered=500,
    imputation_value='mean',
    save=None,
    copy=True,
)
adata
AnnData object with n_obs × n_vars = 3346 × 54710 obs: 'cell_type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Total reads', 'Mapped reads', 'Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate\n', 'binary', 'coverage_cells', 'mean_cell_methylation' var: 'coverage_feature', 'mean_feature_methylation' uns: 'Estimated mCG', 'Estimated mCH', 'mCCC', 'mCG', 'mCH', 'omic', 'Neuron type_colors'
# recalculating QC values on the filtered matrix
# NOTE(review): if the imputation step earlier in this file filled every
# missing value, no NaN is left and the coverage columns become constant --
# confirm that overwriting the pre-imputation QC values is intended.
n_cells, n_features = adata.X.shape
feature_columns = adata.X.T

# coverage = number of non-NaN entries
adata.obs['coverage_cells'] = [
    n_features - np.isnan(cell_values).sum() for cell_values in adata.X
]
adata.var['coverage_feature'] = [
    n_cells - np.isnan(feature_values).sum() for feature_values in feature_columns
]

# mean = NaN-ignoring sum divided by the total number of entries
adata.obs['mean_cell_methylation'] = [
    np.nansum(cell_values) / n_features for cell_values in adata.X
]
adata.var['mean_feature_methylation'] = [
    np.nansum(feature_values) / n_cells for feature_values in feature_columns
]

# perform PCA, neighbor graph, tSNE and UMAP
epi.pp.lazy(adata)

# save intermediary file
#adata.write(OUT_DATADIR+'imputed_min500cells_min10000enhancers.h5ad')
adata
AnnData object with n_obs × n_vars = 3346 × 54710 obs: 'cell_type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Total reads', 'Mapped reads', 'Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate\n', 'binary', 'coverage_cells', 'mean_cell_methylation' var: 'coverage_feature', 'mean_feature_methylation' uns: 'Estimated mCG', 'Estimated mCH', 'mCCC', 'mCG', 'mCH', 'omic', 'Neuron type_colors', 'pca', 'neighbors' obsm: 'X_pca', 'X_tsne', 'X_umap' varm: 'PCs'
## Plot low dimensional representation (before normalisation of the number of reads per cell)
# technical annotations first, on the PCA and then on the UMAP
for plot_embedding in (sc.pl.pca, sc.pl.umap):
    plot_embedding(
        adata,
        color=[
            'Library pool', 'Mapped reads',
            'Index i5', 'Index i7',
            'i5 sequence', 'i7 sequence',
            'mCCC/CCC', 'mCG/CG', 'mCH/CH',
        ],
        wspace=0.8,
    )

# visualisation of known cell types
for plot_embedding in (sc.pl.pca, sc.pl.umap):
    plot_embedding(adata, color=['cell_type', "Neuron type"], wspace=0.8)
# Linear regression of the number of Mapped reads per cell.
sc.pp.regress_out(adata, 'Mapped reads')

# redoing PCA, neighborhood graph, tSNE and UMAP on the corrected matrix
epi.pp.lazy(adata)

# visualisation of the technical annotations, on the PCA and then on the UMAP
for plot_embedding in (sc.pl.pca, sc.pl.umap):
    plot_embedding(
        adata,
        color=[
            'Library pool', 'Mapped reads',
            'Index i5', 'Index i7',
            'i5 sequence', 'i7 sequence',
            'mCCC/CCC', 'mCG/CG', 'mCH/CH',
        ],
        wspace=0.8,
    )

# visualisation of known cell type annotation.
for plot_embedding in (sc.pl.pca, sc.pl.umap):
    plot_embedding(adata, color=['cell_type', "Neuron type"], wspace=0.8)
# recomputing the UMAP with min_dist=1 and plotting the known annotations on it
epi.tl.umap(adata, min_dist=1)
sc.pl.umap(
    adata,
    color=['cell_type', "Neuron type"],
    wspace=0.8,
)

# save the processed file.
processed_path = OUT_DATADIR + 'imputed_min500cells_min10000enhancers_normalised.h5ad'
adata.write(processed_path)