In [3]:
import anndata as ad
import scanpy.api as sc
import episcanpy.api as epi

Download the raw peak matrices here: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE111586

The data is composed of 2 replicates from mouse bone marrow (the two `BoneMarrow` peak matrices loaded below).

Load the first replicate

In [ ]:
# Read the raw peak count matrix of the first replicate.
file_name = "GSM3034623_BoneMarrow_62216.peakmatrix.txt"
path = ""  # defined as in the original cell; it is not passed to the reader below
# Original author's remark: the "bed" in read_mtx_bed does not refer to the BED file format.
adata = epi.pp.read_mtx_bed(file_name)
In [ ]:
# Show the summary of the object loaded for the first replicate.
adata
In [9]:
# Load the sample metadata of the first replicate into the .obs space of adata.
# The metadata file is parsed with a tab separator.
metadata_file = 'BoneMarrow62216_metadata.txt'
epi.pp.load_metadata(adata, metadata_file, separator='\t')
2.3262939453125 seconds
In [14]:
# Compute the per-feature commonness on the first replicate.
# Presumably fills adata.var["commonness"], the column name the peak-filtering
# cell reads on the merged matrix — confirm against the episcanpy docs.
epi.pp.commoness_features(adata)
In [ ]:
# Compute the per-cell coverage on the first replicate.
# NOTE(review): the default name of the column this adds is not visible in this
# notebook — check the episcanpy docs before relying on it.
epi.pp.coverage_cells(adata)
In [ ]:
# If the count matrix is not binary, consider making it binary.
# NOTE(review): this runs after the commonness/coverage cells above, and no
# matching binarize call on the second replicate (adata2) appears in the
# cells shown before the merge — confirm that both are intended.
epi.pp.binarize(adata)
In [ ]:
# Save the first replicate as an .h5ad file, then read it back from the same
# file into `adata`.
adata.write('matrices/bone_marrow_62216_raw.h5ad')
adata = ad.read('matrices/bone_marrow_62216_raw.h5ad')

There is a second replicate to load

In [23]:
# Read the raw peak count matrix of the second replicate.
path=''
file_name2 = 'GSM3034622_BoneMarrow_62016.peakmatrix.txt'
# Call the reader through the episcanpy namespace, as done for the first
# replicate: no bare `read_mtx_bed` is imported in this notebook, so the
# unqualified name raises NameError on a fresh kernel.
adata2 = epi.pp.read_mtx_bed(file_name2)
896.963546037674 seconds
In [ ]:
# Save the second replicate as an .h5ad file, then read it back from the same
# file into `adata2`.
adata2.write('matrices/bone_marrow_62016_raw.h5ad')
adata2 = ad.read('matrices/bone_marrow_62016_raw.h5ad')
In [ ]:
# Load the sample metadata of the second replicate into adata2
# (tab-separated file). Note that `metadata_file` is reassigned here and no
# longer points at the first replicate's metadata.
metadata_file = 'BoneMarrow62016_metadata.txt'
epi.pp.load_metadata(adata2, metadata_file, separator='\t')

Merging the 2 replicates

In [ ]:
# Merge the two replicates into a single AnnData object.
adata3 = ad.AnnData.concatenate(adata, adata2)
# Restore the original cell names: first replicate followed by the second.
# The list needs exactly one name per cell of adata3, so it is built from
# adata and adata2 (building it from adata3 itself yields too many names and
# the assignment fails).
# NOTE(review): assumes cell names do not collide between the two replicates;
# otherwise the merged obs_names will not be unique — confirm.
adata3.obs_names = list(adata.obs_names)+list(adata2.obs_names)
In [ ]:
# Compute the per-cell coverage on the merged matrix.
epi.pp.coverage_cells(adata3)
In [ ]:
# Compute the per-feature commonness on the merged matrix.
# Presumably fills adata3.var["commonness"], which the peak-filtering cell
# reads — confirm against the episcanpy docs.
epi.pp.commoness_features(adata3)
In [ ]:
# Save the merged matrix as an .h5ad file, then read it back from the same
# file into `adata3`.
adata3.write('matrices/bone_marrow_merged_62016_and_62216_raw.h5ad')
adata3 = ad.read('matrices/bone_marrow_merged_62016_and_62216_raw.h5ad')

Filtering the merged count matrix

In [ ]:
# Keep only the peaks whose "commonness" value is at least 150
# (i.e. peaks shared by at least 150 cells).
# .copy() makes adata4 an independent AnnData object rather than a view of
# adata3: the following cells write to adata4, and writing to a view triggers
# an implicit copy with a warning.
adata4 = adata3[:,adata3.var["commonness"] >= 150].copy()
In [ ]:
# Show the summary of the peak-filtered object.
adata4
In [ ]:
# Recompute the per-cell coverage on the reduced peak set and store it under
# "sum_red_peaks", the obs column that the regression and cell-filtering cells
# read. The keyword is `key_added`; the misspelled keyword is rejected with a
# TypeError.
epi.pp.coverage_cells(adata4, key_added="sum_red_peaks")
In [ ]:
# This is an important step: the first PC corresponds to the peak coverage,
# so the per-cell coverage column "sum_red_peaks" is regressed out here.
sc.pp.regress_out(adata4, "sum_red_peaks")
In [ ]:
# Despite the regression on the number of peaks covered, some cells are too
# lowly covered to be considered: keep only the cells with at least 500 peaks
# covered ("sum_red_peaks" >= 500).
# .copy() makes adata5 an independent AnnData object rather than a view of
# adata4, since the following cells modify adata5.
adata5 = adata4[adata4.obs["sum_red_peaks"] >= 500,:].copy()
In [ ]:
# Show the summary of the fully filtered object.
adata5
In [ ]:
# Plot the resulting matrix and adjust the filtering above if it looks odd.
# NOTE(review): presumably this computes the PCA / neighbour graph / embedding
# that the UMAP plot in the next cell relies on — confirm against the
# episcanpy `lazy` documentation, including the accepted keyword names.
epi.pp.lazy(adata5, n_comps=100, n_neighbors=50, nb_pcs=20)
In [ ]:
# Cluster the cells with Louvain; `sc.tl.louvain` is the scanpy function name
# (the previous spelling does not exist and raises AttributeError).
sc.tl.louvain(adata5)
# Colour the UMAP by the "louvain" labels produced by the clustering above.
sc.pl.umap(adata5, color="louvain")
In [ ]:
# If it looks good, save the final filtered matrix.
# NOTE(review): this path starts with '../matrices/' whereas the earlier cells
# write to 'matrices/' — confirm which directory is intended.
adata5.write('../matrices/bone_marrow_62216_62016_com150_peaks_regressed_red_cov500_final_filtered.h5ad')

Now that you have a processed matrix, you can attempt to identify cell types. For this, check out the corresponding tutorial.