diff --git a/analyses/01_process_counts.Rmd b/analyses/01_process_counts.Rmd index be674b7..6b338d5 100644 --- a/analyses/01_process_counts.Rmd +++ b/analyses/01_process_counts.Rmd @@ -5,7 +5,7 @@ jupyter: extension: .Rmd format_name: rmarkdown format_version: '1.2' - jupytext_version: 1.10.0 + jupytext_version: 1.11.2 kernelspec: display_name: Python [conda env:.conda-vanderburg_scanpy] language: python diff --git a/analyses/02_filter_data.Rmd b/analyses/02_filter_data.Rmd index 3838eed..0502481 100644 --- a/analyses/02_filter_data.Rmd +++ b/analyses/02_filter_data.Rmd @@ -5,7 +5,7 @@ jupyter: extension: .Rmd format_name: rmarkdown format_version: '1.2' - jupytext_version: 1.10.0 + jupytext_version: 1.11.2 kernelspec: display_name: Python [conda env:.conda-vanderburg_scanpy] language: python diff --git a/analyses/03_normalize.Rmd b/analyses/03_normalize.Rmd index 78eb622..7fb6a14 100644 --- a/analyses/03_normalize.Rmd +++ b/analyses/03_normalize.Rmd @@ -5,7 +5,7 @@ jupyter: extension: .Rmd format_name: rmarkdown format_version: '1.2' - jupytext_version: 1.10.0 + jupytext_version: 1.11.2 kernelspec: display_name: Python [conda env:.conda-vanderburg_scanpy] language: python @@ -25,8 +25,10 @@ In this notebook, we will ```{python tags=c("parameters")} input_file = "../results/02_filter_data/adata.h5ad" +adata_unfiltered_file = "../results/01_process_data/adata.h5ad" tables_dir = "../tables" output_file = "tmp/adata.h5ad" +output_file_stats = "tmp/quality_stats.csv" doublet_file = f"{tables_dir}/is_doublet.npy" ``` @@ -63,6 +65,10 @@ pca_file = os.path.join(tables_dir, "adata_pca.pkl.gz") adata = sc.read_h5ad(input_file) ``` +```{python} +adata_unfiltered = sc.read_h5ad(adata_unfiltered_file) +``` + ### Load doublets precomputed by solo We don't run `solo` as part of the pipeline, as the results are not reproducible on different systems. Instead, @@ -129,6 +135,118 @@ adata = adata[~adata.obs["is_doublet"], :].copy() print_dim(adata) ``` +## Summary statistics + +Generate a summary table with + * total number of sequenced reads per sample + * total number of uniquely mapped reads per sample + * total number of called cells (quality control and filtering for e.g., possible cell doublets, potential apoptotic cells) + * median number and range of uniquely maped reads per called cell + * median number and range of detected genes per called cell + * median number and range of detected genes per called cell + +The first two metrics are based on the raw files (FASTQ and BAM), i.e. before UMI deduplication and read in from precomputed tables. The other columns are based on the counts produced by cellranger and computed on the anndata object. + + +```{python} +sequenced_reads = pd.read_csv( + f"{tables_dir}/summary_fastq_counts.txt", names=["samples", "total_sequenced_reads"] +) +sequenced_reads["samples"] = [f"H{x}" for x in sequenced_reads["samples"]] +sequenced_reads.set_index("samples", inplace=True) +``` + +```{python} +uniquely_mapped_reads = pd.read_csv( + f"{tables_dir}/summary_uniquely_mapped_reads.txt", + names=["samples", "total_uniquely_mapped_reads"], +) +uniquely_mapped_reads["samples"] = [f"H{x}" for x in uniquely_mapped_reads["samples"]] +uniquely_mapped_reads.set_index("samples", inplace=True) +``` + +```{python} +called_cells = ( + adata.obs.groupby("samples") + .size() + .reset_index(name="total_called_cells") + .set_index("samples") +) +``` + +### Get fraction of ribosomal genes +need to revert to unfiltered anndata object to get stats on ribosomal genes as we removed them earlier. +The statistics need to be computed on "called cells" (after doublet filtering), so we can't compute the stats in the earlier notebook either. + +```{python} +ribo_genes = pd.read_csv( + os.path.join(tables_dir, "ribosomal_genes.tsv"), sep="\t", comment="#" +)["Approved symbol"].values +``` + +```{python} +adata.shape +``` + +```{python} +# only keep 'called cells' +adata_unfiltered = adata_unfiltered[adata.obs_names, :].copy() +adata_unfiltered.shape +``` + +```{python} +adata_unfiltered.var["is_ribo"] = adata_unfiltered.var_names.isin(ribo_genes) +``` + +```{python} +adata.obs["ribo_frac"] = np.sum( + adata_unfiltered[:, adata_unfiltered.var["is_ribo"]].X, axis=1 +) / np.sum(adata_unfiltered.X, axis=1) +``` + +### Compute stats by aggregating `obs` + +```{python} +gene_stats = adata.obs.groupby("samples").agg( + median_genes=pd.NamedAgg(column="n_genes", aggfunc="median"), + min_genes=pd.NamedAgg(column="n_genes", aggfunc="min"), + max_genes=pd.NamedAgg(column="n_genes", aggfunc="max"), + median_uniquely_mapped_reads=pd.NamedAgg(column="n_counts", aggfunc="median"), + min_uniquely_mapped_reads=pd.NamedAgg(column="n_counts", aggfunc="min"), + max_uniquely_mapped_reads=pd.NamedAgg(column="n_counts", aggfunc="max"), + median_ribosomal_read_fraction=pd.NamedAgg(column="ribo_frac", aggfunc="median"), + min_ribosomal_read_fraction=pd.NamedAgg(column="ribo_frac", aggfunc="min"), + max_ribosomal_read_fraction=pd.NamedAgg(column="ribo_frac", aggfunc="max"), +) +``` + +```{python} +quality_table = sequenced_reads.join(uniquely_mapped_reads).join(called_cells) +quality_table["median_uniquely_mapped_reads_per_cell"] = [ + f'{gene_stats.loc[s, "median_uniquely_mapped_reads"]:.0f} ' + f'({gene_stats.loc[s, "min_uniquely_mapped_reads"]:.0f} - ' + f'{gene_stats.loc[s, "max_uniquely_mapped_reads"]:.0f})' + for s in quality_table.index +] +quality_table["median_rrna_rate_per_cell"] = [ + f'{gene_stats.loc[s, "median_ribosomal_read_fraction"]:.2f} ' + f'({gene_stats.loc[s, "min_ribosomal_read_fraction"]:.2f} - ' + f'{gene_stats.loc[s, "max_ribosomal_read_fraction"]:.2f})' + for s in quality_table.index +] +quality_table["median_detected_genes_per_cell"] = [ + f'{gene_stats.loc[s, "median_genes"]:.0f} ' + f'({gene_stats.loc[s, "min_genes"]:.0f} - ' + f'{gene_stats.loc[s, "max_genes"]:.0f})' + for s in quality_table.index +] +quality_table +``` + +```{python} +quality_table.to_csv(output_file_stats) +``` + ## Compute highly variable genes ```{python} diff --git a/analyses/04_annotate_cell_types.Rmd b/analyses/04_annotate_cell_types.Rmd index 1d71b75..8ddfbcb 100644 --- a/analyses/04_annotate_cell_types.Rmd +++ b/analyses/04_annotate_cell_types.Rmd @@ -5,7 +5,7 @@ jupyter: extension: .Rmd format_name: rmarkdown format_version: '1.2' - jupytext_version: 1.10.0 + jupytext_version: 1.11.2 kernelspec: display_name: Python [conda env:.conda-vanderburg_scanpy] language: python diff --git a/analyses/05_prepare_adata_nk_t.Rmd b/analyses/05_prepare_adata_nk_t.Rmd index 4a634b1..c1a3b0c 100644 --- a/analyses/05_prepare_adata_nk_t.Rmd +++ b/analyses/05_prepare_adata_nk_t.Rmd @@ -7,7 +7,7 @@ jupyter: extension: .Rmd format_name: rmarkdown format_version: '1.2' - jupytext_version: 1.10.0 + jupytext_version: 1.11.2 kernelspec: display_name: Python [conda env:.conda-vanderburg_scanpy] language: python diff --git a/main.nf b/main.nf index a2d0b8f..0b94cb3 100755 --- a/main.nf +++ b/main.nf @@ -19,7 +19,7 @@ process p01_process_data { file 'notebook.Rmd' from Channel.fromPath("analyses/${id}.Rmd") output: - file "adata.h5ad" into process_data_adata + file "adata.h5ad" into process_data_adata, process_data_adata_2 file "${id}.html" into process_data_html """ @@ -97,18 +97,20 @@ process p03_normalize { file "is_doublet.npy" from Channel.fromPath("tables/is_doublet.npy") file 'lib/*' from Channel.fromPath("lib/{jupytertools,scio,scpp}.py").collect() file 'tables/*' from Channel.fromPath( - "tables/{biomart.tsv,cell_cycle_regev.tsv,adata_pca.pkl.gz}" + "tables/{biomart.tsv,cell_cycle_regev.tsv,adata_pca.pkl.gz,summary*.txt,ribosomal_genes.tsv}" ).collect() file 'notebook.Rmd' from Channel.fromPath("analyses/${id}.Rmd") file 'input_adata.h5ad' from filter_data_adata_2 + file 'adata_unfiltered.h5ad' from process_data_adata_2 output: file "adata.h5ad" into correct_data_adata file "${id}.html" into correct_data_html + file "quality_stats.csv" into correct_data_quality_stats """ execute_notebook.sh ${id} ${task.cpus} notebook.Rmd \\ - "-r input_file input_adata.h5ad -r output_file adata.h5ad -r tables_dir tables -r doublet_file is_doublet.npy" + "-r input_file input_adata.h5ad -r output_file adata.h5ad -r tables_dir tables -r doublet_file is_doublet.npy -r adata_unfiltered_file adata_unfiltered.h5ad -r output_file_stats quality_stats.csv" """ } @@ -243,20 +245,22 @@ process deploy { process_data_html, filter_data_html, correct_data_html, + correct_data_quality_stats, annotate_cell_types_html, prepare_adata_t_nk_html, nkg2a_html, nkg2a_figures, nkg2a_de_analysis, - nkg2a_de_analysis_zip + nkg2a_de_analysis_zip, ).collect() output: file "*.html" file "*.zip" + file "*.csv" """ - cp input/*.{html,zip} . + cp input/*.{html,zip,csv} . """ }