Update nextflow pipeline to generate summary stats

icbi-lab · Aug 10, 2021 · 149ad24 · 149ad24
1 parent 01664ea
commit 149ad24
Show file tree

Hide file tree

Showing 6 changed files with 132 additions and 10 deletions.
diff --git a/analyses/01_process_counts.Rmd b/analyses/01_process_counts.Rmd
@@ -5,7 +5,7 @@ jupyter:
       extension: .Rmd
       format_name: rmarkdown
       format_version: '1.2'
-      jupytext_version: 1.10.0
+      jupytext_version: 1.11.2
   kernelspec:
     display_name: Python [conda env:.conda-vanderburg_scanpy]
     language: python

diff --git a/analyses/02_filter_data.Rmd b/analyses/02_filter_data.Rmd
@@ -5,7 +5,7 @@ jupyter:
       extension: .Rmd
       format_name: rmarkdown
       format_version: '1.2'
-      jupytext_version: 1.10.0
+      jupytext_version: 1.11.2
   kernelspec:
     display_name: Python [conda env:.conda-vanderburg_scanpy]
     language: python

diff --git a/analyses/03_normalize.Rmd b/analyses/03_normalize.Rmd
@@ -5,7 +5,7 @@ jupyter:
       extension: .Rmd
       format_name: rmarkdown
       format_version: '1.2'
-      jupytext_version: 1.10.0
+      jupytext_version: 1.11.2
   kernelspec:
     display_name: Python [conda env:.conda-vanderburg_scanpy]
     language: python
@@ -25,8 +25,10 @@ In this notebook, we will
 
 ```{python tags=c("parameters")}
 input_file = "../results/02_filter_data/adata.h5ad"
+adata_unfiltered_file = "../results/01_process_data/adata.h5ad"
 tables_dir = "../tables"
 output_file = "tmp/adata.h5ad"
+output_file_stats = "tmp/quality_stats.csv"
 doublet_file = f"{tables_dir}/is_doublet.npy"
 ```
 
@@ -63,6 +65,10 @@ pca_file = os.path.join(tables_dir, "adata_pca.pkl.gz")
 adata = sc.read_h5ad(input_file)
 ```
 
+```{python}
+adata_unfiltered = sc.read_h5ad(adata_unfiltered_file)
+```
+
 ### Load doublets precomputed by solo
 We don't run `solo` as part of the pipeline, as the results
 are not reproducible on different systems. Instead, 
@@ -129,6 +135,118 @@ adata = adata[~adata.obs["is_doublet"], :].copy()
 print_dim(adata)
 ```
 
+## Summary statistics
+
+Generate a summary table with
+ * total number of sequenced reads per sample
+ * total number of uniquely mapped reads per sample
+ * total number of called cells (quality control and filtering for e.g., possible cell doublets, potential apoptotic cells)
+ * median number and range of uniquely maped reads per called cell 
+ * median number and range of detected genes per called cell
+ * median number and range of detected genes per called cell
+
+The first two metrics are based on the raw files (FASTQ and BAM), i.e. before UMI deduplication and read in from precomputed tables. The other columns are based on the counts produced by cellranger and computed on the anndata object. 
+
+
+```{python}
+sequenced_reads = pd.read_csv(
+    f"{tables_dir}/summary_fastq_counts.txt", names=["samples", "total_sequenced_reads"]
+)
+sequenced_reads["samples"] = [f"H{x}" for x in sequenced_reads["samples"]]
+sequenced_reads.set_index("samples", inplace=True)
+```
+
+```{python}
+uniquely_mapped_reads = pd.read_csv(
+    f"{tables_dir}/summary_uniquely_mapped_reads.txt",
+    names=["samples", "total_uniquely_mapped_reads"],
+)
+uniquely_mapped_reads["samples"] = [f"H{x}" for x in uniquely_mapped_reads["samples"]]
+uniquely_mapped_reads.set_index("samples", inplace=True)
+```
+
+```{python}
+called_cells = (
+    adata.obs.groupby("samples")
+    .size()
+    .reset_index(name="total_called_cells")
+    .set_index("samples")
+)
+```
+
+### Get fraction of ribosomal genes
+need to revert to unfiltered anndata object to get stats on ribosomal genes as we removed them earlier. 
+The statistics need to be computed on "called cells" (after doublet filtering), so we can't compute the stats in the earlier notebook either. 
+
+```{python}
+ribo_genes = pd.read_csv(
+    os.path.join(tables_dir, "ribosomal_genes.tsv"), sep="\t", comment="#"
+)["Approved symbol"].values
+```
+
+```{python}
+adata.shape
+```
+
+```{python}
+# only keep 'called cells'
+adata_unfiltered = adata_unfiltered[adata.obs_names, :].copy()
+adata_unfiltered.shape
+```
+
+```{python}
+adata_unfiltered.var["is_ribo"] = adata_unfiltered.var_names.isin(ribo_genes)
+```
+
+```{python}
+adata.obs["ribo_frac"] = np.sum(
+    adata_unfiltered[:, adata_unfiltered.var["is_ribo"]].X, axis=1
+) / np.sum(adata_unfiltered.X, axis=1)
+```
+
+### Compute stats by aggregating `obs`
+
+```{python}
+gene_stats = adata.obs.groupby("samples").agg(
+    median_genes=pd.NamedAgg(column="n_genes", aggfunc="median"),
+    min_genes=pd.NamedAgg(column="n_genes", aggfunc="min"),
+    max_genes=pd.NamedAgg(column="n_genes", aggfunc="max"),
+    median_uniquely_mapped_reads=pd.NamedAgg(column="n_counts", aggfunc="median"),
+    min_uniquely_mapped_reads=pd.NamedAgg(column="n_counts", aggfunc="min"),
+    max_uniquely_mapped_reads=pd.NamedAgg(column="n_counts", aggfunc="max"),
+    median_ribosomal_read_fraction=pd.NamedAgg(column="ribo_frac", aggfunc="median"),
+    min_ribosomal_read_fraction=pd.NamedAgg(column="ribo_frac", aggfunc="min"),
+    max_ribosomal_read_fraction=pd.NamedAgg(column="ribo_frac", aggfunc="max"),
+)
+```
+
+```{python}
+quality_table = sequenced_reads.join(uniquely_mapped_reads).join(called_cells)
+quality_table["median_uniquely_mapped_reads_per_cell"] = [
+    f'{gene_stats.loc[s, "median_uniquely_mapped_reads"]:.0f} '
+    f'({gene_stats.loc[s, "min_uniquely_mapped_reads"]:.0f} - '
+    f'{gene_stats.loc[s, "max_uniquely_mapped_reads"]:.0f})'
+    for s in quality_table.index
+]
+quality_table["median_rrna_rate_per_cell"] = [
+    f'{gene_stats.loc[s, "median_ribosomal_read_fraction"]:.2f} '
+    f'({gene_stats.loc[s, "min_ribosomal_read_fraction"]:.2f} - '
+    f'{gene_stats.loc[s, "max_ribosomal_read_fraction"]:.2f})'
+    for s in quality_table.index
+]
+quality_table["median_detected_genes_per_cell"] = [
+    f'{gene_stats.loc[s, "median_genes"]:.0f} '
+    f'({gene_stats.loc[s, "min_genes"]:.0f} - '
+    f'{gene_stats.loc[s, "max_genes"]:.0f})'
+    for s in quality_table.index
+]
+quality_table
+```
+
+```{python}
+quality_table.to_csv(output_file_stats)
+```
+
 ## Compute highly variable genes
 
 ```{python}

diff --git a/analyses/04_annotate_cell_types.Rmd b/analyses/04_annotate_cell_types.Rmd
@@ -5,7 +5,7 @@ jupyter:
       extension: .Rmd
       format_name: rmarkdown
       format_version: '1.2'
-      jupytext_version: 1.10.0
+      jupytext_version: 1.11.2
   kernelspec:
     display_name: Python [conda env:.conda-vanderburg_scanpy]
     language: python

diff --git a/analyses/05_prepare_adata_nk_t.Rmd b/analyses/05_prepare_adata_nk_t.Rmd
@@ -7,7 +7,7 @@ jupyter:
       extension: .Rmd
       format_name: rmarkdown
       format_version: '1.2'
-      jupytext_version: 1.10.0
+      jupytext_version: 1.11.2
   kernelspec:
     display_name: Python [conda env:.conda-vanderburg_scanpy]
     language: python

diff --git a/main.nf b/main.nf
@@ -19,7 +19,7 @@ process p01_process_data {
         file 'notebook.Rmd' from Channel.fromPath("analyses/${id}.Rmd")
 
     output:
-        file "adata.h5ad" into process_data_adata
+        file "adata.h5ad" into process_data_adata, process_data_adata_2
         file "${id}.html" into process_data_html
 
     """
@@ -97,18 +97,20 @@ process p03_normalize {
         file "is_doublet.npy" from Channel.fromPath("tables/is_doublet.npy")
         file 'lib/*' from Channel.fromPath("lib/{jupytertools,scio,scpp}.py").collect()
         file 'tables/*' from Channel.fromPath(
-            "tables/{biomart.tsv,cell_cycle_regev.tsv,adata_pca.pkl.gz}"
+            "tables/{biomart.tsv,cell_cycle_regev.tsv,adata_pca.pkl.gz,summary*.txt,ribosomal_genes.tsv}"
         ).collect()
         file 'notebook.Rmd' from Channel.fromPath("analyses/${id}.Rmd")
         file 'input_adata.h5ad' from filter_data_adata_2
+        file 'adata_unfiltered.h5ad' from  process_data_adata_2
 
     output:
         file "adata.h5ad" into correct_data_adata
         file "${id}.html" into correct_data_html
+        file "quality_stats.csv" into correct_data_quality_stats
 
     """
     execute_notebook.sh ${id} ${task.cpus} notebook.Rmd \\
-       "-r input_file input_adata.h5ad -r output_file adata.h5ad -r tables_dir tables -r doublet_file is_doublet.npy"
+       "-r input_file input_adata.h5ad -r output_file adata.h5ad -r tables_dir tables -r doublet_file is_doublet.npy -r adata_unfiltered_file adata_unfiltered.h5ad -r output_file_stats quality_stats.csv"
     """
 }
 
@@ -243,20 +245,22 @@ process deploy {
             process_data_html,
             filter_data_html,
             correct_data_html,
+            correct_data_quality_stats,
             annotate_cell_types_html,
             prepare_adata_t_nk_html,
             nkg2a_html,
             nkg2a_figures,
             nkg2a_de_analysis,
-            nkg2a_de_analysis_zip
+            nkg2a_de_analysis_zip,
         ).collect()
 
     output:
         file "*.html"
         file "*.zip"
+        file "*.csv"
 
     """
-    cp input/*.{html,zip} .
+    cp input/*.{html,zip,csv} .
     """
 }