Removing requirement to load libraries. All functions called with ref…

…erence to their namespace within the code.
carpentries-incubator · Oct 10, 2023 · d6cc834 · d6cc834
1 parent b6b8b6e
commit d6cc834
Show file tree

Hide file tree

Showing 5 changed files with 60 additions and 52 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/episodes/.DS_Store b/episodes/.DS_Store
diff --git a/episodes/episode4.Rmd b/episodes/episode4.Rmd
@@ -126,7 +126,9 @@ sprintf("The unique IDs that match the counts matrix are in column: %s", colname
 
 ```{r}
 
-suppressPackageStartupMessages(library(tidyverse, quietly = TRUE))
+# suppressPackageStartupMessages(library(dplyr, quietly = TRUE))
+# suppressPackageStartupMessages(library(ggplot2, quietly = TRUE))
+# suppressPackageStartupMessages(library(tibble, quietly = TRUE))
 
 ```
 
@@ -191,9 +193,13 @@ unique(c(samp.info.ibd.sel$sex, samp.info.ibd.sel$condition))
 
 * Both `sex` and `condition` are consistently encoded over all samples. We should however remove the spaces in the values for `sampleID` and for the `condition` value ulcerative colitis. Note: since we are changing our unique identifier, we'll need to make the same update later in the counts matrix.
 
+Before we do this, since we'll be using the pipe operator from the tidyverse libraries, we will to define the pipe operator from the magrittr package in the environment.
+
 
 ```{r}
 
+`%>%` <- magrittr::`%>%`
+
 samp.info.ibd.sel <- samp.info.ibd.sel %>%
                         dplyr::mutate(sampleID = gsub(" ", "_", sampleID),
                                       condition = gsub(" ", "_", condition))
@@ -212,7 +218,7 @@ samp.info.ibd.sel <- samp.info.ibd.sel %>%
 ```{r}
 
 samp.info.ibd.sel <- samp.info.ibd.sel %>%
-                        dplyr::mutate(class = if_else(condition == 'normal', -1, 1))
+                        dplyr::mutate(class = dplyr::if_else(condition == 'normal', -1, 1))
 
 ```
 
@@ -239,7 +245,7 @@ The two classes are approximately equally represented, so let's check everything
 
 ```{r}
 
-glimpse(samp.info.ibd.sel)
+dplyr::glimpse(samp.info.ibd.sel)
 
 ```
 
@@ -296,7 +302,7 @@ counts.mat.ibd <- raw.counts.ibd[-1,-1]
 
 rownames(counts.mat.ibd) <- NULL
 
-counts.mat.ibd <-  counts.mat.ibd %>% column_to_rownames('read')
+counts.mat.ibd <-  counts.mat.ibd %>% tibble::column_to_rownames('read')
 
 counts.mat.ibd[1:10,1:6]
 
@@ -402,7 +408,7 @@ It is crucial that you document your data reformatting so that it is reproducibl
 
 Take a look at [The Tuburculosis (TB) Dataset - E-MTAB-6845](https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-6845). You can download the sdrf file from zenodo and save it to your `data` directory by running the following code.
 
-```{r}
+```r
 
 download.file(url = "https://zenodo.org/record/8125141/files/E-MTAB-6845.sdrf.txt",
               destfile = "data/E-MTAB-6845.sdrf.txt")
@@ -466,7 +472,7 @@ dplyr::mutate(prog_status = factor(prog_status, levels = c("non_progressor","tb_
 dplyr::mutate(prog_status = dplyr::recode(prog_status,                                
               "non-progressor" = "non_progressor",
               "TB progressor" = "tb_progressor"), 
-              class = if_else(prog_status == 'non_progressor', -1, 1)                 
+              class = dplyr::if_else(prog_status == 'non_progressor', -1, 1)                 
               ) %>% 
 
 dplyr::distinct() %>%
@@ -499,7 +505,7 @@ samp.info.tb %>%
       dplyr::mutate(prog_status = dplyr::recode(prog_status,                                 # recode target variable
                     "non-progressor" = "non_progressor",
                     "TB progressor" = "tb_progressor"), 
-                    class = if_else(prog_status == 'non_progressor', -1, 1)                 # create numerical class
+                    class = dplyr::if_else(prog_status == 'non_progressor', -1, 1)                 # create numerical class
                     ) %>% 
 
        dplyr::mutate(prog_status = factor(prog_status, levels = c("non_progressor","tb_progressor")), 

diff --git a/episodes/episode5.Rmd b/episodes/episode5.Rmd
@@ -51,7 +51,9 @@ And now read the files into R...
 
 ```{r}
 
-suppressPackageStartupMessages(library(tidyverse, quietly = TRUE))
+# suppressPackageStartupMessages(library(dplyr, quietly = TRUE))
+# suppressPackageStartupMessages(library(ggplot2, quietly = TRUE))
+# suppressPackageStartupMessages(library(tibble, quietly = TRUE))
 
 ```
 
@@ -61,9 +63,6 @@ samp.info.ibd.sel <- read.table(file="./data/ibd.sample.info.txt", sep="\t", hea
 
 counts.mat.ibd <- read.table(file="./data/counts.mat.ibd.txt", sep='\t', header=T, fill=T, check.names=F)
 
-# head(counts.mat.ibd)
-# any(is.na(samp.info.ibd.sel))
-
 ```
 
 Run the following code to view the histogram giving the frequency of the maximum count for each gene in the sample (plotted on a log10 scale). You'll see in the resulting plot that over 800 genes have no counts for any gene (i.e., maximum = 0), and that there are hundreds of genes where the maximum count for the gene across all samples is below 10. For comparison, the median of these maximum counts across all genes is around 250. These low count genes are likely to represent technical noise.
@@ -72,12 +71,14 @@ A simple filtering approach is to remove all genes where the maximum read count
 
 ```{r}
 
+`%>%` <- magrittr::`%>%`
+
 data.frame(max_count = apply(counts.mat.ibd, 1, max, na.rm=TRUE)) %>% 
-  ggplot(aes(x = max_count)) + 
-    geom_histogram(bins = 200) + 
-    xlab("Max Counts (log10 scale)") + 
-    ylab("Frequency") +
-    scale_x_log10(n.breaks = 6, labels = scales::comma)
+  ggplot2::ggplot(ggplot2::aes(x = max_count)) + 
+    ggplot2::geom_histogram(bins = 200) + 
+    ggplot2::xlab("Max Counts (log10 scale)") + 
+    ggplot2::ylab("Frequency") +
+    ggplot2::scale_x_log10(n.breaks = 6, labels = scales::comma)
 
 ```
 
@@ -104,9 +105,6 @@ In order to be able to compare read counts between samples, we must first adjust
 
 ```{r}
 
-# load DESeq2 library
-suppressPackageStartupMessages(library(DESeq2))
-
 # convert the condition variable to a factor as required by DESeq2
 samp.info.ibd.sel[c('condition')] <- lapply(samp.info.ibd.sel[c('condition')], factor)
 
@@ -165,12 +163,12 @@ ms.jac = sapply(t.seq, function(t){
 
 # plot the threshold value against the value of the Multiset Jaccard index to visualise
 
-ggplot(data=data.frame(t = t.seq, jacc = ms.jac)) +
-            geom_line(aes(x=t, y=jacc)) +
-            geom_hline(yintercept = max(ms.jac), lty=2,col='gray') +
-            geom_point(aes(x=which.max(ms.jac), y=max(ms.jac)), col="red", cex=6, pch=1) +
-            xlab("Low Count Threshold") + 
-            ylab("Multiset Jaccard Index")
+ggplot2::ggplot(data=data.frame(t = t.seq, jacc = ms.jac)) +
+            ggplot2::geom_line(ggplot2::aes(x=t, y=jacc)) +
+            ggplot2::geom_hline(yintercept = max(ms.jac), lty=2,col='gray') +
+            ggplot2::geom_point(ggplot2::aes(x=which.max(ms.jac), y=max(ms.jac)), col="red", cex=6, pch=1) +
+            ggplot2::xlab("Low Count Threshold") + 
+            ggplot2::ylab("Multiset Jaccard Index")
 
 ```
 
@@ -266,6 +264,7 @@ h.threshold <- stats::qf(cooks.quantile, p, m - p)
 Filter the counts matrix to eliminate all genes where the cooks distance is over the outlier threshold. Here you can see that a further ~1,800 genes are filtered out based on having outlier read count values for at least one sample.
 
 ```{r}
+
 counts.mat.ibd.ol.filtered <-  counts.mat.ibd.filtered[which(apply(cooks.mat, 1, function(x){(max(x) < h.threshold) >= 1})),]
 
 sprintf("Genes filtered: %s; Genes remaining: %s", nrow(counts.mat.ibd.filtered)-nrow(counts.mat.ibd.ol.filtered), nrow(counts.mat.ibd.ol.filtered))
@@ -277,11 +276,11 @@ Run the same plot as above, and we can see that the genes with very low maximum
 ```{r}
 
 data.frame(max_count = apply(counts.mat.ibd.ol.filtered, 1, max, na.rm=TRUE)) %>% 
-  ggplot(aes(x = max_count)) + 
-    geom_histogram(bins = 200) + 
-    xlab("Max Counts (log10 scale)") + 
-    ylab("Frequency") +
-    scale_x_log10(n.breaks = 6, labels = scales::comma)
+  ggplot2::ggplot(ggplot2::aes(x = max_count)) + 
+    ggplot2::geom_histogram(bins = 200) + 
+    ggplot2::xlab("Max Counts (log10 scale)") + 
+    ggplot2::ylab("Frequency") +
+    ggplot2::scale_x_log10(n.breaks = 6, labels = scales::comma)
 
 ```
 

diff --git a/episodes/episode6.Rmd b/episodes/episode6.Rmd
@@ -40,7 +40,9 @@ And now read the files into R...
 
 ```{r}
 
-suppressPackageStartupMessages(library(tidyverse, quietly = TRUE))
+# suppressPackageStartupMessages(library(dplyr, quietly = TRUE))
+# suppressPackageStartupMessages(library(ggplot2, quietly = TRUE))
+# suppressPackageStartupMessages(library(tibble, quietly = TRUE))
 
 ```
 
@@ -62,10 +64,10 @@ counts.mat.ibd.ol.filtered %>%
   tibble::rownames_to_column("geneID")  %>%      
   reshape2::melt(id.vars = 'geneID', value.name='count', variable.name=c('sampleID')) %>% 
   .[sample(nrow(.), size = 5000, replace = FALSE),] %>% 
-  ggplot(aes(x = count)) + 
-    geom_histogram(bins = 200) + 
-    xlab("Raw Counts") + 
-    ylab("Frequency")
+  ggplot2::ggplot(ggplot2::aes(x = count)) + 
+    ggplot2::geom_histogram(bins = 200) + 
+    ggplot2::xlab("Raw Counts") + 
+    ggplot2::ylab("Frequency")
 
 
 ```
@@ -79,10 +81,10 @@ counts.mat.ibd.ol.filtered %>%
   data.frame(row.mean = apply(., 1, mean),
            row.var = apply(., 1, var)) %>%
   dplyr::filter(row.var < 5000) %>% 
-  ggplot(aes(x=row.mean, y=(row.var))) +
-    geom_point(alpha=0.1) +
-    xlab("Mean Count") +
-    ylab("Variance")
+  ggplot2::ggplot(ggplot2::aes(x=row.mean, y=(row.var))) +
+    ggplot2::geom_point(alpha=0.1) +
+    ggplot2::xlab("Mean Count") +
+    ggplot2::ylab("Variance")
 
 ```
 
@@ -120,10 +122,10 @@ counts.mat.ibd.vst %>%
   tibble::rownames_to_column("geneID")  %>%      
   reshape2::melt(id.vars = 'geneID', value.name='count', variable.name=c('sampleID')) %>% 
   .[sample(nrow(.), size = 5000, replace = FALSE),] %>% 
-  ggplot(aes(x = count)) + 
-    geom_histogram(bins = 200) + 
-    xlab("Raw Counts") + 
-    ylab("Frequency")
+  ggplot2::ggplot(ggplot2::aes(x = count)) + 
+    ggplot2::geom_histogram(bins = 200) + 
+    ggplot2::xlab("Raw Counts") + 
+    ggplot2::ylab("Frequency")
 
 ```
 
@@ -135,10 +137,10 @@ counts.mat.ibd.vst %>%
   data.frame(row.mean = apply(., 1, mean),
            row.var = apply(., 1, var)) %>%
   dplyr::filter(row.var < 2.5) %>% 
-  ggplot(aes(x=row.mean, y=(row.var))) +
-    geom_point(alpha=0.1) +
-    xlab("Mean Count") +
-    ylab("Variance")
+  ggplot2::ggplot(ggplot2::aes(x=row.mean, y=(row.var))) +
+    ggplot2::geom_point(alpha=0.1) +
+    ggplot2::xlab("Mean Count") +
+    ggplot2::ylab("Variance")
 
 
 ```
@@ -156,11 +158,11 @@ counts.mat.ibd.vst %>%
   tibble::rownames_to_column("geneID")  %>%  
   .[sample(nrow(.), size = 50, replace = FALSE),] %>% head(50) %>% 
   reshape2::melt(id.vars = 'geneID', value.name='count', variable.name=c('sampleID')) %>% 
-  ggplot(aes(x = reorder(geneID, count, median), y = count)) + 
-    geom_boxplot() + 
-    xlab("Gene") + 
-    ylab("VST Counts") +
-  theme(axis.text.x = element_blank())
+  ggplot2::ggplot(ggplot2::aes(x = reorder(geneID, count, median), y = count)) + 
+    ggplot2::geom_boxplot() + 
+    ggplot2::xlab("Gene") + 
+    ggplot2::ylab("VST Counts") +
+  ggplot2::theme(axis.text.x = ggplot2::element_blank())
 
 
 ```
@@ -179,6 +181,7 @@ On the other hand, there are algorithms that are insensitive to scale where scal
 
 Two widely used rescaling approaches are standardisation (also known as z-scoring), where the distribution of values in every variable is centred around a mean of zero with unit standard deviation, and min-max scaling (confusingly, also known as normalisation) where the values are rescaled to be within the range [0, 1]. In many cases either may be appropriate. The table below gives a few guidelines on when one may be more appropriate than the other.
 
+
 Scaling Method | Formula | When more appropriate
 --- | -- | -----------
 **Standardisation** | $\frac{(x - \mu)}{\sigma}$   | - Distribution of the variable data is Gaussian or unknown <br> - Data contains outlier values that would cause the remaining data to be compressed into a very small range if Min-Max scaling is used <br> - The algorithm being used assumes the data has a Gaussian distribution, e.g. **linear regression**