diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..88bc8f3 Binary files /dev/null and b/.DS_Store differ diff --git a/episodes/.DS_Store b/episodes/.DS_Store new file mode 100644 index 0000000..0c67f35 Binary files /dev/null and b/episodes/.DS_Store differ diff --git a/episodes/episode4.Rmd b/episodes/episode4.Rmd index d765b8c..89eb9f7 100644 --- a/episodes/episode4.Rmd +++ b/episodes/episode4.Rmd @@ -126,7 +126,9 @@ sprintf("The unique IDs that match the counts matrix are in column: %s", colname ```{r} -suppressPackageStartupMessages(library(tidyverse, quietly = TRUE)) +# suppressPackageStartupMessages(library(dplyr, quietly = TRUE)) +# suppressPackageStartupMessages(library(ggplot2, quietly = TRUE)) +# suppressPackageStartupMessages(library(tibble, quietly = TRUE)) ``` @@ -191,9 +193,13 @@ unique(c(samp.info.ibd.sel$sex, samp.info.ibd.sel$condition)) * Both `sex` and `condition` are consistently encoded over all samples. We should however remove the spaces in the values for `sampleID` and for the `condition` value ulcerative colitis. Note: since we are changing our unique identifier, we'll need to make the same update later in the counts matrix. +Before we do this, since we'll be using the pipe operator from the tidyverse libraries, we will to define the pipe operator from the magrittr package in the environment. + ```{r} +`%>%` <- magrittr::`%>%` + samp.info.ibd.sel <- samp.info.ibd.sel %>% dplyr::mutate(sampleID = gsub(" ", "_", sampleID), condition = gsub(" ", "_", condition)) @@ -212,7 +218,7 @@ samp.info.ibd.sel <- samp.info.ibd.sel %>% ```{r} samp.info.ibd.sel <- samp.info.ibd.sel %>% - dplyr::mutate(class = if_else(condition == 'normal', -1, 1)) + dplyr::mutate(class = dplyr::if_else(condition == 'normal', -1, 1)) ``` @@ -239,7 +245,7 @@ The two classes are approximately equally represented, so let's check everything ```{r} -glimpse(samp.info.ibd.sel) +dplyr::glimpse(samp.info.ibd.sel) ``` @@ -296,7 +302,7 @@ counts.mat.ibd <- raw.counts.ibd[-1,-1] rownames(counts.mat.ibd) <- NULL -counts.mat.ibd <- counts.mat.ibd %>% column_to_rownames('read') +counts.mat.ibd <- counts.mat.ibd %>% tibble::column_to_rownames('read') counts.mat.ibd[1:10,1:6] @@ -402,7 +408,7 @@ It is crucial that you document your data reformatting so that it is reproducibl Take a look at [The Tuburculosis (TB) Dataset - E-MTAB-6845](https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-6845). You can download the sdrf file from zenodo and save it to your `data` directory by running the following code. -```{r} +```r download.file(url = "https://zenodo.org/record/8125141/files/E-MTAB-6845.sdrf.txt", destfile = "data/E-MTAB-6845.sdrf.txt") @@ -466,7 +472,7 @@ dplyr::mutate(prog_status = factor(prog_status, levels = c("non_progressor","tb_ dplyr::mutate(prog_status = dplyr::recode(prog_status, "non-progressor" = "non_progressor", "TB progressor" = "tb_progressor"), - class = if_else(prog_status == 'non_progressor', -1, 1) + class = dplyr::if_else(prog_status == 'non_progressor', -1, 1) ) %>% dplyr::distinct() %>% @@ -499,7 +505,7 @@ samp.info.tb %>% dplyr::mutate(prog_status = dplyr::recode(prog_status, # recode target variable "non-progressor" = "non_progressor", "TB progressor" = "tb_progressor"), - class = if_else(prog_status == 'non_progressor', -1, 1) # create numerical class + class = dplyr::if_else(prog_status == 'non_progressor', -1, 1) # create numerical class ) %>% dplyr::mutate(prog_status = factor(prog_status, levels = c("non_progressor","tb_progressor")), diff --git a/episodes/episode5.Rmd b/episodes/episode5.Rmd index 1245913..4e59997 100644 --- a/episodes/episode5.Rmd +++ b/episodes/episode5.Rmd @@ -51,7 +51,9 @@ And now read the files into R... ```{r} -suppressPackageStartupMessages(library(tidyverse, quietly = TRUE)) +# suppressPackageStartupMessages(library(dplyr, quietly = TRUE)) +# suppressPackageStartupMessages(library(ggplot2, quietly = TRUE)) +# suppressPackageStartupMessages(library(tibble, quietly = TRUE)) ``` @@ -61,9 +63,6 @@ samp.info.ibd.sel <- read.table(file="./data/ibd.sample.info.txt", sep="\t", hea counts.mat.ibd <- read.table(file="./data/counts.mat.ibd.txt", sep='\t', header=T, fill=T, check.names=F) -# head(counts.mat.ibd) -# any(is.na(samp.info.ibd.sel)) - ``` Run the following code to view the histogram giving the frequency of the maximum count for each gene in the sample (plotted on a log10 scale). You'll see in the resulting plot that over 800 genes have no counts for any gene (i.e., maximum = 0), and that there are hundreds of genes where the maximum count for the gene across all samples is below 10. For comparison, the median of these maximum counts across all genes is around 250. These low count genes are likely to represent technical noise. @@ -72,12 +71,14 @@ A simple filtering approach is to remove all genes where the maximum read count ```{r} +`%>%` <- magrittr::`%>%` + data.frame(max_count = apply(counts.mat.ibd, 1, max, na.rm=TRUE)) %>% - ggplot(aes(x = max_count)) + - geom_histogram(bins = 200) + - xlab("Max Counts (log10 scale)") + - ylab("Frequency") + - scale_x_log10(n.breaks = 6, labels = scales::comma) + ggplot2::ggplot(ggplot2::aes(x = max_count)) + + ggplot2::geom_histogram(bins = 200) + + ggplot2::xlab("Max Counts (log10 scale)") + + ggplot2::ylab("Frequency") + + ggplot2::scale_x_log10(n.breaks = 6, labels = scales::comma) ``` @@ -104,9 +105,6 @@ In order to be able to compare read counts between samples, we must first adjust ```{r} -# load DESeq2 library -suppressPackageStartupMessages(library(DESeq2)) - # convert the condition variable to a factor as required by DESeq2 samp.info.ibd.sel[c('condition')] <- lapply(samp.info.ibd.sel[c('condition')], factor) @@ -165,12 +163,12 @@ ms.jac = sapply(t.seq, function(t){ # plot the threshold value against the value of the Multiset Jaccard index to visualise -ggplot(data=data.frame(t = t.seq, jacc = ms.jac)) + - geom_line(aes(x=t, y=jacc)) + - geom_hline(yintercept = max(ms.jac), lty=2,col='gray') + - geom_point(aes(x=which.max(ms.jac), y=max(ms.jac)), col="red", cex=6, pch=1) + - xlab("Low Count Threshold") + - ylab("Multiset Jaccard Index") +ggplot2::ggplot(data=data.frame(t = t.seq, jacc = ms.jac)) + + ggplot2::geom_line(ggplot2::aes(x=t, y=jacc)) + + ggplot2::geom_hline(yintercept = max(ms.jac), lty=2,col='gray') + + ggplot2::geom_point(ggplot2::aes(x=which.max(ms.jac), y=max(ms.jac)), col="red", cex=6, pch=1) + + ggplot2::xlab("Low Count Threshold") + + ggplot2::ylab("Multiset Jaccard Index") ``` @@ -266,6 +264,7 @@ h.threshold <- stats::qf(cooks.quantile, p, m - p) Filter the counts matrix to eliminate all genes where the cooks distance is over the outlier threshold. Here you can see that a further ~1,800 genes are filtered out based on having outlier read count values for at least one sample. ```{r} + counts.mat.ibd.ol.filtered <- counts.mat.ibd.filtered[which(apply(cooks.mat, 1, function(x){(max(x) < h.threshold) >= 1})),] sprintf("Genes filtered: %s; Genes remaining: %s", nrow(counts.mat.ibd.filtered)-nrow(counts.mat.ibd.ol.filtered), nrow(counts.mat.ibd.ol.filtered)) @@ -277,11 +276,11 @@ Run the same plot as above, and we can see that the genes with very low maximum ```{r} data.frame(max_count = apply(counts.mat.ibd.ol.filtered, 1, max, na.rm=TRUE)) %>% - ggplot(aes(x = max_count)) + - geom_histogram(bins = 200) + - xlab("Max Counts (log10 scale)") + - ylab("Frequency") + - scale_x_log10(n.breaks = 6, labels = scales::comma) + ggplot2::ggplot(ggplot2::aes(x = max_count)) + + ggplot2::geom_histogram(bins = 200) + + ggplot2::xlab("Max Counts (log10 scale)") + + ggplot2::ylab("Frequency") + + ggplot2::scale_x_log10(n.breaks = 6, labels = scales::comma) ``` diff --git a/episodes/episode6.Rmd b/episodes/episode6.Rmd index 6910290..e6e4ca6 100644 --- a/episodes/episode6.Rmd +++ b/episodes/episode6.Rmd @@ -40,7 +40,9 @@ And now read the files into R... ```{r} -suppressPackageStartupMessages(library(tidyverse, quietly = TRUE)) +# suppressPackageStartupMessages(library(dplyr, quietly = TRUE)) +# suppressPackageStartupMessages(library(ggplot2, quietly = TRUE)) +# suppressPackageStartupMessages(library(tibble, quietly = TRUE)) ``` @@ -62,10 +64,10 @@ counts.mat.ibd.ol.filtered %>% tibble::rownames_to_column("geneID") %>% reshape2::melt(id.vars = 'geneID', value.name='count', variable.name=c('sampleID')) %>% .[sample(nrow(.), size = 5000, replace = FALSE),] %>% - ggplot(aes(x = count)) + - geom_histogram(bins = 200) + - xlab("Raw Counts") + - ylab("Frequency") + ggplot2::ggplot(ggplot2::aes(x = count)) + + ggplot2::geom_histogram(bins = 200) + + ggplot2::xlab("Raw Counts") + + ggplot2::ylab("Frequency") ``` @@ -79,10 +81,10 @@ counts.mat.ibd.ol.filtered %>% data.frame(row.mean = apply(., 1, mean), row.var = apply(., 1, var)) %>% dplyr::filter(row.var < 5000) %>% - ggplot(aes(x=row.mean, y=(row.var))) + - geom_point(alpha=0.1) + - xlab("Mean Count") + - ylab("Variance") + ggplot2::ggplot(ggplot2::aes(x=row.mean, y=(row.var))) + + ggplot2::geom_point(alpha=0.1) + + ggplot2::xlab("Mean Count") + + ggplot2::ylab("Variance") ``` @@ -120,10 +122,10 @@ counts.mat.ibd.vst %>% tibble::rownames_to_column("geneID") %>% reshape2::melt(id.vars = 'geneID', value.name='count', variable.name=c('sampleID')) %>% .[sample(nrow(.), size = 5000, replace = FALSE),] %>% - ggplot(aes(x = count)) + - geom_histogram(bins = 200) + - xlab("Raw Counts") + - ylab("Frequency") + ggplot2::ggplot(ggplot2::aes(x = count)) + + ggplot2::geom_histogram(bins = 200) + + ggplot2::xlab("Raw Counts") + + ggplot2::ylab("Frequency") ``` @@ -135,10 +137,10 @@ counts.mat.ibd.vst %>% data.frame(row.mean = apply(., 1, mean), row.var = apply(., 1, var)) %>% dplyr::filter(row.var < 2.5) %>% - ggplot(aes(x=row.mean, y=(row.var))) + - geom_point(alpha=0.1) + - xlab("Mean Count") + - ylab("Variance") + ggplot2::ggplot(ggplot2::aes(x=row.mean, y=(row.var))) + + ggplot2::geom_point(alpha=0.1) + + ggplot2::xlab("Mean Count") + + ggplot2::ylab("Variance") ``` @@ -156,11 +158,11 @@ counts.mat.ibd.vst %>% tibble::rownames_to_column("geneID") %>% .[sample(nrow(.), size = 50, replace = FALSE),] %>% head(50) %>% reshape2::melt(id.vars = 'geneID', value.name='count', variable.name=c('sampleID')) %>% - ggplot(aes(x = reorder(geneID, count, median), y = count)) + - geom_boxplot() + - xlab("Gene") + - ylab("VST Counts") + - theme(axis.text.x = element_blank()) + ggplot2::ggplot(ggplot2::aes(x = reorder(geneID, count, median), y = count)) + + ggplot2::geom_boxplot() + + ggplot2::xlab("Gene") + + ggplot2::ylab("VST Counts") + + ggplot2::theme(axis.text.x = ggplot2::element_blank()) ``` @@ -179,6 +181,7 @@ On the other hand, there are algorithms that are insensitive to scale where scal Two widely used rescaling approaches are standardisation (also known as z-scoring), where the distribution of values in every variable is centred around a mean of zero with unit standard deviation, and min-max scaling (confusingly, also known as normalisation) where the values are rescaled to be within the range [0, 1]. In many cases either may be appropriate. The table below gives a few guidelines on when one may be more appropriate than the other. + Scaling Method | Formula | When more appropriate --- | -- | ----------- **Standardisation** | $\frac{(x - \mu)}{\sigma}$ | - Distribution of the variable data is Gaussian or unknown
- Data contains outlier values that would cause the remaining data to be compressed into a very small range if Min-Max scaling is used
- The algorithm being used assumes the data has a Gaussian distribution, e.g. **linear regression**