differences for PR #8

carpentries-incubator · May 7, 2024 · 15a93e1 · 15a93e1
1 parent fd1507d
commit 15a93e1
Show file tree

Hide file tree

Showing 6 changed files with 60 additions and 52 deletions.
diff --git a/episode2.md b/episode2.md
@@ -121,7 +121,7 @@ raw.counts.ibd <- read.table(file="data/E-MTAB-11349.counts.matrix.csv",
 writeLines(sprintf("%i %s", c(dim(raw.counts.ibd)[1], dim(raw.counts.ibd)[2]), c("rows corresponding to transcript IDs", "columns corresponding to samples")))
 ```
 
-```{.output}
+```output
 22751 rows corresponding to transcript IDs
 592 columns corresponding to samples
 ```
@@ -140,7 +140,7 @@ View a small subset of the data, (e.g. first ten rows and 8 columns) to see how
 raw.counts.ibd[1:10,1:8]
 ```
 
-```{.output}
+```output
             read Sample 1 Sample 2 Sample 3 Sample 4 Sample 5 Sample 6
 1   1          *    13961    16595    20722    17696    25703    20848
 2   2 ERCC-00002        0        0        0        0        0        0
@@ -171,15 +171,15 @@ samp.info.ibd <- read.table(file="data/E-MTAB-11349.sdrf.txt", sep="\t", header=
 sprintf("There are %i rows, corresponding to the samples", dim(samp.info.ibd)[1])
 ```
 
-```{.output}
+```output
 [1] "There are 590 rows, corresponding to the samples"
 ```
 
 ```r
 sprintf("There are %i columns, corresponding to the available variables for each sample", dim(samp.info.ibd)[2])
 ```
 
-```{.output}
+```output
 [1] "There are 32 columns, corresponding to the available variables for each sample"
 ```
 
@@ -190,7 +190,7 @@ If we view the column names, we can see that the file does indeed contain a set
 colnames(samp.info.ibd)
 ```
 
-```{.output}
+```output
  [1] "Source Name"                            
  [2] "Characteristics[organism]"              
  [3] "Characteristics[age]"                   

diff --git a/episode3.md b/episode3.md
@@ -116,23 +116,23 @@ If there is more than one SOFT file for a GEO Series, `getGEO()` will return a l
 gse212041 <- GEOquery::getGEO("GSE212041")
 ```
 
-```{.output}
+```output
 Setting options('download.file.method.GEOquery'='auto')
 ```
 
-```{.output}
+```output
 Setting options('GEOquery.inmemory.gpl'=FALSE)
 ```
 
-```{.output}
+```output
 Found 2 file(s)
 ```
 
-```{.output}
+```output
 GSE212041-GPL18573_series_matrix.txt.gz
 ```
 
-```{.output}
+```output
 GSE212041-GPL24676_series_matrix.txt.gz
 ```
 
@@ -141,7 +141,7 @@ GSE212041-GPL24676_series_matrix.txt.gz
 sprintf("Number of files downloaded: %i", length(gse212041))
 ```
 
-```{.output}
+```output
 [1] "Number of files downloaded: 2"
 ```
 
@@ -158,7 +158,7 @@ Write the code to check that the number of samples in each file gives us the tot
 writeLines(sprintf("file %i: %i samples", 1:2, c(dim(gse212041[[1]])[2], dim(gse212041[[2]])[2])))
 ```
 
-```{.output}
+```output
 file 1: 16 samples
 file 2: 765 samples
 ```
@@ -181,7 +181,7 @@ samp.info.cov19 <- Biobase::pData(gse212041[[2]])
 colnames(samp.info.cov19)
 ```
 
-```{.output}
+```output
  [1] "title"                   "geo_accession"          
  [3] "status"                  "submission_date"        
  [5] "last_update_date"        "type"                   
@@ -219,7 +219,7 @@ If we use the `exprs()` function to extract the counts data from the expression
 Biobase::exprs(gse212041[[2]])[,1:10]
 ```
 
-```{.output}
+```output
      GSM6507615 GSM6507616 GSM6507617 GSM6507618 GSM6507619 GSM6507620
      GSM6507621 GSM6507622 GSM6507623 GSM6507624
 ```
@@ -232,7 +232,7 @@ We can verify this by looking at the dimensions of the object in the exprs slot.
 dim(Biobase::exprs(gse212041[[2]]))
 ```
 
-```{.output}
+```output
 [1]   0 765
 ```
 

diff --git a/episode4.md b/episode4.md
@@ -86,7 +86,7 @@ We'll now apply these steps sequentially to the sample information for the IBD d
 dplyr::glimpse(samp.info.ibd)
 ```
 
-```{.output}
+```output
 Rows: 590
 Columns: 32
 $ `Source Name`                             <chr> "Sample 1", "Sample 2", "Sam…
@@ -153,7 +153,7 @@ for(i in seq_along(1:ncol(samp.info.ibd))){
 sprintf("The unique IDs that match the counts matrix are in column: %s", colnames(samp.info.ibd)[which(lst.colnames)])
 ```
 
-```{.output}
+```output
 [1] "The unique IDs that match the counts matrix are in column: Source Name"
 ```
 
@@ -196,7 +196,7 @@ samp.info.ibd.sel <- dplyr::rename(samp.info.ibd.sel,
 dplyr::glimpse(samp.info.ibd.sel)
 ```
 
-```{.output}
+```output
 Rows: 590
 Columns: 4
 $ sampleID  <chr> "Sample 1", "Sample 2", "Sample 3", "Sample 4", "Sample 5", …
@@ -225,7 +225,7 @@ samp.info.ibd.sel$condition[agrep("Crohns", samp.info.ibd.sel$condition)] <- "cr
 unique(c(samp.info.ibd.sel$sex, samp.info.ibd.sel$condition))
 ```
 
-```{.output}
+```output
 [1] "male"               "female"             "crohns_disease"    
 [4] "normal"             "ulcerative colitis"
 ```
@@ -275,7 +275,7 @@ samp.info.ibd.sel[c('sex', 'condition', 'class')] <- lapply(samp.info.ibd.sel[c(
 table(samp.info.ibd.sel$class)
 ```
 
-```{.output}
+```output
 
  -1   1 
 267 323 
@@ -290,7 +290,7 @@ The two classes are approximately equally represented, so let's check everything
 dplyr::glimpse(samp.info.ibd.sel)
 ```
 
-```{.output}
+```output
 Rows: 590
 Columns: 5
 $ sampleID  <chr> "Sample_1", "Sample_2", "Sample_3", "Sample_4", "Sample_5", …
@@ -334,7 +334,7 @@ Item  | Check For... | Rationale
 raw.counts.ibd[1:10,1:8]
 ```
 
-```{.output}
+```output
             read Sample 1 Sample 2 Sample 3 Sample 4 Sample 5 Sample 6
 1   1          *    13961    16595    20722    17696    25703    20848
 2   2 ERCC-00002        0        0        0        0        0        0
@@ -371,7 +371,7 @@ counts.mat.ibd <-  counts.mat.ibd %>% tibble::column_to_rownames('read')
 counts.mat.ibd[1:10,1:6]
 ```
 
-```{.output}
+```output
            Sample 1 Sample 2 Sample 3 Sample 4 Sample 5 Sample 6
 ERCC-00002        0        0        0        0        0        0
 ERCC-00003        0        0        0        0        0        0
@@ -398,15 +398,15 @@ ERCC-00019        0        0        0        0        0        0
 which(duplicated(rownames(counts.mat.ibd)))
 ```
 
-```{.output}
+```output
 integer(0)
 ```
 
 ```r
 which(duplicated(colnames(counts.mat.ibd)))
 ```
 
-```{.output}
+```output
 integer(0)
 ```
 
@@ -419,7 +419,7 @@ integer(0)
 if(!identical(colnames(counts.mat.ibd), samp.info.ibd.sel$sampleID)){stop()}
 ```
 
-```{.error}
+```error
 Error in eval(expr, envir, enclos): 
 ```
 
@@ -453,7 +453,7 @@ allMissValues <- function(x){all(is.na(x) | x == "")}
 allMissValues(counts.mat.ibd)
 ```
 
-```{.output}
+```output
 [1] FALSE
 ```
 
@@ -466,7 +466,7 @@ Take a final look at the cleaned up matrix.
 counts.mat.ibd[1:10,1:6]
 ```
 
-```{.output}
+```output
            Sample_1 Sample_2 Sample_3 Sample_4 Sample_5 Sample_6
 ERCC-00002        0        0        0        0        0        0
 ERCC-00003        0        0        0        0        0        0
@@ -484,15 +484,15 @@ ERCC-00019        0        0        0        0        0        0
 sprintf("There are %i rows, corresponding to the transcript IDs", dim(counts.mat.ibd)[1])
 ```
 
-```{.output}
+```output
 [1] "There are 22750 rows, corresponding to the transcript IDs"
 ```
 
 ```r
 sprintf("There are %i columns, corresponding to the samples", dim(counts.mat.ibd)[2])
 ```
 
-```{.output}
+```output
 [1] "There are 590 columns, corresponding to the samples"
 ```
 
@@ -630,7 +630,7 @@ samp.info.tb %>%
        dplyr::glimpse()                                                                     # view output
 ```
 
-```{.output}
+```output
 Rows: 360
 Columns: 3
 $ sampleID    <chr> "PR123_S19", "PR096_S13", "PR146_S14", "PR158_S12", "PR095…

diff --git a/episode5.md b/episode5.md
@@ -79,12 +79,14 @@ data.frame(max_count = apply(counts.mat.ibd, 1, max, na.rm=TRUE)) %>%
     ggplot2::scale_x_log10(n.breaks = 6, labels = scales::comma)
 ```
 
-```{.warning}
-Warning: Transformation introduced infinite values in continuous x-axis
+```warning
+Warning in ggplot2::scale_x_log10(n.breaks = 6, labels = scales::comma): log-10
+transformation introduced infinite values.
 ```
 
-```{.warning}
-Warning: Removed 10 rows containing non-finite values (`stat_bin()`).
+```warning
+Warning: Removed 10 rows containing non-finite outside the scale range
+(`stat_bin()`).
 ```
 
 <img src="fig/episode5-rendered-unnamed-chunk-3-1.png" style="display: block; margin: auto;" />
@@ -122,7 +124,7 @@ dds.ibd <- DESeq2::DESeqDataSetFromMatrix(
     design = ~ condition)
 ```
 
-```{.warning}
+```warning
 Warning: replacing previous import 'S4Arrays::makeNindexFromArrayViewport' by
 'DelayedArray::makeNindexFromArrayViewport' when loading 'SummarizedExperiment'
 ```
@@ -184,6 +186,12 @@ ggplot2::ggplot(data=data.frame(t = t.seq, jacc = ms.jac)) +
             ggplot2::ylab("Multiset Jaccard Index")
 ```
 
+```warning
+Warning in ggplot2::geom_point(ggplot2::aes(x = which.max(ms.jac), y = max(ms.jac)), : All aesthetics have length 1, but the data has 25 rows.
+ℹ Please consider using `annotate()` or provide this layer with data containing
+  a single row.
+```
+
 <img src="fig/episode5-rendered-unnamed-chunk-5-1.png" style="display: block; margin: auto;" />
 
 ::::::::::::::::::::::::::::::::::::: challenge 
@@ -201,7 +209,7 @@ The threshold value is given by the following code, which should return a value
 (t.hold <- which.max(ms.jac))
 ```
 
-```{.output}
+```output
 [1] 11
 ```
 
@@ -221,7 +229,7 @@ counts.mat.ibd.filtered <- counts.mat.ibd[which(apply(counts.ibd.norm, 1, functi
 sprintf("Genes filtered: %s; Genes remaining: %s", nrow(counts.mat.ibd)-nrow(counts.mat.ibd.filtered), nrow(counts.mat.ibd.filtered))
 ```
 
-```{.output}
+```output
 [1] "Genes filtered: 3712; Genes remaining: 19038"
 ```
 
@@ -239,7 +247,7 @@ Run the following code to view the top 10 values of read counts in the raw count
 tail(sort(as.matrix(counts.mat.ibd)),10)
 ```
 
-```{.output}
+```output
  [1] 2037946 2038514 2043983 2133125 2238093 2269033 2341479 2683585 3188911
 [10] 3191428
 ```
@@ -248,7 +256,7 @@ tail(sort(as.matrix(counts.mat.ibd)),10)
 sprintf("The mean read count value: %f", mean(as.matrix(counts.mat.ibd)))
 ```
 
-```{.output}
+```output
 [1] "The mean read count value: 506.731355"
 ```
 
@@ -273,41 +281,41 @@ Run `DESeq2` differential expression analysis, which automatically calculates th
 deseq.ibd <-  DESeq2::DESeq(dds.ibd.filt)
 ```
 
-```{.output}
+```output
 estimating size factors
 ```
 
-```{.output}
+```output
 estimating dispersions
 ```
 
-```{.output}
+```output
 gene-wise dispersion estimates
 ```
 
-```{.output}
+```output
 mean-dispersion relationship
 ```
 
-```{.output}
+```output
 final dispersion estimates
 ```
 
-```{.output}
+```output
 fitting model and testing
 ```
 
-```{.output}
+```output
 -- replacing outliers and refitting for 1559 genes
 -- DESeq argument 'minReplicatesForReplace' = 7 
 -- original counts are preserved in counts(dds)
 ```
 
-```{.output}
+```output
 estimating dispersions
 ```
 
-```{.output}
+```output
 fitting model and testing
 ```
 
@@ -335,7 +343,7 @@ counts.mat.ibd.ol.filtered <-  counts.mat.ibd.filtered[which(apply(cooks.mat, 1,
 sprintf("Genes filtered: %s; Genes remaining: %s", nrow(counts.mat.ibd.filtered)-nrow(counts.mat.ibd.ol.filtered), nrow(counts.mat.ibd.ol.filtered))
 ```
 
-```{.output}
+```output
 [1] "Genes filtered: 1776; Genes remaining: 17262"
 ```