Marker_assisted_selection_maize.Rmd

---
title: "Selecting markers for marker-assisted selection in maize"
author: Lindsay Clark, HPCBio, Roy J. Carver Biotechnology Center, University of Illinois,
  Urbana-Champaign
date: "April 2021"
output: html_document
---

```{r setup, include=FALSE}
# Show the source code of every chunk in the rendered document by default.
knitr::opts_chunk$set(echo = TRUE)
# Disable colored sequence printing from Biostrings (presumably so that
# sequences print as plain text in the knitted output).
options(Biostrings.coloring = FALSE)
```

This document is modified from the
[marker-assisted selection](Marker_assisted_selection.md) workflow.

## Setup

Be sure you have completed the [setup](Setup.md) steps before working through
this pipeline.  First we'll load all the needed packages and functions.

```{r libs, message = FALSE, warning = FALSE}
library(VariantAnnotation)
library(Rsamtools)
library(ggplot2)
library(viridis)
source("src/marker_stats.R")
source("src/getNumGeno.R")
source("src/qtl_markers.R")
```

### Reference genome

Here we'll load the reference genome sequence for maize.  This was
already indexed using `indexFa` (see the setup steps).

The genome was obtained from https://download.maizegdb.org/Zm-B73-REFERENCE-GRAMENE-4.0/

```{r refgenome}
# Handle to the indexed maize B73 v4 reference FASTA.  Sequence is pulled from
# it later with scanFa() for specific ranges.
refgenome <- FaFile("data/Zm-B73-REFERENCE-GRAMENE-4.0.fasta")
```

### SNP positions from sequence

A spreadsheet was provided listing the markers of interest and their GBS tags.
Previous analysis indicated that many SNP positions were incorrect, so we will
attempt to determine the correct positions by sequence alignment.

```{r readqtl}
# Marker spreadsheet.  Columns read later in this document include QTL.marker,
# Study, Traits, Allele.1, Allele.2, SNP.Position, Chrom and Chrom.Pos.
maize_qtl <- read.csv("data/Markers identified from QTL or GWAS studies with sequences - Latest - LVC edits.csv",
                      stringsAsFactors = FALSE)
```

We'll make a consensus sequence for each marker.

```{r consensus}
# Collapse the two allele sequences of each marker into a single consensus
# string.  With the IUPAC ambiguity map and threshold = 0.25, a base present in
# either of the two alleles (frequency 0.5) is kept, so positions where the
# alleles differ are written as IUPAC ambiguity codes.
# vapply() guarantees a character vector with exactly one string per marker
# (sapply() would return a list for an empty table).
maize_qtl$Seq.consensus <- vapply(
  seq_len(nrow(maize_qtl)),
  function(i){
    consensusString(c(maize_qtl$Allele.1[i], maize_qtl$Allele.2[i]),
                    ambiguityMap = IUPAC_CODE_MAP, threshold = 0.25)
  },
  character(1))
```


```{r exportfasta}
# Export the consensus tags as FASTA so they can be aligned outside of R.
allele_seq <- DNAStringSet(maize_qtl$Seq.consensus)
# FASTA headers have the form "marker; study; traits".
names(allele_seq) <- paste(maize_qtl$QTL.marker, maize_qtl$Study, maize_qtl$Traits, sep = "; ")
writeXStringSet(allele_seq, filepath = "data/maize_qtl_sequence_2021-04-05.fasta")
```

#### BLAST

Alignment was performed with BLAST+/2.10.1, run on the Biocluster at the University of Illinois.

```{bash}
# Build a nucleotide BLAST database from the reference FASTA.
# NOTE(review): the paths in this command are relative to the directory holding
# the reference, whereas the blastn call below uses data/reference/blast/... --
# presumably the two commands were run from different working directories;
# confirm before re-running.
makeblastdb -in Zm-B73-REFERENCE-GRAMENE-4.0.fa -out blast/Zm-B73-4.0 -dbtype nucl -parse_seqids \
  -title Zm-B73-REFERENCE-GRAMENE-4.0

# Align the marker tags against the reference.  Output format 7 is tabular
# with comment lines, which is what the R code below parses.
blastn -query data/maize_qtl_sequence_2021-04-05.fasta -db data/reference/blast/Zm-B73-4.0 \
  -num_threads 2 -outfmt 7 -out results/blast/maize_qtl_sequence_2021-04-05.blast.txt
```

How many BLAST hits did we have for each marker?

```{r blastfilter}
# Raw BLAST report (tabular format with "#" comment lines).
blast_lines <- readLines("results/maize_qtl_sequence_2021-04-05.blast.txt")
# Each query contributes one "# N hits found" comment line; strip the text
# around N and store it as the hit count of the corresponding marker row.
maize_qtl$N_blast_hits <- local({
  count_lines <- grep("hits found", blast_lines, value = TRUE)
  counts <- sub(" hits found", "", count_lines)
  as.integer(sub("# ", "", counts))
})
```

We will identify the SNP position for each top hit where possible.

```{r eval = FALSE}
# "# Query:" comment lines of the BLAST report (not used again in this document).
query_lines <- grep("^# Query:", blast_lines, value = TRUE)
# The line directly after each "# N hits found" line is that query's first hit.
tophit_lines <- blast_lines[grep("hits found", blast_lines) + 1]
# When a query has no hits the next line is another comment; mark it missing.
tophit_lines[startsWith(tophit_lines, "#")] <- NA
# Write only the real top-hit rows to a file that can be read back as a table.
cat(tophit_lines[!is.na(tophit_lines)], file = "results/maize_qtl_sequence_2021-04-05_tophits.blast.txt", sep = "\n")
```

```{r readblast}
# Read the top-hit table and label its twelve columns in BLAST tabular order.
maize_blast <- setNames(
  read.delim("results/maize_qtl_sequence_2021-04-05_tophits.blast.txt",
             header = FALSE, comment.char = "#"),
  c("query", "subject", "pct.identity", "alignment.length", "mismatches",
    "gap.opens", "q.start", "q.end", "s.start", "s.end", "evalue",
    "bit.score"))
# Drop the first ";" from each query ID.
maize_blast[["query"]] <- sub(";", "", maize_blast[["query"]])
```

```{r}
# `tophit_lines` is created in a chunk marked `eval = FALSE`, so it does not
# exist when the document is knitted from a clean session.  Rebuild it from
# `blast_lines` in that case: the line after each "# N hits found" line is the
# top hit, or another comment line when there were no hits.
if(!exists("tophit_lines")){
  tophit_lines <- blast_lines[grep("hits found", blast_lines) + 1]
  tophit_lines[startsWith(tophit_lines, "#")] <- NA
}
has_hit <- !is.na(tophit_lines)
# Rows of maize_blast are paired, in order, with the markers that had a hit.
# A length mismatch would silently assign hits to the wrong markers.
stopifnot(sum(has_hit) == nrow(maize_blast))

# Leading number of SNP.Position (text before the first ":"), used as the
# offset of the SNP from the first base of the tag.
snp_pad <- as.integer(sub(":.*$", "", maize_qtl$SNP.Position[has_hit]))
# Subject coordinates increase along the query for top-strand hits and
# decrease for bottom-strand hits.
on_top <- maize_blast$s.end > maize_blast$s.start
blast_pos <- ifelse(on_top,
                    maize_blast$s.start + snp_pad - maize_blast$q.start + 1L,
                    maize_blast$s.start - snp_pad + maize_blast$q.start - 1L)
# The offset arithmetic is only valid for ungapped alignments.
blast_pos[maize_blast$gap.opens > 0] <- NA

maize_qtl$Blast_chrom[has_hit] <- maize_blast$subject
maize_qtl$Blast_pos[has_hit] <- blast_pos
maize_qtl$Blast_strand[has_hit] <- ifelse(on_top, "top", "bot")
```

For a few of them, the SNP position could not be determined due to gap openings.  I
BLASTed those sequences again with a different output format to determine the SNP positions.

```{r}
# Markers that had a BLAST hit but no computed position (gapped alignments).
maize_qtl[!is.na(maize_qtl$Blast_chrom) & is.na(maize_qtl$Blast_pos), c("QTL.marker", "Seq.consensus")]

# Positions entered by hand for those markers.  Both match() and `==` are
# used to locate the row; either way, a marker name that is not present in
# the table results in no assignment and no error.
maize_qtl$Blast_pos[match("S10_138511949", maize_qtl$QTL.marker)] <- 138511994L
# NOTE(review): the marker name suggests a position near 2,840,287 but the
# value assigned is 28,402,383 -- confirm that the marker name and the
# position are both typed correctly.
maize_qtl$Blast_pos[maize_qtl$QTL.marker == "S7_2840287"] <- 28402383L
maize_qtl$Blast_pos[match("S2_227406057", maize_qtl$QTL.marker)] <- 227406100L
maize_qtl$Blast_pos[maize_qtl$QTL.marker == "S8_164863938"] <- 164863937L # The SNP is an insertion

```


View how well the BLAST results match the provided data.  Some positions match exactly,
and most others are fairly close.

```{r}
# Supplied coordinates next to the BLAST-derived ones.
maize_qtl[,c("QTL.marker", "Chrom", "Chrom.Pos", "Blast_chrom", "Blast_pos")]

# Supplied position minus BLAST position (NA where no BLAST position exists).
temp <- maize_qtl$Chrom.Pos - maize_qtl$Blast_pos
hist(temp)
# Same histogram with the differences below -1.5 Mb left out.
hist(temp[temp > -1.5e6])

# The markers that were left out of the second histogram.
maize_qtl[which(temp < -1.5e6), c("QTL.marker", "Chrom", "Chrom.Pos", "Blast_chrom", "Blast_pos")]
```

One had the wrong chromosome, so I'll manually correct it.

```{r}
# Hand-entered position, chromosome and strand for marker S1_51819212,
# overriding the values taken from its top BLAST hit.
maize_qtl$Blast_pos[match("S1_51819212", maize_qtl$QTL.marker)] <- 51819243L
maize_qtl$Blast_chrom[match("S1_51819212", maize_qtl$QTL.marker)] <- "Chr1"
maize_qtl$Blast_strand[match("S1_51819212", maize_qtl$QTL.marker)] <- "bot"
```

#### Bowtie2

A few markers did not yield BLAST results, so we'll see if we can get an alignment with Bowtie2.
Version 2.4.2 was used on the Biocluster at the University of Illinois.

```{bash}
# Build the Bowtie2 index from the reference FASTA.
# NOTE(review): as with the BLAST database, the index is written to
# bowtie/Zm-B73-4.0 here but read from data/reference/bowtie/Zm-B73-4.0 below
# -- presumably the commands were run from different directories; confirm.
bowtie2-build --threads 4 Zm-B73-REFERENCE-GRAMENE-4.0.fa bowtie/Zm-B73-4.0

# Align the tags as unpaired FASTA reads (-f, -U) with the --very-sensitive preset.
bowtie2 --threads 2 -f -x data/reference/bowtie/Zm-B73-4.0 -U data/maize_qtl_sequence_2021-04-05.fasta \
  -S results/bowtie/maize_qtl_sequence_2021-04-05_bowtie_verysensitive.sam --very-sensitive
```

```{r}
# Alignment records of the SAM file: every line that is not an "@" header line.
samlines <- grep("^@",
                 readLines("results/maize_qtl_sequence_2021-04-05_bowtie_verysensitive.sam"),
                 invert = TRUE, value = TRUE)
# One character vector of tab-separated fields per record.
samsplit <- strsplit(samlines, split = "\t")

# Field 3 is the reference sequence name; "*" marks an unaligned read.
maize_qtl$Bowtie_chrom <- local({
  rname <- vapply(samsplit, function(fields) fields[3], character(1))
  rname[rname == "*"] <- NA
  rname
})

# Locate a SNP on the reference from a SAM alignment of its tag.
#
# snp_pad  Integer; zero-based offset of the SNP from the first base of the
#          tag as it was submitted to the aligner.
# pos      SAM POS field (leftmost reference position of the alignment),
#          character or integer.
# flag     SAM FLAG field, character or integer.  Bit 4 means unaligned and
#          bit 16 means aligned to the reverse strand.
# cigar    SAM CIGAR string.
#
# Returns the reference position of the SNP, or NA when the read is unaligned,
# the CIGAR cannot be parsed, the SNP lies inside inserted or soft-clipped
# bases (which have no reference coordinate), or the offset is past the end of
# the alignment.
snppos_sam <- function(snp_pad, pos, flag, cigar){
  flag <- as.integer(flag)
  if(bitwAnd(flag, 4L) != 0L) return(NA_integer_)
  pos <- as.integer(pos)

  # Split the CIGAR into operation lengths and operation codes.
  cig_nums <- as.integer(regmatches(cigar, gregexpr("[[:digit:]]+", cigar))[[1]])
  cig_char <- regmatches(cigar, gregexpr("[MIDNSHP=X]", cigar))[[1]]
  if(length(cig_nums) == 0L || length(cig_nums) != length(cig_char)){
    return(NA_integer_)
  }

  aligned_ops <- c("M", "=", "X") # consume both query and reference
  query_only_ops <- c("I", "S")   # consume query only
  ref_only_ops <- c("D", "N")     # consume reference only

  if(bitwAnd(flag, 16L) != 0L){
    # SAM stores reverse-strand reads reverse-complemented, so count the
    # offset from the other end of the read.
    read_len <- sum(cig_nums[cig_char %in% c(aligned_ops, query_only_ops)])
    snp_pad <- read_len - snp_pad - 1L
  }

  out <- pos
  for(i in seq_along(cig_nums)){
    n <- cig_nums[i]
    op <- cig_char[i]
    if(op %in% aligned_ops){
      if(n > snp_pad) return(out + snp_pad) # SNP is inside this aligned run
      out <- out + n
      snp_pad <- snp_pad - n
    } else if(op %in% query_only_ops){
      if(n > snp_pad) return(NA_integer_)   # SNP has no reference coordinate
      snp_pad <- snp_pad - n
    } else if(op %in% ref_only_ops){
      out <- out + n
    }
  }
  NA_integer_
}

# SNP position from each Bowtie2 alignment.  SAM records are paired with the
# marker rows by order.  Fields used: 2 = FLAG, 4 = POS, 6 = CIGAR.
maize_qtl$Bowtie_pos <- vapply(
  seq_len(nrow(maize_qtl)),
  function(i){
    snp_pad <- as.integer(sub(":.*$", "", maize_qtl$SNP.Position[i]))
    as.integer(snppos_sam(snp_pad, samsplit[[i]][4], samsplit[[i]][2],
                          samsplit[[i]][6]))
  },
  integer(1))

# Strand from the FLAG field: "0" = top, "16" = bottom.  "4" (unaligned) and
# any other flag give NA, so the column is always a character vector rather
# than becoming a list when an unexpected flag appears.
maize_qtl$Bowtie_strand <- unname(
  c("0" = "top", "16" = "bot")[vapply(samsplit, function(x) x[2], character(1))])
```

The only places where Bowtie2 found a different position from BLAST were cases
where it aligned to an entirely different chromosome.

```{r}
# Markers for which both aligners gave a position and the positions differ
# (which() drops rows where either position is NA).
maize_qtl[which(maize_qtl$Blast_pos != maize_qtl$Bowtie_pos),
          c("QTL.marker", "Chrom", "Chrom.Pos", "Blast_chrom", "Blast_pos", "Bowtie_chrom", "Bowtie_pos")]
```

Where no BLAST alignment was found, sometimes Bowtie2 found an alignment,
although it did not always match the chromosome reported by the genotyping company.

```{r}
# Markers without a BLAST position, shown with whatever Bowtie2 found.
maize_qtl[is.na(maize_qtl$Blast_pos),
          c("QTL.marker", "Chrom", "Chrom.Pos", "Blast_chrom", "Blast_pos", "Bowtie_chrom", "Bowtie_pos")]
```

Let's start columns to store the final chromosome and position that we will use
after comparing both alignments.

```{r}
# Start from the BLAST results.
maize_qtl$Final_chrom <- maize_qtl$Blast_chrom
maize_qtl$Final_pos <- maize_qtl$Blast_pos
maize_qtl$Final_strand <- maize_qtl$Blast_strand
# A BLAST result is rejected when there was no hit, or when both the
# chromosome and the position disagree with the supplied values.
# NOTE(review): a hit on a different chromosome whose Blast_pos is NA evaluates
# to NA here and is dropped by which(), i.e. it is not flagged as bad.
bad_blast <- which(is.na(maize_qtl$Blast_chrom) |
                     (maize_qtl$Blast_chrom != paste0("Chr", maize_qtl$Chrom) &
                        maize_qtl$Blast_pos != maize_qtl$Chrom.Pos))
bad_blast
maize_qtl$Final_chrom[bad_blast] <- NA
maize_qtl$Final_pos[bad_blast] <- NA
maize_qtl$Final_strand[bad_blast] <- NA
# A Bowtie2 result is accepted when it aligned and either the chromosome or
# the position agrees with the supplied values.
good_bowtie <- which(!is.na(maize_qtl$Bowtie_chrom) &
                       (maize_qtl$Bowtie_chrom == paste0("Chr", maize_qtl$Chrom) |
                        maize_qtl$Bowtie_pos == maize_qtl$Chrom.Pos))
# Only rows with a rejected BLAST result are filled in from Bowtie2.
bowtie_fill <- good_bowtie[good_bowtie %in% bad_blast]

maize_qtl$Final_chrom[bowtie_fill] <- maize_qtl$Bowtie_chrom[bowtie_fill]
maize_qtl$Final_pos[bowtie_fill] <- maize_qtl$Bowtie_pos[bowtie_fill]
maize_qtl$Final_strand[bowtie_fill] <- maize_qtl$Bowtie_strand[bowtie_fill]
```

#### Pairwise alignments based on supplied positions

Below is a list of markers for which neither the BLAST nor the Bowtie2
alignment matched the supplied chromosome (or no alignment was found at all).
We'll try one more approach to align them.

```{r}
# Rows that still have no accepted chromosome after BLAST and Bowtie2.
to_fill <- which(is.na(maize_qtl$Final_chrom))
maize_qtl[to_fill,
          c("QTL.marker", "Chrom", "Chrom.Pos", "Blast_chrom", "Blast_pos", "Bowtie_chrom", "Bowtie_pos", "Seq.consensus")]
```

Given that other SNPs were off in position by up to 80 nt, and the tags
are 69 nt long, we will search sequence 200 nt on either side of each SNP
position.

```{r}
# One-base ranges at the supplied coordinates of the unplaced markers.
remaining_snps <- GRanges(paste0("Chr", maize_qtl$Chrom[to_fill]),
                          IRanges(start = maize_qtl$Chrom.Pos[to_fill],
                                  width = 1))
names(remaining_snps) <- maize_qtl$QTL.marker[to_fill]
# With both = TRUE the flank straddles the supplied position, 200 nt on each
# side (a 400-nt window).
remaining_snps_flank <- flank(remaining_snps, width = 200, both = TRUE)
# Reference sequence of each window.
remaining_snps_flank_seq <- scanFa(refgenome, remaining_snps_flank)
```

We'll do the alignment using the Biostrings package.

```{r}
# Locally align each remaining tag, and its reverse complement, against the
# reference window around its supplied position, and record which orientation
# scores higher.  Sized from `remaining_snps` rather than a fixed count so the
# loop still covers every marker if the number of unplaced markers changes.
n_remaining <- length(remaining_snps)
aligns_forward <- aligns_reverse <- vector(mode = "list", length = n_remaining)
best_align <- character(n_remaining)
# The substitution matrix is the same for every alignment; build it once.
sub_matrix <- nucleotideSubstitutionMatrix()

for(i in seq_len(n_remaining)){
  thistag <- DNAString(maize_qtl$Seq.consensus[match(names(remaining_snps)[i], maize_qtl$QTL.marker)])
  aligns_forward[[i]] <- pairwiseAlignment(thistag, remaining_snps_flank_seq[i],
                          substitutionMatrix = sub_matrix,
                          type = "local", gapOpening = 2, gapExtension = 2)
  aligns_reverse[[i]] <- pairwiseAlignment(reverseComplement(thistag), remaining_snps_flank_seq[i],
                          substitutionMatrix = sub_matrix,
                          type = "local", gapOpening = 2, gapExtension = 2)
  # Ties go to "bot".
  if(score(aligns_forward[[i]]) > score(aligns_reverse[[i]])){
    best_align[i] <- "top"
  } else {
    best_align[i] <- "bot"
  }
}
```

We can examine the output alignments.

```{r}
# Orientation chosen for each remaining marker.
best_align

# Print one alignment per marker (indices and orientations are hard-coded;
# presumably the orientation shown is the one reported in best_align for the
# data this was written against).
aligns_reverse[[1]]
aligns_forward[[2]]
aligns_reverse[[3]]
aligns_forward[[4]]
```

Now we can calculate SNP positions.

```{r}
# SNP offsets within the tags, for reference while computing positions.
maize_qtl$SNP.Position[to_fill]

# Positions worked out by hand from the window start plus constants.
# NOTE(review): the constants look like the alignment start within the window
# (188, 181, 161, 176), the tag length (69) and the SNP offsets (15, 20, 44,
# 25), presumably read off the printed alignments -- they are specific to the
# four markers in this data set and must be redone if `to_fill` changes.
maize_qtl$Final_pos[to_fill[1]] <- start(remaining_snps_flank)[1] + 188L - 1L + 69L - 15L - 1L
maize_qtl$Final_pos[to_fill[2]] <- start(remaining_snps_flank)[2] + 181L - 1L + 20L
maize_qtl$Final_pos[to_fill[3]] <- start(remaining_snps_flank)[3] + 161L - 1L + 69L - 44L - 1L
maize_qtl$Final_pos[to_fill[4]] <- start(remaining_snps_flank)[4] + 176L - 1L + 25L

# Chromosome comes from the supplied data, strand from the better alignment.
maize_qtl$Final_chrom[to_fill] <- as.character(seqnames(remaining_snps))
maize_qtl$Final_strand[to_fill] <- best_align
```

Checking the results:

```{r}
# Supplied vs. final coordinates for the markers placed by pairwise alignment.
maize_qtl[to_fill,
          c("QTL.marker", "Chrom", "Chrom.Pos", "Final_chrom", "Final_pos")]
```

As with the rest of the markers, some of the positions match and some don't.

Now I will save all of my positions to a CSV file.

```{r eval = FALSE}
# Row names are written as the first column of the file.
write.csv(maize_qtl, file = "results/maize_qtl_positions_corrected.csv")
```

To reload without having to run the above code:

```{r eval = FALSE}
# The first column of the file holds the row names.
maize_qtl <- read.csv("results/maize_qtl_positions_corrected.csv",
                      row.names = 1)
```

### Public HapMap VCFs for maize

In order to annotate more flanking SNPs than were provided in the original
genotype files,
I downloaded publicly available maize variant data from Cyverse.  See
http://cbsusrv04.tc.cornell.edu/users/panzea/download.aspx?filegroupid=34
for instructions.  Each chromosome is in its own VCF.

```{r bzip_and_index, eval = FALSE}
# Directory containing the VCFs; it is pasted directly in front of the file
# names below, so it must end with a "/".
vcfpath <- "/mnt/lvclark1/maize_snp/" # point to where you have the VCF files
# One VCF file name per chromosome, 1 through 10.
vcffiles <- paste0("hmp321_agpv4_chr", 1:10, ".vcf.gz")
```

I'll import the QTL now so that I can just import the VCF regions of interest
and put them into a smaller file to download to my laptop.  Note that chromosome
names in the reference genome are formatted as "Chr1" whereas chromosome names
in the public VCFs are formatted as "1".

```{r qtl_ranges}
# Window of `search_distance` bp on each side of every final SNP position,
# used to pull nearby variants out of the public VCFs.
search_distance <- 1e5
# Clamp the window start to 1 so that a SNP closer than `search_distance` to
# the start of its chromosome does not produce a zero or negative coordinate.
qtl_ranges <- GRanges(maize_qtl$Final_chrom,
                      IRanges(start = pmax(maize_qtl$Final_pos - search_distance, 1),
                              end = maize_qtl$Final_pos + search_distance))
names(qtl_ranges) <- maize_qtl$QTL.marker
```

```{r readbigvcfs, eval = FALSE}
# One slot per chromosome for the VCF records and for the numeric genotypes.
vcflist <- vector(mode = "list", length = 10)
numgenlist <- vector(mode = "list", length = 10)
for(chr in seq_len(10)){
  # QTL windows on this chromosome, renamed from "ChrN" to the "N" style used
  # in the public VCFs.
  chr_qtl <- qtl_ranges[seqnames(qtl_ranges) == paste0("Chr", chr)]
  thisrange <- GRanges(as.character(chr), ranges(chr_qtl))
  names(thisrange) <- names(chr_qtl)
  thisfile <- paste0(vcfpath, vcffiles[chr])
  # Import the records inside the windows only, without genotype fields
  # (geno = NA); genotypes are read separately by getNumGeno().
  vcflist[[chr]] <- readVcf(thisfile,
                            genome = as.character(chr),
                            param = ScanVcfParam(which = thisrange, geno = NA))
  numgenlist[[chr]] <- getNumGeno(thisfile, thisrange)
}

save(vcflist, numgenlist, file = "results/vcflist.RData")
```

The above code made a file small enough to download to my laptop. I will load
it to continue from here.

```{r loadvcflist}
# Restores `vcflist` and `numgenlist` as saved by the previous chunk.
load("results/vcflist.RData")
```

We should do QC to make sure the reference genome sequence matches what is
listed in the VCF.

```{r}
# For each chromosome's VCF, the fraction of records whose REF allele equals
# the reference genome sequence at the same range.
refcheck_public <- sapply(vcflist, function(vcf){
  # Rename seqnames from "N" to "ChrN" to match the reference FASTA.
  vcf_rr <- rowRanges_correctedSeqnames(vcf,
                                        fixfn = function(i) paste0("Chr", i))
  genome_ref <- scanFa(refgenome, vcf_rr)
  mean(genome_ref == vcf_rr$REF)
})
all(refcheck_public == 1) # TRUE if all reference alleles match reference
```

Are the SNPs of interest in the public VCF?

```{r warning = FALSE}
# Flag markers whose final position is a record in the public VCF, and store
# the name of that record.
maize_qtl$Public <- FALSE
maize_qtl$Public_name <- ""
for(i in seq_len(nrow(maize_qtl))){
  chrom <- as.integer(sub("Chr", "", maize_qtl$Final_chrom[i]))
  # one marker not on chromosome: it is left as not public
  if(!is.na(chrom)){
    vcf_chr <- vcflist[[chrom]]
    hit <- match(maize_qtl$Final_pos[i], start(vcf_chr))
    maize_qtl$Public[i] <- !is.na(hit)
    if(maize_qtl$Public[i]){
      maize_qtl$Public_name[i] <- names(vcf_chr)[hit]
    }
  }
}

# Proportion of markers found in the public VCF.
mean(maize_qtl$Public, na.rm = TRUE)
```

Only about 70% of these are in the public VCF.  Therefore, we'll rely on the
spreadsheet for information about target SNPs, and just use the public VCF
to discover flanking SNPs.

We'll convert the public VCF data from two lists into a single
`GRanges` object and a single matrix.

```{r warning = FALSE}
# Concatenate the row ranges of all chromosomes into one object, then rename
# the seqnames from "N" to "ChrN" to match the reference genome.
temp <- do.call(c, lapply(vcflist, rowRanges))
vcf_ranges <- rowRanges_correctedSeqnames(temp,
                                          fixfn = function(x) paste0("Chr", x))
rm(temp)

# Stack the per-chromosome genotype matrices in the same chromosome order.
vcf_numgen <- do.call(rbind, numgenlist)
rm(numgenlist)
```

We'll look at the distribution of allele frequencies to see if we want to
filter some markers.

```{r}
# Mean genotype per variant divided by two.
# assumes genotypes are coded as 0/1/2 copies of an allele, so that this is an
# allele frequency -- TODO confirm against getNumGeno()
vcf_alfreq <- rowMeans(vcf_numgen, na.rm = TRUE) / 2
hist(vcf_alfreq)

# Proportion of variants with frequency above 1%.
mean(vcf_alfreq > 0.01)
```

Only annotating SNPs with an allele frequency greater than 1% seems reasonable;
these are the ones most likely to influence assay efficacy.

```{r}
# Keep variants with frequency above 1%.  Only the lower bound is applied.
vcf_ranges_filt <- vcf_ranges[vcf_alfreq > 0.01]
```

## Technical parameters for marker design from significant SNPs

Ideally, we would like to design markers directly from the significant hits.
We should check that they will make reasonably good KASP markers first,
however.

### GC content

PCR will work best when GC content is 40-60%.  We will first extract the
flanking sequence needed for marker design.

```{r qtlkaspseq}
# Assay window around each final SNP position, as defined by getKaspRange()
# (sourced from src/), and the reference sequence of each window.
qtl_kasp_ranges <- getKaspRange(GRanges(maize_qtl$Final_chrom,
                                        IRanges(start = maize_qtl$Final_pos,
                                                width = 1L)))
qtl_kasp_seq <- scanFa(refgenome, qtl_kasp_ranges)
```

Then we'll determine GC content.

```{r gccontent}
# Two count columns per sequence: A/T/W letters, then G/C/S letters.
temp <- letterFrequency(qtl_kasp_seq, c("ATW", "GCS"))
# GC fraction among the counted letters (other ambiguity codes are ignored).
maize_qtl$GCcontent <- temp[,2] / rowSums(temp)
```


```{r gccontenthist}
# Distribution of GC content across the assay windows.
hist(maize_qtl$GCcontent, xlab = "GC content",
     main = "GC content flanking significant SNPs", breaks = 20)
```

Many are below the desired range, but not by much.

### Number of flanking SNPs

A few flanking SNPs are okay, but we want to make sure none of these
have an excessive amount.

```{r flankingsnps}
# Count filtered public variants in positions 1-50 and 52-101 of each assay
# window; position 51 is skipped.
# assumes getKaspRange() returns 101-bp windows with the target SNP at
# position 51 -- TODO confirm in src/
temp1 <- findOverlaps(narrow(qtl_kasp_ranges, start = 1, width = 50),
                      vcf_ranges_filt, type = "any")
nhits1 <- countLnodeHits(temp1)
temp2 <- findOverlaps(narrow(qtl_kasp_ranges, start = 52, width = 50),
                      vcf_ranges_filt, type = "any")
nhits2 <- countLnodeHits(temp2)
# Total number of flanking variants on both sides of the target.
maize_qtl$Nflanking <- nhits1 + nhits2

hist(maize_qtl$Nflanking, xlab = "Number of flanking SNPs",
     main = "Number of flanking SNPs within 50 bp of significant SNPs")
table(maize_qtl$Nflanking)
```

For those with three or more, we might see if there are better markers.

## Evaluating nearby markers

For which markers do we want to search for substitutes?

```{r}
# Markers for which a substitute is sought: present in the public VCF and
# either more than two flanking SNPs or GC content outside 35-65%.
# The full column name GCcontent is used in both comparisons; `$GC` only
# worked through partial matching and would silently return NULL if another
# column starting with "GC" were ever added.
to_sub <- which(maize_qtl$Public &
                  (maize_qtl$Nflanking > 2 | maize_qtl$GCcontent < 0.35 |
                     maize_qtl$GCcontent > 0.65))
maize_qtl[to_sub,c("QTL.marker", "Final_pos", "Final_chrom", "GCcontent", "Nflanking")]
```

We'll find markers in linkage disequilibrium with our targets and
calculate GC content and number of flanking SNPs.

```{r eval = FALSE}
# One data frame of candidate substitute markers per target marker.
candidate_list <- vector(mode = "list", length = length(to_sub))
names(candidate_list) <- maize_qtl$QTL.marker[to_sub]

for(i in seq_along(to_sub)){
  target_row <- to_sub[i]
  target_vcf_name <- maize_qtl$Public_name[target_row]
  target_name <- maize_qtl$QTL.marker[target_row]
  message(target_name)

  # Public variants imported for this marker's window, minus the target itself.
  in_window <- vcf_ranges$paramRangeID == target_name
  neighbors <- setdiff(names(vcf_ranges)[in_window], target_vcf_name)

  # Squared Pearson correlation of genotypes between the target and each
  # neighboring variant.
  target_geno <- vcf_numgen[target_vcf_name,]
  r2 <- vapply(neighbors,
               function(m){
                 cor(target_geno, vcf_numgen[m,], use = "complete.obs", method = "pearson") ^ 2
               },
               numeric(1), USE.NAMES = FALSE)
  # don't investigate marker unless very high LD to target
  in_ld <- !is.na(r2) & r2 >= 0.9
  ld_markers <- neighbors[in_ld]

  gc_frac <- numeric(0)
  n_flank <- integer(0)
  if(length(ld_markers) > 0){
    # Get GC content and number of flanking SNPs
    ld_kasp_range <- getKaspRange(vcf_ranges[ld_markers])
    base_counts <- letterFrequency(scanFa(refgenome, ld_kasp_range),
                                   c("ATW", "GCS"))
    gc_frac <- base_counts[,2] / rowSums(base_counts)

    hits <- GenomicRanges::findOverlaps(ld_kasp_range, vcf_ranges_filt, type = "any")
    # One is subtracted from each count (presumably for the candidate itself).
    n_flank <- S4Vectors::countLnodeHits(hits) - 1L
  }

  candidate_list[[i]] <- data.frame(Marker = ld_markers,
                                    LD = r2[in_ld],
                                    GCcontent = gc_frac,
                                    Nflanking = n_flank)
}

save(candidate_list, file = "results/candidate_list_maize.RData")
```

```{r}
# Restores `candidate_list` as saved by the previous chunk.
load("results/candidate_list_maize.RData")
# Number of candidate substitutes found for each target marker.
sapply(candidate_list, nrow)
# Drop the targets that have no candidates.
candidate_list <- candidate_list[sapply(candidate_list, nrow) > 0]
names(candidate_list)
```

From the markers that were identified to be in LD, we will choose some good
alternatives.

```{r results = "hide"}
# List the candidate substitutes for marker `m` that are at least as good as
# the original marker.
#
# m    Name of the original marker, as found in tab$QTL.marker and names(lst).
# tab  Marker table with columns QTL.marker, GCcontent, Nflanking, Final_pos.
# lst  Named list of candidate data frames (columns Marker, GCcontent,
#      Nflanking).
# rng  Ranges object indexed by candidate marker name.
#
# Returns the filtered candidate data frame with two added columns: Pos (start
# of the candidate in `rng`) and Dist (absolute distance to the original
# marker).
betterMarker <- function(m, tab = maize_qtl, lst = candidate_list, rng = vcf_ranges){
  # Characteristics of the original marker that a candidate must match or beat.
  idx <- match(m, tab$QTL.marker)
  orig_gc_dev <- abs(0.5 - tab$GCcontent[idx])
  orig_nflank <- tab$Nflanking[idx]
  orig_pos <- tab$Final_pos[idx]

  cand <- lst[[m]]
  # GC content no further from 50% than the original ...
  cand <- cand[abs(0.5 - cand$GCcontent) <= orig_gc_dev,]
  # ... and no more flanking SNPs than the original.
  cand <- cand[cand$Nflanking <= orig_nflank,]
  # If some candidates are ideal (fewer than 3 flanking SNPs and GC strictly
  # between 40% and 60%), keep only those.
  ideal <- cand$Nflanking < 3 & cand$GCcontent > 0.4 & cand$GCcontent < 0.6
  if(any(ideal)){
    cand <- cand[ideal,]
  }
  # Physical position and distance, so the closest candidate can be picked.
  cand$Pos <- start(rng[cand$Marker])
  cand$Dist <- abs(cand$Pos - orig_pos)
  cand
}

# Candidates for each target marker were inspected by eye; the trailing
# comment on each line records the decision that was made.
betterMarker("S10_16561261")  # use 10-16329120
betterMarker("S10_112661493") # use 10-111797724
betterMarker("S3_143135226")  # use 3-142270994
betterMarker("S1_102219796")  # keep original
betterMarker("S3_168085123")  # keep original
betterMarker("S3_144672580")  # keep original
betterMarker("S1_219230116")  # keep original; 1-216107167 is an insertion
betterMarker("S1_17612352")   # use 1-17281449
betterMarker("S5_203246920")  # use 5-198031485
betterMarker("S7_158457623")  # use 7-153071765
betterMarker("S9_136694198")  # keep original
betterMarker("S2_229069973")  # use 2-222755051
betterMarker("S1_51819212")   # keep original
betterMarker("S5_77117962")   # keep original

# Chosen substitutes, keyed by the original marker they replace.
alt_choice <- c("S10_16561261" = "10-16329120",
                "S10_112661493" = "10-111797724",
                "S3_143135226" = "3-142270994",
                "S1_17612352" = "1-17281449",
                "S5_203246920" = "5-198031485",
                "S7_158457623" = "7-153071765",
                "S2_229069973" = "2-222755051")
maize_qtl$Alt_marker <- NA_character_
maize_qtl$Alt_marker[match(names(alt_choice), maize_qtl$QTL.marker)] <-
  unname(alt_choice)

# Rows that received a substitute, and the substitute's position.
altrows <- which(!is.na(maize_qtl$Alt_marker))
maize_qtl$Alt_pos <- NA_integer_
maize_qtl$Alt_pos[altrows] <- start(vcf_ranges[maize_qtl$Alt_marker[altrows]])

# GC content and flanking-SNP count of each substitute, looked up in the
# candidate table of the marker it replaces.
maize_qtl$Alt_GCcontent <- NA_real_
maize_qtl$Alt_Nflanking <- NA_integer_
for(r in altrows){
  cand <- candidate_list[[maize_qtl$QTL.marker[r]]]
  hit <- match(maize_qtl$Alt_marker[r], cand$Marker)
  maize_qtl$Alt_GCcontent[r] <- cand$GCcontent[hit]
  maize_qtl$Alt_Nflanking[r] <- cand$Nflanking[hit]
}
```

## Formatting sequences for KASP

I'll modify a previously written function to format sequences for KASP,
since we aren't starting directly from VCF.

```{r}
# Build KASP design strings of the form <left flank>[allele1/allele2]<right
# flank>, with known flanking variants written as IUPAC ambiguity codes.
#
# rr           Ranges of known variants with REF and ALT columns; these are
#              the variants to annotate in the flanking sequence.
# rr2          Assay windows, one per target SNP; names(rr2) become SNP_ID.
# refgenome    Indexed FASTA to read the window sequences from.
# allele1      First allele to print for each target.
# allele2      Second allele to print for each target.
# strand       "top" or "bot" per window; "bot" windows are
#              reverse-complemented before formatting.
# flanking_bp  Number of bases printed on each side of the target.
#
# Returns a data frame with columns SNP_ID and Sequence.
formatKasp2 <- function(rr, rr2, refgenome, allele1, allele2,
                        strand = rep("top", length(rr2)), flanking_bp = 50){

  kasp_seq <- Rsamtools::scanFa(refgenome, rr2, as = "DNAStringSet")
  overlaps <- GenomicRanges::findOverlaps(rr2, rr, type = "any")
  window_of_hit <- S4Vectors::queryHits(overlaps)
  variant_of_hit <- S4Vectors::subjectHits(overlaps)
  window_start <- BiocGenerics::start(rr2)
  single_nt <- c("A", "C", "G", "T")

  for(w in seq_along(rr2)){
    # Variants overlapping this window, as 1-based offsets into its sequence.
    variants <- rr[variant_of_hit[window_of_hit == w]]
    offset <- BiocGenerics::start(variants) - (window_start[w] - 1)
    # Only single-nucleotide ALT alleles go into the ambiguity code.
    alt_letters <- lapply(variants$ALT,
                          function(a) paste(a[a %in% single_nt], collapse = ""))
    iupac <- Biostrings::mergeIUPACLetters(paste0(variants$REF, alt_letters))
    kasp_seq[[w]] <- Biostrings::replaceLetterAt(kasp_seq[[w]], offset, iupac)
  }

  is_bot <- strand == "bot"
  kasp_seq[is_bot] <- reverseComplement(kasp_seq[is_bot])

  # The base at position flanking_bp + 1 is replaced by the bracketed alleles.
  left_flank <- XVector::subseq(kasp_seq, start = 1, width = flanking_bp)
  right_flank <- XVector::subseq(kasp_seq, start = flanking_bp + 2,
                                 width = flanking_bp)
  data.frame(SNP_ID = names(rr2),
             Sequence = paste0(left_flank,
                               "[",
                               allele1, "/", allele2,
                               "]",
                               right_flank),
             stringsAsFactors = FALSE)
}

# Name the assay windows so that formatKasp2() can report SNP IDs.
names(qtl_kasp_ranges) <- maize_qtl$QTL.marker

# Design strings for the original markers, on the strand of the aligned tag.
# Alleles are taken as characters 1 and 3 of the Allele column.
# NOTE(review): assumes the spreadsheet has a column named exactly "Allele"
# holding values like "A/G"; if it only has Allele.1 and Allele.2,
# `maize_qtl$Allele` returns NULL -- confirm.
kasp_strings_orig <- formatKasp2(vcf_ranges_filt, qtl_kasp_ranges, refgenome,
                                 substring(maize_qtl$Allele, 1, 1),
                                 substring(maize_qtl$Allele, 3, 3),
                                 maize_qtl$Final_strand)

# Assay windows for the substitute markers.
alt_ranges <- vcf_ranges[maize_qtl$Alt_marker[altrows]]
kasp_ranges_alt <- getKaspRange(alt_ranges)

# Design strings for the substitutes, using REF/ALT from the public VCF and
# the default "top" strand.
# NOTE(review): unlist(alt_ranges$ALT) lines up with alt_ranges only if every
# substitute has exactly one ALT allele -- confirm.
kasp_strings_alt <- formatKasp2(vcf_ranges_filt, kasp_ranges_alt, refgenome,
                                alt_ranges$REF, unlist(alt_ranges$ALT))

maize_qtl$KASP_string <- kasp_strings_orig$Sequence
# Empty string for markers without a substitute.
maize_qtl$KASP_string_alt <- ""
maize_qtl$KASP_string_alt[altrows] <- kasp_strings_alt$Sequence
```

Finally, we'll export to CSV.

```{r eval = FALSE}
# Final table with positions, QC statistics and KASP design strings.
write.csv(maize_qtl, file = "results/maize_qtl_kasp_design_2021-04-08.csv", row.names = FALSE)
```