diff --git a/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala b/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala index a6583567f..a2ea75414 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala @@ -36,22 +36,30 @@ import com.fulcrumgenomics.util.{Io, ProgressLogger} """ |Copies the UMI at the end of the BAM's read name to the RX tag. | - |The read name is split on `:` characters with the last field is assumed to be the UMI sequence. The UMI + |The read name is split on `:` characters with the last field assumed to be the UMI sequence. The UMI |will be copied to the `RX` tag as per the SAM specification. If any read does not have a UMI composed of |valid bases (ACGTN), the program will report the error and fail. | - |If a read name contains multiple UMIs they may be delimited by either hyphens (`-`) or pluses (`+`). The - |resulting UMI in the `RX` tag will always be hyphen delimited. + |If a read name contains multiple UMIs they may be delimited (typically by a hyphen (`-`) or plus (`+`)). + |The `--umi-delimiter` option specifies the delimiter on which to split. The resulting UMI in the `RX` tag + |will always be hyphen delimited. + | + |Some tools (e.g. BCL Convert) may reverse-complement UMIs on R2 and add a prefix to indicate that the sequence + |has been reverse-complemented. The `--rc-prefix` option specifies the prefix character(s) and causes them to + |be removed. Additionally, if the `--normalize-rc-umis` flag is specified, any reverse-complemented UMIs will + |be normalized (i.e., reverse-completemented back to be in the forward orientation). | |To obtain behavior similar to `umi_tools`' `--umi-separator=":r"`, specify the delimiter and - |prefix separately, i.e. `--umi-delimiter=":"` and `--umi-prefix="r"`. + |prefix separately, i.e. `--field-delimiter=":"` and `--rc-prefix="r"`. """) class CopyUmiFromReadName -( @arg(flag='i', doc="The input BAM file") input: PathToBam, - @arg(flag='o', doc="The output BAM file") output: PathToBam, - @arg(doc="Remove the UMI from the read name") removeUmi: Boolean = false, - @arg(doc="Delimiter between the read name and UMI.") umiDelimiter: Char = ':', - @arg(doc="Any characters preceding the UMI sequence in the read name.") umiPrefix: Option[String] = None, +( @arg(flag='i', doc="The input BAM file.") input: PathToBam, + @arg(flag='o', doc="The output BAM file.") output: PathToBam, + @arg(doc="Remove the UMI from the read name.") removeUmi: Boolean = false, + @arg(doc="Delimiter between the read name and UMI.") fieldDelimiter: Char = ':', + @arg(doc="Delimiter between UMI sequences.") umiDelimiter: Char = '+', + @arg(doc="The prefix to a UMI sequence that indicates it is reverse-complemented.") rcPrefix: Option[String] = None, + @arg(doc="Whether to reverse-complement UMI sequences with the '--rc-prefix'.") normalizeRcUmis: Boolean = false, ) extends FgBioTool with LazyLogging { Io.assertReadable(input) @@ -63,7 +71,12 @@ class CopyUmiFromReadName val progress = new ProgressLogger(logger) source.foreach { rec => progress.record(rec) - writer += Umis.copyUmiFromReadName(rec=rec, removeUmi=removeUmi, delimiter=umiDelimiter, prefix=umiPrefix) + writer += Umis.copyUmiFromReadName(rec=rec, + removeUmi=removeUmi, + fieldDelimiter=fieldDelimiter, + umiDelimiter=umiDelimiter, + rcPrefix=rcPrefix, + normalizeRcUmis=normalizeRcUmis) } progress.logLast() source.safelyClose() diff --git a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala index ef9962007..610dd55c2 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala @@ -26,6 +26,7 @@ package com.fulcrumgenomics.umi import com.fulcrumgenomics.bam.api.SamRecord +import com.fulcrumgenomics.util.Sequences object Umis { @@ -38,17 +39,30 @@ object Umis { * * @param rec the record to modify * @param removeUmi true to remove the UMI from the read name, otherwise only copy the UMI to the tag - * @param delimiter the delimiter of fields within the read name + * @param fieldDelimiter the delimiter of fields within the read name + * @param umiDelimiter the delimiter between sequences in the UMI string + * @param rcPrefix the prefix of a UMI that indicates it is reverse-complimented + * @param normalizeRcUmis whether to normalize reverse-complemented UMIs * @return the modified record */ - def copyUmiFromReadName(rec: SamRecord, removeUmi: Boolean = false, delimiter: Char = ':', prefix: Option[String] = None): SamRecord = { + def copyUmiFromReadName(rec: SamRecord, + removeUmi: Boolean = false, + fieldDelimiter: Char = ':', + umiDelimiter: Char = '+', + rcPrefix: Option[String] = None, + normalizeRcUmis: Boolean = false): SamRecord = { // Extract and set the UMI - val umi = extractUmisFromReadName(rec.name, delimiter, strict=false, prefix=prefix) + val umi = extractUmisFromReadName(rec.name, + fieldDelimiter, + strict=false, + umiDelimiter, + rcPrefix=rcPrefix, + normalizeRcUmis=normalizeRcUmis) require(umi.nonEmpty, f"No valid UMI found in: ${rec.name}") umi.foreach(u => rec(ConsensusTags.UmiBases) = u) // Remove the UMI from the read name if requested - if (removeUmi) rec.name = rec.name.substring(0, rec.name.lastIndexOf(delimiter)) + if (removeUmi) rec.name = rec.name.substring(0, rec.name.lastIndexOf(fieldDelimiter)) rec } @@ -59,29 +73,43 @@ object Umis { * * See https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm * The UMI field is optional, so read names may or may not contain it. Illumina also specifies that the UMI - * field may contain multiple UMIs, in which case they will delimit them with `+` characters. Pluses will be - * translated to hyphens before returning. + * field may contain multiple UMIs, in which case they will delimit them with `umiDelimiter` characters, which + * will be translated to hyphens before returning. * * If `strict` is true the name _must_ contain either 7 or 8 colon-separated segments, with the UMI being the last in the case of 8 and `None` in the case of 7. * * If `strict` is false the last segment is returned so long as it appears to be a valid UMI. */ - def extractUmisFromReadName(name: String, delimiter: Char = ':', strict: Boolean, prefix: Option[String] = None): Option[String] = { + def extractUmisFromReadName(name: String, + fieldDelimiter: Char = ':', + strict: Boolean, + umiDelimiter: Char = '+', + rcPrefix: Option[String] = None, + normalizeRcUmis: Boolean = false): Option[String] = { // If strict, check that the read name actually has eight parts, which is expected val rawUmi = if (strict) { - val colons = name.count(_ == delimiter) + val colons = name.count(_ == fieldDelimiter) if (colons == 6) None - else if (colons == 7) Some(name.substring(name.lastIndexOf(delimiter) + 1, name.length)) + else if (colons == 7) Some(name.substring(name.lastIndexOf(fieldDelimiter) + 1, name.length)) else throw new IllegalArgumentException(s"Trying to extract UMI from read with ${colons + 1} parts (7-8 expected): ${name}") } else { - val idx = name.lastIndexOf(delimiter) - require(idx != -1, s"Read did not have multiple '${delimiter}'-separated fields: ${name}") + val idx = name.lastIndexOf(fieldDelimiter) + require(idx != -1, s"Read did not have multiple '${fieldDelimiter}'-separated fields: ${name}") Some(name.substring(idx + 1, name.length)) } - val umiSeq = rawUmi.map(seq => (if (prefix.isEmpty) seq else seq.stripPrefix(prefix.get))) - val umi = umiSeq.map(raw => (if (raw.indexOf('+') > 0) raw.replace('+', '-') else raw).toUpperCase) + var umi = rawUmi.map(raw => rcPrefix match { + case Some(prefix) if raw.indexOf(prefix) >= 0 && normalizeRcUmis => + raw.split(umiDelimiter).map(seq => + (if (seq.startsWith(prefix)) Sequences.revcomp(seq.stripPrefix(prefix)) else seq).toUpperCase + ).mkString("-") + case Some(prefix) if raw.indexOf(prefix) >= 0 => + raw.replace(prefix, "").replace(umiDelimiter, '-').toUpperCase + case _ if raw.indexOf(umiDelimiter) > 0 => raw.replace(umiDelimiter, '-').toUpperCase + case _ => raw.toUpperCase + }) + val valid = umi.forall(u => u.forall(isValidUmiCharacter)) if (strict && !valid) throw new IllegalArgumentException(s"Invalid UMI '${umi.get}' extracted from name '${name}") diff --git a/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala b/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala index 3503fdeba..85c4a81c0 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala @@ -33,14 +33,21 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues { private case class Result(name: String, umi: String) /** Runs CopyUmiFromReadName using the given read names returning the output read names and UMIs. */ - private def run(names: Iterable[String], removeUmi: Boolean, umiPrefix: Option[String] = None): IndexedSeq[Result] = { + private def run(names: Iterable[String], + removeUmi: Boolean, + rcPrefix: Option[String] = None, + normalizeRcUmis: Boolean = false): IndexedSeq[Result] = { // build the reads val builder = new SamBuilder() names.foreach { name => builder.addFrag(name=name, unmapped=true) } // run the tool val out = makeTempFile("test.", ".bam") - val tool = new CopyUmiFromReadName(input=builder.toTempFile(), output=out, removeUmi=removeUmi, umiPrefix=umiPrefix) + val tool = new CopyUmiFromReadName(input=builder.toTempFile(), + output=out, + removeUmi=removeUmi, + rcPrefix=rcPrefix, + normalizeRcUmis=normalizeRcUmis) executeFgbioTool(tool) // slurp the results @@ -70,10 +77,17 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues { results.map(_.umi) should contain theSameElementsInOrderAs Seq("AAAA", "CCCC", "GGGG", "AAAA-CCCC") } - it should "remove any additional separator characters preceding the UMI" in { - val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA-CCCC") - val results = run(names=names, removeUmi=true, umiPrefix=Some("r")) + it should "remove a reverse-complement prefix to the UMI" in { + val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA+CCCC") + val results = run(names=names, removeUmi=true, rcPrefix=Some("r"), normalizeRcUmis = false) results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah") results.map(_.umi) should contain theSameElementsInOrderAs Seq("AAAA", "CCCC", "GGGG", "AAAA-CCCC") } + + it should "remove a reverse-complement prefix to the UMI and reverse-complement the UMI when '--normalize-rc-umis'" in { + val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA+CCCC") + val results = run(names=names, removeUmi=true, rcPrefix=Some("r"), normalizeRcUmis = true) + results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah") + results.map(_.umi) should contain theSameElementsInOrderAs Seq("TTTT", "GGGG", "CCCC", "TTTT-CCCC") + } } diff --git a/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala b/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala index 96298b246..3d8c010b3 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala @@ -92,9 +92,9 @@ class UmisTest extends UnitSpec with OptionValues { } it should "split on a different name delimiter if specified" in { - copyUmiFromReadName(rec=rec("UMI-A"), delimiter='-').nameAndUmi shouldBe ("UMI-A", "A") - copyUmiFromReadName(rec=rec("UMI-C-A"), delimiter='-').nameAndUmi shouldBe ("UMI-C-A", "A") - copyUmiFromReadName(rec=rec("UMI-C-ACC+GGT"), delimiter='-').nameAndUmi shouldBe ("UMI-C-ACC+GGT", "ACC-GGT") + copyUmiFromReadName(rec=rec("UMI-A"), fieldDelimiter='-').nameAndUmi shouldBe ("UMI-A", "A") + copyUmiFromReadName(rec=rec("UMI-C-A"), fieldDelimiter='-').nameAndUmi shouldBe ("UMI-C-A", "A") + copyUmiFromReadName(rec=rec("UMI-C-ACC+GGT"), fieldDelimiter='-').nameAndUmi shouldBe ("UMI-C-ACC+GGT", "ACC-GGT") } it should "change the UMI delimiter from + to -" in {