diff --git a/README.md b/README.md index 3b4f707..f4a0a8b 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ allhic optimize tests/test.counts_GATC.2g2.txt tests/test.clm Build genome release, including `.agp` and `.fasta` output. ```console -allhic build tests/test.counts_GATC.2g1.tour seq.fasta.gz +allhic build tests/test.counts_GATC.2g?.tour tests/seq.fasta.gz tests/asm-2g.chr.fasta ``` ### Plot @@ -89,7 +89,7 @@ allhic plot tests/test.bam tests/test.counts_GATC.2g1.tour ![allhicplot](images/allhic-plot-s.png) -## Pipeline +## Pipeline Following the 4 steps of `prune`, `extract`, `partition`, `optimize`, as described above. In summary, we have: @@ -99,7 +99,7 @@ allhic extract tests/test.bam tests/seq.fasta.gz allhic partition tests/test.counts_GATC.txt tests/test.pairs.txt 2 allhic optimize tests/test.counts_GATC.2g1.txt tests/test.clm allhic optimize tests/test.counts_GATC.2g2.txt tests/test.clm -allhic build tests/test.tour seq.fasta.gz +allhic build tests/test.counts_GATC.2g?.txt tests/seq.fasta.gz tests/asm-2g.chr.fasta ``` Or, in a single step: @@ -121,10 +121,9 @@ FASTA file. - [x] Use clustering when k = 1 - [x] Isolate matrix generation to "plot" - [x] Add "pipeline" to simplify execution -- [ ] Make "build" to merge subgroup tours +- [x] Make "build" to merge subgroup tours +- [x] Provide better error messages for "file not found" - [ ] Plot the boundary of the contigs in "plot" using genome.json -- [ ] Provide better error messages for "file not found" -- [ ] Merge tours from multiple partitions back to a single file - [ ] Add dot plot to "plot" - [ ] Compare numerical output with Lachesis - [ ] Improve Ler0 results diff --git a/agp.go b/agp.go index 199a162..7dde4cb 100644 --- a/agp.go +++ b/agp.go @@ -12,7 +12,6 @@ package allhic import ( "bufio" "bytes" - "os" "strconv" "strings" @@ -86,7 +85,7 @@ func (r *AGP) Add(row string) { // buildFasta builds target FASTA based on info from agpfile func buildFasta(agpfile string, seqs map[string]*seq.Seq) { agp := NewAGP(agpfile) - file, _ := os.Open(agpfile) + file := mustOpen(agpfile) log.Noticef("Parse agpfile `%s`", agpfile) scanner := bufio.NewScanner(file) @@ -95,7 +94,7 @@ func buildFasta(agpfile string, seqs map[string]*seq.Seq) { } var buf bytes.Buffer - outFile := RemoveExt(agpfile) + ".chr.fasta" + outFile := RemoveExt(agpfile) + ".fasta" outfh, _ := xopen.Wopen(outFile) prevObject := "" for _, line := range agp.lines { @@ -124,6 +123,7 @@ func buildFasta(agpfile string, seqs map[string]*seq.Seq) { // Last one writeRecord(prevObject, buf, outfh) buf.Reset() + log.Noticef("Assembly FASTA file `%s` built", outFile) } // writeRecord writes the FASTA record to the file diff --git a/anchor.go b/anchor.go index 3abe244..4d1f5f2 100644 --- a/anchor.go +++ b/anchor.go @@ -202,7 +202,7 @@ func (r *Anchorer) makeTrivialPaths(contigs []*Contig, flanksize int64) PathSet // ExtractInterContigLinks extracts links from the Bamfile func (r *Anchorer) ExtractInterContigLinks() { - fh, _ := os.Open(r.Bamfile) + fh := mustOpen(r.Bamfile) prefix := RemoveExt(r.Bamfile) disfile := prefix + ".dis" idsfile := prefix + ".ids" diff --git a/assess.go b/assess.go index 4555e04..201e9ea 100644 --- a/assess.go +++ b/assess.go @@ -98,11 +98,7 @@ func (r *Assesser) writePostProb(outfile string) { // readBed parses the bedfile to extract the start and stop for all the contigs func (r *Assesser) readBed() { - fh, err := os.Open(r.Bedfile) - if err != nil { - log.Errorf("bedfile `%s` does not exist", r.Bedfile) - os.Exit(1) - } + fh := mustOpen(r.Bedfile) log.Noticef("Parse bedfile `%s`", r.Bedfile) reader := bufio.NewReader(fh) @@ -142,7 +138,7 @@ func checkInRange(pos, start, end int) bool { // extractContigLinks builds the probability distribution of link sizes func (r *Assesser) extractContigLinks() { - fh, _ := os.Open(r.Bamfile) + fh := mustOpen(r.Bamfile) log.Noticef("Parse bamfile `%s`", r.Bamfile) br, _ := bam.NewReader(fh, 0) defer br.Close() diff --git a/base.go b/base.go index fefe8a4..611d6f8 100644 --- a/base.go +++ b/base.go @@ -349,7 +349,7 @@ func Percentage(a, b int) string { func ReadCSVLines(filename string) [][]string { log.Noticef("Parse csvfile `%s`", filename) - fh, _ := os.Open(filename) + fh := mustOpen(filename) defer fh.Close() var data [][]string @@ -384,3 +384,19 @@ func sortInt64s(a []int64) { func searchInt64(a []int64, x int64) int { return sort.Search(len(a), func(i int) bool { return a[i] >= x }) } + +// mustExist panics when a file is not found +func mustExist(filename string) { + if _, err := os.Stat(filename); os.IsNotExist(err) { + log.Fatal(err) + } +} + +// mustOpen wraps os.Open but panics if file not found +func mustOpen(filename string) *os.File { + f, err := os.Open(filename) + if err != nil { + log.Fatal(err) + } + return f +} diff --git a/build.go b/build.go index c1a0d2d..b02d535 100644 --- a/build.go +++ b/build.go @@ -22,9 +22,11 @@ import ( // Builder reconstructs the genome release AGP and FASTA files type Builder struct { - Tourfile string + Tourfiles []string Fastafile string - AGPfile string + // Output file + OutAGPfile string + OutFastafile string } // OOLine describes a simple contig entry in a scaffolding experiment @@ -59,15 +61,6 @@ func (r *OO) getFastaSizes(fastafile string) { } } -// readFiles initializes OO object -func (r *Builder) readFiles() *OO { - oo := new(OO) - oo.getFastaSizes(r.Fastafile) - oo.parseLastTour(r.Tourfile) - - return oo -} - // Add instantiates a new OOLine object and add to the array in OO func (r *OO) Add(scaffold, ctg string, ctgsize int, strand byte) { o := OOLine{scaffold, ctg, ctgsize, strand} @@ -76,7 +69,7 @@ func (r *OO) Add(scaffold, ctg string, ctgsize int, strand byte) { // writeAGP converts the simplistic OOLine into AGP format func (r *Builder) writeAGP(oo *OO) { - filename := r.AGPfile + r.OutAGPfile = RemoveExt(r.OutFastafile) + ".agp" gapSize := 100 gapType := "scaffold" linkage := "yes" @@ -86,7 +79,7 @@ func (r *Builder) writeAGP(oo *OO) { objectEnd := 1 partNumber := 0 componentType := 'W' - f, _ := os.Create(filename) + f, _ := os.Create(r.OutAGPfile) defer f.Close() w := bufio.NewWriter(f) components := 0 @@ -120,20 +113,28 @@ func (r *Builder) writeAGP(oo *OO) { components++ } w.Flush() - log.Noticef("A total of %d tigs written to `%s`", components, filename) + log.Noticef("A total of %d tigs written to `%s`", components, r.OutAGPfile) } -// Run kicks off the Builder +// Run kicks off the Build and constructs molecule using component FASTA sequence func (r *Builder) Run() { - r.AGPfile = RemoveExt(r.Tourfile) + ".agp" - r.Build(r.readFiles()) + oo := new(OO) + oo.getFastaSizes(r.Fastafile) + // oo.parseLastTour(r.Tourfile) + oo.mergeTours(r.Tourfiles) + r.writeAGP(oo) + buildFasta(r.OutAGPfile, oo.seqs) log.Notice("Success") } -// Build constructs molecule using component FASTA sequence -func (r *Builder) Build(oo *OO) { - r.writeAGP(oo) - buildFasta(r.AGPfile, oo.seqs) +// mergeTours merges a number of tours typically generated by partition and optimize +// In contrast to parseLastTour which only parse one tour +func (r *OO) mergeTours(tourfiles []string) { + for i, tourfile := range tourfiles { + seqid := fmt.Sprintf("g%d", i+1) + log.Noticef("Import `%s` => %s", tourfile, seqid) + r.parseLastTour(tourfile, seqid) + } } // parseLastTour reads tour from file @@ -141,7 +142,7 @@ func (r *Builder) Build(oo *OO) { // A tour file has the following format: // > name // contig1+ contig2- contig3? -func (r *OO) parseLastTour(tourfile string) { +func (r *OO) parseLastTour(tourfile string, seqid string) { words := parseTourFile(tourfile) var strand byte for _, tig := range words { @@ -151,7 +152,7 @@ func (r *OO) parseLastTour(tourfile string) { } else { strand = '?' } - r.Add("FINAL", tig, r.seqs[tig].Length(), strand) + r.Add(seqid, tig, r.seqs[tig].Length(), strand) } } @@ -163,7 +164,7 @@ func (r *OO) parseLastTour(tourfile string) { func (r *OO) ParseAllTours(tourfile string) { log.Noticef("Parse tourfile `%s`", tourfile) - file, _ := os.Open(tourfile) + file := mustOpen(tourfile) scanner := bufio.NewScanner(file) var ( name string diff --git a/clm.go b/clm.go index a582cf6..69ff82e 100644 --- a/clm.go +++ b/clm.go @@ -14,7 +14,6 @@ import ( "io" "math" "math/rand" - "os" "strconv" "strings" "sync" @@ -113,7 +112,7 @@ func NewCLM(Clmfile, REfile string) *CLM { // tig00035238 46779 recover // tig00030900 119291 func (r *CLM) readRE() { - file, _ := os.Open(r.REfile) + file := mustOpen(r.REfile) log.Noticef("Parse REfile `%s`", r.REfile) scanner := bufio.NewScanner(file) idx := 0 @@ -140,7 +139,7 @@ func rr(b byte) byte { // readClmLines parses the clmfile into a slice of CLMLine func readClmLines(clmfile string) []CLMLine { - file, _ := os.Open(clmfile) + file := mustOpen(clmfile) log.Noticef("Parse clmfile `%s`", clmfile) reader := bufio.NewReader(file) diff --git a/cmd/allhic.go b/cmd/allhic.go index a993946..de31f8d 100644 --- a/cmd/allhic.go +++ b/cmd/allhic.go @@ -12,6 +12,8 @@ package main import ( "fmt" "os" + "path" + "sort" "strconv" "strings" "time" @@ -23,16 +25,6 @@ import ( var log = logging.MustGetLogger("main") -// var format = logging.MustStringFormatter( -// `%{color}%{time:15:04:05} | main | %{level:.6s} %{color:reset} ** %{message} **`, -// ) - -// // Backend is the default stderr output -// var Backend = logging.NewLogBackend(os.Stderr, "", 0) - -// // BackendFormatter contains the fancy debug formatter -// var BackendFormatter = logging.NewBackendFormatter(Backend, format) - // init customizes how cli layout the command interface // Logo banner (Varsity style): // http://patorjk.com/software/taag/#p=testall&f=3D-ASCII&t=ALLHIC @@ -118,7 +110,7 @@ Given a bamfile, the goal of the pruning step is to remove all inter-allelic links, then it is possible to reconstruct allele-separated assemblies. `, Action: func(c *cli.Context) error { - if len(c.Args()) < 1 { + if c.NArg() < 1 { cli.ShowSubcommandHelp(c) return cli.NewExitError("Must specify bamfile", 1) } @@ -142,7 +134,7 @@ also prepares for the latter steps of ALLHIC. `, Flags: extractFlags, Action: func(c *cli.Context) error { - if len(c.Args()) < 2 { + if c.NArg() < 2 { cli.ShowSubcommandHelp(c) return cli.NewExitError("Must specify distfile, clmfile and bamfile", 1) } @@ -168,11 +160,10 @@ also prepares for the latter steps of ALLHIC. // of the contig linkage graph. // `, // Action: func(c *cli.Context) error { - // if len(c.Args()) < 1 { + // if c.NArg() < 1 { // cli.ShowSubcommandHelp(c) // return cli.NewExitError("Must specify bamfile", 1) // } - // bamfile := c.Args().Get(0) // p := allhic.Anchorer{Bamfile: bamfile} // p.Run() @@ -193,7 +184,7 @@ a hierarchical clustering algorithm using average links. The two input files can be generated with the "extract" sub-command. `, Action: func(c *cli.Context) error { - if len(c.Args()) < 3 { + if c.NArg() < 3 { cli.ShowSubcommandHelp(c) return cli.NewExitError("Must specify distfile", 1) } @@ -223,7 +214,7 @@ on a cluster). `, Flags: optimizeFlags, Action: func(c *cli.Context) error { - if len(c.Args()) < 2 { + if c.NArg() < 2 { cli.ShowSubcommandHelp(c) return cli.NewExitError("Must specify clmfile", 1) } @@ -247,21 +238,30 @@ on a cluster). Name: "build", Usage: "Build genome release", UsageText: ` - allhic build tourfile contigs.fasta [options] + allhic build tourfile1 tourfile2 ... contigs.fasta asm.chr.fasta [options] Build function: Convert the tourfile into the standard AGP file, which is then converted into a FASTA genome release. `, Action: func(c *cli.Context) error { - if len(c.Args()) < 2 { + if c.NArg() < 3 { cli.ShowSubcommandHelp(c) return cli.NewExitError("Must specify tourfile and fastafile", 1) } - tourfile := c.Args().Get(0) - fastafile := c.Args().Get(1) - p := allhic.Builder{Tourfile: tourfile, Fastafile: fastafile} + tourfiles := []string{} + for i := 0; i < c.NArg()-2; i++ { + tourfiles = append(tourfiles, c.Args().Get(i)) + } + sort.Strings(tourfiles) + + fastafile := c.Args().Get(c.NArg() - 2) + outfastafile := c.Args().Get(c.NArg() - 1) + + p := allhic.Builder{Tourfiles: tourfiles, + Fastafile: fastafile, + OutFastafile: outfastafile} p.Run() return nil }, @@ -276,7 +276,7 @@ Anchor function: Given a bamfile, we extract matrix of link counts and plot heatmap. `, Action: func(c *cli.Context) error { - if len(c.Args()) < 2 { + if c.NArg() < 2 { cli.ShowSubcommandHelp(c) return cli.NewExitError("Must specify bamfile", 1) } @@ -301,7 +301,7 @@ Compute the posterior probability of contig orientations after scaffolding as a quality assessment step. `, Action: func(c *cli.Context) error { - if len(c.Args()) < 3 { + if c.NArg() < 3 { cli.ShowSubcommandHelp(c) return cli.NewExitError("Must specify bamfile", 1) } @@ -330,7 +330,7 @@ A convenience driver function. Chain the following steps sequentially. `, Flags: append(extractFlags, optimizeFlags...), Action: func(c *cli.Context) error { - if len(c.Args()) < 3 { + if c.NArg() < 3 { cli.ShowSubcommandHelp(c) return cli.NewExitError("Must specify distfile, clmfile and bamfile", 1) } @@ -358,6 +358,7 @@ A convenience driver function. Chain the following steps sequentially. partitioner.Run() // Optimize the k groups separately + tourfiles := []string{} for i, refile := range partitioner.OutREfiles { banner(fmt.Sprintf("Optimize group %d", i)) optimizer := allhic.Optimizer{REfile: refile, @@ -365,12 +366,17 @@ A convenience driver function. Chain the following steps sequentially. RunGA: runGA, Resume: resume, Seed: seed, NPop: npop, NGen: ngen, MutProb: mutpb} optimizer.Run() + tourfiles = append(tourfiles, optimizer.OutTourFile) } // Run the final build - // log.Noticef("***** Build started *****") - // builder := allhic.Builder{Tourfile: tourfile, Fastafile: fastafile} - // builder.Run() + banner("Build started (AGP and FASTA)") + outfastafile := path.Join(path.Dir(tourfiles[0]), + fmt.Sprintf("asm-g%d.chr.fasta", k)) + builder := allhic.Builder{Tourfiles: tourfiles, + Fastafile: fastafile, + OutFastafile: outfastafile} + builder.Run() return nil }, diff --git a/extract.go b/extract.go index 3358700..405dbd1 100644 --- a/extract.go +++ b/extract.go @@ -177,6 +177,7 @@ func writeRE(outfile string, contigs []*ContigInfo) { func (r *Extracter) readFastaAndWriteRE() { outfile := RemoveExt(r.Bamfile) + ".counts_" + r.RE + ".txt" r.OutContigsfile = outfile + mustExist(r.Fastafile) reader, _ := fastx.NewDefaultReader(r.Fastafile) seq.ValidateSeq = false // This flag makes parsing FASTA much faster @@ -346,7 +347,7 @@ func (r *Extracter) findExpectedInterContigLinks(D, L1, L2 int) []float64 { // extractContigLinks converts the BAM file to .clm and .ids func (r *Extracter) extractContigLinks() { - fh, _ := os.Open(r.Bamfile) + fh := mustOpen(r.Bamfile) prefix := RemoveExt(r.Bamfile) clmfile := prefix + ".clm" r.OutClmfile = clmfile diff --git a/optimize.go b/optimize.go index 9731088..dcc0513 100644 --- a/optimize.go +++ b/optimize.go @@ -30,6 +30,8 @@ type Optimizer struct { MutProb float64 CrossProb float64 rng *rand.Rand + // Output files + OutTourFile string } // Run kicks off the Optimizer @@ -54,6 +56,7 @@ func (r *Optimizer) Run() { // tourfile logs the intermediate configurations log.Noticef("Optimization history logged to `%s`", tourfile) fwtour, _ := os.Create(tourfile) + r.OutTourFile = tourfile defer fwtour.Close() clm.printTour(os.Stdout, clm.Tour, "INIT") @@ -94,7 +97,7 @@ func (r *CLM) OptimizeOrientations(fwtour *os.File, phase int) (string, string) // parseTourFile parses tour file // Only the last line is retained anc onverted into a Tour func parseTourFile(filename string) []string { - f, _ := os.Open(filename) + f := mustOpen(filename) log.Noticef("Parse tour file `%s`", filename) defer f.Close()