diff --git a/README.md b/README.md
index 3b4f707..f4a0a8b 100644
--- a/README.md
+++ b/README.md
@@ -76,7 +76,7 @@ allhic optimize tests/test.counts_GATC.2g2.txt tests/test.clm
Build genome release, including `.agp` and `.fasta` output.
```console
-allhic build tests/test.counts_GATC.2g1.tour seq.fasta.gz
+allhic build tests/test.counts_GATC.2g?.tour tests/seq.fasta.gz tests/asm-2g.chr.fasta
```
### Plot
@@ -89,7 +89,7 @@ allhic plot tests/test.bam tests/test.counts_GATC.2g1.tour
![allhicplot](images/allhic-plot-s.png)
-## Pipeline
+## Pipeline
Following the 4 steps of `prune`, `extract`, `partition`, `optimize`, as described above.
In summary, we have:
@@ -99,7 +99,7 @@ allhic extract tests/test.bam tests/seq.fasta.gz
allhic partition tests/test.counts_GATC.txt tests/test.pairs.txt 2
allhic optimize tests/test.counts_GATC.2g1.txt tests/test.clm
allhic optimize tests/test.counts_GATC.2g2.txt tests/test.clm
-allhic build tests/test.tour seq.fasta.gz
+allhic build tests/test.counts_GATC.2g?.txt tests/seq.fasta.gz tests/asm-2g.chr.fasta
```
Or, in a single step:
@@ -121,10 +121,9 @@ FASTA file.
- [x] Use clustering when k = 1
- [x] Isolate matrix generation to "plot"
- [x] Add "pipeline" to simplify execution
-- [ ] Make "build" to merge subgroup tours
+- [x] Make "build" to merge subgroup tours
+- [x] Provide better error messages for "file not found"
- [ ] Plot the boundary of the contigs in "plot" using genome.json
-- [ ] Provide better error messages for "file not found"
-- [ ] Merge tours from multiple partitions back to a single file
- [ ] Add dot plot to "plot"
- [ ] Compare numerical output with Lachesis
- [ ] Improve Ler0 results
diff --git a/agp.go b/agp.go
index 199a162..7dde4cb 100644
--- a/agp.go
+++ b/agp.go
@@ -12,7 +12,6 @@ package allhic
import (
"bufio"
"bytes"
- "os"
"strconv"
"strings"
@@ -86,7 +85,7 @@ func (r *AGP) Add(row string) {
// buildFasta builds target FASTA based on info from agpfile
func buildFasta(agpfile string, seqs map[string]*seq.Seq) {
agp := NewAGP(agpfile)
- file, _ := os.Open(agpfile)
+ file := mustOpen(agpfile)
log.Noticef("Parse agpfile `%s`", agpfile)
scanner := bufio.NewScanner(file)
@@ -95,7 +94,7 @@ func buildFasta(agpfile string, seqs map[string]*seq.Seq) {
}
var buf bytes.Buffer
- outFile := RemoveExt(agpfile) + ".chr.fasta"
+ outFile := RemoveExt(agpfile) + ".fasta"
outfh, _ := xopen.Wopen(outFile)
prevObject := ""
for _, line := range agp.lines {
@@ -124,6 +123,7 @@ func buildFasta(agpfile string, seqs map[string]*seq.Seq) {
// Last one
writeRecord(prevObject, buf, outfh)
buf.Reset()
+ log.Noticef("Assembly FASTA file `%s` built", outFile)
}
// writeRecord writes the FASTA record to the file
diff --git a/anchor.go b/anchor.go
index 3abe244..4d1f5f2 100644
--- a/anchor.go
+++ b/anchor.go
@@ -202,7 +202,7 @@ func (r *Anchorer) makeTrivialPaths(contigs []*Contig, flanksize int64) PathSet
// ExtractInterContigLinks extracts links from the Bamfile
func (r *Anchorer) ExtractInterContigLinks() {
- fh, _ := os.Open(r.Bamfile)
+ fh := mustOpen(r.Bamfile)
prefix := RemoveExt(r.Bamfile)
disfile := prefix + ".dis"
idsfile := prefix + ".ids"
diff --git a/assess.go b/assess.go
index 4555e04..201e9ea 100644
--- a/assess.go
+++ b/assess.go
@@ -98,11 +98,7 @@ func (r *Assesser) writePostProb(outfile string) {
// readBed parses the bedfile to extract the start and stop for all the contigs
func (r *Assesser) readBed() {
- fh, err := os.Open(r.Bedfile)
- if err != nil {
- log.Errorf("bedfile `%s` does not exist", r.Bedfile)
- os.Exit(1)
- }
+ fh := mustOpen(r.Bedfile)
log.Noticef("Parse bedfile `%s`", r.Bedfile)
reader := bufio.NewReader(fh)
@@ -142,7 +138,7 @@ func checkInRange(pos, start, end int) bool {
// extractContigLinks builds the probability distribution of link sizes
func (r *Assesser) extractContigLinks() {
- fh, _ := os.Open(r.Bamfile)
+ fh := mustOpen(r.Bamfile)
log.Noticef("Parse bamfile `%s`", r.Bamfile)
br, _ := bam.NewReader(fh, 0)
defer br.Close()
diff --git a/base.go b/base.go
index fefe8a4..611d6f8 100644
--- a/base.go
+++ b/base.go
@@ -349,7 +349,7 @@ func Percentage(a, b int) string {
func ReadCSVLines(filename string) [][]string {
log.Noticef("Parse csvfile `%s`", filename)
- fh, _ := os.Open(filename)
+ fh := mustOpen(filename)
defer fh.Close()
var data [][]string
@@ -384,3 +384,19 @@ func sortInt64s(a []int64) {
func searchInt64(a []int64, x int64) int {
return sort.Search(len(a), func(i int) bool { return a[i] >= x })
}
+
+// mustExist panics when a file is not found
+func mustExist(filename string) {
+ if _, err := os.Stat(filename); os.IsNotExist(err) {
+ log.Fatal(err)
+ }
+}
+
+// mustOpen wraps os.Open but panics if file not found
+func mustOpen(filename string) *os.File {
+ f, err := os.Open(filename)
+ if err != nil {
+ log.Fatal(err)
+ }
+ return f
+}
diff --git a/build.go b/build.go
index c1a0d2d..b02d535 100644
--- a/build.go
+++ b/build.go
@@ -22,9 +22,11 @@ import (
// Builder reconstructs the genome release AGP and FASTA files
type Builder struct {
- Tourfile string
+ Tourfiles []string
Fastafile string
- AGPfile string
+ // Output file
+ OutAGPfile string
+ OutFastafile string
}
// OOLine describes a simple contig entry in a scaffolding experiment
@@ -59,15 +61,6 @@ func (r *OO) getFastaSizes(fastafile string) {
}
}
-// readFiles initializes OO object
-func (r *Builder) readFiles() *OO {
- oo := new(OO)
- oo.getFastaSizes(r.Fastafile)
- oo.parseLastTour(r.Tourfile)
-
- return oo
-}
-
// Add instantiates a new OOLine object and add to the array in OO
func (r *OO) Add(scaffold, ctg string, ctgsize int, strand byte) {
o := OOLine{scaffold, ctg, ctgsize, strand}
@@ -76,7 +69,7 @@ func (r *OO) Add(scaffold, ctg string, ctgsize int, strand byte) {
// writeAGP converts the simplistic OOLine into AGP format
func (r *Builder) writeAGP(oo *OO) {
- filename := r.AGPfile
+ r.OutAGPfile = RemoveExt(r.OutFastafile) + ".agp"
gapSize := 100
gapType := "scaffold"
linkage := "yes"
@@ -86,7 +79,7 @@ func (r *Builder) writeAGP(oo *OO) {
objectEnd := 1
partNumber := 0
componentType := 'W'
- f, _ := os.Create(filename)
+ f, _ := os.Create(r.OutAGPfile)
defer f.Close()
w := bufio.NewWriter(f)
components := 0
@@ -120,20 +113,28 @@ func (r *Builder) writeAGP(oo *OO) {
components++
}
w.Flush()
- log.Noticef("A total of %d tigs written to `%s`", components, filename)
+ log.Noticef("A total of %d tigs written to `%s`", components, r.OutAGPfile)
}
-// Run kicks off the Builder
+// Run kicks off the Build and constructs molecule using component FASTA sequence
func (r *Builder) Run() {
- r.AGPfile = RemoveExt(r.Tourfile) + ".agp"
- r.Build(r.readFiles())
+ oo := new(OO)
+ oo.getFastaSizes(r.Fastafile)
+ // oo.parseLastTour(r.Tourfile)
+ oo.mergeTours(r.Tourfiles)
+ r.writeAGP(oo)
+ buildFasta(r.OutAGPfile, oo.seqs)
log.Notice("Success")
}
-// Build constructs molecule using component FASTA sequence
-func (r *Builder) Build(oo *OO) {
- r.writeAGP(oo)
- buildFasta(r.AGPfile, oo.seqs)
+// mergeTours merges a number of tours typically generated by partition and optimize
+// In contrast to parseLastTour which only parse one tour
+func (r *OO) mergeTours(tourfiles []string) {
+ for i, tourfile := range tourfiles {
+ seqid := fmt.Sprintf("g%d", i+1)
+ log.Noticef("Import `%s` => %s", tourfile, seqid)
+ r.parseLastTour(tourfile, seqid)
+ }
}
// parseLastTour reads tour from file
@@ -141,7 +142,7 @@ func (r *Builder) Build(oo *OO) {
// A tour file has the following format:
// > name
// contig1+ contig2- contig3?
-func (r *OO) parseLastTour(tourfile string) {
+func (r *OO) parseLastTour(tourfile string, seqid string) {
words := parseTourFile(tourfile)
var strand byte
for _, tig := range words {
@@ -151,7 +152,7 @@ func (r *OO) parseLastTour(tourfile string) {
} else {
strand = '?'
}
- r.Add("FINAL", tig, r.seqs[tig].Length(), strand)
+ r.Add(seqid, tig, r.seqs[tig].Length(), strand)
}
}
@@ -163,7 +164,7 @@ func (r *OO) parseLastTour(tourfile string) {
func (r *OO) ParseAllTours(tourfile string) {
log.Noticef("Parse tourfile `%s`", tourfile)
- file, _ := os.Open(tourfile)
+ file := mustOpen(tourfile)
scanner := bufio.NewScanner(file)
var (
name string
diff --git a/clm.go b/clm.go
index a582cf6..69ff82e 100644
--- a/clm.go
+++ b/clm.go
@@ -14,7 +14,6 @@ import (
"io"
"math"
"math/rand"
- "os"
"strconv"
"strings"
"sync"
@@ -113,7 +112,7 @@ func NewCLM(Clmfile, REfile string) *CLM {
// tig00035238 46779 recover
// tig00030900 119291
func (r *CLM) readRE() {
- file, _ := os.Open(r.REfile)
+ file := mustOpen(r.REfile)
log.Noticef("Parse REfile `%s`", r.REfile)
scanner := bufio.NewScanner(file)
idx := 0
@@ -140,7 +139,7 @@ func rr(b byte) byte {
// readClmLines parses the clmfile into a slice of CLMLine
func readClmLines(clmfile string) []CLMLine {
- file, _ := os.Open(clmfile)
+ file := mustOpen(clmfile)
log.Noticef("Parse clmfile `%s`", clmfile)
reader := bufio.NewReader(file)
diff --git a/cmd/allhic.go b/cmd/allhic.go
index a993946..de31f8d 100644
--- a/cmd/allhic.go
+++ b/cmd/allhic.go
@@ -12,6 +12,8 @@ package main
import (
"fmt"
"os"
+ "path"
+ "sort"
"strconv"
"strings"
"time"
@@ -23,16 +25,6 @@ import (
var log = logging.MustGetLogger("main")
-// var format = logging.MustStringFormatter(
-// `%{color}%{time:15:04:05} | main | %{level:.6s} %{color:reset} ** %{message} **`,
-// )
-
-// // Backend is the default stderr output
-// var Backend = logging.NewLogBackend(os.Stderr, "", 0)
-
-// // BackendFormatter contains the fancy debug formatter
-// var BackendFormatter = logging.NewBackendFormatter(Backend, format)
-
// init customizes how cli layout the command interface
// Logo banner (Varsity style):
// http://patorjk.com/software/taag/#p=testall&f=3D-ASCII&t=ALLHIC
@@ -118,7 +110,7 @@ Given a bamfile, the goal of the pruning step is to remove all inter-allelic
links, then it is possible to reconstruct allele-separated assemblies.
`,
Action: func(c *cli.Context) error {
- if len(c.Args()) < 1 {
+ if c.NArg() < 1 {
cli.ShowSubcommandHelp(c)
return cli.NewExitError("Must specify bamfile", 1)
}
@@ -142,7 +134,7 @@ also prepares for the latter steps of ALLHIC.
`,
Flags: extractFlags,
Action: func(c *cli.Context) error {
- if len(c.Args()) < 2 {
+ if c.NArg() < 2 {
cli.ShowSubcommandHelp(c)
return cli.NewExitError("Must specify distfile, clmfile and bamfile", 1)
}
@@ -168,11 +160,10 @@ also prepares for the latter steps of ALLHIC.
// of the contig linkage graph.
// `,
// Action: func(c *cli.Context) error {
- // if len(c.Args()) < 1 {
+ // if c.NArg() < 1 {
// cli.ShowSubcommandHelp(c)
// return cli.NewExitError("Must specify bamfile", 1)
// }
-
// bamfile := c.Args().Get(0)
// p := allhic.Anchorer{Bamfile: bamfile}
// p.Run()
@@ -193,7 +184,7 @@ a hierarchical clustering algorithm using average links. The two input files
can be generated with the "extract" sub-command.
`,
Action: func(c *cli.Context) error {
- if len(c.Args()) < 3 {
+ if c.NArg() < 3 {
cli.ShowSubcommandHelp(c)
return cli.NewExitError("Must specify distfile", 1)
}
@@ -223,7 +214,7 @@ on a cluster).
`,
Flags: optimizeFlags,
Action: func(c *cli.Context) error {
- if len(c.Args()) < 2 {
+ if c.NArg() < 2 {
cli.ShowSubcommandHelp(c)
return cli.NewExitError("Must specify clmfile", 1)
}
@@ -247,21 +238,30 @@ on a cluster).
Name: "build",
Usage: "Build genome release",
UsageText: `
- allhic build tourfile contigs.fasta [options]
+ allhic build tourfile1 tourfile2 ... contigs.fasta asm.chr.fasta [options]
Build function:
Convert the tourfile into the standard AGP file, which is then converted
into a FASTA genome release.
`,
Action: func(c *cli.Context) error {
- if len(c.Args()) < 2 {
+ if c.NArg() < 3 {
cli.ShowSubcommandHelp(c)
return cli.NewExitError("Must specify tourfile and fastafile", 1)
}
- tourfile := c.Args().Get(0)
- fastafile := c.Args().Get(1)
- p := allhic.Builder{Tourfile: tourfile, Fastafile: fastafile}
+ tourfiles := []string{}
+ for i := 0; i < c.NArg()-2; i++ {
+ tourfiles = append(tourfiles, c.Args().Get(i))
+ }
+ sort.Strings(tourfiles)
+
+ fastafile := c.Args().Get(c.NArg() - 2)
+ outfastafile := c.Args().Get(c.NArg() - 1)
+
+ p := allhic.Builder{Tourfiles: tourfiles,
+ Fastafile: fastafile,
+ OutFastafile: outfastafile}
p.Run()
return nil
},
@@ -276,7 +276,7 @@ Anchor function:
Given a bamfile, we extract matrix of link counts and plot heatmap.
`,
Action: func(c *cli.Context) error {
- if len(c.Args()) < 2 {
+ if c.NArg() < 2 {
cli.ShowSubcommandHelp(c)
return cli.NewExitError("Must specify bamfile", 1)
}
@@ -301,7 +301,7 @@ Compute the posterior probability of contig orientations after scaffolding
as a quality assessment step.
`,
Action: func(c *cli.Context) error {
- if len(c.Args()) < 3 {
+ if c.NArg() < 3 {
cli.ShowSubcommandHelp(c)
return cli.NewExitError("Must specify bamfile", 1)
}
@@ -330,7 +330,7 @@ A convenience driver function. Chain the following steps sequentially.
`,
Flags: append(extractFlags, optimizeFlags...),
Action: func(c *cli.Context) error {
- if len(c.Args()) < 3 {
+ if c.NArg() < 3 {
cli.ShowSubcommandHelp(c)
return cli.NewExitError("Must specify distfile, clmfile and bamfile", 1)
}
@@ -358,6 +358,7 @@ A convenience driver function. Chain the following steps sequentially.
partitioner.Run()
// Optimize the k groups separately
+ tourfiles := []string{}
for i, refile := range partitioner.OutREfiles {
banner(fmt.Sprintf("Optimize group %d", i))
optimizer := allhic.Optimizer{REfile: refile,
@@ -365,12 +366,17 @@ A convenience driver function. Chain the following steps sequentially.
RunGA: runGA, Resume: resume,
Seed: seed, NPop: npop, NGen: ngen, MutProb: mutpb}
optimizer.Run()
+ tourfiles = append(tourfiles, optimizer.OutTourFile)
}
// Run the final build
- // log.Noticef("***** Build started *****")
- // builder := allhic.Builder{Tourfile: tourfile, Fastafile: fastafile}
- // builder.Run()
+ banner("Build started (AGP and FASTA)")
+ outfastafile := path.Join(path.Dir(tourfiles[0]),
+ fmt.Sprintf("asm-g%d.chr.fasta", k))
+ builder := allhic.Builder{Tourfiles: tourfiles,
+ Fastafile: fastafile,
+ OutFastafile: outfastafile}
+ builder.Run()
return nil
},
diff --git a/extract.go b/extract.go
index 3358700..405dbd1 100644
--- a/extract.go
+++ b/extract.go
@@ -177,6 +177,7 @@ func writeRE(outfile string, contigs []*ContigInfo) {
func (r *Extracter) readFastaAndWriteRE() {
outfile := RemoveExt(r.Bamfile) + ".counts_" + r.RE + ".txt"
r.OutContigsfile = outfile
+ mustExist(r.Fastafile)
reader, _ := fastx.NewDefaultReader(r.Fastafile)
seq.ValidateSeq = false // This flag makes parsing FASTA much faster
@@ -346,7 +347,7 @@ func (r *Extracter) findExpectedInterContigLinks(D, L1, L2 int) []float64 {
// extractContigLinks converts the BAM file to .clm and .ids
func (r *Extracter) extractContigLinks() {
- fh, _ := os.Open(r.Bamfile)
+ fh := mustOpen(r.Bamfile)
prefix := RemoveExt(r.Bamfile)
clmfile := prefix + ".clm"
r.OutClmfile = clmfile
diff --git a/optimize.go b/optimize.go
index 9731088..dcc0513 100644
--- a/optimize.go
+++ b/optimize.go
@@ -30,6 +30,8 @@ type Optimizer struct {
MutProb float64
CrossProb float64
rng *rand.Rand
+ // Output files
+ OutTourFile string
}
// Run kicks off the Optimizer
@@ -54,6 +56,7 @@ func (r *Optimizer) Run() {
// tourfile logs the intermediate configurations
log.Noticef("Optimization history logged to `%s`", tourfile)
fwtour, _ := os.Create(tourfile)
+ r.OutTourFile = tourfile
defer fwtour.Close()
clm.printTour(os.Stdout, clm.Tour, "INIT")
@@ -94,7 +97,7 @@ func (r *CLM) OptimizeOrientations(fwtour *os.File, phase int) (string, string)
// parseTourFile parses tour file
// Only the last line is retained anc onverted into a Tour
func parseTourFile(filename string) []string {
- f, _ := os.Open(filename)
+ f := mustOpen(filename)
log.Noticef("Parse tour file `%s`", filename)
defer f.Close()