diff --git a/README.md b/README.md
index 3ecb6b9..ca93176 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ The advantages of this implementation:
* no dependencies (no need curl, wget, awk, etc.)
* cross-platform (support for Windows, Mac OS, etc.)
- * regexp for parsing TOC
+ * `golang.org/x/net/html` for parsing TOC
* parallel processing of multiple documents
diff --git a/cmd/gh-md-toc/main.go b/cmd/gh-md-toc/main.go
index f09b9ba..25ed41f 100644
--- a/cmd/gh-md-toc/main.go
+++ b/cmd/gh-md-toc/main.go
@@ -48,10 +48,11 @@ func main() {
for _, p := range *paths {
ghdoc := ghtoc.NewGHDoc(p, absPathsInToc, *startDepth, *depth, !*noEscape, *token, *indent, *debug)
+ getFn := func(ch chan *ghtoc.GHToc, ghdoc *ghtoc.GHDoc) { ch <- ghdoc.GetToc() }
if *serial {
- ch <- ghdoc.GetToc()
+ getFn(ch, ghdoc)
} else {
- go func(path string) { ch <- ghdoc.GetToc() }(p)
+ go getFn(ch, ghdoc)
}
}
diff --git a/ghdoc.go b/ghdoc.go
index a7ed8e0..8179c66 100644
--- a/ghdoc.go
+++ b/ghdoc.go
@@ -6,7 +6,6 @@ import (
"log"
"net/url"
"os"
- "regexp"
"strconv"
"strings"
)
@@ -141,69 +140,62 @@ func (doc *GHDoc) GrabToc() *GHToc {
doc.d("GrabToc: start, html size: " + strconv.Itoa(len(doc.html)))
defer doc.d("GrabToc: done.")
- re := `(?si)
Blabla...
foo3
is even betterer
+The command foo3
is even bettererBlabla...
@@ -227,7 +226,7 @@ func TestGrabTocStartDepth(t *testing.T) {Blabla...
bar3
is even betterer
+The command bar3
is even bettererBlabla...
`, AbsPaths: false, @@ -236,7 +235,6 @@ func TestGrabTocStartDepth(t *testing.T) { Indent: 2, } toc := *doc.GrabToc() - for i := 0; i <= len(tocExpected)-1; i++ { if toc[i] != tocExpected[i] { t.Error("Res :", toc[i], "\nExpected :", tocExpected[i]) diff --git a/go.mod b/go.mod index b344236..67a976f 100644 --- a/go.mod +++ b/go.mod @@ -1,8 +1,12 @@ module github.com/ekalinin/github-markdown-toc.go -go 1.17 +go 1.19 -require gopkg.in/alecthomas/kingpin.v2 v2.2.4 +require ( + github.com/stretchr/testify v1.7.0 + golang.org/x/net v0.1.0 + gopkg.in/alecthomas/kingpin.v2 v2.2.4 +) require ( github.com/alecthomas/assert v0.0.0-20170929043011-405dbfeb8e38 // indirect @@ -11,7 +15,10 @@ require ( github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc // indirect github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf // indirect github.com/davecgh/go-spew v1.1.1 // indirect + github.com/kr/pretty v0.1.0 // indirect github.com/mattn/go-isatty v0.0.14 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect github.com/sergi/go-diff v1.2.0 // indirect - github.com/stretchr/testify v1.7.0 // indirect + gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect + gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c // indirect ) diff --git a/go.sum b/go.sum index 93e1e05..3d2a333 100644 --- a/go.sum +++ b/go.sum @@ -11,8 +11,10 @@ github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRF github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/mattn/go-isatty v0.0.14 h1:yVuAays6BHfxijgZPzw+3Zlu5yQgKGP2/hcQbHb7S9Y= github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94= @@ -24,11 +26,14 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+ github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c h1:F1jZWGFhYfh0Ci55sIpILtKKK8p3i2/krTr0H1rg74I= +golang.org/x/net v0.1.0 h1:hZ/3BUoy5aId7sCpA/Tc5lt8DkFgdVS2onTpJsZ/fl0= +golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.1.0 h1:kunALQeHf1/185U1i0GOB/fy1IPRDDpuoOOqRReG57U= gopkg.in/alecthomas/kingpin.v2 v2.2.4 h1:CC8tJ/xljioKrK6ii3IeWVXU4Tw7VB+LbjZBJaBxN50= gopkg.in/alecthomas/kingpin.v2 v2.2.4/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/headerfinder.go b/headerfinder.go new file mode 100644 index 0000000..32c5bc8 --- /dev/null +++ b/headerfinder.go @@ -0,0 +1,122 @@ +package ghtoc + +import ( + "io" + "strings" + + "golang.org/x/net/html" + "golang.org/x/net/html/atom" +) + +// HxDepth represents the header depth with H1 being 0. +type HxDepth int + +// InvalidDepth designates that the data atom is not a valid Hx. +const InvalidDepth HxDepth = -1 + +// MaxHxDepth is the maximum HxDepth value. +// H6 is the last Hx tag (5 = 6 - 1) +const MaxHxDepth HxDepth = 5 + +// Header represents an HTML header +type Header struct { + Depth HxDepth + Href string + Name string +} + +func findHeadersInString(str string) []Header { + r := strings.NewReader(str) + return findHeaders(r) +} + +func findHeaders(r io.Reader) []Header { + hdrs := make([]Header, 0) + tokenizer := html.NewTokenizer(r) + for { + tt := tokenizer.Next() + switch tt { + case html.ErrorToken: + return hdrs + case html.StartTagToken: + t := tokenizer.Token() + if hdr, ok := createHeader(tokenizer, t); ok { + hdrs = append(hdrs, hdr) + } + } + } +} + +func getHxDepth(dataAtom atom.Atom) HxDepth { + hxAtoms := []atom.Atom{ + atom.H1, + atom.H2, + atom.H3, + atom.H4, + atom.H5, + atom.H6, + } + for depth, hxAtom := range hxAtoms { + if dataAtom == hxAtom { + return HxDepth(depth) + } + } + return InvalidDepth +} + +func createHeader(tokenizer *html.Tokenizer, token html.Token) (Header, bool) { + hxDepth := getHxDepth(token.DataAtom) + if hxDepth == InvalidDepth { + return Header{}, false + } + + var href string + var nameParts []string + // Start at 1 because we are inside the Hx tag + tokenDepth := 1 + afterAnchor := false + for { + tokenizer.Next() + t := tokenizer.Token() + switch t.Type { + case html.ErrorToken: + return Header{}, false + case html.StartTagToken: + tokenDepth++ + if t.DataAtom == atom.A { + if hrefAttr, ok := findAttribute(t.Attr, "", "href"); ok { + href = hrefAttr.Val + } else { + // Expected to find href attribute + return Header{}, false + } + } + case html.EndTagToken: + switch t.DataAtom { + case token.DataAtom: + // If we encountered the matching end tag for the Hx, then we are done + return Header{ + Depth: hxDepth, + Name: removeStuff(strings.Join(nameParts, " ")), + Href: href, + }, true + case atom.A: + afterAnchor = true + } + tokenDepth-- + case html.TextToken: + if afterAnchor { + nameParts = append(nameParts, removeStuff(t.Data)) + } + } + } +} + +func findAttribute(attrs []html.Attribute, namespace, key string) (html.Attribute, bool) { + for _, attr := range attrs { + if attr.Namespace == namespace && attr.Key == key { + return attr, true + } + } + return html.Attribute{}, false +} diff --git a/headerfinder_test.go b/headerfinder_test.go new file mode 100644 index 0000000..c8ada93 --- /dev/null +++ b/headerfinder_test.go @@ -0,0 +1,111 @@ +package ghtoc + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "golang.org/x/net/html" + "golang.org/x/net/html/atom" +) + +const singleH1 = ` +