diff --git a/README.md b/README.md index 3ecb6b9..ca93176 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ The advantages of this implementation: * no dependencies (no need curl, wget, awk, etc.) * cross-platform (support for Windows, Mac OS, etc.) - * regexp for parsing TOC + * `golang.org/x/net/html` for parsing TOC * parallel processing of multiple documents diff --git a/cmd/gh-md-toc/main.go b/cmd/gh-md-toc/main.go index f09b9ba..25ed41f 100644 --- a/cmd/gh-md-toc/main.go +++ b/cmd/gh-md-toc/main.go @@ -48,10 +48,11 @@ func main() { for _, p := range *paths { ghdoc := ghtoc.NewGHDoc(p, absPathsInToc, *startDepth, *depth, !*noEscape, *token, *indent, *debug) + getFn := func(ch chan *ghtoc.GHToc, ghdoc *ghtoc.GHDoc) { ch <- ghdoc.GetToc() } if *serial { - ch <- ghdoc.GetToc() + getFn(ch, ghdoc) } else { - go func(path string) { ch <- ghdoc.GetToc() }(p) + go getFn(ch, ghdoc) } } diff --git a/ghdoc.go b/ghdoc.go index a7ed8e0..8179c66 100644 --- a/ghdoc.go +++ b/ghdoc.go @@ -6,7 +6,6 @@ import ( "log" "net/url" "os" - "regexp" "strconv" "strings" ) @@ -141,69 +140,62 @@ func (doc *GHDoc) GrabToc() *GHToc { doc.d("GrabToc: start, html size: " + strconv.Itoa(len(doc.html))) defer doc.d("GrabToc: done.") - re := `(?si)[1-6])>\s*` + - `]*>\s*` + - `.*?(?P.*?) 0 { + maxDepth = doc.Depth - 1 + } else { + maxDepth = int(MaxHxDepth) } - var tmpSection string - doc.d("GrabToc: processing groups ...") - doc.d("Including starting from level " + strconv.Itoa(doc.StartDepth)) - for _, group := range groups { - // format result - n, _ := strconv.Atoi(group["num"]) - if n <= doc.StartDepth { - continue - } - if doc.Depth > 0 && n > doc.Depth { - continue - } + hdrs := findHeadersInString(doc.html) - link, _ := url.QueryUnescape(group["href"]) - if doc.AbsPaths { - link = doc.Path + link + // Determine the min depth represented by the slice of headers. For example, if a document only + // has H2 tags and no H1 tags. We want the H2 TOC entries to not have an indent. + minHxDepth := MaxHxDepth + for _, hdr := range hdrs { + if hdr.Depth < minHxDepth { + minHxDepth = hdr.Depth } + } - tmpSection = removeStuff(group["name"]) - if doc.Escape { - tmpSection = EscapeSpecChars(tmpSection) + // Populate the toc with entries + toc := GHToc{} + for _, hdr := range hdrs { + hDepth := int(hdr.Depth) + if hDepth >= minDepth && hDepth <= maxDepth { + indentDepth := int(hdr.Depth) - int(minHxDepth) - doc.StartDepth + indent := strings.Repeat(listIndentation(), indentDepth) + toc = append(toc, doc.tocEntry(indent, hdr)) } - tocItem := strings.Repeat(listIndentation(), n-minHeaderNum-doc.StartDepth) + "* " + - "[" + tmpSection + "]" + - "(" + link + ")" - //fmt.Println(tocItem) - toc = append(toc, tocItem) } return &toc } +func (doc *GHDoc) tocEntry(indent string, hdr Header) string { + return indent + "* " + + "[" + doc.tocName(hdr.Name) + "]" + + "(" + doc.tocLink(hdr.Href) + ")" +} + +func (doc *GHDoc) tocName(name string) string { + if doc.Escape { + return EscapeSpecChars(name) + } + return name +} + +func (doc *GHDoc) tocLink(href string) string { + link, _ := url.QueryUnescape(href) + if doc.AbsPaths { + link = doc.Path + link + } + return link +} + // GetToc return GHToc for a document func (doc *GHDoc) GetToc() *GHToc { if err := doc.Convert2HTML(); err != nil { diff --git a/ghdoc_test.go b/ghdoc_test.go index 4b0d3dd..1913695 100644 --- a/ghdoc_test.go +++ b/ghdoc_test.go @@ -181,7 +181,6 @@ func TestGrabTocDepth(t *testing.T) { Indent: 2, } toc := *doc.GrabToc() - for i := 0; i <= len(tocExpected)-1; i++ { if toc[i] != tocExpected[i] { t.Error("Res :", toc[i], "\nExpected :", tocExpected[i]) @@ -211,7 +210,7 @@ func TestGrabTocStartDepth(t *testing.T) {

Blabla...

-The command foo3 is even betterer

+The command foo3 is even betterer

Blabla...

@@ -227,7 +226,7 @@ func TestGrabTocStartDepth(t *testing.T) {

Blabla...

-The command bar3 is even betterer

+The command bar3 is even betterer

Blabla...

`, AbsPaths: false, @@ -236,7 +235,6 @@ func TestGrabTocStartDepth(t *testing.T) { Indent: 2, } toc := *doc.GrabToc() - for i := 0; i <= len(tocExpected)-1; i++ { if toc[i] != tocExpected[i] { t.Error("Res :", toc[i], "\nExpected :", tocExpected[i]) diff --git a/go.mod b/go.mod index b344236..67a976f 100644 --- a/go.mod +++ b/go.mod @@ -1,8 +1,12 @@ module github.com/ekalinin/github-markdown-toc.go -go 1.17 +go 1.19 -require gopkg.in/alecthomas/kingpin.v2 v2.2.4 +require ( + github.com/stretchr/testify v1.7.0 + golang.org/x/net v0.1.0 + gopkg.in/alecthomas/kingpin.v2 v2.2.4 +) require ( github.com/alecthomas/assert v0.0.0-20170929043011-405dbfeb8e38 // indirect @@ -11,7 +15,10 @@ require ( github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc // indirect github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf // indirect github.com/davecgh/go-spew v1.1.1 // indirect + github.com/kr/pretty v0.1.0 // indirect github.com/mattn/go-isatty v0.0.14 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect github.com/sergi/go-diff v1.2.0 // indirect - github.com/stretchr/testify v1.7.0 // indirect + gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect + gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c // indirect ) diff --git a/go.sum b/go.sum index 93e1e05..3d2a333 100644 --- a/go.sum +++ b/go.sum @@ -11,8 +11,10 @@ github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRF github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/mattn/go-isatty v0.0.14 h1:yVuAays6BHfxijgZPzw+3Zlu5yQgKGP2/hcQbHb7S9Y= github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94= @@ -24,11 +26,14 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+ github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c h1:F1jZWGFhYfh0Ci55sIpILtKKK8p3i2/krTr0H1rg74I= +golang.org/x/net v0.1.0 h1:hZ/3BUoy5aId7sCpA/Tc5lt8DkFgdVS2onTpJsZ/fl0= +golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.1.0 h1:kunALQeHf1/185U1i0GOB/fy1IPRDDpuoOOqRReG57U= gopkg.in/alecthomas/kingpin.v2 v2.2.4 h1:CC8tJ/xljioKrK6ii3IeWVXU4Tw7VB+LbjZBJaBxN50= gopkg.in/alecthomas/kingpin.v2 v2.2.4/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/headerfinder.go b/headerfinder.go new file mode 100644 index 0000000..32c5bc8 --- /dev/null +++ b/headerfinder.go @@ -0,0 +1,122 @@ +package ghtoc + +import ( + "io" + "strings" + + "golang.org/x/net/html" + "golang.org/x/net/html/atom" +) + +// HxDepth represents the header depth with H1 being 0. +type HxDepth int + +// InvalidDepth designates that the data atom is not a valid Hx. +const InvalidDepth HxDepth = -1 + +// MaxHxDepth is the maximum HxDepth value. +// H6 is the last Hx tag (5 = 6 - 1) +const MaxHxDepth HxDepth = 5 + +// Header represents an HTML header +type Header struct { + Depth HxDepth + Href string + Name string +} + +func findHeadersInString(str string) []Header { + r := strings.NewReader(str) + return findHeaders(r) +} + +func findHeaders(r io.Reader) []Header { + hdrs := make([]Header, 0) + tokenizer := html.NewTokenizer(r) + for { + tt := tokenizer.Next() + switch tt { + case html.ErrorToken: + return hdrs + case html.StartTagToken: + t := tokenizer.Token() + if hdr, ok := createHeader(tokenizer, t); ok { + hdrs = append(hdrs, hdr) + } + } + } +} + +func getHxDepth(dataAtom atom.Atom) HxDepth { + hxAtoms := []atom.Atom{ + atom.H1, + atom.H2, + atom.H3, + atom.H4, + atom.H5, + atom.H6, + } + for depth, hxAtom := range hxAtoms { + if dataAtom == hxAtom { + return HxDepth(depth) + } + } + return InvalidDepth +} + +func createHeader(tokenizer *html.Tokenizer, token html.Token) (Header, bool) { + hxDepth := getHxDepth(token.DataAtom) + if hxDepth == InvalidDepth { + return Header{}, false + } + + var href string + var nameParts []string + // Start at 1 because we are inside the Hx tag + tokenDepth := 1 + afterAnchor := false + for { + tokenizer.Next() + t := tokenizer.Token() + switch t.Type { + case html.ErrorToken: + return Header{}, false + case html.StartTagToken: + tokenDepth++ + if t.DataAtom == atom.A { + if hrefAttr, ok := findAttribute(t.Attr, "", "href"); ok { + href = hrefAttr.Val + } else { + // Expected to find href attribute + return Header{}, false + } + } + case html.EndTagToken: + switch t.DataAtom { + case token.DataAtom: + // If we encountered the matching end tag for the Hx, then we are done + return Header{ + Depth: hxDepth, + Name: removeStuff(strings.Join(nameParts, " ")), + Href: href, + }, true + case atom.A: + afterAnchor = true + } + tokenDepth-- + case html.TextToken: + if afterAnchor { + nameParts = append(nameParts, removeStuff(t.Data)) + } + } + } +} + +func findAttribute(attrs []html.Attribute, namespace, key string) (html.Attribute, bool) { + for _, attr := range attrs { + if attr.Namespace == namespace && attr.Key == key { + return attr, true + } + } + return html.Attribute{}, false +} diff --git a/headerfinder_test.go b/headerfinder_test.go new file mode 100644 index 0000000..c8ada93 --- /dev/null +++ b/headerfinder_test.go @@ -0,0 +1,111 @@ +package ghtoc + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "golang.org/x/net/html" + "golang.org/x/net/html/atom" +) + +const singleH1 = ` +

Document Title

+` + +const singleH2 = ` +

+ + Interesting Section +

+` + +const multipleSections = ` +

Document Title

+Hi +

First Section

+Some Text +

First Subsection

+

Second Section

+

Second Subsection

+` + +func TestFindHeaders(t *testing.T) { + t.Run("single H1", func(t *testing.T) { + results := findHeadersInString(singleH1) + assert.Len(t, results, 1) + assert.Equal( + t, + Header{Depth: 0, Href: "#document-title", Name: "Document Title"}, + results[0], + ) + }) + t.Run("single H2", func(t *testing.T) { + results := findHeadersInString(singleH2) + assert.Len(t, results, 1) + assert.Equal( + t, + Header{Depth: 1, Href: "#interesting-section", Name: "Interesting Section"}, + results[0], + ) + }) + t.Run("multiple sections", func(t *testing.T) { + results := findHeadersInString(multipleSections) + assert.Len(t, results, 5) + assert.Equal( + t, + Header{Depth: 0, Href: "#document-title", Name: "Document Title"}, + results[0], + ) + assert.Equal( + t, + Header{Depth: 1, Href: "#first-section", Name: "First Section"}, + results[1], + ) + assert.Equal( + t, + Header{Depth: 2, Href: "#first-subsection", Name: "First Subsection"}, + results[2], + ) + assert.Equal( + t, + Header{Depth: 1, Href: "#second-section", Name: "Second Section"}, + results[3], + ) + assert.Equal( + t, + Header{Depth: 3, Href: "#second-subsection", Name: "Second Subsection"}, + results[4], + ) + }) +} + +func TestFindAttribute(t *testing.T) { + worldGreeting := html.Attribute{Namespace: "", Key: "greeting", Val: "Hello, World!"} + spaceGreeting := html.Attribute{Namespace: "outer-space", Key: "greeting", Val: "Hello, Space!"} + attrs := []html.Attribute{spaceGreeting, worldGreeting} + t.Run("attribute exists", func(t *testing.T) { + attr, ok := findAttribute(attrs, "", "greeting") + assert.True(t, ok) + assert.Equal(t, worldGreeting, attr) + + attr, ok = findAttribute(attrs, "outer-space", "greeting") + assert.True(t, ok) + assert.Equal(t, spaceGreeting, attr) + }) + t.Run("attribute does not exist", func(t *testing.T) { + _, ok := findAttribute(attrs, "", "doesnotexist") + assert.False(t, ok) + }) +} + +func TestGetHxDepth(t *testing.T) { + assert.Equal(t, HxDepth(0), getHxDepth(atom.H1)) + assert.Equal(t, HxDepth(1), getHxDepth(atom.H2)) + assert.Equal(t, HxDepth(2), getHxDepth(atom.H3)) + assert.Equal(t, HxDepth(3), getHxDepth(atom.H4)) + assert.Equal(t, HxDepth(4), getHxDepth(atom.H5)) + assert.Equal(t, HxDepth(5), getHxDepth(atom.H6)) + assert.Equal(t, InvalidDepth, getHxDepth(atom.A)) +}