Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: use golang.org/x/net/html to parse HTML instead of regular expressions #38

Open
wants to merge 18 commits into
base: master
Choose a base branch
from
Open
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ The advantages of this implementation:

* no dependencies (no need for curl, wget, awk, etc.)
* cross-platform (support for Windows, Mac OS, etc.)
* regexp for parsing TOC
* `golang.org/x/net/html` for parsing TOC
* parallel processing of multiple documents


Expand Down
5 changes: 3 additions & 2 deletions cmd/gh-md-toc/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,11 @@ func main() {

for _, p := range *paths {
ghdoc := ghtoc.NewGHDoc(p, absPathsInToc, *startDepth, *depth, !*noEscape, *token, *indent, *debug)
getFn := func(ch chan *ghtoc.GHToc, ghdoc *ghtoc.GHDoc) { ch <- ghdoc.GetToc() }
if *serial {
ch <- ghdoc.GetToc()
getFn(ch, ghdoc)
} else {
go func(path string) { ch <- ghdoc.GetToc() }(p)
go getFn(ch, ghdoc)
}
}

Expand Down
94 changes: 43 additions & 51 deletions ghdoc.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import (
"log"
"net/url"
"os"
"regexp"
"strconv"
"strings"
)
Expand Down Expand Up @@ -141,69 +140,62 @@ func (doc *GHDoc) GrabToc() *GHToc {
doc.d("GrabToc: start, html size: " + strconv.Itoa(len(doc.html)))
defer doc.d("GrabToc: done.")

re := `(?si)<h(?P<num>[1-6])>\s*` +
`<a\s*id="user-content-[^"]*"\s*class="anchor"\s*` +
`(aria-hidden="[^"]*"\s*)?` +
`href="(?P<href>[^"]*)"[^>]*>\s*` +
`.*?</a>(?P<name>.*?)</h`
r := regexp.MustCompile(re)
listIndentation := generateListIndentation(doc.Indent)

toc := GHToc{}
minHeaderNum := 6
var groups []map[string]string
doc.d("GrabToc: matching ...")
for idx, match := range r.FindAllStringSubmatch(doc.html, -1) {
doc.d("GrabToc: match #" + strconv.Itoa(idx) + " ...")
group := make(map[string]string)
// fill map for groups
for i, name := range r.SubexpNames() {
if i == 0 || name == "" {
continue
}
doc.d("GrabToc: process group: " + name + ": " + match[i] + " ...")
group[name] = removeStuff(match[i])
}
// update minimum header number
n, _ := strconv.Atoi(group["num"])
if n < minHeaderNum {
minHeaderNum = n
}
groups = append(groups, group)
minDepth := doc.StartDepth
var maxDepth int
if doc.Depth > 0 {
maxDepth = doc.Depth - 1
} else {
maxDepth = int(MaxHxDepth)
}

var tmpSection string
doc.d("GrabToc: processing groups ...")
doc.d("Including starting from level " + strconv.Itoa(doc.StartDepth))
for _, group := range groups {
// format result
n, _ := strconv.Atoi(group["num"])
if n <= doc.StartDepth {
continue
}
if doc.Depth > 0 && n > doc.Depth {
continue
}
hdrs := findHeadersInString(doc.html)

link, _ := url.QueryUnescape(group["href"])
if doc.AbsPaths {
link = doc.Path + link
// Determine the minimum depth represented by the slice of headers. For example, if a document
// only has H2 tags and no H1 tags, we want the H2 TOC entries to have no indent.
minHxDepth := MaxHxDepth
for _, hdr := range hdrs {
if hdr.Depth < minHxDepth {
minHxDepth = hdr.Depth
}
}

tmpSection = removeStuff(group["name"])
if doc.Escape {
tmpSection = EscapeSpecChars(tmpSection)
// Populate the toc with entries
toc := GHToc{}
for _, hdr := range hdrs {
hDepth := int(hdr.Depth)
if hDepth >= minDepth && hDepth <= maxDepth {
indentDepth := int(hdr.Depth) - int(minHxDepth) - doc.StartDepth
indent := strings.Repeat(listIndentation(), indentDepth)
toc = append(toc, doc.tocEntry(indent, hdr))
}
tocItem := strings.Repeat(listIndentation(), n-minHeaderNum-doc.StartDepth) + "* " +
"[" + tmpSection + "]" +
"(" + link + ")"
//fmt.Println(tocItem)
toc = append(toc, tocItem)
}

return &toc
}

// tocEntry renders a single markdown list item for hdr, prefixed by indent.
// The visible label comes from tocName and the link target from tocLink.
func (doc *GHDoc) tocEntry(indent string, hdr Header) string {
	label := doc.tocName(hdr.Name)
	target := doc.tocLink(hdr.Href)
	pieces := []string{indent, "* ", "[", label, "]", "(", target, ")"}
	return strings.Join(pieces, "")
}

// tocName returns the label to show for a header in the TOC. The name is
// passed through EscapeSpecChars when doc.Escape is set and returned
// unchanged otherwise.
func (doc *GHDoc) tocName(name string) string {
	if !doc.Escape {
		return name
	}
	return EscapeSpecChars(name)
}

// tocLink returns the link target for a TOC entry built from href.
//
// The href is percent-decoded with url.QueryUnescape. If it contains a
// malformed escape sequence the raw href is used as-is, so the entry still
// points somewhere instead of ending up with an empty link. When doc.AbsPaths
// is set, doc.Path is prepended to the result.
func (doc *GHDoc) tocLink(href string) string {
	link, err := url.QueryUnescape(href)
	if err != nil {
		// QueryUnescape yields "" on failure; the undecoded href is the best
		// remaining candidate for a working link.
		link = href
	}
	if doc.AbsPaths {
		link = doc.Path + link
	}
	return link
}

// GetToc returns the GHToc for a document
func (doc *GHDoc) GetToc() *GHToc {
if err := doc.Convert2HTML(); err != nil {
Expand Down
6 changes: 2 additions & 4 deletions ghdoc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,6 @@ func TestGrabTocDepth(t *testing.T) {
Indent: 2,
}
toc := *doc.GrabToc()

for i := 0; i <= len(tocExpected)-1; i++ {
if toc[i] != tocExpected[i] {
t.Error("Res :", toc[i], "\nExpected :", tocExpected[i])
Expand Down Expand Up @@ -211,7 +210,7 @@ func TestGrabTocStartDepth(t *testing.T) {
<p>Blabla...</p>

<h3>
<a id="user-content-the-command-foo3-is-even-betterer" class="anchor" href="#the-command-foo3-is-even-betterer" aria-hidden="true"><span class="octicon octicon-link"></span></a>The command <code>foo3</code> is even betterer</h2>
<a id="user-content-the-command-foo3-is-even-betterer" class="anchor" href="#the-command-foo3-is-even-betterer" aria-hidden="true"><span class="octicon octicon-link"></span></a>The command <code>foo3</code> is even betterer</h3>

<p>Blabla...</p>

Expand All @@ -227,7 +226,7 @@ func TestGrabTocStartDepth(t *testing.T) {
<p>Blabla...</p>

<h3>
<a id="user-content-the-command-bar3-is-even-betterer" class="anchor" href="#the-command-bar3-is-even-betterer" aria-hidden="true"><span class="octicon octicon-link"></span></a>The command <code>bar3</code> is even betterer</h2>
<a id="user-content-the-command-bar3-is-even-betterer" class="anchor" href="#the-command-bar3-is-even-betterer" aria-hidden="true"><span class="octicon octicon-link"></span></a>The command <code>bar3</code> is even betterer</h3>

<p>Blabla...</p>
`, AbsPaths: false,
Expand All @@ -236,7 +235,6 @@ func TestGrabTocStartDepth(t *testing.T) {
Indent: 2,
}
toc := *doc.GrabToc()

for i := 0; i <= len(tocExpected)-1; i++ {
if toc[i] != tocExpected[i] {
t.Error("Res :", toc[i], "\nExpected :", tocExpected[i])
Expand Down
13 changes: 10 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
module github.com/ekalinin/github-markdown-toc.go

go 1.17
go 1.19

require gopkg.in/alecthomas/kingpin.v2 v2.2.4
require (
github.com/stretchr/testify v1.7.0
golang.org/x/net v0.1.0
gopkg.in/alecthomas/kingpin.v2 v2.2.4
)

require (
github.com/alecthomas/assert v0.0.0-20170929043011-405dbfeb8e38 // indirect
Expand All @@ -11,7 +15,10 @@ require (
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc // indirect
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/kr/pretty v0.1.0 // indirect
github.com/mattn/go-isatty v0.0.14 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/sergi/go-diff v1.2.0 // indirect
github.com/stretchr/testify v1.7.0 // indirect
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c // indirect
)
7 changes: 6 additions & 1 deletion go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@ github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRF
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/mattn/go-isatty v0.0.14 h1:yVuAays6BHfxijgZPzw+3Zlu5yQgKGP2/hcQbHb7S9Y=
github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94=
Expand All @@ -24,11 +26,14 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c h1:F1jZWGFhYfh0Ci55sIpILtKKK8p3i2/krTr0H1rg74I=
golang.org/x/net v0.1.0 h1:hZ/3BUoy5aId7sCpA/Tc5lt8DkFgdVS2onTpJsZ/fl0=
golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco=
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.1.0 h1:kunALQeHf1/185U1i0GOB/fy1IPRDDpuoOOqRReG57U=
gopkg.in/alecthomas/kingpin.v2 v2.2.4 h1:CC8tJ/xljioKrK6ii3IeWVXU4Tw7VB+LbjZBJaBxN50=
gopkg.in/alecthomas/kingpin.v2 v2.2.4/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
Expand Down
122 changes: 122 additions & 0 deletions headerfinder.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
package ghtoc

import (
"io"
"strings"

"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)

// HxDepth is the zero-based depth of an HTML header tag: H1 is 0, H2 is 1,
// and so on.
type HxDepth int

const (
	// InvalidDepth marks a data atom that is not one of the Hx header tags.
	InvalidDepth HxDepth = -1

	// MaxHxDepth is the largest valid HxDepth. It corresponds to H6, the
	// last Hx tag (5 = 6 - 1).
	MaxHxDepth HxDepth = 5
)

// Header represents an HTML header (an H1-H6 element) found in a document.
type Header struct {
	// Depth is the zero-based level of the header tag (H1 is 0).
	Depth HxDepth
	// Href is the value of the href attribute of the anchor inside the header.
	Href string
	// Name is the header text collected after the anchor, cleaned with removeStuff.
	Name string
}

// findHeadersInString is a convenience wrapper around findHeaders for HTML
// that is already held in memory as a string.
func findHeadersInString(str string) []Header {
	return findHeaders(strings.NewReader(str))
}

// findHeaders tokenizes the HTML read from r and returns every header that
// createHeader accepts, in document order. Tokenizing stops at the first
// error token, which includes the end of input. The result is never nil.
func findHeaders(r io.Reader) []Header {
	found := []Header{}
	z := html.NewTokenizer(r)
	for tt := z.Next(); tt != html.ErrorToken; tt = z.Next() {
		if tt != html.StartTagToken {
			continue
		}
		// createHeader consumes the tokens up to the header's end tag, so the
		// next z.Next() resumes after it.
		if h, ok := createHeader(z, z.Token()); ok {
			found = append(found, h)
		}
	}
	return found
}

// getHxDepth maps a tag atom to its zero-based header depth: atom.H1 gives 0
// through atom.H6 giving 5. Any other atom yields InvalidDepth.
func getHxDepth(dataAtom atom.Atom) HxDepth {
	switch dataAtom {
	case atom.H1:
		return 0
	case atom.H2:
		return 1
	case atom.H3:
		return 2
	case atom.H4:
		return 3
	case atom.H5:
		return 4
	case atom.H6:
		return 5
	}
	return InvalidDepth
}

// createHeader consumes the tokens that follow the opening Hx tag given in
// token and builds the Header that tag describes.
//
// Href is taken from the first <a> element inside the header. Links that
// appear later in the header text add their text to Name but never replace
// Href. Name is assembled from the text tokens seen after the first </a>,
// each cleaned with removeStuff and joined with single spaces.
//
// The boolean result is false when token is not an H1-H6 start tag, when the
// first anchor has no href attribute, or when the tokenizer reports an error
// (including end of input) before the matching end tag is reached.
func createHeader(tokenizer *html.Tokenizer, token html.Token) (Header, bool) {
	hxDepth := getHxDepth(token.DataAtom)
	if hxDepth == InvalidDepth {
		return Header{}, false
	}

	var (
		href        string
		nameParts   []string
		seenAnchor  bool // the first <a> start tag has been processed
		afterAnchor bool // a </a> has been seen; text from here on is the name
	)
	for {
		tokenizer.Next()
		t := tokenizer.Token()
		switch t.Type {
		case html.ErrorToken:
			return Header{}, false
		case html.StartTagToken:
			// Only the first anchor carries the header's own link; anchors
			// inside the header text must not override it.
			if t.DataAtom != atom.A || seenAnchor {
				continue
			}
			seenAnchor = true
			hrefAttr, ok := findAttribute(t.Attr, "", "href")
			if !ok {
				// Expected to find href attribute
				return Header{}, false
			}
			href = hrefAttr.Val
		case html.EndTagToken:
			switch t.DataAtom {
			case token.DataAtom:
				// The matching end tag for the Hx closes the header.
				return Header{
					Depth: hxDepth,
					Name:  removeStuff(strings.Join(nameParts, " ")),
					Href:  href,
				}, true
			case atom.A:
				afterAnchor = true
			}
		case html.TextToken:
			if afterAnchor {
				nameParts = append(nameParts, removeStuff(t.Data))
			}
		}
	}
}

// findAttribute returns the first attribute in attrs whose namespace and key
// both match. The boolean result reports whether such an attribute was found;
// when it is false the returned attribute is the zero value.
func findAttribute(attrs []html.Attribute, namespace, key string) (html.Attribute, bool) {
	for i := range attrs {
		if candidate := attrs[i]; candidate.Key == key && candidate.Namespace == namespace {
			return candidate, true
		}
	}
	return html.Attribute{}, false
}
Loading