diff --git a/htmlutil.go b/htmlutil.go index c096443..05210e0 100755 --- a/htmlutil.go +++ b/htmlutil.go @@ -14,6 +14,33 @@ limitations under the License. */ +// Package htmlutil implements a wrapper for Golang's html5 tokeniser / parser implementation, making it much easier to +// find and extract information, aiming to be powerful and intuitive while remaining a minimal and logical extension. +// +// There are three core components, the `htmlutil.Node` struct (a wrapper for `*html.Node`), the `htmlutil.Parse` +// function (optional), and a ubiquitous filter algorithm used throughout this implementation, providing functionality +// similar to CSS selectors, and powered by optional (varargs) parameters in the form of chained closures with a +// signature of `func(htmlutil.Node) bool`. +// +// Filter behavior +// +// - based on a recursive algorithm where each node can match at most one filter, consuming it (for that sub-tree), +// and is added to the result if `len(filters) == 0` +// - every node in the tree is searched (in general, there is a "find" mode where only one result is returned) +// - nil filters are preemptively stripped, and so are treated like they were omitted +// - each node will be present in the result at most once, and will retain (depth first) order +// - behavior is undefined if the tree is not "well formed" (e.g. 
any cycles) +// - providing no filters will return ALL nodes (or if only one result is needed, the first node) +// - filter closures will not be called with a node with a nil `Data` field +// - filter closures will receive nodes with a `Depth` field relative to the original +// - the node's `Match` field stores the last "matched" node in the chain (note: duplicate matches for the same +// `*html.Node` are squashed), the root node is always treated as an initial match +// - resulting node values will retain the match chain (will always be non-nil if the root was non-nil) +// +// General behavior +// +// - a nil `Data` field for a `htmlutil.Node` indicates no node / no result, and methods should return default values, +// or other intuitive analog (behavior to make chaining far simpler) package htmlutil import ( @@ -47,74 +74,87 @@ func Parse(r io.Reader, filters ...func(node Node) bool) (Node, error) { } } -func (n Node) FilterNodes(filters ...func(node Node) bool) []Node { - return filterNodes(n, filters...) +// Attr will return the value of `n.Data.Attr`, returning nil if `n.Data` is nil +func (n Node) Attr() []html.Attribute { + if n.Data == nil { + return nil + } + return n.Data.Attr } -func (n Node) FindNode(filters ...func(node Node) bool) (Node, bool) { - return findNode(n, filters...) +// Offset is the difference between the depth of this node and the depth of last match, returning the depth of this +// node if `n.Match` is nil +func (n Node) Offset() int { + d := n.Depth + if n.Match != nil { + d -= n.Match.Depth + } + return d } -func (n Node) GetNode(filters ...func(node Node) bool) Node { - return getNode(n, filters...) 
+// Type will return the value of `n.Data.Type`, returning `html.ErrorNode` if `n.Data` is nil +func (n Node) Type() html.NodeType { + if n.Data != nil { + return n.Data.Type + } + return html.ErrorNode } -func (n Node) Attr() []html.Attribute { - if n.Data == nil { - return nil +// Tag will return `n.Data.Data` if the node has a type of `html.ElementNode`, otherwise it will return an empty string +func (n Node) Tag() string { + if n.Type() == html.ElementNode { + return n.Data.Data } - return n.Data.Attr + return "" } +// GetAttr matches on the first attribute (if any) for this node with the same namespace and key (key being case +// insensitive if namespace is empty), returning false if no match was found func (n Node) GetAttr(namespace string, key string) (html.Attribute, bool) { return getAttr(namespace, key, n.Attr()...) } +// GetAttrVal returns the value of any attribute matched by `n.GetAttr` func (n Node) GetAttrVal(namespace string, key string) string { return getAttrVal(namespace, key, n.Attr()...) } -func (n Node) EncodeHTML() string { - return encodeHTML(n.Data) -} - -func (n Node) EncodeText() string { - return encodeText(n.Data) +// String is an alias for `n.OuterHTML` +func (n Node) String() string { + return n.OuterHTML() } -func (n Node) String() string { - return n.EncodeHTML() +// OuterHTML encodes this node as html using the `html.Render` function, note that it will return an empty string +// if `n.Data` is nil, and will panic if any error is returned (which should only occur if the sub-tree is not +// "well formed") +func (n Node) OuterHTML() string { + return encodeHTML(n.Data) } -func (n Node) Range(fn func(i int, node Node) bool, filters ...func(node Node) bool) { - if fn == nil { - panic(errors.New("htmlutil.Node.Range nil fn")) - } - i := 0 - for node := n.FirstChild(filters...); node.Data != nil; node = node.NextSibling(filters...) 
{ - if !fn(i, node) { - break - } - i++ - } +// OuterText builds a string from the data of all text nodes in the sub-tree, starting from and including `n` +func (n Node) OuterText() string { + return encodeText(n.Data) } -func (n Node) Children(filters ...func(node Node) bool) (children []Node) { +// InnerHTML builds a string using the outer html of all children matching all filters (see the `FindNode` method) +func (n Node) InnerHTML(filters ...func(node Node) bool) string { + var b []byte n.Range( func(i int, node Node) bool { - children = append(children, node) + b = append(b, []byte(node.OuterHTML())...) return true }, filters..., ) - return + return string(b) } -func (n Node) InnerHTML(filters ...func(node Node) bool) string { +// InnerText builds a string using the outer text of all children matching all filters (see the `FindNode` method) +func (n Node) InnerText(filters ...func(node Node) bool) string { var b []byte n.Range( func(i int, node Node) bool { - b = append(b, []byte(node.EncodeHTML())...) + b = append(b, []byte(node.OuterText())...) return true }, filters..., @@ -122,18 +162,64 @@ func (n Node) InnerHTML(filters ...func(node Node) bool) string { return string(b) } -func (n Node) InnerText(filters ...func(node Node) bool) string { - var b []byte +// SiblingIndex returns the total number of previous siblings matching any filters (see the `FindNode` method) +func (n Node) SiblingIndex(filters ...func(node Node) bool) int { + return siblingIndex(n, filters...) +} + +// SiblingLength returns the total number of siblings matching any filters (see the `FindNode` method) incremented by +// one for the current node, or returns 0 if the receiver has nil data (is empty) +func (n Node) SiblingLength(filters ...func(node Node) bool) int { + return siblingLength(n, filters...) 
+} + +// FilterNodes returns all nodes from the sub-tree (a search including the receiver) matching the filters (see package +// comment for filter behavior) +func (n Node) FilterNodes(filters ...func(node Node) bool) []Node { + return filterNodes(n, filters...) +} + +// FindNode returns the first node from the sub-tree (a search including the receiver) matching the filters (see +// package comment for filter behavior) +func (n Node) FindNode(filters ...func(node Node) bool) (Node, bool) { + return findNode(n, filters...) +} + +// GetNode returns the node returned by FindNode without the boolean flag indicating if there was a match, it is +// provided for chaining purposes, since this package deliberately handles a nil `Data` field gracefully +func (n Node) GetNode(filters ...func(node Node) bool) Node { + return getNode(n, filters...) +} + +// Range iterates on any children matching any filters (see the `FindNode` method), providing the (filtered) index +// and node to the provided fn, note that it will panic if fn is nil +func (n Node) Range(fn func(i int, node Node) bool, filters ...func(node Node) bool) { + if fn == nil { + panic(errors.New("htmlutil.Node.Range nil fn")) + } + i := 0 + for node := n.FirstChild(filters...); node.Data != nil; node = node.NextSibling(filters...) { + if !fn(i, node) { + break + } + i++ + } +} + +// Children builds a slice containing all child nodes using the `Range` method, passing through filters +func (n Node) Children(filters ...func(node Node) bool) (children []Node) { n.Range( func(i int, node Node) bool { - b = append(b, []byte(node.EncodeText())...) 
+ children = append(children, node) return true }, filters..., ) - return string(b) + return } +// Parent will return the first parent node matching any filters (see the `FindNode` method), or a node with a nil +// `Data` field for no match, note that depth will be automatically decremented (potentially multiple times) func (n Node) Parent(filters ...func(node Node) bool) Node { n.Depth-- if n.Data != nil { @@ -147,6 +233,8 @@ func (n Node) Parent(filters ...func(node Node) bool) Node { return n } +// FirstChild will return the leftmost child node matching any filters (see the `FindNode` method), or a node with a +// nil `Data` field for no match, note that depth will be automatically incremented func (n Node) FirstChild(filters ...func(node Node) bool) Node { n.Depth++ if n.Data != nil { @@ -160,6 +248,8 @@ func (n Node) FirstChild(filters ...func(node Node) bool) Node { return n } +// LastChild will return the rightmost child node matching any filters (see the `FindNode` method), or a node with a +// nil `Data` field for no match, note that depth will be automatically incremented func (n Node) LastChild(filters ...func(node Node) bool) Node { n.Depth++ if n.Data != nil { @@ -173,6 +263,8 @@ func (n Node) LastChild(filters ...func(node Node) bool) Node { return n } +// PrevSibling will return the rightmost previous sibling node matching any filters (see the `FindNode` method), or a +// node with a nil `Data` field for no match func (n Node) PrevSibling(filters ...func(node Node) bool) Node { if n.Data != nil { n.Data = n.Data.PrevSibling @@ -185,6 +277,8 @@ func (n Node) PrevSibling(filters ...func(node Node) bool) Node { return n } +// NextSibling will return the leftmost next sibling node matching any filters (see the `FindNode` method), or a +// node with a nil `Data` field for no match func (n Node) NextSibling(filters ...func(node Node) bool) Node { if n.Data != nil { n.Data = n.Data.NextSibling @@ -196,36 +290,3 @@ func (n Node) 
NextSibling(filters ...func(node Node) bool) Node { } return n } - -func (n Node) MatchDepth() int { - d := n.Depth - if n.Match != nil { - d -= n.Match.Depth - } - return d -} - -func (n Node) Type() html.NodeType { - if n.Data != nil { - return n.Data.Type - } - return html.ErrorNode -} - -func (n Node) Tag() string { - if n.Type() == html.ElementNode { - return n.Data.Data - } - return "" -} - -// SiblingIndex returns the total number of previous siblings matching any filters -func (n Node) SiblingIndex(filters ...func(node Node) bool) int { - return siblingIndex(n, filters...) -} - -// SiblingLength returns the total number of siblings matching any filters incremented by one for the current node, -// or returns 0 if the receiver has nil data (is empty) -func (n Node) SiblingLength(filters ...func(node Node) bool) int { - return siblingLength(n, filters...) -} diff --git a/htmlutil_test.go b/htmlutil_test.go index 3dfac6c..31d4854 100755 --- a/htmlutil_test.go +++ b/htmlutil_test.go @@ -376,7 +376,7 @@ func TestGetNode_success(t *testing.T) { func(node Node) bool { return node.Tag() == `b` }, - ).EncodeHTML(); v != `a` { + ).OuterHTML(); v != `a` { t.Error(v) } } @@ -471,7 +471,7 @@ func TestNode_FilterNodes_depth(t *testing.T) { return node.Tag() == "div" && node.GetAttrVal(``, `a`) == "" }, func(node Node) bool { - return node.Type() == html.ElementNode && node.MatchDepth() == 2 + return node.Type() == html.ElementNode && node.Offset() == 2 }, ) if len(nodes) != 2 { @@ -480,7 +480,7 @@ func TestNode_FilterNodes_depth(t *testing.T) { if v := nodes[0].Depth; v != 5 { t.Error(v) } - if v := nodes[0].MatchDepth(); v != 2 { + if v := nodes[0].Offset(); v != 2 { t.Error(v) } if v := nodes[0].Match; v == nil { @@ -489,7 +489,7 @@ func TestNode_FilterNodes_depth(t *testing.T) { if v := v.Depth; v != 3 { t.Error(v) } - if v := v.MatchDepth(); v != 3 { + if v := v.Offset(); v != 3 { t.Error(v) } if v := v.Match; v == nil || v.Data == nil { @@ -506,7 +506,7 @@ func 
TestNode_FilterNodes_depth(t *testing.T) { t.Error(v) } else if v.Match != a.Match { t.Error(v) - } else if v := v.EncodeHTML(); v != `
five
six
` { + } else if v := v.OuterHTML(); v != `
five
six
` { t.Error(v) } a.Match = v.Match @@ -518,7 +518,7 @@ func TestNode_FilterNodes_depth(t *testing.T) { if v := nodes[0].Parent().PrevSibling(); v.Data == nil || v.Depth != 4 { t.Error(v) } else { - if v := v.EncodeHTML(); v != `
two
` { + if v := v.OuterHTML(); v != `
two
` { t.Error(v) } } @@ -535,10 +535,10 @@ func TestNode_FilterNodes_depth(t *testing.T) { t.Error(v) } } - if v := nodes[0].EncodeHTML(); v != `
four
` { + if v := nodes[0].OuterHTML(); v != `
four
` { t.Error(v) } - if v := nodes[1].EncodeHTML(); v != `
six
` { + if v := nodes[1].OuterHTML(); v != `
six
` { t.Error(v) } } @@ -572,18 +572,18 @@ func TestNode_FindNode_success(t *testing.T) { if n.Depth != 5 { t.Error(n.Depth) } else { - if v := n.Parent(); v.Data == nil || v.Depth != 4 || v.EncodeHTML() != `` { - t.Error(v.Data, v.Depth, v.EncodeHTML()) + if v := n.Parent(); v.Data == nil || v.Depth != 4 || v.OuterHTML() != `` { + t.Error(v.Data, v.Depth, v.OuterHTML()) } - if v := n.Parent(nil, nil, nil, nil, nil, nil, nil, nil, nil, nil); v.Data == nil || v.Depth != 4 || v.EncodeHTML() != `` { - t.Error(v.Data, v.Depth, v.EncodeHTML()) + if v := n.Parent(nil, nil, nil, nil, nil, nil, nil, nil, nil, nil); v.Data == nil || v.Depth != 4 || v.OuterHTML() != `` { + t.Error(v.Data, v.Depth, v.OuterHTML()) } if v := n.Parent( func(node Node) bool { return true }, - ); v.Data == nil || v.Depth != 4 || v.EncodeHTML() != `` { - t.Error(v.Data, v.Depth, v.EncodeHTML()) + ); v.Data == nil || v.Depth != 4 || v.OuterHTML() != `` { + t.Error(v.Data, v.Depth, v.OuterHTML()) } if v := n.Parent( nil, @@ -592,8 +592,8 @@ func TestNode_FindNode_success(t *testing.T) { func(node Node) bool { return node.Tag() == `b` }, - ); v.Data == nil || v.Depth != 4 || v.EncodeHTML() != `` { - t.Error(v.Data, v.Depth, v.EncodeHTML()) + ); v.Data == nil || v.Depth != 4 || v.OuterHTML() != `` { + t.Error(v.Data, v.Depth, v.OuterHTML()) } if v := n.Parent( nil, @@ -602,8 +602,8 @@ func TestNode_FindNode_success(t *testing.T) { func(node Node) bool { return node.Tag() == `body` }, - ); v.Data == nil || v.Depth != 2 || v.EncodeHTML() != `` { - t.Error(v.Data, v.Depth, v.EncodeHTML()) + ); v.Data == nil || v.Depth != 2 || v.OuterHTML() != `` { + t.Error(v.Data, v.Depth, v.OuterHTML()) } } @@ -840,19 +840,19 @@ func TestNode_Children(t *testing.T) { if v := nodes[0].Type(); v != html.ElementNode { t.Fatal(v) } - if v := nodes[0].EncodeHTML(); v != `` { + if v := nodes[0].OuterHTML(); v != `` { t.Fatal(v) } if v := nodes[1].Type(); v != html.TextNode { t.Fatal(v) } - if v := nodes[1].EncodeHTML(); v != ` 
` { + if v := nodes[1].OuterHTML(); v != ` ` { t.Fatal(v) } if v := nodes[2].Type(); v != html.ElementNode { t.Fatal(v) } - if v := nodes[2].EncodeHTML(); v != `` { + if v := nodes[2].OuterHTML(); v != `` { t.Fatal(v) } } @@ -876,7 +876,7 @@ func TestNode_GetNode_success(t *testing.T) { func(node Node) bool { return node.Tag() == `b` }, - ).EncodeHTML(); v != `a` { + ).OuterHTML(); v != `a` { t.Error(v) } } @@ -906,7 +906,7 @@ func TestNode_Range_bailOutTrue(t *testing.T) { ) node.Range( func(i int, node Node) bool { - values = append(values, node.EncodeHTML()) + values = append(values, node.OuterHTML()) if i != index { t.Fatal(i, index) } @@ -939,7 +939,7 @@ func TestNode_Range_bailOutFalse(t *testing.T) { ) node.Range( func(i int, node Node) bool { - values = append(values, node.EncodeHTML()) + values = append(values, node.OuterHTML()) if i != index { t.Fatal(i, index) } @@ -1023,14 +1023,14 @@ func TestNode_Range_filter(t *testing.T) { count int filter = func(node Node) bool { count++ - return node.MatchDepth() == 0 && + return node.Offset() == 0 && node.Type() == html.ElementNode && node.Tag() != `no` } ) node.Range( func(i int, node Node) bool { - values = append(values, node.EncodeHTML()) + values = append(values, node.OuterHTML()) if i != index { t.Fatal(i, index) } @@ -1066,7 +1066,7 @@ func TestNode_Range_filter(t *testing.T) { func() { var children []string for _, node := range node.Children(filter) { - children = append(children, node.EncodeHTML()) + children = append(children, node.OuterHTML()) } if len(children) != 3 { t.Error(len(children)) @@ -1105,7 +1105,7 @@ func TestNode_Range_filter(t *testing.T) { func() { var children []string for node := node.LastChild(filter); node.Data != nil; node = node.PrevSibling(filter) { - children = append([]string{node.EncodeHTML()}, children...) + children = append([]string{node.OuterHTML()}, children...) } if len(children) != 3 { t.Error(len(children))