Skip to content

Commit

Permalink
refactor(api): comment everything and likely final breaking changes b…
Browse files Browse the repository at this point in the history
…efore v1
  • Loading branch information
joeycumines committed Feb 11, 2019
1 parent 5e6739f commit b777d8b
Show file tree
Hide file tree
Showing 2 changed files with 160 additions and 99 deletions.
203 changes: 132 additions & 71 deletions htmlutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,33 @@
limitations under the License.
*/

// Package htmlutil implements a wrapper for Golang's html5 tokeniser / parser implementation, making it much easier to
// find and extract information, aiming to be powerful and intuitive while remaining a minimal and logical extension.
//
// There are three core components, the `htmlutil.Node` struct (a wrapper for `*html.Node`), the `htmlutil.Parse`
// function (optional), an ubiquitous filter algorithm used throughout this implementation, providing functionality
// similar to CSS selectors, and powered by optional (varargs) parameters in the form of chained closures with a
// signature of `func(htmlutil.Node) bool`.
//
// Filter behavior
//
// - based on a recursive algorithm where each node can match at most one filter, consuming it (for that sub-tree),
// and is added to the result if `len(filters) == 0`
// - every node in the tree is searched (in general, there is a "find" mode where only one result is returned)
// - nil filters are preemptively stripped, and so are treated like they were omitted
// - each node will be present in the result at most once, and will retain (depth first) order
// - behavior is undefined if the tree is not "well formed" (e.g. any cycles)
// - providing no filters will return ALL nodes (or if only one result is needed, the first node)
// - filter closures will not be called with a node with a nil `Data` field
// - filter closures will receive nodes with a `Depth` field relative to the original
// - the node's `Match` field stores the last "matched" node in the chain (note: duplicate matches for the same
// `*html.Node` are squashed), the root node is always treated as an initial match
// - resulting node values will retain the match chain (will always be non-nil if the root was non-nil)
//
// General behavior
//
// - a nil `Data` field for a `htmlutil.Node` indicates no node / no result, and methods should return default values,
// or other intuitive analog (behavior to make chaining far simpler)
package htmlutil

import (
Expand Down Expand Up @@ -47,93 +74,152 @@ func Parse(r io.Reader, filters ...func(node Node) bool) (Node, error) {
}
}

func (n Node) FilterNodes(filters ...func(node Node) bool) []Node {
return filterNodes(n, filters...)
// Attr will return the value of `n.Data.Attr`, returning nil if `n.Data` is nil
func (n Node) Attr() []html.Attribute {
if n.Data == nil {
return nil
}
return n.Data.Attr
}

func (n Node) FindNode(filters ...func(node Node) bool) (Node, bool) {
return findNode(n, filters...)
// Offset is the difference between the depth of this node and the depth of last match, returning the depth of this
// node if `n.Match` is nil
func (n Node) Offset() int {
d := n.Depth
if n.Match != nil {
d -= n.Match.Depth
}
return d
}

func (n Node) GetNode(filters ...func(node Node) bool) Node {
return getNode(n, filters...)
// Type will return the value of `n.Data.Type`, returning `html.ErrorNode` if `n.Data` is nil
func (n Node) Type() html.NodeType {
if n.Data != nil {
return n.Data.Type
}
return html.ErrorNode
}

func (n Node) Attr() []html.Attribute {
if n.Data == nil {
return nil
// Tag will return `n.Data.Data` if the node has a type of `html.ElementNode`, otherwise it will return an empty string
func (n Node) Tag() string {
if n.Type() == html.ElementNode {
return n.Data.Data
}
return n.Data.Attr
return ""
}

// GetAttr matches on the first attribute (if any) for this node with the same namespace and key (key being case
// insensitive if namespace is empty), returning false if no match was found
func (n Node) GetAttr(namespace string, key string) (html.Attribute, bool) {
return getAttr(namespace, key, n.Attr()...)
}

// GetAttrVal returns the value of any attribute matched by `n.GetAttr`
func (n Node) GetAttrVal(namespace string, key string) string {
return getAttrVal(namespace, key, n.Attr()...)
}

func (n Node) EncodeHTML() string {
return encodeHTML(n.Data)
}

func (n Node) EncodeText() string {
return encodeText(n.Data)
// String is an alias for `n.OuterHTML`
func (n Node) String() string {
return n.OuterHTML()
}

func (n Node) String() string {
return n.EncodeHTML()
// OuterHTML encodes this node as html using the `html.Render` function, note that it will return an empty string
// if `n.Data` is nil, and will panic if any error is returned (which should only occur if the sub-tree is not
// "well formed")
func (n Node) OuterHTML() string {
return encodeHTML(n.Data)
}

func (n Node) Range(fn func(i int, node Node) bool, filters ...func(node Node) bool) {
if fn == nil {
panic(errors.New("htmlutil.Node.Range nil fn"))
}
i := 0
for node := n.FirstChild(filters...); node.Data != nil; node = node.NextSibling(filters...) {
if !fn(i, node) {
break
}
i++
}
// OuterText builds a string from the data of all text nodes in the sub-tree, starting from and including `n`
func (n Node) OuterText() string {
return encodeText(n.Data)
}

func (n Node) Children(filters ...func(node Node) bool) (children []Node) {
// InnerHTML builds a string using the outer html of all children matching all filters (see the `FindNode` method)
func (n Node) InnerHTML(filters ...func(node Node) bool) string {
var b []byte
n.Range(
func(i int, node Node) bool {
children = append(children, node)
b = append(b, []byte(node.OuterHTML())...)
return true
},
filters...,
)
return
return string(b)
}

func (n Node) InnerHTML(filters ...func(node Node) bool) string {
// InnerText builds a string using the outer text of all children matching all filters (see the `FindNode` method)
func (n Node) InnerText(filters ...func(node Node) bool) string {
var b []byte
n.Range(
func(i int, node Node) bool {
b = append(b, []byte(node.EncodeHTML())...)
b = append(b, []byte(node.OuterText())...)
return true
},
filters...,
)
return string(b)
}

func (n Node) InnerText(filters ...func(node Node) bool) string {
var b []byte
// SiblingIndex returns the total number of previous siblings matching any filters (see the `FindNode` method)
func (n Node) SiblingIndex(filters ...func(node Node) bool) int {
return siblingIndex(n, filters...)
}

// SiblingLength returns the total number of siblings matching any filters (see the `FindNode` method) incremented by
// one for the current node, or returns 0 if the receiver has nil data (is empty)
func (n Node) SiblingLength(filters ...func(node Node) bool) int {
return siblingLength(n, filters...)
}

// FilterNodes returns all nodes from the sub-tree (a search including the receiver) matching the filters (see package
// comment for filter behavior)
func (n Node) FilterNodes(filters ...func(node Node) bool) []Node {
return filterNodes(n, filters...)
}

// FindNode returns the first node from the sub-tree (a search including the receiver) matching the filters (see
// package comment for filter behavior)
func (n Node) FindNode(filters ...func(node Node) bool) (Node, bool) {
return findNode(n, filters...)
}

// GetNode returns the node returned by FindNode without the boolean flag indicating if there was a match, it is
// provided for chaining purposes, since this package deliberately handles a nil `Data` field gracefully
func (n Node) GetNode(filters ...func(node Node) bool) Node {
return getNode(n, filters...)
}

// Range iterates on any children matching any filters (see the `FindNode` method), providing the (filtered) index
// and node to the provided fn, note that it will panic if fn is nil
func (n Node) Range(fn func(i int, node Node) bool, filters ...func(node Node) bool) {
if fn == nil {
panic(errors.New("htmlutil.Node.Range nil fn"))
}
i := 0
for node := n.FirstChild(filters...); node.Data != nil; node = node.NextSibling(filters...) {
if !fn(i, node) {
break
}
i++
}
}

// Children builds a slice containing all child nodes using the `Range` method, passing through filters
func (n Node) Children(filters ...func(node Node) bool) (children []Node) {
n.Range(
func(i int, node Node) bool {
b = append(b, []byte(node.EncodeText())...)
children = append(children, node)
return true
},
filters...,
)
return string(b)
return
}

// Parent will return the first parent node matching any filters (see the `FindNode` method), or a node with a nil
// `Data` property for no match, note that depth will be automatically decremented (potentially multiple times)
func (n Node) Parent(filters ...func(node Node) bool) Node {
n.Depth--
if n.Data != nil {
Expand All @@ -147,6 +233,8 @@ func (n Node) Parent(filters ...func(node Node) bool) Node {
return n
}

// FirstChild will return the leftmost child node matching any filters (see the `FindNode` method), or a node with a
// nil `Data` property for no match, note that depth will be automatically incremented
func (n Node) FirstChild(filters ...func(node Node) bool) Node {
n.Depth++
if n.Data != nil {
Expand All @@ -160,6 +248,8 @@ func (n Node) FirstChild(filters ...func(node Node) bool) Node {
return n
}

// LastChild will return the rightmost child node matching any filters (see the `FindNode` method), or a node with a
// nil `Data` property for no match, note that depth will be automatically incremented
func (n Node) LastChild(filters ...func(node Node) bool) Node {
n.Depth++
if n.Data != nil {
Expand All @@ -173,6 +263,8 @@ func (n Node) LastChild(filters ...func(node Node) bool) Node {
return n
}

// PrevSibling will return the rightmost previous sibling node matching any filters (see the `FindNode` method), or a
// node with a nil `Data` property for no match
func (n Node) PrevSibling(filters ...func(node Node) bool) Node {
if n.Data != nil {
n.Data = n.Data.PrevSibling
Expand All @@ -185,6 +277,8 @@ func (n Node) PrevSibling(filters ...func(node Node) bool) Node {
return n
}

// NextSibling will return the leftmost next sibling node matching any filters (see the `FindNode` method), or a
// node with a nil `Data` property for no match
func (n Node) NextSibling(filters ...func(node Node) bool) Node {
if n.Data != nil {
n.Data = n.Data.NextSibling
Expand All @@ -196,36 +290,3 @@ func (n Node) NextSibling(filters ...func(node Node) bool) Node {
}
return n
}

func (n Node) MatchDepth() int {
d := n.Depth
if n.Match != nil {
d -= n.Match.Depth
}
return d
}

func (n Node) Type() html.NodeType {
if n.Data != nil {
return n.Data.Type
}
return html.ErrorNode
}

func (n Node) Tag() string {
if n.Type() == html.ElementNode {
return n.Data.Data
}
return ""
}

// SiblingIndex returns the total number of previous siblings matching any filters
func (n Node) SiblingIndex(filters ...func(node Node) bool) int {
return siblingIndex(n, filters...)
}

// SiblingLength returns the total number of siblings matching any filters incremented by one for the current node,
// or returns 0 if the receiver has nil data (is empty)
func (n Node) SiblingLength(filters ...func(node Node) bool) int {
return siblingLength(n, filters...)
}
Loading

0 comments on commit b777d8b

Please sign in to comment.