diff --git a/README.md b/README.md index 5e217b3..1b62e16 100644 --- a/README.md +++ b/README.md @@ -44,13 +44,12 @@ Options: Many options of the following tests can customised. Items marked :soon: are not checked yet, but will be *soon*. - `a` `link` `img` `script`: Whether internal links work / are valid. -- `a`: :soon: Whether internal hashes work. +- `a`: Whether internal hashes work. - `a` `link` `img` `script`: Whether external links work. - `a`: :soon: Whether external hashes work. - `a` `link`: Whether external links use HTTPS. -- `a` `link`: Whether external links use HTTPS. - `img`: Whether your images have valid alt attributes. -- `meta`: :soon: Whether favicons are valid. +- `link`: Whether pages have a valid favicon. - `meta`: :soon: Whether images and URLs in the OpenGraph metadata are valid. - `meta` `title`: :soon: Whether you've got the [recommended tags](https://support.google.com/webmasters/answer/79812?hl=en) in your head. @@ -90,12 +89,13 @@ htmltest uses a YAML configuration file. Put `.htmltest.yml` in the same directo | `CheckScripts` | Enables checking `" + nodeDoc, _ := html.Parse(strings.NewReader(snip)) + nodeImg := nodeDoc.FirstChild.FirstChild.NextSibling.FirstChild + attrs := ExtractAttrs(nodeImg.Attr, []string{"src", "alt"}) + + assert.Equals(t, "src", attrs["src"], "x") + assert.Equals(t, "alt", attrs["alt"], "y") + assert.NotEquals(t, "foo", attrs["foo"], "bar") +} + +func TestAttrPresent(t *testing.T) { + snip := "" + nodeDoc, _ := html.Parse(strings.NewReader(snip)) + nodeImg := nodeDoc.FirstChild.FirstChild.NextSibling.FirstChild + + assert.Equals(t, "src in attr", AttrPresent(nodeImg.Attr, "src"), true) + assert.Equals(t, "alt in attr", AttrPresent(nodeImg.Attr, "src"), true) + assert.NotEquals(t, "foo in attr", AttrPresent(nodeImg.Attr, "src"), false) +} + +func TestAttrValIdId(t *testing.T) { + snip := "
" + nodeDoc, _ := html.Parse(strings.NewReader(snip)) + nodeH1 := nodeDoc.FirstChild.FirstChild.NextSibling.FirstChild + + assert.Equals(t, "h1 id", GetId(nodeH1.Attr), "x") +} + +func TestAttrValIdName(t *testing.T) { + snip := "" + nodeDoc, _ := html.Parse(strings.NewReader(snip)) + nodeH1 := nodeDoc.FirstChild.FirstChild.NextSibling.FirstChild + + assert.Equals(t, "h1 name", GetId(nodeH1.Attr), "x") +} diff --git a/htmldoc/document.go b/htmldoc/document.go index 57cbff4..37862a9 100644 --- a/htmldoc/document.go +++ b/htmldoc/document.go @@ -3,16 +3,18 @@ package htmldoc import ( "golang.org/x/net/html" "os" - "path" - "regexp" + "sync" ) type Document struct { - FilePath string // Relative to the shell session - SitePath string // Relative to the site root - Directory string - HTMLNode *html.Node - State DocumentState + FilePath string // Relative to the shell session + SitePath string // Relative to the site root + Directory string + htmlMutex *sync.Mutex + htmlNode *html.Node + hashMap map[string]*html.Node + NodesOfInterest []*html.Node + State DocumentState } // Used by checks that depend on the document being parsed @@ -20,7 +22,23 @@ type DocumentState struct { FaviconPresent bool } +func (doc *Document) Init() { + // Setup the document, doesn't mesh nice with the NewXYZ() convention but + // many optional parameters for Document and no parameter overloading in Go + doc.htmlMutex = &sync.Mutex{} + doc.NodesOfInterest = make([]*html.Node, 0) + doc.hashMap = make(map[string]*html.Node) +} + func (doc *Document) Parse() { + // Parse the document + // Either called when the document is tested or when another document needs + // data from this one. 
+ doc.htmlMutex.Lock() // MUTEX + if doc.htmlNode != nil { + doc.htmlMutex.Unlock() // MUTEX + return + } // Open, parse, and close document f, err := os.Open(doc.FilePath) checkErr(err) @@ -29,73 +47,34 @@ func (doc *Document) Parse() { htmlNode, err := html.Parse(f) checkErr(err) - doc.HTMLNode = htmlNode -} - -func DocumentsFromDir(path string, ignorePatterns []interface{}) []Document { - // Nice proxy for recurseDir - return recurseDir(path, ignorePatterns, "") + doc.htmlNode = htmlNode + doc.parseNode(htmlNode) + doc.htmlMutex.Unlock() // MUTEX } -func recurseDir(basePath string, ignorePatterns []interface{}, dPath string) []Document { - // Recursive function that returns all Document struts in a given - // os directory. - // basePath: the directory to scan - // dPath: the subdirectory within basePath we're scanning - // ignorePatterns: string slice of dPaths to ignore - - documents := make([]Document, 0) - - if isDirIgnored(ignorePatterns, dPath) { - return documents - } - - // Open directory to scan - f, err := os.Open(path.Join(basePath, dPath)) - checkErr(err) - defer f.Close() - - // Get FileInfo of directory (scan it) - fi, err := f.Stat() - checkErr(err) - - if fi.IsDir() { // Double check we're dealing with a directory - // Read all FileInfo-s from directory, Readdir(count int) - fis, err := f.Readdir(-1) - checkErr(err) - - // Iterate over contents of directory - for _, fileinfo := range fis { - fPath := path.Join(dPath, fileinfo.Name()) - if fileinfo.IsDir() { - // If item is a dir, we need to iterate further, save returned documents - documents = append(documents, recurseDir(basePath, ignorePatterns, fPath)...) 
- } else if path.Ext(fileinfo.Name()) == ".html" || path.Ext(fileinfo.Name()) == ".htm" { - // If a file, save to filename list - documents = append(documents, Document{ - FilePath: path.Join(basePath, fPath), - SitePath: fPath, - Directory: dPath, - }) - } +func (doc *Document) parseNode(n *html.Node) { + if n.Type == html.ElementNode { + // If present save fragment identifier to the hashMap + nodeId := GetId(n.Attr) + if nodeId != "" { + doc.hashMap[nodeId] = n + } + // Identify and store tags of interest + switch n.Data { + case "a", "link", "img", "script": + doc.NodesOfInterest = append(doc.NodesOfInterest, n) + case "pre", "code": + return // Everything within these elements is not to be interpreted } - } else { // It's a file, return single file - filePath := path.Join(basePath, dPath) - documents = append(documents, Document{ - FilePath: filePath, - SitePath: path.Base(filePath), - Directory: dPath, - }) } - - return documents + // Iterate over children + for c := n.FirstChild; c != nil; c = c.NextSibling { + doc.parseNode(c) + } } -func isDirIgnored(ignorePatterns []interface{}, dir string) bool { - for _, item := range ignorePatterns { - if ok, _ := regexp.MatchString(item.(string), dir+"/"); ok { - return true - } - } - return false +func (doc *Document) IsHashValid(hash string) bool { + doc.Parse() // Ensure doc has been parsed + _, ok := doc.hashMap[hash] + return ok } diff --git a/htmldoc/document_store.go b/htmldoc/document_store.go new file mode 100644 index 0000000..441de67 --- /dev/null +++ b/htmldoc/document_store.go @@ -0,0 +1,117 @@ +package htmldoc + +import ( + "os" + "path" + "regexp" +) + +type DocumentStore struct { + BasePath string // Path, relative to cwd, the site is located in + IgnorePatterns []interface{} // Regexes of directories to ignore + Documents []*Document // All of the documents, used to iterate over + DocumentPathMap map[string]*Document // Maps slash separated paths to documents + DocumentExtension string // File 
extension to look for + DirectoryIndex string // What file is the index of the directory +} + +func NewDocumentStore() DocumentStore { + return DocumentStore{ + Documents: make([]*Document, 0), + DocumentPathMap: make(map[string]*Document), + } +} + +func (dS *DocumentStore) AddDocument(doc *Document) { + // Save reference to document to various data stores + dS.Documents = append(dS.Documents, doc) + dS.DocumentPathMap[doc.SitePath] = doc +} + +func (dS *DocumentStore) Discover() { + // Find all documents in BasePath + dS.discoverRecurse(".") +} + +func (dS *DocumentStore) isDirIgnored(dir string) bool { + // Does path dir match IgnorePatterns? + for _, item := range dS.IgnorePatterns { + if ok, _ := regexp.MatchString(item.(string), dir+"/"); ok { + return true + } + } + return false +} + +func (dS *DocumentStore) discoverRecurse(dPath string) { + // Recurse over relative path dPath, saves found documents to dS + if dS.isDirIgnored(dPath) { + return + } + + // Open directory to scan + f, err := os.Open(path.Join(dS.BasePath, dPath)) + checkErr(err) + defer f.Close() + + // Get FileInfo of directory (scan it) + fi, err := f.Stat() + checkErr(err) + + if fi.IsDir() { // Double check we're dealing with a directory + // Read all FileInfo-s from directory, Readdir(count int) + fis, err := f.Readdir(-1) + checkErr(err) + + // Iterate over contents of directory + for _, fileinfo := range fis { + fPath := path.Join(dPath, fileinfo.Name()) + if fileinfo.IsDir() { + // If item is a dir, we delve deeper + dS.discoverRecurse(fPath) + } else if path.Ext(fileinfo.Name()) == dS.DocumentExtension { + // If a file, create and save document + newDoc := &Document{ + FilePath: path.Join(dS.BasePath, fPath), + SitePath: fPath, + Directory: dPath, + } + newDoc.Init() + dS.AddDocument(newDoc) + } + } + } else { // It's a file, return single file + panic("discoverRecurse encountered a file: " + dPath) + } + +} + +func (dS *DocumentStore) ResolvePath(refPath string) (*Document, bool) { + 
// Resolves internal absolute paths to documents + + // Match root document + if refPath == "/" { + d0, b0 := dS.DocumentPathMap[dS.DirectoryIndex] + return d0, b0 + } + + if refPath[0] == '/' && len(refPath) > 1 { + // Is an absolute link, remove the leading slash for map lookup + refPath = refPath[1:len(refPath)] + } + + // Try path as-is, path.ext + d1, b1 := dS.DocumentPathMap[refPath] + if b1 { + // as-is worked, return that + return d1, b1 + } + + // Try as a directory, path.ext/index.html + d2, b2 := dS.DocumentPathMap[path.Join(refPath, dS.DirectoryIndex)] + return d2, b2 +} + +func (dS *DocumentStore) ResolveRef(ref *Reference) (*Document, bool) { + return dS.ResolvePath(ref.RefSitePath()) +} diff --git a/htmldoc/document_store_test.go b/htmldoc/document_store_test.go new file mode 100644 index 0000000..c96e080 --- /dev/null +++ b/htmldoc/document_store_test.go @@ -0,0 +1,77 @@ +package htmldoc + +import ( + "github.com/daviddengcn/go-assert" + "testing" +) + +func TestDocumentStoreDiscover(t *testing.T) { + // documentstore can scan an os directory + dS := NewDocumentStore() + dS.BasePath = "fixtures/documents" + dS.DocumentExtension = ".html" // Ignores .htm + dS.DirectoryIndex = "index.html" + dS.Discover() + // Fixtures dir has eight documents in various folders + assert.Equals(t, "document count", len(dS.Documents), 6) +} + +func TestDocumentStoreIgnorePatterns(t *testing.T) { + // documentstore can scan an os directory + dS := NewDocumentStore() + dS.BasePath = "fixtures/documents" + dS.DocumentExtension = ".html" // Ignores .htm + dS.DirectoryIndex = "index.html" + dS.IgnorePatterns = []interface{}{"^lib/"} + dS.Discover() + // Fixtures dir has seven documents in various folders, (one ignored in lib) + assert.Equals(t, "document count", len(dS.Documents), 5) +} + +func TestDocumentStoreDocumentExists(t *testing.T) { + // documentstore knows if documents exist or not + dS := NewDocumentStore() + dS.BasePath = "fixtures/documents" + 
dS.DocumentExtension = ".html" + dS.DirectoryIndex = "index.html" + dS.Discover() + _, b1 := dS.DocumentPathMap["index.html"] + assert.IsTrue(t, "index.html exists", b1) + _, b2 := dS.DocumentPathMap["dir2/index.html"] + assert.IsTrue(t, "dir2/index.html exists", b2) + _, b3 := dS.DocumentPathMap["foo.html"] + assert.IsFalse(t, "foo.html does not exist", b3) + _, b4 := dS.DocumentPathMap["dir3/index.html"] + assert.IsFalse(t, "dir3/index.html does not exist", b4) +} + +func TestDocumentStoreDocumentResolve(t *testing.T) { + // documentstore correctly resolves documents + dS := NewDocumentStore() + dS.BasePath = "fixtures/documents" + dS.DocumentExtension = ".html" + dS.DirectoryIndex = "index.html" + dS.Discover() + d0, b0 := dS.ResolvePath("/") + assert.IsTrue(t, "root document exists", b0) + assert.Equals(t, "/ resolves to index.html", + d0.FilePath, "fixtures/documents/index.html") + d1, b1 := dS.ResolvePath("/contact.html") + assert.IsTrue(t, "/contact.html exists", b1) + assert.Equals(t, "/contact.html resolves to correct document", + d1.FilePath, "fixtures/documents/contact.html") + d2, b2 := dS.ResolvePath("dir2/index.html") + assert.IsTrue(t, "dir2/index.html exists", b2) + assert.Equals(t, "dir2/index.html resolves to correct document", + d2.FilePath, "fixtures/documents/dir2/index.html") + d3, b3 := dS.ResolvePath("dir2/") + assert.IsTrue(t, "dir2/index.html exists", b3) + assert.Equals(t, "dir2/index.html resolves to correct document", + d3.FilePath, "fixtures/documents/dir2/index.html") + d4, b4 := dS.ResolvePath("dir2") + assert.IsTrue(t, "dir2/index.html exists", b4) + assert.Equals(t, "dir2/index.html resolves to correct document", + d4.FilePath, "fixtures/documents/dir2/index.html") + _, b5 := dS.ResolvePath("does-not-exist") + assert.IsFalse(t, "does not return doc for invalid path", b5) +} diff --git a/htmldoc/document_test.go b/htmldoc/document_test.go index c0fd9d8..0e0420d 100644 --- a/htmldoc/document_test.go +++ b/htmldoc/document_test.go @@ 
-10,14 +10,30 @@ func TestDocumentParse(t *testing.T) { doc := Document{ FilePath: "fixtures/documents/index.html", } + doc.Init() doc.Parse() - nodeElem := doc.HTMLNode.FirstChild.FirstChild.NextSibling.FirstChild + nodeElem := doc.htmlNode.FirstChild.FirstChild.NextSibling.FirstChild assert.Equals(t, "document first body node", nodeElem.Data, "h1") } -func TestDocumentsFromDir(t *testing.T) { - // it creates Document struts from an os directory - docs := DocumentsFromDir("fixtures/documents", []interface{}{"^lib/"}) - // Fixtures dir has seven documents in various folders - assert.Equals(t, "document count", len(docs), 7) +func TestDocumentNodesOfInterest(t *testing.T) { + doc := Document{ + FilePath: "fixtures/documents/nodes.htm", + } + doc.Init() + doc.Parse() + assert.Equals(t, "nodes of interest", len(doc.NodesOfInterest), 4) +} + +func TestDocumentIsHashValid(t *testing.T) { + // parse a document and check we have valid nodes + doc := Document{ + FilePath: "fixtures/documents/index.html", + } + doc.Init() + doc.Parse() + + assert.IsTrue(t, "#xyz present", doc.IsHashValid("xyz")) + assert.IsTrue(t, "#prq present", doc.IsHashValid("prq")) + assert.IsFalse(t, "#abc present", doc.IsHashValid("abc")) } diff --git a/htmldoc/fixtures/conf.json b/htmldoc/fixtures/conf.json deleted file mode 100644 index 3f17ad0..0000000 --- a/htmldoc/fixtures/conf.json +++ /dev/null @@ -1 +0,0 @@ -DirectoryPath: "htmldoc/fixtures/documents" diff --git a/htmldoc/fixtures/documents/index.html b/htmldoc/fixtures/documents/index.html index afc2780..2fc8836 100644 --- a/htmldoc/fixtures/documents/index.html +++ b/htmldoc/fixtures/documents/index.html @@ -1 +1,9 @@Lorem ipsum
+ +Another para
+ ++ + + + ++
+
+
+
+
+
diff --git a/htmldoc/reference.go b/htmldoc/reference.go
index 7b74911..196866e 100644
--- a/htmldoc/reference.go
+++ b/htmldoc/reference.go
@@ -73,11 +73,7 @@ func (ref *Reference) IsInternalAbsolute() bool {
return !strings.HasPrefix(ref.Path, "//") && strings.HasPrefix(ref.Path, "/")
}
-func (ref *Reference) AbsolutePath() string {
- // If external return unchanged
- if ref.Scheme() != "file" {
- return ref.URL.Path
- }
+func (ref *Reference) RefSitePath() string {
// If internal, return a path to the referenced file relative to the 'site root'
// Strip shit off the end?
if ref.IsInternalAbsolute() {
diff --git a/htmldoc/reference_test.go b/htmldoc/reference_test.go
index 3fad37d..3e527b6 100644
--- a/htmldoc/reference_test.go
+++ b/htmldoc/reference_test.go
@@ -19,7 +19,7 @@ func TestReferenceScheme(t *testing.T) {
doc := Document{
SitePath: "doc.html",
- HTMLNode: nodeDoc,
+ htmlNode: nodeDoc,
}
var ref *Reference
@@ -28,6 +28,8 @@ func TestReferenceScheme(t *testing.T) {
assert.Equals(t, "http reference", ref.Scheme(), "http")
ref = NewReference(&doc, nodeElem, "https://test.com")
assert.Equals(t, "https reference", ref.Scheme(), "https")
+ ref = NewReference(&doc, nodeElem, "//test.com")
+ assert.Equals(t, "https reference", ref.Scheme(), "https")
ref = NewReference(&doc, nodeElem,
"https://photos.smugmug.com/photos/i-CNHsHLM/0/440x622/i-CNHsHLM-440x622.jpg")
assert.Equals(t, "http reference", ref.Scheme(), "https")
@@ -53,13 +55,19 @@ func TestReferenceURLString(t *testing.T) {
doc := Document{
SitePath: "doc.html",
- HTMLNode: nodeDoc,
+ htmlNode: nodeDoc,
}
var ref *Reference
- ref = NewReference(&doc, nodeElem, "google.com")
- assert.Equals(t, "URLString", ref.URLString(), "google.com")
+ ref = NewReference(&doc, nodeElem, "http://example.com")
+ assert.Equals(t, "URLString", ref.URLString(), "http://example.com")
+ ref = NewReference(&doc, nodeElem, "http://example.com/")
+ assert.Equals(t, "URLString", ref.URLString(), "http://example.com/")
+ ref = NewReference(&doc, nodeElem, "https://example.com")
+ assert.Equals(t, "URLString", ref.URLString(), "https://example.com")
+ ref = NewReference(&doc, nodeElem, "//example.com")
+ assert.Equals(t, "URLString", ref.URLString(), "https://example.com")
}
@@ -69,7 +77,7 @@ func TestReferenceIsInternalAbsolute(t *testing.T) {
doc := Document{
SitePath: "doc.html",
- HTMLNode: nodeDoc,
+ htmlNode: nodeDoc,
}
var ref *Reference
@@ -93,21 +101,21 @@ func TestReferenceAbsolutePath(t *testing.T) {
doc := Document{
SitePath: "doc.html",
Directory: "directory/subdir",
- HTMLNode: nodeDoc,
+ htmlNode: nodeDoc,
}
var ref *Reference
ref = NewReference(&doc, nodeElem, "/abc/page.html")
- assert.Equals(t, "internal absolute reference", ref.AbsolutePath(), "/abc/page.html")
+ assert.Equals(t, "internal absolute reference", ref.RefSitePath(), "/abc/page.html")
ref = NewReference(&doc, nodeElem, "/yyz")
- assert.Equals(t, "internal absolute reference", ref.AbsolutePath(), "/yyz")
+ assert.Equals(t, "internal absolute reference", ref.RefSitePath(), "/yyz")
ref = NewReference(&doc, nodeElem, "zzy")
- assert.Equals(t, "internal relative reference", ref.AbsolutePath(), "directory/subdir/zzy")
+ assert.Equals(t, "internal relative reference", ref.RefSitePath(), "directory/subdir/zzy")
ref = NewReference(&doc, nodeElem, "zzy/uup.jjr")
- assert.Equals(t, "internal relative reference", ref.AbsolutePath(), "directory/subdir/zzy/uup.jjr")
+ assert.Equals(t, "internal relative reference", ref.RefSitePath(), "directory/subdir/zzy/uup.jjr")
ref = NewReference(&doc, nodeElem, "./zzy/uup.jjr")
- assert.Equals(t, "internal relative reference", ref.AbsolutePath(), "directory/subdir/zzy/uup.jjr")
+ assert.Equals(t, "internal relative reference", ref.RefSitePath(), "directory/subdir/zzy/uup.jjr")
}
func TestURLStripQueryString(t *testing.T) {
diff --git a/htmltest/attr_test.go b/htmltest/attr_test.go
deleted file mode 100644
index a61a193..0000000
--- a/htmltest/attr_test.go
+++ /dev/null
@@ -1,29 +0,0 @@
-package htmltest
-
-import (
- "github.com/daviddengcn/go-assert"
- "golang.org/x/net/html"
- "strings"
- "testing"
-)
-
-func TestExtractAttrs(t *testing.T) {
- snip := ""
- nodeDoc, _ := html.Parse(strings.NewReader(snip))
- nodeImg := nodeDoc.FirstChild.FirstChild.NextSibling.FirstChild
- attrs := extractAttrs(nodeImg.Attr, []string{"src", "alt"})
-
- assert.Equals(t, "src", attrs["src"], "x")
- assert.Equals(t, "alt", attrs["alt"], "y")
- assert.NotEquals(t, "foo", attrs["foo"], "bar")
-}
-
-func TestAttrPresent(t *testing.T) {
- snip := ""
- nodeDoc, _ := html.Parse(strings.NewReader(snip))
- nodeImg := nodeDoc.FirstChild.FirstChild.NextSibling.FirstChild
-
- assert.Equals(t, "src in attr", attrPresent(nodeImg.Attr, "src"), true)
- assert.Equals(t, "alt in attr", attrPresent(nodeImg.Attr, "src"), true)
- assert.NotEquals(t, "foo in attr", attrPresent(nodeImg.Attr, "src"), false)
-}
diff --git a/htmltest/check-img.go b/htmltest/check-img.go
index d81338e..02e7281 100644
--- a/htmltest/check-img.go
+++ b/htmltest/check-img.go
@@ -8,11 +8,11 @@ import (
)
func (hT *HtmlTest) checkImg(document *htmldoc.Document, node *html.Node) {
- attrs := extractAttrs(node.Attr,
+ attrs := htmldoc.ExtractAttrs(node.Attr,
[]string{"src", "alt", hT.opts.IgnoreTagAttribute})
// Ignore if data-proofer-ignore set
- if attrPresent(node.Attr, hT.opts.IgnoreTagAttribute) {
+ if htmldoc.AttrPresent(node.Attr, hT.opts.IgnoreTagAttribute) {
return
}
@@ -20,13 +20,13 @@ func (hT *HtmlTest) checkImg(document *htmldoc.Document, node *html.Node) {
ref := htmldoc.NewReference(document, node, attrs["src"])
// Check alt present, fail if absent unless asked to ignore
- if !attrPresent(node.Attr, "alt") && !hT.opts.IgnoreAltMissing {
+ if !htmldoc.AttrPresent(node.Attr, "alt") && !hT.opts.IgnoreAltMissing {
hT.issueStore.AddIssue(issues.Issue{
Level: issues.ERROR,
Message: "alt attribute missing",
Reference: ref,
})
- } else if attrPresent(node.Attr, "alt") {
+ } else if htmldoc.AttrPresent(node.Attr, "alt") {
// Following checks require alt attr is present
if len(attrs["alt"]) == 0 {
// Check alt has length, fail if empty
@@ -47,7 +47,7 @@ func (hT *HtmlTest) checkImg(document *htmldoc.Document, node *html.Node) {
}
// Check src present, fail if absent
- if !attrPresent(node.Attr, "src") {
+ if !htmldoc.AttrPresent(node.Attr, "src") {
hT.issueStore.AddIssue(issues.Issue{
Level: issues.ERROR,
Message: "src attribute missing",
diff --git a/htmltest/check-img_test.go b/htmltest/check-img_test.go
index 3133847..5e44fb9 100644
--- a/htmltest/check-img_test.go
+++ b/htmltest/check-img_test.go
@@ -7,12 +7,14 @@ import (
func TestImageExternalWorking(t *testing.T) {
// passes for existing external images
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/images/existingImageExternal.html")
t_expectIssueCount(t, hT, 0)
}
func TestImageExternalMissing(t *testing.T) {
// fails for missing external images
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/images/missingImageExternal.html")
t_expectIssueCount(t, hT, 1)
// Issue contains "no such host"
@@ -21,12 +23,14 @@ func TestImageExternalMissing(t *testing.T) {
func TestImageExternalMissingProtocolValid(t *testing.T) {
// works for valid images missing the protocol
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/images/image_missing_protocol_valid.html")
t_expectIssueCount(t, hT, 0)
}
func TestImageExternalMissingProtocolInvalid(t *testing.T) {
// fails for invalid images missing the protocol
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/images/image_missing_protocol_invalid.html")
t_expectIssueCount(t, hT, 1)
// t_expectIssue(t, hT, message, 1)
@@ -34,12 +38,14 @@ func TestImageExternalMissingProtocolInvalid(t *testing.T) {
func TestImageExternalInsecureDefault(t *testing.T) {
// passes for HTTP images by default
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/images/src_http.html")
t_expectIssueCount(t, hT, 0)
}
func TestImageExternalInsecureOption(t *testing.T) {
// fails for HTTP images when asked
+ t_SkipShortExternal(t)
hT := t_testFileOpts("fixtures/images/src_http.html",
map[string]interface{}{"EnforceHTTPS": true})
t_expectIssueCount(t, hT, 1)
@@ -107,6 +113,7 @@ func TestImageSrcEmpty(t *testing.T) {
func TestImageSrcLineBreaks(t *testing.T) {
// deals with linebreaks in src
+ t_SkipShortExternal(t) // TODO use internal images
hT := t_testFile("fixtures/images/lineBreaks.html")
t_expectIssueCount(t, hT, 0)
}
diff --git a/htmltest/check-link.go b/htmltest/check-link.go
index a600a8a..f37841d 100644
--- a/htmltest/check-link.go
+++ b/htmltest/check-link.go
@@ -13,16 +13,16 @@ import (
)
func (hT *HtmlTest) checkLink(document *htmldoc.Document, node *html.Node) {
- attrs := extractAttrs(node.Attr,
+ attrs := htmldoc.ExtractAttrs(node.Attr,
[]string{"href", "rel", hT.opts.IgnoreTagAttribute})
// Ignore if data-proofer-ignore set
- if attrPresent(node.Attr, hT.opts.IgnoreTagAttribute) {
+ if htmldoc.AttrPresent(node.Attr, hT.opts.IgnoreTagAttribute) {
return
}
// Check if favicon
- if attrPresent(node.Attr, "rel") &&
+ if htmldoc.AttrPresent(node.Attr, "rel") &&
(attrs["rel"] == "icon" || attrs["rel"] == "shortcut icon") &&
node.Parent.Data == "head" {
document.State.FaviconPresent = true
@@ -32,7 +32,7 @@ func (hT *HtmlTest) checkLink(document *htmldoc.Document, node *html.Node) {
ref := htmldoc.NewReference(document, node, attrs["href"])
// Check for missing href, fail for link nodes
- if !attrPresent(node.Attr, "href") {
+ if !htmldoc.AttrPresent(node.Attr, "href") {
switch node.Data {
case "a":
hT.issueStore.AddIssue(issues.Issue{
@@ -86,6 +86,8 @@ func (hT *HtmlTest) checkLink(document *htmldoc.Document, node *html.Node) {
hT.checkExternal(ref)
case "file":
hT.checkInternal(ref)
+ case "self":
+ hT.checkInternalHash(ref)
case "mailto":
hT.checkMailto(ref)
case "tel":
@@ -103,7 +105,7 @@ func (hT *HtmlTest) checkExternal(ref *htmldoc.Reference) {
if !hT.opts.CheckExternal {
hT.issueStore.AddIssue(issues.Issue{
Level: issues.DEBUG,
- Message: "skipping",
+ Message: "skipping external check",
Reference: ref,
})
return
@@ -208,7 +210,7 @@ func (hT *HtmlTest) checkExternal(ref *htmldoc.Reference) {
Reference: ref,
})
default:
- attrs := extractAttrs(ref.Node.Attr, []string{"rel"})
+ attrs := htmldoc.ExtractAttrs(ref.Node.Attr, []string{"rel"})
if attrs["rel"] == "canonical" && hT.opts.IgnoreCanonicalBrokenLinks {
hT.issueStore.AddIssue(issues.Issue{
Level: issues.WARNING,
@@ -231,41 +233,96 @@ func (hT *HtmlTest) checkInternal(ref *htmldoc.Reference) {
if !hT.opts.CheckInternal {
hT.issueStore.AddIssue(issues.Issue{
Level: issues.DEBUG,
- Message: "skipping",
+ Message: "skipping internal check",
Reference: ref,
})
return
}
- // Resolve a filesystem path for reference
- refOsPath := path.Join(hT.opts.DirectoryPath, ref.AbsolutePath())
- hT.checkFile(ref, refOsPath)
+
+ // First lookup in document store,
+ refDoc, refExists := hT.documentStore.ResolveRef(ref)
+
+ if refExists {
+ // If path doesn't end in slash and the resolved ref is an index.html, complain
+ if ref.URL.Path[len(ref.URL.Path)-1] != '/' && path.Base(refDoc.SitePath) == hT.opts.DirectoryIndex {
+ hT.issueStore.AddIssue(issues.Issue{
+ Level: issues.ERROR,
+ Message: "target is a directory, href lacks trailing slash",
+ Reference: ref,
+ })
+ }
+ } else {
+ // If that fails attempt to lookup with filesystem, resolve a path and check
+ refOsPath := path.Join(hT.opts.DirectoryPath, ref.RefSitePath())
+ hT.checkFile(ref, refOsPath)
+ }
+
+ if len(ref.URL.Fragment) > 0 {
+ // Is also a hash link
+ hT.checkInternalHash(ref)
+ }
}
-func (hT *HtmlTest) checkFile(ref *htmldoc.Reference, absPath string) {
- f, err := os.Stat(absPath)
- if os.IsNotExist(err) {
+func (hT *HtmlTest) checkInternalHash(ref *htmldoc.Reference) {
+	if !hT.opts.CheckInternalHash {
 		hT.issueStore.AddIssue(issues.Issue{
-			Level:     issues.ERROR,
-			Message:   "target does not exist",
+			Level:     issues.DEBUG,
+			Message:   "skipping hash check",
 			Reference: ref,
 		})
 		return
 	}
-	checkErr(err) // Crash on other errors
-	if f.IsDir() {
-		if !strings.HasSuffix(ref.URL.Path, "/") && !hT.opts.IgnoreDirectoryMissingTrailingSlash {
+	// An empty fragment is reported here; the lookup below still runs afterwards
+	if len(ref.URL.Fragment) == 0 {
+		hT.issueStore.AddIssue(issues.Issue{
+			Level:     issues.ERROR,
+			Message:   "missing hash",
+			Reference: ref,
+		})
+	}
+
+	if len(ref.URL.Path) > 0 {
+		// internal: the target may not be a known document, so never call through a nil refDoc
+		refDoc, refExists := hT.documentStore.ResolveRef(ref)
+		if refExists && !refDoc.IsHashValid(ref.URL.Fragment) {
 			hT.issueStore.AddIssue(issues.Issue{
 				Level:     issues.ERROR,
-				Message:   "target is a directory, href lacks trailing slash",
+				Message:   "hash does not exist",
 				Reference: ref,
 			})
-			return
 		}
+	} else {
+		// self: the fragment is looked up in the referencing document itself
+		if !ref.Document.IsHashValid(ref.URL.Fragment) {
+			hT.issueStore.AddIssue(issues.Issue{
+				Level:     issues.ERROR,
+				Message:   "hash does not exist",
+				Reference: ref,
+			})
+		}
+	}
+}
- hT.checkFile(ref, path.Join(absPath, hT.opts.DirectoryIndex))
+func (hT *HtmlTest) checkFile(ref *htmldoc.Reference, absPath string) {
+ f, err := os.Stat(absPath)
+ if os.IsNotExist(err) {
+ hT.issueStore.AddIssue(issues.Issue{
+ Level: issues.ERROR,
+ Message: "target does not exist",
+ Reference: ref,
+ })
return
}
+ checkErr(err) // Crash on other errors
+
+ if f.IsDir() {
+ hT.issueStore.AddIssue(issues.Issue{
+ Level: issues.ERROR,
+ Message: "target is a directory, no index",
+ Reference: ref,
+ })
+ }
}
func (hT *HtmlTest) checkMailto(ref *htmldoc.Reference) {
diff --git a/htmltest/check-link_test.go b/htmltest/check-link_test.go
index 2432acc..82bd474 100644
--- a/htmltest/check-link_test.go
+++ b/htmltest/check-link_test.go
@@ -22,6 +22,7 @@ func TestAnchorIgnorable(t *testing.T) {
func TestAnchorExternalBroken(t *testing.T) {
// fails for broken external links
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/links/brokenLinkExternal.html")
t_expectIssueCount(t, hT, 1)
}
@@ -35,6 +36,7 @@ func TestAnchorExternalIgnore(t *testing.T) {
func TestAnchorExternalHashBrokenDefault(t *testing.T) {
// passes for broken external hashes by default
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/links/brokenHashOnTheWeb.html")
t_expectIssueCount(t, hT, 0)
}
@@ -42,6 +44,7 @@ func TestAnchorExternalHashBrokenDefault(t *testing.T) {
func TestAnchorExternalHashBrokenOption(t *testing.T) {
// fails for broken external hashes when asked
t.Skip("Not yet implemented")
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/links/brokenHashOnTheWeb.html")
t_expectIssueCount(t, hT, 1)
t_expectIssue(t, hT, "no such hash", 1)
@@ -51,24 +54,28 @@ func TestAnchorExternalCache(t *testing.T) {
// does not check links with parameters multiple times
// TODO check cache is being checked
t.Skip("Not yet implemented")
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/links/check_just_once.html")
t_expectIssueCount(t, hT, 0)
}
func TestAnchorExternalHrefMalformed(t *testing.T) {
// does not explode on bad external links in files
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/links/bad_external_links.html")
t_expectIssueCount(t, hT, 2)
}
func TestAnchorExternalInsecureDefault(t *testing.T) {
// passes for non-HTTPS links when not asked
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/links/non_https.html")
t_expectIssueCount(t, hT, 0)
}
func TestAnchorExternalInsecureOption(t *testing.T) {
// fails for non-HTTPS links when asked
+ t_SkipShortExternal(t)
hT := t_testFileOpts("fixtures/links/non_https.html",
map[string]interface{}{"EnforceHTTPS": true})
t_expectIssueCount(t, hT, 1)
@@ -77,12 +84,14 @@ func TestAnchorExternalInsecureOption(t *testing.T) {
func TestAnchorExternalHrefIP(t *testing.T) {
// fails for broken IP address links
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/links/ip_href.html")
t_expectIssueCount(t, hT, 2)
}
func TestAnchorExternalHrefIPTimeout(t *testing.T) {
// fails for broken IP address links
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/links/ip_timeout.html")
t_expectIssueCount(t, hT, 1)
t_expectIssue(t, hT, "request exceeded our ExternalTimeout", 1)
@@ -91,6 +100,7 @@ func TestAnchorExternalHrefIPTimeout(t *testing.T) {
func TestAnchorExternalFollowRedirects(t *testing.T) {
// should follow redirects
t.Skip("Need new link, times out")
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/links/linkWithRedirect.html")
t_expectIssueCount(t, hT, 0)
}
@@ -98,6 +108,7 @@ func TestAnchorExternalFollowRedirects(t *testing.T) {
func TestAnchorExternalFollowRedirectsDisabled(t *testing.T) {
// fails on redirects if not following
t.Skip("Not yet implemented, need new link, times out")
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/links/linkWithRedirect.html")
t_expectIssueCount(t, hT, 99)
t_expectIssue(t, hT, "PLACEHOLDER", 99)
@@ -105,18 +116,21 @@ func TestAnchorExternalFollowRedirectsDisabled(t *testing.T) {
func TestAnchorExternalHTTPS(t *testing.T) {
// should understand https
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/links/linkWithHttps.html")
t_expectIssueCount(t, hT, 0)
}
func TestAnchorExternalMissingProtocolValid(t *testing.T) {
// works for valid links missing the protocol
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/links/link_missing_protocol_valid.html")
t_expectIssueCount(t, hT, 0)
}
func TestAnchorExternalMissingProtocolInvalid(t *testing.T) {
// fails for invalid links missing the protocol
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/links/link_missing_protocol_invalid.html")
t_expectIssueCount(t, hT, 1)
// t_expectIssue(t, hT, "no such host", 1)
@@ -124,18 +138,21 @@ func TestAnchorExternalMissingProtocolInvalid(t *testing.T) {
func TestLinkExternalHrefPipes(t *testing.T) {
// works for pipes in the URL
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/links/escape_pipes.html")
t_expectIssueCount(t, hT, 0)
}
func TestAnchorExternalHrefNonstandardChars(t *testing.T) {
// passes non-standard characters
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/links/non_standard_characters.html")
t_expectIssueCount(t, hT, 0)
}
func TestAnchorExternalHrefUTF8(t *testing.T) {
// passes for external UTF-8 links
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/links/utf8Link.html")
t_expectIssueCount(t, hT, 0)
}
@@ -161,12 +178,40 @@ func TestAnchorInternalRelativeLinksBase(t *testing.T) {
t_expectIssueCount(t, hT, 0)
}
-func TestAnchorInternalHashBroken(t *testing.T) {
+func TestAnchorHashInternalValid(t *testing.T) {
+ // passes for valid internal hash
+ hT := t_testFile("fixtures/links/hashInternalOk.html")
+ t_expectIssueCount(t, hT, 0)
+}
+
+func TestAnchorHashInternalBroken(t *testing.T) {
// fails for broken internal hash
- t.Skip("Not yet implemented")
- hT := t_testFile("fixtures/links/brokenHashInternal.html")
- t_expectIssueCount(t, hT, 99)
- t_expectIssue(t, hT, "PLACEHOLDER", 99)
+ hT := t_testFile("fixtures/links/hashInternalBroken.html")
+ t_expectIssueCount(t, hT, 1)
+ t_expectIssue(t, hT, "hash does not exist", 1)
+}
+
+func TestAnchorHashSelfValid(t *testing.T) {
+ // passes for valid self hash
+ hT := t_testFile("fixtures/links/hashSelfOk.html")
+ t_expectIssueCount(t, hT, 0)
+}
+
+func TestAnchorHashSelfBroken(t *testing.T) {
+ // fails for broken self hash
+ hT := t_testFile("fixtures/links/hashSelfBroken.html")
+ t_expectIssueCount(t, hT, 1)
+ t_expectIssue(t, hT, "hash does not exist", 1)
+}
+
+func TestAnchorHashBrokenIgnore(t *testing.T) {
+ // passes for broken internal and self hashes when CheckInternalHash is disabled
+ hT1 := t_testFileOpts("fixtures/links/hashInternalBroken.html",
+ map[string]interface{}{"CheckInternalHash": false})
+ hT2 := t_testFileOpts("fixtures/links/hashSelfBroken.html",
+ map[string]interface{}{"CheckInternalHash": false})
+ t_expectIssueCount(t, hT1, 0)
+ t_expectIssueCount(t, hT2, 0)
}
func TestAnchorDirectoryRootResolve(t *testing.T) {
@@ -186,7 +231,7 @@ func TestAnchorDirectoryCustomRootBroken(t *testing.T) {
// fails if custom directory index file doesn't exist
hT := t_testFile("fixtures/links/link_pointing_to_directory.html")
t_expectIssueCount(t, hT, 1)
- t_expectIssue(t, hT, "target does not exist", 1)
+ t_expectIssue(t, hT, "target is a directory, no index", 1)
}
func TestAnchorDirectoryNoTrailingSlash(t *testing.T) {
@@ -196,7 +241,7 @@ func TestAnchorDirectoryNoTrailingSlash(t *testing.T) {
t_expectIssue(t, hT, "target is a directory, href lacks trailing slash", 1)
}
-func TestAnchorDirectoryNoTrailingSlashQueryHash(t *testing.T) {
+func TestAnchorDirectoryQueryHash(t *testing.T) {
// fails for internal linking to a directory without trailing slash
hT := t_testFile("fixtures/links/link_directory_with_slash_query_hash.html")
t_expectIssueCount(t, hT, 0)
@@ -208,6 +253,12 @@ func TestAnchorDirectoryHtmlExtension(t *testing.T) {
t_expectIssueCount(t, hT, 0)
}
+func TestAnchorDirectoryWithEncodedCharacters(t *testing.T) {
+ // passes for folder with encoded characters
+ hT := t_testFile("fixtures/links/linkToFolderWithSpace.html")
+ t_expectIssueCount(t, hT, 0)
+}
+
func TestAnchorInternalRootLink(t *testing.T) {
// properly checks links to root
hT := t_testFile("fixtures/links/rootLink/rootLink.html")
@@ -239,7 +290,8 @@ func TestAnchorInternalDashedAttrs(t *testing.T) {
}
func TestAnchorInternalCaseMismatch(t *testing.T) {
- // does not complain for internal links with mismatched cases
+ // does not complain for internal hash links with mismatched cases
+ t.Skip("Unsure on whether we should ignore case, pretty sure we shouldn't")
hT := t_testFile("fixtures/links/ignores_cases.html")
t_expectIssueCount(t, hT, 0)
}
@@ -348,12 +400,14 @@ func TestLinkHrefAbsent(t *testing.T) {
func TestLinkHrefBrokenCanonicalDefault(t *testing.T) {
// works for valid href within link elements
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/links/brokenCanonicalLink.html")
t_expectIssueCount(t, hT, 0)
}
func TestLinkHrefBrokenCanonicalOption(t *testing.T) {
// works for valid href within link elements
+ t_SkipShortExternal(t)
hT := t_testFileOpts("fixtures/links/brokenCanonicalLink.html",
map[string]interface{}{"IgnoreCanonicalBrokenLinks": false})
t_expectIssueCount(t, hT, 1)
diff --git a/htmltest/check-script.go b/htmltest/check-script.go
index af26c52..d14000d 100644
--- a/htmltest/check-script.go
+++ b/htmltest/check-script.go
@@ -7,11 +7,11 @@ import (
)
func (hT *HtmlTest) checkScript(document *htmldoc.Document, node *html.Node) {
- attrs := extractAttrs(node.Attr,
+ attrs := htmldoc.ExtractAttrs(node.Attr,
[]string{"src", hT.opts.IgnoreTagAttribute})
// Ignore if data-proofer-ignore set
- if attrPresent(node.Attr, hT.opts.IgnoreTagAttribute) {
+ if htmldoc.AttrPresent(node.Attr, hT.opts.IgnoreTagAttribute) {
return
}
@@ -19,7 +19,7 @@ func (hT *HtmlTest) checkScript(document *htmldoc.Document, node *html.Node) {
ref := htmldoc.NewReference(document, node, attrs["src"])
// Check src problems
- if attrPresent(node.Attr, "src") && len(attrs["src"]) == 0 {
+ if htmldoc.AttrPresent(node.Attr, "src") && len(attrs["src"]) == 0 {
// Check src has length, fail if empty
hT.issueStore.AddIssue(issues.Issue{
Level: issues.ERROR,
@@ -30,7 +30,7 @@ func (hT *HtmlTest) checkScript(document *htmldoc.Document, node *html.Node) {
}
// Check invalid content
- if !attrPresent(node.Attr, "src") && node.FirstChild == nil {
+ if !htmldoc.AttrPresent(node.Attr, "src") && node.FirstChild == nil {
hT.issueStore.AddIssue(issues.Issue{
Level: issues.ERROR,
Message: "script content missing / no src attribute",
diff --git a/htmltest/check-script_test.go b/htmltest/check-script_test.go
index 1382439..91400f6 100644
--- a/htmltest/check-script_test.go
+++ b/htmltest/check-script_test.go
@@ -8,12 +8,14 @@ import (
func TestScriptExternalSrcValid(t *testing.T) {
// passes for valid external src
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/scripts/script_valid_external.html")
t_expectIssueCount(t, hT, 0)
}
func TestScriptExternalSrcBroken(t *testing.T) {
// fails for broken external src
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/scripts/script_broken_external.html")
t_expectIssueCount(t, hT, 1)
// t_expectIssue(t, hT, "no such host", 1)
@@ -21,12 +23,14 @@ func TestScriptExternalSrcBroken(t *testing.T) {
func TestScriptExternalInsecureDefault(t *testing.T) {
// passes for HTTP scripts by default
+ t_SkipShortExternal(t)
hT := t_testFile("fixtures/scripts/scriptInsecure.html")
t_expectIssueCount(t, hT, 0)
}
func TestScriptExternalInsecureOption(t *testing.T) {
// fails for HTTP scripts when asked
+ t_SkipShortExternal(t)
hT := t_testFileOpts("fixtures/scripts/scriptInsecure.html",
map[string]interface{}{"EnforceHTTPS": true})
t_expectIssueCount(t, hT, 1)
diff --git a/htmltest/external_benchmark_test.go b/htmltest/external_benchmark_test.go
index d770405..7879357 100644
--- a/htmltest/external_benchmark_test.go
+++ b/htmltest/external_benchmark_test.go
@@ -8,6 +8,6 @@ import (
func BenchmarkExternal(b *testing.B) {
for i := 0; i < b.N; i++ {
t_testDirectoryOpts("/home/will/local/history-project/_site/",
- map[string]interface{}{"LogLevel": issues.NONE})
+ map[string]interface{}{"LogLevel": issues.INFO, "CheckExternal": false})
}
}
diff --git a/htmltest/fixtures/images/ignorableAltViaOptions.html b/htmltest/fixtures/images/ignorableAltViaOptions.html
index 35473d3..7b5a19a 100644
--- a/htmltest/fixtures/images/ignorableAltViaOptions.html
+++ b/htmltest/fixtures/images/ignorableAltViaOptions.html
@@ -5,9 +5,6 @@
Relative to self
-
-Blah blah blah.
-