Skip to content

Commit

Permalink
Add fallback-encoding per-repo option for non-utf8 text files
Browse files Browse the repository at this point in the history
  • Loading branch information
tgulacsi committed Apr 12, 2021
1 parent 60b8104 commit c210eb0
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 20 deletions.
1 change: 1 addition & 0 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ type Repo struct {
ExcludeDotFiles bool `json:"exclude-dot-files"`
EnablePollUpdates *bool `json:"enable-poll-updates"`
EnablePushUpdates *bool `json:"enable-push-updates"`
FallbackEncoding string `json:"fallback-encoding"`
}

// Used for interpreting the config value for fields that use *bool. If a value
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ go 1.13
require (
github.com/blang/semver v3.5.1+incompatible
github.com/go-bindata/go-bindata v3.1.2+incompatible // indirect
golang.org/x/text v0.3.5
)
3 changes: 3 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ github.com/blang/semver v3.5.1+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnweb
github.com/go-bindata/go-bindata v1.0.0 h1:DZ34txDXWn1DyWa+vQf7V9ANc2ILTtrEjtlsdJRF26M=
github.com/go-bindata/go-bindata v3.1.2+incompatible h1:5vjJMVhowQdPzjE1LdxyFF7YFTXg5IgGVW4gBr5IbvE=
github.com/go-bindata/go-bindata v3.1.2+incompatible/go.mod h1:xK8Dsgwmeed+BBsSy2XTopBn/8uK2HWuGSnA11C3Joo=
golang.org/x/text v0.3.5 h1:i6eZZ+zk0SOf0xgBpEpPD18qWcJda6q1sxt3S0kzyUQ=
golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
57 changes: 38 additions & 19 deletions index/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,14 @@ import (

"github.com/hound-search/hound/codesearch/index"
"github.com/hound-search/hound/codesearch/regexp"
"golang.org/x/text/encoding"
)

const (
matchLimit = 5000
manifestFilename = "metadata.gob"
excludedFileJsonFilename = "excluded_files.json"
filePeekSize = 2048
filePeekSize = 1 << 20
)

const (
Expand All @@ -38,6 +39,7 @@ type Index struct {
// IndexOptions carries the per-repository settings consulted while an index
// is being built.
type IndexOptions struct {
// ExcludeDotFiles presumably makes the indexer skip dot-prefixed entries —
// NOTE(review): the code that reads this flag is not visible here; confirm.
ExcludeDotFiles bool
// SpecialFiles is filled in by newSearcher from wd.SpecialFiles().
SpecialFiles []string
// FallbackEnc, when non-nil, is the encoding whose decoder wraps the file
// reader for text files that are not valid UTF-8. When nil, such files are
// read without any decoding.
FallbackEnc encoding.Encoding
}

type SearchOptions struct {
Expand Down Expand Up @@ -236,34 +238,34 @@ func (n *Index) Search(pat string, opt *SearchOptions) (*SearchResponse, error)
Matches: results,
FilesWithMatch: filesFound,
FilesOpened: filesOpened,
Duration: time.Now().Sub(startedAt), //nolint
Duration: time.Now().Sub(startedAt), //nolint
Revision: n.Ref.Rev,
}, nil
}

func isTextFile(filename string) (bool, error) {
func isTextFile(filename string) (isText bool, isUTF8 bool, err error) {
buf := make([]byte, filePeekSize)
r, err := os.Open(filename)
if err != nil {
return false, err
return false, false, err
}
defer r.Close()

n, err := io.ReadFull(r, buf)
if err != nil && err != io.ErrUnexpectedEOF && err != io.EOF {
return false, err
return false, false, err
}

buf = buf[:n]

if n < filePeekSize {
// read the whole file, must be valid.
return utf8.Valid(buf), nil
if n < filePeekSize && utf8.Valid(buf) || // read the whole file, must be valid.
n >= filePeekSize && validUTF8IgnoringPartialTrailingRune(buf) { // read a prefix, allow trailing partial runes.
return true, true, nil
}

// read a prefix, allow trailing partial runes.
return validUTF8IgnoringPartialTrailingRune(buf), nil

if isBinary(buf) {
return false, false, nil
}
return true, false, nil
}

// Determines if the buffer contains valid UTF8 encoded string data. The buffer is assumed
Expand Down Expand Up @@ -292,17 +294,30 @@ func validUTF8IgnoringPartialTrailingRune(p []byte) bool {
return true
}

func addFileToIndex(ix *index.IndexWriter, dst, src, path string) (string, error) {
func isBinary(p []byte) bool {
for _, c := range p {
if c < 10 {
return true
}
}
return false
}

func addFileToIndex(ix *index.IndexWriter, dst, src, path string, enc encoding.Encoding) (string, error) {
rel, err := filepath.Rel(src, path)
if err != nil {
return "", err
}

r, err := os.Open(path)
fh, err := os.Open(path)
if err != nil {
return "", err
}
defer r.Close()
defer fh.Close()
r := io.Reader(fh)
if enc != nil {
r = enc.NewDecoder().Reader(r)
}

dup := filepath.Join(dst, "raw", rel)
w, err := os.Create(dup)
Expand Down Expand Up @@ -364,7 +379,7 @@ func indexAllFiles(opt *IndexOptions, dst, src string) error {
}
defer fileHandle.Close()

if err := filepath.Walk(src, func(path string, info os.FileInfo, err error) error { //nolint
if err := filepath.Walk(src, func(path string, info os.FileInfo, err error) error { //nolint
name := info.Name()
rel, err := filepath.Rel(src, path)
if err != nil {
Expand Down Expand Up @@ -404,20 +419,24 @@ func indexAllFiles(opt *IndexOptions, dst, src string) error {
return nil
}

txt, err := isTextFile(path)
isText, isUTF8, err := isTextFile(path)
if err != nil {
return err
}

if !txt {
if !isText {
excluded = append(excluded, &ExcludedFile{
rel,
reasonNotText,
})
return nil
}
var enc encoding.Encoding
if !isUTF8 {
enc = opt.FallbackEnc
}

reasonForExclusion, err := addFileToIndex(ix, dst, src, path)
reasonForExclusion, err := addFileToIndex(ix, dst, src, path, enc)
if err != nil {
return err
}
Expand Down
8 changes: 7 additions & 1 deletion searcher/searcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/hound-search/hound/config"
"github.com/hound-search/hound/index"
"github.com/hound-search/hound/vcs"
"golang.org/x/text/encoding/htmlindex"
)

type Searcher struct {
Expand Down Expand Up @@ -264,7 +265,7 @@ func reportOnMemory() {
// Utility function for producing a hex encoded sha1 hash for a string.
func hashFor(name string) string {
h := sha1.New()
h.Write([]byte(name)) //nolint
h.Write([]byte(name)) //nolint
return hex.EncodeToString(h.Sum(nil))
}

Expand Down Expand Up @@ -411,6 +412,11 @@ func newSearcher(
ExcludeDotFiles: repo.ExcludeDotFiles,
SpecialFiles: wd.SpecialFiles(),
}
if repo.FallbackEncoding != "" {
if opt.FallbackEnc, err = htmlindex.Get(repo.FallbackEncoding); err != nil {
return nil, fmt.Errorf("%s.fallback-encoding=%q: %w", name, repo.FallbackEncoding, err)
}
}

rev, err := wd.PullOrClone(vcsDir, repo.Url)
if err != nil {
Expand Down

0 comments on commit c210eb0

Please sign in to comment.