Skip to content

Commit

Permalink
chore(grep): do not cut words in excerpt
Browse files Browse the repository at this point in the history
  • Loading branch information
Daniel Dabrowski committed Jun 4, 2019
1 parent 420d18a commit 61b5198
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 19 deletions.
38 changes: 31 additions & 7 deletions cli/grep.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ import (
thisS3 "github.com/dabdada/s3-grep/s3"
)

// MAX_EXCERPT_LENGTH is the line length, in bytes, up to which
// getContentExcerpt returns a line unchanged. Longer lines are shortened to
// an excerpt whose size is derived from this value.
var MAX_EXCERPT_LENGTH = 120

type grepResult struct {
Key string
LineNum int
Expand Down Expand Up @@ -40,7 +42,7 @@ func Grep(session *config.AWSSession, bucketName string, query string, ignoreCas
for {
select {
case result := <-results:
fmt.Printf("s3://%s/%s %s:%d\n", bucketName, result.Key, result.Excerpt, result.LineNum)
fmt.Printf("s3://%s/%s %d:%s\n", bucketName, result.Key, result.LineNum, result.Excerpt)
case i := <-done:
finished += i
default:
Expand Down Expand Up @@ -95,14 +97,36 @@ func grepInObjectContent(session *config.AWSSession, bucketName string, objects
done <- 1
}

// getContentExcerpt returns the part of text surrounding the first occurrence
// of query.
//
// If text is at most MAX_EXCERPT_LENGTH bytes long, the whole text is
// returned. Otherwise a window of roughly MAX_EXCERPT_LENGTH bytes around the
// match is selected and both ends are moved forward to the next space so that
// words are not cut in half. The match itself is never trimmed: when no space
// exists between the window start and the match, the start is left where it
// is, and a query longer than MAX_EXCERPT_LENGTH is returned with no extra
// context instead of being truncated.
//
// If query does not occur in text, the excerpt is taken from the start of
// text. The returned slice shares its backing array with text.
func getContentExcerpt(text []byte, query []byte) []byte {
	textLength := len(text)
	if textLength <= MAX_EXCERPT_LENGTH {
		return text
	}

	queryLength := len(query)
	// Context budget on each side of the match. Clamp at zero so an over-long
	// query cannot produce a window that starts inside the match.
	padding := int(math.Max(float64((MAX_EXCERPT_LENGTH-queryLength)/2), 0))

	index := bytes.Index(text, query)
	if index < 0 {
		// No match: fall back to an excerpt anchored at the start of text.
		index = 0
	}

	from := int(math.Max(float64(index-padding), 0))
	// Do not cut in the middle of words, but never move past the match start.
	if from < index {
		if text[from] == ' ' {
			from++
		} else if from != 0 {
			// Only look for a word boundary in front of the match; searching
			// the whole remainder could skip the match or overshoot `to`.
			if offset := bytes.IndexByte(text[from:index], ' '); offset >= 0 {
				from += offset + 1
			}
		}
	}

	to := int(math.Min(float64(index+queryLength+padding), float64(textLength)))
	if to < textLength {
		// Extend to the end of the current word (or to the end of text).
		if offset := bytes.IndexByte(text[to:], ' '); offset >= 0 {
			to += offset
		} else {
			to = textLength
		}
	}

	return text[from:to]
}
Expand Down
36 changes: 24 additions & 12 deletions cli/grep_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,22 +159,34 @@ func TestGetContentExcerpt(t *testing.T) {
}{
{"starts with query", []byte("someThing"), []byte("some"), []byte("someThing")},
{
"query in the middle but not enough chars before",
[]byte("someThing"),
[]byte("Thing"),
[]byte("someThing"),
"query in the middle but not more than MAX_EXCERPT_LENGTH chars in text",
[]byte("Bounty tackle nipper red ensign execution dock Sail ho spirits hail-shot scourge"),
[]byte("dock"),
[]byte("Bounty tackle nipper red ensign execution dock Sail ho spirits hail-shot scourge"),
},
{
"query in the middle not enough chars to the left and right",
[]byte("someThing"),
[]byte("meT"),
[]byte("someThing"),
"query in the middle not enough chars to the left",
[]byte("Bounty tackle nipper red ensign execution dock Sail ho spirits hail-shot scourge of the seven seas barkadeer booty keel hands provost loaded to the gunwalls"),
[]byte("nipper"),
[]byte("Bounty tackle nipper red ensign execution dock Sail ho spirits hail-shot scourge"),
},
{
"query in the middle not enough chars to the right",
[]byte("Bounty tackle nipper red ensign execution dock Sail ho spirits hail-shot scourge of the seven seas barkadeer booty keel hands provost loaded to the gunwalls"),
[]byte("barkadeer"),
[]byte("Sail ho spirits hail-shot scourge of the seven seas barkadeer booty keel hands provost loaded to the gunwalls"),
},
{
"more than enough chars right and left of the query",
[]byte("someThingSuperLongAndWeirdOnlyForTesting"),
[]byte("Long"),
[]byte("ThingSuperLongAndWeirdOn"),
[]byte("Bounty tackle nipper red ensign execution dock Sail ho spirits hail-shot scourge of the seven seas barkadeer booty keel hands provost loaded to the gunwalls"),
[]byte("shot"),
[]byte("nipper red ensign execution dock Sail ho spirits hail-shot scourge of the seven seas barkadeer booty keel hands provost"),
},
{
"more than enough chars right and left of the query, find a space in from index",
[]byte("Bounty tackle nipper red ensign execution dock Sail ho spirits hail-shot scourge of the seven seas barkadeer booty keel hands provost loaded to the gunwalls"),
[]byte("even"),
[]byte("execution dock Sail ho spirits hail-shot scourge of the seven seas barkadeer booty keel hands provost loaded to the gunwalls"),
},
}

Expand All @@ -184,7 +196,7 @@ func TestGetContentExcerpt(t *testing.T) {
actual := getContentExcerpt(tt.text, tt.query)

if !bytes.Equal(tt.expected, actual) {
t.Errorf("expected excerpt is '%s' but actual was %s", tt.expected, actual)
t.Errorf("expected excerpt is '%s' but actual was '%s'", tt.expected, actual)
}
})
}
Expand Down

0 comments on commit 61b5198

Please sign in to comment.