Skip to content

Commit

Permalink
Implement better compression for non-ascii (see Ed-von-Schleck/shoco#11)
Browse files Browse the repository at this point in the history
  • Loading branch information
tmthrgd committed Feb 13, 2017
1 parent 5a2245a commit d5c20b2
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 23 deletions.
55 changes: 55 additions & 0 deletions shoco.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,14 @@ func findBestEncoding(indices *[maxSuccessorN + 1]int16, nConsecutive int) int {
// in must not contain any zero-bytes otherwise Decompress will
// fail.
func Compress(in []byte) (out []byte) {
	// The plain entry point keeps the 0x00-sentinel encoding for
	// non-ASCII bytes; the length-prefixed variant is opt-in only.
	const proposed = false
	return compress(in, proposed)
}

// ProposedCompress compresses in like Compress, except that bytes which
// cannot be emitted verbatim (high bit set, or below 0x09) are written as
// length-prefixed runs instead of being escaped one by one with a 0x00
// sentinel. See https://github.com/Ed-von-Schleck/shoco/issues/11.
//
// The result is intended to be decoded with ProposedDecompress.
func ProposedCompress(in []byte) (out []byte) {
	const proposed = true
	return compress(in, proposed)
}

func compress(in []byte, proposed bool) (out []byte) {
var buf bytes.Buffer
buf.Grow(len(in))

Expand Down Expand Up @@ -96,6 +104,27 @@ func Compress(in []byte) (out []byte) {
}
}

if proposed {
	// See https://github.com/Ed-von-Schleck/shoco/issues/11
	//
	// Bytes with the high bit set, or below 0x09, cannot be emitted
	// verbatim because the decoder would read them as a pack header or
	// a run-length header. They are grouped into a run and written as
	// one header byte (run length - 1) followed by the raw bytes.
	if in[0]&0x80 != 0 || in[0] < 0x09 {
		// The decoder only treats header bytes strictly below 0x09 as
		// run lengths, so a run may hold at most 9 bytes (header 0x08).
		// Allowing a 10th byte would emit header 0x09, which decodes as
		// a literal tab and corrupts the stream.
		n := 1
		for n < len(in) && n < 0x09 && (in[n]&0x80 != 0 || in[n] < 0x09) {
			n++
		}

		buf.WriteByte(byte(n - 1))
		buf.Write(in[:n])
		in = in[n:]
	} else {
		// Plain ASCII at or above 0x09 is copied through unchanged.
		buf.WriteByte(in[0])
		in = in[1:]
	}

	continue
}

if in[0]&0x80 != 0 { // non-ascii case
buf.WriteByte(0x00) // put in a sentinel byte
}
Expand All @@ -108,12 +137,38 @@ func Compress(in []byte) (out []byte) {
}

// Decompress decodes data in the 0x00-sentinel format, where each
// non-ASCII byte is preceded by a 0x00 byte. It returns ErrInvalid when the
// input is malformed.
func Decompress(in []byte) (out []byte, err error) {
	const proposed = false
	return decompress(in, proposed)
}

// ProposedDecompress decodes data in the length-prefixed format produced by
// ProposedCompress, where a header byte below 0x09 announces a run of
// header+1 raw bytes. It returns ErrInvalid when the input is malformed.
// See https://github.com/Ed-von-Schleck/shoco/issues/11.
func ProposedDecompress(in []byte) (out []byte, err error) {
	const proposed = true
	return decompress(in, proposed)
}

func decompress(in []byte, proposed bool) (out []byte, err error) {
var buf bytes.Buffer
buf.Grow(len(in) * 2)

for len(in) != 0 {
mark := decodeHeader(in[0])
if mark < 0 {
if proposed {
	// See https://github.com/Ed-von-Schleck/shoco/issues/11
	//
	// A header byte below 0x09 announces a run of header+1 raw bytes
	// that follow it; any other unpacked byte is a literal.
	if in[0] < 0x09 {
		n := int(in[0]) + 1

		// The header byte itself occupies in[0], so the run needs
		// 1+n bytes in total. Checking only len(in) < n would let a
		// truncated input slice past the end of in.
		if len(in) < 1+n {
			return nil, ErrInvalid
		}

		buf.Write(in[1 : 1+n])
		in = in[1+n:]
	} else {
		buf.WriteByte(in[0])
		in = in[1:]
	}

	continue
}

if in[0] == 0x00 { // ignore the sentinel value for non-ascii chars
if len(in) < 2 {
return nil, ErrInvalid
Expand Down
61 changes: 38 additions & 23 deletions shoco_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,25 @@ import (
"testing/quick"
)

func compress(in string) string {
func testCompress(in string, proposed bool) string {
if proposed {
return hex.EncodeToString(ProposedCompress([]byte(in)))
}

return hex.EncodeToString(Compress([]byte(in)))
}

func decompress(in string) (string, error) {
func testDecompress(in string, proposed bool) (string, error) {
b, err := hex.DecodeString(in)
if err != nil {
return "", err
}

if proposed {
out, err := ProposedDecompress(b)
return string(out), err
}

out, err := Decompress(b)
return string(out), err
}
Expand All @@ -32,31 +41,37 @@ func decompress(in string) (string, error) {
// Array.from(shoco.compress("Übergrößenträger")).map(x => ('00' + x.toString(16)).slice(-2)).join('')
// in the development console on https://ed-von-schleck.github.io/shoco/
var testCases = []struct {
in, out string
in, out string
proposed bool
}{
{"", ""},
{"test", "c899"},
{"shoco", "a26fac"},
{"shoco is a C library to compress and decompress short strings. It is very fast and easy to use. The default compression model is optimized for english words, but you can generate your own compression model based on your specific input data.", "a26fac20892061204320a6df9b79209120d625ce1d20846420e70484a4737320d09a7420d07199732e2049742089207680792066867420846420658679209120ab652e20549420b86661aa7420d625ce1d698d20b6b86c2089206f70c8db7a8220668e20c04e896820d917732c20bf7420798c20af6e20e908906620798c72206f776e20d625ce1d698d20b6b86c20df5064208d20798c72207370656369666963208870a920dccc2e"},
{"shoco is free software, distributed under the MIT license.", "a26fac208920669c6520d11fd8182c20dc499ddeca6420d50072209065204d495420d2b16ea02e"},
{"Übergrößenträger", "00c3009cbc72677200c300b600c3009fc05e00c300a46780"},
{"Hello, 世界", "48c14d2c2000e400b8009600e70095008c"},
{"Go is an open source programming language that makes it easy to build simple, reliable, and efficient software.", "476f20892084206f708120d100ad20709e679f6ddac120d3817561676520c80920b56b83208a20658679209120bf696c6420d0dda42c20ce2a61bd652c20846420656666696369817420d11fd8182e"},
{"\u263a\u263b\u2639", "00e2009800ba00e2009800bb00e2009800b9"},
{"a\u263ab\u263bc\u2639d", "6100e2009800ba6200e2009800bb6300e2009800b964"},
{"1\u20002\u20013\u20024", "3100e2008000803200e2008000813300e20080008234"},
{"\u0250\u0250\u0250\u0250\u0250", "00c9009000c9009000c9009000c9009000c90090"},
{"\t\v\r\f\n\u0085\u00a0\u2000\u3000", "090b0d0c0a00c2008500c200a000e20080008000e300800080"},
{"abcçdefgğhıijklmnoöprsştuüvyz", "61626300c300a7b8666700c4009f6800c400b1696a6b6c6d6e6f00c300b670727300c5009f747500c300bc76797a"},
{"ÿøû", "00c300bf00c300b800c300bb"},
{"μ", "00ce00bc"},
{"μδ", "00ce00bc00ce00b4"},
{"\U0001f601", "00f0009f00980081"},
{"", "", false},
{"test", "c899", false},
{"shoco", "a26fac", false},
{"shoco is a C library to compress and decompress short strings. It is very fast and easy to use. The default compression model is optimized for english words, but you can generate your own compression model based on your specific input data.", "a26fac20892061204320a6df9b79209120d625ce1d20846420e70484a4737320d09a7420d07199732e2049742089207680792066867420846420658679209120ab652e20549420b86661aa7420d625ce1d698d20b6b86c2089206f70c8db7a8220668e20c04e896820d917732c20bf7420798c20af6e20e908906620798c72206f776e20d625ce1d698d20b6b86c20df5064208d20798c72207370656369666963208870a920dccc2e", false},
{"shoco is free software, distributed under the MIT license.", "a26fac208920669c6520d11fd8182c20dc499ddeca6420d50072209065204d495420d2b16ea02e", false},
{"Übergrößenträger", "00c3009cbc72677200c300b600c3009fc05e00c300a46780", false},
{"Hello, 世界", "48c14d2c2000e400b8009600e70095008c", false},
{"Go is an open source programming language that makes it easy to build simple, reliable, and efficient software.", "476f20892084206f708120d100ad20709e679f6ddac120d3817561676520c80920b56b83208a20658679209120bf696c6420d0dda42c20ce2a61bd652c20846420656666696369817420d11fd8182e", false},
{"\u263a\u263b\u2639", "00e2009800ba00e2009800bb00e2009800b9", false},
{"a\u263ab\u263bc\u2639d", "6100e2009800ba6200e2009800bb6300e2009800b964", false},
{"1\u20002\u20013\u20024", "3100e2008000803200e2008000813300e20080008234", false},
{"\u0250\u0250\u0250\u0250\u0250", "00c9009000c9009000c9009000c9009000c90090", false},
{"\t\v\r\f\n\u0085\u00a0\u2000\u3000", "090b0d0c0a00c2008500c200a000e20080008000e300800080", false},
{"abcçdefgğhıijklmnoöprsştuüvyz", "61626300c300a7b8666700c4009f6800c400b1696a6b6c6d6e6f00c300b670727300c5009f747500c300bc76797a", false},
{"ÿøû", "00c300bf00c300b800c300bb", false},
{"μ", "00ce00bc", false},
{"μδ", "00ce00bc00ce00b4", false},
{"\U0001f601", "00f0009f00980081", false},

// See https://github.com/Ed-von-Schleck/shoco/issues/11
{"μ", "01cebc", true},
{"μδ", "03cebcceb4", true},
{"\U0001f601", "03f09f9881", true},
}

func TestCompress(t *testing.T) {
for i, testCase := range testCases {
if out := compress(testCase.in); out != testCase.out {
if out := testCompress(testCase.in, testCase.proposed); out != testCase.out {
t.Errorf("failed for test case #%d", i)
t.Logf("got: %s", out)
t.Logf("expected: %s", testCase.out)
Expand All @@ -66,7 +81,7 @@ func TestCompress(t *testing.T) {

func TestDecompress(t *testing.T) {
for i, testCase := range testCases {
in, err := decompress(testCase.out)
in, err := testDecompress(testCase.out, testCase.proposed)
if err != nil {
t.Errorf("failed for test case #%d", i)
t.Log(err)
Expand Down

0 comments on commit d5c20b2

Please sign in to comment.