From d5c20b25265275ad96a96491405035fddac367db Mon Sep 17 00:00:00 2001
From: Tom Thorogood
Date: Mon, 13 Feb 2017 11:44:59 +1030
Subject: [PATCH] Implement better compression for non-ascii (see Ed-von-Schleck/shoco#11)

---
 shoco.go      | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++
 shoco_test.go | 63 +++++++++++++++++++++++++++++++-------------------
 2 files changed, 110 insertions(+), 23 deletions(-)

diff --git a/shoco.go b/shoco.go
index 7b25dfa..b7ebc3a 100644
--- a/shoco.go
+++ b/shoco.go
@@ -53,6 +53,20 @@ func findBestEncoding(indices *[maxSuccessorN + 1]int16, nConsecutive int) int {
 // in must not contain any zero-bytes otherwise Decompress will
 // fail.
 func Compress(in []byte) (out []byte) {
+	return compress(in, false)
+}
+
+// ProposedCompress is like Compress, but literal bytes that are non-ASCII
+// or below 0x09 are written as length-prefixed runs, as proposed in
+// https://github.com/Ed-von-Schleck/shoco/issues/11, rather than each behind
+// a 0x00 sentinel. The result must be decoded with ProposedDecompress.
+func ProposedCompress(in []byte) (out []byte) {
+	return compress(in, true)
+}
+
+// compress implements Compress (proposed == false) and ProposedCompress
+// (proposed == true).
+func compress(in []byte, proposed bool) (out []byte) {
 	var buf bytes.Buffer
 	buf.Grow(len(in))
 
@@ -96,6 +110,30 @@ func Compress(in []byte) (out []byte) {
 			}
 		}
 
+		if proposed {
+			// See https://github.com/Ed-von-Schleck/shoco/issues/11
+			if in[0]&0x80 != 0 || in[0] < 0x09 {
+				// Collect a run of at most 9 such bytes. The run is
+				// prefixed with its length minus one, which must stay
+				// below 0x09 for decompress to recognise it as a run.
+				j := byte(1)
+				for ; int(j) < len(in) && j < 0x09; j++ {
+					if in[j]&0x80 == 0 && in[j] >= 0x09 {
+						break
+					}
+				}
+
+				buf.WriteByte(j - 1)
+				buf.Write(in[:j])
+				in = in[j:]
+			} else {
+				buf.WriteByte(in[0])
+				in = in[1:]
+			}
+
+			continue
+		}
+
 		if in[0]&0x80 != 0 { // non-ascii case
 			buf.WriteByte(0x00) // put in a sentinel byte
 		}
@@ -108,12 +146,44 @@ func Compress(in []byte) (out []byte) {
 }
 
 func Decompress(in []byte) (out []byte, err error) {
+	return decompress(in, false)
+}
+
+// ProposedDecompress decompresses data produced by ProposedCompress. It
+// returns ErrInvalid if a length-prefixed run is truncated.
+func ProposedDecompress(in []byte) (out []byte, err error) {
+	return decompress(in, true)
+}
+
+// decompress implements Decompress (proposed == false) and
+// ProposedDecompress (proposed == true).
+func decompress(in []byte, proposed bool) (out []byte, err error) {
 	var buf bytes.Buffer
 	buf.Grow(len(in) * 2)
 
 	for len(in) != 0 {
 		mark := decodeHeader(in[0])
 		if mark < 0 {
+			if proposed {
+				// See https://github.com/Ed-von-Schleck/shoco/issues/11
+				if in[0] < 0x09 {
+					// in[0] is the run length minus one; the marker
+					// byte plus j payload bytes must all be present.
+					j := in[0] + 1
+					if len(in) < 1+int(j) {
+						return nil, ErrInvalid
+					}
+
+					buf.Write(in[1 : 1+j])
+					in = in[1+j:]
+				} else {
+					buf.WriteByte(in[0])
+					in = in[1:]
+				}
+
+				continue
+			}
+
 			if in[0] == 0x00 { // ignore the sentinel value for non-ascii chars
 				if len(in) < 2 {
 					return nil, ErrInvalid
diff --git a/shoco_test.go b/shoco_test.go
index 777e423..202e1a4 100644
--- a/shoco_test.go
+++ b/shoco_test.go
@@ -14,16 +14,25 @@ import (
 	"testing/quick"
 )
 
-func compress(in string) string {
+func testCompress(in string, proposed bool) string {
+	if proposed {
+		return hex.EncodeToString(ProposedCompress([]byte(in)))
+	}
+
 	return hex.EncodeToString(Compress([]byte(in)))
 }
 
-func decompress(in string) (string, error) {
+func testDecompress(in string, proposed bool) (string, error) {
 	b, err := hex.DecodeString(in)
 	if err != nil {
 		return "", err
 	}
 
+	if proposed {
+		out, err := ProposedDecompress(b)
+		return string(out), err
+	}
+
 	out, err := Decompress(b)
 	return string(out), err
 }
@@ -32,31 +41,39 @@ func decompress(in string) (string, error) {
 // Array.from(shoco.compress("Übergrößenträger")).map(x => ('00' + x.toString(16)).slice(-2)).join('')
 // in the development console on https://ed-von-schleck.github.io/shoco/
 var testCases = []struct {
-	in, out string
+	in, out  string
+	proposed bool
 }{
-	{"", ""},
-	{"test", "c899"},
-	{"shoco", "a26fac"},
-	{"shoco is a C library to compress and decompress short strings. It is very fast and easy to use. The default compression model is optimized for english words, but you can generate your own compression model based on your specific input data.", "a26fac20892061204320a6df9b79209120d625ce1d20846420e70484a4737320d09a7420d07199732e2049742089207680792066867420846420658679209120ab652e20549420b86661aa7420d625ce1d698d20b6b86c2089206f70c8db7a8220668e20c04e896820d917732c20bf7420798c20af6e20e908906620798c72206f776e20d625ce1d698d20b6b86c20df5064208d20798c72207370656369666963208870a920dccc2e"},
-	{"shoco is free software, distributed under the MIT license.", "a26fac208920669c6520d11fd8182c20dc499ddeca6420d50072209065204d495420d2b16ea02e"},
-	{"Übergrößenträger", "00c3009cbc72677200c300b600c3009fc05e00c300a46780"},
-	{"Hello, 世界", "48c14d2c2000e400b8009600e70095008c"},
-	{"Go is an open source programming language that makes it easy to build simple, reliable, and efficient software.", "476f20892084206f708120d100ad20709e679f6ddac120d3817561676520c80920b56b83208a20658679209120bf696c6420d0dda42c20ce2a61bd652c20846420656666696369817420d11fd8182e"},
-	{"\u263a\u263b\u2639", "00e2009800ba00e2009800bb00e2009800b9"},
-	{"a\u263ab\u263bc\u2639d", "6100e2009800ba6200e2009800bb6300e2009800b964"},
-	{"1\u20002\u20013\u20024", "3100e2008000803200e2008000813300e20080008234"},
-	{"\u0250\u0250\u0250\u0250\u0250", "00c9009000c9009000c9009000c9009000c90090"},
-	{"\t\v\r\f\n\u0085\u00a0\u2000\u3000", "090b0d0c0a00c2008500c200a000e20080008000e300800080"},
-	{"abcçdefgğhıijklmnoöprsştuüvyz", "61626300c300a7b8666700c4009f6800c400b1696a6b6c6d6e6f00c300b670727300c5009f747500c300bc76797a"},
-	{"ÿøû", "00c300bf00c300b800c300bb"},
-	{"μ", "00ce00bc"},
-	{"μδ", "00ce00bc00ce00b4"},
-	{"\U0001f601", "00f0009f00980081"},
+	{"", "", false},
+	{"test", "c899", false},
+	{"shoco", "a26fac", false},
+	{"shoco is a C library to compress and decompress short strings. It is very fast and easy to use. The default compression model is optimized for english words, but you can generate your own compression model based on your specific input data.", "a26fac20892061204320a6df9b79209120d625ce1d20846420e70484a4737320d09a7420d07199732e2049742089207680792066867420846420658679209120ab652e20549420b86661aa7420d625ce1d698d20b6b86c2089206f70c8db7a8220668e20c04e896820d917732c20bf7420798c20af6e20e908906620798c72206f776e20d625ce1d698d20b6b86c20df5064208d20798c72207370656369666963208870a920dccc2e", false},
+	{"shoco is free software, distributed under the MIT license.", "a26fac208920669c6520d11fd8182c20dc499ddeca6420d50072209065204d495420d2b16ea02e", false},
+	{"Übergrößenträger", "00c3009cbc72677200c300b600c3009fc05e00c300a46780", false},
+	{"Hello, 世界", "48c14d2c2000e400b8009600e70095008c", false},
+	{"Go is an open source programming language that makes it easy to build simple, reliable, and efficient software.", "476f20892084206f708120d100ad20709e679f6ddac120d3817561676520c80920b56b83208a20658679209120bf696c6420d0dda42c20ce2a61bd652c20846420656666696369817420d11fd8182e", false},
+	{"\u263a\u263b\u2639", "00e2009800ba00e2009800bb00e2009800b9", false},
+	{"a\u263ab\u263bc\u2639d", "6100e2009800ba6200e2009800bb6300e2009800b964", false},
+	{"1\u20002\u20013\u20024", "3100e2008000803200e2008000813300e20080008234", false},
+	{"\u0250\u0250\u0250\u0250\u0250", "00c9009000c9009000c9009000c9009000c90090", false},
+	{"\t\v\r\f\n\u0085\u00a0\u2000\u3000", "090b0d0c0a00c2008500c200a000e20080008000e300800080", false},
+	{"abcçdefgğhıijklmnoöprsştuüvyz", "61626300c300a7b8666700c4009f6800c400b1696a6b6c6d6e6f00c300b670727300c5009f747500c300bc76797a", false},
+	{"ÿøû", "00c300bf00c300b800c300bb", false},
+	{"μ", "00ce00bc", false},
+	{"μδ", "00ce00bc00ce00b4", false},
+	{"\U0001f601", "00f0009f00980081", false},
+
+	// See https://github.com/Ed-von-Schleck/shoco/issues/11
+	{"μ", "01cebc", true},
+	{"μδ", "03cebcceb4", true},
+	{"\U0001f601", "03f09f9881", true},
+	// A run longer than 9 bytes must be split so every marker stays below 0x09.
+	{"\u0250\u0250\u0250\u0250\u0250", "08c990c990c990c990c90090", true},
 }
 
 func TestCompress(t *testing.T) {
 	for i, testCase := range testCases {
-		if out := compress(testCase.in); out != testCase.out {
+		if out := testCompress(testCase.in, testCase.proposed); out != testCase.out {
 			t.Errorf("failed for test case #%d", i)
 			t.Logf("got: %s", out)
 			t.Logf("expected: %s", testCase.out)
@@ -66,7 +83,7 @@ func TestCompress(t *testing.T) {
 
 func TestDecompress(t *testing.T) {
 	for i, testCase := range testCases {
-		in, err := decompress(testCase.out)
+		in, err := testDecompress(testCase.out, testCase.proposed)
 		if err != nil {
 			t.Errorf("failed for test case #%d", i)
 			t.Log(err)