diff --git a/go.mod b/go.mod
index ce02a7f..b9e34c4 100644
--- a/go.mod
+++ b/go.mod
@@ -5,3 +5,5 @@
 module github.com/nlpodyssey/gopickle
 
 go 1.15
+
+require golang.org/x/text v0.14.0
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..4303ed2
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,32 @@
+github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
+golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
+golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
+golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
+golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
+golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
+golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
+golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
+golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
diff --git a/pickle/pickle.go b/pickle/pickle.go
index b7501db..9ac0269 100644
--- a/pickle/pickle.go
+++ b/pickle/pickle.go
@@ -134,6 +134,11 @@ func (u *Unpickler) findClass(module, name string) (interface{}, error) {
 		case "object":
 			return &types.ObjectClass{}, nil
 		}
+	case "array":
+		switch name {
+		case "_array_reconstructor":
+			return &types.Array{}, nil
+		}
 	case "copy_reg":
 		switch name {
 		case "_reconstructor":
diff --git a/pickle/pickle_test.go b/pickle/pickle_test.go
index 02a98af..fade21a 100644
--- a/pickle/pickle_test.go
+++ b/pickle/pickle_test.go
@@ -6,11 +6,12 @@ package pickle
 
 import (
 	"fmt"
-	"github.com/nlpodyssey/gopickle/types"
 	"math/big"
 	"reflect"
 	"strings"
 	"testing"
+
+	"github.com/nlpodyssey/gopickle/types"
 )
 
 func TestNoneP1(t *testing.T) {
@@ -649,6 +650,108 @@ func TestFindClass(t *testing.T) {
 	}
 }
 
+// TestP4Carray checks that protocol-4 pickles of array.array values are
+// decoded into the matching Go slices, one sub-test per typecode.
+func TestP4Carray(t *testing.T) {
+	for _, tc := range []struct {
+		name string
+		pkl  string
+		want interface{}
+	}{
+		{
+			// pickle.dumps(array.array("b", [0,1,2,-3]), protocol=4)
+			name: "b",
+			pkl:  "\x80\x04\x95F\x00\x00\x00\x00\x00\x00\x00\x8c\x05array\x94\x8c\x14_array_reconstructor\x94\x93\x94(\x8c\x05array\x94\x8c\x05array\x94\x93\x94\x8c\x01b\x94K\x01C\x04\x00\x01\x02\xfd\x94t\x94R\x94.",
+			want: []int8{0, 1, 2, -3},
+		},
+		{
+			// pickle.dumps(array.array("h", [0,1,2,-3]), protocol=4)
+			name: "h",
+			pkl:  "\x80\x04\x95J\x00\x00\x00\x00\x00\x00\x00\x8c\x05array\x94\x8c\x14_array_reconstructor\x94\x93\x94(\x8c\x05array\x94\x8c\x05array\x94\x93\x94\x8c\x01h\x94K\x04C\x08\x00\x00\x01\x00\x02\x00\xfd\xff\x94t\x94R\x94.",
+			want: []int16{0, 1, 2, -3},
+		},
+		{
+			// pickle.dumps(array.array("i", [0,1,2,-3]), protocol=4)
+			name: "i",
+			pkl:  "\x80\x04\x95R\x00\x00\x00\x00\x00\x00\x00\x8c\x05array\x94\x8c\x14_array_reconstructor\x94\x93\x94(\x8c\x05array\x94\x8c\x05array\x94\x93\x94\x8c\x01i\x94K\x08C\x10\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\xfd\xff\xff\xff\x94t\x94R\x94.",
+			want: []int32{0, 1, 2, -3},
+		},
+		{
+			// pickle.dumps(array.array("l", [0,1,2,-3]), protocol=4)
+			name: "l",
+			pkl:  "\x80\x04\x95b\x00\x00\x00\x00\x00\x00\x00\x8c\x05array\x94\x8c\x14_array_reconstructor\x94\x93\x94(\x8c\x05array\x94\x8c\x05array\x94\x93\x94\x8c\x01l\x94K\x0cC \x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\xfd\xff\xff\xff\xff\xff\xff\xff\x94t\x94R\x94.",
+			want: []int64{0, 1, 2, -3},
+		},
+		{
+			// pickle.dumps(array.array("q", [0,1,2,-3]), protocol=4)
+			name: "q",
+			pkl:  "\x80\x04\x95b\x00\x00\x00\x00\x00\x00\x00\x8c\x05array\x94\x8c\x14_array_reconstructor\x94\x93\x94(\x8c\x05array\x94\x8c\x05array\x94\x93\x94\x8c\x01q\x94K\x0cC \x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\xfd\xff\xff\xff\xff\xff\xff\xff\x94t\x94R\x94.",
+			want: []int64{0, 1, 2, -3},
+		},
+		{
+			// pickle.dumps(array.array("B", [0,1,2,3]), protocol=4)
+			name: "B",
+			pkl:  "\x80\x04\x95F\x00\x00\x00\x00\x00\x00\x00\x8c\x05array\x94\x8c\x14_array_reconstructor\x94\x93\x94(\x8c\x05array\x94\x8c\x05array\x94\x93\x94\x8c\x01B\x94K\x00C\x04\x00\x01\x02\x03\x94t\x94R\x94.",
+			want: []uint8{0, 1, 2, 3},
+		},
+		{
+			// pickle.dumps(array.array("H", [0,1,2,3]), protocol=4)
+			name: "H",
+			pkl:  "\x80\x04\x95J\x00\x00\x00\x00\x00\x00\x00\x8c\x05array\x94\x8c\x14_array_reconstructor\x94\x93\x94(\x8c\x05array\x94\x8c\x05array\x94\x93\x94\x8c\x01H\x94K\x02C\x08\x00\x00\x01\x00\x02\x00\x03\x00\x94t\x94R\x94.",
+			want: []uint16{0, 1, 2, 3},
+		},
+		{
+			// pickle.dumps(array.array("I", [0,1,2,3]), protocol=4)
+			name: "I",
+			pkl:  "\x80\x04\x95R\x00\x00\x00\x00\x00\x00\x00\x8c\x05array\x94\x8c\x14_array_reconstructor\x94\x93\x94(\x8c\x05array\x94\x8c\x05array\x94\x93\x94\x8c\x01I\x94K\x06C\x10\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00\x94t\x94R\x94.",
+			want: []uint32{0, 1, 2, 3},
+		},
+		{
+			// pickle.dumps(array.array("L", [0,1,2,3]), protocol=4)
+			name: "L",
+			pkl:  "\x80\x04\x95b\x00\x00\x00\x00\x00\x00\x00\x8c\x05array\x94\x8c\x14_array_reconstructor\x94\x93\x94(\x8c\x05array\x94\x8c\x05array\x94\x93\x94\x8c\x01L\x94K\nC \x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x94t\x94R\x94.",
+			want: []uint64{0, 1, 2, 3},
+		},
+		{
+			// pickle.dumps(array.array("Q", [0,1,2,3]), protocol=4)
+			name: "Q",
+			pkl:  "\x80\x04\x95b\x00\x00\x00\x00\x00\x00\x00\x8c\x05array\x94\x8c\x14_array_reconstructor\x94\x93\x94(\x8c\x05array\x94\x8c\x05array\x94\x93\x94\x8c\x01Q\x94K\nC \x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x94t\x94R\x94.",
+			want: []uint64{0, 1, 2, 3},
+		},
+		{
+			// pickle.dumps(array.array("f", [0,1,2,3]), protocol=4)
+			name: "f",
+			pkl:  "\x80\x04\x95R\x00\x00\x00\x00\x00\x00\x00\x8c\x05array\x94\x8c\x14_array_reconstructor\x94\x93\x94(\x8c\x05array\x94\x8c\x05array\x94\x93\x94\x8c\x01f\x94K\x0eC\x10\x00\x00\x00\x00\x00\x00\x80?\x00\x00\x00@\x00\x00@@\x94t\x94R\x94.",
+			want: []float32{0, 1, 2, 3},
+		},
+		{
+			// pickle.dumps(array.array("d", [0,1,2,3]), protocol=4)
+			name: "d",
+			pkl:  "\x80\x04\x95b\x00\x00\x00\x00\x00\x00\x00\x8c\x05array\x94\x8c\x14_array_reconstructor\x94\x93\x94(\x8c\x05array\x94\x8c\x05array\x94\x93\x94\x8c\x01d\x94K\x10C \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x08@\x94t\x94R\x94.",
+			want: []float64{0, 1, 2, 3},
+		},
+		{
+			// pickle.dumps(array.array("u", "Hello, 世界"), protocol=4)
+			name: "u",
+			pkl:  "\x80\x04\x95f\x00\x00\x00\x00\x00\x00\x00\x8c\x05array\x94\x8c\x14_array_reconstructor\x94\x93\x94(\x8c\x05array\x94\x8c\x05array\x94\x93\x94\x8c\x01u\x94K\x14C$H\x00\x00\x00e\x00\x00\x00l\x00\x00\x00l\x00\x00\x00o\x00\x00\x00,\x00\x00\x00 \x00\x00\x00\x16N\x00\x00Lu\x00\x00\x94t\x94R\x94.",
+			want: []rune("Hello, 世界"),
+		},
+		{
+			// pickle.dumps(array.array("u", ""), protocol=4)
+			name: "u-empty",
+			pkl:  "\x80\x04\x95B\x00\x00\x00\x00\x00\x00\x00\x8c\x05array\x94\x8c\x14_array_reconstructor\x94\x93\x94(\x8c\x05array\x94\x8c\x05array\x94\x93\x94\x8c\x01u\x94K\x14C\x00\x94t\x94R\x94.",
+			want: []rune(""),
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			got := loadsNoErr(t, tc.pkl)
+			if !reflect.DeepEqual(got, tc.want) {
+				t.Fatalf("got=%v, want=%v", got, tc.want)
+			}
+		})
+	}
+}
+
 // TODO: test BinPersId
 // TODO: test Get
 // TODO: test BinGet
diff --git a/types/array.go b/types/array.go
new file mode 100644
index 0000000..0d5c1e7
--- /dev/null
+++ b/types/array.go
@@ -0,0 +1,278 @@
+// Copyright 2023 NLP Odyssey Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package types
+
+import (
+	"encoding/binary"
+	"fmt"
+	"math"
+	"unicode/utf8"
+
+	"golang.org/x/text/encoding"
+	"golang.org/x/text/encoding/unicode"
+	"golang.org/x/text/encoding/unicode/utf32"
+)
+
+// Array unpickles array.array values as documented in:
+//
+//	https://docs.python.org/3/library/array.html
+type Array struct{}
+
+var _ Callable = (*Array)(nil)
+
+// Call rebuilds an array from the arguments of array._array_reconstructor:
+// the array class (ignored), the typecode, the machine format code and the
+// raw payload.
+//
+// The typecode selects the Go type of the result:
+//
+//	"b": []int8     "B": []uint8 (the payload itself, not a copy)
+//	"h": []int16    "H": []uint16
+//	"i": []int32    "I": []uint32
+//	"l": []int64    "L": []uint64
+//	"q": []int64    "Q": []uint64
+//	"f": []float32  "d": []float64
+//	"u": []rune
+//
+// The machine format code gives the size, signedness and byte order of the
+// items stored in the payload. Integer items narrower than the Go element
+// type are sign- or zero-extended accordingly. Arguments that do not fit
+// together are reported as an error.
+func (Array) Call(args ...interface{}) (interface{}, error) {
+	if got, want := len(args), 4; got != want {
+		return nil, fmt.Errorf("invalid number of arguments (got=%d, want=%d)", got, want)
+	}
+
+	typ, ok := args[1].(string)
+	if !ok {
+		return nil, fmt.Errorf("invalid array type argument %T", args[1])
+	}
+
+	mi, ok := args[2].(int)
+	if !ok {
+		return nil, fmt.Errorf("invalid array mformat code type %T", args[2])
+	}
+	// A negative code must be rejected too: indexing with it would panic.
+	if mi < 0 || mi >= len(arrayDescriptors) {
+		return nil, fmt.Errorf("invalid array mformat value %d", mi)
+	}
+	descr := arrayDescriptors[mi]
+
+	raw, ok := args[3].([]byte)
+	if !ok {
+		return nil, fmt.Errorf("invalid array payload type %T", args[3])
+	}
+
+	if typ == "u" {
+		vs, err := decodeArrayText(descr, raw)
+		if err != nil {
+			return nil, err
+		}
+		return vs, nil
+	}
+
+	// Validate the layout once so that the loops below cannot read past the
+	// end of raw.
+	width := arrayElemSize(typ)
+	if width == 0 {
+		return nil, fmt.Errorf("invalid array typecode '%s'", typ)
+	}
+	sz := descr.Size
+	isFloat := typ == "f" || typ == "d"
+	if sz > width || (isFloat && sz != width) {
+		return nil, fmt.Errorf("array item size %d does not fit typecode '%s'", sz, typ)
+	}
+	if len(raw)%sz != 0 {
+		return nil, fmt.Errorf("array payload length %d is not a multiple of the item size %d", len(raw), sz)
+	}
+	n := len(raw) / sz
+
+	switch typ {
+	case "b":
+		vs := make([]int8, n)
+		for i := range vs {
+			vs[i] = int8(raw[i])
+		}
+		return vs, nil
+
+	case "B":
+		return raw, nil
+
+	case "h":
+		vs := make([]int16, n)
+		for i := range vs {
+			vs[i] = int16(arrayItem(descr, raw[i*sz:]))
+		}
+		return vs, nil
+
+	case "H":
+		vs := make([]uint16, n)
+		for i := range vs {
+			vs[i] = uint16(arrayItem(descr, raw[i*sz:]))
+		}
+		return vs, nil
+
+	case "i":
+		vs := make([]int32, n)
+		for i := range vs {
+			vs[i] = int32(arrayItem(descr, raw[i*sz:]))
+		}
+		return vs, nil
+
+	case "I":
+		vs := make([]uint32, n)
+		for i := range vs {
+			vs[i] = uint32(arrayItem(descr, raw[i*sz:]))
+		}
+		return vs, nil
+
+	case "l", "q":
+		vs := make([]int64, n)
+		for i := range vs {
+			vs[i] = int64(arrayItem(descr, raw[i*sz:]))
+		}
+		return vs, nil
+
+	case "L", "Q":
+		vs := make([]uint64, n)
+		for i := range vs {
+			vs[i] = arrayItem(descr, raw[i*sz:])
+		}
+		return vs, nil
+
+	case "f":
+		vs := make([]float32, n)
+		for i := range vs {
+			vs[i] = math.Float32frombits(descr.Order.Uint32(raw[i*sz:]))
+		}
+		return vs, nil
+
+	default: // "d", the only typecode left after the arrayElemSize check.
+		vs := make([]float64, n)
+		for i := range vs {
+			vs[i] = math.Float64frombits(descr.Order.Uint64(raw[i*sz:]))
+		}
+		return vs, nil
+	}
+}
+
+// arrayElemSize returns the size in bytes of the Go element type that Call
+// uses for a fixed-width typecode, or 0 for any other typecode.
+func arrayElemSize(typ string) int {
+	switch typ {
+	case "b", "B":
+		return 1
+	case "h", "H":
+		return 2
+	case "i", "I", "f":
+		return 4
+	case "l", "L", "q", "Q", "d":
+		return 8
+	}
+	return 0
+}
+
+// arrayItem reads the integer stored at the start of b with the size and
+// byte order of descr and widens it to 64 bits, sign-extending it when the
+// machine format is signed. b must hold at least descr.Size bytes.
+func arrayItem(descr arrayDescriptor, b []byte) uint64 {
+	switch descr.Size {
+	case 1:
+		if descr.Signed {
+			return uint64(int8(b[0]))
+		}
+		return uint64(b[0])
+	case 2:
+		v := descr.Order.Uint16(b)
+		if descr.Signed {
+			return uint64(int16(v))
+		}
+		return uint64(v)
+	case 4:
+		v := descr.Order.Uint32(b)
+		if descr.Signed {
+			return uint64(int32(v))
+		}
+		return uint64(v)
+	}
+	return descr.Order.Uint64(b)
+}
+
+// decodeArrayText converts the payload of a "u" array to runes. The encoding
+// is picked from the machine format: UTF-16 for the descriptors of size 4
+// and UTF-32 for those of size 8.
+//
+// The payload is decoded strictly: a U+FFFD in the decoded text, which is
+// what the x/text decoders emit for malformed input, is reported as an
+// error.
+func decodeArrayText(descr arrayDescriptor, raw []byte) ([]rune, error) {
+	var enc encoding.Encoding
+	switch descr.Size {
+	case 4:
+		order := unicode.BigEndian
+		if descr.Order == binary.LittleEndian {
+			order = unicode.LittleEndian
+		}
+		enc = unicode.UTF16(order, unicode.IgnoreBOM)
+	case 8:
+		order := utf32.BigEndian
+		if descr.Order == binary.LittleEndian {
+			order = utf32.LittleEndian
+		}
+		enc = utf32.UTF32(order, utf32.IgnoreBOM)
+	default:
+		return nil, fmt.Errorf("invalid machine description size (got=%d, want=4 or 8)", descr.Size)
+	}
+
+	text, err := enc.NewDecoder().Bytes(raw)
+	if err != nil {
+		return nil, err
+	}
+
+	// Size the result from the decoded text; the raw payload is not UTF-8.
+	vs := make([]rune, 0, utf8.RuneCount(text))
+	for len(text) > 0 {
+		r, sz := utf8.DecodeRune(text)
+		if r == utf8.RuneError {
+			return nil, fmt.Errorf("invalid rune")
+		}
+		vs = append(vs, r)
+		text = text[sz:]
+	}
+	return vs, nil
+}
+
+// arrayDescriptor describes how the items of one machine format are stored.
+type arrayDescriptor struct {
+	Size   int              // item size in bytes; for text formats 4 selects UTF-16 and 8 UTF-32
+	Signed bool             // whether integer items are signed
+	Order  binary.ByteOrder // byte order of each item
+}
+
+// arrayDescriptors is indexed by the machine format code found in the pickle.
+var arrayDescriptors = []arrayDescriptor{
+	0:  {Size: 1, Signed: false, Order: binary.LittleEndian}, // 0: UNSIGNED_INT8
+	1:  {Size: 1, Signed: true, Order: binary.LittleEndian},  // 1: SIGNED_INT8
+	2:  {Size: 2, Signed: false, Order: binary.LittleEndian}, // 2: UNSIGNED_INT16_LE
+	3:  {Size: 2, Signed: false, Order: binary.BigEndian},    // 3: UNSIGNED_INT16_BE
+	4:  {Size: 2, Signed: true, Order: binary.LittleEndian},  // 4: SIGNED_INT16_LE
+	5:  {Size: 2, Signed: true, Order: binary.BigEndian},     // 5: SIGNED_INT16_BE
+	6:  {Size: 4, Signed: false, Order: binary.LittleEndian}, // 6: UNSIGNED_INT32_LE
+	7:  {Size: 4, Signed: false, Order: binary.BigEndian},    // 7: UNSIGNED_INT32_BE
+	8:  {Size: 4, Signed: true, Order: binary.LittleEndian},  // 8: SIGNED_INT32_LE
+	9:  {Size: 4, Signed: true, Order: binary.BigEndian},     // 9: SIGNED_INT32_BE
+	10: {Size: 8, Signed: false, Order: binary.LittleEndian}, // 10: UNSIGNED_INT64_LE
+	11: {Size: 8, Signed: false, Order: binary.BigEndian},    // 11: UNSIGNED_INT64_BE
+	12: {Size: 8, Signed: true, Order: binary.LittleEndian},  // 12: SIGNED_INT64_LE
+	13: {Size: 8, Signed: true, Order: binary.BigEndian},     // 13: SIGNED_INT64_BE
+	14: {Size: 4, Signed: false, Order: binary.LittleEndian}, // 14: IEEE_754_FLOAT_LE
+	15: {Size: 4, Signed: false, Order: binary.BigEndian},    // 15: IEEE_754_FLOAT_BE
+	16: {Size: 8, Signed: false, Order: binary.LittleEndian}, // 16: IEEE_754_DOUBLE_LE
+	17: {Size: 8, Signed: false, Order: binary.BigEndian},    // 17: IEEE_754_DOUBLE_BE
+	18: {Size: 4, Signed: false, Order: binary.LittleEndian}, // 18: UTF16_LE
+	19: {Size: 4, Signed: false, Order: binary.BigEndian},    // 19: UTF16_BE
+	20: {Size: 8, Signed: false, Order: binary.LittleEndian}, // 20: UTF32_LE
+	21: {Size: 8, Signed: false, Order: binary.BigEndian},    // 21: UTF32_BE
+}