Skip to content

Commit

Permalink
perf: Improved Bit (Un)packing Performance (#280)
Browse files Browse the repository at this point in the history
I was very surprised by this, but getting rid of the shifting yielded a
huge performance boost for me locally. I was benchmarking some pandas
code that took ~500us to unpack 1 million boolean values; with this
simple change, that time fell to ~30us.

I am not an expert in assembly, but here is what godbolt produces for
setting the element at index 1 (`out[1]`) before the change:

```asm
        movzx   eax, BYTE PTR [rbp-1]
        shr     al
        mov     edx, eax
        .loc 1 6 6
        mov     rax, QWORD PTR [rbp-32]
        add     rax, 1
        .loc 1 6 24
        and     edx, 1
        .loc 1 6 10
        mov     BYTE PTR [rax], dl
```

and after:

```asm
        movzx   eax, BYTE PTR [rbp-1]
        and     eax, 2
        .loc 1 6 25
        test    eax, eax
        setne   dl
        .loc 1 6 6
        mov     rax, QWORD PTR [rbp-32]
        add     rax, 1
        .loc 1 6 10
        mov     BYTE PTR [rax], dl
```

My assumption is that the `shr` instruction is inefficient compared to the
`test` / `setne` approach taken in the latter.
  • Loading branch information
WillAyd authored Oct 6, 2023
1 parent 3952320 commit 6006ca2
Showing 1 changed file with 24 additions and 20 deletions.
44 changes: 24 additions & 20 deletions src/nanoarrow/buffer_inline.h
Original file line number Diff line number Diff line change
Expand Up @@ -223,35 +223,39 @@ static inline int64_t _ArrowBytesForBits(int64_t bits) {
}

// Expand the eight bits of `word` into eight int8_t values, least-significant
// bit first: out[i] is set to 1 when bit i of `word` is set and to 0 otherwise.
// `out` must point to storage for at least 8 elements.
//
// Each bit is isolated with a mask and compared against zero. The shift-based
// stores that used to precede these assignments were overwritten immediately
// (dead stores) and have been dropped.
static inline void _ArrowBitsUnpackInt8(const uint8_t word, int8_t* out) {
  out[0] = (word & 0x1) != 0;
  out[1] = (word & 0x2) != 0;
  out[2] = (word & 0x4) != 0;
  out[3] = (word & 0x8) != 0;
  out[4] = (word & 0x10) != 0;
  out[5] = (word & 0x20) != 0;
  out[6] = (word & 0x40) != 0;
  out[7] = (word & 0x80) != 0;
}

// Expand the eight bits of `word` into eight int32_t values, least-significant
// bit first: out[i] is set to 1 when bit i of `word` is set and to 0 otherwise.
// `out` must point to storage for at least 8 elements.
//
// Each bit is isolated with a mask and compared against zero. The shift-based
// stores that used to precede these assignments were overwritten immediately
// (dead stores) and have been dropped.
static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) {
  out[0] = (word & 0x1) != 0;
  out[1] = (word & 0x2) != 0;
  out[2] = (word & 0x4) != 0;
  out[3] = (word & 0x8) != 0;
  out[4] = (word & 0x10) != 0;
  out[5] = (word & 0x20) != 0;
  out[6] = (word & 0x40) != 0;
  out[7] = (word & 0x80) != 0;
}

// Pack eight int8_t flags into a single byte, least-significant bit first:
// bit i of *out is taken from values[i]. `values` must hold at least 8
// elements.
//
// For i >= 1, adding (2^i - 1) to a value of 1 carries into bit i, which the
// mask then isolates; a value of 0 leaves bit i clear. This gives the intended
// bitmap when every element is 0 or 1. Other inputs are not normalized
// (values[0] in particular is OR-ed in unmasked).
//
// The expression is evaluated as int after integer promotion, so the narrowing
// to uint8_t is made explicit with a cast. The shift-based assignment that
// used to precede this one was overwritten immediately and has been dropped.
static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) {
  *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) |
                   ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) |
                   ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) |
                   ((values[7] + 0x7f) & 0x80));
}

// Pack eight int32_t flags into a single byte, least-significant bit first:
// bit i of *out is taken from values[i]. `values` must hold at least 8
// elements.
//
// For i >= 1, adding (2^i - 1) to a value of 1 carries into bit i, which the
// mask then isolates; a value of 0 leaves bit i clear. This gives the intended
// bitmap when every element is 0 or 1. Other inputs are not normalized
// (values[0] in particular is OR-ed in unmasked).
//
// The expression has type int32_t, so the narrowing to uint8_t is made
// explicit with a cast. The shift-based assignment that used to precede this
// one was overwritten immediately and has been dropped.
static inline void _ArrowBitmapPackInt32(const int32_t* values, uint8_t* out) {
  *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) |
                   ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) |
                   ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) |
                   ((values[7] + 0x7f) & 0x80));
}

static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i) {
Expand Down

0 comments on commit 6006ca2

Please sign in to comment.