Skip to content

Commit

Permalink
x/crypto/chacha20, x/crypto/poly1305: Add MIPSLE assembly version
Browse files Browse the repository at this point in the history
Add assembly-optimized versions of the ChaCha20 and Poly1305
crypto algorithms for MIPSLE.

The algorithms have been ported from other ASM implementations,
both of which are dual licensed under “GPL-2.0 OR MIT”
- https://github.com/torvalds/linux/blob/1b294a1f35616977caddaddf3e9d28e576a1adbc/arch/mips/crypto/chacha-core.S
- https://github.com/WireGuard/wireguard-monolithic-historical/blob/edad0d6e99e5133b1e8e865d727a25fff6399cb4/src/crypto/zinc/poly1305/poly1305-mips.S

The following benchmarks were run on an MT7688. They compare
the base Go implementation with the assembly version, built once
for the MIPS32r1 ISA and once for the MIPS32r2 ISA.

goos: linux
goarch: mipsle
pkg: golang.org/x/crypto/chacha20
                 │   old.txt    │                asm.txt                 │            asm-mips32r2.txt            │
                 │     B/s      │      B/s       vs base                 │      B/s       vs base                 │
ChaCha20/64        4.015Mi ± 1%   10.376Mi ± 1%  +158.43% (p=0.000 n=10)   13.485Mi ± 2%  +235.87% (p=0.000 n=10)
ChaCha20/256       4.473Mi ± 1%   12.846Mi ± 1%  +187.21% (p=0.000 n=10)   18.859Mi ± 3%  +321.64% (p=0.000 n=10)
ChaCha20/10x25     3.119Mi ± 1%    6.104Mi ± 2%   +95.72% (p=0.000 n=10)    7.181Mi ± 3%  +130.28% (p=0.000 n=10)
ChaCha20/4096      4.659Mi ± 4%   13.609Mi ± 4%  +192.12% (p=0.000 n=10)   20.270Mi ± 5%  +335.11% (p=0.000 n=10)
ChaCha20/100x40    4.020Mi ± 2%    9.918Mi ± 3%  +146.74% (p=0.000 n=10)   13.433Mi ± 5%  +234.16% (p=0.000 n=10)
ChaCha20/65536     4.301Mi ± 1%    9.727Mi ± 1%  +126.16% (p=0.000 n=10)   12.393Mi ± 0%  +188.14% (p=0.000 n=10)
ChaCha20/1000x65   4.187Mi ± 1%   10.076Mi ± 2%  +140.66% (p=0.000 n=10)   13.032Mi ± 2%  +211.28% (p=0.000 n=10)
geomean            4.082Mi         10.11Mi       +147.56%                   13.47Mi       +229.90%

pkg: golang.org/x/crypto/internal/poly1305
                 │   old.txt    │                 asm.txt                 │            asm-mips32r2.txt             │
                 │     B/s      │      B/s       vs base                  │      B/s       vs base                  │
64                 5.307Mi ± 0%   21.009Mi ± 0%   +295.87% (p=0.000 n=10)   20.938Mi ± 0%   +294.52% (p=0.000 n=10)
1K                 6.566Mi ± 1%   66.676Mi ± 0%   +915.47% (p=0.000 n=10)   66.042Mi ± 0%   +905.81% (p=0.000 n=10)
2M                 5.140Mi ± 1%   47.135Mi ± 0%   +816.98% (p=0.000 n=10)   47.016Mi ± 0%   +814.66% (p=0.000 n=10)
64Unaligned        5.322Mi ± 1%   21.024Mi ± 0%   +295.07% (p=0.000 n=10)   20.871Mi ± 1%   +292.20% (p=0.000 n=10)
1KUnaligned        6.561Mi ± 0%   66.614Mi ± 0%   +915.26% (p=0.000 n=10)   66.333Mi ± 0%   +910.97% (p=0.000 n=10)
2MUnaligned        5.140Mi ± 1%   47.197Mi ± 1%   +818.18% (p=0.000 n=10)   47.126Mi ± 0%   +816.79% (p=0.000 n=10)
Write64            6.599Mi ± 0%   57.268Mi ± 0%   +767.77% (p=0.000 n=10)   57.368Mi ± 0%   +769.29% (p=0.000 n=10)
Write1K            6.819Mi ± 0%   79.408Mi ± 0%  +1064.55% (p=0.000 n=10)   79.246Mi ± 0%  +1062.17% (p=0.000 n=10)
Write2M            5.140Mi ± 0%   47.169Mi ± 0%   +817.63% (p=0.000 n=10)   47.116Mi ± 0%   +816.60% (p=0.000 n=10)
Write64Unaligned   6.428Mi ± 3%   56.992Mi ± 1%   +786.65% (p=0.000 n=10)   56.424Mi ± 1%   +777.82% (p=0.000 n=10)
Write1KUnaligned   6.814Mi ± 2%   79.293Mi ± 0%  +1063.68% (p=0.000 n=10)   79.513Mi ± 0%  +1066.90% (p=0.000 n=10)
Write2MUnaligned   5.016Mi ± 2%   47.183Mi ± 1%   +840.59% (p=0.000 n=10)   47.183Mi ± 0%   +840.59% (p=0.000 n=10)
geomean            5.858Mi         49.17Mi        +739.29%                   49.02Mi        +736.70%

pkg: golang.org/x/crypto/chacha20poly1305
                              │   old.txt    │                 asm.txt                 │            asm-mips32r2.txt            │
                              │     B/s      │      B/s        vs base                 │      B/s       vs base                 │
Chacha20Poly1305/Open-64        1.230Mi ± 4%    3.042Mi ±  1%  +147.29% (p=0.000 n=10)    3.548Mi ± 2%  +188.37% (p=0.000 n=10)
Chacha20Poly1305/Seal-64        1.144Mi ± 1%    3.462Mi ±  1%  +202.50% (p=0.000 n=10)    3.810Mi ± 1%  +232.92% (p=0.000 n=10)
Chacha20Poly1305/Open-64-X      908.2Ki ± 1%   1718.8Ki ±  2%   +89.25% (p=0.000 n=10)   1840.8Ki ± 2%  +102.69% (p=0.000 n=10)
Chacha20Poly1305/Seal-64-X      839.8Ki ± 1%   1894.5Ki ±  2%  +125.58% (p=0.000 n=10)   2006.8Ki ± 2%  +138.95% (p=0.000 n=10)
Chacha20Poly1305/Open-1024      2.594Mi ± 3%    9.975Mi ±  1%  +284.56% (p=0.000 n=10)   13.208Mi ± 3%  +409.19% (p=0.000 n=10)
Chacha20Poly1305/Seal-1024      2.551Mi ± 1%   10.600Mi ±  2%  +315.51% (p=0.000 n=10)   14.353Mi ± 3%  +462.62% (p=0.000 n=10)
Chacha20Poly1305/Open-1024-X    2.470Mi ± 0%    8.569Mi ±  0%  +246.91% (p=0.000 n=10)   10.705Mi ± 2%  +333.40% (p=0.000 n=10)
Chacha20Poly1305/Seal-1024-X    2.413Mi ± 1%    9.036Mi ±  1%  +274.51% (p=0.000 n=10)   11.330Mi ± 1%  +369.57% (p=0.000 n=10)
Chacha20Poly1305/Open-1350      2.594Mi ± 3%    9.899Mi ±  2%  +281.62% (p=0.000 n=10)   13.237Mi ± 2%  +410.29% (p=0.000 n=10)
Chacha20Poly1305/Seal-1350      2.556Mi ± 1%   10.471Mi ±  1%  +309.70% (p=0.000 n=10)   13.452Mi ± 1%  +426.31% (p=0.000 n=10)
Chacha20Poly1305/Open-1350-X    2.503Mi ± 2%    8.817Mi ±  1%  +252.19% (p=0.000 n=10)   11.382Mi ± 1%  +354.67% (p=0.000 n=10)
Chacha20Poly1305/Seal-1350-X    2.460Mi ± 0%    9.093Mi ±  1%  +269.57% (p=0.000 n=10)   11.873Mi ± 2%  +382.56% (p=0.000 n=10)
Chacha20Poly1305/Open-2048      2.694Mi ± 2%   11.024Mi ±  2%  +309.20% (p=0.000 n=10)   14.963Mi ± 1%  +455.40% (p=0.000 n=10)
Chacha20Poly1305/Seal-2048      2.699Mi ± 0%   11.477Mi ±  2%  +325.27% (p=0.000 n=10)   15.240Mi ± 1%  +464.66% (p=0.000 n=10)
Chacha20Poly1305/Open-2048-X    2.637Mi ± 1%   10.056Mi ±  1%  +281.37% (p=0.000 n=10)   13.375Mi ± 1%  +407.23% (p=0.000 n=10)
Chacha20Poly1305/Seal-2048-X    2.627Mi ± 1%   10.328Mi ±  2%  +293.10% (p=0.000 n=10)   13.819Mi ± 2%  +425.95% (p=0.000 n=10)
Chacha20Poly1305/Open-4096      2.732Mi ± 5%   11.225Mi ±  4%  +310.82% (p=0.000 n=10)   16.041Mi ± 4%  +487.09% (p=0.000 n=10)
Chacha20Poly1305/Seal-4096      2.704Mi ± 2%   10.839Mi ±  7%  +300.88% (p=0.000 n=10)   15.693Mi ± 7%  +480.42% (p=0.000 n=10)
Chacha20Poly1305/Open-4096-X    2.670Mi ± 1%   10.381Mi ±  4%  +288.75% (p=0.000 n=10)   15.035Mi ± 4%  +463.04% (p=0.000 n=10)
Chacha20Poly1305/Seal-4096-X    2.680Mi ± 1%   10.867Mi ±  5%  +305.52% (p=0.000 n=10)   15.421Mi ± 7%  +475.44% (p=0.000 n=10)
Chacha20Poly1305/Open-8192      2.708Mi ± 2%   11.053Mi ±  3%  +308.10% (p=0.000 n=10)   15.926Mi ± 5%  +488.03% (p=0.000 n=10)
Chacha20Poly1305/Seal-8192      2.632Mi ± 4%   10.896Mi ±  6%  +313.95% (p=0.000 n=10)   16.031Mi ± 5%  +509.06% (p=0.000 n=10)
Chacha20Poly1305/Open-8192-X    2.666Mi ± 4%   10.948Mi ±  4%  +310.73% (p=0.000 n=10)   15.855Mi ± 3%  +494.81% (p=0.000 n=10)
Chacha20Poly1305/Seal-8192-X    2.637Mi ± 2%   10.805Mi ±  2%  +309.76% (p=0.000 n=10)   14.725Mi ± 6%  +458.41% (p=0.000 n=10)
Chacha20Poly1305/Open-16384     2.499Mi ± 4%   10.405Mi ± 13%  +316.41% (p=0.000 n=10)   13.628Mi ± 7%  +445.42% (p=0.000 n=10)
Chacha20Poly1305/Seal-16384     2.484Mi ± 4%    9.069Mi ±  4%  +265.07% (p=0.000 n=10)   12.131Mi ± 3%  +388.29% (p=0.000 n=10)
Chacha20Poly1305/Open-16384-X   2.389Mi ± 7%   10.028Mi ±  5%  +319.76% (p=0.000 n=10)   14.472Mi ± 3%  +505.79% (p=0.000 n=10)
Chacha20Poly1305/Seal-16384-X   2.475Mi ± 4%    9.084Mi ±  2%  +267.05% (p=0.000 n=10)   12.212Mi ± 6%  +393.45% (p=0.000 n=10)
geomean                         2.259Mi         8.271Mi        +266.21%                   10.90Mi       +382.79%

Fixes golang/go#39139
  • Loading branch information
stffabi authored and kszaq committed Oct 30, 2024
1 parent 9fadb0b commit 4bdae4b
Show file tree
Hide file tree
Showing 7 changed files with 487 additions and 3 deletions.
16 changes: 16 additions & 0 deletions chacha20/chacha_mipsle.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build gc && !purego

package chacha20

// bufSize is set to exactly one block (blockSize) for the MIPSLE assembly
// implementation.
// NOTE(review): how bufSize is consumed is defined outside this file — confirm
// against the shared Cipher code.
const bufSize = blockSize

// xorKeyStream is implemented in chacha_mipsle.s. It XORs src with the
// ChaCha20 key stream derived from key, nonce and *counter, writing the result
// to dst 64 bytes at a time, and stores the advanced block counter back into
// *counter before returning.
//
// The assembly always processes at least one block and only stops when the
// remaining length reaches exactly zero, so len(src) must be a non-zero
// multiple of 64 and dst must be at least as long as src.
//
//go:noescape
func xorKeyStream(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)

// xorKeyStreamBlocks hands dst and src to the assembly routine together with
// pointers to the cipher's key, nonce and block counter. The counter is passed
// by pointer so the assembly can write the advanced value back.
func (s *Cipher) xorKeyStreamBlocks(dst, src []byte) {
	key, nonce, counter := &s.key, &s.nonce, &s.counter
	xorKeyStream(dst, src, key, nonce, counter)
}
185 changes: 185 additions & 0 deletions chacha20/chacha_mipsle.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Ported from https://github.com/torvalds/linux/blob/1b294a1f35616977caddaddf3e9d28e576a1adbc/arch/mips/crypto/chacha-core.S
// which is licensed under:
// # ====================================================================
// # SPDX-License-Identifier: GPL-2.0 OR MIT
// #
// # Copyright (C) 2016-2018 René van Dorst <[email protected]>. All Rights Reserved.
// # Copyright (C) 2015-2019 Jason A. Donenfeld <[email protected]>. All Rights Reserved.
// # ====================================================================

//go:build gc && !purego

#include "textflag.h"

// X0-X15 hold the sixteen 32-bit words of the ChaCha20 state for the block
// that is currently being computed.
#define X0 R1
#define X1 R2
#define X2 R3
#define X3 R4
#define X4 R5
#define X5 R6
#define X6 R7
#define X7 R8
#define X8 R9
#define X9 R10
#define X10 R11
#define X11 R12
#define X12 R13
#define X13 R14
#define X14 R15
#define X15 R16

// Registers holding the arguments of xorKeyStream. CTR is first loaded with
// the counter pointer and then overwritten with the counter value itself.
#define DST R17
#define SRC R18
#define SRC_LEN R19
#define KEY R20
#define NONCE R21
#define CTR R22

// LOOP_I counts the remaining rounds. TMP is a scratch register clobbered by
// the shift-based ROTL variant, ADD_MEM and XOR_STREAM_WORD.
#define LOOP_I R24
#define TMP R25

// hasROTR is defined when either GOMIPS_r2 or GOMIPS_r5 is defined. It selects
// the single-instruction rotate variant of ROTL below.
#ifdef GOMIPS_r2
#define hasROTR
#endif
#ifdef GOMIPS_r5
#define hasROTR
#endif

// ROTL(S, R) rotates register R left by S bits in place.
// With hasROTR this is a single rotate-right by 32-S. Otherwise it is built
// from a left shift into TMP, a right shift of R by 32-S and an OR of the two
// halves, which clobbers TMP.
#ifdef hasROTR
#define ROTL(S, R) \
ROTR $(32-S), R
#else
#define ROTL(S, R) \
SLL $(S), R, TMP \
SRL $(32-S), R \
OR TMP, R
#endif

// AXR performs one add/xor/rotate step on four lanes at once:
//   A += K; B += L; C += M; D += N
//   V ^= A; W ^= B; Y ^= C; Z ^= D
//   V, W, Y, Z are each rotated left by S bits
// Four consecutive AXR invocations with rotations 16, 12, 8 and 7 make up the
// four quarter rounds of one ChaCha round (see the loop in xorKeyStream).
#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
ADDU K, A \
ADDU L, B \
ADDU M, C \
ADDU N, D \
XOR A, V \
XOR B, W \
XOR C, Y \
XOR D, Z \
ROTL (S, V) \
ROTL (S, W) \
ROTL (S, Y) \
ROTL (S, Z)

// FOR_STATE pairs every state register X0-X15 with the matching word of the
// initial ChaCha20 state and applies an operation to each pair:
//   X0-X3   the four constants spelling "expand 32-byte k"
//   X4-X11  the eight key words read from KEY
//   X12     the block counter value held in CTR
//   X13-X15 the three nonce words read from NONCE
// OP is applied where the source is an immediate or a register, OP_MEM where
// the source is a memory operand. It is used once to load the state (movw)
// and once to add the initial state back after the rounds (ADD / ADD_MEM).
#define FOR_STATE(OP, OP_MEM) \
OP ( $0x61707865, X0 ) \ // expa
OP ( $0x3320646e, X1 ) \ // nd 3
OP ( $0x79622d32, X2 ) \ // 2-by
OP ( $0x6b206574, X3 ) \ // te k
OP_MEM ( 0(KEY), X4 ) \
OP_MEM ( 4(KEY), X5 ) \
OP_MEM ( 8(KEY), X6 ) \
OP_MEM ( 12(KEY), X7 ) \
OP_MEM ( 16(KEY), X8 ) \
OP_MEM ( 20(KEY), X9 ) \
OP_MEM ( 24(KEY), X10 ) \
OP_MEM ( 28(KEY), X11 ) \
OP ( CTR, X12 ) \
OP_MEM ( 0(NONCE), X13 ) \
OP_MEM ( 4(NONCE), X14 ) \
OP_MEM ( 8(NONCE), X15 )

// movw wraps MOVW in macro form so that it can be passed to FOR_STATE as both
// OP and OP_MEM when loading the initial state.
#define movw(x, y) \
MOVW x, y

// ADD adds the immediate or register V to REG. It is used as OP in FOR_STATE.
#define ADD(V, REG) \
ADDU V, REG

// ADD_MEM adds the word stored at ADDR to REG. The word is loaded into TMP
// first, so TMP is clobbered. It is used as OP_MEM in FOR_STATE.
#define ADD_MEM(ADDR, REG) \
MOVW ADDR, TMP \
ADDU TMP, REG

// XOR_STREAM_WORD XORs the key stream word in REG with the OFF-th 32-bit word
// of SRC and stores the result in the OFF-th 32-bit word of DST. TMP is
// clobbered.
// Both the load and the store use a MOVWL/MOVWR pair, so the macro works with
// unaligned memory. This is quite important since the streams might not be
// aligned; especially during use in TLS the memory is often unaligned.
#define XOR_STREAM_WORD( OFF, REG) \
MOVWL (4*OFF + 3)(SRC), TMP \
MOVWR (4*OFF)(SRC), TMP \
XOR REG, TMP \
MOVWL TMP, (4*OFF + 3)(DST) \
MOVWR TMP, (4*OFF)(DST)

// func xorKeyStream(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
//
// xorKeyStream generates the ChaCha20 key stream one 64-byte block at a time,
// XORs it with src and writes the result to dst. The block counter is read
// from *counter on entry, incremented once per block, and written back on
// exit.
//
// There is no length check before the first block and the outer loop only
// terminates when the remaining length is exactly zero, so the length of src
// must be a non-zero multiple of 64. The length of dst is never read.
TEXT ·xorKeyStream(SB), NOSPLIT|NOFRAME, $0
// load the arguments; each slice occupies three 32-bit words in the frame
// (pointer, length, capacity), hence src at offset 12 and key at offset 24
MOVW dst+0(FP), DST
MOVW src+12(FP), SRC
MOVW src_len+16(FP), SRC_LEN
MOVW key+24(FP), KEY
MOVW nonce+28(FP), NONCE
MOVW counter+32(FP), CTR

// load counter
// (CTR now holds the counter value instead of its address)
MOVW (CTR), CTR

// one pass through chacha produces and consumes one 64-byte block
chacha:

// load initial State into X*
FOR_STATE ( movw, movw )

// set number of rounds
MOVW $20, LOOP_I

// each pass through loop performs two rounds: the first four AXR steps work
// on the columns of the state, the last four on its diagonals
loop:
AXR( X0,X1,X2,X3, X4,X5,X6,X7, X12,X13,X14,X15, 16)
AXR( X8,X9,X10,X11, X12,X13,X14,X15, X4,X5,X6,X7, 12)
AXR( X0,X1,X2,X3, X4,X5,X6,X7, X12,X13,X14,X15, 8)
AXR( X8,X9,X10,X11, X12,X13,X14,X15, X4,X5,X6,X7, 7)
AXR( X0,X1,X2,X3, X5,X6,X7,X4, X15,X12,X13,X14, 16)
AXR( X10,X11,X8,X9, X15,X12,X13,X14, X5,X6,X7,X4, 12)
AXR( X0,X1,X2,X3, X5,X6,X7,X4, X15,X12,X13,X14, 8)
AXR( X10,X11,X8,X9, X15,X12,X13,X14, X5,X6,X7,X4, 7)

// two rounds done; repeat until all 20 rounds have been performed
ADDU $-2, LOOP_I
BNE LOOP_I, loop

// add back the initial state to generate the key stream
FOR_STATE ( ADD, ADD_MEM )

// xor the key stream with the source and write out the result
XOR_STREAM_WORD (0, X0)
XOR_STREAM_WORD (1, X1)
XOR_STREAM_WORD (2, X2)
XOR_STREAM_WORD (3, X3)
XOR_STREAM_WORD (4, X4)
XOR_STREAM_WORD (5, X5)
XOR_STREAM_WORD (6, X6)
XOR_STREAM_WORD (7, X7)
XOR_STREAM_WORD (8, X8)
XOR_STREAM_WORD (9, X9)
XOR_STREAM_WORD (10, X10)
XOR_STREAM_WORD (11, X11)
XOR_STREAM_WORD (12, X12)
XOR_STREAM_WORD (13, X13)
XOR_STREAM_WORD (14, X14)
XOR_STREAM_WORD (15, X15)

// decrement length
ADDU $-64, SRC_LEN, SRC_LEN

// increment pointers
MOVW $64(DST), DST
MOVW $64(SRC), SRC

// increment counter
ADDU $1, CTR

// loop if there's still data
// (exits only when SRC_LEN is exactly zero)
BNE SRC_LEN, chacha

// store Counter
// (the pointer is re-read from the frame because CTR was overwritten with the
// counter value above)
MOVW counter+32(FP), TMP
MOVW CTR, (TMP)

RET

2 changes: 1 addition & 1 deletion chacha20/chacha_noasm.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build (!arm64 && !s390x && !ppc64le) || !gc || purego
//go:build (!arm64 && !s390x && !ppc64le && !mipsle) || !gc || purego

package chacha20

Expand Down
2 changes: 1 addition & 1 deletion chacha20poly1305/chacha20poly1305_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ func benchamarkChaCha20Poly1305Open(b *testing.B, buf []byte, nonceSize int) {
}

func BenchmarkChacha20Poly1305(b *testing.B) {
for _, length := range []int{64, 1350, 8 * 1024} {
for _, length := range []int{64, 1024, 1350, 2 * 1024, 4 * 1024, 8 * 1024, 16 * 1024} {
b.Run("Open-"+strconv.Itoa(length), func(b *testing.B) {
benchamarkChaCha20Poly1305Open(b, make([]byte, length), NonceSize)
})
Expand Down
2 changes: 1 addition & 1 deletion internal/poly1305/mac_noasm.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build (!amd64 && !ppc64le && !s390x) || !gc || purego
//go:build (!amd64 && !ppc64le && !s390x && !mipsle) || !gc || purego

package poly1305

Expand Down
53 changes: 53 additions & 0 deletions internal/poly1305/sum_mipsle.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build gc && !purego

package poly1305

// mac is a wrapper for macGeneric that redirects calls that would have gone to
// updateGeneric to update.
//
// Its Write and Sum methods are otherwise identical to the macGeneric ones, but
// using function pointers would carry a major performance cost.
//
// update is declared without a Go body in this file, so its implementation is
// supplied outside of Go source.
type mac struct{ macGeneric }

// Write absorbs p into the MAC state and always reports len(p) bytes written
// with a nil error.
//
// Input is processed in three stages: a partially filled internal buffer is
// topped up and flushed first, then all whole TagSize-byte blocks of p are
// passed to update directly, and finally any trailing bytes are stashed in the
// buffer for a later Write or Sum.
func (h *mac) Write(p []byte) (int, error) {
	total := len(p)

	// Stage 1: complete a block left over from a previous call.
	if h.offset > 0 {
		copied := copy(h.buffer[h.offset:], p)
		h.offset += copied
		if h.offset < TagSize {
			// Still not a full block; everything was buffered.
			return total, nil
		}
		p = p[copied:]
		h.offset = 0
		update(&h.macState, h.buffer[:], 1)
	}

	// Stage 2: feed all whole blocks straight from the caller's slice.
	if whole := len(p) / TagSize * TagSize; whole > 0 {
		update(&h.macState, p[:whole], 1)
		p = p[whole:]
	}

	// Stage 3: keep the remaining tail for later.
	if len(p) > 0 {
		h.offset += copy(h.buffer[h.offset:], p)
	}
	return total, nil
}

// Sum writes the tag for the data absorbed so far into out.
//
// It works on a copy of the MAC state, so h.macState and h.offset are left
// untouched. If a partial block is buffered, the buffer is padded in place
// with a single 1 byte followed by zeros up to TagSize and absorbed with
// padbit 0 before finalizing.
func (h *mac) Sum(out *[16]byte) {
	st := h.macState
	if h.offset > 0 {
		h.buffer[h.offset] = 1
		for i := h.offset + 1; i < TagSize; i++ {
			h.buffer[i] = 0
		}
		update(&st, h.buffer[:], 0)
	}
	finalize(out, &st.h, &st.s)
}

// update processes msg and folds it into state. It has no Go body here;
// presumably it is implemented in the accompanying MIPSLE assembly file, which
// is not shown — confirm.
//
// Within this file it is called with padbit 1 for message data (Write) and
// with padbit 0 for the final block that Sum has already padded by hand.
//
//go:noescape
func update(state *macState, msg []byte, padbit uint32)
Loading

0 comments on commit 4bdae4b

Please sign in to comment.