addmul_amd64.s

// The MIT License (MIT)
//
// Copyright (C) 2016-2017 Vivint, Inc.
// Copyright (c) 2015 Klaus Post
// Copyright (c) 2015 Backblaze
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

/*
The corresponding C implementations:

void addmul(
	uint8_t * restrict lowhigh,
	uint8_t * restrict in,
	uint8_t * restrict out,
	int n
) {
	for(int i = 0; i < n; i++){
		int value = in[i];
		int low   = value & 15;
		int high  = value >> 4;
		out[i] = out[i] ^ lowhigh[low] ^ lowhigh[high+16];
	}
}

void addmulSSSE3(
	uint8_t * restrict lowhigh,
	uint8_t * restrict in,
	uint8_t * restrict out,
	int n
) {
	int i = 0;

	__m128i lotbl = _mm_loadu_si128((__m128i*)(&lowhigh[0]));
	__m128i hitbl = _mm_loadu_si128((__m128i*)(&lowhigh[16]));

	__m128i lomask = _mm_set1_epi8(0xF);

	#pragma nounroll
	for(i = 0; i < (n/16)*16; i += 16){
		__m128i input8  = _mm_loadu_si128((__m128i*)(&in[i]));
		__m128i output8 = _mm_loadu_si128((__m128i*)(&out[i]));

		__m128i lo8 = _mm_and_si128(lomask, input8);
		__m128i hi8 = _mm_and_si128(lomask, _mm_srli_si128(input8, 4)); // simulate shrli epi8

		output8 = _mm_xor_si128(output8, _mm_shuffle_epi8(lotbl, lo8));
		output8 = _mm_xor_si128(output8, _mm_shuffle_epi8(hitbl, hi8));

		_mm_storeu_si128((__m128i*)(&out[i]), output8);
	}
}
*/

#include "textflag.h"
DATA  nybble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA  nybble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA  nybble_mask<>+0x10(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA  nybble_mask<>+0x18(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL nybble_mask<>(SB), (NOPTR+RODATA), $32

#define LOWHIGH  DI
#define LOW   X8
#define HIGH  X9
#define IN    SI
#define OUT   DX
#define INDEX AX

#define LEN   CX
#define LEN16 R8 // LEN16 = (LEN / 16) * 16

#define LOMASK X7 // LOMASK = repeated 15 
// X0-X5 temps

// func addmulSSSE3(lowhigh *[2][16]byte, in, out *byte, len int)
TEXT ·addmulSSSE3(SB), 7, $0
	MOVQ _in+8(FP),   IN
	MOVQ _out+16(FP), OUT
	MOVQ _len+24(FP), LEN

	MOVQ LEN,  LEN16
	ANDQ $-16, LEN16

	JLE start_slow // if LEN16 == 0 { goto done }
	
	MOVQ _lohi+0(FP), LOWHIGH
	MOVOU    (LOWHIGH), LOW
	MOVOU  16(LOWHIGH), HIGH
	
	MOVOU  nybble_mask<>(SB), LOMASK
	XORQ   INDEX, INDEX // INDEX = 0

loop16:
	MOVOU  (IN)(INDEX*1),  X0 // X0 = INPUT[INDEX]
	MOVOU  LOW,  X4            // X4 = copy(LOW)
	MOVOU  (OUT)(INDEX*1), X2 // X2 = OUT[INDEX]
	MOVOU  X0, X1              // X0 = input[index] & 15
	MOVOU  HIGH, X5            // X5 = copy(HIGH)
	
	PAND   LOMASK, X0
	PSRLQ  $4, X1              // X1 = input[index]
	PSHUFB X0, X4             // X4 = LOW[X0]

	PAND   LOMASK, X1         // X1 = input[index] >> 4
	PSHUFB X1, X5            // X5 = HIGH[X1]
	PXOR   X4, X2            // X2 = OUT[INDEX] ^ X4 ^ X5
	PXOR   X5, X2

	MOVOU X2, 0(OUT)(INDEX*1)
	
	ADDQ $16,   INDEX
	CMPQ LEN16, INDEX // INDEX < LEN16
	JG loop16

start_slow:
	MOVQ  _len+32(FP), LOWHIGH
	MOVQ LEN16, INDEX
	CMPQ LEN, INDEX
	JLE done

loop1:
	MOVBQZX (IN)(INDEX*1),   R9  // R9  := in[index]
	MOVBQZX (LOWHIGH)(R9*1), R10 // R10 := multiply[R9]
	XORB R10B, (OUT)(INDEX*1)    // out[index] ^= R10
	INCQ INDEX
	CMPQ LEN, INDEX
	JG loop1

done:
	RET

#undef LOWHIGH
#undef LOW
#undef HIGH
#undef IN
#undef OUT
#undef LEN
#undef INDEX
#undef LEN16
#undef LOMASK

// func addmulAVX2(lowhigh *[2][16]byte, in, out *byte, len int)
TEXT ·addmulAVX2(SB), 7, $0
	MOVQ  low+0(FP), SI     // SI: &lowhigh
	MOVOU (SI),   X6        // X6: low
	MOVOU 16(SI), X7        // X7: high
	
	MOVQ  $15, BX           // BX: low mask
	MOVQ  BX, X5
		
	MOVQ  len+24(FP), R9 // R9: len(in), len(out)

	LONG $0x384de3c4; WORD $0x01f6 // VINSERTI128 YMM6, YMM6, XMM6, 1 ; low
	LONG $0x3845e3c4; WORD $0x01ff // VINSERTI128 YMM7, YMM7, XMM7, 1 ; high
	LONG $0x787d62c4; BYTE $0xc5   // VPBROADCASTB YMM8, XMM5         ; X8: lomask (unpacked)

	SHRQ  $5, R9         // len(in) / 32
	MOVQ  out+16(FP), DX // DX: &out
	MOVQ  in+8(FP), SI   // R11: &in
	TESTQ R9, R9
	JZ    done_xor_avx2

loopback_xor_avx2:
	LONG $0x066ffec5             // VMOVDQU YMM0, [rsi]
	LONG $0x226ffec5             // VMOVDQU YMM4, [rdx]
	LONG $0xd073f5c5; BYTE $0x04 // VPSRLQ  YMM1, YMM0, 4   ; X1: high input
	LONG $0xdb7dc1c4; BYTE $0xc0 // VPAND   YMM0, YMM0, YMM8      ; X0: low input
	LONG $0xdb75c1c4; BYTE $0xc8 // VPAND   YMM1, YMM1, YMM8      ; X1: high input
	LONG $0x004de2c4; BYTE $0xd0 // VPSHUFB  YMM2, YMM6, YMM0   ; X2: mul low part
	LONG $0x0045e2c4; BYTE $0xd9 // VPSHUFB  YMM3, YMM7, YMM1   ; X2: mul high part
	LONG $0xdbefedc5             // VPXOR   YMM3, YMM2, YMM3    ; X3: Result
	LONG $0xe4efe5c5             // VPXOR   YMM4, YMM3, YMM4    ; X4: Result
	LONG $0x227ffec5             // VMOVDQU [rdx], YMM4

	ADDQ $32, SI           // in+=32
	ADDQ $32, DX           // out+=32
	SUBQ $1, R9
	JNZ  loopback_xor_avx2

done_xor_avx2:
	// VZEROUPPER
	BYTE $0xc5; BYTE $0xf8; BYTE $0x77
	RET