diff --git a/bench/plot.py b/bench/plot.py
new file mode 100755
index 00000000..c07bfd72
--- /dev/null
+++ b/bench/plot.py
@@ -0,0 +1,235 @@
+#!/usr/bin/env python3
+
+import csv
+from pathlib import Path
+from dataclasses import dataclass
+from enum import IntEnum, Enum, auto
+from matplotlib import pyplot as plt
+import click
+
+
+class OpType(IntEnum):
+    """Column layout of the CSV rows written for a benchmarked function."""
+
+    OP1 = 1  # number_of_iterations, check_is_ok, sdev, mean, median, list_of_results
+    OP2 = 2  # inlen, number_of_iterations, check_is_ok, sdev, mean, median, list_of_results
+    OP3 = 3  # outlen, inlen, number_of_iterations, check_is_ok, sdev, mean, median, list_of_results
+
+
+@dataclass(frozen=True)
+class ImplFunction(object):
+    """A benchmarked function: its file-name suffix and its CSV row layout."""
+
+    name: str
+    optype: OpType
+
+
+class ImplType(Enum):
+    """Implementation types; each value lists the functions looked up for it."""
+
+    crypto_hash = (ImplFunction("", OpType.OP2), )
+    crypto_kem = (ImplFunction("keypair", OpType.OP1),
+                  ImplFunction("keypair_derand", OpType.OP1),
+                  ImplFunction("enc", OpType.OP1),
+                  ImplFunction("enc_derand", OpType.OP1),
+                  ImplFunction("dec", OpType.OP1))
+    crypto_onetimeauth = (ImplFunction("", OpType.OP2), ImplFunction("verify", OpType.OP2))
+    crypto_scalarmult = (ImplFunction("base", OpType.OP1), ImplFunction("", OpType.OP1))
+    crypto_secretbox = (ImplFunction("", OpType.OP2), ImplFunction("open", OpType.OP2), ImplFunction("open_forgery", OpType.OP2))
+    crypto_sign = (ImplFunction("keypair", OpType.OP1), ImplFunction("", OpType.OP2), ImplFunction("open", OpType.OP2))
+    crypto_stream = (ImplFunction("", OpType.OP2), ImplFunction("xor", OpType.OP2))
+    crypto_xof = (ImplFunction("", OpType.OP3), )
+
+
+@dataclass
+class Results(object):
+    """The rows of one CSV file plus the identifiers of where it came from."""
+
+    name: str
+    """Name of the results config (machine/before/after/DOITM)."""
+    type: ImplType
+    """Implementation type (like crypto_kem, ...)."""
+    impl: str
+    """Implementation (like kyber512...)."""
+    arch: str
+    """Architecture (like amd64...)."""
+    variant: str
+    """The variant (like ref, avx, ...)."""
+    func: ImplFunction
+    """The function (like keypair gen, ...)."""
+    data: list
+    """The raw data."""
+
+    def __str__(self) -> str:
+        return f"{self.name} {self.type.name} {self.impl} {self.arch} {self.variant} {self.func.name} {self.func.optype.name}"
+
+    def __repr__(self) -> str:
+        s = str(self) + "\n"
+        for l in self.data:
+            s += ", ".join(map(str, l)) + "\n"
+        return s
+
+
+def plot_op1(ax, *results: Results):
+    """Draw one box per OP1 row of each result, labelled with the result name."""
+    labels = []
+    data = []
+    for result in results:
+        if not(result.data):
+            print(f"Skipping {result}")
+            continue
+        for line in result.data:
+            measurements = line[0]
+            ok = line[1]
+            sdev = line[2]
+            mean = line[3]
+            median = line[4]
+            rest = line[5:]
+            data.append(rest)
+            labels.append(result.name)
+    ax.boxplot(data, labels=labels)
+    ax.set_ylabel("cycles")
+
+
+def plot_op2(ax, *results: Results):
+    """Plot the mean of each OP2 row against its inlen, one line per result."""
+    for result in results:
+        if not(result.data):
+            print(f"Skipping {result}")
+            continue
+        lengths = []
+        sdevs = []
+        means = []
+        for line in result.data:
+            inlen = line[0]
+            measurements = line[1]
+            ok = line[2]
+            sdev = line[3]
+            mean = line[4]
+            median = line[5]
+            rest = line[6:]
+            lengths.append(inlen)
+            sdevs.append(sdev)
+            means.append(mean)
+        ax.plot(lengths, means, label=result.name)
+    ax.legend(loc="best")
+    ax.set_xlabel("inlen")
+    ax.set_ylabel("cycles")
+
+
+def plot_op3(ax, *results: Results):
+    """Plot the mean of each OP3 row against its (outlen, inlen) pair."""
+    for result in results:
+        if not(result.data):
+            print(f"Skipping {result}")
+            continue
+        lengths = []
+        sdevs = []
+        means = []
+        for line in result.data:
+            outlen = line[0]
+            inlen = line[1]
+            measurements = line[2]
+            ok = line[3]
+            sdev = line[4]
+            mean = line[5]
+            median = line[6]
+            rest = line[7:]
+            lengths.append((outlen, inlen))
+            sdevs.append(sdev)
+            means.append(mean)
+        ax.plot(lengths, means, label=result.name)
+    ax.legend(loc="best")
+    ax.set_xlabel("inlen")
+    ax.set_ylabel("cycles")
+
+
+def load_directory(directory: Path) -> list[Results]:
+    """Load every CSV under directory/bin/<type>/[<primitive>/]<impl>/<arch>/<variant>.
+
+    Raises ValueError when a CSV file name matches none of the functions
+    listed for its implementation type.
+    """
+    bin_dir = directory / "bin"
+    all_results = []
+    for impl_type in ImplType:
+        type_dir = bin_dir / impl_type.name
+        # sign, kem and stream (except xsalsa20) have additional subdirectory for the primitive
+        if impl_type in (ImplType.crypto_sign, ImplType.crypto_kem, ImplType.crypto_stream):
+            top_levels = list(type_dir.iterdir())
+            impl_dirs = sum(map(lambda top: list(top.iterdir()) if top.name != "xsalsa20" else [top], top_levels), [])
+        else:
+            impl_dirs = list(type_dir.iterdir())
+        for impl_dir in impl_dirs:
+            impl_name = impl_dir.name
+            for arch_dir in impl_dir.iterdir():
+                arch = arch_dir.name
+                for variant_dir in arch_dir.iterdir():
+                    variant = variant_dir.name
+                    for fname in variant_dir.glob("*.csv"):
+                        for func in impl_type.value:
+                            func_name = f"{variant}_{func.name}" if func.name else variant
+                            if str(fname).endswith(func_name + ".csv"):
+                                break
+                        else:
+                            raise ValueError("Unknown function")
+                        with fname.open("r") as f:
+                            reader = csv.reader(f)
+                            data = [list(map(lambda x: float(x.strip()) if "." in x else int(x.strip()), line)) for line in reader]
+                        results = Results(directory.name, impl_type, impl_name, arch, variant, func, data)
+                        all_results.append(results)
+    return all_results
+
+
+@click.command()
+@click.argument("dirs", nargs=-1, type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path), required=True)
+def main(dirs):
+    """Plot the results found in DIRS, one PNG per implementation variant."""
+    # (ImplType, impl, arch, variant, func) -> list[Results]
+    result_map = {}
+    for directory in dirs:
+        click.echo(f"Processing {directory}.")
+        results = load_directory(directory)
+        for r in results:
+            ident = (r.type, r.impl, r.arch, r.variant, r.func)
+            result_map.setdefault(ident, [])
+            result_map[ident].append(r)
+
+    # (ImplType, impl, arch, variant) -> (func -> list[Results])
+    func_map = {}
+    for ident, results in result_map.items():
+        merged_ident = (ident[0], ident[1], ident[2], ident[3])  # all but the func
+        func_map.setdefault(merged_ident, {})
+        func_map[merged_ident][ident[4]] = results
+
+    for ident, func_result_map in func_map.items():
+        # Built before the inner loop: the "Not enough results" message below uses it.
+        name = f"{ident[0].name}_{ident[1]}_{ident[2]}_{ident[3]}"
+        funcs = ident[0].value
+        fig, axs = plt.subplots(len(funcs), figsize=(5, len(funcs)*4))
+        if len(funcs) == 1:
+            axs = [axs]
+        for i, func in enumerate(funcs):
+            results = func_result_map.get(func)
+            if not results:
+                continue
+            ax = axs[i]
+            if len(results) <= 1:
+                print(f"Not enough results for {name}.")
+                continue
+            ax.set_title(func.name)
+            if func.optype == OpType.OP1:
+                plot_op1(ax, *results)
+            elif func.optype == OpType.OP2:
+                plot_op2(ax, *results)
+            elif func.optype == OpType.OP3:
+                plot_op3(ax, *results)
+        fname = name + ".png"
+        fig.suptitle(name)
+        fig.tight_layout()
+        fig.savefig(fname, dpi=300)
+        plt.close(fig)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/scripts/ci/jlog b/scripts/ci/jlog index fe99c9c4..f3dbf4c7 100755 --- a/scripts/ci/jlog +++ b/scripts/ci/jlog @@ -34,13 +34,13 @@ implementations_status "${wcard}.error" $error; echo "${BOLD}Status: ${NORMAL}" # print implementations with zero warnings in 'green' -cat $warning | egrep -E "^0, " | \ +cat $warning | grep -E "^0, " | \ while read line; do echo "${GREEN}${BOLD}OK, ${line}${NORMAL}" done # print implementations with some warnings in 'yellow' -cat $warning | egrep -vE "^0, " | \ +cat $warning | grep -vE "^0, " | \ while read line; do echo "${YELLOW}${BOLD}W, ${line}${NORMAL}" done diff --git a/scripts/ci/reporter/jlog b/scripts/ci/reporter/jlog index 1c26d46d..e488434a 100755 --- a/scripts/ci/reporter/jlog +++ b/scripts/ci/reporter/jlog @@ -36,7 +36,7 @@ print() file=$3; label=$4; - egrep -E "$filter" $file | \ + grep -E "$filter" $file | \ while read line; do line=${line/$dir\//}; echo -e "${color}${BOLD}${label}, ${line}${NORMAL}" @@ -46,7 +46,7 @@ print() clear_empty() { file=$1; - egrep -E "^0" $file | cut -d',' -f2 | \ + grep -E "^0" $file | cut -d',' -f2 | \ while read log; do rm -f "$log"; done diff --git a/src/Makefile.checksct b/src/Makefile.checksct index 7a1bfc0c..6055547e 100644 --- a/src/Makefile.checksct +++ b/src/Makefile.checksct @@ -7,8 +7,8 @@ ifneq ($(OP),) SCT_FLAGS ?= -CHECK_SCT_S = ($(JASMINC) -slice $* 
-checkSCT $(SCT_FLAGS) $< > $@ 2>&1) $(CIT) -CHECK_SCT = ($(JASMINC) -checkSCT $(SCT_FLAGS) $< > $@ 2>&1) $(CIT) +CHECK_SCT_S = ($(JAZZCT) --slice $* --speculative $(SCT_FLAGS) $< > $@ 2>&1) $(CIT) +CHECK_SCT = ($(JAZZCT) --speculative $(SCT_FLAGS) $< > $@ 2>&1) $(CIT) SCT_TARGETS = $(addsuffix .sct, $(FUNCTIONS)) diff --git a/src/Makefile.common b/src/Makefile.common index ab28f62b..10a967ee 100644 --- a/src/Makefile.common +++ b/src/Makefile.common @@ -36,6 +36,7 @@ JEXT ?= jazz override JFLAGS += -noinsertarraycopy JINCLUDE = -I Jade:$(SRC) JASMIN ?= jasminc +JAZZCT ?= jazzct JASMINC := $(JASMIN) $(JFLAGS) $(JINCLUDE) COMPILE = ($(JASMINC) -o $@ $<) $(CIT)
diff --git a/src/common/doit.jinc b/src/common/doit.jinc
new file mode 100644
index 00000000..81db77cb
--- /dev/null
+++ b/src/common/doit.jinc
@@ -0,0 +1,135 @@
+// This file contains some utility functions that replace instructions
+// that are not on the DOIT list of guaranteed constant-time instructions.
+
+// Rotate x left by c bits. ROL is not DOIT, so use shifts.
+inline fn __ROL32(reg u32 x, inline int c) -> reg u32
+{
+  reg u32 xt xb;
+  xt = x;
+  if (c != 0 && c != 32) // for c == 0 or c == 32 the result is x unchanged
+  {
+    xb = x;
+    xt <<= c; // low bits of x moved up by c
+    xb >>= 32 - c; // top c bits of x moved down to the bottom
+    xt |= xb;
+  }
+  return xt;
+}
+
+inline fn __ROL32x(reg u32 x, inline int c) -> reg u32 // as __ROL32, but shifts x itself and needs one temporary
+{
+  reg u32 y;
+  if (c != 0 && c != 32)
+  {
+    y = x;
+    x <<= c;
+    y >>= 32 - c;
+    x |= y;
+  }
+  return x;
+}
+
+inline fn __ROL32s(reg u32 x, inline int c) -> reg u32 // as __ROL32x, but the temporary y is declared as a stack variable
+{
+  stack u32 y;
+  if (c != 0 && c != 32)
+  {
+    y = x;
+    x <<= c;
+    y >>= 32 - c;
+    x |= y;
+  }
+  return x;
+}
+
+inline fn __ROL32i(reg u32 x, inline int c) -> reg u32 // rotates with the #ROL_32 instruction itself, no shift replacement
+{
+  ?{}, x = #ROL_32(x, c);
+  return x;
+}
+
+// Rotate x right by c bits, done as __ROL32(x, 32 - c). ROR is also not DOIT.
+inline fn __ROR32(reg u32 x, inline int c) -> reg u32
+{
+  x = __ROL32(x, 32 - c);
+  return x;
+}
+
+// Rotate x left by c bits (64-bit version). ROL is not DOIT, so use shifts.
+inline fn __ROL64(reg u64 x, inline int c) -> reg u64
+{
+  reg u64 xt xb;
+  xt = x;
+  if (c != 0 && c != 64) // for c == 0 or c == 64 the result is x unchanged
+  {
+    xb = x;
+    xt <<= c;
+    xb >>= 64 - c;
+    xt |= xb;
+  }
+  return xt;
+}
+
+// Rotate x right by c bits, done as __ROL64(x, 64 - c). ROR is also not DOIT.
+inline fn __ROR64(reg u64 x, inline int c) -> reg u64
+{
+  x = __ROL64(x, 64 - c);
+  return x;
+}
+
+// Count the set bits of i using shifts, masks and one multiply. POPCNT is not DOIT.
+inline fn __POPCNT32(reg u32 i) -> reg u32
+{
+  reg u32 x y;
+
+  // i = i - ((i >> 1) & 0x55555555); // add pairs of bits
+  x = i >> 1;
+  x &= 0x55555555;
+  i -= x;
+
+  // i = (i & 0x33333333) + ((i >> 2) & 0x33333333); // quads
+  x = i & 0x33333333;
+  y = i >> 2;
+  y &= 0x33333333;
+  i = x + y;
+
+  // i = (i + (i >> 4)) & 0x0f0f0f0f; // groups of 8
+  x = i >> 4;
+  x += i;
+  i = x & 0x0f0f0f0f;
+
+  // i *= 0x01010101; // horizontal sum of bytes
+  i *= 0x01010101;
+  // i >> 24; // return just that top byte
+  i >>= 24;
+  return i;
+}
+
+inline fn __POPCNT64(reg u64 i) -> reg u64 // 64-bit counterpart of __POPCNT32; the result is returned in x
+{
+  reg u64 x y;
+  // i -= (i >> 1) & 0x5555555555555555; //put count of each 2 bits into those 2 bits
+  x = i;
+  x >>= 1;
+  x &= 0x5555555555555555;
+  i -= x;
+
+  // i = (i & 0x3333333333333333) + ((i >> 2) & 0x3333333333333333); //put count of each 4 bits into those 4 bits
+  x = i;
+  x &= 0x3333333333333333;
+  y = i >> 2;
+  y &= 0x3333333333333333;
+  i = x + y;
+
+  // i = (i + (i >> 4)) & 0x0f0f0f0f0f0f0f0f; //put count of each 8 bits into those 8 bits
+  x = i;
+  x >>= 4;
+  x += i;
+  x &= 0x0f0f0f0f0f0f0f0f;
+
+  // (i * 0x0101010101010101) >> 56; //returns left 8 bits of x + (x<<8) + (x<<16) + (x<<24) + ...
+  x *= 0x0101010101010101;
+  x >>= 56;
+
+  return x;
+}
\ No newline at end of file
diff --git a/src/common/keccak/keccak1600/amd64/bmi1/keccakf1600.jinc b/src/common/keccak/keccak1600/amd64/bmi1/keccakf1600.jinc index 565c69ae..cb685a45 100644 --- a/src/common/keccak/keccak1600/amd64/bmi1/keccakf1600.jinc +++ b/src/common/keccak/keccak1600/amd64/bmi1/keccakf1600.jinc @@ -1,5 +1,6 @@ from Jade require "common/keccak/keccak1600/amd64/spec/keccakf1600_globals.jinc" // KECCAK1600_RC from Jade require "common/keccak/keccak1600/amd64/spec/keccakf1600.jinc" // __rhotates_spec +from Jade require "common/doit.jinc" //__ROL64 // C[x] = A[x,0] ^ A[x,1] ^ A[x,2] ^ A[x,3] ^ A[x,4] inline fn __theta_sum_bmi1(reg ptr u64[25] a) -> reg u64[5] @@ -32,7 +33,7 @@ inline fn __theta_rol_bmi1(reg u64[5] c) -> reg u64[5] d[x] = c[(x + 1) % 5]; // D[x] = ROT(D[x], 1) - _, _, d[x] = #ROL_64(d[x], 1); + d[x] = __ROL64(d[x], 1); // D[x] ^= C[x-1] d[x] ^= c[(x - 1 + 5) % 5]; @@ -70,7 +71,9 @@ inline fn __rol_sum_bmi1( // B[x] = ROT( B[x], r[x',y'] ); if(r != 0) - { _, _, b[x] = #ROL_64(b[x], r); } + { + b[x] = __ROL64(b[x], r); + } } diff --git a/src/common/keccak/keccak1600/amd64/ref/keccakf1600.jinc b/src/common/keccak/keccak1600/amd64/ref/keccakf1600.jinc index c748586d..7559172c 100644 --- a/src/common/keccak/keccak1600/amd64/ref/keccakf1600.jinc +++ b/src/common/keccak/keccak1600/amd64/ref/keccakf1600.jinc @@ -1,5 +1,6 @@ from Jade require "common/keccak/keccak1600/amd64/spec/keccakf1600_globals.jinc" // KECCAK1600_RC from Jade require "common/keccak/keccak1600/amd64/spec/keccakf1600.jinc" // __rhotates_spec +from Jade require "common/doit.jinc" //__ROL64 // C[x] = A[x,0] ^ A[x,1] ^ A[x,2] ^ A[x,3] ^ A[x,4] inline fn __theta_sum_ref(stack u64[25] a) -> reg u64[5] @@ -32,7 +33,7 @@ inline fn __theta_rol_ref(reg u64[5] c) -> reg u64[5] d[x] = c[(x + 1) % 5]; // D[x] = ROT(D[x], 1) - _, _, d[x] = #ROL_64(d[x], 1); + d[x] = __ROL64(d[x], 1); // D[x] ^= C[x-1] d[x] ^= c[(x - 1 + 5) % 5]; @@ 
-70,7 +71,9 @@ inline fn __rol_sum_ref( // B[x] = ROT( B[x], r[x',y'] ); if(r != 0) - { _, _, b[x] = #ROL_64(b[x], r); } + { + b[x] = __ROL64(b[x], r); + } } diff --git a/src/common/keccak/keccak1600/amd64/ref/keccakf1600_v0.jinc b/src/common/keccak/keccak1600/amd64/ref/keccakf1600_v0.jinc index 260147be..16302623 100644 --- a/src/common/keccak/keccak1600/amd64/ref/keccakf1600_v0.jinc +++ b/src/common/keccak/keccak1600/amd64/ref/keccakf1600_v0.jinc @@ -1,3 +1,4 @@ +from Jade require "common/doit.jinc" //__ROL64 u64[24] KECCAK_RC = { 0x0000000000000001 @@ -61,17 +62,6 @@ inline fn __rhotates(inline int x y) -> inline int return r; } -inline fn __ROL64(reg u64 x, inline int c) -> reg u64 -{ - reg u64 y; - - if (c == 0) - { y = x; } - else - { _, _, y = #ROL_64(x, c); } - return y; -} - inline fn __theta_sum_ref(stack u64[25] a) -> reg u64[5] { inline int i j; @@ -96,7 +86,7 @@ inline fn __theta_rol_ref(reg u64[5] c) -> reg u64[5] for i = 0 to 5 { d[i] = c[(i+1)%5]; - _, _, d[i] = #ROL_64(d[i], 1); + d[i] = __ROL64(d[i], 1); d[i] ^= c[(i+4)%5]; } diff --git a/src/common/keccak/keccak1600/amd64/ref1/keccakf1600.jinc b/src/common/keccak/keccak1600/amd64/ref1/keccakf1600.jinc index e261b30b..545e7d38 100644 --- a/src/common/keccak/keccak1600/amd64/ref1/keccakf1600.jinc +++ b/src/common/keccak/keccak1600/amd64/ref1/keccakf1600.jinc @@ -1,5 +1,6 @@ from Jade require "common/keccak/keccak1600/amd64/spec/keccakf1600_globals.jinc" // KECCAK1600_RC from Jade require "common/keccak/keccak1600/amd64/spec/keccakf1600.jinc" // __rhotates_spec +from Jade require "common/doit.jinc" //__ROL64 // C[x] = A[x,0] ^ A[x,1] ^ A[x,2] ^ A[x,3] ^ A[x,4] inline fn __theta_sum_ref1(reg ptr u64[25] a) -> reg u64[5] @@ -32,7 +33,7 @@ inline fn __theta_rol_ref1(reg u64[5] c) -> reg u64[5] d[x] = c[(x + 1) % 5]; // D[x] = ROT(D[x], 1) - _, _, d[x] = #ROL_64(d[x], 1); + d[x] = __ROL64(d[x], 1); // D[x] ^= C[x-1] d[x] ^= c[(x - 1 + 5) % 5]; @@ -70,7 +71,9 @@ inline fn __rol_sum_ref1( // B[x] = ROT( 
B[x], r[x',y'] ); if(r != 0) - { _, _, b[x] = #ROL_64(b[x], r); } + { + b[x] = __ROL64(b[x], r); + } } diff --git a/src/common/keccak/keccak1600/amd64/spec/keccakf1600.jinc b/src/common/keccak/keccak1600/amd64/spec/keccakf1600.jinc index a9113bc5..ea4b4bdb 100644 --- a/src/common/keccak/keccak1600/amd64/spec/keccakf1600.jinc +++ b/src/common/keccak/keccak1600/amd64/spec/keccakf1600.jinc @@ -1,5 +1,6 @@ require "keccakf1600_globals.jinc" +from Jade require "common/doit.jinc" //__ROL64 inline fn __index_spec(inline int x y) -> inline int { @@ -53,7 +54,9 @@ inline fn __theta_spec(stack u64[25] a) -> stack u64[25] for x = 0 to 5 { d[x] = c[(x + 1) % 5]; - _, _, d[x] = #ROL_64(d[x], 1); + + d[x] = __ROL64(d[x], 1); + d[x] ^= c[(x + 4) % 5]; } @@ -70,12 +73,16 @@ inline fn __theta_spec(stack u64[25] a) -> stack u64[25] inline fn __rho_spec(stack u64[25] a) -> stack u64[25] { inline int x y i z; + reg u64 t; for x = 0 to 5 { for y = 0 to 5 { i = __index_spec(x, y); z = __keccak_rho_offsets_spec(i); - _, _, a[i] = #ROL_64(a[i], z); + + t = a[i]; + t = __ROL64(t, z); + a[i] = t; } } diff --git a/src/crypto_hash/sha256/amd64/ref/sha256.jinc b/src/crypto_hash/sha256/amd64/ref/sha256.jinc index fa7497e4..4cab1790 100644 --- a/src/crypto_hash/sha256/amd64/ref/sha256.jinc +++ b/src/crypto_hash/sha256/amd64/ref/sha256.jinc @@ -1,5 +1,6 @@ require "sha256_globals.jinc" +from Jade require "common/doit.jinc" //__ROR32 inline fn __initH_ref() -> stack u32[8] { @@ -52,12 +53,13 @@ inline fn __store_H_ref(reg ptr u32[8] H, reg u32 a b c d e f g h) -> reg ptr u3 inline fn __store_ref(reg u64 out, stack u32[8] H) { inline int i; - reg u32 v; for i=0 to 8 - { v = H[i]; - v = #BSWAP_32(v); - (u32)[out + i*4] = v; + { // BSWAP could be used here, but it is not DOIT. 
+ (u8)[out + i*4] = H[u8 i*4 + 3]; + (u8)[out + i*4 + 1] = H[u8 i*4 + 2]; + (u8)[out + i*4 + 2] = H[u8 i*4 + 1]; + (u8)[out + i*4 + 3] = H[u8 i*4]; } } @@ -71,10 +73,8 @@ inline fn __SHR_ref(reg u32 x, inline int c) -> reg u32 inline fn __ROTR_ref(reg u32 x, inline int c) -> reg u32 { - reg u32 r; - r = x; - _, _, r = #ROR_32(r, c); - return r; + x = __ROR32(x, c); + return x; } //(x & y) ^ (!x & z) @@ -186,7 +186,7 @@ inline fn __Wt_ref(stack u32[64] W, inline int t) -> stack u32[64] fn _blocks_0_ref(reg ptr u32[8] _H, reg u64 in inlen) -> reg ptr u32[8], reg u64, reg u64 { inline int t; - reg u32 T1 T2 a b c d e f g h r v; + reg u32 T1 T2 a b c d e f g h r; stack u32[64] W; reg ptr u32[64] Kp; stack ptr u32[8] Hp; @@ -202,9 +202,11 @@ fn _blocks_0_ref(reg ptr u32[8] _H, reg u64 in inlen) -> reg ptr u32[8], reg u64 while(inlen >= 64) { for t=0 to 16 - { v = (u32)[in + t*4]; - v = #BSWAP_32(v); - W[t] = v; + { //BSWAP could be used here, but it is not DOIT. + W[u8 t*4] = (u8)[in + t*4 + 3]; + W[u8 t*4 + 1] = (u8)[in + t*4 + 2]; + W[u8 t*4 + 2] = (u8)[in + t*4 + 1]; + W[u8 t*4 + 3] = (u8)[in + t*4]; } in_s = in; @@ -270,7 +272,7 @@ fn _blocks_0_ref(reg ptr u32[8] _H, reg u64 in inlen) -> reg ptr u32[8], reg u64 fn _blocks_1_ref(reg ptr u32[8] _H, reg ptr u32[32] sblocks, reg u64 nblocks) -> reg ptr u32[8], reg ptr u32[32] { inline int t; - reg u32 T1 T2 a b c d e f g h r v; + reg u32 T1 T2 a b c d e f g h r; stack u32[64] W; reg ptr u32[64] Kp; stack ptr u32[8] Hp; @@ -290,9 +292,11 @@ fn _blocks_1_ref(reg ptr u32[8] _H, reg ptr u32[32] sblocks, reg u64 nblocks) -> s_i = i; oblocks = i << 4; for t=0 to 16 - { v = sblocks[(int)oblocks + t]; - v = #BSWAP_32(v); - W[t] = v; + { //BSWAP could be used here, but it is not DOIT. 
+ W[u8 t*4] = sblocks[u8 oblocks*4 + t*4 + 3]; + W[u8 t*4 + 1] = sblocks[u8 oblocks*4 + t*4 + 2]; + W[u8 t*4 + 2] = sblocks[u8 oblocks*4 + t*4 + 1]; + W[u8 t*4 + 3] = sblocks[u8 oblocks*4 + t*4]; } s_sblocks = sblocks; diff --git a/src/crypto_hash/sha512/amd64/ref/sha512.jinc b/src/crypto_hash/sha512/amd64/ref/sha512.jinc index 184af39b..76426f24 100644 --- a/src/crypto_hash/sha512/amd64/ref/sha512.jinc +++ b/src/crypto_hash/sha512/amd64/ref/sha512.jinc @@ -1,5 +1,6 @@ require "sha512_globals.jinc" +from Jade require "common/doit.jinc" //__ROR64 inline fn __initH_ref() -> stack u64[8] { @@ -51,13 +52,14 @@ inline fn __store_H_ref(reg ptr u64[8] H, reg u64 a b c d e f g h) -> reg ptr u6 inline fn __store_ref(reg u64 out, stack u64[8] H) { - inline int i; - reg u64 v; + inline int i j; for i=0 to 8 - { v = H[i]; - v = #BSWAP_64(v); - (u64)[out + i*8] = v; + { // BSWAP could be used here, but it is not DOIT. + for j=0 to 8 + { + (u8)[out + i*8 + j] = H[u8 i*8 + (7 - j)]; + } } } @@ -71,10 +73,8 @@ inline fn __SHR_ref(reg u64 x, inline int c) -> reg u64 inline fn __ROTR_ref(reg u64 x, inline int c) -> reg u64 { - reg u64 r; - r = x; - _, _, r = #ROR_64(r, c); - return r; + x = __ROR64(x, c); + return x; } //(x & y) ^ (!x & z) @@ -185,8 +185,8 @@ inline fn __Wt_ref(stack u64[80] W, inline int t) -> stack u64[80] fn _blocks_0_ref(reg ptr u64[8] _H, reg u64 in inlen) -> reg ptr u64[8], reg u64, reg u64 { - inline int t; - reg u64 T1 T2 a b c d e f g h r v; + inline int t u; + reg u64 T1 T2 a b c d e f g h r; stack u64[80] W; reg ptr u64[80] Kp; stack ptr u64[8] Hp; @@ -202,9 +202,11 @@ fn _blocks_0_ref(reg ptr u64[8] _H, reg u64 in inlen) -> reg ptr u64[8], reg u64 while(inlen >= 128) { for t=0 to 16 - { v = (u64)[in + t*8]; - v = #BSWAP_64(v); - W[t] = v; + { // BSWAP could be used here, but it is not DOIT. 
+ for u=0 to 8 + { + W[u8 t*8 + u] = (u8)[in + t*8 + (7 - u)]; + } } in_s = in; @@ -269,8 +271,8 @@ fn _blocks_0_ref(reg ptr u64[8] _H, reg u64 in inlen) -> reg ptr u64[8], reg u64 fn _blocks_1_ref(reg ptr u64[8] _H, reg ptr u64[32] sblocks, reg u64 nblocks) -> reg ptr u64[8], reg ptr u64[32] { - inline int t; - reg u64 T1 T2 a b c d e f g h r v; + inline int t u; + reg u64 T1 T2 a b c d e f g h r; stack u64[80] W; reg ptr u64[80] Kp; stack ptr u64[8] Hp; @@ -290,9 +292,11 @@ fn _blocks_1_ref(reg ptr u64[8] _H, reg ptr u64[32] sblocks, reg u64 nblocks) -> s_i = i; oblocks = i << 4; for t=0 to 16 - { v = sblocks[(int)oblocks + t]; - v = #BSWAP_64(v); - W[t] = v; + { // BSWAP could be used here, but it is not DOIT. + for u=0 to 8 + { + W[u8 t*8 + u] = sblocks[u8 oblocks*8 + t*8 + (7 - u)]; + } } s_sblocks = sblocks; diff --git a/src/crypto_kem/kyber/kyber512/amd64/avx2/indcpa.jinc b/src/crypto_kem/kyber/kyber512/amd64/avx2/indcpa.jinc index 1f9e08d6..602bd6a5 100644 --- a/src/crypto_kem/kyber/kyber512/amd64/avx2/indcpa.jinc +++ b/src/crypto_kem/kyber/kyber512/amd64/avx2/indcpa.jinc @@ -23,7 +23,8 @@ fn __indcpa_keypair_derand(reg u64 pkp, reg u64 skp, reg ptr u8[KYBER_SYMBYTES] for i=0 to KYBER_SYMBYTES/8 { t64 = buf[u64 i]; - publicseed[u64 i] = t64; + // We declassify here because we are reading the public part of the seed. + #declassify publicseed[u64 i] = t64; t64 = buf[u64 i + KYBER_SYMBYTES/8]; noiseseed[u64 i] = t64; } @@ -82,7 +83,8 @@ fn __indcpa_enc_0(stack u64 sctp, reg ptr u8[KYBER_INDCPA_MSGBYTES] msgp, reg u6 while (i < KYBER_SYMBYTES/8) { t64 = (u64)[pkp]; - publicseed[u64 (int)i] = t64; + // We declassify here because we are reading the public part of the seed from the public key. 
+ #declassify publicseed[u64 (int)i] = t64; pkp += 8; i += 1; } @@ -148,7 +150,8 @@ fn __indcpa_enc_1(reg ptr u8[KYBER_INDCPA_BYTES] ctp, reg ptr u8[KYBER_INDCPA_MS while (i < KYBER_SYMBYTES/8) { t64 = (u64)[pkp]; - publicseed[u64 (int)i] = t64; + // We declassify here because we are reading the public part of the seed from the public key. + #declassify publicseed[u64 (int)i] = t64; pkp += 8; i += 1; } diff --git a/src/crypto_kem/kyber/kyber512/amd64/ref/indcpa.jinc b/src/crypto_kem/kyber/kyber512/amd64/ref/indcpa.jinc index c1bb634b..040c59c0 100644 --- a/src/crypto_kem/kyber/kyber512/amd64/ref/indcpa.jinc +++ b/src/crypto_kem/kyber/kyber512/amd64/ref/indcpa.jinc @@ -23,7 +23,8 @@ fn __indcpa_keypair_derand(reg u64 pkp, reg u64 skp, reg ptr u8[KYBER_SYMBYTES] for i=0 to KYBER_SYMBYTES/8 { t64 = buf[u64 i]; - publicseed[u64 i] = t64; + // We declassify here because we are reading the public part of the seed. + #declassify publicseed[u64 i] = t64; t64 = buf[u64 i + KYBER_SYMBYTES/8]; noiseseed[u64 i] = t64; } @@ -88,7 +89,8 @@ fn __indcpa_enc_0(stack u64 sctp, reg ptr u8[KYBER_INDCPA_MSGBYTES] msgp, reg u6 while (i < KYBER_SYMBYTES/8) { t64 = (u64)[pkp]; - publicseed[u64 (int)i] = t64; + // We declassify here because we are reading the public part of the seed from the public key. + #declassify publicseed[u64 (int)i] = t64; pkp += 8; i += 1; } @@ -163,7 +165,8 @@ fn __indcpa_enc_1(reg ptr u8[KYBER_INDCPA_BYTES] ctp, reg ptr u8[KYBER_INDCPA_MS while (i < KYBER_SYMBYTES/8) { t64 = (u64)[pkp]; - publicseed[u64 (int)i] = t64; + // We declassify here because we are reading the public part of the seed from the public key. 
+ #declassify publicseed[u64 (int)i] = t64; pkp += 8; i += 1; } diff --git a/src/crypto_kem/kyber/kyber768/amd64/avx2/indcpa.jinc b/src/crypto_kem/kyber/kyber768/amd64/avx2/indcpa.jinc index 9852b48f..89207574 100644 --- a/src/crypto_kem/kyber/kyber768/amd64/avx2/indcpa.jinc +++ b/src/crypto_kem/kyber/kyber768/amd64/avx2/indcpa.jinc @@ -23,7 +23,8 @@ fn __indcpa_keypair_derand(reg u64 pkp, reg u64 skp, reg ptr u8[KYBER_SYMBYTES] for i=0 to KYBER_SYMBYTES/8 { t64 = buf[u64 i]; - publicseed[u64 i] = t64; + // We declassify here because we are reading the public part of the seed. + #declassify publicseed[u64 i] = t64; t64 = buf[u64 i + KYBER_SYMBYTES/8]; noiseseed[u64 i] = t64; } @@ -85,7 +86,8 @@ fn __indcpa_enc_0(stack u64 sctp, reg ptr u8[KYBER_INDCPA_MSGBYTES] msgp, reg u6 while (i < KYBER_SYMBYTES/8) { t64 = (u64)[pkp]; - publicseed[u64 (int)i] = t64; + // We declassify here because we are reading the public part of the seed from the public key. + #declassify publicseed[u64 (int)i] = t64; pkp += 8; i += 1; } @@ -150,7 +152,8 @@ fn __indcpa_enc_1(reg ptr u8[KYBER_INDCPA_BYTES] ctp, reg ptr u8[KYBER_INDCPA_MS while (i < KYBER_SYMBYTES/8) { t64 = (u64)[pkp]; - publicseed[u64 (int)i] = t64; + // We declassify here because we are reading the public part of the seed from the public key. + #declassify publicseed[u64 (int)i] = t64; pkp += 8; i += 1; } diff --git a/src/crypto_kem/kyber/kyber768/amd64/ref/indcpa.jinc b/src/crypto_kem/kyber/kyber768/amd64/ref/indcpa.jinc index 34c8982f..c2254b90 100644 --- a/src/crypto_kem/kyber/kyber768/amd64/ref/indcpa.jinc +++ b/src/crypto_kem/kyber/kyber768/amd64/ref/indcpa.jinc @@ -37,7 +37,8 @@ fn __indcpa_keypair_derand(reg u64 pkp, reg u64 skp, reg ptr u8[KYBER_SYMBYTES] for i=0 to KYBER_SYMBYTES/8 { t64 = buf[u64 i]; - publicseed[u64 i] = t64; + // We declassify here because we are reading the public part of the seed. 
+ #declassify publicseed[u64 i] = t64; t64 = buf[u64 i + KYBER_SYMBYTES/8]; noiseseed[u64 i] = t64; } @@ -110,7 +111,8 @@ fn __indcpa_enc(stack u64 sctp, reg ptr u8[32] msgp, reg u64 pkp, reg ptr u8[KYB while (i < KYBER_SYMBYTES/8) { t64 = (u64)[pkp]; - publicseed.[u64 8 * (int)i] = t64; + // We declassify here because we are reading the public part of the seed from the public key. + #declassify publicseed.[u64 8 * (int)i] = t64; pkp += 8; i += 1; } @@ -195,7 +197,8 @@ fn __iindcpa_enc(reg ptr u8[KYBER_CT_LEN] ctp, reg ptr u8[32] msgp, reg u64 pkp, while (i < KYBER_SYMBYTES/8) { t64 = (u64)[pkp]; - publicseed.[u64 8*(int)i] = t64; + // We declassify here because we are reading the public part of the seed from the public key. + #declassify publicseed.[u64 8*(int)i] = t64; pkp += 8; i += 1; } diff --git a/src/crypto_onetimeauth/poly1305/amd64/avx/poly1305.jinc b/src/crypto_onetimeauth/poly1305/amd64/avx/poly1305.jinc index d3e9f7cc..89769a37 100644 --- a/src/crypto_onetimeauth/poly1305/amd64/avx/poly1305.jinc +++ b/src/crypto_onetimeauth/poly1305/amd64/avx/poly1305.jinc @@ -29,8 +29,10 @@ inline fn __unpack_avx( r12[u64 o + 2] = l; l = rt[0]; - ?{},l = #SHRD(l, rt[1], 52); - h = l; + l >>= 52; + h = rt[1]; + h <<= 12; + l |= h; l &= mask26; r12[u64 o + 4] = l; @@ -38,8 +40,12 @@ inline fn __unpack_avx( l >>= 26; l &= mask26; r12[u64 o + 6] = l; + l = rt[1]; - ?{}, l = #SHRD(l, rt[2], 40); + l >>= 40; + h = rt[2]; + h <<= 24; + l |= h; r12[u64 o + 8] = l; return r12; diff --git a/src/crypto_onetimeauth/poly1305/amd64/avx2/poly1305.jinc b/src/crypto_onetimeauth/poly1305/amd64/avx2/poly1305.jinc index f641f9dd..933b001d 100644 --- a/src/crypto_onetimeauth/poly1305/amd64/avx2/poly1305.jinc +++ b/src/crypto_onetimeauth/poly1305/amd64/avx2/poly1305.jinc @@ -29,8 +29,10 @@ inline fn __unpack_avx2( r1234[u64 o + 4] = l; l = rt[0]; - ?{}, l = #SHRD(l, rt[1], 52); - h = l; + l >>= 52; + h = rt[1]; + h <<= 12; + l |= h; l &= mask26; r1234[u64 o + 8] = l; @@ -38,8 +40,12 @@ 
inline fn __unpack_avx2( l >>= 26; l &= mask26; r1234[u64 o + 12] = l; + l = rt[1]; - ?{}, l = #SHRD(l, rt[2], 40); + l >>= 40; + h = rt[2]; + h <<= 24; + l |= h; r1234[u64 o + 16] = l; return r1234; diff --git a/src/crypto_scalarmult/curve25519/amd64/ref5/mul5.jinc b/src/crypto_scalarmult/curve25519/amd64/ref5/mul5.jinc index 8d1c379a..cb56a0c2 100644 --- a/src/crypto_scalarmult/curve25519/amd64/ref5/mul5.jinc +++ b/src/crypto_scalarmult/curve25519/amd64/ref5/mul5.jinc @@ -119,18 +119,33 @@ inline fn __mul5_rss(stack u64[5] xa ya) -> reg u64[5] cf, r[3] += mulrax; _, mulr31 += mulrdx + cf; mulredmask = 0x7FFFFFFFFFFFF; - ?{}, mulr01 = #SHLD(mulr01, r[0], 13); + mulr01 <<= 13; + mulrax = r[0]; + mulrax >>= 51; + mulr01 |= mulrax; r[0] &= mulredmask; - ?{}, mulr11 = #SHLD(mulr11, r[1], 13); + mulr11 <<= 13; + mulrax = r[1]; + mulrax >>= 51; + mulr11 |= mulrax; r[1] &= mulredmask; r[1] += mulr01; - ?{}, mulr21 = #SHLD(mulr21, r[2], 13); + mulr21 <<= 13; + mulrax = r[2]; + mulrax >>= 51; + mulr21 |= mulrax; r[2] &= mulredmask; r[2] += mulr11; - ?{}, mulr31 = #SHLD(mulr31, r[3], 13); + mulr31 <<= 13; + mulrax = r[3]; + mulrax >>= 51; + mulr31 |= mulrax; r[3] &= mulredmask; r[3] += mulr21; - ?{}, mulr41 = #SHLD(mulr41, r[4], 13); + mulr41 <<= 13; + mulrax = r[4]; + mulrax >>= 51; + mulr41 |= mulrax; r[4] &= mulredmask; r[4] += mulr31; mulr41 = mulr41 * 19; @@ -293,18 +308,33 @@ fn _mul5_pp(reg ptr u64[5] xa ya) -> reg ptr u64[5] cf, r[3] += mulrax; _, mulr31 += mulrdx + cf; mulredmask = 0x7FFFFFFFFFFFF; - ?{}, mulr01 = #SHLD(mulr01, r[0], 13); + mulr01 <<= 13; + mulrax = r[0]; + mulrax >>= 51; + mulr01 |= mulrax; r[0] &= mulredmask; - ?{}, mulr11 = #SHLD(mulr11, r[1], 13); + mulr11 <<= 13; + mulrax = r[1]; + mulrax >>= 51; + mulr11 |= mulrax; r[1] &= mulredmask; r[1] += mulr01; - ?{}, mulr21 = #SHLD(mulr21, r[2], 13); + mulr21 <<= 13; + mulrax = r[2]; + mulrax >>= 51; + mulr21 |= mulrax; r[2] &= mulredmask; r[2] += mulr11; - ?{}, mulr31 = #SHLD(mulr31, r[3], 13); + 
mulr31 <<= 13; + mulrax = r[3]; + mulrax >>= 51; + mulr31 |= mulrax; r[3] &= mulredmask; r[3] += mulr21; - ?{}, mulr41 = #SHLD(mulr41, r[4], 13); + mulr41 <<= 13; + mulrax = r[4]; + mulrax >>= 51; + mulr41 |= mulrax; r[4] &= mulredmask; r[4] += mulr31; mulr41 = mulr41 * 19; diff --git a/src/crypto_scalarmult/curve25519/amd64/ref5/sqr5.jinc b/src/crypto_scalarmult/curve25519/amd64/ref5/sqr5.jinc index 64a6e3f1..bcca236f 100644 --- a/src/crypto_scalarmult/curve25519/amd64/ref5/sqr5.jinc +++ b/src/crypto_scalarmult/curve25519/amd64/ref5/sqr5.jinc @@ -79,18 +79,33 @@ inline fn __sqr5_rs(stack u64[5] xa) -> reg u64[5] cf, r[3] += squarerax; _, squarer31 += squarerdx + cf; squareredmask = 0x7FFFFFFFFFFFF; - _, _, _, _, _, squarer01 = #SHLD(squarer01, r[0], 13); + squarer01 <<= 13; + squarerax = r[0]; + squarerax >>= 51; + squarer01 |= squarerax; r[0] &= squareredmask; - _, _, _, _, _, squarer11 = #SHLD(squarer11, r[1], 13); + squarer11 <<= 13; + squarerax = r[1]; + squarerax >>= 51; + squarer11 |= squarerax; r[1] &= squareredmask; r[1] += squarer01; - _, _, _, _, _, squarer21 = #SHLD(squarer21, r[2], 13); + squarer21 <<= 13; + squarerax = r[2]; + squarerax >>= 51; + squarer21 |= squarerax; r[2] &= squareredmask; r[2] += squarer11; - _, _, _, _, _, squarer31 = #SHLD(squarer31, r[3], 13); + squarer31 <<= 13; + squarerax = r[3]; + squarerax >>= 51; + squarer31 |= squarerax; r[3] &= squareredmask; r[3] += squarer21; - _, _, _, _, _, squarer41 = #SHLD(squarer41, r[4], 13); + squarer41 <<= 13; + squarerax = r[4]; + squarerax >>= 51; + squarer41 |= squarerax; r[4] &= squareredmask; r[4] += squarer31; squarer41 = squarer41 * 19; @@ -217,18 +232,33 @@ fn _sqr5_p(reg ptr u64[5] xa) -> reg ptr u64[5] cf, r[3] += squarerax; _, squarer31 += squarerdx + cf; squareredmask = 0x7FFFFFFFFFFFF; - _, _, _, _, _, squarer01 = #SHLD(squarer01, r[0], 13); + squarer01 <<= 13; + squarerax = r[0]; + squarerax >>= 51; + squarer01 |= squarerax; r[0] &= squareredmask; - _, _, _, _, _, squarer11 = 
#SHLD(squarer11, r[1], 13); + squarer11 <<= 13; + squarerax = r[1]; + squarerax >>= 51; + squarer11 |= squarerax; r[1] &= squareredmask; r[1] += squarer01; - _, _, _, _, _, squarer21 = #SHLD(squarer21, r[2], 13); + squarer21 <<= 13; + squarerax = r[2]; + squarerax >>= 51; + squarer21 |= squarerax; r[2] &= squareredmask; r[2] += squarer11; - _, _, _, _, _, squarer31 = #SHLD(squarer31, r[3], 13); + squarer31 <<= 13; + squarerax = r[3]; + squarerax >>= 51; + squarer31 |= squarerax; r[3] &= squareredmask; r[3] += squarer21; - _, _, _, _, _, squarer41 = #SHLD(squarer41, r[4], 13); + squarer41 <<= 13; + squarerax = r[4]; + squarerax >>= 51; + squarer41 |= squarerax; r[4] &= squareredmask; r[4] += squarer31; squarer41 = squarer41 * 19; diff --git a/src/crypto_secretbox/xsalsa20poly1305/amd64/avx/salsa20_32D.jinc b/src/crypto_secretbox/xsalsa20poly1305/amd64/avx/salsa20_32D.jinc index 885f045c..fc76f192 100644 --- a/src/crypto_secretbox/xsalsa20poly1305/amd64/avx/salsa20_32D.jinc +++ b/src/crypto_secretbox/xsalsa20poly1305/amd64/avx/salsa20_32D.jinc @@ -59,7 +59,7 @@ inline fn __line_ref_32(reg u32[16] k, inline int a b c r) -> reg u32[16] reg u32 t; t = k[b]; t += k[c]; - _, _, t = #ROL_32(t, r); + t = __ROL32x(t, r); k[a] ^= t; return k; } @@ -75,58 +75,79 @@ inline fn __quarter_round_ref_32(reg u32[16] k, inline int a b c d) -> reg u32[1 } -inline fn __column_round_ref_32(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32, stack u32 +inline fn __column_round_ref_32(reg u32[16] k, stack u32 k2 k3 k6) -> reg u32[16], stack u32, stack u32, stack u32 { - stack u32 k12 k13; + stack u32 k9 k12 k13; - k = __quarter_round_ref_32(k, 0, 4, 8, 12); k12 = k[12]; k[2] = k2; - k = __quarter_round_ref_32(k, 5, 9, 13, 1); k13 = k[13]; k[3] = k3; - k = __quarter_round_ref_32(k, 10, 14, 2, 6); + k = __quarter_round_ref_32(k, 0, 4, 8, 12); k12 = k[12]; + k = __quarter_round_ref_32(k, 5, 9, 13, 1); k9 = k[9]; k13 = k[13]; k[2] = k2; k[6] = k6; + k = __quarter_round_ref_32(k, 10, 14, 
2, 6); k[3] = k3; k = __quarter_round_ref_32(k, 15, 3, 7, 11); - return k, k12, k13; + return k, k9, k12, k13; } -inline fn __line_round_ref_32(reg u32[16] k, stack u32 k12 k13) -> reg u32[16], stack u32, stack u32 +inline fn __line_round_ref_32(reg u32[16] k, stack u32 k9 k12 k13) -> reg u32[16], stack u32, stack u32, stack u32 { - stack u32 k2 k3; + stack u32 k2 k3 k6; - k = __quarter_round_ref_32(k, 0, 1, 2, 3); k2 = k[2]; k[12] = k12; - k = __quarter_round_ref_32(k, 5, 6, 7, 4); k3 = k[3]; k[13] = k13; - k = __quarter_round_ref_32(k, 10, 11, 8, 9); + k = __quarter_round_ref_32(k, 0, 1, 2, 3); k2 = k[2]; k3 = k[3]; + k = __quarter_round_ref_32(k, 5, 6, 7, 4); k6 = k[6]; k[9] = k9; + k = __quarter_round_ref_32(k, 10, 11, 8, 9); k[12] = k12; k[13] = k13; k = __quarter_round_ref_32(k, 15, 12, 13, 14); - return k, k2, k3; + return k, k2, k3, k6; } -inline fn __double_round_ref_32(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32, stack u32 +// The function below requires the spillage of some state on the stack. 
+// ┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬─────────────────┐ +// │ 0│ 1│ 2│ 3│ 4│ 5│ 6│ 7│ 8│ 9│10│11│12│13│14│15│ Spilled values │ +// ┌────────┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼─────────────────┤ + - Value used +// │ Column │ +│ │ S│ S│ +│ │ S│ │ +│ │ │ │ +│ │ │ │ 3 │ +// │ round ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ S - Stack spills +// │ │ │ +│ S│ S│ │ +│ S│ │ │ +│ │ │ S│ +│ │ │ 4 │ +// │ ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ │ │ │ +│ S│ │ │ +│ │ │ S│ +│ │ S│ S│ +│ │ 4 │ +// │ ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ │ │ │ │ +│ │ │ │ +│ │ S│ │ +│ S│ S│ │ +│ 3 │ +// ├────────┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ Line │ +│ +│ +│ +│ │ │ │ │ │ S│ │ │ S│ S│ │ │ 3 │ +// │ round ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ │ │ │ S│ S│ +│ +│ +│ +│ │ S│ │ │ S│ S│ │ │ 5 │ +// │ ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ │ │ │ S│ S│ │ │ S│ │ +│ +│ +│ +│ S│ S│ │ │ 5 │ +// │ ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ │ │ │ S│ S│ │ │ S│ │ │ │ │ │ +│ +│ +│ +│ 3 │ +// └────────┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴─────────────────┘ +// +inline fn __double_round_ref_32(reg u32[16] k, stack u32 k2 k3 k6) -> reg u32[16], stack u32, stack u32, stack u32 { - stack u32 k12 k13; + stack u32 k9 k12 k13; - k, k12, k13 = __column_round_ref_32(k, k2, k3); - k, k2, k3 = __line_round_ref_32(k, k12, k13); - return k, k2, k3; + k, k9, k12, k13 = __column_round_ref_32(k, k2, k3, k6); + k, k2, k3, k6 = __line_round_ref_32(k, k9, k12, k13); + return k, k2, k3, k6; } inline fn __rounds_ref_32(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32 { - stack u32 s_c k15; + stack u32 s_c k15 k6; reg u32 c; + k6 = k[6]; + c = 10; while { s_c = c; - k, k2, k3 = __double_round_ref_32(k, k2, k3); + k, k2, k3, k6 = __double_round_ref_32(k, k2, k3, k6); c = s_c; ?{}, c = #DEC_32(c); } (c > 0) - k15 = k[15]; - k[2] = k2; - k[3] = k3; + k15 = k[15]; k[2] 
= k2; k[3] = k3; k[6] = k6; return k, k15; } diff --git a/src/crypto_secretbox/xsalsa20poly1305/amd64/avx/xsalsa20poly1305.jinc b/src/crypto_secretbox/xsalsa20poly1305/amd64/avx/xsalsa20poly1305.jinc index 1b96d27b..fb4dbc9c 100644 --- a/src/crypto_secretbox/xsalsa20poly1305/amd64/avx/xsalsa20poly1305.jinc +++ b/src/crypto_secretbox/xsalsa20poly1305/amd64/avx/xsalsa20poly1305.jinc @@ -63,6 +63,9 @@ inline fn __xsalsa20poly1305_avx_open(reg u64 m c clen nonce key) -> reg u64 clen = #LEA(clen - 32); r = __poly1305_verify_avx_k(tag, ct, clen, subkey_p); + // We declassify the result of tag verification, as the function returns it anyway. + // This is a hack due to the annotation getting lost if put directly on the inline function. + #declassify r = r; if(r == 0) { m = m_s; diff --git a/src/crypto_secretbox/xsalsa20poly1305/amd64/avx2/salsa20_32D.jinc b/src/crypto_secretbox/xsalsa20poly1305/amd64/avx2/salsa20_32D.jinc index 885f045c..fc76f192 100644 --- a/src/crypto_secretbox/xsalsa20poly1305/amd64/avx2/salsa20_32D.jinc +++ b/src/crypto_secretbox/xsalsa20poly1305/amd64/avx2/salsa20_32D.jinc @@ -59,7 +59,7 @@ inline fn __line_ref_32(reg u32[16] k, inline int a b c r) -> reg u32[16] reg u32 t; t = k[b]; t += k[c]; - _, _, t = #ROL_32(t, r); + t = __ROL32x(t, r); k[a] ^= t; return k; } @@ -75,58 +75,79 @@ inline fn __quarter_round_ref_32(reg u32[16] k, inline int a b c d) -> reg u32[1 } -inline fn __column_round_ref_32(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32, stack u32 +inline fn __column_round_ref_32(reg u32[16] k, stack u32 k2 k3 k6) -> reg u32[16], stack u32, stack u32, stack u32 { - stack u32 k12 k13; + stack u32 k9 k12 k13; - k = __quarter_round_ref_32(k, 0, 4, 8, 12); k12 = k[12]; k[2] = k2; - k = __quarter_round_ref_32(k, 5, 9, 13, 1); k13 = k[13]; k[3] = k3; - k = __quarter_round_ref_32(k, 10, 14, 2, 6); + k = __quarter_round_ref_32(k, 0, 4, 8, 12); k12 = k[12]; + k = __quarter_round_ref_32(k, 5, 9, 13, 1); k9 = k[9]; k13 = k[13]; k[2] = k2; k[6] 
= k6; + k = __quarter_round_ref_32(k, 10, 14, 2, 6); k[3] = k3; k = __quarter_round_ref_32(k, 15, 3, 7, 11); - return k, k12, k13; + return k, k9, k12, k13; } -inline fn __line_round_ref_32(reg u32[16] k, stack u32 k12 k13) -> reg u32[16], stack u32, stack u32 +inline fn __line_round_ref_32(reg u32[16] k, stack u32 k9 k12 k13) -> reg u32[16], stack u32, stack u32, stack u32 { - stack u32 k2 k3; + stack u32 k2 k3 k6; - k = __quarter_round_ref_32(k, 0, 1, 2, 3); k2 = k[2]; k[12] = k12; - k = __quarter_round_ref_32(k, 5, 6, 7, 4); k3 = k[3]; k[13] = k13; - k = __quarter_round_ref_32(k, 10, 11, 8, 9); + k = __quarter_round_ref_32(k, 0, 1, 2, 3); k2 = k[2]; k3 = k[3]; + k = __quarter_round_ref_32(k, 5, 6, 7, 4); k6 = k[6]; k[9] = k9; + k = __quarter_round_ref_32(k, 10, 11, 8, 9); k[12] = k12; k[13] = k13; k = __quarter_round_ref_32(k, 15, 12, 13, 14); - return k, k2, k3; + return k, k2, k3, k6; } -inline fn __double_round_ref_32(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32, stack u32 +// The function below requires the spillage of some state on the stack. 
+// ┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬─────────────────┐ +// │ 0│ 1│ 2│ 3│ 4│ 5│ 6│ 7│ 8│ 9│10│11│12│13│14│15│ Spilled values │ +// ┌────────┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼─────────────────┤ + - Value used +// │ Column │ +│ │ S│ S│ +│ │ S│ │ +│ │ │ │ +│ │ │ │ 3 │ +// │ round ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ S - Stack spills +// │ │ │ +│ S│ S│ │ +│ S│ │ │ +│ │ │ S│ +│ │ │ 4 │ +// │ ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ │ │ │ +│ S│ │ │ +│ │ │ S│ +│ │ S│ S│ +│ │ 4 │ +// │ ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ │ │ │ │ +│ │ │ │ +│ │ S│ │ +│ S│ S│ │ +│ 3 │ +// ├────────┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ Line │ +│ +│ +│ +│ │ │ │ │ │ S│ │ │ S│ S│ │ │ 3 │ +// │ round ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ │ │ │ S│ S│ +│ +│ +│ +│ │ S│ │ │ S│ S│ │ │ 5 │ +// │ ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ │ │ │ S│ S│ │ │ S│ │ +│ +│ +│ +│ S│ S│ │ │ 5 │ +// │ ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ │ │ │ S│ S│ │ │ S│ │ │ │ │ │ +│ +│ +│ +│ 3 │ +// └────────┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴─────────────────┘ +// +inline fn __double_round_ref_32(reg u32[16] k, stack u32 k2 k3 k6) -> reg u32[16], stack u32, stack u32, stack u32 { - stack u32 k12 k13; + stack u32 k9 k12 k13; - k, k12, k13 = __column_round_ref_32(k, k2, k3); - k, k2, k3 = __line_round_ref_32(k, k12, k13); - return k, k2, k3; + k, k9, k12, k13 = __column_round_ref_32(k, k2, k3, k6); + k, k2, k3, k6 = __line_round_ref_32(k, k9, k12, k13); + return k, k2, k3, k6; } inline fn __rounds_ref_32(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32 { - stack u32 s_c k15; + stack u32 s_c k15 k6; reg u32 c; + k6 = k[6]; + c = 10; while { s_c = c; - k, k2, k3 = __double_round_ref_32(k, k2, k3); + k, k2, k3, k6 = __double_round_ref_32(k, k2, k3, k6); c = s_c; ?{}, c = #DEC_32(c); } (c > 0) - k15 = k[15]; - k[2] = k2; - k[3] = k3; + k15 = k[15]; k[2] 
= k2; k[3] = k3; k[6] = k6; return k, k15; } diff --git a/src/crypto_secretbox/xsalsa20poly1305/amd64/avx2/xsalsa20poly1305.jinc b/src/crypto_secretbox/xsalsa20poly1305/amd64/avx2/xsalsa20poly1305.jinc index 76f24a0c..68a8461d 100644 --- a/src/crypto_secretbox/xsalsa20poly1305/amd64/avx2/xsalsa20poly1305.jinc +++ b/src/crypto_secretbox/xsalsa20poly1305/amd64/avx2/xsalsa20poly1305.jinc @@ -63,6 +63,9 @@ inline fn __xsalsa20poly1305_avx2_open(reg u64 m c clen nonce key) -> reg u64 clen = #LEA(clen - 32); r = __poly1305_verify_avx2_k(tag, ct, clen, subkey_p); + // We declassify the result of tag verification, as the function returns it anyway. + // This is a hack due to the annotation getting lost if put directly on the inline function. + #declassify r = r; if(r == 0) { m = m_s; diff --git a/src/crypto_secretbox/xsalsa20poly1305/amd64/ref/salsa20_32D.jinc b/src/crypto_secretbox/xsalsa20poly1305/amd64/ref/salsa20_32D.jinc index 885f045c..fc76f192 100644 --- a/src/crypto_secretbox/xsalsa20poly1305/amd64/ref/salsa20_32D.jinc +++ b/src/crypto_secretbox/xsalsa20poly1305/amd64/ref/salsa20_32D.jinc @@ -59,7 +59,7 @@ inline fn __line_ref_32(reg u32[16] k, inline int a b c r) -> reg u32[16] reg u32 t; t = k[b]; t += k[c]; - _, _, t = #ROL_32(t, r); + t = __ROL32x(t, r); k[a] ^= t; return k; } @@ -75,58 +75,79 @@ inline fn __quarter_round_ref_32(reg u32[16] k, inline int a b c d) -> reg u32[1 } -inline fn __column_round_ref_32(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32, stack u32 +inline fn __column_round_ref_32(reg u32[16] k, stack u32 k2 k3 k6) -> reg u32[16], stack u32, stack u32, stack u32 { - stack u32 k12 k13; + stack u32 k9 k12 k13; - k = __quarter_round_ref_32(k, 0, 4, 8, 12); k12 = k[12]; k[2] = k2; - k = __quarter_round_ref_32(k, 5, 9, 13, 1); k13 = k[13]; k[3] = k3; - k = __quarter_round_ref_32(k, 10, 14, 2, 6); + k = __quarter_round_ref_32(k, 0, 4, 8, 12); k12 = k[12]; + k = __quarter_round_ref_32(k, 5, 9, 13, 1); k9 = k[9]; k13 = k[13]; k[2] = k2; 
k[6] = k6; + k = __quarter_round_ref_32(k, 10, 14, 2, 6); k[3] = k3; k = __quarter_round_ref_32(k, 15, 3, 7, 11); - return k, k12, k13; + return k, k9, k12, k13; } -inline fn __line_round_ref_32(reg u32[16] k, stack u32 k12 k13) -> reg u32[16], stack u32, stack u32 +inline fn __line_round_ref_32(reg u32[16] k, stack u32 k9 k12 k13) -> reg u32[16], stack u32, stack u32, stack u32 { - stack u32 k2 k3; + stack u32 k2 k3 k6; - k = __quarter_round_ref_32(k, 0, 1, 2, 3); k2 = k[2]; k[12] = k12; - k = __quarter_round_ref_32(k, 5, 6, 7, 4); k3 = k[3]; k[13] = k13; - k = __quarter_round_ref_32(k, 10, 11, 8, 9); + k = __quarter_round_ref_32(k, 0, 1, 2, 3); k2 = k[2]; k3 = k[3]; + k = __quarter_round_ref_32(k, 5, 6, 7, 4); k6 = k[6]; k[9] = k9; + k = __quarter_round_ref_32(k, 10, 11, 8, 9); k[12] = k12; k[13] = k13; k = __quarter_round_ref_32(k, 15, 12, 13, 14); - return k, k2, k3; + return k, k2, k3, k6; } -inline fn __double_round_ref_32(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32, stack u32 +// The function below requires the spillage of some state on the stack. 
+// ┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬─────────────────┐ +// │ 0│ 1│ 2│ 3│ 4│ 5│ 6│ 7│ 8│ 9│10│11│12│13│14│15│ Spilled values │ +// ┌────────┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼─────────────────┤ + - Value used +// │ Column │ +│ │ S│ S│ +│ │ S│ │ +│ │ │ │ +│ │ │ │ 3 │ +// │ round ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ S - Stack spills +// │ │ │ +│ S│ S│ │ +│ S│ │ │ +│ │ │ S│ +│ │ │ 4 │ +// │ ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ │ │ │ +│ S│ │ │ +│ │ │ S│ +│ │ S│ S│ +│ │ 4 │ +// │ ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ │ │ │ │ +│ │ │ │ +│ │ S│ │ +│ S│ S│ │ +│ 3 │ +// ├────────┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ Line │ +│ +│ +│ +│ │ │ │ │ │ S│ │ │ S│ S│ │ │ 3 │ +// │ round ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ │ │ │ S│ S│ +│ +│ +│ +│ │ S│ │ │ S│ S│ │ │ 5 │ +// │ ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ │ │ │ S│ S│ │ │ S│ │ +│ +│ +│ +│ S│ S│ │ │ 5 │ +// │ ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ │ │ │ S│ S│ │ │ S│ │ │ │ │ │ +│ +│ +│ +│ 3 │ +// └────────┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴─────────────────┘ +// +inline fn __double_round_ref_32(reg u32[16] k, stack u32 k2 k3 k6) -> reg u32[16], stack u32, stack u32, stack u32 { - stack u32 k12 k13; + stack u32 k9 k12 k13; - k, k12, k13 = __column_round_ref_32(k, k2, k3); - k, k2, k3 = __line_round_ref_32(k, k12, k13); - return k, k2, k3; + k, k9, k12, k13 = __column_round_ref_32(k, k2, k3, k6); + k, k2, k3, k6 = __line_round_ref_32(k, k9, k12, k13); + return k, k2, k3, k6; } inline fn __rounds_ref_32(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32 { - stack u32 s_c k15; + stack u32 s_c k15 k6; reg u32 c; + k6 = k[6]; + c = 10; while { s_c = c; - k, k2, k3 = __double_round_ref_32(k, k2, k3); + k, k2, k3, k6 = __double_round_ref_32(k, k2, k3, k6); c = s_c; ?{}, c = #DEC_32(c); } (c > 0) - k15 = k[15]; - k[2] = k2; - k[3] = k3; + k15 = k[15]; k[2] 
= k2; k[3] = k3; k[6] = k6; return k, k15; } diff --git a/src/crypto_secretbox/xsalsa20poly1305/amd64/ref/xsalsa20poly1305.jinc b/src/crypto_secretbox/xsalsa20poly1305/amd64/ref/xsalsa20poly1305.jinc index 93a8a688..8da21efb 100644 --- a/src/crypto_secretbox/xsalsa20poly1305/amd64/ref/xsalsa20poly1305.jinc +++ b/src/crypto_secretbox/xsalsa20poly1305/amd64/ref/xsalsa20poly1305.jinc @@ -61,6 +61,9 @@ inline fn __xsalsa20poly1305_ref_open(reg u64 m c clen nonce key) -> reg u64 clen = #LEA(clen - 32); r = __poly1305_verify_ref_k(tag, ct, clen, subkey_p); + // We declassify the result of tag verification, as the function returns it anyway. + // This is a hack due to the annotation getting lost if put directly on the inline function. + #declassify r = r; if(r == 0) { m = m_s; diff --git a/src/crypto_sign/dilithium/common/amd64/fips202.jinc b/src/crypto_sign/dilithium/common/amd64/fips202.jinc index aeb015ad..25c8bd16 100644 --- a/src/crypto_sign/dilithium/common/amd64/fips202.jinc +++ b/src/crypto_sign/dilithium/common/amd64/fips202.jinc @@ -1,3 +1,4 @@ +from Jade require "common/doit.jinc" /* -- Stolen and modified from the Kyber repo -- */ param int SHAKE128_RATE = 168; @@ -24,7 +25,7 @@ fn theta(reg ptr u64[25] a) -> reg ptr u64[25] { for x = 0 to 5 { d[x] = c[(x + 1) % 5]; - ?{}, d[x] = #ROL_64(d[x], 1); + d[x] = __ROL64(d[x], 1); d[x] ^= c[(x + 4) % 5]; } @@ -66,7 +67,7 @@ fn rho(reg ptr u64[25] a) -> reg ptr u64[25] { for y = 0 to 5 { i = index(x, y); z = keccakRhoOffsets(i); - _, _, a[i] = #ROL_64(a[i], z); + a[i] = __ROL64(a[i], z); } } diff --git a/src/crypto_stream/chacha/common/amd64/ref/chacha_core.jinc b/src/crypto_stream/chacha/common/amd64/ref/chacha_core.jinc index d0f238d6..aeecff10 100644 --- a/src/crypto_stream/chacha/common/amd64/ref/chacha_core.jinc +++ b/src/crypto_stream/chacha/common/amd64/ref/chacha_core.jinc @@ -1,3 +1,4 @@ +from Jade require "common/doit.jinc" //__ROL32 // the following implementation requires: // - (even) param int 
CHACHA_ROUNDS; @@ -22,7 +23,7 @@ inline fn __copy_state_ref(stack u32[16] st) -> reg u32[16], stack u32 } -/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// // not exported; may be useful as spec; @@ -30,7 +31,7 @@ inline fn __line_ref(reg u32[16] k, inline int a b c r) -> reg u32[16] { k[a] += k[b]; k[c] ^= k[a]; - _, _, k[c] = #ROL_32(k[c], r); + k[c] = __ROL32x(k[c], r); return k; } @@ -111,9 +112,9 @@ inline fn __half_round_inline_ref( k[d0] ^= k[a0]; k[d1] ^= k[a1]; - - _, _, k[d0] = #ROL_32(k[d0], 16); - _, _, k[d1] = #ROL_32(k[d1], 16); + + k[d0] = __ROL32x(k[d0], 16); + k[d1] = __ROL32x(k[d1], 16); //k = line(k, c, d, b, 12); k[c0] += k[d0]; @@ -122,9 +123,9 @@ inline fn __half_round_inline_ref( k[b0] ^= k[c0]; k[b1] ^= k[c1]; - _, _, k[b0] = #ROL_32(k[b0], 12); - _, _, k[b1] = #ROL_32(k[b1], 12); - + k[b0] = __ROL32x(k[b0], 12); + k[b1] = __ROL32x(k[b1], 12); + //k = line(k, a, b, d, 8); k[a0] += k[b0]; k[a1] += k[b1]; @@ -132,8 +133,8 @@ inline fn __half_round_inline_ref( k[d0] ^= k[a0]; k[d1] ^= k[a1]; - _, _, k[d0] = #ROL_32(k[d0], 8); - _, _, k[d1] = #ROL_32(k[d1], 8); + k[d0] = __ROL32x(k[d0], 8); + k[d1] = __ROL32x(k[d1], 8); //k = line(k, c, d, b, 7); k[c0] += k[d0]; @@ -142,22 +143,43 @@ inline fn __half_round_inline_ref( k[b0] ^= k[c0]; k[b1] ^= k[c1]; - _, _, k[b0] = #ROL_32(k[b0], 7); - _, _, k[b1] = #ROL_32(k[b1], 7); + k[b0] = __ROL32x(k[b0], 7); + k[b1] = __ROL32x(k[b1], 7); return k; } // used; -inline fn __double_round_inline_ref(reg u32[16] k, stack u32 k14 k15) -> reg u32[16], stack u32, stack u32 +// +// +// The function below requires the spillage of some state on the stack +// this is due to the use of an auxiliary register in the implementation +// of __ROL32x.
+// ┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┐ +// │ 0│ 1│ 2│ 3│ 4│ 5│ 6│ 7│ 8│ 9│10│11│12│13│14│15│ + - Value used +// ┌───────┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ +// │ │ +│ │ +│ │ +│ S│ +│ │ +│ │ +│ │ +│ │ +│ S│ S - Stack spills +// │ Round ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ +// │ │ │ +│ │ +│ S│ +│ │ +│ │ +│ │ +│ │ +│ S│ +│ +// ├───────┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ +// │ │ +│ +│ │ │ S│ +│ +│ │ │ │ +│ +│ +│ │ S│ +│ +// │ Round ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ +// │ │ │ │ +│ +│ +│ S│ │ +│ +│ +│ │ │ │ +│ +│ S│ +// └───────┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┘ +// +inline fn __double_round_inline_ref(reg u32[16] k, stack u32 k4 k5 k14 k15) -> reg u32[16], stack u32, stack u32, stack u32, stack u32 { k[14] = k14; + k[4] = k4; k = __half_round_inline_ref(k, 0, 4, 8, 12, 2, 6, 10, 14); k14 = k[14]; + k4 = k[4]; k[15] = k15; + k[5] = k5; + k = __half_round_inline_ref(k, 1, 5, 9, 13, 3, 7, 11, 15); @@ -166,35 +188,42 @@ inline fn __double_round_inline_ref(reg u32[16] k, stack u32 k14 k15) -> reg u32 0, 5, 10, 15); k15 = k[15]; + k5 = k[5]; k[14] = k14; + k[4] = k4; k = __half_round_inline_ref(k, 2, 7, 8, 13, 3, 4, 9, 14); k14 = k[14]; + k4 = k[4]; - return k, k14, k15; + return k, k4, k5, k14, k15; } // used; inline fn __rounds_inline_ref(reg u32[16] k, stack u32 k15) -> reg u32[16], stack u32 { - stack u32 s_c k14; + stack u32 s_c k4 k5 k14; reg u32 c; k14 = k[14]; + k4 = k[4]; + k5 = k[5]; c = (CHACHA_ROUNDS/2); while { s_c = c; - k, k14, k15 = __double_round_inline_ref(k, k14, k15); + k, k4, k5, k14, k15 = __double_round_inline_ref(k, k4, k5, k14, k15); c = s_c; (_,_,_,_,c) = #DEC_32(c); } (c > 0) + k[4] = k4; + k[5] = k5; k[14] = k14; return k, k15; } diff --git a/src/crypto_stream/salsa20/common/amd64/ref/salsa20_core.jinc b/src/crypto_stream/salsa20/common/amd64/ref/salsa20_core.jinc index 91103b72..ed884d50 100644 --- a/src/crypto_stream/salsa20/common/amd64/ref/salsa20_core.jinc 
+++ b/src/crypto_stream/salsa20/common/amd64/ref/salsa20_core.jinc @@ -1,3 +1,4 @@ +from Jade require "common/doit.jinc" //__ROL32 // the following implementation requires: // - (even) param int SALSA20_ROUNDS; @@ -33,7 +34,7 @@ inline fn __line_ref(reg u32[16] k, inline int a b c r) -> reg u32[16] reg u32 t; t = k[b]; t += k[c]; - _, _, t = #ROL_32(t, r); + t = __ROL32x(t, r); k[a] ^= t; return k; } @@ -49,58 +50,81 @@ inline fn __quarter_round_ref(reg u32[16] k, inline int a b c d) -> reg u32[16] } -inline fn __column_round_ref(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32, stack u32 +inline fn __column_round_ref(reg u32[16] k, stack u32 k2 k3 k6) -> reg u32[16], stack u32, stack u32, stack u32 { - stack u32 k12 k13; + stack u32 k9 k12 k13; - k = __quarter_round_ref(k, 0, 4, 8, 12); k12 = k[12]; k[2] = k2; - k = __quarter_round_ref(k, 5, 9, 13, 1); k13 = k[13]; k[3] = k3; - k = __quarter_round_ref(k, 10, 14, 2, 6); + k = __quarter_round_ref(k, 0, 4, 8, 12); k12 = k[12]; + k = __quarter_round_ref(k, 5, 9, 13, 1); k9 = k[9]; k13 = k[13]; k[2] = k2; k[6] = k6; + k = __quarter_round_ref(k, 10, 14, 2, 6); k[3] = k3; k = __quarter_round_ref(k, 15, 3, 7, 11); - return k, k12, k13; + return k, k9, k12, k13; } -inline fn __line_round_ref(reg u32[16] k, stack u32 k12 k13) -> reg u32[16], stack u32, stack u32 +inline fn __line_round_ref(reg u32[16] k, stack u32 k9 k12 k13) -> reg u32[16], stack u32, stack u32, stack u32 { - stack u32 k2 k3; + stack u32 k2 k3 k6; - k = __quarter_round_ref(k, 0, 1, 2, 3); k2 = k[2]; k[12] = k12; - k = __quarter_round_ref(k, 5, 6, 7, 4); k3 = k[3]; k[13] = k13; - k = __quarter_round_ref(k, 10, 11, 8, 9); + k = __quarter_round_ref(k, 0, 1, 2, 3); k2 = k[2]; k3 = k[3]; + k = __quarter_round_ref(k, 5, 6, 7, 4); k6 = k[6]; k[9] = k9; + k = __quarter_round_ref(k, 10, 11, 8, 9); k[12] = k12; k[13] = k13; k = __quarter_round_ref(k, 15, 12, 13, 14); - return k, k2, k3; + return k, k2, k3, k6; } -inline fn __double_round_ref(reg u32[16] k, 
stack u32 k2 k3) -> reg u32[16], stack u32, stack u32 +// The function below requires the spillage of some state on the stack. +// ┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬─────────────────┐ +// │ 0│ 1│ 2│ 3│ 4│ 5│ 6│ 7│ 8│ 9│10│11│12│13│14│15│ Spilled values │ +// ┌────────┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼─────────────────┤ + - Value used +// │ Column │ +│ │ S│ S│ +│ │ S│ │ +│ │ │ │ +│ │ │ │ 3 │ +// │ round ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ S - Stack spills +// │ │ │ +│ S│ S│ │ +│ S│ │ │ +│ │ │ S│ +│ │ │ 4 │ +// │ ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ │ │ │ +│ S│ │ │ +│ │ │ S│ +│ │ S│ S│ +│ │ 4 │ +// │ ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ │ │ │ │ +│ │ │ │ +│ │ S│ │ +│ S│ S│ │ +│ 3 │ +// ├────────┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ Line │ +│ +│ +│ +│ │ │ │ │ │ S│ │ │ S│ S│ │ │ 3 │ +// │ round ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ │ │ │ S│ S│ +│ +│ +│ +│ │ S│ │ │ S│ S│ │ │ 5 │ +// │ ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ │ │ │ S│ S│ │ │ S│ │ +│ +│ +│ +│ S│ S│ │ │ 5 │ +// │ ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤ │ +// │ │ │ │ S│ S│ │ │ S│ │ │ │ │ │ +│ +│ +│ +│ 3 │ +// └────────┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴─────────────────┘ +// +inline fn __double_round_ref(reg u32[16] k, stack u32 k2 k3 k6) -> reg u32[16], stack u32, stack u32, stack u32 { - stack u32 k12 k13; + stack u32 k9 k12 k13; - k, k12, k13 = __column_round_ref(k, k2, k3); - k, k2, k3 = __line_round_ref(k, k12, k13); - return k, k2, k3; + k, k9, k12, k13 = __column_round_ref(k, k2, k3, k6); + k, k2, k3, k6 = __line_round_ref(k, k9, k12, k13); + return k, k2, k3, k6; } inline fn __rounds_ref(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32 { - stack u32 s_c k15; + stack u32 s_c k15 k6; reg u32 c; + k6 = k[6]; + c = (SALSA20_ROUNDS/2); while { s_c = c; - k, k2, k3 = __double_round_ref(k, k2, k3); + k, k2, k3, k6 = 
__double_round_ref(k, k2, k3, k6); c = s_c; (_,_,_,_,c) = #DEC_32(c); } (c > 0) - k15 = k[15]; k[2] = k2; k[3] = k3; + k15 = k[15]; k[2] = k2; k[3] = k3; k[6] = k6; return k, k15; } diff --git a/src/crypto_stream/xsalsa20/amd64/avx/Makefile b/src/crypto_stream/xsalsa20/amd64/avx/Makefile index a5c992e6..60659907 100644 --- a/src/crypto_stream/xsalsa20/amd64/avx/Makefile +++ b/src/crypto_stream/xsalsa20/amd64/avx/Makefile @@ -1,2 +1,3 @@ +override JFLAGS += -lazy-regalloc SRCS := stream.jazz include ../../../../Makefile.common diff --git a/src/crypto_stream/xsalsa20/amd64/avx2/Makefile b/src/crypto_stream/xsalsa20/amd64/avx2/Makefile index a5c992e6..60659907 100644 --- a/src/crypto_stream/xsalsa20/amd64/avx2/Makefile +++ b/src/crypto_stream/xsalsa20/amd64/avx2/Makefile @@ -1,2 +1,3 @@ +override JFLAGS += -lazy-regalloc SRCS := stream.jazz include ../../../../Makefile.common diff --git a/src/crypto_xof/shake128/amd64/avx2/shake128_4x.jinc b/src/crypto_xof/shake128/amd64/avx2/shake128_4x.jinc index d372b648..975484bd 100644 --- a/src/crypto_xof/shake128/amd64/avx2/shake128_4x.jinc +++ b/src/crypto_xof/shake128/amd64/avx2/shake128_4x.jinc @@ -14,6 +14,7 @@ inline fn __shake128_squeezeblock4x( { reg u256 t256; reg u128 t128; + reg u64 t64; inline int i; state = _keccakf1600_4x_avx2(state); @@ -21,11 +22,13 @@ inline fn __shake128_squeezeblock4x( for i = 0 to (SHAKE128_RATE / 8) { t256 = state[i]; t128 = (128u)t256; - h0[u64 i] = #VMOVLPD(t128); - h1[u64 i] = #VMOVHPD(t128); + h0[u64 i] = (64u)t128; + t128 = #VPSRLDQ(t128, 8); + h1[u64 i] = (64u)t128; t128 = #VEXTRACTI128(t256, 1); - h2[u64 i] = #VMOVLPD(t128); - h3[u64 i] = #VMOVHPD(t128); + h2[u64 i] = (64u)t128; + t128 = #VPSRLDQ(t128, 8); + h3[u64 i] = (64u)t128; } return state, h0, h1, h2, h3; diff --git a/src/crypto_xof/shake256/amd64/avx2/shake256_4x.jinc b/src/crypto_xof/shake256/amd64/avx2/shake256_4x.jinc index c4d1db6b..0688b7de 100644 --- a/src/crypto_xof/shake256/amd64/avx2/shake256_4x.jinc +++ 
b/src/crypto_xof/shake256/amd64/avx2/shake256_4x.jinc @@ -1,5 +1,5 @@ -from Jade require "common/keccak/keccak1600/amd64/avx2/keccakf1600_4x.jinc" +from Jade require "common/keccak/keccak1600/amd64/avx2/keccakf1600_4x.jinc" // _keccakf1600_4x_avx2_ from Jade require "common/keccak/common/fips202_params.jinc" // SHAKE256_RATE inline fn __shake256_squeezeblock4x( @@ -14,6 +14,7 @@ inline fn __shake256_squeezeblock4x( { reg u256 t256; reg u128 t128; + reg u64 t64; inline int i; state = _keccakf1600_4x_avx2(state); @@ -21,11 +22,13 @@ inline fn __shake256_squeezeblock4x( for i = 0 to (SHAKE256_RATE / 8) { t256 = state[i]; t128 = (128u)t256; - h0[u64 i] = #VMOVLPD(t128); - h1[u64 i] = #VMOVHPD(t128); + h0[u64 i] = (64u)t128; + t128 = #VPSRLDQ(t128, 8); + h1[u64 i] = (64u)t128; t128 = #VEXTRACTI128(t256, 1); - h2[u64 i] = #VMOVLPD(t128); - h3[u64 i] = #VMOVHPD(t128); + h2[u64 i] = (64u)t128; + t128 = #VPSRLDQ(t128, 8); + h3[u64 i] = (64u)t128; } return state, h0, h1, h2, h3;