diff --git a/.gitmodules b/.gitmodules index 226d184b..a0ef671b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "submodules/crypto-specs"] path = submodules/crypto-specs url = https://github.com/formosa-crypto/crypto-specs.git +[submodule "submodules/formosa-25519"] + path = submodules/formosa-25519 + url = https://github.com/formosa-crypto/formosa-25519.git diff --git a/src/Makefile.common b/src/Makefile.common index 9b1b272f..8280ceac 100644 --- a/src/Makefile.common +++ b/src/Makefile.common @@ -33,7 +33,7 @@ endif # -------------------------------------------------------------------- JEXT ?= jazz -override JFLAGS += -noinsertarraycopy +override JFLAGS += -noinsertarraycopy $(addprefix -slice ,$(FUNCTIONS)) JINCLUDE = -I Jade:$(SRC) JASMIN ?= jasminc diff --git a/src/crypto_kem/xwing/amd64/ref/scalarmult.jinc b/src/crypto_kem/xwing/amd64/ref/scalarmult.jinc index 98a12c78..b1b0b3d0 100644 --- a/src/crypto_kem/xwing/amd64/ref/scalarmult.jinc +++ b/src/crypto_kem/xwing/amd64/ref/scalarmult.jinc @@ -1,4 +1,4 @@ -from Jade require "crypto_scalarmult/curve25519/amd64/ref4/curve25519.jinc" +from Jade require "crypto_scalarmult/curve25519/amd64/ref4/scalarmult.jazz" fn xwing_x25519_base_p_rp(#spill_to_mmx reg u64 qp, reg ptr u8[32] _n) { diff --git a/src/crypto_scalarmult/curve25519/amd64/common/51/add5.jinc b/src/crypto_scalarmult/curve25519/amd64/common/51/add5.jinc deleted file mode 100644 index 28bda505..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/common/51/add5.jinc +++ /dev/null @@ -1,46 +0,0 @@ -inline fn __add5_rrs(reg u64[5] f, stack u64[5] g) -> reg u64[5] -{ - inline int i; - reg u64[5] h; - - h = #copy(f); - - h[0] += g[0]; - for i=1 to 5 - { h[i] += g[i]; } - - return h; -} - -inline fn __add5_sss(stack u64[5] fs gs) -> stack u64[5] -{ - stack u64[5] hs; - reg u64[5] h f; - - f = #copy(fs); - h = __add5_rrs(f, gs); - hs = #copy(h); - - return hs; -} - -inline fn __add5_ssr(stack u64[5] fs, reg u64[5] g) -> stack u64[5] -{ - stack u64[5] hs; - reg u64[5] h; - - h = __add5_rrs(g, fs); - hs = #copy(h); - - return hs; -} - -inline fn __add5_rsr(stack u64[5] fs, reg u64[5] g) -> reg u64[5] -{ - reg u64[5] h; - - h = __add5_rrs(g, fs); - - return h; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/common/51/cswap5.jinc b/src/crypto_scalarmult/curve25519/amd64/common/51/cswap5.jinc deleted file mode 100644 index 4e0bae61..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/common/51/cswap5.jinc +++ /dev/null @@ -1,113 +0,0 @@ -inline fn __cswap5( - stack u64[5] x2, - reg u64[5] z2r, - stack u64[5] x3, - stack u64[5] z3, - reg u64 toswap) - -> - stack u64[5], - reg u64[5], - stack u64[5], - stack u64[5] -{ - inline int i; - reg u64[5] t4 x2r x3r; - reg u64 t mask; - - ?{}, mask = #set0(); - mask -= toswap; // if toswap == 1 mask = -1 or all bits at 1, 0 otherwise - - // swap between z2 and z3 - t4 = #copy(z2r); - for i=0 to 5 - { t4[i] ^= z3[i]; - t4[i] &= mask; } // t4 = z2 ^ z3 - - for i=0 to 5 - { z2r[i] ^= t4[i]; - t = z3[i]; - t ^= t4[i]; - z3[i] = t; } - - // swap between x2 and x3 - x3r = #copy(x3); - for i=0 to 5 { x2r[i] = x2[i]; - t = x3r[i]; - t ^= x2r[i]; - t &= mask; - x2r[i] ^= t; - x3r[i] ^= t; - x2[i] = x2r[i]; - x3[i] = x3r[i]; } - - return x2, z2r, x3, z3; -} - -inline fn __cswap5_ssss( - stack u64[5] xs, - stack u64[5] ys, - reg u64 swap) - -> - stack u64[5], - stack u64[5] -{ - inline int i; - reg u64[5] x y; - reg u64 t mask; - - x = #copy(xs); - - mask = 0; - mask -= swap; - - for i=0 to 5 - { - y[i] = ys[i]; - - t = x[i]; - 
t ^= y[i]; - t &= mask; - - x[i] ^= t; // ^ (x[i] ^ y[i]) if swap == 1 - y[i] ^= t; - - ys[i] = y[i]; - } - - xs = #copy(x); - - return xs, ys; -} - -inline fn __cswap5_rsrs( - reg u64[5] x, - stack u64[5] ys, - reg u64 swap) - -> - reg u64[5], - stack u64[5] -{ - inline int i; - reg u64[5] y; - reg u64 t mask; - - mask = 0; - mask -= swap; - - for i=0 to 5 - { - y[i] = ys[i]; - - t = x[i]; - t ^= y[i]; - t &= mask; - - x[i] ^= t; // ^ (x[i] ^ y[i]) if swap == 1 - y[i] ^= t; - - ys[i] = y[i]; - } - - return x, ys; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/common/51/decode_u5.jinc b/src/crypto_scalarmult/curve25519/amd64/common/51/decode_u5.jinc deleted file mode 100644 index bc467d49..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/common/51/decode_u5.jinc +++ /dev/null @@ -1,53 +0,0 @@ -inline fn __decode_u_coordinate5(reg u64[4] t) -> reg u64[5] -{ - reg u64[5] u; - reg u64 mask; - - mask = 0x7ffffffffffff; - - //u[0] = t[0] & mask; // 51; 13 left - u[0] = t[0]; - u[0] &= mask; - - //u[1] = (t[1] << 13) || (t[0] >> 51) & mask; // 38; 26 left - u[1] = t[1]; - u[1] <<= 13; - t[0] >>= 51; - u[1] |= t[0]; - u[1] &= mask; - - //u[2] = (t[2] << 26) || (t[1] >> 38) & mask; // 25; 39 left - u[2] = t[2]; - u[2] <<= 26; - t[1] >>= 38; - u[2] |= t[1]; - u[2] &= mask; - - //u[3] = (t[3] << 39) || (t[2] >> 25) & mask; // 12; '52' left - u[3] = t[3]; - u[3] <<= 39; - t[2] >>= 25; - u[3] |= t[2]; - u[3] &= mask; - - //u[4] = (t[3] >> 12) & mask; - u[4] = t[3]; - u[4] >>= 12; - u[4] &= mask; - - return u; -} - -inline fn __decode_u_coordinate_base5() -> reg u64[5] -{ - reg u64[5] u; - - u[0] = 9; - u[1] = 0; - u[2] = 0; - u[3] = 0; - u[4] = 0; - - return u; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/common/51/init_points5.jinc b/src/crypto_scalarmult/curve25519/amd64/common/51/init_points5.jinc deleted file mode 100644 index fe705c48..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/common/51/init_points5.jinc +++ /dev/null @@ -1,56 +0,0 @@ -inline fn __init_points5( - reg u64[5] initr) - -> - stack u64[5], - reg u64[5], - stack u64[5], - stack u64[5] -{ - inline int i; - stack u64[5] x2 x3 z3; - reg u64[5] z2r; - reg u64 z; - - ?{}, z = #set0(); - - x2[0] = 1; - z2r[0] = 0; - x3 = #copy(initr); - z3[0] = 1; - - for i=1 to 5 - { x2[i] = z; - z2r[i] = z; - z3[i] = z; - } - - // (1, 0, init, 1) - return x2, z2r, x3, z3; -} - -inline fn __init_points5_x3() - -> - stack u64[5], - reg u64[5], - stack u64[5] -{ - inline int i; - stack u64[5] f1s f3s; - reg u64[5] f2; - reg u64 z; - - ?{}, z = #set0(); - - f1s[0] = 1; - f2[0] = 1; - f3s[0] = 1; - - for i=1 to 5 - { f1s[i] = z; - f2[i] = z; - f3s[i] = z; - } - - return f1s, f2, f3s; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/common/51/sub5.jinc b/src/crypto_scalarmult/curve25519/amd64/common/51/sub5.jinc deleted file mode 100644 index 9ab467e8..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/common/51/sub5.jinc +++ /dev/null @@ -1,73 +0,0 @@ -inline fn __sub5_rrs(reg u64[5] f, stack u64[5] gs) -> reg u64[5] -{ - inline int i; - reg u64[5] h; - reg u64 _2p0 _2p1234; - - _2p0 = 0xfffffffffffda; - _2p1234 = 0xffffffffffffe; - - h = #copy(f); - h[0] += _2p0; - for i=1 to 5 - { h[i] += _2p1234; } - - for i=0 to 5 - { h[i] -= gs[i]; } - - return h; -} - -inline fn __sub5_sss(stack u64[5] fs gs) -> stack u64[5] -{ - stack u64[5] hs; - reg u64[5] h f; - - f = #copy(fs); - h = __sub5_rrs(f, gs); - hs = #copy(h); - - return hs; -} - -inline fn __sub5_rss(stack u64[5] fs gs) -> reg u64[5] -{ - reg u64[5] h f; - - 
f = #copy(fs); - h = __sub5_rrs(f, gs); - - return h; -} - -inline fn __sub5_rsr(stack u64[5] fs, reg u64[5] g) -> reg u64[5] -{ - inline int i; - reg u64[5] h; - reg u64 _2p0 _2p1234; - - _2p0 = 0xfffffffffffda; - _2p1234 = 0xffffffffffffe; - - h = #copy(fs); - h[0] += _2p0; - for i=1 to 5 - { h[i] += _2p1234; } - - for i=0 to 5 - { h[i] -= g[i]; } - - return h; -} - -inline fn __sub5_ssr(stack u64[5] fs, reg u64[5] g) -> stack u64[5] -{ - stack u64[5] hs; - reg u64[5] h; - - h = __sub5_rsr(fs, g); - hs = #copy(h); - - return hs; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/common/51/tobytes5.jinc b/src/crypto_scalarmult/curve25519/amd64/common/51/tobytes5.jinc deleted file mode 100644 index b74a2363..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/common/51/tobytes5.jinc +++ /dev/null @@ -1,86 +0,0 @@ -inline fn __tobytes5(reg u64[5] f) -> reg u64[4] -{ - reg bool eq; - reg u64 loop; - reg u64[4] h; - reg u64 t two51minus1 two51minus19; - - two51minus1 = 0x0007FFFFFFFFFFFF; - two51minus19 = two51minus1; - two51minus19 -= 18; - loop = 3; - - while(loop > 0){ - t = f[0]; - t >>= 51; - f[0] &= two51minus1; - f[1] += t; - t = f[1]; - t >>= 51; - f[1] &= two51minus1; - f[2] += t; - t = f[2]; - t >>= 51; - f[2] &= two51minus1; - f[3] += t; - t = f[3]; - t >>= 51; - f[3] &= two51minus1; - f[4] += t; - t = f[4]; - t >>= 51; - f[4] &= two51minus1; - t *= 19; - f[0] += t; - loop = loop - 1; - } - t = 1; - - //signed> 13); // 26 spent; 25 left - h[1] = f[2]; - h[1] <<= 38; - f[1] >>= 13; - h[1] |= f[1]; - - // h[2] = (f[3] << 25) || (f[2] >> 26); // 39 spent; 12 left - h[2] = f[3]; - h[2] <<= 25; - f[2] >>= 26; - h[2] |= f[2]; - - // h[3] = f[4] << 12 || (f[3] >> 39); // 51 spent; 0 left - h[3] = f[4]; - h[3] <<= 12; - f[3] >>= 39; - h[3] |= f[3]; - - return h; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/common/64/add4.jinc b/src/crypto_scalarmult/curve25519/amd64/common/64/add4.jinc deleted file mode 100644 index 24ae0d79..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/common/64/add4.jinc +++ /dev/null @@ -1,65 +0,0 @@ -// h = f + g -// h = 2**0*f0 + 2**64*f1 + 2**128*f2 + 2**192*f3 + -// 2**0*g0 + 2**64*g1 + 2**128*g2 + 2**192*g3 - -inline fn __add4_rrs(reg u64[4] f, stack u64[4] g) -> reg u64[4] -{ - inline int i; - reg bool cf; - reg u64[4] h; - reg u64 z; - - ?{}, z = #set0(); - - h = #copy(f); - - cf, h[0] += g[0]; - for i=1 to 4 - { cf, h[i] += g[i] + cf; } - - _, z -= z - cf; - z &= 38; - - cf, h[0] += z; - for i=1 to 4 - { cf, h[i] += 0 + cf; } - - _, z -= z - cf; - z &= 38; - h[0] += z; - - return h; -} - -inline fn __add4_sss(stack u64[4] fs gs) -> stack u64[4] -{ - stack u64[4] hs; - reg u64[4] h f; - - f = #copy(fs); - h = __add4_rrs(f, gs); - hs = #copy(h); - - return hs; -} - -inline fn __add4_ssr(stack u64[4] fs, reg u64[4] g) -> stack u64[4] -{ - stack u64[4] hs; - reg u64[4] h; - - h = __add4_rrs(g, fs); - hs = #copy(h); - - return hs; -} - -inline fn __add4_rsr(stack u64[4] fs, reg u64[4] g) -> reg u64[4] -{ - reg u64[4] h; - - h = __add4_rrs(g, fs); - - return h; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/common/64/cswap4.jinc b/src/crypto_scalarmult/curve25519/amd64/common/64/cswap4.jinc deleted file mode 100644 index d3f128d2..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/common/64/cswap4.jinc +++ /dev/null @@ -1,112 +0,0 @@ -inline fn __cswap4( - stack u64[4] x2, - reg u64[4] z2r, - stack u64[4] x3, - stack u64[4] z3, - reg u64 toswap) - -> - stack u64[4], - reg u64[4], - stack u64[4], - stack u64[4] -{ - inline int 
i; - reg u64[4] t4 x2r x3r z3r; - reg u64 t mask; - - ?{}, mask = #set0(); - mask -= toswap; // if toswap == 1 mask = -1 or all bits at 1, 0 otherwise - - // swap between z2 and z3 - z3r = #copy(z3); - t4 = #copy(z2r); - - for i=0 to 4 { t4[i] ^= z3r[i]; } // t4 = z2 ^ z3 - for i=0 to 4 { t4[i] &= mask; } // t4 = (z2 ^ z3) & mask --> if m==0 then t4 = {0} - for i=0 to 4 { z2r[i] ^= t4[i]; - z3r[i] ^= t4[i]; - z3[i] = z3r[i]; } - - // swap between x3r and z3 - x3r = #copy(x3); - - for i=0 to 4 { x2r[i] = x2[i]; - t = x3r[i]; - t ^= x2r[i]; - t &= mask; - x2r[i] ^= t; - x3r[i] ^= t; - x2[i] = x2r[i]; - x3[i] = x3r[i]; } - - return x2, z2r, x3, z3; -} - -inline fn __cswap4_ssss( - stack u64[4] xs, - stack u64[4] ys, - reg u64 swap) - -> - stack u64[4], - stack u64[4] -{ - inline int i; - reg u64[4] x y; - reg u64 t mask; - - x = #copy(xs); - - mask = 0; - mask -= swap; - - for i=0 to 4 - { - y[i] = ys[i]; - - t = x[i]; - t ^= y[i]; - t &= mask; - - x[i] ^= t; // ^ (x[i] ^ y[i]) if swap == 1 - y[i] ^= t; - - ys[i] = y[i]; - } - - xs = #copy(x); - - return xs, ys; -} - -inline fn __cswap4_rsrs( - reg u64[4] x, - stack u64[4] ys, - reg u64 swap) - -> - reg u64[4], - stack u64[4] -{ - inline int i; - reg u64[4] y; - reg u64 t mask; - - mask = 0; - mask -= swap; - - for i=0 to 4 - { - y[i] = ys[i]; - - t = x[i]; - t ^= y[i]; - t &= mask; - - x[i] ^= t; // ^ (x[i] ^ y[i]) if swap == 1 - y[i] ^= t; - - ys[i] = y[i]; - } - - return x, ys; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/common/64/decode_u4.jinc b/src/crypto_scalarmult/curve25519/amd64/common/64/decode_u4.jinc deleted file mode 100644 index 04b1029c..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/common/64/decode_u4.jinc +++ /dev/null @@ -1,18 +0,0 @@ -inline fn __decode_u_coordinate4(reg u64[4] u) -> reg u64[4] -{ - u[3] &= 0x7fffffffffffffff; - return u; -} - -inline fn __decode_u_coordinate_base4() -> reg u64[4] -{ - reg u64[4] u; - - u[0] = 9; - u[1] = 0; - u[2] = 0; - u[3] = 0; - - return u; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/common/64/init_points4.jinc b/src/crypto_scalarmult/curve25519/amd64/common/64/init_points4.jinc deleted file mode 100644 index e6d72583..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/common/64/init_points4.jinc +++ /dev/null @@ -1,56 +0,0 @@ -inline fn __init_points4( - reg u64[4] initr) - -> - stack u64[4], - reg u64[4], - stack u64[4], - stack u64[4] -{ - inline int i; - stack u64[4] x2 x3 z3; - reg u64[4] z2r; - reg u64 z; - - ?{}, z = #set0(); - - x2[0] = 1; - z2r[0] = 0; - x3 = #copy(initr); - z3[0] = 1; - - for i=1 to 4 - { x2[i] = z; - z2r[i] = z; - z3[i] = z; - } - - // (1, 0, init, 1) - return x2, z2r, x3, z3; -} - -inline fn __init_points4_x3() - -> - stack u64[4], - reg u64[4], - stack u64[4] -{ - inline int i; - stack u64[4] f1s f3s; - reg u64[4] f2; - reg u64 z; - - ?{}, z = #set0(); - - f1s[0] = 1; - f2[0] = 1; - f3s[0] = 1; - - for i=1 to 4 - { f1s[i] = z; - f2[i] = z; - f3s[i] = z; - } - - return f1s, f2, f3s; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/common/64/sub4.jinc b/src/crypto_scalarmult/curve25519/amd64/common/64/sub4.jinc deleted file mode 100644 index 8f6f66e6..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/common/64/sub4.jinc +++ /dev/null @@ -1,95 +0,0 @@ -// h = f - g -// h = (2**0*f0 + 2**64*f1 + 2**128*f2 + 2**192*f3) - -// (2**0*g0 + 2**64*g1 + 2**128*g2 + 2**192*g3) - -inline fn __sub4_rrs(reg u64[4] f, stack u64[4] gs) -> reg u64[4] -{ - inline int i; - reg bool cf; - reg u64[4] h; - reg u64 z; - - ?{}, z = 
#set0(); - - h = #copy(f); - - cf, h[0] -= gs[0]; - for i=1 to 4 - { cf, h[i] -= gs[i] - cf; } - - _, z -= z - cf; - z &= 38; - - cf, h[0] -= z; - for i=1 to 4 - { cf, h[i] -= 0 - cf; } - - _, z -= z - cf; - z &= 38; - h[0] -= z; - - return h; -} - -inline fn __sub4_sss(stack u64[4] fs gs) -> stack u64[4] -{ - stack u64[4] hs; - reg u64[4] h f; - - f = #copy(fs); - h = __sub4_rrs(f, gs); - hs = #copy(h); - - return hs; -} - -inline fn __sub4_rss(stack u64[4] fs gs) -> reg u64[4] -{ - reg u64[4] h f; - - f = #copy(fs); - h = __sub4_rrs(f, gs); - - return h; -} - -inline fn __sub4_rsr(stack u64[4] fs, reg u64[4] g) -> reg u64[4] -{ - inline int i; - reg bool cf; - reg u64[4] h; - reg u64 z; - - ?{}, z = #set0(); - - h = #copy(fs); - - cf, h[0] -= g[0]; - for i=1 to 4 - { cf, h[i] -= g[i] - cf; } - - _, z -= z - cf; - z &= 38; - - cf, h[0] -= z; - for i=1 to 4 - { cf, h[i] -= 0 - cf; } - - _, z -= z - cf; - z &= 38; - h[0] -= z; - - return h; -} - -inline fn __sub4_ssr(stack u64[4] fs, reg u64[4] g) -> stack u64[4] -{ - stack u64[4] hs; - reg u64[4] h; - - h = __sub4_rsr(fs, g); - hs = #copy(h); - - return hs; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/common/64/tobytes4.jinc b/src/crypto_scalarmult/curve25519/amd64/common/64/tobytes4.jinc deleted file mode 100644 index 3079462b..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/common/64/tobytes4.jinc +++ /dev/null @@ -1,30 +0,0 @@ -inline fn __tobytes4(reg u64[4] f) -> reg u64[4] -{ - reg bool cf; - reg u64 t; - - t = #LEA(f[3] + f[3]); - ?{}, f[3] = #SAR(f[3], 63); - t >>= 1; - f[3] &= 19; - f[3] += 19; - - cf, f[0] += f[3]; - cf, f[1] += 0 + cf; - cf, f[2] += 0 + cf; - _, t += 0 + cf; - - f[3] = #LEA(t + t); - ?{}, t = #SAR(t, 63); - f[3] >>= 1; - t = !t; - t &= 19; - - cf, f[0] -= t; - cf, f[1] -= 0 - cf; - cf, f[2] -= 0 - cf; - _, f[3] -= 0 - cf; - - return f; - -} diff --git a/src/crypto_scalarmult/curve25519/amd64/common/bit.jinc b/src/crypto_scalarmult/curve25519/amd64/common/bit.jinc deleted file mode 100644 index f4eb4f30..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/common/bit.jinc +++ /dev/null @@ -1,28 +0,0 @@ -inline fn __ith_bit(stack u8[32] k, reg u64 ctr) -> reg u64 -{ - reg u64 p bit; - - p = ctr; - p >>= 3; - bit = (64u) k[(int) p]; - - p = ctr; - p &= 7; - bit >>= (p & 63); - - bit &= 1; - - return bit; -} - -inline fn __next_bit(stack u64 k) -> reg u64, stack u64 -{ - reg bool cf; - reg u64 b one; - - ?{}, b = #set0(); - one = 1; - _, cf, _, _, _, k = #SHL(k, 1); - b = one if cf; - return b, k; -} diff --git a/src/crypto_scalarmult/curve25519/amd64/common/decode_scalar.jinc b/src/crypto_scalarmult/curve25519/amd64/common/decode_scalar.jinc deleted file mode 100644 index ce2e08bb..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/common/decode_scalar.jinc +++ /dev/null @@ -1,28 +0,0 @@ -inline fn __decode_scalar(reg u64[4] k) -> stack u8[32] -{ - inline int i; - stack u8[32] ks; - - for i=0 to 4 - { ks[u64 i] = k[i]; } - - ks[0] &= 0xf8; - ks[31] &= 0x7f; - ks[31] |= 0x40; - - return ks; -} - -inline fn __decode_scalar_shl1(reg u64[4] k) -> stack u64[4] -{ - stack u64[4] ks; - - k[3] <<= 1; - k[0] &= 0xfffffffffffffff8; - k[3] |= 0x8000000000000000; - - ks = #copy(k); - - return ks; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/common/load_store4.jinc b/src/crypto_scalarmult/curve25519/amd64/common/load_store4.jinc deleted file mode 100644 index d417cfba..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/common/load_store4.jinc +++ /dev/null @@ -1,19 +0,0 @@ -inline fn 
__load4(reg u64 p) -> reg u64[4] -{ - inline int i; - reg u64[4] a; - - for i=0 to 4 - { a[i] = [p + 8*i]; } - - return a; -} - -inline fn __store4(reg u64 p, reg u64[4] a) -{ - inline int i; - - for i=0 to 4 - { [p + 8*i] = a[i]; } -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/mulx/curve25519.jinc b/src/crypto_scalarmult/curve25519/amd64/mulx/curve25519.jinc deleted file mode 100644 index 7f2d397b..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/mulx/curve25519.jinc +++ /dev/null @@ -1,168 +0,0 @@ -from Jade require "crypto_scalarmult/curve25519/amd64/common/bit.jinc" -from Jade require "crypto_scalarmult/curve25519/amd64/common/decode_scalar.jinc" - -from Jade require "crypto_scalarmult/curve25519/amd64/common/64/decode_u4.jinc" -from Jade require "crypto_scalarmult/curve25519/amd64/common/64/init_points4.jinc" -from Jade require "crypto_scalarmult/curve25519/amd64/common/64/add4.jinc" -from Jade require "crypto_scalarmult/curve25519/amd64/common/64/sub4.jinc" -from Jade require "crypto_scalarmult/curve25519/amd64/common/64/cswap4.jinc" -from Jade require "crypto_scalarmult/curve25519/amd64/common/64/tobytes4.jinc" - -require "mul4.jinc" -require "sqr4.jinc" -require "invert4.jinc" - -inline fn __add_and_double4( - stack u64[4] init, - stack u64[4] x2, - reg u64[4] z2r, - stack u64[4] x3, - stack u64[4] z3) - -> - stack u64[4], - reg u64[4], - stack u64[4], - stack u64[4] -{ - stack u64[4] z2 t0 t1 t2; - reg u64[4] t1r; - - t0 = __sub4_ssr(x2, z2r); - x2 = __add4_ssr(x2, z2r); - - t1 = __sub4_sss(x3, z3); - z2 = __add4_sss(x3, z3); - - z3 = __mul4_sss(x2, t1); - z2 = __mul4_sss(z2, t0); - - t2 = __sqr4_ss(x2); - t1r = __sqr4_rs(t0); - - x3 = __add4_sss(z3, z2); - z2 = __sub4_sss(z3, z2); - - t0 = __sub4_ssr(t2, t1r); - x2 = __mul4_ssr(t2, t1r); - - z2 = __sqr4_ss(z2); - z3 = __mul4_a24_ss(t0, 121665); - x3 = __sqr4_ss(x3); - - t2 = __add4_sss(t2, z3); - z3 = __mul4_sss(init, z2); - z2r = __mul4_rss(t0, t2); - - return x2, z2r, x3, z3; -} - -inline fn __montgomery_ladder_step4( - stack u8[32] k, - stack u64[4] init, - stack u64[4] x2, - reg u64[4] z2r, - stack u64[4] x3, - stack u64[4] z3, - stack u64 swapped, - reg u64 ctr) - -> - stack u64[4], - reg u64[4], - stack u64[4], - stack u64[4], - stack u64 -{ - reg u64 toswap bit; - - bit = __ith_bit(k, ctr); - - toswap = swapped; - toswap ^= bit; - - x2, z2r, x3, z3 = __cswap4(x2, z2r, x3, z3, toswap); - swapped = bit; - - x2, z2r, x3, z3 = __add_and_double4(init, x2, z2r, x3, z3); - - return x2, z2r, x3, z3, swapped; -} - - -inline fn __montgomery_ladder4( - reg u64[4] u, - stack u8[32] k) - -> - stack u64[4], - reg u64[4] -{ - stack u64[4] us x2 x3 z3; - reg u64[4] z2r; - stack u64 swapped; - #spill_to_mmx reg u64 ctr; - - (x2,z2r,x3,z3) = __init_points4(u); - us = #copy(u); - - ctr = 255; - swapped = 0; - - while - { - ctr -= 1; - () = #spill(ctr); - - (x2, z2r, x3, z3, swapped) = - __montgomery_ladder_step4(k, us, x2, z2r, x3, z3, swapped, ctr); - - () = #unspill(ctr); - } (ctr > 0) - - return x2, z2r; -} - -inline fn __encode_point4(stack u64[4] x2, reg u64[4] z2r) -> reg u64[4] -{ - reg u64[4] r; - - z2r = __invert4(z2r); - r = __mul4_rsr(x2, z2r); - r = __tobytes4(r); - - return r; -} - -inline fn __curve25519_internal_mulx(stack u8[32] k, reg u64[4] u) -> reg u64[4] -{ - stack u64[4] x2; - reg u64[4] z2r r; - - (x2,z2r) = __montgomery_ladder4(u, k); - r = __encode_point4(x2,z2r); - - return r; -} - -inline fn __curve25519_mulx(reg u64[4] _k _u) -> reg u64[4] -{ - stack u8[32] k; - reg u64[4] u r; - - k = 
__decode_scalar(_k); - u = __decode_u_coordinate4(_u); - r = __curve25519_internal_mulx(k, u); - - return r; -} - -inline fn __curve25519_mulx_base(reg u64[4] _k) -> reg u64[4] -{ - stack u8[32] k; - reg u64[4] u r; - - k = __decode_scalar(_k); - u = __decode_u_coordinate_base4(); - r = __curve25519_internal_mulx(k, u); - - return r; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/mulx/invert4.jinc b/src/crypto_scalarmult/curve25519/amd64/mulx/invert4.jinc deleted file mode 100644 index a311cd71..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/mulx/invert4.jinc +++ /dev/null @@ -1,100 +0,0 @@ -require "mul4.jinc" -require "sqr4.jinc" - -inline fn __invert4(reg u64[4] f) -> reg u64[4] -{ - reg u32 i; - stack u64[4] fs t0s t1s t2s; - reg u64[4] t0 t1 t2 t3; - - fs = #copy(f); - - // z2 = z1^2^1 - t0 = _sqr4_rr_(f); - t0s = #copy(t0); - - // z8 = z2^2^2 - t1 = _sqr4_rr_(t0); - t1 = _sqr4_rr_(t1); - - // z9 = z1*z8 - t1 = _mul4_rsr_(fs,t1); - t1s = #copy(t1); - - // z11 = z2*z9 - t0 = _mul4_rsr_(t0s,t1); - t0s = #copy(t0); - - // z22 = z11^2^1 - t2 = _sqr4_rr_(t0); - - // z_5_0 = z9*z22 - t1 = _mul4_rsr_(t1s,t2); - t1s = #copy(t1); - - // z_10_5 = z_5_0^2^5 - t2 = _sqr4_rr_(t1); - i = 4/2; - t2 = _it_sqr4_x2_(t2, i); - t2s = #copy(t2); - - // z_10_0 = z_10_5*z_5_0 - t1 = _mul4_rsr_(t1s,t2); - t1s = #copy(t1); - - // z_20_10 = z_10_0^2^10 - i = 10/2; - t2 = _it_sqr4_x2_(t1, i); - - // z_20_0 = z_20_10*z_10_0 - t2 = _mul4_rsr_(t1s,t2); - t2s = #copy(t2); - - // z_40_20 = z_20_0^2^20 - i = 20/2; - t3 = _it_sqr4_x2_(t2, i); - - // z_40_0 = z_40_20*z_20_0 - t2 = _mul4_rsr_(t2s,t3); - - // z_50_10 = z_40_0^2^10 - i = 10/2; - t2 = _it_sqr4_x2_(t2, i); - - // z_50_0 = z_50_10*z_10_0 - t1 = _mul4_rsr_(t1s,t2); - t1s = #copy(t1); - - // z_100_50 = z_50_0^2^50 - i = 50/2; - t2 = _it_sqr4_x2_(t1, i); - - // z_100_0 = z_100_50*z_50_0 - t2 = _mul4_rsr_(t1s,t2); - t2s = #copy(t2); - - // z_200_100 = z_100_0^2^100 - i = 100/2; - t3 = _it_sqr4_x2_(t2, i); - - // z_200_0 = z_200_100*z_100_0 - t2 = _mul4_rsr_(t2s,t3); - - // z_250_50 = z_200_0^2^50 - i = 50/2; - t2 = _it_sqr4_x2_(t2, i); - - // z_250_0 = z_250_50*z_50_0 - t1 = _mul4_rsr_(t1s,t2); - - // z_255_5 = z_250_0^2^5 - i = 4/2; - t1 = _it_sqr4_x2_(t1, i); - t1 = _sqr4_rr_(t1); - - // z_255_21 = z_255_5*z11 - t1 = _mul4_rsr_(t0s,t1); - - return t1; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/mulx/mul4.jinc b/src/crypto_scalarmult/curve25519/amd64/mulx/mul4.jinc deleted file mode 100644 index 31d31a3b..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/mulx/mul4.jinc +++ /dev/null @@ -1,298 +0,0 @@ -require "reduce4.jinc" - -inline fn __mul4_c0 -( reg u64 f0, - reg u64[4] g, - reg u64 z, // zero - reg bool cf of // cf = 0 and of = 0 - ) -> - reg u64[4], - reg u64[4], - reg bool, - reg bool -{ - reg u64 lo; - reg u64[4] h r; - - (h[1], h[0]) = #MULX ( f0, g[0] ); - - ( h[2], lo ) = #MULX ( f0, g[1] ); - cf, h[1] = #ADCX ( h[1], lo, cf ); - - ( h[3], lo ) = #MULX ( f0, g[2] ); - cf, h[2] = #ADCX ( h[2], lo, cf ); - - ( r[0], lo ) = #MULX ( f0, g[3] ); - cf, h[3] = #ADCX ( h[3], lo, cf ); - - cf, r[0] = #ADCX ( r[0], z, cf ); // cf = 0 - - return h, r, cf, of; -} - -inline fn __mul4_c1 -( reg u64[4] h, - reg u64[4] r, - reg u64 f, - reg u64[4] g, - reg u64 z, // zero - reg bool cf of // cf = 0 and of = 0 - ) -> - reg u64[4], - reg u64[4], - reg bool, - reg bool -{ - reg u64 hi lo; - - ( hi, lo ) = #MULX ( f, g[0] ); - of, h[1] = #ADOX ( h[1], lo, of ); - cf, h[2] = #ADCX ( h[2], hi, cf ); - - ( hi, lo ) = #MULX ( f, g[1] ); - 
of, h[2] = #ADOX ( h[2], lo, of ); - cf, h[3] = #ADCX ( h[3], hi, cf ); - - ( hi, lo ) = #MULX ( f, g[2] ); - of, h[3] = #ADOX ( h[3], lo, of ); - cf, r[0] = #ADCX ( r[0], hi, cf ); - - ( r[1], lo ) = #MULX ( f, g[3] ); - of, r[0] = #ADOX ( r[0], lo, of); - - cf, r[1] = #ADCX ( r[1], z, cf); - of, r[1] = #ADOX ( r[1], z, of); - - return h, r, cf, of; -} - -inline fn __mul4_c2 -( reg u64[4] h, - reg u64[4] r, - reg u64 f, - reg u64[4] g, - reg u64 z, // zero - reg bool cf of // cf = 0 and of = 0 - ) -> - reg u64[4], - reg u64[4], - reg bool, - reg bool -{ - reg u64 hi lo; - - ( hi, lo ) = #MULX ( f, g[0] ); - of, h[2] = #ADOX ( h[2], lo, of ); - cf, h[3] = #ADCX ( h[3], hi, cf ); - - ( hi, lo ) = #MULX ( f, g[1] ); - of, h[3] = #ADOX ( h[3], lo, of ); - cf, r[0] = #ADCX ( r[0], hi, cf ); - - ( hi, lo ) = #MULX ( f, g[2] ); - of, r[0] = #ADOX ( r[0], lo, of ); - cf, r[1] = #ADCX ( r[1], hi, cf ); - - ( r[2], lo ) = #MULX ( f, g[3] ); - of, r[1] = #ADOX ( r[1], lo, of); - - cf, r[2] = #ADCX ( r[2], z, cf); - of, r[2] = #ADOX ( r[2], z, of); - - return h, r, cf, of; -} - -inline fn __mul4_c3 -( reg u64[4] h, - reg u64[4] r, - reg u64 f, - reg u64[4] g, - reg u64 z, // zero - reg bool cf of // cf = 0 and of = 0 - ) -> - reg u64[4], - reg u64[4], - reg bool, - reg bool -{ - reg u64 hi lo; - - ( hi, lo ) = #MULX ( f, g[0] ); - of, h[3] = #ADOX ( h[3], lo, of ); - cf, r[0] = #ADCX ( r[0], hi, cf ); - - ( hi, lo ) = #MULX ( f, g[1] ); - of, r[0] = #ADOX ( r[0], lo, of ); - cf, r[1] = #ADCX ( r[1], hi, cf ); - - ( hi, lo ) = #MULX ( f, g[2] ); - of, r[1] = #ADOX ( r[1], lo, of ); - cf, r[2] = #ADCX ( r[2], hi, cf ); - - ( r[3], lo ) = #MULX ( f, g[3] ); - of, r[2] = #ADOX ( r[2], lo, of); - - cf, r[3] = #ADCX ( r[3], z, cf); - of, r[3] = #ADOX ( r[3], z, of); - - return h, r, cf, of; -} - -inline fn __mul4_rsr(stack u64[4] fs, reg u64[4] g) -> reg u64[4] -{ - reg bool cf of; - reg u64[4] h r; - reg u64 _38 f z; - - of, cf, _, _, _, z = #set0(); - - f = fs[0]; - h, r, cf, of = __mul4_c0( f, g, z, cf, of); - - f = fs[1]; - h, r, cf, of = __mul4_c1(h, r, f, g, z, cf, of); - - f = fs[2]; - h, r, cf, of = __mul4_c2(h, r, f, g, z, cf, of); - - f = fs[3]; - h, r, cf, of = __mul4_c3(h, r, f, g, z, cf, of); - - _38 = 38; - h = __reduce4(h, r, _38, z, cf, of); - - return h; -} - -inline fn __mul4_rpr(reg ptr u64[4] fp, reg u64[4] g) -> reg u64[4] -{ - reg bool cf of; - reg u64[4] h r; - reg u64 _38 f z; - - of, cf, _, _, _, z = #set0(); - - f = fp[0]; - h, r, cf, of = __mul4_c0( f, g, z, cf, of); - - f = fp[1]; - h, r, cf, of = __mul4_c1(h, r, f, g, z, cf, of); - - f = fp[2]; - h, r, cf, of = __mul4_c2(h, r, f, g, z, cf, of); - - f = fp[3]; - h, r, cf, of = __mul4_c3(h, r, f, g, z, cf, of); - - _38 = 38; - h = __reduce4(h, r, _38, z, cf, of); - - return h; -} - -fn _mul4_rpr(reg ptr u64[4] fp, reg u64[4] g) -> reg u64[4] -{ - reg u64[4] h; - - h = __mul4_rpr(fp, g); - - return h; -} - -inline fn _mul4_rsr_(stack u64[4] _fs, reg u64[4] _g) -> reg u64[4] -{ - reg ptr u64[4] fp; - reg u64[4] _h h g; - - fp = _fs; - g = #copy(_g); - h = _mul4_rpr(fp, g); - _h = #copy(h); - - return _h; -} - -inline fn __mul4_ssr(stack u64[4] fs, reg u64[4] g) -> stack u64[4] -{ - stack u64[4] hs; - reg u64[4] h; - - h = __mul4_rsr(fs, g); - hs = #copy(h); - - return hs; -} - -inline fn __mul4_sss(stack u64[4] fs gs) -> stack u64[4] -{ - stack u64[4] hs; - reg u64[4] h g; - - g = #copy(gs); - h = __mul4_rsr(fs, g); - hs = #copy(h); - - return hs; -} - -inline fn __mul4_rss(stack u64[4] fs gs) -> reg u64[4] -{ - reg u64[4] h g; - - 
g = #copy(gs); - h = __mul4_rsr(fs, g); - - return h; -} - -// //////////////////////////////////////////////////////////////////////////// - -inline fn __mul4_a24_rs(stack u64[4] fs, inline u64 a24) -> reg u64[4] -{ - reg bool cf; - reg u64[4] h; - reg u64 c r0 lo; - - c = a24; - - (h[1], h[0]) = #MULX(c, fs[0]); - (h[2], lo) = #MULX(c, fs[1]); - - cf, h[1] += lo; - - (h[3], lo) = #MULX(c, fs[2]); - - cf, h[2] += lo + cf; - - (r0, lo) = #MULX(c, fs[3]); - - cf, h[3] += lo + cf; - - _, r0 += 0 + cf; - - _, _, _, _, _, r0 = #IMULri (r0, 38); - - cf, h[0] += r0; - cf, h[1] += 0 + cf; - cf, h[2] += 0 + cf; - cf, h[3] += 0 + cf; - - _, c -= c - cf; - - c &= 38; - h[0] += c; - - return h; -} - -inline fn __mul4_a24_ss(stack u64[4] fs, inline u64 a24) -> stack u64[4] -{ - stack u64[4] hs; - reg u64[4] h; - - h = __mul4_a24_rs(fs, a24); - hs = #copy(h); - - return hs; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/mulx/reduce4.jinc b/src/crypto_scalarmult/curve25519/amd64/mulx/reduce4.jinc deleted file mode 100644 index 6767216e..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/mulx/reduce4.jinc +++ /dev/null @@ -1,45 +0,0 @@ -inline fn __reduce4 -( reg u64[4] h, - reg u64[4] r, - reg u64 _38, - reg u64 z, // zero - reg bool cf of // cf = 0 and of = 0 -) -> reg u64[4] -{ - reg u64 hi lo; - - // - ( hi, lo ) = #MULX ( _38, r[0] ); - of, h[0] = #ADOX ( h[0], lo, of ); - cf, h[1] = #ADCX ( h[1], hi, cf ); - - ( hi, lo ) = #MULX ( _38, r[1] ); - of, h[1] = #ADOX ( h[1], lo, of ); - cf, h[2] = #ADCX ( h[2], hi, cf ); - - ( hi, lo ) = #MULX ( _38, r[2] ); - of, h[2] = #ADOX ( h[2], lo, of ); - cf, h[3] = #ADCX ( h[3], hi, cf ); - - ( r[0], lo ) = #MULX ( _38, r[3] ); - of, h[3] = #ADOX ( h[3], lo, of ); - - cf, r[0] = #ADCX ( r[0], z, cf ); - of, r[0] = #ADOX ( r[0], z, of ); - - // - _,_,_,_,_,lo = #IMULri ( r[0], 38 ); - - cf, h[0] += lo; - cf, h[1] += z + cf; - cf, h[2] += z + cf; - cf, h[3] += z + cf; - - // h[0] += (z - cf) & 38; - _, z -= z - cf; // if cf = 1 then z = 0xFF..FF else z = 0 - z &= 38; // if cf = 1 then z = 38 else z = 0 - h[0] += z; // - - return h; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/mulx/scalarmult.jazz b/src/crypto_scalarmult/curve25519/amd64/mulx/scalarmult.jazz index 26ca4ffb..24eec027 100644 --- a/src/crypto_scalarmult/curve25519/amd64/mulx/scalarmult.jazz +++ b/src/crypto_scalarmult/curve25519/amd64/mulx/scalarmult.jazz @@ -1,5 +1,1251 @@ -from Jade require "crypto_scalarmult/curve25519/amd64/common/load_store4.jinc" -require "curve25519.jinc" +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/load_store4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/load_store4.jinc +inline fn __load4(reg u64 p) -> reg u64[4] +{ + inline int i; + reg u64[4] a; + + for i=0 to 4 + { a[i] = [p + 8*i]; } + + return a; +} + +inline fn __store4(reg u64 p, reg u64[4] a) +{ + inline int i; + + for i=0 to 4 + { [p + 8*i] = a[i]; } +} + +//EOR# +//BOR#require "curve25519.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/mulx/curve25519.jinc +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/bit.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/bit.jinc +inline fn __ith_bit(stack u8[32] k, reg u64 ctr) -> reg u64 +{ + reg u64 p bit; + + p = ctr; + p >>= 3; + bit = (64u) k[(int) p]; + + p = ctr; + p &= 7; + bit >>= (p & 63); + + bit &= 1; + + return bit; +} + +inline fn __next_bit(stack u64 k) -> reg u64, stack u64 +{ + reg bool cf; + reg u64 b one; + + ?{}, b = #set0(); + 
one = 1; + _, cf, _, _, _, k = #SHL(k, 1); + b = one if cf; + return b, k; +} +//EOR# +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/decode_scalar.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/decode_scalar.jinc +inline fn __decode_scalar(reg u64[4] k) -> stack u8[32] +{ + inline int i; + stack u8[32] ks; + + for i=0 to 4 + { ks[u64 i] = k[i]; } + + ks[0] &= 0xf8; + ks[31] &= 0x7f; + ks[31] |= 0x40; + + return ks; +} + +inline fn __decode_scalar_shl1(reg u64[4] k) -> stack u64[4] +{ + stack u64[4] ks; + + k[3] <<= 1; + k[0] &= 0xfffffffffffffff8; + k[3] |= 0x8000000000000000; + + ks = #copy(k); + + return ks; +} + +//EOR# + +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/64/decode_u4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/64/decode_u4.jinc +inline fn __decode_u_coordinate4(reg u64[4] u) -> reg u64[4] +{ + u[3] &= 0x7fffffffffffffff; + return u; +} + +inline fn __decode_u_coordinate_base4() -> reg u64[4] +{ + reg u64[4] u; + + u[0] = 9; + u[1] = 0; + u[2] = 0; + u[3] = 0; + + return u; +} + +//EOR# +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/64/init_points4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/64/init_points4.jinc +inline fn __init_points4( + reg u64[4] initr) + -> + stack u64[4], + reg u64[4], + stack u64[4], + stack u64[4] +{ + inline int i; + stack u64[4] x2 x3 z3; + reg u64[4] z2r; + reg u64 z; + + ?{}, z = #set0(); + + x2[0] = 1; + z2r[0] = 0; + x3 = #copy(initr); + z3[0] = 1; + + for i=1 to 4 + { x2[i] = z; + z2r[i] = z; + z3[i] = z; + } + + // (1, 0, init, 1) + return x2, z2r, x3, z3; +} + +inline fn __init_points4_x3() + -> + stack u64[4], + reg u64[4], + stack u64[4] +{ + inline int i; + stack u64[4] f1s f3s; + reg u64[4] f2; + reg u64 z; + + ?{}, z = #set0(); + + f1s[0] = 1; + f2[0] = 1; + f3s[0] = 1; + + for i=1 to 4 + { f1s[i] = z; + f2[i] = z; + f3s[i] = z; + } + + return f1s, f2, f3s; +} + +//EOR# +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/64/add4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/64/add4.jinc +// h = f + g +// h = 2**0*f0 + 2**64*f1 + 2**128*f2 + 2**192*f3 + +// 2**0*g0 + 2**64*g1 + 2**128*g2 + 2**192*g3 + +inline fn __add4_rrs(reg u64[4] f, stack u64[4] g) -> reg u64[4] +{ + inline int i; + reg bool cf; + reg u64[4] h; + reg u64 z; + + ?{}, z = #set0(); + + h = #copy(f); + + cf, h[0] += g[0]; + for i=1 to 4 + { cf, h[i] += g[i] + cf; } + + _, z -= z - cf; + z &= 38; + + cf, h[0] += z; + for i=1 to 4 + { cf, h[i] += 0 + cf; } + + _, z -= z - cf; + z &= 38; + h[0] += z; + + return h; +} + +inline fn __add4_sss(stack u64[4] fs gs) -> stack u64[4] +{ + stack u64[4] hs; + reg u64[4] h f; + + f = #copy(fs); + h = __add4_rrs(f, gs); + hs = #copy(h); + + return hs; +} + +inline fn __add4_ssr(stack u64[4] fs, reg u64[4] g) -> stack u64[4] +{ + stack u64[4] hs; + reg u64[4] h; + + h = __add4_rrs(g, fs); + hs = #copy(h); + + return hs; +} + +inline fn __add4_rsr(stack u64[4] fs, reg u64[4] g) -> reg u64[4] +{ + reg u64[4] h; + + h = __add4_rrs(g, fs); + + return h; +} + +//EOR# +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/64/sub4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/64/sub4.jinc +// h = f - g +// h = (2**0*f0 + 2**64*f1 + 2**128*f2 + 2**192*f3) - +// (2**0*g0 + 2**64*g1 + 2**128*g2 + 2**192*g3) + +inline fn __sub4_rrs(reg u64[4] f, stack u64[4] gs) -> reg u64[4] +{ + inline int i; + reg bool cf; + reg 
u64[4] h; + reg u64 z; + + ?{}, z = #set0(); + + h = #copy(f); + + cf, h[0] -= gs[0]; + for i=1 to 4 + { cf, h[i] -= gs[i] - cf; } + + _, z -= z - cf; + z &= 38; + + cf, h[0] -= z; + for i=1 to 4 + { cf, h[i] -= 0 - cf; } + + _, z -= z - cf; + z &= 38; + h[0] -= z; + + return h; +} + +inline fn __sub4_sss(stack u64[4] fs gs) -> stack u64[4] +{ + stack u64[4] hs; + reg u64[4] h f; + + f = #copy(fs); + h = __sub4_rrs(f, gs); + hs = #copy(h); + + return hs; +} + +inline fn __sub4_rss(stack u64[4] fs gs) -> reg u64[4] +{ + reg u64[4] h f; + + f = #copy(fs); + h = __sub4_rrs(f, gs); + + return h; +} + +inline fn __sub4_rsr(stack u64[4] fs, reg u64[4] g) -> reg u64[4] +{ + inline int i; + reg bool cf; + reg u64[4] h; + reg u64 z; + + ?{}, z = #set0(); + + h = #copy(fs); + + cf, h[0] -= g[0]; + for i=1 to 4 + { cf, h[i] -= g[i] - cf; } + + _, z -= z - cf; + z &= 38; + + cf, h[0] -= z; + for i=1 to 4 + { cf, h[i] -= 0 - cf; } + + _, z -= z - cf; + z &= 38; + h[0] -= z; + + return h; +} + +inline fn __sub4_ssr(stack u64[4] fs, reg u64[4] g) -> stack u64[4] +{ + stack u64[4] hs; + reg u64[4] h; + + h = __sub4_rsr(fs, g); + hs = #copy(h); + + return hs; +} + +//EOR# +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/64/cswap4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/64/cswap4.jinc +inline fn __cswap4( + stack u64[4] x2, + reg u64[4] z2r, + stack u64[4] x3, + stack u64[4] z3, + reg u64 toswap) + -> + stack u64[4], + reg u64[4], + stack u64[4], + stack u64[4] +{ + inline int i; + reg u64[4] t4 x2r x3r z3r; + reg u64 t mask; + + ?{}, mask = #set0(); + mask -= toswap; // if toswap == 1 mask = -1 or all bits at 1, 0 otherwise + + // swap between z2 and z3 + z3r = #copy(z3); + t4 = #copy(z2r); + + for i=0 to 4 { t4[i] ^= z3r[i]; } // t4 = z2 ^ z3 + for i=0 to 4 { t4[i] &= mask; } // t4 = (z2 ^ z3) & mask --> if m==0 then t4 = {0} + for i=0 to 4 { z2r[i] ^= t4[i]; + z3r[i] ^= t4[i]; + z3[i] = z3r[i]; } + + // swap between x3r and z3 + x3r = #copy(x3); + + for i=0 to 4 { x2r[i] = x2[i]; + t = x3r[i]; + t ^= x2r[i]; + t &= mask; + x2r[i] ^= t; + x3r[i] ^= t; + x2[i] = x2r[i]; + x3[i] = x3r[i]; } + + return x2, z2r, x3, z3; +} + +inline fn __cswap4_ssss( + stack u64[4] xs, + stack u64[4] ys, + reg u64 swap) + -> + stack u64[4], + stack u64[4] +{ + inline int i; + reg u64[4] x y; + reg u64 t mask; + + x = #copy(xs); + + mask = 0; + mask -= swap; + + for i=0 to 4 + { + y[i] = ys[i]; + + t = x[i]; + t ^= y[i]; + t &= mask; + + x[i] ^= t; // ^ (x[i] ^ y[i]) if swap == 1 + y[i] ^= t; + + ys[i] = y[i]; + } + + xs = #copy(x); + + return xs, ys; +} + +inline fn __cswap4_rsrs( + reg u64[4] x, + stack u64[4] ys, + reg u64 swap) + -> + reg u64[4], + stack u64[4] +{ + inline int i; + reg u64[4] y; + reg u64 t mask; + + mask = 0; + mask -= swap; + + for i=0 to 4 + { + y[i] = ys[i]; + + t = x[i]; + t ^= y[i]; + t &= mask; + + x[i] ^= t; // ^ (x[i] ^ y[i]) if swap == 1 + y[i] ^= t; + + ys[i] = y[i]; + } + + return x, ys; +} + +//EOR# +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/64/tobytes4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/64/tobytes4.jinc +inline fn __tobytes4(reg u64[4] f) -> reg u64[4] +{ + reg bool cf; + reg u64 t; + + t = #LEA(f[3] + f[3]); + ?{}, f[3] = #SAR(f[3], 63); + t >>= 1; + f[3] &= 19; + f[3] += 19; + + cf, f[0] += f[3]; + cf, f[1] += 0 + cf; + cf, f[2] += 0 + cf; + _, t += 0 + cf; + + f[3] = #LEA(t + t); + ?{}, t = #SAR(t, 63); + f[3] >>= 1; + t = !t; + t &= 19; + + cf, f[0] -= t; + cf, f[1] -= 0 - 
cf; + cf, f[2] -= 0 - cf; + _, f[3] -= 0 - cf; + + return f; + +} +//EOR# + +//BOR#require "mul4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/mulx/mul4.jinc +//BOR#require "reduce4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/mulx/reduce4.jinc +inline fn __reduce4 +( reg u64[4] h, + reg u64[4] r, + reg u64 _38, + reg u64 z, // zero + reg bool cf of // cf = 0 and of = 0 +) -> reg u64[4] +{ + reg u64 hi lo; + + // + ( hi, lo ) = #MULX ( _38, r[0] ); + of, h[0] = #ADOX ( h[0], lo, of ); + cf, h[1] = #ADCX ( h[1], hi, cf ); + + ( hi, lo ) = #MULX ( _38, r[1] ); + of, h[1] = #ADOX ( h[1], lo, of ); + cf, h[2] = #ADCX ( h[2], hi, cf ); + + ( hi, lo ) = #MULX ( _38, r[2] ); + of, h[2] = #ADOX ( h[2], lo, of ); + cf, h[3] = #ADCX ( h[3], hi, cf ); + + ( r[0], lo ) = #MULX ( _38, r[3] ); + of, h[3] = #ADOX ( h[3], lo, of ); + + cf, r[0] = #ADCX ( r[0], z, cf ); + of, r[0] = #ADOX ( r[0], z, of ); + + // + _,_,_,_,_,lo = #IMULri ( r[0], 38 ); + + cf, h[0] += lo; + cf, h[1] += z + cf; + cf, h[2] += z + cf; + cf, h[3] += z + cf; + + // h[0] += (z - cf) & 38; + _, z -= z - cf; // if cf = 1 then z = 0xFF..FF else z = 0 + z &= 38; // if cf = 1 then z = 38 else z = 0 + h[0] += z; // + + return h; +} + +//EOR# + +inline fn __mul4_c0 +( reg u64 f0, + reg u64[4] g, + reg u64 z, // zero + reg bool cf of // cf = 0 and of = 0 + ) -> + reg u64[4], + reg u64[4], + reg bool, + reg bool +{ + reg u64 lo; + reg u64[4] h r; + + (h[1], h[0]) = #MULX ( f0, g[0] ); + + ( h[2], lo ) = #MULX ( f0, g[1] ); + cf, h[1] = #ADCX ( h[1], lo, cf ); + + ( h[3], lo ) = #MULX ( f0, g[2] ); + cf, h[2] = #ADCX ( h[2], lo, cf ); + + ( r[0], lo ) = #MULX ( f0, g[3] ); + cf, h[3] = #ADCX ( h[3], lo, cf ); + + cf, r[0] = #ADCX ( r[0], z, cf ); // cf = 0 + + return h, r, cf, of; +} + +inline fn __mul4_c1 +( reg u64[4] h, + reg u64[4] r, + reg u64 f, + reg u64[4] g, + reg u64 z, // zero + reg bool cf of // cf = 0 and of = 0 + ) -> + reg u64[4], + reg u64[4], + reg bool, + reg bool +{ + reg u64 hi lo; + + ( hi, lo ) = #MULX ( f, g[0] ); + of, h[1] = #ADOX ( h[1], lo, of ); + cf, h[2] = #ADCX ( h[2], hi, cf ); + + ( hi, lo ) = #MULX ( f, g[1] ); + of, h[2] = #ADOX ( h[2], lo, of ); + cf, h[3] = #ADCX ( h[3], hi, cf ); + + ( hi, lo ) = #MULX ( f, g[2] ); + of, h[3] = #ADOX ( h[3], lo, of ); + cf, r[0] = #ADCX ( r[0], hi, cf ); + + ( r[1], lo ) = #MULX ( f, g[3] ); + of, r[0] = #ADOX ( r[0], lo, of); + + cf, r[1] = #ADCX ( r[1], z, cf); + of, r[1] = #ADOX ( r[1], z, of); + + return h, r, cf, of; +} + +inline fn __mul4_c2 +( reg u64[4] h, + reg u64[4] r, + reg u64 f, + reg u64[4] g, + reg u64 z, // zero + reg bool cf of // cf = 0 and of = 0 + ) -> + reg u64[4], + reg u64[4], + reg bool, + reg bool +{ + reg u64 hi lo; + + ( hi, lo ) = #MULX ( f, g[0] ); + of, h[2] = #ADOX ( h[2], lo, of ); + cf, h[3] = #ADCX ( h[3], hi, cf ); + + ( hi, lo ) = #MULX ( f, g[1] ); + of, h[3] = #ADOX ( h[3], lo, of ); + cf, r[0] = #ADCX ( r[0], hi, cf ); + + ( hi, lo ) = #MULX ( f, g[2] ); + of, r[0] = #ADOX ( r[0], lo, of ); + cf, r[1] = #ADCX ( r[1], hi, cf ); + + ( r[2], lo ) = #MULX ( f, g[3] ); + of, r[1] = #ADOX ( r[1], lo, of); + + cf, r[2] = #ADCX ( r[2], z, cf); + of, r[2] = #ADOX ( r[2], z, of); + + return h, r, cf, of; +} + +inline fn __mul4_c3 +( reg u64[4] h, + reg u64[4] r, + reg u64 f, + reg u64[4] g, + reg u64 z, // zero + reg bool cf of // cf = 0 and of = 0 + ) -> + reg u64[4], + reg u64[4], + reg bool, + reg bool +{ + reg u64 hi lo; + + ( hi, lo ) = #MULX ( f, g[0] ); + of, h[3] = #ADOX ( h[3], lo, of ); + cf, r[0] = 
#ADCX ( r[0], hi, cf ); + + ( hi, lo ) = #MULX ( f, g[1] ); + of, r[0] = #ADOX ( r[0], lo, of ); + cf, r[1] = #ADCX ( r[1], hi, cf ); + + ( hi, lo ) = #MULX ( f, g[2] ); + of, r[1] = #ADOX ( r[1], lo, of ); + cf, r[2] = #ADCX ( r[2], hi, cf ); + + ( r[3], lo ) = #MULX ( f, g[3] ); + of, r[2] = #ADOX ( r[2], lo, of); + + cf, r[3] = #ADCX ( r[3], z, cf); + of, r[3] = #ADOX ( r[3], z, of); + + return h, r, cf, of; +} + +inline fn __mul4_rsr(stack u64[4] fs, reg u64[4] g) -> reg u64[4] +{ + reg bool cf of; + reg u64[4] h r; + reg u64 _38 f z; + + of, cf, _, _, _, z = #set0(); + + f = fs[0]; + h, r, cf, of = __mul4_c0( f, g, z, cf, of); + + f = fs[1]; + h, r, cf, of = __mul4_c1(h, r, f, g, z, cf, of); + + f = fs[2]; + h, r, cf, of = __mul4_c2(h, r, f, g, z, cf, of); + + f = fs[3]; + h, r, cf, of = __mul4_c3(h, r, f, g, z, cf, of); + + _38 = 38; + h = __reduce4(h, r, _38, z, cf, of); + + return h; +} + +inline fn __mul4_rpr(reg ptr u64[4] fp, reg u64[4] g) -> reg u64[4] +{ + reg bool cf of; + reg u64[4] h r; + reg u64 _38 f z; + + of, cf, _, _, _, z = #set0(); + + f = fp[0]; + h, r, cf, of = __mul4_c0( f, g, z, cf, of); + + f = fp[1]; + h, r, cf, of = __mul4_c1(h, r, f, g, z, cf, of); + + f = fp[2]; + h, r, cf, of = __mul4_c2(h, r, f, g, z, cf, of); + + f = fp[3]; + h, r, cf, of = __mul4_c3(h, r, f, g, z, cf, of); + + _38 = 38; + h = __reduce4(h, r, _38, z, cf, of); + + return h; +} + +fn _mul4_rpr(reg ptr u64[4] fp, reg u64[4] g) -> reg u64[4] +{ + reg u64[4] h; + + h = __mul4_rpr(fp, g); + + return h; +} + +inline fn _mul4_rsr_(stack u64[4] _fs, reg u64[4] _g) -> reg u64[4] +{ + reg ptr u64[4] fp; + reg u64[4] _h h g; + + fp = _fs; + g = #copy(_g); + h = _mul4_rpr(fp, g); + _h = #copy(h); + + return _h; +} + +inline fn __mul4_ssr(stack u64[4] fs, reg u64[4] g) -> stack u64[4] +{ + stack u64[4] hs; + reg u64[4] h; + + h = __mul4_rsr(fs, g); + hs = #copy(h); + + return hs; +} + +inline fn __mul4_sss(stack u64[4] fs gs) -> stack u64[4] +{ + stack u64[4] hs; + reg u64[4] h g; + + g = #copy(gs); + h = __mul4_rsr(fs, g); + hs = #copy(h); + + return hs; +} + +inline fn __mul4_rss(stack u64[4] fs gs) -> reg u64[4] +{ + reg u64[4] h g; + + g = #copy(gs); + h = __mul4_rsr(fs, g); + + return h; +} + +// //////////////////////////////////////////////////////////////////////////// + +inline fn __mul4_a24_rs(stack u64[4] fs, inline u64 a24) -> reg u64[4] +{ + reg bool cf; + reg u64[4] h; + reg u64 c r0 lo; + + c = a24; + + (h[1], h[0]) = #MULX(c, fs[0]); + (h[2], lo) = #MULX(c, fs[1]); + + cf, h[1] += lo; + + (h[3], lo) = #MULX(c, fs[2]); + + cf, h[2] += lo + cf; + + (r0, lo) = #MULX(c, fs[3]); + + cf, h[3] += lo + cf; + + _, r0 += 0 + cf; + + _, _, _, _, _, r0 = #IMULri (r0, 38); + + cf, h[0] += r0; + cf, h[1] += 0 + cf; + cf, h[2] += 0 + cf; + cf, h[3] += 0 + cf; + + _, c -= c - cf; + + c &= 38; + h[0] += c; + + return h; +} + +inline fn __mul4_a24_ss(stack u64[4] fs, inline u64 a24) -> stack u64[4] +{ + stack u64[4] hs; + reg u64[4] h; + + h = __mul4_a24_rs(fs, a24); + hs = #copy(h); + + return hs; +} + +//EOR# +//BOR#require "sqr4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/mulx/sqr4.jinc +//BOR#require "reduce4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/mulx/reduce4.jinc +//EOR# + +inline fn __sqr4_rr(reg u64[4] f) -> reg u64[4] +{ + reg bool cf of; + reg u64[8] t; + reg u64[4] h r; + reg u64 z _38 fx; + + of, cf, _, _, _, z = #set0(); + + // 0 + fx = f[0]; + + (t[1], h[0]) = #MULX ( fx, fx ); // f0*f0 + (h[2], h[1]) = #MULX ( fx, f[1] ); // f0*f1 + + (h[3], t[2]) = #MULX 
( fx, f[2] ); // f0*f2 + cf, h[2] = #ADCX ( h[2], t[2], cf ); + + (r[0], t[3]) = #MULX ( fx, f[3] ); // f0*f3 + cf, h[3] = #ADCX ( h[3], t[3], cf ); + + // 1 + fx = f[1]; + + (t[4], t[3]) = #MULX ( fx, f[2] ); // f1*f2 + + of, h[3] = #ADOX ( h[3], t[3], of ); + cf, r[0] = #ADCX ( r[0], t[4], cf ); + + (r[1], t[4]) = #MULX ( fx, f[3] ); // f1*f3 + of, r[0] = #ADOX ( r[0], t[4], of ); + + (t[3], t[2]) = #MULX ( fx, fx ); // f1*f1 + + // 2 + fx = f[2]; + + (r[2], t[5]) = #MULX ( fx, f[3] ); // f2*f3 + + cf, r[1] = #ADCX ( r[1], t[5], cf ); + of, r[1] = #ADOX ( r[1], z, of ); + + cf, r[2] = #ADCX ( r[2], z, cf ); // cf = 0 + of, r[2] = #ADOX ( r[2], z, of ); // of = 0 ?? TODO: VERIFYME + + (t[5], t[4]) = #MULX ( fx, fx ); // f2*f2 + + // 3 + fx = f[3]; + + (r[3], t[6]) = #MULX ( fx, fx ); // f3*f3 + + // + cf, h[1] = #ADCX ( h[1], h[1], cf ); + of, h[1] = #ADOX ( h[1], t[1], of ); + + cf, h[2] = #ADCX ( h[2], h[2], cf ); + of, h[2] = #ADOX ( h[2], t[2], of ); + + cf, h[3] = #ADCX ( h[3], h[3], cf ); + of, h[3] = #ADOX ( h[3], t[3], of ); + + cf, r[0] = #ADCX ( r[0], r[0], cf ); + of, r[0] = #ADOX ( r[0], t[4], of ); + + cf, r[1] = #ADCX ( r[1], r[1], cf ); + of, r[1] = #ADOX ( r[1], t[5], of ); + + cf, r[2] = #ADCX ( r[2], r[2], cf ); + of, r[2] = #ADOX ( r[2], t[6], of ); + + cf, r[3] = #ADCX ( r[3], z, cf ); // cf = 0 + of, r[3] = #ADOX ( r[3], z, of ); // of = 0 ?? TODO: VERIFYME + + _38 = 38; + h = __reduce4(h, r, _38, z, cf, of); + + return h; +} + +fn _sqr4_rr(reg u64[4] f) -> reg u64[4] +{ + reg u64[4] h; + h = __sqr4_rr(f); + return h; +} + +inline fn _sqr4_rr_(reg u64[4] _f) -> reg u64[4] +{ + reg u64[4] _h h f; + + f = #copy(_f); + h = _sqr4_rr(f); + _h = #copy(h); + + return _h; +} + +inline fn __it_sqr4_x2(reg u64[4] f, reg u32 i) -> reg u64[4] +{ + reg bool zf; + reg u64[4] h; + stack u32 _i; + + while + { _i = i; + + h = __sqr4_rr(f); + f = __sqr4_rr(h); + + i = _i; + _,_,_,zf,i = #DEC_32(i); + } (!zf) + + return f; +} + +fn _it_sqr4_x2(reg u64[4] f, reg u32 i) -> reg u64[4] +{ + f = __it_sqr4_x2(f, i); + return f; +} + +inline fn _it_sqr4_x2_(reg u64[4] _f, reg u32 i) -> reg u64[4] +{ + reg u64[4] f; + f = #copy(_f); + f = _it_sqr4_x2(f, i); + return f; +} + + +inline fn __sqr4_ss(stack u64[4] fs) -> stack u64[4] +{ + stack u64[4] hs; + reg u64[4] f h; + + f = #copy(fs); + h = __sqr4_rr(f); + hs = #copy(h); + + return hs; +} + +inline fn __sqr4_sr(reg u64[4] f) -> stack u64[4] +{ + stack u64[4] hs; + reg u64[4] h; + + h = __sqr4_rr(f); + hs = #copy(h); + + return hs; +} + +inline fn __sqr4_rs(stack u64[4] fs) -> reg u64[4] +{ + reg u64[4] f h; + + f = #copy(fs); + h = __sqr4_rr(f); + + return h; +} + +//EOR# +//BOR#require "invert4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/mulx/invert4.jinc +//BOR#require "mul4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/mulx/mul4.jinc +//EOR# +//BOR#require "sqr4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/mulx/sqr4.jinc +//EOR# + +inline fn __invert4(reg u64[4] f) -> reg u64[4] +{ + reg u32 i; + stack u64[4] fs t0s t1s t2s; + reg u64[4] t0 t1 t2 t3; + + fs = #copy(f); + + // z2 = z1^2^1 + t0 = _sqr4_rr_(f); + t0s = #copy(t0); + + // z8 = z2^2^2 + t1 = _sqr4_rr_(t0); + t1 = _sqr4_rr_(t1); + + // z9 = z1*z8 + t1 = _mul4_rsr_(fs,t1); + t1s = #copy(t1); + + // z11 = z2*z9 + t0 = _mul4_rsr_(t0s,t1); + t0s = #copy(t0); + + // z22 = z11^2^1 + t2 = _sqr4_rr_(t0); + + // z_5_0 = z9*z22 + t1 = _mul4_rsr_(t1s,t2); + t1s = #copy(t1); + + // z_10_5 = z_5_0^2^5 + t2 = _sqr4_rr_(t1); + i = 4/2; + t2 = 
_it_sqr4_x2_(t2, i); + t2s = #copy(t2); + + // z_10_0 = z_10_5*z_5_0 + t1 = _mul4_rsr_(t1s,t2); + t1s = #copy(t1); + + // z_20_10 = z_10_0^2^10 + i = 10/2; + t2 = _it_sqr4_x2_(t1, i); + + // z_20_0 = z_20_10*z_10_0 + t2 = _mul4_rsr_(t1s,t2); + t2s = #copy(t2); + + // z_40_20 = z_20_0^2^20 + i = 20/2; + t3 = _it_sqr4_x2_(t2, i); + + // z_40_0 = z_40_20*z_20_0 + t2 = _mul4_rsr_(t2s,t3); + + // z_50_10 = z_40_0^2^10 + i = 10/2; + t2 = _it_sqr4_x2_(t2, i); + + // z_50_0 = z_50_10*z_10_0 + t1 = _mul4_rsr_(t1s,t2); + t1s = #copy(t1); + + // z_100_50 = z_50_0^2^50 + i = 50/2; + t2 = _it_sqr4_x2_(t1, i); + + // z_100_0 = z_100_50*z_50_0 + t2 = _mul4_rsr_(t1s,t2); + t2s = #copy(t2); + + // z_200_100 = z_100_0^2^100 + i = 100/2; + t3 = _it_sqr4_x2_(t2, i); + + // z_200_0 = z_200_100*z_100_0 + t2 = _mul4_rsr_(t2s,t3); + + // z_250_50 = z_200_0^2^50 + i = 50/2; + t2 = _it_sqr4_x2_(t2, i); + + // z_250_0 = z_250_50*z_50_0 + t1 = _mul4_rsr_(t1s,t2); + + // z_255_5 = z_250_0^2^5 + i = 4/2; + t1 = _it_sqr4_x2_(t1, i); + t1 = _sqr4_rr_(t1); + + // z_255_21 = z_255_5*z11 + t1 = _mul4_rsr_(t0s,t1); + + return t1; +} + +//EOR# + +inline fn __add_and_double4( + stack u64[4] init, + stack u64[4] x2, + reg u64[4] z2r, + stack u64[4] x3, + stack u64[4] z3) + -> + stack u64[4], + reg u64[4], + stack u64[4], + stack u64[4] +{ + stack u64[4] z2 t0 t1 t2; + reg u64[4] t1r; + + t0 = __sub4_ssr(x2, z2r); + x2 = __add4_ssr(x2, z2r); + + t1 = __sub4_sss(x3, z3); + z2 = __add4_sss(x3, z3); + + z3 = __mul4_sss(x2, t1); + z2 = __mul4_sss(z2, t0); + + t2 = __sqr4_ss(x2); + t1r = __sqr4_rs(t0); + + x3 = __add4_sss(z3, z2); + z2 = __sub4_sss(z3, z2); + + t0 = __sub4_ssr(t2, t1r); + x2 = __mul4_ssr(t2, t1r); + + z2 = __sqr4_ss(z2); + z3 = __mul4_a24_ss(t0, 121665); + x3 = __sqr4_ss(x3); + + t2 = __add4_sss(t2, z3); + z3 = __mul4_sss(init, z2); + z2r = __mul4_rss(t0, t2); + + return x2, z2r, x3, z3; +} + +inline fn __montgomery_ladder_step4( + stack u8[32] k, + stack u64[4] init, + stack u64[4] x2, + reg u64[4] z2r, + stack u64[4] x3, + stack u64[4] z3, + stack u64 swapped, + reg u64 ctr) + -> + stack u64[4], + reg u64[4], + stack u64[4], + stack u64[4], + stack u64 +{ + reg u64 toswap bit; + + bit = __ith_bit(k, ctr); + + toswap = swapped; + toswap ^= bit; + + x2, z2r, x3, z3 = __cswap4(x2, z2r, x3, z3, toswap); + swapped = bit; + + x2, z2r, x3, z3 = __add_and_double4(init, x2, z2r, x3, z3); + + return x2, z2r, x3, z3, swapped; +} + + +inline fn __montgomery_ladder4( + reg u64[4] u, + stack u8[32] k) + -> + stack u64[4], + reg u64[4] +{ + stack u64[4] us x2 x3 z3; + reg u64[4] z2r; + stack u64 swapped; + #spill_to_mmx reg u64 ctr; + + (x2,z2r,x3,z3) = __init_points4(u); + us = #copy(u); + + ctr = 255; + swapped = 0; + + while + { + ctr -= 1; + () = #spill(ctr); + + (x2, z2r, x3, z3, swapped) = + __montgomery_ladder_step4(k, us, x2, z2r, x3, z3, swapped, ctr); + + () = #unspill(ctr); + } (ctr > 0) + + return x2, z2r; +} + +inline fn __encode_point4(stack u64[4] x2, reg u64[4] z2r) -> reg u64[4] +{ + reg u64[4] r; + + z2r = __invert4(z2r); + r = __mul4_rsr(x2, z2r); + r = __tobytes4(r); + + return r; +} + +inline fn __curve25519_internal_mulx(stack u8[32] k, reg u64[4] u) -> reg u64[4] +{ + stack u64[4] x2; + reg u64[4] z2r r; + + (x2,z2r) = __montgomery_ladder4(u, k); + r = __encode_point4(x2,z2r); + + return r; +} + +inline fn __curve25519_mulx(reg u64[4] _k _u) -> reg u64[4] +{ + stack u8[32] k; + reg u64[4] u r; + + k = __decode_scalar(_k); + u = __decode_u_coordinate4(_u); + r = __curve25519_internal_mulx(k, u); + + return 
r; +} + +inline fn __curve25519_mulx_base(reg u64[4] _k) -> reg u64[4] +{ + stack u8[32] k; + reg u64[4] u r; + + k = __decode_scalar(_k); + u = __decode_u_coordinate_base4(); + r = __curve25519_internal_mulx(k, u); + + return r; +} + +//EOR# export fn jade_scalarmult_curve25519_amd64_mulx(#spill_to_mmx reg u64 qp np pp) -> reg u64 { diff --git a/src/crypto_scalarmult/curve25519/amd64/mulx/sqr4.jinc b/src/crypto_scalarmult/curve25519/amd64/mulx/sqr4.jinc deleted file mode 100644 index 755f4bd8..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/mulx/sqr4.jinc +++ /dev/null @@ -1,167 +0,0 @@ -require "reduce4.jinc" - -inline fn __sqr4_rr(reg u64[4] f) -> reg u64[4] -{ - reg bool cf of; - reg u64[8] t; - reg u64[4] h r; - reg u64 z _38 fx; - - of, cf, _, _, _, z = #set0(); - - // 0 - fx = f[0]; - - (t[1], h[0]) = #MULX ( fx, fx ); // f0*f0 - (h[2], h[1]) = #MULX ( fx, f[1] ); // f0*f1 - - (h[3], t[2]) = #MULX ( fx, f[2] ); // f0*f2 - cf, h[2] = #ADCX ( h[2], t[2], cf ); - - (r[0], t[3]) = #MULX ( fx, f[3] ); // f0*f3 - cf, h[3] = #ADCX ( h[3], t[3], cf ); - - // 1 - fx = f[1]; - - (t[4], t[3]) = #MULX ( fx, f[2] ); // f1*f2 - - of, h[3] = #ADOX ( h[3], t[3], of ); - cf, r[0] = #ADCX ( r[0], t[4], cf ); - - (r[1], t[4]) = #MULX ( fx, f[3] ); // f1*f3 - of, r[0] = #ADOX ( r[0], t[4], of ); - - (t[3], t[2]) = #MULX ( fx, fx ); // f1*f1 - - // 2 - fx = f[2]; - - (r[2], t[5]) = #MULX ( fx, f[3] ); // f2*f3 - - cf, r[1] = #ADCX ( r[1], t[5], cf ); - of, r[1] = #ADOX ( r[1], z, of ); - - cf, r[2] = #ADCX ( r[2], z, cf ); // cf = 0 - of, r[2] = #ADOX ( r[2], z, of ); // of = 0 ?? TODO: VERIFYME - - (t[5], t[4]) = #MULX ( fx, fx ); // f2*f2 - - // 3 - fx = f[3]; - - (r[3], t[6]) = #MULX ( fx, fx ); // f3*f3 - - // - cf, h[1] = #ADCX ( h[1], h[1], cf ); - of, h[1] = #ADOX ( h[1], t[1], of ); - - cf, h[2] = #ADCX ( h[2], h[2], cf ); - of, h[2] = #ADOX ( h[2], t[2], of ); - - cf, h[3] = #ADCX ( h[3], h[3], cf ); - of, h[3] = #ADOX ( h[3], t[3], of ); - - cf, r[0] = #ADCX ( r[0], r[0], cf ); - of, r[0] = #ADOX ( r[0], t[4], of ); - - cf, r[1] = #ADCX ( r[1], r[1], cf ); - of, r[1] = #ADOX ( r[1], t[5], of ); - - cf, r[2] = #ADCX ( r[2], r[2], cf ); - of, r[2] = #ADOX ( r[2], t[6], of ); - - cf, r[3] = #ADCX ( r[3], z, cf ); // cf = 0 - of, r[3] = #ADOX ( r[3], z, of ); // of = 0 ?? 
TODO: VERIFYME - - _38 = 38; - h = __reduce4(h, r, _38, z, cf, of); - - return h; -} - -fn _sqr4_rr(reg u64[4] f) -> reg u64[4] -{ - reg u64[4] h; - h = __sqr4_rr(f); - return h; -} - -inline fn _sqr4_rr_(reg u64[4] _f) -> reg u64[4] -{ - reg u64[4] _h h f; - - f = #copy(_f); - h = _sqr4_rr(f); - _h = #copy(h); - - return _h; -} - -inline fn __it_sqr4_x2(reg u64[4] f, reg u32 i) -> reg u64[4] -{ - reg bool zf; - reg u64[4] h; - stack u32 _i; - - while - { _i = i; - - h = __sqr4_rr(f); - f = __sqr4_rr(h); - - i = _i; - _,_,_,zf,i = #DEC_32(i); - } (!zf) - - return f; -} - -fn _it_sqr4_x2(reg u64[4] f, reg u32 i) -> reg u64[4] -{ - f = __it_sqr4_x2(f, i); - return f; -} - -inline fn _it_sqr4_x2_(reg u64[4] _f, reg u32 i) -> reg u64[4] -{ - reg u64[4] f; - f = #copy(_f); - f = _it_sqr4_x2(f, i); - return f; -} - - -inline fn __sqr4_ss(stack u64[4] fs) -> stack u64[4] -{ - stack u64[4] hs; - reg u64[4] f h; - - f = #copy(fs); - h = __sqr4_rr(f); - hs = #copy(h); - - return hs; -} - -inline fn __sqr4_sr(reg u64[4] f) -> stack u64[4] -{ - stack u64[4] hs; - reg u64[4] h; - - h = __sqr4_rr(f); - hs = #copy(h); - - return hs; -} - -inline fn __sqr4_rs(stack u64[4] fs) -> reg u64[4] -{ - reg u64[4] f h; - - f = #copy(fs); - h = __sqr4_rr(f); - - return h; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/ref4/curve25519.jinc b/src/crypto_scalarmult/curve25519/amd64/ref4/curve25519.jinc deleted file mode 100644 index abc13cd7..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/ref4/curve25519.jinc +++ /dev/null @@ -1,200 +0,0 @@ -from Jade require "crypto_scalarmult/curve25519/amd64/common/bit.jinc" -from Jade require "crypto_scalarmult/curve25519/amd64/common/decode_scalar.jinc" -from Jade require "crypto_scalarmult/curve25519/amd64/common/load_store4.jinc" - -from Jade require "crypto_scalarmult/curve25519/amd64/common/64/decode_u4.jinc" -from Jade require "crypto_scalarmult/curve25519/amd64/common/64/init_points4.jinc" -from Jade require "crypto_scalarmult/curve25519/amd64/common/64/add4.jinc" -from Jade require "crypto_scalarmult/curve25519/amd64/common/64/sub4.jinc" -from Jade require "crypto_scalarmult/curve25519/amd64/common/64/cswap4.jinc" -from Jade require "crypto_scalarmult/curve25519/amd64/common/64/tobytes4.jinc" - -require "mul4.jinc" -require "sqr4.jinc" -require "invert4.jinc" - -inline fn __add_and_double4( - stack u64[4] init, - stack u64[4] x2, - reg u64[4] z2r, - stack u64[4] x3, - stack u64[4] z3) - -> - stack u64[4], - reg u64[4], - stack u64[4], - stack u64[4] -{ - stack u64[4] z2 t0 t1 t2; - - t0 = __sub4_ssr(x2, z2r); - x2 = __add4_ssr(x2, z2r); - - t1 = __sub4_sss(x3, z3); - z2 = __add4_sss(x3, z3); - - z3 = __mul4_sss(x2, t1); - z2 = __mul4_sss(z2, t0); - - t2 = __sqr4_ss(x2); - t1 = __sqr4_ss(t0); - - x3 = __add4_sss(z3, z2); - z2 = __sub4_sss(z3, z2); - - x2 = __mul4_sss(t2, t1); - t0 = __sub4_sss(t2, t1); - - z2 = __sqr4_ss(z2); - z3 = __mul4_a24_ss(t0, 121665); - x3 = __sqr4_ss(x3); - - t2 = __add4_sss(t2, z3); - z3 = __mul4_sss(init, z2); - z2r = __mul4_rss(t0, t2); - - return x2, z2r, x3, z3; -} - -inline fn __montgomery_ladder_step4( - stack u8[32] k, - stack u64[4] init, - stack u64[4] x2, - reg u64[4] z2r, - stack u64[4] x3, - stack u64[4] z3, - stack u64 swapped, - reg u64 ctr) - -> - stack u64[4], - reg u64[4], - stack u64[4], - stack u64[4], - stack u64 -{ - reg u64 toswap bit; - - bit = __ith_bit(k, ctr); - - toswap = swapped; - toswap ^= bit; - - x2, z2r, x3, z3 = __cswap4(x2, z2r, x3, z3, toswap); - swapped = bit; - - x2, z2r, x3, z3 = 
__add_and_double4(init, x2, z2r, x3, z3); - - return x2, z2r, x3, z3, swapped; -} - - -inline fn __montgomery_ladder4( - reg u64[4] u, - stack u8[32] k) - -> - stack u64[4], - reg u64[4] -{ - stack u64[4] us x2 x3 z3; - reg u64[4] z2r; - stack u64 swapped; - #spill_to_mmx reg u64 ctr; - - (x2,z2r,x3,z3) = __init_points4(u); - us = #copy(u); - - ctr = 255; - swapped = 0; - - while - { - ctr -= 1; - () = #spill(ctr); - - (x2, z2r, x3, z3, swapped) = - __montgomery_ladder_step4(k, us, x2, z2r, x3, z3, swapped, ctr); - - () = #unspill(ctr); - } (ctr > 0) - - return x2, z2r; -} - -inline fn __encode_point4(stack u64[4] x2, reg u64[4] z2r) -> reg u64[4] -{ - stack u64[4] z2; - reg u64[4] r; - - z2 = #copy(z2r); - z2 = __invert4(z2); - r = __mul4_rss(x2, z2); - r = __tobytes4(r); - - return r; -} - -inline fn __curve25519_internal_ref4(stack u8[32] k, reg u64[4] u) -> reg u64[4] -{ - stack u64[4] x2; - reg u64[4] z2r r; - - (x2,z2r) = __montgomery_ladder4(u, k); - r = __encode_point4(x2,z2r); - - return r; -} - -fn _curve25519_ref4(reg u64[4] _k _u) -> reg u64[4] -{ - stack u8[32] k; - reg u64[4] u r; - - k = __decode_scalar(_k); - u = __decode_u_coordinate4(_u); - r = __curve25519_internal_ref4(k, u); - - return r; -} - -inline fn __curve25519_ref4_ptr(#spill_to_mmx reg u64 rp, reg u64 kp up) -{ - reg u64[4] r k u; - - () = #spill(rp); - - k = __load4(kp); - u = __load4(up); - r = _curve25519_ref4(k, u); - - () = #unspill(rp); - - __store4(rp, r); -} - - -fn _curve25519_ref4_base(reg u64[4] _k) -> reg u64[4] -{ - stack u8[32] k; - reg u64[4] u r; - - k = __decode_scalar(_k); - u = __decode_u_coordinate_base4(); - r = __curve25519_internal_ref4(k, u); - - return r; -} - -inline fn __curve25519_ref4_base_ptr(#spill_to_mmx reg u64 rp, reg u64 kp) -{ - reg u64[4] r k; - - () = #spill(rp); - - k = __load4(kp); - r = _curve25519_ref4_base(k); - - () = #unspill(rp); - - __store4(rp, r); -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/ref4/invert4.jinc b/src/crypto_scalarmult/curve25519/amd64/ref4/invert4.jinc deleted file mode 100644 index df8ed434..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/ref4/invert4.jinc +++ /dev/null @@ -1,88 +0,0 @@ -require "mul4.jinc" -require "sqr4.jinc" - -inline fn __invert4(stack u64[4] fs) -> stack u64[4] -{ - stack u64[4] t0s t1s t2s t3s; - reg u32 i; - - // z2 = z1^2^1 - t0s = _sqr4_ss_(fs); - - // z8 = z2^2^2 - t1s = _sqr4_ss_(t0s); - t1s = _sqr4_s_(t1s); - - // z9 = z1*z8 - t1s = _mul4_ss_(t1s, fs); - - // z11 = z2*z9 - t0s = _mul4_ss_(t0s,t1s); - - // z22 = z11^2^1 - t2s = _sqr4_ss_(t0s); - - // z_5_0 = z9*z22 - t1s = _mul4_ss_(t1s,t2s); - - // z_10_5 = z_5_0^2^5 - t2s = _sqr4_ss_(t1s); - i = 4; - t2s = _it_sqr4_s_(t2s, i); - - // z_10_0 = z_10_5*z_5_0 - t1s = _mul4_ss_(t1s,t2s); - - // z_20_10 = z_10_0^2^10 - i = 10; - t2s = _it_sqr4_ss_(t2s, t1s, i); - - // z_20_0 = z_20_10*z_10_0 - t2s = _mul4_ss_(t2s, t1s); - - // z_40_20 = z_20_0^2^20 - i = 20; - t3s = _it_sqr4_ss_(t3s, t2s, i); - - // z_40_0 = z_40_20*z_20_0 - t2s = _mul4_ss_(t2s,t3s); - - // z_50_10 = z_40_0^2^10 - i = 10; - t2s = _it_sqr4_s_(t2s, i); - - // z_50_0 = z_50_10*z_10_0 - t1s = _mul4_ss_(t1s,t2s); - - // z_100_50 = z_50_0^2^50 - i = 50; - t2s = _it_sqr4_ss_(t2s, t1s, i); - - // z_100_0 = z_100_50*z_50_0 - t2s = _mul4_ss_(t2s, t1s); - - // z_200_100 = z_100_0^2^100 - i = 100; - t3s = _it_sqr4_ss_(t3s, t2s, i); - - // z_200_0 = z_200_100*z_100_0 - t2s = _mul4_ss_(t2s,t3s); - - // z_250_50 = z_200_0^2^50 - i = 50; - t2s = _it_sqr4_s_(t2s, i); - - // z_250_0 = z_250_50*z_50_0 - t1s 
= _mul4_ss_(t1s,t2s); - - // z_255_5 = z_250_0^2^5 - i = 4; - t1s = _it_sqr4_s_(t1s, i); - t1s = _sqr4_s_(t1s); - - // z_255_21 = z_255_5*z11 - t1s = _mul4_ss_(t1s, t0s); - - return t1s; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/ref4/mul4.jinc b/src/crypto_scalarmult/curve25519/amd64/ref4/mul4.jinc deleted file mode 100644 index 1534d6eb..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/ref4/mul4.jinc +++ /dev/null @@ -1,191 +0,0 @@ -require "reduce4.jinc" - -inline fn __mul4_rss(stack u64[4] xa ya) -> reg u64[4] -{ - reg u64[8] z; - reg u64[4] r x y; - reg u64 h l hprev; - reg bool cf; - inline int i j; - - for i = 2 to 8 { z[i] = #MOV(0); } - - x[0] = xa[0]; - for j = 0 to 4 { - y[j] = ya[j]; - h, l = y[j] * x[0]; - if (j == 0) { - z[0] = l; - z[1] = h; - } else { - cf, z[j] += l; - _, z[j + 1] += h + cf; - } - } - - for i = 1 to 4 { - x[i] = xa[i]; - for j = 0 to 4 { - y[j] = ya[j]; - h, l = y[j] * x[i]; - cf, z[i+j] += l; - if (j == 0) { - hprev = #MOV(0); - _, hprev += h + cf; - } else { - _, h += 0 + cf; - cf, z[i+j] += hprev; - if (1 <= j && j < 4 - 1) { - hprev = #MOV(0); - _, hprev += h + cf; - } else { /* j = 4 */ - cf, z[i + j + 1] += h + cf; - } - } - } - } - - r = __reduce4(z); - - return r; -} - -inline fn __mul4_sss(stack u64[4] xa ya) -> stack u64[4] -{ - stack u64[4] rs; - reg u64[4] r; - - r = __mul4_rss(xa, ya); - rs = #copy(r); - - return rs; -} - -// //////////////////////////////////////////////////////////////////////////// - -#[returnaddress="stack"] -fn _mul4_pp(reg ptr u64[4] xa ya) -> reg ptr u64[4] -{ - reg u64[8] z; - reg u64[4] r x y; - reg u64 h l hprev; - reg bool cf; - inline int i j; - - for i = 2 to 8 { z[i] = #MOV(0); } - - x[0] = xa[0]; - for j = 0 to 4 { - y[j] = ya[j]; - h, l = y[j] * x[0]; - if (j == 0) { - z[0] = l; - z[1] = h; - } else { - cf, z[j] += l; - _, z[j + 1] += h + cf; - } - } - - for i = 1 to 4 { - x[i] = xa[i]; - for j = 0 to 4 { - y[j] = ya[j]; - h, l = y[j] * x[i]; - cf, z[i+j] += l; - if (j == 0) { - hprev = #MOV(0); - _, hprev += h + cf; - } else { - _, h += 0 + cf; - cf, z[i+j] += hprev; - if (1 <= j && j < 4 - 1) { - hprev = #MOV(0); - _, hprev += h + cf; - } else { /* j = 4 */ - cf, z[i + j + 1] += h + cf; - } - } - } - } - - r = __reduce4(z); - - for i=0 to 4 - { xa[i] = r[i]; } - - return xa; -} - -inline fn _mul4_ss_(stack u64[4] xa ya) -> stack u64[4] -{ - reg ptr u64[4] xp yp; - - xp = xa; - yp = ya; - xp = _mul4_pp(xp, yp); - - xa = xp; - return xa; -} - -// //////////////////////////////////////////////////////////////////////////// - -inline fn __mul4_a24_rs(stack u64[4] xa, inline u64 a24) -> reg u64[4] -{ - reg u64 rax rdx c t1 t2 t3 t4; - reg u64[4] r; - reg bool cf; - - c = a24; - - rax = xa[0]; - rdx, rax = rax * c; - r[0] = rax; - r[1] = rdx; - - rax = xa[2]; - rdx, rax = rax * c; - r[2] = rax; - r[3] = rdx; - - rax = xa[1]; - rdx, rax = rax * c; - t1 = rax; - t2 = rdx; - - rax = xa[3]; - rdx, rax = rax * c; - t3 = rax; - t4 = rdx; - - cf, r[1] += t1; - cf, r[2] += t2 + cf; - cf, r[3] += t3 + cf; - _, t4 += 0 + cf; - _, t4 *= 38; - - cf, r[0] += t4; - cf, r[1] += 0 + cf; - cf, r[2] += 0 + cf; - cf, r[3] += 0 + cf; - - t1 = 38; - t2 = #MOV(0); - t1 = t2 if !cf; - r[0] += t1; - - return r; -} - -inline fn __mul4_a24_ss(stack u64[4] xa, inline u64 a24) -> stack u64[4] -{ - stack u64[4] rs; - reg u64[4] r; - - r = __mul4_a24_rs(xa, a24); - rs = #copy(r); - - return rs; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/ref4/reduce4.jinc b/src/crypto_scalarmult/curve25519/amd64/ref4/reduce4.jinc 
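The reduce4.jinc dropped here is re-inlined verbatim into scalarmult.jazz further down; its key trick is folding the upper four product limbs back in with the constant 38, which is valid because 2^256 ≡ 38 (mod 2^255 - 19). A minimal Python sanity check of that identity (illustrative only, not part of the patch):

    p = 2**255 - 19                    # Curve25519 field prime
    assert pow(2, 256, p) == 38        # 2^256 = 2*(p + 19) = 2p + 38
    # So an 8-limb product z = lo + 2^256*hi reduces to lo + 38*hi (mod p),
    # which is what __reduce4 computes with carries over 4x64-bit limbs.
    lo, hi = 3**100, 5**100            # arbitrary values below 2^256
    assert (lo + (hi << 256)) % p == (lo + 38 * hi) % p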
deleted file mode 100644 index f068f90f..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/ref4/reduce4.jinc +++ /dev/null @@ -1,57 +0,0 @@ -inline fn __reduce4(reg u64[8] z) -> reg u64[4] -{ - reg u64 z8 r0 r38 rax h l; - reg u64[4] r; - reg bool cf; - inline int i; - - r38 = 38; - - rax = z[4]; - h, l = rax * r38; - r[0] = l; - r[1] = h; - - rax = z[5]; - h, l = rax * r38; - cf, r[1] += l; - - r[2] = #MOV(0); - rax = z[6]; - _, r[2] += h + cf; - h, l = rax * r38; - cf, r[2] += l; - - r[3] = #MOV(0); - rax = z[7]; - _, r[3] += h + cf; - h, l = rax * r38; - cf, r[3] += l; - - z8 = #MOV(0); - _, z8 += h + cf; - - cf, r[0] += z[0]; - - for i = 1 to 4 { - cf, r[i] += z[i] + cf; - } - - _, z8 += 0 + cf; - z8 *= 38; - - r0 = #MOV(0); - - cf, r[0] += z8; - for i = 1 to 4 { - cf, r[i] += r0 + cf; - } - - _, r0 += r0 + cf; - - r0 *= 38; - r[0] += r0; - - return r; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/ref4/scalarmult.jazz b/src/crypto_scalarmult/curve25519/amd64/ref4/scalarmult.jazz index 261848dd..fdd8c19f 100644 --- a/src/crypto_scalarmult/curve25519/amd64/ref4/scalarmult.jazz +++ b/src/crypto_scalarmult/curve25519/amd64/ref4/scalarmult.jazz @@ -1,5 +1,1309 @@ -from Jade require "crypto_scalarmult/curve25519/amd64/common/load_store4.jinc" -require "curve25519.jinc" +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/load_store4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/load_store4.jinc +inline fn __load4(reg u64 p) -> reg u64[4] +{ + inline int i; + reg u64[4] a; + + for i=0 to 4 + { a[i] = [p + 8*i]; } + + return a; +} + +inline fn __store4(reg u64 p, reg u64[4] a) +{ + inline int i; + + for i=0 to 4 + { [p + 8*i] = a[i]; } +} + +//EOR# +//BOR#require "curve25519.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/ref4/curve25519.jinc +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/bit.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/bit.jinc +inline fn __ith_bit(stack u8[32] k, reg u64 ctr) -> reg u64 +{ + reg u64 p bit; + + p = ctr; + p >>= 3; + bit = (64u) k[(int) p]; + + p = ctr; + p &= 7; + bit >>= (p & 63); + + bit &= 1; + + return bit; +} + +inline fn __next_bit(stack u64 k) -> reg u64, stack u64 +{ + reg bool cf; + reg u64 b one; + + ?{}, b = #set0(); + one = 1; + _, cf, _, _, _, k = #SHL(k, 1); + b = one if cf; + return b, k; +} +//EOR# +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/decode_scalar.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/decode_scalar.jinc +inline fn __decode_scalar(reg u64[4] k) -> stack u8[32] +{ + inline int i; + stack u8[32] ks; + + for i=0 to 4 + { ks[u64 i] = k[i]; } + + ks[0] &= 0xf8; + ks[31] &= 0x7f; + ks[31] |= 0x40; + + return ks; +} + +inline fn __decode_scalar_shl1(reg u64[4] k) -> stack u64[4] +{ + stack u64[4] ks; + + k[3] <<= 1; + k[0] &= 0xfffffffffffffff8; + k[3] |= 0x8000000000000000; + + ks = #copy(k); + + return ks; +} + +//EOR# +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/load_store4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/load_store4.jinc +//EOR# + +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/64/decode_u4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/64/decode_u4.jinc +inline fn __decode_u_coordinate4(reg u64[4] u) -> reg u64[4] +{ + u[3] &= 0x7fffffffffffffff; + return u; +} + +inline fn __decode_u_coordinate_base4() -> reg u64[4] +{ + reg u64[4] u; + + u[0] = 9; + 
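+ // u = 9 is the base point's u-coordinate fixed by RFC 7748;
+ // the remaining limbs of the little-endian 4x64 encoding are zero.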
u[1] = 0; + u[2] = 0; + u[3] = 0; + + return u; +} + +//EOR# +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/64/init_points4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/64/init_points4.jinc +inline fn __init_points4( + reg u64[4] initr) + -> + stack u64[4], + reg u64[4], + stack u64[4], + stack u64[4] +{ + inline int i; + stack u64[4] x2 x3 z3; + reg u64[4] z2r; + reg u64 z; + + ?{}, z = #set0(); + + x2[0] = 1; + z2r[0] = 0; + x3 = #copy(initr); + z3[0] = 1; + + for i=1 to 4 + { x2[i] = z; + z2r[i] = z; + z3[i] = z; + } + + // (1, 0, init, 1) + return x2, z2r, x3, z3; +} + +inline fn __init_points4_x3() + -> + stack u64[4], + reg u64[4], + stack u64[4] +{ + inline int i; + stack u64[4] f1s f3s; + reg u64[4] f2; + reg u64 z; + + ?{}, z = #set0(); + + f1s[0] = 1; + f2[0] = 1; + f3s[0] = 1; + + for i=1 to 4 + { f1s[i] = z; + f2[i] = z; + f3s[i] = z; + } + + return f1s, f2, f3s; +} + +//EOR# +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/64/add4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/64/add4.jinc +// h = f + g +// h = 2**0*f0 + 2**64*f1 + 2**128*f2 + 2**192*f3 + +// 2**0*g0 + 2**64*g1 + 2**128*g2 + 2**192*g3 + +inline fn __add4_rrs(reg u64[4] f, stack u64[4] g) -> reg u64[4] +{ + inline int i; + reg bool cf; + reg u64[4] h; + reg u64 z; + + ?{}, z = #set0(); + + h = #copy(f); + + cf, h[0] += g[0]; + for i=1 to 4 + { cf, h[i] += g[i] + cf; } + + _, z -= z - cf; + z &= 38; + + cf, h[0] += z; + for i=1 to 4 + { cf, h[i] += 0 + cf; } + + _, z -= z - cf; + z &= 38; + h[0] += z; + + return h; +} + +inline fn __add4_sss(stack u64[4] fs gs) -> stack u64[4] +{ + stack u64[4] hs; + reg u64[4] h f; + + f = #copy(fs); + h = __add4_rrs(f, gs); + hs = #copy(h); + + return hs; +} + +inline fn __add4_ssr(stack u64[4] fs, reg u64[4] g) -> stack u64[4] +{ + stack u64[4] hs; + reg u64[4] h; + + h = __add4_rrs(g, fs); + hs = #copy(h); + + return hs; +} + +inline fn __add4_rsr(stack u64[4] fs, reg u64[4] g) -> reg u64[4] +{ + reg u64[4] h; + + h = __add4_rrs(g, fs); + + return h; +} + +//EOR# +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/64/sub4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/64/sub4.jinc +// h = f - g +// h = (2**0*f0 + 2**64*f1 + 2**128*f2 + 2**192*f3) - +// (2**0*g0 + 2**64*g1 + 2**128*g2 + 2**192*g3) + +inline fn __sub4_rrs(reg u64[4] f, stack u64[4] gs) -> reg u64[4] +{ + inline int i; + reg bool cf; + reg u64[4] h; + reg u64 z; + + ?{}, z = #set0(); + + h = #copy(f); + + cf, h[0] -= gs[0]; + for i=1 to 4 + { cf, h[i] -= gs[i] - cf; } + + _, z -= z - cf; + z &= 38; + + cf, h[0] -= z; + for i=1 to 4 + { cf, h[i] -= 0 - cf; } + + _, z -= z - cf; + z &= 38; + h[0] -= z; + + return h; +} + +inline fn __sub4_sss(stack u64[4] fs gs) -> stack u64[4] +{ + stack u64[4] hs; + reg u64[4] h f; + + f = #copy(fs); + h = __sub4_rrs(f, gs); + hs = #copy(h); + + return hs; +} + +inline fn __sub4_rss(stack u64[4] fs gs) -> reg u64[4] +{ + reg u64[4] h f; + + f = #copy(fs); + h = __sub4_rrs(f, gs); + + return h; +} + +inline fn __sub4_rsr(stack u64[4] fs, reg u64[4] g) -> reg u64[4] +{ + inline int i; + reg bool cf; + reg u64[4] h; + reg u64 z; + + ?{}, z = #set0(); + + h = #copy(fs); + + cf, h[0] -= g[0]; + for i=1 to 4 + { cf, h[i] -= g[i] - cf; } + + _, z -= z - cf; + z &= 38; + + cf, h[0] -= z; + for i=1 to 4 + { cf, h[i] -= 0 - cf; } + + _, z -= z - cf; + z &= 38; + h[0] -= z; + + return h; +} + +inline fn __sub4_ssr(stack u64[4] fs, reg u64[4] g) -> 
stack u64[4] +{ + stack u64[4] hs; + reg u64[4] h; + + h = __sub4_rsr(fs, g); + hs = #copy(h); + + return hs; +} + +//EOR# +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/64/cswap4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/64/cswap4.jinc +inline fn __cswap4( + stack u64[4] x2, + reg u64[4] z2r, + stack u64[4] x3, + stack u64[4] z3, + reg u64 toswap) + -> + stack u64[4], + reg u64[4], + stack u64[4], + stack u64[4] +{ + inline int i; + reg u64[4] t4 x2r x3r z3r; + reg u64 t mask; + + ?{}, mask = #set0(); + mask -= toswap; // if toswap == 1 mask = -1 or all bits at 1, 0 otherwise + + // swap between z2 and z3 + z3r = #copy(z3); + t4 = #copy(z2r); + + for i=0 to 4 { t4[i] ^= z3r[i]; } // t4 = z2 ^ z3 + for i=0 to 4 { t4[i] &= mask; } // t4 = (z2 ^ z3) & mask --> if m==0 then t4 = {0} + for i=0 to 4 { z2r[i] ^= t4[i]; + z3r[i] ^= t4[i]; + z3[i] = z3r[i]; } + + // swap between x3r and z3 + x3r = #copy(x3); + + for i=0 to 4 { x2r[i] = x2[i]; + t = x3r[i]; + t ^= x2r[i]; + t &= mask; + x2r[i] ^= t; + x3r[i] ^= t; + x2[i] = x2r[i]; + x3[i] = x3r[i]; } + + return x2, z2r, x3, z3; +} + +inline fn __cswap4_ssss( + stack u64[4] xs, + stack u64[4] ys, + reg u64 swap) + -> + stack u64[4], + stack u64[4] +{ + inline int i; + reg u64[4] x y; + reg u64 t mask; + + x = #copy(xs); + + mask = 0; + mask -= swap; + + for i=0 to 4 + { + y[i] = ys[i]; + + t = x[i]; + t ^= y[i]; + t &= mask; + + x[i] ^= t; // ^ (x[i] ^ y[i]) if swap == 1 + y[i] ^= t; + + ys[i] = y[i]; + } + + xs = #copy(x); + + return xs, ys; +} + +inline fn __cswap4_rsrs( + reg u64[4] x, + stack u64[4] ys, + reg u64 swap) + -> + reg u64[4], + stack u64[4] +{ + inline int i; + reg u64[4] y; + reg u64 t mask; + + mask = 0; + mask -= swap; + + for i=0 to 4 + { + y[i] = ys[i]; + + t = x[i]; + t ^= y[i]; + t &= mask; + + x[i] ^= t; // ^ (x[i] ^ y[i]) if swap == 1 + y[i] ^= t; + + ys[i] = y[i]; + } + + return x, ys; +} + +//EOR# +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/64/tobytes4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/64/tobytes4.jinc +inline fn __tobytes4(reg u64[4] f) -> reg u64[4] +{ + reg bool cf; + reg u64 t; + + t = #LEA(f[3] + f[3]); + ?{}, f[3] = #SAR(f[3], 63); + t >>= 1; + f[3] &= 19; + f[3] += 19; + + cf, f[0] += f[3]; + cf, f[1] += 0 + cf; + cf, f[2] += 0 + cf; + _, t += 0 + cf; + + f[3] = #LEA(t + t); + ?{}, t = #SAR(t, 63); + f[3] >>= 1; + t = !t; + t &= 19; + + cf, f[0] -= t; + cf, f[1] -= 0 - cf; + cf, f[2] -= 0 - cf; + _, f[3] -= 0 - cf; + + return f; + +} +//EOR# + +//BOR#require "mul4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/ref4/mul4.jinc +//BOR#require "reduce4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/ref4/reduce4.jinc +inline fn __reduce4(reg u64[8] z) -> reg u64[4] +{ + reg u64 z8 r0 r38 rax h l; + reg u64[4] r; + reg bool cf; + inline int i; + + r38 = 38; + + rax = z[4]; + h, l = rax * r38; + r[0] = l; + r[1] = h; + + rax = z[5]; + h, l = rax * r38; + cf, r[1] += l; + + r[2] = #MOV(0); + rax = z[6]; + _, r[2] += h + cf; + h, l = rax * r38; + cf, r[2] += l; + + r[3] = #MOV(0); + rax = z[7]; + _, r[3] += h + cf; + h, l = rax * r38; + cf, r[3] += l; + + z8 = #MOV(0); + _, z8 += h + cf; + + cf, r[0] += z[0]; + + for i = 1 to 4 { + cf, r[i] += z[i] + cf; + } + + _, z8 += 0 + cf; + z8 *= 38; + + r0 = #MOV(0); + + cf, r[0] += z8; + for i = 1 to 4 { + cf, r[i] += r0 + cf; + } + + _, r0 += r0 + cf; + + r0 *= 38; + r[0] += r0; + + return r; +} + +//EOR# + +inline fn __mul4_rss(stack u64[4] 
xa ya) -> reg u64[4] +{ + reg u64[8] z; + reg u64[4] r x y; + reg u64 h l hprev; + reg bool cf; + inline int i j; + + for i = 2 to 8 { z[i] = #MOV(0); } + + x[0] = xa[0]; + for j = 0 to 4 { + y[j] = ya[j]; + h, l = y[j] * x[0]; + if (j == 0) { + z[0] = l; + z[1] = h; + } else { + cf, z[j] += l; + _, z[j + 1] += h + cf; + } + } + + for i = 1 to 4 { + x[i] = xa[i]; + for j = 0 to 4 { + y[j] = ya[j]; + h, l = y[j] * x[i]; + cf, z[i+j] += l; + if (j == 0) { + hprev = #MOV(0); + _, hprev += h + cf; + } else { + _, h += 0 + cf; + cf, z[i+j] += hprev; + if (1 <= j && j < 4 - 1) { + hprev = #MOV(0); + _, hprev += h + cf; + } else { /* j = 4 */ + cf, z[i + j + 1] += h + cf; + } + } + } + } + + r = __reduce4(z); + + return r; +} + +inline fn __mul4_sss(stack u64[4] xa ya) -> stack u64[4] +{ + stack u64[4] rs; + reg u64[4] r; + + r = __mul4_rss(xa, ya); + rs = #copy(r); + + return rs; +} + +// //////////////////////////////////////////////////////////////////////////// + +#[returnaddress="stack"] +fn _mul4_pp(reg ptr u64[4] xa ya) -> reg ptr u64[4] +{ + reg u64[8] z; + reg u64[4] r x y; + reg u64 h l hprev; + reg bool cf; + inline int i j; + + for i = 2 to 8 { z[i] = #MOV(0); } + + x[0] = xa[0]; + for j = 0 to 4 { + y[j] = ya[j]; + h, l = y[j] * x[0]; + if (j == 0) { + z[0] = l; + z[1] = h; + } else { + cf, z[j] += l; + _, z[j + 1] += h + cf; + } + } + + for i = 1 to 4 { + x[i] = xa[i]; + for j = 0 to 4 { + y[j] = ya[j]; + h, l = y[j] * x[i]; + cf, z[i+j] += l; + if (j == 0) { + hprev = #MOV(0); + _, hprev += h + cf; + } else { + _, h += 0 + cf; + cf, z[i+j] += hprev; + if (1 <= j && j < 4 - 1) { + hprev = #MOV(0); + _, hprev += h + cf; + } else { /* j = 4 */ + cf, z[i + j + 1] += h + cf; + } + } + } + } + + r = __reduce4(z); + + for i=0 to 4 + { xa[i] = r[i]; } + + return xa; +} + +inline fn _mul4_ss_(stack u64[4] xa ya) -> stack u64[4] +{ + reg ptr u64[4] xp yp; + + xp = xa; + yp = ya; + xp = _mul4_pp(xp, yp); + + xa = xp; + return xa; +} + +// //////////////////////////////////////////////////////////////////////////// + +inline fn __mul4_a24_rs(stack u64[4] xa, inline u64 a24) -> reg u64[4] +{ + reg u64 rax rdx c t1 t2 t3 t4; + reg u64[4] r; + reg bool cf; + + c = a24; + + rax = xa[0]; + rdx, rax = rax * c; + r[0] = rax; + r[1] = rdx; + + rax = xa[2]; + rdx, rax = rax * c; + r[2] = rax; + r[3] = rdx; + + rax = xa[1]; + rdx, rax = rax * c; + t1 = rax; + t2 = rdx; + + rax = xa[3]; + rdx, rax = rax * c; + t3 = rax; + t4 = rdx; + + cf, r[1] += t1; + cf, r[2] += t2 + cf; + cf, r[3] += t3 + cf; + _, t4 += 0 + cf; + _, t4 *= 38; + + cf, r[0] += t4; + cf, r[1] += 0 + cf; + cf, r[2] += 0 + cf; + cf, r[3] += 0 + cf; + + t1 = 38; + t2 = #MOV(0); + t1 = t2 if !cf; + r[0] += t1; + + return r; +} + +inline fn __mul4_a24_ss(stack u64[4] xa, inline u64 a24) -> stack u64[4] +{ + stack u64[4] rs; + reg u64[4] r; + + r = __mul4_a24_rs(xa, a24); + rs = #copy(r); + + return rs; +} + +//EOR# +//BOR#require "sqr4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/ref4/sqr4.jinc +//BOR#require "reduce4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/ref4/reduce4.jinc +//EOR# + +inline fn __sqr4_rs(stack u64[4] xa) -> reg u64[4] +{ + reg u64 zero rax rdx; + reg u64[8] z; + reg u64[4] r; + reg u64[5] t; + reg bool cf; + + z[7] = #MOV(0); + zero = #MOV(0); + + // 2*x01 + 2*x02 + 2*x03 + 2*x12 + 2*x13 + 2*x23 + // + x00 + x11 + x22 + x33 + + rax = xa[1]; + rdx, rax = rax * xa[0]; + z[1] = rax; + z[2] = rdx; + + rax = xa[2]; + rdx, rax = rax * xa[1]; + z[3] = rax; + z[4] = rdx; + + rax = xa[3]; + rdx, rax 
= rax * xa[2]; + z[5] = rax; + z[6] = rdx; + + // [2*]x01 + 2*x02 + 2*x03 + [2*]x12 + 2*x13 + [2*]x23 + // + x00 + x11 + x22 + x33 + + rax = xa[2]; + rdx, rax = rax * xa[0]; + cf, z[2] += rax; + cf, z[3] += rdx + cf; + _, z[4] += zero + cf; + + rax = xa[3]; + rdx, rax = rax * xa[1]; + cf, z[4] += rax; + cf, z[5] += rdx + cf; + _, z[6] += zero + cf; + + // [2*]x01 + [2*]x02 + 2*x03 + [2*]x12 + [2*]x13 + [2*]x23 + // + x00 + x11 + x22 + x33 + + rax = xa[3]; + rdx, rax = rax * xa[0]; + cf, z[3] += rax; + cf, z[4] += rdx + cf; + cf, z[5] += zero + cf; + cf, z[6] += zero + cf; + _, z[7] += zero + cf; + + // x01 + x02 + x03 + x12 + x13 + x23 + // + x00 + x11 + x22 + x33 + + // set z<1..2n+1> = 2*z<1..2n+1> since + // we have summed all x_i*x_j with i<>j + // so far and these occur twice + cf, z[1] += z[1]; + cf, z[2] += z[2] + cf; + cf, z[3] += z[3] + cf; + cf, z[4] += z[4] + cf; + cf, z[5] += z[5] + cf; + cf, z[6] += z[6] + cf; + cf, z[7] += z[7] + cf; + + // x00 + x11 + x22 + x33 + + rax = xa[0]; + rdx, rax = rax * xa[0]; + z[0] = rax; + t[0] = rdx; + + rax = xa[1]; + rdx, rax = rax * xa[1]; + t[1] = rax; + t[2] = rdx; + + rax = xa[2]; + rdx, rax = rax * xa[2]; + t[3] = rax; + t[4] = rdx; + + cf, z[1] += t[0]; + cf, z[2] += t[1] + cf; + cf, z[3] += t[2] + cf; + cf, z[4] += t[3] + cf; + cf, z[5] += t[4] + cf; + cf, z[6] += 0 + cf; + _, z[7] += 0 + cf; + + rax = xa[3]; + rdx, rax = rax * xa[3]; + cf, z[6] += rax; + _, z[7] += rdx + cf; + + r = __reduce4(z); + + return r; +} + +inline fn __sqr4_ss(stack u64[4] xa) -> stack u64[4] +{ + stack u64[4] rs; + reg u64[4] r; + + r = __sqr4_rs(xa); + rs = #copy(r); + + return rs; +} + +// //////////////////////////////////////////////////////////////////////////// + +// TODO replace "-> reg ptr u64[4]" by "reg u64[4]" when r.a. 
@ f call +#[returnaddress="stack"] +fn _sqr4_p(reg ptr u64[4] xa) -> reg ptr u64[4] +{ + inline int i; + reg u64 zero rax rdx; + reg u64[8] z; + reg u64[4] r; + reg u64[5] t; + reg bool cf; + + z[7] = #MOV(0); + zero = #MOV(0); + + // 2*x01 + 2*x02 + 2*x03 + 2*x12 + 2*x13 + 2*x23 + // + x00 + x11 + x22 + x33 + + rax = xa[1]; + rdx, rax = rax * xa[0]; + z[1] = rax; + z[2] = rdx; + + rax = xa[2]; + rdx, rax = rax * xa[1]; + z[3] = rax; + z[4] = rdx; + + rax = xa[3]; + rdx, rax = rax * xa[2]; + z[5] = rax; + z[6] = rdx; + + // [2*]x01 + 2*x02 + 2*x03 + [2*]x12 + 2*x13 + [2*]x23 + // + x00 + x11 + x22 + x33 + + rax = xa[2]; + rdx, rax = rax * xa[0]; + cf, z[2] += rax; + cf, z[3] += rdx + cf; + _, z[4] += zero + cf; + + rax = xa[3]; + rdx, rax = rax * xa[1]; + cf, z[4] += rax; + cf, z[5] += rdx + cf; + _, z[6] += zero + cf; + + // [2*]x01 + [2*]x02 + 2*x03 + [2*]x12 + [2*]x13 + [2*]x23 + // + x00 + x11 + x22 + x33 + + rax = xa[3]; + rdx, rax = rax * xa[0]; + cf, z[3] += rax; + cf, z[4] += rdx + cf; + cf, z[5] += zero + cf; + cf, z[6] += zero + cf; + _, z[7] += zero + cf; + + // x01 + x02 + x03 + x12 + x13 + x23 + // + x00 + x11 + x22 + x33 + + // set z<1..2n+1> = 2*z<1..2n+1> since + // we have summed all x_i*x_j with i<>j + // so far and these occur twice + cf, z[1] += z[1]; + cf, z[2] += z[2] + cf; + cf, z[3] += z[3] + cf; + cf, z[4] += z[4] + cf; + cf, z[5] += z[5] + cf; + cf, z[6] += z[6] + cf; + cf, z[7] += z[7] + cf; + + // x00 + x11 + x22 + x33 + + rax = xa[0]; + rdx, rax = rax * xa[0]; + z[0] = rax; + t[0] = rdx; + + rax = xa[1]; + rdx, rax = rax * xa[1]; + t[1] = rax; + t[2] = rdx; + + rax = xa[2]; + rdx, rax = rax * xa[2]; + t[3] = rax; + t[4] = rdx; + + cf, z[1] += t[0]; + cf, z[2] += t[1] + cf; + cf, z[3] += t[2] + cf; + cf, z[4] += t[3] + cf; + cf, z[5] += t[4] + cf; + cf, z[6] += 0 + cf; + _, z[7] += 0 + cf; + + rax = xa[3]; + rdx, rax = rax * xa[3]; + cf, z[6] += rax; + _, z[7] += rdx + cf; + + r = __reduce4(z); + + for i=0 to 4 + { xa[i] = r[i]; } + + return xa; +} + +inline fn _sqr4_ss_(stack u64[4] xa) -> stack u64[4] +{ + inline int j; + stack u64[4] ra; + reg ptr u64[4] rp; + reg u64 t; + + for j=0 to 4 + { t = xa[j]; ra[j] = t; } + + rp = ra; + rp = _sqr4_p(rp); + ra = rp; + + return ra; +} + +inline fn _sqr4_s_(stack u64[4] x) -> stack u64[4] +{ + reg ptr u64[4] xp; + + xp = x; + xp = _sqr4_p(xp); + x = xp; + + return x; +} + +// //////////////////////////////////////////////////////////////////////////// + +#[returnaddress="stack"] +fn _it_sqr4_p(reg ptr u64[4] x, reg u32 i) -> reg ptr u64[4] +{ + reg bool zf; + + while { + x = _sqr4_p(x); + _,_,_,zf,i = #DEC_32(i); + }(!zf) + + return x; +} + +inline fn _it_sqr4_s_(stack u64[4] x, reg u32 i) -> stack u64[4] +{ + reg ptr u64[4] xp; + + xp = x; + xp = _it_sqr4_p(xp, i); + x = xp; + + return x; +} + +inline fn _it_sqr4_ss_(stack u64[4] r x, reg u32 i) -> stack u64[4] +{ + inline int j; + reg ptr u64[4] rp; + reg u64 t; + + for j=0 to 4 + { t = x[j]; r[j] = t; } + + rp = r; + rp = _it_sqr4_p(rp, i); + r = rp; + + return r; +} + +//EOR# +//BOR#require "invert4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/ref4/invert4.jinc +//BOR#require "mul4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/ref4/mul4.jinc +//EOR# +//BOR#require "sqr4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/ref4/sqr4.jinc +//EOR# + +inline fn __invert4(stack u64[4] fs) -> stack u64[4] +{ + stack u64[4] t0s t1s t2s t3s; + reg u32 i; + + // z2 = z1^2^1 + t0s = _sqr4_ss_(fs); + + // z8 = z2^2^2 + t1s = _sqr4_ss_(t0s); + 
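+ // Fermat inversion: the chain as a whole computes fs^(p-2) for
+ // p = 2^255 - 19, using 254 squarings and 11 multiplications. The
+ // z_a_b names in the comments stand for z1^(2^a - 2^b); the final
+ // z_255_21 = z1^(2^255 - 21) = z1^(p-2) = 1/z1 (mod p).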
t1s = _sqr4_s_(t1s); + + // z9 = z1*z8 + t1s = _mul4_ss_(t1s, fs); + + // z11 = z2*z9 + t0s = _mul4_ss_(t0s,t1s); + + // z22 = z11^2^1 + t2s = _sqr4_ss_(t0s); + + // z_5_0 = z9*z22 + t1s = _mul4_ss_(t1s,t2s); + + // z_10_5 = z_5_0^2^5 + t2s = _sqr4_ss_(t1s); + i = 4; + t2s = _it_sqr4_s_(t2s, i); + + // z_10_0 = z_10_5*z_5_0 + t1s = _mul4_ss_(t1s,t2s); + + // z_20_10 = z_10_0^2^10 + i = 10; + t2s = _it_sqr4_ss_(t2s, t1s, i); + + // z_20_0 = z_20_10*z_10_0 + t2s = _mul4_ss_(t2s, t1s); + + // z_40_20 = z_20_0^2^20 + i = 20; + t3s = _it_sqr4_ss_(t3s, t2s, i); + + // z_40_0 = z_40_20*z_20_0 + t2s = _mul4_ss_(t2s,t3s); + + // z_50_10 = z_40_0^2^10 + i = 10; + t2s = _it_sqr4_s_(t2s, i); + + // z_50_0 = z_50_10*z_10_0 + t1s = _mul4_ss_(t1s,t2s); + + // z_100_50 = z_50_0^2^50 + i = 50; + t2s = _it_sqr4_ss_(t2s, t1s, i); + + // z_100_0 = z_100_50*z_50_0 + t2s = _mul4_ss_(t2s, t1s); + + // z_200_100 = z_100_0^2^100 + i = 100; + t3s = _it_sqr4_ss_(t3s, t2s, i); + + // z_200_0 = z_200_100*z_100_0 + t2s = _mul4_ss_(t2s,t3s); + + // z_250_50 = z_200_0^2^50 + i = 50; + t2s = _it_sqr4_s_(t2s, i); + + // z_250_0 = z_250_50*z_50_0 + t1s = _mul4_ss_(t1s,t2s); + + // z_255_5 = z_250_0^2^5 + i = 4; + t1s = _it_sqr4_s_(t1s, i); + t1s = _sqr4_s_(t1s); + + // z_255_21 = z_255_5*z11 + t1s = _mul4_ss_(t1s, t0s); + + return t1s; +} + +//EOR# + +inline fn __add_and_double4( + stack u64[4] init, + stack u64[4] x2, + reg u64[4] z2r, + stack u64[4] x3, + stack u64[4] z3) + -> + stack u64[4], + reg u64[4], + stack u64[4], + stack u64[4] +{ + stack u64[4] z2 t0 t1 t2; + + t0 = __sub4_ssr(x2, z2r); + x2 = __add4_ssr(x2, z2r); + + t1 = __sub4_sss(x3, z3); + z2 = __add4_sss(x3, z3); + + z3 = __mul4_sss(x2, t1); + z2 = __mul4_sss(z2, t0); + + t2 = __sqr4_ss(x2); + t1 = __sqr4_ss(t0); + + x3 = __add4_sss(z3, z2); + z2 = __sub4_sss(z3, z2); + + x2 = __mul4_sss(t2, t1); + t0 = __sub4_sss(t2, t1); + + z2 = __sqr4_ss(z2); + z3 = __mul4_a24_ss(t0, 121665); + x3 = __sqr4_ss(x3); + + t2 = __add4_sss(t2, z3); + z3 = __mul4_sss(init, z2); + z2r = __mul4_rss(t0, t2); + + return x2, z2r, x3, z3; +} + +inline fn __montgomery_ladder_step4( + stack u8[32] k, + stack u64[4] init, + stack u64[4] x2, + reg u64[4] z2r, + stack u64[4] x3, + stack u64[4] z3, + stack u64 swapped, + reg u64 ctr) + -> + stack u64[4], + reg u64[4], + stack u64[4], + stack u64[4], + stack u64 +{ + reg u64 toswap bit; + + bit = __ith_bit(k, ctr); + + toswap = swapped; + toswap ^= bit; + + x2, z2r, x3, z3 = __cswap4(x2, z2r, x3, z3, toswap); + swapped = bit; + + x2, z2r, x3, z3 = __add_and_double4(init, x2, z2r, x3, z3); + + return x2, z2r, x3, z3, swapped; +} + + +inline fn __montgomery_ladder4( + reg u64[4] u, + stack u8[32] k) + -> + stack u64[4], + reg u64[4] +{ + stack u64[4] us x2 x3 z3; + reg u64[4] z2r; + stack u64 swapped; + #spill_to_mmx reg u64 ctr; + + (x2,z2r,x3,z3) = __init_points4(u); + us = #copy(u); + + ctr = 255; + swapped = 0; + + while + { + ctr -= 1; + () = #spill(ctr); + + (x2, z2r, x3, z3, swapped) = + __montgomery_ladder_step4(k, us, x2, z2r, x3, z3, swapped, ctr); + + () = #unspill(ctr); + } (ctr > 0) + + return x2, z2r; +} + +inline fn __encode_point4(stack u64[4] x2, reg u64[4] z2r) -> reg u64[4] +{ + stack u64[4] z2; + reg u64[4] r; + + z2 = #copy(z2r); + z2 = __invert4(z2); + r = __mul4_rss(x2, z2); + r = __tobytes4(r); + + return r; +} + +inline fn __curve25519_internal_ref4(stack u8[32] k, reg u64[4] u) -> reg u64[4] +{ + stack u64[4] x2; + reg u64[4] z2r r; + + (x2,z2r) = __montgomery_ladder4(u, k); + r = __encode_point4(x2,z2r); + + 
return r; +} + +fn _curve25519_ref4(reg u64[4] _k _u) -> reg u64[4] +{ + stack u8[32] k; + reg u64[4] u r; + + k = __decode_scalar(_k); + u = __decode_u_coordinate4(_u); + r = __curve25519_internal_ref4(k, u); + + return r; +} + +inline fn __curve25519_ref4_ptr(#spill_to_mmx reg u64 rp, reg u64 kp up) +{ + reg u64[4] r k u; + + () = #spill(rp); + + k = __load4(kp); + u = __load4(up); + r = _curve25519_ref4(k, u); + + () = #unspill(rp); + + __store4(rp, r); +} + + +fn _curve25519_ref4_base(reg u64[4] _k) -> reg u64[4] +{ + stack u8[32] k; + reg u64[4] u r; + + k = __decode_scalar(_k); + u = __decode_u_coordinate_base4(); + r = __curve25519_internal_ref4(k, u); + + return r; +} + +inline fn __curve25519_ref4_base_ptr(#spill_to_mmx reg u64 rp, reg u64 kp) +{ + reg u64[4] r k; + + () = #spill(rp); + + k = __load4(kp); + r = _curve25519_ref4_base(k); + + () = #unspill(rp); + + __store4(rp, r); +} + +//EOR# export fn jade_scalarmult_curve25519_amd64_ref4(#spill_to_mmx reg u64 qp np pp) -> reg u64 { diff --git a/src/crypto_scalarmult/curve25519/amd64/ref4/sqr4.jinc b/src/crypto_scalarmult/curve25519/amd64/ref4/sqr4.jinc deleted file mode 100644 index a5d57d15..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/ref4/sqr4.jinc +++ /dev/null @@ -1,299 +0,0 @@ -require "reduce4.jinc" - -inline fn __sqr4_rs(stack u64[4] xa) -> reg u64[4] -{ - reg u64 zero rax rdx; - reg u64[8] z; - reg u64[4] r; - reg u64[5] t; - reg bool cf; - - z[7] = #MOV(0); - zero = #MOV(0); - - // 2*x01 + 2*x02 + 2*x03 + 2*x12 + 2*x13 + 2*x23 - // + x00 + x11 + x22 + x33 - - rax = xa[1]; - rdx, rax = rax * xa[0]; - z[1] = rax; - z[2] = rdx; - - rax = xa[2]; - rdx, rax = rax * xa[1]; - z[3] = rax; - z[4] = rdx; - - rax = xa[3]; - rdx, rax = rax * xa[2]; - z[5] = rax; - z[6] = rdx; - - // [2*]x01 + 2*x02 + 2*x03 + [2*]x12 + 2*x13 + [2*]x23 - // + x00 + x11 + x22 + x33 - - rax = xa[2]; - rdx, rax = rax * xa[0]; - cf, z[2] += rax; - cf, z[3] += rdx + cf; - _, z[4] += zero + cf; - - rax = xa[3]; - rdx, rax = rax * xa[1]; - cf, z[4] += rax; - cf, z[5] += rdx + cf; - _, z[6] += zero + cf; - - // [2*]x01 + [2*]x02 + 2*x03 + [2*]x12 + [2*]x13 + [2*]x23 - // + x00 + x11 + x22 + x33 - - rax = xa[3]; - rdx, rax = rax * xa[0]; - cf, z[3] += rax; - cf, z[4] += rdx + cf; - cf, z[5] += zero + cf; - cf, z[6] += zero + cf; - _, z[7] += zero + cf; - - // x01 + x02 + x03 + x12 + x13 + x23 - // + x00 + x11 + x22 + x33 - - // set z<1..2n+1> = 2*z<1..2n+1> since - // we have summed all x_i*x_j with i<>j - // so far and these occur twice - cf, z[1] += z[1]; - cf, z[2] += z[2] + cf; - cf, z[3] += z[3] + cf; - cf, z[4] += z[4] + cf; - cf, z[5] += z[5] + cf; - cf, z[6] += z[6] + cf; - cf, z[7] += z[7] + cf; - - // x00 + x11 + x22 + x33 - - rax = xa[0]; - rdx, rax = rax * xa[0]; - z[0] = rax; - t[0] = rdx; - - rax = xa[1]; - rdx, rax = rax * xa[1]; - t[1] = rax; - t[2] = rdx; - - rax = xa[2]; - rdx, rax = rax * xa[2]; - t[3] = rax; - t[4] = rdx; - - cf, z[1] += t[0]; - cf, z[2] += t[1] + cf; - cf, z[3] += t[2] + cf; - cf, z[4] += t[3] + cf; - cf, z[5] += t[4] + cf; - cf, z[6] += 0 + cf; - _, z[7] += 0 + cf; - - rax = xa[3]; - rdx, rax = rax * xa[3]; - cf, z[6] += rax; - _, z[7] += rdx + cf; - - r = __reduce4(z); - - return r; -} - -inline fn __sqr4_ss(stack u64[4] xa) -> stack u64[4] -{ - stack u64[4] rs; - reg u64[4] r; - - r = __sqr4_rs(xa); - rs = #copy(r); - - return rs; -} - -// //////////////////////////////////////////////////////////////////////////// - -// TODO replace "-> reg ptr u64[4]" by "reg u64[4]" when r.a. 
@ f call -#[returnaddress="stack"] -fn _sqr4_p(reg ptr u64[4] xa) -> reg ptr u64[4] -{ - inline int i; - reg u64 zero rax rdx; - reg u64[8] z; - reg u64[4] r; - reg u64[5] t; - reg bool cf; - - z[7] = #MOV(0); - zero = #MOV(0); - - // 2*x01 + 2*x02 + 2*x03 + 2*x12 + 2*x13 + 2*x23 - // + x00 + x11 + x22 + x33 - - rax = xa[1]; - rdx, rax = rax * xa[0]; - z[1] = rax; - z[2] = rdx; - - rax = xa[2]; - rdx, rax = rax * xa[1]; - z[3] = rax; - z[4] = rdx; - - rax = xa[3]; - rdx, rax = rax * xa[2]; - z[5] = rax; - z[6] = rdx; - - // [2*]x01 + 2*x02 + 2*x03 + [2*]x12 + 2*x13 + [2*]x23 - // + x00 + x11 + x22 + x33 - - rax = xa[2]; - rdx, rax = rax * xa[0]; - cf, z[2] += rax; - cf, z[3] += rdx + cf; - _, z[4] += zero + cf; - - rax = xa[3]; - rdx, rax = rax * xa[1]; - cf, z[4] += rax; - cf, z[5] += rdx + cf; - _, z[6] += zero + cf; - - // [2*]x01 + [2*]x02 + 2*x03 + [2*]x12 + [2*]x13 + [2*]x23 - // + x00 + x11 + x22 + x33 - - rax = xa[3]; - rdx, rax = rax * xa[0]; - cf, z[3] += rax; - cf, z[4] += rdx + cf; - cf, z[5] += zero + cf; - cf, z[6] += zero + cf; - _, z[7] += zero + cf; - - // x01 + x02 + x03 + x12 + x13 + x23 - // + x00 + x11 + x22 + x33 - - // set z<1..2n+1> = 2*z<1..2n+1> since - // we have summed all x_i*x_j with i<>j - // so far and these occur twice - cf, z[1] += z[1]; - cf, z[2] += z[2] + cf; - cf, z[3] += z[3] + cf; - cf, z[4] += z[4] + cf; - cf, z[5] += z[5] + cf; - cf, z[6] += z[6] + cf; - cf, z[7] += z[7] + cf; - - // x00 + x11 + x22 + x33 - - rax = xa[0]; - rdx, rax = rax * xa[0]; - z[0] = rax; - t[0] = rdx; - - rax = xa[1]; - rdx, rax = rax * xa[1]; - t[1] = rax; - t[2] = rdx; - - rax = xa[2]; - rdx, rax = rax * xa[2]; - t[3] = rax; - t[4] = rdx; - - cf, z[1] += t[0]; - cf, z[2] += t[1] + cf; - cf, z[3] += t[2] + cf; - cf, z[4] += t[3] + cf; - cf, z[5] += t[4] + cf; - cf, z[6] += 0 + cf; - _, z[7] += 0 + cf; - - rax = xa[3]; - rdx, rax = rax * xa[3]; - cf, z[6] += rax; - _, z[7] += rdx + cf; - - r = __reduce4(z); - - for i=0 to 4 - { xa[i] = r[i]; } - - return xa; -} - -inline fn _sqr4_ss_(stack u64[4] xa) -> stack u64[4] -{ - inline int j; - stack u64[4] ra; - reg ptr u64[4] rp; - reg u64 t; - - for j=0 to 4 - { t = xa[j]; ra[j] = t; } - - rp = ra; - rp = _sqr4_p(rp); - ra = rp; - - return ra; -} - -inline fn _sqr4_s_(stack u64[4] x) -> stack u64[4] -{ - reg ptr u64[4] xp; - - xp = x; - xp = _sqr4_p(xp); - x = xp; - - return x; -} - -// //////////////////////////////////////////////////////////////////////////// - -#[returnaddress="stack"] -fn _it_sqr4_p(reg ptr u64[4] x, reg u32 i) -> reg ptr u64[4] -{ - reg bool zf; - - while { - x = _sqr4_p(x); - _,_,_,zf,i = #DEC_32(i); - }(!zf) - - return x; -} - -inline fn _it_sqr4_s_(stack u64[4] x, reg u32 i) -> stack u64[4] -{ - reg ptr u64[4] xp; - - xp = x; - xp = _it_sqr4_p(xp, i); - x = xp; - - return x; -} - -inline fn _it_sqr4_ss_(stack u64[4] r x, reg u32 i) -> stack u64[4] -{ - inline int j; - reg ptr u64[4] rp; - reg u64 t; - - for j=0 to 4 - { t = x[j]; r[j] = t; } - - rp = r; - rp = _it_sqr4_p(rp, i); - r = rp; - - return r; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/ref5/curve25519.jinc b/src/crypto_scalarmult/curve25519/amd64/ref5/curve25519.jinc deleted file mode 100644 index 2a289f2d..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/ref5/curve25519.jinc +++ /dev/null @@ -1,171 +0,0 @@ -from Jade require "crypto_scalarmult/curve25519/amd64/common/bit.jinc" -from Jade require "crypto_scalarmult/curve25519/amd64/common/decode_scalar.jinc" - -from Jade require 
"crypto_scalarmult/curve25519/amd64/common/51/decode_u5.jinc" -from Jade require "crypto_scalarmult/curve25519/amd64/common/51/init_points5.jinc" -from Jade require "crypto_scalarmult/curve25519/amd64/common/51/add5.jinc" -from Jade require "crypto_scalarmult/curve25519/amd64/common/51/sub5.jinc" -from Jade require "crypto_scalarmult/curve25519/amd64/common/51/cswap5.jinc" -from Jade require "crypto_scalarmult/curve25519/amd64/common/51/tobytes5.jinc" - -require "mul5.jinc" -require "sqr5.jinc" -require "invert5.jinc" - -inline fn __add_and_double5( - stack u64[5] init, - stack u64[5] x2, - reg u64[5] z2r, - stack u64[5] x3, - stack u64[5] z3) - -> - stack u64[5], - reg u64[5], - stack u64[5], - stack u64[5] -{ - stack u64[5] z2 t0 t1 t2; - - t0 = __sub5_ssr(x2, z2r); - x2 = __add5_ssr(x2, z2r); - - t1 = __sub5_sss(x3, z3); - z2 = __add5_sss(x3, z3); - - z3 = __mul5_sss(x2, t1); - z2 = __mul5_sss(z2, t0); - - t2 = __sqr5_ss(x2); - t1 = __sqr5_ss(t0); - - x3 = __add5_sss(z3, z2); - z2 = __sub5_sss(z3, z2); - - x2 = __mul5_sss(t2, t1); - t0 = __sub5_sss(t2, t1); - - z2 = __sqr5_ss(z2); - t2 = __mul5_a24_add_sss(t0, t2, 996679680); // 121665 * 2^13 - x3 = __sqr5_ss(x3); - z3 = __mul5_sss(init, z2); - z2r = __mul5_rss(t0, t2); - - return x2, z2r, x3, z3; -} - -inline fn __montgomery_ladder_step5( - stack u8[32] k, - stack u64[5] init, - stack u64[5] x2, - reg u64[5] z2r, - stack u64[5] x3, - stack u64[5] z3, - stack u64 swapped, - reg u64 ctr) - -> - stack u64[5], - reg u64[5], - stack u64[5], - stack u64[5], - stack u64 -{ - reg u64 toswap bit; - - bit = __ith_bit(k, ctr); - - toswap = swapped; - toswap ^= bit; - - x2, z2r, x3, z3 = __cswap5(x2, z2r, x3, z3, toswap); - swapped = bit; - - x2, z2r, x3, z3 = __add_and_double5(init, x2, z2r, x3, z3); - - return x2, z2r, x3, z3, swapped; -} - - -inline fn __montgomery_ladder5( - reg u64[5] u, - stack u8[32] k) - -> - stack u64[5], - reg u64[5] -{ - stack u64[5] us x2 x3 z3; - reg u64[5] z2r; - stack u64 swapped; - #spill_to_mmx reg u64 ctr; - - (x2,z2r,x3,z3) = __init_points5(u); - us = #copy(u); - - ctr = 255; - swapped = 0; - - while - { - ctr -= 1; - () = #spill(ctr); - - (x2, z2r, x3, z3, swapped) = - __montgomery_ladder_step5(k, us, x2, z2r, x3, z3, swapped, ctr); - - () = #unspill(ctr); - } (ctr > 0) - - return x2, z2r; -} - -inline fn __encode_point5(stack u64[5] x2, reg u64[5] z2r) -> reg u64[4] -{ - stack u64[5] z2; - reg u64[5] r1; - reg u64[4] r2; - - z2 = #copy(z2r); - z2 = __invert5(z2); - r1 = __mul5_rss(x2, z2); - r2 = __tobytes5(r1); - - return r2; -} - -inline fn __curve25519_internal_ref5(stack u8[32] k, reg u64[5] u) -> reg u64[4] -{ - stack u64[5] x2; - reg u64[5] z2r; - reg u64[4] r; - - (x2,z2r) = __montgomery_ladder5(u, k); - r = __encode_point5(x2,z2r); - - return r; -} - -inline fn __curve25519_ref5(reg u64[4] _k _u) -> reg u64[4] -{ - stack u8[32] k; - reg u64[5] u; - reg u64[4] r; - - k = __decode_scalar(_k); - u = __decode_u_coordinate5(_u); - r = __curve25519_internal_ref5(k, u); - - return r; -} - -inline fn __curve25519_ref5_base(reg u64[4] _k) -> reg u64[4] -{ - stack u8[32] k; - reg u64[5] u; - reg u64[4] r; - - k = __decode_scalar(_k); - u = __decode_u_coordinate_base5(); - r = __curve25519_internal_ref5(k, u); - - return r; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/ref5/invert5.jinc b/src/crypto_scalarmult/curve25519/amd64/ref5/invert5.jinc deleted file mode 100644 index 7350d40c..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/ref5/invert5.jinc +++ /dev/null @@ -1,88 +0,0 @@ -require 
"mul5.jinc" -require "sqr5.jinc" - -inline fn __invert5(stack u64[5] fs) -> stack u64[5] -{ - stack u64[5] t0s t1s t2s t3s; - reg u32 i; - - // z2 = z1^2^1 - t0s = _sqr5_ss_(fs); - - // z8 = z2^2^2 - t1s = _sqr5_ss_(t0s); - t1s = _sqr5_s_(t1s); - - // z9 = z1*z8 - t1s = _mul5_ss_(t1s, fs); - - // z11 = z2*z9 - t0s = _mul5_ss_(t0s,t1s); - - // z22 = z11^2^1 - t2s = _sqr5_ss_(t0s); - - // z_5_0 = z9*z22 - t1s = _mul5_ss_(t1s,t2s); - - // z_10_5 = z_5_0^2^5 - t2s = _sqr5_ss_(t1s); - i = 4; - t2s = _it_sqr5_s_(t2s, i); - - // z_10_0 = z_10_5*z_5_0 - t1s = _mul5_ss_(t1s,t2s); - - // z_20_10 = z_10_0^2^10 - i = 10; - t2s = _it_sqr5_ss_(t2s, t1s, i); - - // z_20_0 = z_20_10*z_10_0 - t2s = _mul5_ss_(t2s, t1s); - - // z_40_20 = z_20_0^2^20 - i = 20; - t3s = _it_sqr5_ss_(t3s, t2s, i); - - // z_40_0 = z_40_20*z_20_0 - t2s = _mul5_ss_(t2s,t3s); - - // z_50_10 = z_40_0^2^10 - i = 10; - t2s = _it_sqr5_s_(t2s, i); - - // z_50_0 = z_50_10*z_10_0 - t1s = _mul5_ss_(t1s,t2s); - - // z_100_50 = z_50_0^2^50 - i = 50; - t2s = _it_sqr5_ss_(t2s, t1s, i); - - // z_100_0 = z_100_50*z_50_0 - t2s = _mul5_ss_(t2s, t1s); - - // z_200_100 = z_100_0^2^100 - i = 100; - t3s = _it_sqr5_ss_(t3s, t2s, i); - - // z_200_0 = z_200_100*z_100_0 - t2s = _mul5_ss_(t2s,t3s); - - // z_250_50 = z_200_0^2^50 - i = 50; - t2s = _it_sqr5_s_(t2s, i); - - // z_250_0 = z_250_50*z_50_0 - t1s = _mul5_ss_(t1s,t2s); - - // z_255_5 = z_250_0^2^5 - i = 4; - t1s = _it_sqr5_s_(t1s, i); - t1s = _sqr5_s_(t1s); - - // z_255_21 = z_255_5*z11 - t1s = _mul5_ss_(t1s, t0s); - - return t1s; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/ref5/mul5.jinc b/src/crypto_scalarmult/curve25519/amd64/ref5/mul5.jinc deleted file mode 100644 index 8d1c379a..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/ref5/mul5.jinc +++ /dev/null @@ -1,418 +0,0 @@ -// code originally from amd64-51 qhasm implementation -// - and adapted from https://github.com/tfaoliveira/qhasm-translator/blob/master/test-cases/crypto_scalarmult/curve25519/amd64-51/fe25519_mul.mil - -inline fn __mul5_rss(stack u64[5] xa ya) -> reg u64[5] -{ - reg bool cf; - reg u64 mulr01 mulr11 mulr21 mulr31 mulr41; - reg u64 mulrax mulrdx; - reg u64 mulredmask; - reg u64 mult; - stack u64 mulx319_stack mulx419_stack; - reg u64[5] r; - - mulrax = xa[3]; - mulrax *= 19; - mulx319_stack = mulrax; - mulrdx, mulrax = mulrax * ya[2]; - r[0] = mulrax; - mulr01 = mulrdx; - mulrax = xa[4]; - mulrax *= 19; - mulx419_stack = mulrax; - mulrdx, mulrax = mulrax * ya[1]; - cf, r[0] += mulrax; - _, mulr01 += mulrdx + cf; - mulrax = xa[0]; - mulrdx, mulrax = mulrax * ya[0]; - cf, r[0] += mulrax; - _, mulr01 += mulrdx + cf; - mulrax = xa[0]; - mulrdx, mulrax = mulrax * ya[1]; - r[1] = mulrax; - mulr11 = mulrdx; - mulrax = xa[0]; - mulrdx, mulrax = mulrax * ya[2]; - r[2] = mulrax; - mulr21 = mulrdx; - mulrax = xa[0]; - mulrdx, mulrax = mulrax * ya[3]; - r[3] = mulrax; - mulr31 = mulrdx; - mulrax = xa[0]; - mulrdx, mulrax = mulrax * ya[4]; - r[4] = mulrax; - mulr41 = mulrdx; - mulrax = xa[1]; - mulrdx, mulrax = mulrax * ya[0]; - cf, r[1] += mulrax; - _, mulr11 += mulrdx + cf; - mulrax = xa[1]; - mulrdx, mulrax = mulrax * ya[1]; - cf, r[2] += mulrax; - _, mulr21 += mulrdx + cf; - mulrax = xa[1]; - mulrdx, mulrax = mulrax * ya[2]; - cf, r[3] += mulrax; - _, mulr31 += mulrdx + cf; - mulrax = xa[1]; - mulrdx, mulrax = mulrax * ya[3]; - cf, r[4] += mulrax; - _, mulr41 += mulrdx + cf; - mulrax = xa[1]; - mulrax *= 19; - mulrdx, mulrax = mulrax * ya[4]; - cf, r[0] += mulrax; - _, mulr01 += mulrdx + cf; - mulrax = xa[2]; - 
mulrdx, mulrax = mulrax * ya[0]; - cf, r[2] += mulrax; - _, mulr21 += mulrdx + cf; - mulrax = xa[2]; - mulrdx, mulrax = mulrax * ya[1]; - cf, r[3] += mulrax; - _, mulr31 += mulrdx + cf; - mulrax = xa[2]; - mulrdx, mulrax = mulrax * ya[2]; - cf, r[4] += mulrax; - _, mulr41 += mulrdx + cf; - mulrax = xa[2]; - mulrax *= 19; - mulrdx, mulrax = mulrax * ya[3]; - cf, r[0] += mulrax; - _, mulr01 += mulrdx + cf; - mulrax = xa[2]; - mulrax *= 19; - mulrdx, mulrax = mulrax * ya[4]; - cf, r[1] += mulrax; - _, mulr11 += mulrdx + cf; - mulrax = xa[3]; - mulrdx, mulrax = mulrax * ya[0]; - cf, r[3] += mulrax; - _, mulr31 += mulrdx + cf; - mulrax = xa[3]; - mulrdx, mulrax = mulrax * ya[1]; - cf, r[4] += mulrax; - _, mulr41 += mulrdx + cf; - mulrax = mulx319_stack; - mulrdx, mulrax = mulrax * ya[3]; - cf, r[1] += mulrax; - _, mulr11 += mulrdx + cf; - mulrax = mulx319_stack; - mulrdx, mulrax = mulrax * ya[4]; - cf, r[2] += mulrax; - _, mulr21 += mulrdx + cf; - mulrax = xa[4]; - mulrdx, mulrax = mulrax * ya[0]; - cf, r[4] += mulrax; - _, mulr41 += mulrdx + cf; - mulrax = mulx419_stack; - mulrdx, mulrax = mulrax * ya[2]; - cf, r[1] += mulrax; - _, mulr11 += mulrdx + cf; - mulrax = mulx419_stack; - mulrdx, mulrax = mulrax * ya[3]; - cf, r[2] += mulrax; - _, mulr21 += mulrdx + cf; - mulrax = mulx419_stack; - mulrdx, mulrax = mulrax * ya[4]; - cf, r[3] += mulrax; - _, mulr31 += mulrdx + cf; - mulredmask = 0x7FFFFFFFFFFFF; - ?{}, mulr01 = #SHLD(mulr01, r[0], 13); - r[0] &= mulredmask; - ?{}, mulr11 = #SHLD(mulr11, r[1], 13); - r[1] &= mulredmask; - r[1] += mulr01; - ?{}, mulr21 = #SHLD(mulr21, r[2], 13); - r[2] &= mulredmask; - r[2] += mulr11; - ?{}, mulr31 = #SHLD(mulr31, r[3], 13); - r[3] &= mulredmask; - r[3] += mulr21; - ?{}, mulr41 = #SHLD(mulr41, r[4], 13); - r[4] &= mulredmask; - r[4] += mulr31; - mulr41 = mulr41 * 19; - r[0] += mulr41; - mult = r[0]; - mult >>= 51; - mult += r[1]; - r[1] = mult; - mult >>= 51; - r[0] &= mulredmask; - mult += r[2]; - r[2] = mult; - mult >>= 51; - r[1] &= mulredmask; - mult += r[3]; - r[3] = mult; - mult >>= 51; - r[2] &= mulredmask; - mult += r[4]; - r[4] = mult; - mult >>= 51; - r[3] &= mulredmask; - mult *= 19; - r[0] += mult; - r[4] &= mulredmask; - - return r; -} - -inline fn __mul5_sss(stack u64[5] xa ya) -> stack u64[5] -{ - stack u64[5] rs; - reg u64[5] r; - - r = __mul5_rss(xa, ya); - rs = #copy(r); - - return rs; -} - -// //////////////////////////////////////////////////////////////////////////// - -#[returnaddress="stack"] -fn _mul5_pp(reg ptr u64[5] xa ya) -> reg ptr u64[5] -{ - inline int i; - reg bool cf; - reg u64 mulr01 mulr11 mulr21 mulr31 mulr41; - reg u64 mulrax mulrdx; - reg u64 mulredmask; - reg u64 mult; - stack u64 mulx319_stack mulx419_stack; - reg u64[5] r; - - mulrax = xa[3]; - mulrax *= 19; - mulx319_stack = mulrax; - mulrdx, mulrax = mulrax * ya[2]; - r[0] = mulrax; - mulr01 = mulrdx; - mulrax = xa[4]; - mulrax *= 19; - mulx419_stack = mulrax; - mulrdx, mulrax = mulrax * ya[1]; - cf, r[0] += mulrax; - _, mulr01 += mulrdx + cf; - mulrax = xa[0]; - mulrdx, mulrax = mulrax * ya[0]; - cf, r[0] += mulrax; - _, mulr01 += mulrdx + cf; - mulrax = xa[0]; - mulrdx, mulrax = mulrax * ya[1]; - r[1] = mulrax; - mulr11 = mulrdx; - mulrax = xa[0]; - mulrdx, mulrax = mulrax * ya[2]; - r[2] = mulrax; - mulr21 = mulrdx; - mulrax = xa[0]; - mulrdx, mulrax = mulrax * ya[3]; - r[3] = mulrax; - mulr31 = mulrdx; - mulrax = xa[0]; - mulrdx, mulrax = mulrax * ya[4]; - r[4] = mulrax; - mulr41 = mulrdx; - mulrax = xa[1]; - mulrdx, mulrax = mulrax * ya[0]; - cf, r[1] += 
mulrax; - _, mulr11 += mulrdx + cf; - mulrax = xa[1]; - mulrdx, mulrax = mulrax * ya[1]; - cf, r[2] += mulrax; - _, mulr21 += mulrdx + cf; - mulrax = xa[1]; - mulrdx, mulrax = mulrax * ya[2]; - cf, r[3] += mulrax; - _, mulr31 += mulrdx + cf; - mulrax = xa[1]; - mulrdx, mulrax = mulrax * ya[3]; - cf, r[4] += mulrax; - _, mulr41 += mulrdx + cf; - mulrax = xa[1]; - mulrax *= 19; - mulrdx, mulrax = mulrax * ya[4]; - cf, r[0] += mulrax; - _, mulr01 += mulrdx + cf; - mulrax = xa[2]; - mulrdx, mulrax = mulrax * ya[0]; - cf, r[2] += mulrax; - _, mulr21 += mulrdx + cf; - mulrax = xa[2]; - mulrdx, mulrax = mulrax * ya[1]; - cf, r[3] += mulrax; - _, mulr31 += mulrdx + cf; - mulrax = xa[2]; - mulrdx, mulrax = mulrax * ya[2]; - cf, r[4] += mulrax; - _, mulr41 += mulrdx + cf; - mulrax = xa[2]; - mulrax *= 19; - mulrdx, mulrax = mulrax * ya[3]; - cf, r[0] += mulrax; - _, mulr01 += mulrdx + cf; - mulrax = xa[2]; - mulrax *= 19; - mulrdx, mulrax = mulrax * ya[4]; - cf, r[1] += mulrax; - _, mulr11 += mulrdx + cf; - mulrax = xa[3]; - mulrdx, mulrax = mulrax * ya[0]; - cf, r[3] += mulrax; - _, mulr31 += mulrdx + cf; - mulrax = xa[3]; - mulrdx, mulrax = mulrax * ya[1]; - cf, r[4] += mulrax; - _, mulr41 += mulrdx + cf; - mulrax = mulx319_stack; - mulrdx, mulrax = mulrax * ya[3]; - cf, r[1] += mulrax; - _, mulr11 += mulrdx + cf; - mulrax = mulx319_stack; - mulrdx, mulrax = mulrax * ya[4]; - cf, r[2] += mulrax; - _, mulr21 += mulrdx + cf; - mulrax = xa[4]; - mulrdx, mulrax = mulrax * ya[0]; - cf, r[4] += mulrax; - _, mulr41 += mulrdx + cf; - mulrax = mulx419_stack; - mulrdx, mulrax = mulrax * ya[2]; - cf, r[1] += mulrax; - _, mulr11 += mulrdx + cf; - mulrax = mulx419_stack; - mulrdx, mulrax = mulrax * ya[3]; - cf, r[2] += mulrax; - _, mulr21 += mulrdx + cf; - mulrax = mulx419_stack; - mulrdx, mulrax = mulrax * ya[4]; - cf, r[3] += mulrax; - _, mulr31 += mulrdx + cf; - mulredmask = 0x7FFFFFFFFFFFF; - ?{}, mulr01 = #SHLD(mulr01, r[0], 13); - r[0] &= mulredmask; - ?{}, mulr11 = #SHLD(mulr11, r[1], 13); - r[1] &= mulredmask; - r[1] += mulr01; - ?{}, mulr21 = #SHLD(mulr21, r[2], 13); - r[2] &= mulredmask; - r[2] += mulr11; - ?{}, mulr31 = #SHLD(mulr31, r[3], 13); - r[3] &= mulredmask; - r[3] += mulr21; - ?{}, mulr41 = #SHLD(mulr41, r[4], 13); - r[4] &= mulredmask; - r[4] += mulr31; - mulr41 = mulr41 * 19; - r[0] += mulr41; - mult = r[0]; - mult >>= 51; - mult += r[1]; - r[1] = mult; - mult >>= 51; - r[0] &= mulredmask; - mult += r[2]; - r[2] = mult; - mult >>= 51; - r[1] &= mulredmask; - mult += r[3]; - r[3] = mult; - mult >>= 51; - r[2] &= mulredmask; - mult += r[4]; - r[4] = mult; - mult >>= 51; - r[3] &= mulredmask; - mult *= 19; - r[0] += mult; - r[4] &= mulredmask; - - for i=0 to 5 - { xa[i] = r[i]; } - - return xa; -} - -inline fn _mul5_ss_(stack u64[5] xa ya) -> stack u64[5] -{ - reg ptr u64[5] xp yp; - - xp = xa; - yp = ya; - xp = _mul5_pp(xp, yp); - - xa = xp; - return xa; -} - -// //////////////////////////////////////////////////////////////////////////// - -inline fn __mul5_a24_add_rss(stack u64[5] xa ya, inline u64 _a24) -> reg u64[5] -{ - reg u64 a24; - reg u64 mul121666rax mul121666rdx; - reg u64[5] r; - - a24 = _a24; - - // xa[0] * a24 - mul121666rax = xa[0]; - mul121666rdx, mul121666rax = mul121666rax * a24; - mul121666rax >>= 13; - r[0] = mul121666rax; - r[1] = mul121666rdx; - - // xa[1] * a24 - mul121666rax = xa[1]; - mul121666rdx, mul121666rax = mul121666rax * a24; - mul121666rax >>= 13; - r[1] += mul121666rax; - r[2] = mul121666rdx; - - // xa[2] * a24 - mul121666rax = xa[2]; - mul121666rdx, 
mul121666rax = mul121666rax * a24; - mul121666rax >>= 13; - r[2] += mul121666rax; - r[3] = mul121666rdx; - - // xa[3] * a24 - mul121666rax = xa[3]; - mul121666rdx, mul121666rax = mul121666rax * a24; - mul121666rax >>= 13; - r[3] += mul121666rax; - r[4] = mul121666rdx; - - // xa[4] * a24 - mul121666rax = xa[4]; - mul121666rdx, mul121666rax = mul121666rax * a24; - mul121666rax >>= 13; - r[4] += mul121666rax; - mul121666rdx *= 19; - - r[0] += mul121666rdx; - - r[0] += ya[0]; - r[1] += ya[1]; - r[2] += ya[2]; - r[3] += ya[3]; - r[4] += ya[4]; - - return r; -} - -inline fn __mul5_a24_add_sss(stack u64[5] xa ya, inline u64 a24) -> stack u64[5] -{ - stack u64[5] rs; - reg u64[5] r; - - r = __mul5_a24_add_rss(xa, ya, a24); - rs = #copy(r); - - return rs; -} - diff --git a/src/crypto_scalarmult/curve25519/amd64/ref5/scalarmult.jazz b/src/crypto_scalarmult/curve25519/amd64/ref5/scalarmult.jazz index 41043499..37fde6ca 100644 --- a/src/crypto_scalarmult/curve25519/amd64/ref5/scalarmult.jazz +++ b/src/crypto_scalarmult/curve25519/amd64/ref5/scalarmult.jazz @@ -1,5 +1,1532 @@ -from Jade require "crypto_scalarmult/curve25519/amd64/common/load_store4.jinc" -require "curve25519.jinc" +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/load_store4.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/load_store4.jinc +inline fn __load4(reg u64 p) -> reg u64[4] +{ + inline int i; + reg u64[4] a; + + for i=0 to 4 + { a[i] = [p + 8*i]; } + + return a; +} + +inline fn __store4(reg u64 p, reg u64[4] a) +{ + inline int i; + + for i=0 to 4 + { [p + 8*i] = a[i]; } +} + +//EOR# +//BOR#require "curve25519.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/ref5/curve25519.jinc +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/bit.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/bit.jinc +inline fn __ith_bit(stack u8[32] k, reg u64 ctr) -> reg u64 +{ + reg u64 p bit; + + p = ctr; + p >>= 3; + bit = (64u) k[(int) p]; + + p = ctr; + p &= 7; + bit >>= (p & 63); + + bit &= 1; + + return bit; +} + +inline fn __next_bit(stack u64 k) -> reg u64, stack u64 +{ + reg bool cf; + reg u64 b one; + + ?{}, b = #set0(); + one = 1; + _, cf, _, _, _, k = #SHL(k, 1); + b = one if cf; + return b, k; +} +//EOR# +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/decode_scalar.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/decode_scalar.jinc +inline fn __decode_scalar(reg u64[4] k) -> stack u8[32] +{ + inline int i; + stack u8[32] ks; + + for i=0 to 4 + { ks[u64 i] = k[i]; } + + ks[0] &= 0xf8; + ks[31] &= 0x7f; + ks[31] |= 0x40; + + return ks; +} + +inline fn __decode_scalar_shl1(reg u64[4] k) -> stack u64[4] +{ + stack u64[4] ks; + + k[3] <<= 1; + k[0] &= 0xfffffffffffffff8; + k[3] |= 0x8000000000000000; + + ks = #copy(k); + + return ks; +} + +//EOR# + +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/51/decode_u5.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/51/decode_u5.jinc +inline fn __decode_u_coordinate5(reg u64[4] t) -> reg u64[5] +{ + reg u64[5] u; + reg u64 mask; + + mask = 0x7ffffffffffff; + + //u[0] = t[0] & mask; // 51; 13 left + u[0] = t[0]; + u[0] &= mask; + + //u[1] = (t[1] << 13) || (t[0] >> 51) & mask; // 38; 26 left + u[1] = t[1]; + u[1] <<= 13; + t[0] >>= 51; + u[1] |= t[0]; + u[1] &= mask; + + //u[2] = (t[2] << 26) || (t[1] >> 38) & mask; // 25; 39 left + u[2] = t[2]; + u[2] <<= 26; + t[1] >>= 38; + u[2] |= t[1]; + u[2] &= mask; + + //u[3] = 
(t[3] << 39) || (t[2] >> 25) & mask; // 12; '52' left + u[3] = t[3]; + u[3] <<= 39; + t[2] >>= 25; + u[3] |= t[2]; + u[3] &= mask; + + //u[4] = (t[3] >> 12) & mask; + u[4] = t[3]; + u[4] >>= 12; + u[4] &= mask; + + return u; +} + +inline fn __decode_u_coordinate_base5() -> reg u64[5] +{ + reg u64[5] u; + + u[0] = 9; + u[1] = 0; + u[2] = 0; + u[3] = 0; + u[4] = 0; + + return u; +} + +//EOR# +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/51/init_points5.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/51/init_points5.jinc +inline fn __init_points5( + reg u64[5] initr) + -> + stack u64[5], + reg u64[5], + stack u64[5], + stack u64[5] +{ + inline int i; + stack u64[5] x2 x3 z3; + reg u64[5] z2r; + reg u64 z; + + ?{}, z = #set0(); + + x2[0] = 1; + z2r[0] = 0; + x3 = #copy(initr); + z3[0] = 1; + + for i=1 to 5 + { x2[i] = z; + z2r[i] = z; + z3[i] = z; + } + + // (1, 0, init, 1) + return x2, z2r, x3, z3; +} + +inline fn __init_points5_x3() + -> + stack u64[5], + reg u64[5], + stack u64[5] +{ + inline int i; + stack u64[5] f1s f3s; + reg u64[5] f2; + reg u64 z; + + ?{}, z = #set0(); + + f1s[0] = 1; + f2[0] = 1; + f3s[0] = 1; + + for i=1 to 5 + { f1s[i] = z; + f2[i] = z; + f3s[i] = z; + } + + return f1s, f2, f3s; +} + +//EOR# +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/51/add5.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/51/add5.jinc +inline fn __add5_rrs(reg u64[5] f, stack u64[5] g) -> reg u64[5] +{ + inline int i; + reg u64[5] h; + + h = #copy(f); + + h[0] += g[0]; + for i=1 to 5 + { h[i] += g[i]; } + + return h; +} + +inline fn __add5_sss(stack u64[5] fs gs) -> stack u64[5] +{ + stack u64[5] hs; + reg u64[5] h f; + + f = #copy(fs); + h = __add5_rrs(f, gs); + hs = #copy(h); + + return hs; +} + +inline fn __add5_ssr(stack u64[5] fs, reg u64[5] g) -> stack u64[5] +{ + stack u64[5] hs; + reg u64[5] h; + + h = __add5_rrs(g, fs); + hs = #copy(h); + + return hs; +} + +inline fn __add5_rsr(stack u64[5] fs, reg u64[5] g) -> reg u64[5] +{ + reg u64[5] h; + + h = __add5_rrs(g, fs); + + return h; +} + +//EOR# +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/51/sub5.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/51/sub5.jinc +inline fn __sub5_rrs(reg u64[5] f, stack u64[5] gs) -> reg u64[5] +{ + inline int i; + reg u64[5] h; + reg u64 _2p0 _2p1234; + + _2p0 = 0xfffffffffffda; + _2p1234 = 0xffffffffffffe; + + h = #copy(f); + h[0] += _2p0; + for i=1 to 5 + { h[i] += _2p1234; } + + for i=0 to 5 + { h[i] -= gs[i]; } + + return h; +} + +inline fn __sub5_sss(stack u64[5] fs gs) -> stack u64[5] +{ + stack u64[5] hs; + reg u64[5] h f; + + f = #copy(fs); + h = __sub5_rrs(f, gs); + hs = #copy(h); + + return hs; +} + +inline fn __sub5_rss(stack u64[5] fs gs) -> reg u64[5] +{ + reg u64[5] h f; + + f = #copy(fs); + h = __sub5_rrs(f, gs); + + return h; +} + +inline fn __sub5_rsr(stack u64[5] fs, reg u64[5] g) -> reg u64[5] +{ + inline int i; + reg u64[5] h; + reg u64 _2p0 _2p1234; + + _2p0 = 0xfffffffffffda; + _2p1234 = 0xffffffffffffe; + + h = #copy(fs); + h[0] += _2p0; + for i=1 to 5 + { h[i] += _2p1234; } + + for i=0 to 5 + { h[i] -= g[i]; } + + return h; +} + +inline fn __sub5_ssr(stack u64[5] fs, reg u64[5] g) -> stack u64[5] +{ + stack u64[5] hs; + reg u64[5] h; + + h = __sub5_rsr(fs, g); + hs = #copy(h); + + return hs; +} + +//EOR# +//BOR#from formosa25519 require 
"crypto_scalarmult/curve25519/amd64/common/51/cswap5.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/51/cswap5.jinc +inline fn __cswap5( + stack u64[5] x2, + reg u64[5] z2r, + stack u64[5] x3, + stack u64[5] z3, + reg u64 toswap) + -> + stack u64[5], + reg u64[5], + stack u64[5], + stack u64[5] +{ + inline int i; + reg u64[5] t4 x2r x3r; + reg u64 t mask; + + ?{}, mask = #set0(); + mask -= toswap; // if toswap == 1 mask = -1 or all bits at 1, 0 otherwise + + // swap between z2 and z3 + t4 = #copy(z2r); + for i=0 to 5 + { t4[i] ^= z3[i]; + t4[i] &= mask; } // t4 = z2 ^ z3 + + for i=0 to 5 + { z2r[i] ^= t4[i]; + t = z3[i]; + t ^= t4[i]; + z3[i] = t; } + + // swap between x2 and x3 + x3r = #copy(x3); + for i=0 to 5 { x2r[i] = x2[i]; + t = x3r[i]; + t ^= x2r[i]; + t &= mask; + x2r[i] ^= t; + x3r[i] ^= t; + x2[i] = x2r[i]; + x3[i] = x3r[i]; } + + return x2, z2r, x3, z3; +} + +inline fn __cswap5_ssss( + stack u64[5] xs, + stack u64[5] ys, + reg u64 swap) + -> + stack u64[5], + stack u64[5] +{ + inline int i; + reg u64[5] x y; + reg u64 t mask; + + x = #copy(xs); + + mask = 0; + mask -= swap; + + for i=0 to 5 + { + y[i] = ys[i]; + + t = x[i]; + t ^= y[i]; + t &= mask; + + x[i] ^= t; // ^ (x[i] ^ y[i]) if swap == 1 + y[i] ^= t; + + ys[i] = y[i]; + } + + xs = #copy(x); + + return xs, ys; +} + +inline fn __cswap5_rsrs( + reg u64[5] x, + stack u64[5] ys, + reg u64 swap) + -> + reg u64[5], + stack u64[5] +{ + inline int i; + reg u64[5] y; + reg u64 t mask; + + mask = 0; + mask -= swap; + + for i=0 to 5 + { + y[i] = ys[i]; + + t = x[i]; + t ^= y[i]; + t &= mask; + + x[i] ^= t; // ^ (x[i] ^ y[i]) if swap == 1 + y[i] ^= t; + + ys[i] = y[i]; + } + + return x, ys; +} + +//EOR# +//BOR#from formosa25519 require "crypto_scalarmult/curve25519/amd64/common/51/tobytes5.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/common/51/tobytes5.jinc +inline fn __tobytes5(reg u64[5] f) -> reg u64[4] +{ + reg bool eq; + reg u64 loop; + reg u64[4] h; + reg u64 t two51minus1 two51minus19; + + two51minus1 = 0x0007FFFFFFFFFFFF; + two51minus19 = two51minus1; + two51minus19 -= 18; + loop = 3; + + while(loop > 0){ + t = f[0]; + t >>= 51; + f[0] &= two51minus1; + f[1] += t; + t = f[1]; + t >>= 51; + f[1] &= two51minus1; + f[2] += t; + t = f[2]; + t >>= 51; + f[2] &= two51minus1; + f[3] += t; + t = f[3]; + t >>= 51; + f[3] &= two51minus1; + f[4] += t; + t = f[4]; + t >>= 51; + f[4] &= two51minus1; + t *= 19; + f[0] += t; + loop = loop - 1; + } + t = 1; + + //signed> 13); // 26 spent; 25 left + h[1] = f[2]; + h[1] <<= 38; + f[1] >>= 13; + h[1] |= f[1]; + + // h[2] = (f[3] << 25) || (f[2] >> 26); // 39 spent; 12 left + h[2] = f[3]; + h[2] <<= 25; + f[2] >>= 26; + h[2] |= f[2]; + + // h[3] = f[4] << 12 || (f[3] >> 39); // 51 spent; 0 left + h[3] = f[4]; + h[3] <<= 12; + f[3] >>= 39; + h[3] |= f[3]; + + return h; +} + +//EOR# + +//BOR#require "mul5.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/ref5/mul5.jinc +// code originally from amd64-51 qhasm implementation +// - and adapted from https://github.com/tfaoliveira/qhasm-translator/blob/master/test-cases/crypto_scalarmult/curve25519/amd64-51/fe25519_mul.mil + +inline fn __mul5_rss(stack u64[5] xa ya) -> reg u64[5] +{ + reg bool cf; + reg u64 mulr01 mulr11 mulr21 mulr31 mulr41; + reg u64 mulrax mulrdx; + reg u64 mulredmask; + reg u64 mult; + stack u64 mulx319_stack mulx419_stack; + reg u64[5] r; + + mulrax = xa[3]; + mulrax *= 19; + mulx319_stack = mulrax; + mulrdx, mulrax = mulrax * ya[2]; + r[0] = mulrax; + mulr01 = mulrdx; + mulrax = xa[4]; + 
mulrax *= 19; + mulx419_stack = mulrax; + mulrdx, mulrax = mulrax * ya[1]; + cf, r[0] += mulrax; + _, mulr01 += mulrdx + cf; + mulrax = xa[0]; + mulrdx, mulrax = mulrax * ya[0]; + cf, r[0] += mulrax; + _, mulr01 += mulrdx + cf; + mulrax = xa[0]; + mulrdx, mulrax = mulrax * ya[1]; + r[1] = mulrax; + mulr11 = mulrdx; + mulrax = xa[0]; + mulrdx, mulrax = mulrax * ya[2]; + r[2] = mulrax; + mulr21 = mulrdx; + mulrax = xa[0]; + mulrdx, mulrax = mulrax * ya[3]; + r[3] = mulrax; + mulr31 = mulrdx; + mulrax = xa[0]; + mulrdx, mulrax = mulrax * ya[4]; + r[4] = mulrax; + mulr41 = mulrdx; + mulrax = xa[1]; + mulrdx, mulrax = mulrax * ya[0]; + cf, r[1] += mulrax; + _, mulr11 += mulrdx + cf; + mulrax = xa[1]; + mulrdx, mulrax = mulrax * ya[1]; + cf, r[2] += mulrax; + _, mulr21 += mulrdx + cf; + mulrax = xa[1]; + mulrdx, mulrax = mulrax * ya[2]; + cf, r[3] += mulrax; + _, mulr31 += mulrdx + cf; + mulrax = xa[1]; + mulrdx, mulrax = mulrax * ya[3]; + cf, r[4] += mulrax; + _, mulr41 += mulrdx + cf; + mulrax = xa[1]; + mulrax *= 19; + mulrdx, mulrax = mulrax * ya[4]; + cf, r[0] += mulrax; + _, mulr01 += mulrdx + cf; + mulrax = xa[2]; + mulrdx, mulrax = mulrax * ya[0]; + cf, r[2] += mulrax; + _, mulr21 += mulrdx + cf; + mulrax = xa[2]; + mulrdx, mulrax = mulrax * ya[1]; + cf, r[3] += mulrax; + _, mulr31 += mulrdx + cf; + mulrax = xa[2]; + mulrdx, mulrax = mulrax * ya[2]; + cf, r[4] += mulrax; + _, mulr41 += mulrdx + cf; + mulrax = xa[2]; + mulrax *= 19; + mulrdx, mulrax = mulrax * ya[3]; + cf, r[0] += mulrax; + _, mulr01 += mulrdx + cf; + mulrax = xa[2]; + mulrax *= 19; + mulrdx, mulrax = mulrax * ya[4]; + cf, r[1] += mulrax; + _, mulr11 += mulrdx + cf; + mulrax = xa[3]; + mulrdx, mulrax = mulrax * ya[0]; + cf, r[3] += mulrax; + _, mulr31 += mulrdx + cf; + mulrax = xa[3]; + mulrdx, mulrax = mulrax * ya[1]; + cf, r[4] += mulrax; + _, mulr41 += mulrdx + cf; + mulrax = mulx319_stack; + mulrdx, mulrax = mulrax * ya[3]; + cf, r[1] += mulrax; + _, mulr11 += mulrdx + cf; + mulrax = mulx319_stack; + mulrdx, mulrax = mulrax * ya[4]; + cf, r[2] += mulrax; + _, mulr21 += mulrdx + cf; + mulrax = xa[4]; + mulrdx, mulrax = mulrax * ya[0]; + cf, r[4] += mulrax; + _, mulr41 += mulrdx + cf; + mulrax = mulx419_stack; + mulrdx, mulrax = mulrax * ya[2]; + cf, r[1] += mulrax; + _, mulr11 += mulrdx + cf; + mulrax = mulx419_stack; + mulrdx, mulrax = mulrax * ya[3]; + cf, r[2] += mulrax; + _, mulr21 += mulrdx + cf; + mulrax = mulx419_stack; + mulrdx, mulrax = mulrax * ya[4]; + cf, r[3] += mulrax; + _, mulr31 += mulrdx + cf; + mulredmask = 0x7FFFFFFFFFFFF; + ?{}, mulr01 = #SHLD(mulr01, r[0], 13); + r[0] &= mulredmask; + ?{}, mulr11 = #SHLD(mulr11, r[1], 13); + r[1] &= mulredmask; + r[1] += mulr01; + ?{}, mulr21 = #SHLD(mulr21, r[2], 13); + r[2] &= mulredmask; + r[2] += mulr11; + ?{}, mulr31 = #SHLD(mulr31, r[3], 13); + r[3] &= mulredmask; + r[3] += mulr21; + ?{}, mulr41 = #SHLD(mulr41, r[4], 13); + r[4] &= mulredmask; + r[4] += mulr31; + mulr41 = mulr41 * 19; + r[0] += mulr41; + mult = r[0]; + mult >>= 51; + mult += r[1]; + r[1] = mult; + mult >>= 51; + r[0] &= mulredmask; + mult += r[2]; + r[2] = mult; + mult >>= 51; + r[1] &= mulredmask; + mult += r[3]; + r[3] = mult; + mult >>= 51; + r[2] &= mulredmask; + mult += r[4]; + r[4] = mult; + mult >>= 51; + r[3] &= mulredmask; + mult *= 19; + r[0] += mult; + r[4] &= mulredmask; + + return r; +} + +inline fn __mul5_sss(stack u64[5] xa ya) -> stack u64[5] +{ + stack u64[5] rs; + reg u64[5] r; + + r = __mul5_rss(xa, ya); + rs = #copy(r); + + return rs; +} + +// 
//////////////////////////////////////////////////////////////////////////// + +#[returnaddress="stack"] +fn _mul5_pp(reg ptr u64[5] xa ya) -> reg ptr u64[5] +{ + inline int i; + reg bool cf; + reg u64 mulr01 mulr11 mulr21 mulr31 mulr41; + reg u64 mulrax mulrdx; + reg u64 mulredmask; + reg u64 mult; + stack u64 mulx319_stack mulx419_stack; + reg u64[5] r; + + mulrax = xa[3]; + mulrax *= 19; + mulx319_stack = mulrax; + mulrdx, mulrax = mulrax * ya[2]; + r[0] = mulrax; + mulr01 = mulrdx; + mulrax = xa[4]; + mulrax *= 19; + mulx419_stack = mulrax; + mulrdx, mulrax = mulrax * ya[1]; + cf, r[0] += mulrax; + _, mulr01 += mulrdx + cf; + mulrax = xa[0]; + mulrdx, mulrax = mulrax * ya[0]; + cf, r[0] += mulrax; + _, mulr01 += mulrdx + cf; + mulrax = xa[0]; + mulrdx, mulrax = mulrax * ya[1]; + r[1] = mulrax; + mulr11 = mulrdx; + mulrax = xa[0]; + mulrdx, mulrax = mulrax * ya[2]; + r[2] = mulrax; + mulr21 = mulrdx; + mulrax = xa[0]; + mulrdx, mulrax = mulrax * ya[3]; + r[3] = mulrax; + mulr31 = mulrdx; + mulrax = xa[0]; + mulrdx, mulrax = mulrax * ya[4]; + r[4] = mulrax; + mulr41 = mulrdx; + mulrax = xa[1]; + mulrdx, mulrax = mulrax * ya[0]; + cf, r[1] += mulrax; + _, mulr11 += mulrdx + cf; + mulrax = xa[1]; + mulrdx, mulrax = mulrax * ya[1]; + cf, r[2] += mulrax; + _, mulr21 += mulrdx + cf; + mulrax = xa[1]; + mulrdx, mulrax = mulrax * ya[2]; + cf, r[3] += mulrax; + _, mulr31 += mulrdx + cf; + mulrax = xa[1]; + mulrdx, mulrax = mulrax * ya[3]; + cf, r[4] += mulrax; + _, mulr41 += mulrdx + cf; + mulrax = xa[1]; + mulrax *= 19; + mulrdx, mulrax = mulrax * ya[4]; + cf, r[0] += mulrax; + _, mulr01 += mulrdx + cf; + mulrax = xa[2]; + mulrdx, mulrax = mulrax * ya[0]; + cf, r[2] += mulrax; + _, mulr21 += mulrdx + cf; + mulrax = xa[2]; + mulrdx, mulrax = mulrax * ya[1]; + cf, r[3] += mulrax; + _, mulr31 += mulrdx + cf; + mulrax = xa[2]; + mulrdx, mulrax = mulrax * ya[2]; + cf, r[4] += mulrax; + _, mulr41 += mulrdx + cf; + mulrax = xa[2]; + mulrax *= 19; + mulrdx, mulrax = mulrax * ya[3]; + cf, r[0] += mulrax; + _, mulr01 += mulrdx + cf; + mulrax = xa[2]; + mulrax *= 19; + mulrdx, mulrax = mulrax * ya[4]; + cf, r[1] += mulrax; + _, mulr11 += mulrdx + cf; + mulrax = xa[3]; + mulrdx, mulrax = mulrax * ya[0]; + cf, r[3] += mulrax; + _, mulr31 += mulrdx + cf; + mulrax = xa[3]; + mulrdx, mulrax = mulrax * ya[1]; + cf, r[4] += mulrax; + _, mulr41 += mulrdx + cf; + mulrax = mulx319_stack; + mulrdx, mulrax = mulrax * ya[3]; + cf, r[1] += mulrax; + _, mulr11 += mulrdx + cf; + mulrax = mulx319_stack; + mulrdx, mulrax = mulrax * ya[4]; + cf, r[2] += mulrax; + _, mulr21 += mulrdx + cf; + mulrax = xa[4]; + mulrdx, mulrax = mulrax * ya[0]; + cf, r[4] += mulrax; + _, mulr41 += mulrdx + cf; + mulrax = mulx419_stack; + mulrdx, mulrax = mulrax * ya[2]; + cf, r[1] += mulrax; + _, mulr11 += mulrdx + cf; + mulrax = mulx419_stack; + mulrdx, mulrax = mulrax * ya[3]; + cf, r[2] += mulrax; + _, mulr21 += mulrdx + cf; + mulrax = mulx419_stack; + mulrdx, mulrax = mulrax * ya[4]; + cf, r[3] += mulrax; + _, mulr31 += mulrdx + cf; + mulredmask = 0x7FFFFFFFFFFFF; + ?{}, mulr01 = #SHLD(mulr01, r[0], 13); + r[0] &= mulredmask; + ?{}, mulr11 = #SHLD(mulr11, r[1], 13); + r[1] &= mulredmask; + r[1] += mulr01; + ?{}, mulr21 = #SHLD(mulr21, r[2], 13); + r[2] &= mulredmask; + r[2] += mulr11; + ?{}, mulr31 = #SHLD(mulr31, r[3], 13); + r[3] &= mulredmask; + r[3] += mulr21; + ?{}, mulr41 = #SHLD(mulr41, r[4], 13); + r[4] &= mulredmask; + r[4] += mulr31; + mulr41 = mulr41 * 19; + r[0] += mulr41; + mult = r[0]; + mult >>= 51; + mult += r[1]; + r[1] = 
mult; + mult >>= 51; + r[0] &= mulredmask; + mult += r[2]; + r[2] = mult; + mult >>= 51; + r[1] &= mulredmask; + mult += r[3]; + r[3] = mult; + mult >>= 51; + r[2] &= mulredmask; + mult += r[4]; + r[4] = mult; + mult >>= 51; + r[3] &= mulredmask; + mult *= 19; + r[0] += mult; + r[4] &= mulredmask; + + for i=0 to 5 + { xa[i] = r[i]; } + + return xa; +} + +inline fn _mul5_ss_(stack u64[5] xa ya) -> stack u64[5] +{ + reg ptr u64[5] xp yp; + + xp = xa; + yp = ya; + xp = _mul5_pp(xp, yp); + + xa = xp; + return xa; +} + +// //////////////////////////////////////////////////////////////////////////// + +inline fn __mul5_a24_add_rss(stack u64[5] xa ya, inline u64 _a24) -> reg u64[5] +{ + reg u64 a24; + reg u64 mul121666rax mul121666rdx; + reg u64[5] r; + + a24 = _a24; + + // xa[0] * a24 + mul121666rax = xa[0]; + mul121666rdx, mul121666rax = mul121666rax * a24; + mul121666rax >>= 13; + r[0] = mul121666rax; + r[1] = mul121666rdx; + + // xa[1] * a24 + mul121666rax = xa[1]; + mul121666rdx, mul121666rax = mul121666rax * a24; + mul121666rax >>= 13; + r[1] += mul121666rax; + r[2] = mul121666rdx; + + // xa[2] * a24 + mul121666rax = xa[2]; + mul121666rdx, mul121666rax = mul121666rax * a24; + mul121666rax >>= 13; + r[2] += mul121666rax; + r[3] = mul121666rdx; + + // xa[3] * a24 + mul121666rax = xa[3]; + mul121666rdx, mul121666rax = mul121666rax * a24; + mul121666rax >>= 13; + r[3] += mul121666rax; + r[4] = mul121666rdx; + + // xa[4] * a24 + mul121666rax = xa[4]; + mul121666rdx, mul121666rax = mul121666rax * a24; + mul121666rax >>= 13; + r[4] += mul121666rax; + mul121666rdx *= 19; + + r[0] += mul121666rdx; + + r[0] += ya[0]; + r[1] += ya[1]; + r[2] += ya[2]; + r[3] += ya[3]; + r[4] += ya[4]; + + return r; +} + +inline fn __mul5_a24_add_sss(stack u64[5] xa ya, inline u64 a24) -> stack u64[5] +{ + stack u64[5] rs; + reg u64[5] r; + + r = __mul5_a24_add_rss(xa, ya, a24); + rs = #copy(r); + + return rs; +} + +//EOR# +//BOR#require "sqr5.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/ref5/sqr5.jinc +inline fn __sqr5_rs(stack u64[5] xa) -> reg u64[5] +{ + reg bool cf; + reg u64[5] r; + reg u64 squarer01 squarer11 squarer21 squarer31 squarer41; + reg u64 squarerax squarerdx; + reg u64 squareredmask squaret; + + squarerax = xa[0]; + squarerdx, squarerax = squarerax * xa[0]; + r[0] = squarerax; + squarer01 = squarerdx; + squarerax = xa[0]; + squarerax <<= 1; + squarerdx, squarerax = squarerax * xa[1]; + r[1] = squarerax; + squarer11 = squarerdx; + squarerax = xa[0]; + squarerax <<= 1; + squarerdx, squarerax = squarerax * xa[2]; + r[2] = squarerax; + squarer21 = squarerdx; + squarerax = xa[0]; + squarerax <<= 1; + squarerdx, squarerax = squarerax * xa[3]; + r[3] = squarerax; + squarer31 = squarerdx; + squarerax = xa[0]; + squarerax <<= 1; + squarerdx, squarerax = squarerax * xa[4]; + r[4] = squarerax; + squarer41 = squarerdx; + squarerax = xa[1]; + squarerdx, squarerax = squarerax * xa[1]; + cf, r[2] += squarerax; + _, squarer21 += squarerdx + cf; + squarerax = xa[1]; + squarerax <<= 1; + squarerdx, squarerax = squarerax * xa[2]; + cf, r[3] += squarerax; + _, squarer31 += squarerdx + cf; + squarerax = xa[1]; + squarerax <<= 1; + squarerdx, squarerax = squarerax * xa[3]; + cf, r[4] += squarerax; + _, squarer41 += squarerdx + cf; + squarerax = xa[1]; + squarerax *= 38; + squarerdx, squarerax = squarerax * xa[4]; + cf, r[0] += squarerax; + _, squarer01 += squarerdx + cf; + squarerax = xa[2]; + squarerdx, squarerax = squarerax * xa[2]; + cf, r[4] += squarerax; + _, squarer41 += squarerdx + cf; + squarerax = 
xa[2]; + squarerax *= 38; + squarerdx, squarerax = squarerax * xa[3]; + cf, r[0] += squarerax; + _, squarer01 += squarerdx + cf; + squarerax = xa[2]; + squarerax *= 38; + squarerdx, squarerax = squarerax * xa[4]; + cf, r[1] += squarerax; + _, squarer11 += squarerdx + cf; + squarerax = xa[3]; + squarerax *= 19; + squarerdx, squarerax = squarerax * xa[3]; + cf, r[1] += squarerax; + _, squarer11 += squarerdx + cf; + squarerax = xa[3]; + squarerax *= 38; + squarerdx, squarerax = squarerax * xa[4]; + cf, r[2] += squarerax; + _, squarer21 += squarerdx + cf; + squarerax = xa[4]; + squarerax *= 19; + squarerdx, squarerax = squarerax * xa[4]; + cf, r[3] += squarerax; + _, squarer31 += squarerdx + cf; + squareredmask = 0x7FFFFFFFFFFFF; + _, _, _, _, _, squarer01 = #SHLD(squarer01, r[0], 13); + r[0] &= squareredmask; + _, _, _, _, _, squarer11 = #SHLD(squarer11, r[1], 13); + r[1] &= squareredmask; + r[1] += squarer01; + _, _, _, _, _, squarer21 = #SHLD(squarer21, r[2], 13); + r[2] &= squareredmask; + r[2] += squarer11; + _, _, _, _, _, squarer31 = #SHLD(squarer31, r[3], 13); + r[3] &= squareredmask; + r[3] += squarer21; + _, _, _, _, _, squarer41 = #SHLD(squarer41, r[4], 13); + r[4] &= squareredmask; + r[4] += squarer31; + squarer41 = squarer41 * 19; + r[0] += squarer41; + squaret = r[0]; + squaret >>= 51; + squaret += r[1]; + r[0] &= squareredmask; + r[1] = squaret; + squaret >>= 51; + squaret += r[2]; + r[1] &= squareredmask; + r[2] = squaret; + squaret >>= 51; + squaret += r[3]; + r[2] &= squareredmask; + r[3] = squaret; + squaret >>= 51; + squaret += r[4]; + r[3] &= squareredmask; + r[4] = squaret; + squaret >>= 51; + squaret *= 19; + r[0] += squaret; + r[4] &= squareredmask; + + return r; +} + +inline fn __sqr5_ss(stack u64[5] xa) -> stack u64[5] +{ + stack u64[5] rs; + reg u64[5] r; + + r = __sqr5_rs(xa); + rs = #copy(r); + + return rs; +} + +// //////////////////////////////////////////////////////////////////////////// + +// TODO replace "-> reg ptr u64[5]" by "reg u64[5]" when r.a. 
@ f call +#[returnaddress="stack"] +fn _sqr5_p(reg ptr u64[5] xa) -> reg ptr u64[5] +{ + inline int i; + reg bool cf; + reg u64[5] r; + reg u64 squarer01 squarer11 squarer21 squarer31 squarer41; + reg u64 squarerax squarerdx; + reg u64 squareredmask squaret; + + squarerax = xa[0]; + squarerdx, squarerax = squarerax * xa[0]; + r[0] = squarerax; + squarer01 = squarerdx; + squarerax = xa[0]; + squarerax <<= 1; + squarerdx, squarerax = squarerax * xa[1]; + r[1] = squarerax; + squarer11 = squarerdx; + squarerax = xa[0]; + squarerax <<= 1; + squarerdx, squarerax = squarerax * xa[2]; + r[2] = squarerax; + squarer21 = squarerdx; + squarerax = xa[0]; + squarerax <<= 1; + squarerdx, squarerax = squarerax * xa[3]; + r[3] = squarerax; + squarer31 = squarerdx; + squarerax = xa[0]; + squarerax <<= 1; + squarerdx, squarerax = squarerax * xa[4]; + r[4] = squarerax; + squarer41 = squarerdx; + squarerax = xa[1]; + squarerdx, squarerax = squarerax * xa[1]; + cf, r[2] += squarerax; + _, squarer21 += squarerdx + cf; + squarerax = xa[1]; + squarerax <<= 1; + squarerdx, squarerax = squarerax * xa[2]; + cf, r[3] += squarerax; + _, squarer31 += squarerdx + cf; + squarerax = xa[1]; + squarerax <<= 1; + squarerdx, squarerax = squarerax * xa[3]; + cf, r[4] += squarerax; + _, squarer41 += squarerdx + cf; + squarerax = xa[1]; + squarerax *= 38; + squarerdx, squarerax = squarerax * xa[4]; + cf, r[0] += squarerax; + _, squarer01 += squarerdx + cf; + squarerax = xa[2]; + squarerdx, squarerax = squarerax * xa[2]; + cf, r[4] += squarerax; + _, squarer41 += squarerdx + cf; + squarerax = xa[2]; + squarerax *= 38; + squarerdx, squarerax = squarerax * xa[3]; + cf, r[0] += squarerax; + _, squarer01 += squarerdx + cf; + squarerax = xa[2]; + squarerax *= 38; + squarerdx, squarerax = squarerax * xa[4]; + cf, r[1] += squarerax; + _, squarer11 += squarerdx + cf; + squarerax = xa[3]; + squarerax *= 19; + squarerdx, squarerax = squarerax * xa[3]; + cf, r[1] += squarerax; + _, squarer11 += squarerdx + cf; + squarerax = xa[3]; + squarerax *= 38; + squarerdx, squarerax = squarerax * xa[4]; + cf, r[2] += squarerax; + _, squarer21 += squarerdx + cf; + squarerax = xa[4]; + squarerax *= 19; + squarerdx, squarerax = squarerax * xa[4]; + cf, r[3] += squarerax; + _, squarer31 += squarerdx + cf; + squareredmask = 0x7FFFFFFFFFFFF; + _, _, _, _, _, squarer01 = #SHLD(squarer01, r[0], 13); + r[0] &= squareredmask; + _, _, _, _, _, squarer11 = #SHLD(squarer11, r[1], 13); + r[1] &= squareredmask; + r[1] += squarer01; + _, _, _, _, _, squarer21 = #SHLD(squarer21, r[2], 13); + r[2] &= squareredmask; + r[2] += squarer11; + _, _, _, _, _, squarer31 = #SHLD(squarer31, r[3], 13); + r[3] &= squareredmask; + r[3] += squarer21; + _, _, _, _, _, squarer41 = #SHLD(squarer41, r[4], 13); + r[4] &= squareredmask; + r[4] += squarer31; + squarer41 = squarer41 * 19; + r[0] += squarer41; + squaret = r[0]; + squaret >>= 51; + squaret += r[1]; + r[0] &= squareredmask; + r[1] = squaret; + squaret >>= 51; + squaret += r[2]; + r[1] &= squareredmask; + r[2] = squaret; + squaret >>= 51; + squaret += r[3]; + r[2] &= squareredmask; + r[3] = squaret; + squaret >>= 51; + squaret += r[4]; + r[3] &= squareredmask; + r[4] = squaret; + squaret >>= 51; + squaret *= 19; + r[0] += squaret; + r[4] &= squareredmask; + + for i=0 to 5 + { xa[i] = r[i]; } + + return xa; +} + +inline fn _sqr5_ss_(stack u64[5] xa) -> stack u64[5] +{ + inline int j; + stack u64[5] ra; + reg ptr u64[5] rp; + reg u64 t; + + for j=0 to 5 + { t = xa[j]; ra[j] = t; } + + rp = ra; + rp = _sqr5_p(rp); + ra = rp; + + 
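+  // ra now holds the square of xa, carried back into radix-2^51 limbs by _sqr5_p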
return ra; +} + +inline fn _sqr5_s_(stack u64[5] x) -> stack u64[5] +{ + reg ptr u64[5] xp; + + xp = x; + xp = _sqr5_p(xp); + x = xp; + + return x; +} + +// //////////////////////////////////////////////////////////////////////////// + +#[returnaddress="stack"] +fn _it_sqr5_p(reg ptr u64[5] x, reg u32 i) -> reg ptr u64[5] +{ + reg bool zf; + + while { + x = _sqr5_p(x); + _,_,_,zf,i = #DEC_32(i); + }(!zf) + + return x; +} + +inline fn _it_sqr5_s_(stack u64[5] x, reg u32 i) -> stack u64[5] +{ + reg ptr u64[5] xp; + + xp = x; + xp = _it_sqr5_p(xp, i); + x = xp; + + return x; +} + +inline fn _it_sqr5_ss_(stack u64[5] r x, reg u32 i) -> stack u64[5] +{ + inline int j; + reg ptr u64[5] rp; + reg u64 t; + + for j=0 to 5 + { t = x[j]; r[j] = t; } + + rp = r; + rp = _it_sqr5_p(rp, i); + r = rp; + + return r; +} + +//EOR# +//BOR#require "invert5.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/ref5/invert5.jinc +//BOR#require "mul5.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/ref5/mul5.jinc +//EOR# +//BOR#require "sqr5.jinc"#formosa-25519/src/crypto_scalarmult/curve25519/amd64/ref5/sqr5.jinc +//EOR# + +inline fn __invert5(stack u64[5] fs) -> stack u64[5] +{ + stack u64[5] t0s t1s t2s t3s; + reg u32 i; + + // z2 = z1^2^1 + t0s = _sqr5_ss_(fs); + + // z8 = z2^2^2 + t1s = _sqr5_ss_(t0s); + t1s = _sqr5_s_(t1s); + + // z9 = z1*z8 + t1s = _mul5_ss_(t1s, fs); + + // z11 = z2*z9 + t0s = _mul5_ss_(t0s,t1s); + + // z22 = z11^2^1 + t2s = _sqr5_ss_(t0s); + + // z_5_0 = z9*z22 + t1s = _mul5_ss_(t1s,t2s); + + // z_10_5 = z_5_0^2^5 + t2s = _sqr5_ss_(t1s); + i = 4; + t2s = _it_sqr5_s_(t2s, i); + + // z_10_0 = z_10_5*z_5_0 + t1s = _mul5_ss_(t1s,t2s); + + // z_20_10 = z_10_0^2^10 + i = 10; + t2s = _it_sqr5_ss_(t2s, t1s, i); + + // z_20_0 = z_20_10*z_10_0 + t2s = _mul5_ss_(t2s, t1s); + + // z_40_20 = z_20_0^2^20 + i = 20; + t3s = _it_sqr5_ss_(t3s, t2s, i); + + // z_40_0 = z_40_20*z_20_0 + t2s = _mul5_ss_(t2s,t3s); + + // z_50_10 = z_40_0^2^10 + i = 10; + t2s = _it_sqr5_s_(t2s, i); + + // z_50_0 = z_50_10*z_10_0 + t1s = _mul5_ss_(t1s,t2s); + + // z_100_50 = z_50_0^2^50 + i = 50; + t2s = _it_sqr5_ss_(t2s, t1s, i); + + // z_100_0 = z_100_50*z_50_0 + t2s = _mul5_ss_(t2s, t1s); + + // z_200_100 = z_100_0^2^100 + i = 100; + t3s = _it_sqr5_ss_(t3s, t2s, i); + + // z_200_0 = z_200_100*z_100_0 + t2s = _mul5_ss_(t2s,t3s); + + // z_250_50 = z_200_0^2^50 + i = 50; + t2s = _it_sqr5_s_(t2s, i); + + // z_250_0 = z_250_50*z_50_0 + t1s = _mul5_ss_(t1s,t2s); + + // z_255_5 = z_250_0^2^5 + i = 4; + t1s = _it_sqr5_s_(t1s, i); + t1s = _sqr5_s_(t1s); + + // z_255_21 = z_255_5*z11 + t1s = _mul5_ss_(t1s, t0s); + + return t1s; +} + +//EOR# + +inline fn __add_and_double5( + stack u64[5] init, + stack u64[5] x2, + reg u64[5] z2r, + stack u64[5] x3, + stack u64[5] z3) + -> + stack u64[5], + reg u64[5], + stack u64[5], + stack u64[5] +{ + stack u64[5] z2 t0 t1 t2; + + t0 = __sub5_ssr(x2, z2r); + x2 = __add5_ssr(x2, z2r); + + t1 = __sub5_sss(x3, z3); + z2 = __add5_sss(x3, z3); + + z3 = __mul5_sss(x2, t1); + z2 = __mul5_sss(z2, t0); + + t2 = __sqr5_ss(x2); + t1 = __sqr5_ss(t0); + + x3 = __add5_sss(z3, z2); + z2 = __sub5_sss(z3, z2); + + x2 = __mul5_sss(t2, t1); + t0 = __sub5_sss(t2, t1); + + z2 = __sqr5_ss(z2); + t2 = __mul5_a24_add_sss(t0, t2, 996679680); // 121665 * 2^13 + x3 = __sqr5_ss(x3); + z3 = __mul5_sss(init, z2); + z2r = __mul5_rss(t0, t2); + + return x2, z2r, x3, z3; +} + +inline fn __montgomery_ladder_step5( + stack u8[32] k, + stack u64[5] init, + stack u64[5] x2, + reg u64[5] z2r, + stack u64[5] x3, + stack 
u64[5] z3, + stack u64 swapped, + reg u64 ctr) + -> + stack u64[5], + reg u64[5], + stack u64[5], + stack u64[5], + stack u64 +{ + reg u64 toswap bit; + + bit = __ith_bit(k, ctr); + + toswap = swapped; + toswap ^= bit; + + x2, z2r, x3, z3 = __cswap5(x2, z2r, x3, z3, toswap); + swapped = bit; + + x2, z2r, x3, z3 = __add_and_double5(init, x2, z2r, x3, z3); + + return x2, z2r, x3, z3, swapped; +} + + +inline fn __montgomery_ladder5( + reg u64[5] u, + stack u8[32] k) + -> + stack u64[5], + reg u64[5] +{ + stack u64[5] us x2 x3 z3; + reg u64[5] z2r; + stack u64 swapped; + #spill_to_mmx reg u64 ctr; + + (x2,z2r,x3,z3) = __init_points5(u); + us = #copy(u); + + ctr = 255; + swapped = 0; + + while + { + ctr -= 1; + () = #spill(ctr); + + (x2, z2r, x3, z3, swapped) = + __montgomery_ladder_step5(k, us, x2, z2r, x3, z3, swapped, ctr); + + () = #unspill(ctr); + } (ctr > 0) + + return x2, z2r; +} + +inline fn __encode_point5(stack u64[5] x2, reg u64[5] z2r) -> reg u64[4] +{ + stack u64[5] z2; + reg u64[5] r1; + reg u64[4] r2; + + z2 = #copy(z2r); + z2 = __invert5(z2); + r1 = __mul5_rss(x2, z2); + r2 = __tobytes5(r1); + + return r2; +} + +inline fn __curve25519_internal_ref5(stack u8[32] k, reg u64[5] u) -> reg u64[4] +{ + stack u64[5] x2; + reg u64[5] z2r; + reg u64[4] r; + + (x2,z2r) = __montgomery_ladder5(u, k); + r = __encode_point5(x2,z2r); + + return r; +} + +inline fn __curve25519_ref5(reg u64[4] _k _u) -> reg u64[4] +{ + stack u8[32] k; + reg u64[5] u; + reg u64[4] r; + + k = __decode_scalar(_k); + u = __decode_u_coordinate5(_u); + r = __curve25519_internal_ref5(k, u); + + return r; +} + +inline fn __curve25519_ref5_base(reg u64[4] _k) -> reg u64[4] +{ + stack u8[32] k; + reg u64[5] u; + reg u64[4] r; + + k = __decode_scalar(_k); + u = __decode_u_coordinate_base5(); + r = __curve25519_internal_ref5(k, u); + + return r; +} + +//EOR# export fn jade_scalarmult_curve25519_amd64_ref5(#spill_to_mmx reg u64 qp np pp) -> reg u64 { diff --git a/src/crypto_scalarmult/curve25519/amd64/ref5/sqr5.jinc b/src/crypto_scalarmult/curve25519/amd64/ref5/sqr5.jinc deleted file mode 100644 index 64a6e3f1..00000000 --- a/src/crypto_scalarmult/curve25519/amd64/ref5/sqr5.jinc +++ /dev/null @@ -1,333 +0,0 @@ -inline fn __sqr5_rs(stack u64[5] xa) -> reg u64[5] -{ - reg bool cf; - reg u64[5] r; - reg u64 squarer01 squarer11 squarer21 squarer31 squarer41; - reg u64 squarerax squarerdx; - reg u64 squareredmask squaret; - - squarerax = xa[0]; - squarerdx, squarerax = squarerax * xa[0]; - r[0] = squarerax; - squarer01 = squarerdx; - squarerax = xa[0]; - squarerax <<= 1; - squarerdx, squarerax = squarerax * xa[1]; - r[1] = squarerax; - squarer11 = squarerdx; - squarerax = xa[0]; - squarerax <<= 1; - squarerdx, squarerax = squarerax * xa[2]; - r[2] = squarerax; - squarer21 = squarerdx; - squarerax = xa[0]; - squarerax <<= 1; - squarerdx, squarerax = squarerax * xa[3]; - r[3] = squarerax; - squarer31 = squarerdx; - squarerax = xa[0]; - squarerax <<= 1; - squarerdx, squarerax = squarerax * xa[4]; - r[4] = squarerax; - squarer41 = squarerdx; - squarerax = xa[1]; - squarerdx, squarerax = squarerax * xa[1]; - cf, r[2] += squarerax; - _, squarer21 += squarerdx + cf; - squarerax = xa[1]; - squarerax <<= 1; - squarerdx, squarerax = squarerax * xa[2]; - cf, r[3] += squarerax; - _, squarer31 += squarerdx + cf; - squarerax = xa[1]; - squarerax <<= 1; - squarerdx, squarerax = squarerax * xa[3]; - cf, r[4] += squarerax; - _, squarer41 += squarerdx + cf; - squarerax = xa[1]; - squarerax *= 38; - squarerdx, squarerax = squarerax * xa[4]; - cf, 
r[0] += squarerax; - _, squarer01 += squarerdx + cf; - squarerax = xa[2]; - squarerdx, squarerax = squarerax * xa[2]; - cf, r[4] += squarerax; - _, squarer41 += squarerdx + cf; - squarerax = xa[2]; - squarerax *= 38; - squarerdx, squarerax = squarerax * xa[3]; - cf, r[0] += squarerax; - _, squarer01 += squarerdx + cf; - squarerax = xa[2]; - squarerax *= 38; - squarerdx, squarerax = squarerax * xa[4]; - cf, r[1] += squarerax; - _, squarer11 += squarerdx + cf; - squarerax = xa[3]; - squarerax *= 19; - squarerdx, squarerax = squarerax * xa[3]; - cf, r[1] += squarerax; - _, squarer11 += squarerdx + cf; - squarerax = xa[3]; - squarerax *= 38; - squarerdx, squarerax = squarerax * xa[4]; - cf, r[2] += squarerax; - _, squarer21 += squarerdx + cf; - squarerax = xa[4]; - squarerax *= 19; - squarerdx, squarerax = squarerax * xa[4]; - cf, r[3] += squarerax; - _, squarer31 += squarerdx + cf; - squareredmask = 0x7FFFFFFFFFFFF; - _, _, _, _, _, squarer01 = #SHLD(squarer01, r[0], 13); - r[0] &= squareredmask; - _, _, _, _, _, squarer11 = #SHLD(squarer11, r[1], 13); - r[1] &= squareredmask; - r[1] += squarer01; - _, _, _, _, _, squarer21 = #SHLD(squarer21, r[2], 13); - r[2] &= squareredmask; - r[2] += squarer11; - _, _, _, _, _, squarer31 = #SHLD(squarer31, r[3], 13); - r[3] &= squareredmask; - r[3] += squarer21; - _, _, _, _, _, squarer41 = #SHLD(squarer41, r[4], 13); - r[4] &= squareredmask; - r[4] += squarer31; - squarer41 = squarer41 * 19; - r[0] += squarer41; - squaret = r[0]; - squaret >>= 51; - squaret += r[1]; - r[0] &= squareredmask; - r[1] = squaret; - squaret >>= 51; - squaret += r[2]; - r[1] &= squareredmask; - r[2] = squaret; - squaret >>= 51; - squaret += r[3]; - r[2] &= squareredmask; - r[3] = squaret; - squaret >>= 51; - squaret += r[4]; - r[3] &= squareredmask; - r[4] = squaret; - squaret >>= 51; - squaret *= 19; - r[0] += squaret; - r[4] &= squareredmask; - - return r; -} - -inline fn __sqr5_ss(stack u64[5] xa) -> stack u64[5] -{ - stack u64[5] rs; - reg u64[5] r; - - r = __sqr5_rs(xa); - rs = #copy(r); - - return rs; -} - -// //////////////////////////////////////////////////////////////////////////// - -// TODO replace "-> reg ptr u64[5]" by "reg u64[5]" when r.a. 
@ f call -#[returnaddress="stack"] -fn _sqr5_p(reg ptr u64[5] xa) -> reg ptr u64[5] -{ - inline int i; - reg bool cf; - reg u64[5] r; - reg u64 squarer01 squarer11 squarer21 squarer31 squarer41; - reg u64 squarerax squarerdx; - reg u64 squareredmask squaret; - - squarerax = xa[0]; - squarerdx, squarerax = squarerax * xa[0]; - r[0] = squarerax; - squarer01 = squarerdx; - squarerax = xa[0]; - squarerax <<= 1; - squarerdx, squarerax = squarerax * xa[1]; - r[1] = squarerax; - squarer11 = squarerdx; - squarerax = xa[0]; - squarerax <<= 1; - squarerdx, squarerax = squarerax * xa[2]; - r[2] = squarerax; - squarer21 = squarerdx; - squarerax = xa[0]; - squarerax <<= 1; - squarerdx, squarerax = squarerax * xa[3]; - r[3] = squarerax; - squarer31 = squarerdx; - squarerax = xa[0]; - squarerax <<= 1; - squarerdx, squarerax = squarerax * xa[4]; - r[4] = squarerax; - squarer41 = squarerdx; - squarerax = xa[1]; - squarerdx, squarerax = squarerax * xa[1]; - cf, r[2] += squarerax; - _, squarer21 += squarerdx + cf; - squarerax = xa[1]; - squarerax <<= 1; - squarerdx, squarerax = squarerax * xa[2]; - cf, r[3] += squarerax; - _, squarer31 += squarerdx + cf; - squarerax = xa[1]; - squarerax <<= 1; - squarerdx, squarerax = squarerax * xa[3]; - cf, r[4] += squarerax; - _, squarer41 += squarerdx + cf; - squarerax = xa[1]; - squarerax *= 38; - squarerdx, squarerax = squarerax * xa[4]; - cf, r[0] += squarerax; - _, squarer01 += squarerdx + cf; - squarerax = xa[2]; - squarerdx, squarerax = squarerax * xa[2]; - cf, r[4] += squarerax; - _, squarer41 += squarerdx + cf; - squarerax = xa[2]; - squarerax *= 38; - squarerdx, squarerax = squarerax * xa[3]; - cf, r[0] += squarerax; - _, squarer01 += squarerdx + cf; - squarerax = xa[2]; - squarerax *= 38; - squarerdx, squarerax = squarerax * xa[4]; - cf, r[1] += squarerax; - _, squarer11 += squarerdx + cf; - squarerax = xa[3]; - squarerax *= 19; - squarerdx, squarerax = squarerax * xa[3]; - cf, r[1] += squarerax; - _, squarer11 += squarerdx + cf; - squarerax = xa[3]; - squarerax *= 38; - squarerdx, squarerax = squarerax * xa[4]; - cf, r[2] += squarerax; - _, squarer21 += squarerdx + cf; - squarerax = xa[4]; - squarerax *= 19; - squarerdx, squarerax = squarerax * xa[4]; - cf, r[3] += squarerax; - _, squarer31 += squarerdx + cf; - squareredmask = 0x7FFFFFFFFFFFF; - _, _, _, _, _, squarer01 = #SHLD(squarer01, r[0], 13); - r[0] &= squareredmask; - _, _, _, _, _, squarer11 = #SHLD(squarer11, r[1], 13); - r[1] &= squareredmask; - r[1] += squarer01; - _, _, _, _, _, squarer21 = #SHLD(squarer21, r[2], 13); - r[2] &= squareredmask; - r[2] += squarer11; - _, _, _, _, _, squarer31 = #SHLD(squarer31, r[3], 13); - r[3] &= squareredmask; - r[3] += squarer21; - _, _, _, _, _, squarer41 = #SHLD(squarer41, r[4], 13); - r[4] &= squareredmask; - r[4] += squarer31; - squarer41 = squarer41 * 19; - r[0] += squarer41; - squaret = r[0]; - squaret >>= 51; - squaret += r[1]; - r[0] &= squareredmask; - r[1] = squaret; - squaret >>= 51; - squaret += r[2]; - r[1] &= squareredmask; - r[2] = squaret; - squaret >>= 51; - squaret += r[3]; - r[2] &= squareredmask; - r[3] = squaret; - squaret >>= 51; - squaret += r[4]; - r[3] &= squareredmask; - r[4] = squaret; - squaret >>= 51; - squaret *= 19; - r[0] += squaret; - r[4] &= squareredmask; - - for i=0 to 5 - { xa[i] = r[i]; } - - return xa; -} - -inline fn _sqr5_ss_(stack u64[5] xa) -> stack u64[5] -{ - inline int j; - stack u64[5] ra; - reg ptr u64[5] rp; - reg u64 t; - - for j=0 to 5 - { t = xa[j]; ra[j] = t; } - - rp = ra; - rp = _sqr5_p(rp); - ra = rp; - - 
return ra; -} - -inline fn _sqr5_s_(stack u64[5] x) -> stack u64[5] -{ - reg ptr u64[5] xp; - - xp = x; - xp = _sqr5_p(xp); - x = xp; - - return x; -} - -// //////////////////////////////////////////////////////////////////////////// - -#[returnaddress="stack"] -fn _it_sqr5_p(reg ptr u64[5] x, reg u32 i) -> reg ptr u64[5] -{ - reg bool zf; - - while { - x = _sqr5_p(x); - _,_,_,zf,i = #DEC_32(i); - }(!zf) - - return x; -} - -inline fn _it_sqr5_s_(stack u64[5] x, reg u32 i) -> stack u64[5] -{ - reg ptr u64[5] xp; - - xp = x; - xp = _it_sqr5_p(xp, i); - x = xp; - - return x; -} - -inline fn _it_sqr5_ss_(stack u64[5] r x, reg u32 i) -> stack u64[5] -{ - inline int j; - reg ptr u64[5] rp; - reg u64 t; - - for j=0 to 5 - { t = x[j]; r[j] = t; } - - rp = r; - rp = _it_sqr5_p(rp, i); - r = rp; - - return r; -} - diff --git a/submodules/formosa-25519 b/submodules/formosa-25519 new file mode 160000 index 00000000..2c230ade --- /dev/null +++ b/submodules/formosa-25519 @@ -0,0 +1 @@ +Subproject commit 2c230ade5aad8c307652915cf3af040e6d816bd0
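
Note: this change also pins a new gitlink at submodules/formosa-25519 (commit 2c230ade). A tree that applies this diff will likely need the submodule fetched before building; with a standard git setup that would be something like:

    git submodule update --init submodules/formosa-25519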