From 9287a6cdc37c7c37e744f8418a13a74bb0e629ef Mon Sep 17 00:00:00 2001
From: Daniel Schadt
Date: Thu, 17 Apr 2025 12:56:44 +0200
Subject: fuzz against slow aez-ref, not fast aez-ni

Two reasons: First, this allows us to test more of the algorithm, as the
(slow) reference implementation supports multiple associated data items,
large values for tau, ...

Second, this avoids the segfault crash, which is a limitation of the fast
implementation (the assumption there is that data is properly aligned, so
that even an out-of-bounds read will not cause a segfault).
---
 aezref/aezv5/aesni/encrypt.c | 944 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 944 insertions(+)
 create mode 100644 aezref/aezv5/aesni/encrypt.c

diff --git a/aezref/aezv5/aesni/encrypt.c b/aezref/aezv5/aesni/encrypt.c
new file mode 100644
index 0000000..9c86bcf
--- /dev/null
+++ b/aezref/aezv5/aesni/encrypt.c
@@ -0,0 +1,944 @@
+/*
+ // AEZ v5 AES-NI version. AEZ info: http://www.cs.ucdavis.edu/~rogaway/aez
+ //
+ // REQUIREMENTS: - Intel or ARM CPU supporting AES instructions
+ //               - Faster if all pointers are 16-byte aligned.
+ //               - Max 16 byte nonce, 16 byte authenticator
+ //               - Single AD (AEZ spec allows vector AD but this code doesn't)
+ //               - Max 2^32-1 byte buffers allowed (due to using unsigned int)
+ //
+ // Written by Ted Krovetz (ted@krovetz.net). Last modified 21 March 2017.
+ //
+ // This is free and unencumbered software released into the public domain.
+ //
+ // Anyone is free to copy, modify, publish, use, compile, sell, or
+ // distribute this software, either in source code form or as a compiled
+ // binary, for any purpose, commercial or non-commercial, and by any
+ // means.
+ //
+ // In jurisdictions that recognize copyright laws, the author or authors
+ // of this software dedicate any and all copyright interest in the
+ // software to the public domain. We make this dedication for the benefit
+ // of the public at large and to the detriment of our heirs and
+ // successors. We intend this dedication to be an overt act of
+ // relinquishment in perpetuity of all present and future rights to this
+ // software under copyright law.
+ //
+ // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ // IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ // OTHER DEALINGS IN THE SOFTWARE.
+ //
+ // For more information, please refer to
+ */
+
+#include "crypto_aead.h"
+#include
+#include
+
+/* ------------------------------------------------------------------------- */
+#if __AES__               /* Defined by gcc/clang when compiling for AES-NI */
+/* ------------------------------------------------------------------------- */
+
+#include
+#include
+#define block __m128i
+
+/* ------------------------------------------------------------------------- */
+
+#define zero           _mm_setzero_si128()
+#define vadd(x,y)      _mm_add_epi8(x,y)
+#define vand(x,y)      _mm_and_si128(x,y)
+#define vandnot(x,y)   _mm_andnot_si128(x,y)  /* (~x)&y */
+#define vor(x,y)       _mm_or_si128(x,y)
+#define vxor(x,y)      _mm_xor_si128(x,y)
+
+static int is_zero(block x) { return _mm_testz_si128(x,x); }       /* 0 or 1 */
+
+static block sll4(block x) {
+    return vor(_mm_srli_epi64(x, 4), _mm_slli_epi64(_mm_srli_si128(x, 8), 60));
+}
+
+static block srl4(block x) {
+    return vor(_mm_slli_epi64(x, 4), _mm_srli_epi64(_mm_slli_si128(x, 8), 60));
+}
+
+static __m128i bswap16(__m128i b) {
+    const __m128i t = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
+    return _mm_shuffle_epi8(b,t);
+}
+
+static __m128i double_block(__m128i bl) {
+    const __m128i mask = _mm_set_epi32(135,1,1,1);
+    __m128i tmp = _mm_srai_epi32(bl, 31);
+    tmp = _mm_and_si128(tmp, mask);
+    tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2,1,0,3));
+    bl = _mm_slli_epi32(bl, 1);
+    return _mm_xor_si128(bl,tmp);
+}
+
+static __m128i aes(__m128i *key, __m128i in, __m128i first_key) {
+    in = vxor(in, first_key);
+    in = _mm_aesenc_si128 (in,key[0]);
+    in = _mm_aesenc_si128 (in,key[2]);
+    in = _mm_aesenc_si128 (in,key[5]);
+    in = _mm_aesenc_si128 (in,key[0]);
+    in = _mm_aesenc_si128 (in,key[2]);
+    in = _mm_aesenc_si128 (in,key[5]);
+    in = _mm_aesenc_si128 (in,key[0]);
+    in = _mm_aesenc_si128 (in,key[2]);
+    in = _mm_aesenc_si128 (in,key[5]);
+    return _mm_aesenc_si128 (in,key[0]);
+}
+
+static __m128i aes4(__m128i in, __m128i a, __m128i b,
+                    __m128i c, __m128i d, __m128i e) {
+    in = _mm_aesenc_si128(vxor(in,a),b);
+    in = _mm_aesenc_si128(in,c);
+    in = _mm_aesenc_si128(in,d);
+    return _mm_aesenc_si128 (in,e);
+}
+
+#define aes4pre(in,a,b,c,d)  aes4(in,a,b,c,d,zero)
+
+static __m128i loadu(const void *p) { return _mm_loadu_si128((__m128i*)p); }
+static void storeu(const void *p, __m128i x) {_mm_storeu_si128((__m128i*)p,x);}
+
+#define load loadu      /* Intel with AES-NI has fast unaligned loads/stores */
+#define store storeu
+
+/* ------------------------------------------------------------------------- */
+#elif __ARM_FEATURE_CRYPTO
+/* ------------------------------------------------------------------------- */
+
+#include
+#define block uint8x16_t
+
+#define zero          vmovq_n_u8(0)
+#define vadd(x,y)     vaddq_u8(x,y)
+#define vand(x,y)     vandq_u8(x,y)
+#define vandnot(x,y)  vbicq_u8(y,x)   /* (~x)&y */
+#define vor(x,y)      vorrq_u8(x,y)
+#define vxor(x,y)     veorq_u8(x,y)
+
+static int is_zero(block x) {                                       /* 0 or 1 */
+    uint8x8_t t = vorr_u8(vget_high_u8(x), vget_low_u8(x));
+    return vget_lane_u64(vreinterpret_u64_u8(t),0) == 0;
+}
+
+static block srl4(block x) {
+    const block mask = {15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,0};
+    uint8x16_t tmp = vandq_u8(vshrq_n_u8(vextq_u8(x, x, 1),4),mask);
+    return veorq_u8(tmp,vshlq_n_u8(x,4));
+}
+
+static block sll4(block x) {
+    const block mask = {0,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15};
+    uint8x16_t tmp = vshlq_n_u8(vandq_u8(vextq_u8(x, x, 15),mask),4);
+    return veorq_u8(tmp,vshrq_n_u8(x,4));
+}
+
+static uint8x16_t bswap16(uint8x16_t b) { return b; }     /* Not with
uint8x16_t */ + +static block double_block(block b) { + const block mask = {135,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}; + block tmp = (block)vshrq_n_s8((int8x16_t)b,7); + tmp = vandq_u8(tmp, mask); + tmp = vextq_u8(tmp, tmp, 1); /* Rotate high byte to low end */ + b = vshlq_n_u8(b,1); + return veorq_u8(tmp,b); +} + +static uint8x16_t aes(uint8x16_t *key, uint8x16_t in, uint8x16_t first_key) { + in = vaesmcq_u8(vaeseq_u8(in, first_key)); + in = vaesmcq_u8(vaeseq_u8(in, key[0])); + in = vaesmcq_u8(vaeseq_u8(in, key[2])); + in = vaesmcq_u8(vaeseq_u8(in, key[5])); + in = vaesmcq_u8(vaeseq_u8(in, key[0])); + in = vaesmcq_u8(vaeseq_u8(in, key[2])); + in = vaesmcq_u8(vaeseq_u8(in, key[5])); + in = vaesmcq_u8(vaeseq_u8(in, key[0])); + in = vaesmcq_u8(vaeseq_u8(in, key[2])); + in = vaesmcq_u8(vaeseq_u8(in, key[5])); + return vxor(in, key[0]); +} + +static uint8x16_t aes4pre(uint8x16_t in, uint8x16_t a, uint8x16_t b, + uint8x16_t c, uint8x16_t d) { + in = vaesmcq_u8(vaeseq_u8(in, a)); + in = vaesmcq_u8(vaeseq_u8(in, b)); + in = vaesmcq_u8(vaeseq_u8(in, c)); + return vaesmcq_u8(vaeseq_u8(in, d)); +} + +#define aes4(in,a,b,c,d,e) vxor(aes4pre(in,a,b,c,d),e) + +static uint8x16_t load(const void *p) { return *(uint8x16_t *)p; } +static void store(void *p, uint8x16_t x) { *(uint8x16_t *)p = x; } + +#define loadu load /* ARMv8 allows unaligned loads/stores */ +#define storeu store /* ARMv8 allows unaligned stores */ + +/* ------------------------------------------------------------------------- */ +#else +#error - This implementation requires __AES__ or __ARM_FEATURE_CRYPTO +#endif +/* ------------------------------------------------------------------------- */ + +#define vxor3(x,y,z) vxor(vxor(x,y),z) +#define vxor4(w,x,y,z) vxor(vxor(w,x),vxor(y,z)) +#define load_partial(p,n) loadu(p) + +/* +Might need a version like this if, for example, we want to load a 12-byte nonce +into a 16-byte block. + +static block load_partial(const void *p, unsigned n) { + if ((intptr_t)p % 16 == 0) return load(p); + else { + block tmp; unsigned i; + for (i=0; iI[0] = loadu(key); + ctx->J[0] = loadu(key+16); + ctx->L[0] = loadu(key+32); + } else { + blake2b(ctx, 48, 0, 0, key, keylen); /* Puts IJL into ctx */ + ctx->L[0] = ctx->J[0]; /* Rearrange. */ + ctx->J[0] = ctx->I[1]; /* Rearrange. */ + } + /* Fill remaining ctx locations with doublings */ + ctx->I[1] = double_block(bswap16(ctx->I[0])); /* No post-bswap */ + ctx->J[1] = bswap16(tmp = double_block(bswap16(ctx->J[0]))); + ctx->J[2] = bswap16(double_block(tmp)); + ctx->L[1] = bswap16(tmp = double_block(bswap16(ctx->L[0]))); + ctx->L[2] = bswap16(double_block(tmp)); + ctx->delta3_cache = zero; +} + +/* ------------------------------------------------------------------------- */ + +/* !! Warning !! 
Only handles nbytes <= 16 and abytes <= 16 */ +static block aez_hash(aez_ctx_t *ctx, char *n, unsigned nbytes, char *ad, + unsigned adbytes, unsigned abytes) { + block o1, o2, o3, o4, o5, o6, o7, o8, sum, offset, tmp; + block I=ctx->I[0], Ifordoubling = ctx->I[1], I2 = bswap16(Ifordoubling); + block L=ctx->L[0], L2=ctx->L[1],L4=ctx->L[2]; + block J=ctx->J[0], J2 = ctx->J[1], J4 = ctx->J[2], J5 = vxor(J,J4); + + /* Process abytes and nonce */ + offset = vxor4(J, J2, I2, L); + tmp = zero_set_byte((char)(8*abytes),15); + sum = aes4pre(offset,tmp,J,I,L); + + if (nbytes==16) sum = aes4(vxor(loadu(n), J4), vxor(I2, L),J,I,L,sum); + else sum = aes4(vxor(J4, I), + one_zero_pad(load_partial(n,nbytes),16-nbytes),J,I,L,sum); + + if (ad) { /* Possible easy misuse: ad==null && adbytes==0 */ + if (adbytes==0) { + ctx->delta3_cache = aes4pre(vxor(J5, I), loadu(pad+32),J,I,L); + } else { + block delta3 = zero; + offset = vxor(J5, I2); + while (adbytes >= 8*16) { + o1 = vxor(offset,L); + o2 = vxor(offset,L2); + o3 = vxor(o1,L2); + o4 = vxor(offset,L4); + o5 = vxor(o1,L4); + o6 = vxor(o2,L4); + o7 = vxor(o3,L4); + o8 = offset; + Ifordoubling = double_block(Ifordoubling); + offset = vxor(J5, bswap16(Ifordoubling)); + delta3 = vxor(delta3, aes4pre(load(ad+ 0), o1, J, I, L)); + delta3 = vxor(delta3, aes4pre(load(ad+ 16), o2, J, I, L)); + delta3 = vxor(delta3, aes4pre(load(ad+ 32), o3, J, I, L)); + delta3 = vxor(delta3, aes4pre(load(ad+ 48), o4, J, I, L)); + delta3 = vxor(delta3, aes4pre(load(ad+ 64), o5, J, I, L)); + delta3 = vxor(delta3, aes4pre(load(ad+ 80), o6, J, I, L)); + delta3 = vxor(delta3, aes4pre(load(ad+ 96), o7, J, I, L)); + delta3 = vxor(delta3, aes4pre(load(ad+112), o8, J, I, L)); + adbytes-=8*16; ad+=8*16; + } + if (adbytes >= 4*16) { + o1 = vxor(offset,L); + o2 = vxor(offset,L2); + o3 = vxor(o1,L2); + o4 = offset = vxor(offset,L4); + delta3 = vxor(delta3, aes4pre(load(ad+ 0), o1, J, I, L)); + delta3 = vxor(delta3, aes4pre(load(ad+ 16), o2, J, I, L)); + delta3 = vxor(delta3, aes4pre(load(ad+ 32), o3, J, I, L)); + delta3 = vxor(delta3, aes4pre(load(ad+ 48), o4, J, I, L)); + adbytes-=4*16; ad+=4*16; + } + if (adbytes >= 2*16) { + o1 = vxor(offset,L); + o2 = offset = vxor(offset,L2); + delta3 = vxor(delta3, aes4pre(load(ad+ 0), o1, J, I, L)); + delta3 = vxor(delta3, aes4pre(load(ad+ 16), o2, J, I, L)); + adbytes-=2*16; ad+=2*16; + } + if (adbytes >= 1*16) { + o1 = vxor(offset,L); + delta3 = vxor(delta3, aes4pre(load(ad+ 0), o1, J, I, L)); + adbytes-=1*16; ad+=1*16; + } + if (adbytes) { + tmp = vxor3(J5, I, one_zero_pad(load(ad),16-adbytes)); + delta3 = aes4(vxor(J5, I), one_zero_pad(load(ad),16-adbytes), + J, I, L, delta3); + } + ctx->delta3_cache = delta3; + } + } + return vxor(sum,ctx->delta3_cache); +} + +/* ------------------------------------------------------------------------- */ + +static block pass_one(aez_ctx_t *ctx, block *src, unsigned bytes, block *dst) { + block o1, o2, o3, o4, o5, o6, o7, o8, offset, tmp, sum=zero; + block I=ctx->I[0], Ifordoubling = ctx->I[1]; + block L=ctx->L[0], L2=ctx->L[1],L4=ctx->L[2]; + block J=ctx->J[0]; + offset = vxor(J, bswap16(Ifordoubling)); + while (bytes >= 16*16) { + o1 = vxor(offset,L); + o2 = vxor(offset,L2); + o3 = vxor(o1,L2); + o4 = vxor(offset,L4); + o5 = vxor(o1,L4); + o6 = vxor(o2,L4); + o7 = vxor(o3,L4); + o8 = offset; + Ifordoubling = double_block(Ifordoubling); + offset = vxor(J,bswap16(Ifordoubling)); + store(dst+ 0, aes4(load(src + 1),o1, J, I, L, load(src+ 0))); + store(dst+ 2, aes4(load(src + 3),o2, J, I, L, load(src+ 2))); + store(dst+ 
4, aes4(load(src + 5),o3, J, I, L, load(src+ 4))); + store(dst+ 6, aes4(load(src + 7),o4, J, I, L, load(src+ 6))); + store(dst+ 8, aes4(load(src + 9),o5, J, I, L, load(src+ 8))); + store(dst+10, aes4(load(src +11),o6, J, I, L, load(src+10))); + store(dst+12, aes4(load(src +13),o7, J, I, L, load(src+12))); + store(dst+14, aes4(load(src +15),o8, J, I, L, load(src+14))); + tmp=aes4(I,load(dst+ 0),J,I,L,load(src+ 1));store(dst+ 1,tmp); + sum=vxor(sum,tmp); + tmp=aes4(I,load(dst+ 2),J,I,L,load(src+ 3)); + store(dst+ 3,tmp);sum=vxor(sum,tmp); + tmp=aes4(I,load(dst+ 4),J,I,L,load(src+ 5)); + store(dst+ 5,tmp);sum=vxor(sum,tmp); + tmp=aes4(I,load(dst+ 6),J,I,L,load(src+ 7)); + store(dst+ 7,tmp);sum=vxor(sum,tmp); + tmp=aes4(I,load(dst+ 8),J,I,L,load(src+ 9)); + store(dst+ 9,tmp);sum=vxor(sum,tmp); + tmp=aes4(I,load(dst+10),J,I,L,load(src+11)); + store(dst+11,tmp);sum=vxor(sum,tmp); + tmp=aes4(I,load(dst+12),J,I,L,load(src+13)); + store(dst+13,tmp);sum=vxor(sum,tmp); + tmp=aes4(I,load(dst+14),J,I,L,load(src+15)); + store(dst+15,tmp);sum=vxor(sum,tmp); + bytes -= 16*16; dst += 16; src += 16; + } + if (bytes >= 8*16) { + o1 = vxor(offset,L); + o2 = vxor(offset,L2); + o3 = vxor(o1,L2); + o4 = offset = vxor(offset,L4); + store(dst+ 0, aes4(load(src + 1),o1, J, I, L, load(src+ 0))); + store(dst+ 2, aes4(load(src + 3),o2, J, I, L, load(src+ 2))); + store(dst+ 4, aes4(load(src + 5),o3, J, I, L, load(src+ 4))); + store(dst+ 6, aes4(load(src + 7),o4, J, I, L, load(src+ 6))); + tmp=aes4(I,load(dst+ 0),J,I,L,load(src+ 1)); + store(dst+ 1,tmp);sum=vxor(sum,tmp); + tmp=aes4(I,load(dst+ 2),J,I,L,load(src+ 3)); + store(dst+ 3,tmp);sum=vxor(sum,tmp); + tmp=aes4(I,load(dst+ 4),J,I,L,load(src+ 5)); + store(dst+ 5,tmp);sum=vxor(sum,tmp); + tmp=aes4(I,load(dst+ 6),J,I,L,load(src+ 7)); + store(dst+ 7,tmp);sum=vxor(sum,tmp); + bytes -= 8*16; dst += 8; src += 8; + } + if (bytes >= 4*16) { + o1 = vxor(offset,L); + o2 = offset = vxor(offset,L2); + store(dst+ 0, aes4(load(src + 1),o1, J, I, L, load(src+ 0))); + store(dst+ 2, aes4(load(src + 3),o2, J, I, L, load(src+ 2))); + tmp=aes4(I,load(dst+ 0),J,I,L,load(src+ 1)); + store(dst+ 1,tmp);sum=vxor(sum,tmp); + tmp=aes4(I,load(dst+ 2),J,I,L,load(src+ 3)); + store(dst+ 3,tmp);sum=vxor(sum,tmp); + bytes -= 4*16; dst += 4; src += 4; + } + if (bytes) { + o1 = vxor(offset,L); + store(dst+ 0, aes4(load(src + 1),o1, J, I, L, load(src+ 0))); + tmp=aes4(I,load(dst+ 0),J,I,L,load(src+ 1)); + store(dst+ 1,tmp);sum=vxor(sum,tmp); + } + return sum; +} + +/* ------------------------------------------------------------------------- */ + +static block pass_two(aez_ctx_t *ctx, block s, unsigned bytes, block *dst) { + block o1, o2, o3, o4, o5, o6, o7, o8, sum=zero, offset, fs[8], tmp[8]; + block I=ctx->I[0], Ifordoubling = ctx->I[1]; + block L=ctx->L[0], L2=ctx->L[1],L4=ctx->L[2]; + block J=ctx->J[0], J2=ctx->J[1], J3=vxor(J,J2); + offset = vxor(J2, bswap16(Ifordoubling)); + while (bytes >= 16*16) { + o1 = vxor(offset,L); + o2 = vxor(offset,L2); + o3 = vxor(o1,L2); + o4 = vxor(offset,L4); + o5 = vxor(o1,L4); + o6 = vxor(o2,L4); + o7 = vxor(o3,L4); + o8 = offset; + Ifordoubling = double_block(Ifordoubling); + offset = vxor(J2, bswap16(Ifordoubling)); + fs[0] = aes4pre(s,o1,J,I,L); fs[1] = aes4pre(s,o2,J,I,L); + fs[2] = aes4pre(s,o3,J,I,L); fs[3] = aes4pre(s,o4,J,I,L); + fs[4] = aes4pre(s,o5,J,I,L); fs[5] = aes4pre(s,o6,J,I,L); + fs[6] = aes4pre(s,o7,J,I,L); fs[7] = aes4pre(s,o8,J,I,L); + o1 = vxor(J3,o1); o2 = vxor(J3,o2); + o3 = vxor(J3,o3); o4 = vxor(J3,o4); + o5 = vxor(J3,o5); o6 = 
vxor(J3,o6); + o7 = vxor(J3,o7); o8 = vxor(J3,o8); + tmp[0] = vxor(load(dst+ 0),fs[0]); sum = vxor(sum,tmp[0]); + store(dst+ 0,vxor(load(dst+ 1),fs[0])); + tmp[1] = vxor(load(dst+ 2),fs[1]); sum = vxor(sum,tmp[1]); + store(dst+ 2,vxor(load(dst+ 3),fs[1])); + tmp[2] = vxor(load(dst+ 4),fs[2]); sum = vxor(sum,tmp[2]); + store(dst+ 4,vxor(load(dst+ 5),fs[2])); + tmp[3] = vxor(load(dst+ 6),fs[3]); sum = vxor(sum,tmp[3]); + store(dst+ 6,vxor(load(dst+ 7),fs[3])); + tmp[4] = vxor(load(dst+ 8),fs[4]); sum = vxor(sum,tmp[4]); + store(dst+ 8,vxor(load(dst+ 9),fs[4])); + tmp[5] = vxor(load(dst+10),fs[5]); sum = vxor(sum,tmp[5]); + store(dst+10,vxor(load(dst+11),fs[5])); + tmp[6] = vxor(load(dst+12),fs[6]); sum = vxor(sum,tmp[6]); + store(dst+12,vxor(load(dst+13),fs[6])); + tmp[7] = vxor(load(dst+14),fs[7]); sum = vxor(sum,tmp[7]); + store(dst+14,vxor(load(dst+15),fs[7])); + store(dst+ 1, aes4(I,load(dst+ 0), J, I, L, tmp[0])); + store(dst+ 3, aes4(I,load(dst+ 2), J, I, L, tmp[1])); + store(dst+ 5, aes4(I,load(dst+ 4), J, I, L, tmp[2])); + store(dst+ 7, aes4(I,load(dst+ 6), J, I, L, tmp[3])); + store(dst+ 9, aes4(I,load(dst+ 8), J, I, L, tmp[4])); + store(dst+11, aes4(I,load(dst+10), J, I, L, tmp[5])); + store(dst+13, aes4(I,load(dst+12), J, I, L, tmp[6])); + store(dst+15, aes4(I,load(dst+14), J, I, L, tmp[7])); + store(dst+ 0, aes4(load(dst+ 1),o1, J, I, L, load(dst+ 0))); + store(dst+ 2, aes4(load(dst+ 3),o2, J, I, L, load(dst+ 2))); + store(dst+ 4, aes4(load(dst+ 5),o3, J, I, L, load(dst+ 4))); + store(dst+ 6, aes4(load(dst+ 7),o4, J, I, L, load(dst+ 6))); + store(dst+ 8, aes4(load(dst+ 9),o5, J, I, L, load(dst+ 8))); + store(dst+10, aes4(load(dst+11),o6, J, I, L, load(dst+10))); + store(dst+12, aes4(load(dst+13),o7, J, I, L, load(dst+12))); + store(dst+14, aes4(load(dst+15),o8, J, I, L, load(dst+14))); + bytes -= 16*16; dst += 16; + } + if (bytes >= 8*16) { + o1 = vxor(offset,L); + o2 = vxor(offset,L2); + o3 = vxor(o1,L2); + o4 = offset = vxor(offset,L4); + fs[0] = aes4pre(s,o1,J,I,L); fs[1] = aes4pre(s,o2,J,I,L); + fs[2] = aes4pre(s,o3,J,I,L); fs[3] = aes4pre(s,o4,J,I,L); + o1 = vxor(J3,o1); o2 = vxor(J3,o2); + o3 = vxor(J3,o3); o4 = vxor(J3,o4); + tmp[0] = vxor(load(dst+ 0),fs[0]); sum = vxor(sum,tmp[0]); + store(dst+ 0,vxor(load(dst+ 1),fs[0])); + tmp[1] = vxor(load(dst+ 2),fs[1]); sum = vxor(sum,tmp[1]); + store(dst+ 2,vxor(load(dst+ 3),fs[1])); + tmp[2] = vxor(load(dst+ 4),fs[2]); sum = vxor(sum,tmp[2]); + store(dst+ 4,vxor(load(dst+ 5),fs[2])); + tmp[3] = vxor(load(dst+ 6),fs[3]); sum = vxor(sum,tmp[3]); + store(dst+ 6,vxor(load(dst+ 7),fs[3])); + store(dst+ 1, aes4(I,load(dst+ 0), J, I, L, tmp[0])); + store(dst+ 3, aes4(I,load(dst+ 2), J, I, L, tmp[1])); + store(dst+ 5, aes4(I,load(dst+ 4), J, I, L, tmp[2])); + store(dst+ 7, aes4(I,load(dst+ 6), J, I, L, tmp[3])); + store(dst+ 0, aes4(load(dst+ 1),o1, J, I, L, load(dst+ 0))); + store(dst+ 2, aes4(load(dst+ 3),o2, J, I, L, load(dst+ 2))); + store(dst+ 4, aes4(load(dst+ 5),o3, J, I, L, load(dst+ 4))); + store(dst+ 6, aes4(load(dst+ 7),o4, J, I, L, load(dst+ 6))); + bytes -= 8*16; dst += 8; + } + if (bytes >= 4*16) { + o1 = vxor(offset,L); + o2 = offset = vxor(offset,L2); + fs[0] = aes4pre(s,o1,J,I,L); fs[1] = aes4pre(s,o2,J,I,L); + o1 = vxor(J3,o1); o2 = vxor(J3,o2); + tmp[0] = vxor(load(dst+ 0),fs[0]); sum = vxor(sum,tmp[0]); + store(dst+ 0,vxor(load(dst+ 1),fs[0])); + tmp[1] = vxor(load(dst+ 2),fs[1]); sum = vxor(sum,tmp[1]); + store(dst+ 2,vxor(load(dst+ 3),fs[1])); + store(dst+ 1, aes4(I,load(dst+ 0), J, I, L, tmp[0])); + store(dst+ 3, 
aes4(I,load(dst+ 2), J, I, L, tmp[1])); + store(dst+ 0, aes4(load(dst+ 1),o1, J, I, L, load(dst+ 0))); + store(dst+ 2, aes4(load(dst+ 3),o2, J, I, L, load(dst+ 2))); + bytes -= 4*16; dst += 4; + } + if (bytes) { + o1 = vxor(offset,L); + fs[0] = aes4pre(s,o1,J,I,L); + o1 = vxor(J3,o1); + tmp[0] = vxor(load(dst+ 0),fs[0]); sum = vxor(sum,tmp[0]); + store(dst+ 0,vxor(load(dst+ 1),fs[0])); + store(dst+ 1, aes4(I,load(dst+ 0), J, I, L, tmp[0])); + store(dst+ 0, aes4(load(dst+ 1),o1, J, I, L, load(dst+ 0))); + } + return sum; +} + +/* ------------------------------------------------------------------------- */ + +static int cipher_aez_core(aez_ctx_t *ctx, block t, int d, char *src, + unsigned bytes, unsigned abytes, char *dst) { + block s, x, y, frag0, frag1, final0, final1; + block I=ctx->I[0], L=ctx->L[0], J=ctx->J[0]; + block L4=ctx->L[2], I2 = bswap16(ctx->I[1]); + unsigned i, frag_bytes, initial_bytes; + + if (!d) bytes += abytes; + frag_bytes = bytes % 32; + initial_bytes = bytes - frag_bytes - 32; + + /* Compute x and store intermediate results */ + x = pass_one(ctx, (block*)src, initial_bytes, (block*)dst); + if (frag_bytes >= 16) { + frag0 = load(src + initial_bytes); + frag1 = one_zero_pad(load(src + initial_bytes + 16), 32-frag_bytes); + x = aes4(frag0, vxor(L4, I2), J, I, L, x); + x = vxor(x, aes4pre(frag1, vxor3(I2, L4, L), J, I, L)); + } else if (frag_bytes) { + frag0 = one_zero_pad(load(src + initial_bytes), 16-frag_bytes); + x = aes4(frag0, vxor(L4, I2), J, I, L, x); + } + + /* Calculate s and final block values (y xor'd to final1 later) */ + final0 = vxor3(loadu(src + (bytes - 32)), x, t); + if (d || !abytes) final1 = loadu(src+(bytes-32)+16); + else final1 = zero_pad(loadu(src+(bytes-32)+16), abytes); + final0 = aes4(final1, vxor(I2, ctx->L[d]), J, I, L, final0); + final1 = vxor(final1, aes((block*)ctx, final0, ctx->L[d])); + s = vxor(final0, final1); + final0 = vxor(final0, aes((block*)ctx, final1, ctx->L[d^1])); + /* Decryption: final0 should hold abytes zero bytes. 
If not, failure */ + if (d && !is_zero(vandnot(loadu(pad+abytes),final0))) return -1; + final1 = aes4(final0, vxor(I2, ctx->L[d^1]), J, I, L, final1); + + /* Compute y and store final results */ + y = pass_two(ctx, s, initial_bytes, (block*)dst); + if (frag_bytes >= 16) { + frag0 = vxor(frag0, aes((block*)ctx, s, L4)); + frag1 = vxor(frag1, aes((block*)ctx, s, vxor(L4, L))); + frag1 = one_zero_pad(frag1, 32-frag_bytes); + y = aes4(frag0, vxor(I2, L4), J, I, L, y); + y = vxor(y, aes4pre(frag1, vxor3(I2, L4, L), J, I, L)); + store(dst + initial_bytes, frag0); + store(dst + initial_bytes + 16, frag1); + } else if (frag_bytes) { + frag0 = vxor(frag0, aes((block*)ctx, s, L4)); + frag0 = one_zero_pad(frag0, 16-frag_bytes); + y = aes4(frag0, vxor(I2, L4), J, I, L, y); + store(dst + initial_bytes, frag0); + } + + storeu(dst + (bytes - 32), vxor3(final1, y, t)); + if (!d || !abytes) + storeu(dst + (bytes - 32) + 16, final0); + else { + for (i=0; i<16-abytes; i++) + ((char*)dst + (bytes - 16))[i] = ((char*)&final0)[i]; + } + return 0; +} + +/* ------------------------------------------------------------------------- */ + +static int cipher_aez_tiny(aez_ctx_t *ctx, block t, int d, char *src, + unsigned bytes, unsigned abytes, char *dst) { + block l, r, tmp, one, rcon, buf[2], mask_10, mask_ff; + block I=ctx->I[0], L=ctx->L[0], J=ctx->J[0], t_orig = t; + block L2=ctx->L[1], L4=ctx->L[2], I2 = bswap16(ctx->I[1]); + unsigned rnds, i; + + /* load src into buf, zero pad, update bytes for abytes */ + if (bytes >= 16) { + buf[0] = load(src); + buf[1] = zero_pad(load_partial(src+16,bytes-16),32-bytes); + } else { + buf[0] = zero_pad(load_partial(src,bytes),16-bytes); + buf[1] = zero; + } + if (!d) bytes += abytes; + + /* load l/r, create 10* padding masks, shift r 4 bits if odd length */ + l = buf[0]; + r = loadu((char*)buf+bytes/2); + mask_ff = loadu(pad+16-bytes/2); + mask_10 = loadu(pad+32-bytes/2); + if (bytes&1) { /* Odd length. Deal with nibbles. */ + mask_10 = sll4(mask_10); + ((char*)&mask_ff)[bytes/2] = (char)0xf0; + r = bswap16(r); + r = srl4(r); + r = bswap16(r); + } + r = vor(vand(r, mask_ff), mask_10); + + /* Add tweak offset into t, and determine the number of rounds */ + if (bytes >= 16) { + t = vxor4(t, I2, L2, L4); /* (0,6) offset */ + rnds = 8; + } else { + t = vxor(vxor4(t, I2, L2, L4), L); /* (0,7) offset */ + if (bytes>=3) rnds = 10; else if (bytes==2) rnds = 16; else rnds = 24; + } + + if (!d) { + one = zero_set_byte(1,15); + rcon = zero; + } else { + one = zero_set_byte(-1,15); + rcon = zero_set_byte((char)(rnds-1),15); + } + + if ((d) && (bytes < 16)) { + block offset = vxor3(I2, L, L2); + tmp = vor(l, loadu(pad+32)); + tmp = aes4pre(t_orig, vxor(tmp,offset), J, I, L); + tmp = vand(tmp, loadu(pad+32)); + l = vxor(l, tmp); + } + + /* Feistel */ + for (i=0; iL[0], ctx->L[1])); + for (i=0; iL[0], ctx->L[1])), 16-abytes); + return is_zero(vandnot(t, claimed)) - 1; /* is_zero return 0 or 1 */ + } else if (bytes < 32) { + return cipher_aez_tiny(ctx, t, 1, src, bytes, abytes, dst); + } else { + return cipher_aez_core(ctx, t, 1, src, bytes, abytes, dst); + } +} + +/* ------------------------------------------------------------------------- */ +/* Reference Blake2b code, here for convenience, and not for speed. 
*/
+/* Downloaded Sep 2015 from https://github.com/mjosaarinen/blake2_mjosref     */
+
+#include
+
+typedef struct {
+    uint8_t b[128];
+    uint64_t h[8];
+    uint64_t t[2];
+    size_t c;
+    size_t outlen;
+} blake2b_ctx;
+
+#ifndef ROTR64
+#define ROTR64(x, y)  (((x) >> (y)) ^ ((x) << (64 - (y))))
+#endif
+
+#define B2B_GET64(p)                            \
+    (((uint64_t) ((uint8_t *) (p))[0]) ^        \
+    (((uint64_t) ((uint8_t *) (p))[1]) << 8) ^  \
+    (((uint64_t) ((uint8_t *) (p))[2]) << 16) ^ \
+    (((uint64_t) ((uint8_t *) (p))[3]) << 24) ^ \
+    (((uint64_t) ((uint8_t *) (p))[4]) << 32) ^ \
+    (((uint64_t) ((uint8_t *) (p))[5]) << 40) ^ \
+    (((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \
+    (((uint64_t) ((uint8_t *) (p))[7]) << 56))
+
+#define B2B_G(a, b, c, d, x, y) {   \
+    v[a] = v[a] + v[b] + x;         \
+    v[d] = ROTR64(v[d] ^ v[a], 32); \
+    v[c] = v[c] + v[d];             \
+    v[b] = ROTR64(v[b] ^ v[c], 24); \
+    v[a] = v[a] + v[b] + y;         \
+    v[d] = ROTR64(v[d] ^ v[a], 16); \
+    v[c] = v[c] + v[d];             \
+    v[b] = ROTR64(v[b] ^ v[c], 63); }
+
+static const uint64_t blake2b_iv[8] = {
+    0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
+    0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
+    0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
+    0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
+};
+
+static void blake2b_compress(blake2b_ctx *ctx, int last)
+{
+    const uint8_t sigma[12][16] = {
+        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+        { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
+        {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
+        {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
+        {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
+        { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
+        { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
+        {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
+        { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
+        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
+    };
+    int i;
+    uint64_t v[16], m[16];
+
+    for (i = 0; i < 8; i++) {
+        v[i] = ctx->h[i];
+        v[i + 8] = blake2b_iv[i];
+    }
+
+    v[12] ^= ctx->t[0];
+    v[13] ^= ctx->t[1];
+    if (last)
+        v[14] = ~v[14];
+
+    for (i = 0; i < 16; i++)
+        m[i] = B2B_GET64(&ctx->b[8 * i]);
+
+    for (i = 0; i < 12; i++) {
+        B2B_G( 0, 4,  8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
+        B2B_G( 1, 5,  9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
+        B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
+        B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
+        B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
+        B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
+        B2B_G( 2, 7,  8, 13, m[sigma[i][12]], m[sigma[i][13]]);
+        B2B_G( 3, 4,  9, 14, m[sigma[i][14]], m[sigma[i][15]]);
+    }
+
+    for( i = 0; i < 8; ++i )
+        ctx->h[i] ^= v[i] ^ v[i + 8];
+}
+
+static void blake2b_update(blake2b_ctx *ctx,
+                           const void *in, size_t inlen)
+{
+    size_t i;
+
+    for (i = 0; i < inlen; i++) {
+        if (ctx->c == 128) {
+            ctx->t[0] += ctx->c;
+            if (ctx->t[0] < ctx->c)
+                ctx->t[1]++;
+            blake2b_compress(ctx, 0);
+            ctx->c = 0;
+        }
+        ctx->b[ctx->c++] = ((const uint8_t *) in)[i];
+    }
+}
+
+static void blake2b_final(blake2b_ctx *ctx, void *out)
+{
+    size_t i;
+
+    ctx->t[0] += ctx->c;
+    if (ctx->t[0] < ctx->c)
+        ctx->t[1]++;
+
+    while (ctx->c < 128)
+        ctx->b[ctx->c++] = 0;
+    blake2b_compress(ctx, 1);
+
+    for (i = 0; i < ctx->outlen; i++) {
+        ((uint8_t *) out)[i] =
+            (ctx->h[i >> 3] >> (8 * (i & 7))) & 0xFF;
+    }
+}
+
+static int blake2b_init(blake2b_ctx *ctx, size_t outlen,
+                        const void *key, size_t keylen)
+{
+    size_t i;
+
+    if (outlen == 0 || outlen > 64 || keylen > 64)
+        return -1;
+
+    for (i = 0; i < 8; i++)
+        ctx->h[i] = blake2b_iv[i];
+    ctx->h[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen;
+
+    ctx->t[0] = 0;
+    ctx->t[1] = 0;
+    ctx->c = 0;
+    ctx->outlen = outlen;
+
+    for (i = keylen; i < 128; i++)
+        ctx->b[i] = 0;
+    if (keylen > 0) {
+        blake2b_update(ctx, key, keylen);
+        ctx->c = 128;
+    }
+
+    return 0;
+}
+
+static int blake2b(void *out, size_t outlen,
+                   const void *key, size_t keylen,
+                   const void *in, size_t inlen)
+{
+    blake2b_ctx ctx;
+
+    if (blake2b_init(&ctx, outlen, key, keylen))
+        return -1;
+    blake2b_update(&ctx, in, inlen);
+    blake2b_final(&ctx, out);
+
+    return 0;
+}
+
+/* ------------------------------------------------------------------------- */
+/* aez mapping for CAESAR competition                                        */
+
+int crypto_aead_encrypt(
+    unsigned char *c,unsigned long long *clen,
+    const unsigned char *m,unsigned long long mlen,
+    const unsigned char *ad,unsigned long long adlen,
+    const unsigned char *nsec,
+    const unsigned char *npub,
+    const unsigned char *k
+)
+{
+    aez_ctx_t ctx;
+    (void)nsec;
+    if (clen) *clen = mlen+16;
+    aez_setup((unsigned char *)k, 48, &ctx);
+    aez_encrypt(&ctx, (char *)npub, 12,
+                (char *)ad, (unsigned)adlen, 16,
+                (char *)m, (unsigned)mlen, (char *)c);
+    return 0;
+}
+
+int crypto_aead_decrypt(
+    unsigned char *m,unsigned long long *mlen,
+    unsigned char *nsec,
+    const unsigned char *c,unsigned long long clen,
+    const unsigned char *ad,unsigned long long adlen,
+    const unsigned char *npub,
+    const unsigned char *k
+)
+{
+    aez_ctx_t ctx;
+    (void)nsec;
+    if (mlen) *mlen = clen-16;
+    aez_setup((unsigned char *)k, 48, &ctx);
+    return aez_decrypt(&ctx, (char *)npub, 12,
+                       (char *)ad, (unsigned)adlen, 16,
+                       (char *)c, (unsigned)clen, (char *)m);
+}
--
cgit v1.2.3
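
The commit message above argues for fuzzing against this reference implementation rather than the fast AES-NI build. For illustration only, a minimal round-trip harness over the CAESAR-style crypto_aead_encrypt/crypto_aead_decrypt entry points defined in this file might look like the sketch below. The libFuzzer hook LLVMFuzzerTestOneInput, the 48-byte-key/12-byte-nonce split of the fuzz input, and the empty associated data are assumptions made for this sketch; they are not part of the patch and need not match the project's actual fuzz target.

#include <stdlib.h>
#include <string.h>

/* Prototypes as defined in encrypt.c above. */
int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
                        const unsigned char *m, unsigned long long mlen,
                        const unsigned char *ad, unsigned long long adlen,
                        const unsigned char *nsec, const unsigned char *npub,
                        const unsigned char *k);
int crypto_aead_decrypt(unsigned char *m, unsigned long long *mlen,
                        unsigned char *nsec,
                        const unsigned char *c, unsigned long long clen,
                        const unsigned char *ad, unsigned long long adlen,
                        const unsigned char *npub, const unsigned char *k);

int LLVMFuzzerTestOneInput(const unsigned char *data, size_t size) {
    unsigned char key[48], npub[12];                 /* assumed input layout */
    unsigned long long clen = 0, mlen = 0;

    if (size < sizeof key + sizeof npub) return 0;
    memcpy(key,  data, sizeof key);   data += sizeof key;  size -= sizeof key;
    memcpy(npub, data, sizeof npub);  data += sizeof npub; size -= sizeof npub;

    /* Remaining bytes are the message; associated data is left empty here. */
    unsigned char *ct = malloc(size + 16);        /* ciphertext = message + tag */
    unsigned char *pt = malloc(size ? size : 1);
    if (!ct || !pt) { free(ct); free(pt); return 0; }

    crypto_aead_encrypt(ct, &clen, data, size,
                        (const unsigned char *)"", 0, NULL, npub, key);

    /* A freshly produced ciphertext must decrypt and round-trip exactly. */
    if (crypto_aead_decrypt(pt, &mlen, NULL, ct, clen,
                            (const unsigned char *)"", 0, npub, key) != 0)
        abort();
    if (mlen != size || (size && memcmp(pt, data, size) != 0))
        abort();

    free(ct); free(pt);
    return 0;
}

Differential fuzzing of the reference build against the AES-NI build follows the same pattern, with both outputs compared for byte equality instead of a single round trip.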