/*
// AEZ v5 AES-NI version. AEZ info: http://www.cs.ucdavis.edu/~rogaway/aez
//
// REQUIREMENTS: - Intel or ARM CPU supporting AES instructions
//               - Faster if all pointers are 16-byte aligned.
//               - Max 16 byte nonce, 16 byte authenticator
//               - Single AD (AEZ spec allows vector AD but this code doesn't)
//               - Max 2^32-1 byte buffers allowed (due to using unsigned int)
//
// Written by Ted Krovetz (ted@krovetz.net). Last modified 21 March 2017.
//
// This is free and unencumbered software released into the public domain.
//
// Anyone is free to copy, modify, publish, use, compile, sell, or
// distribute this software, either in source code form or as a compiled
// binary, for any purpose, commercial or non-commercial, and by any
// means.
//
// In jurisdictions that recognize copyright laws, the author or authors
// of this software dedicate any and all copyright interest in the
// software to the public domain. We make this dedication for the benefit
// of the public at large and to the detriment of our heirs and
// successors. We intend this dedication to be an overt act of
// relinquishment in perpetuity of all present and future rights to this
// software under copyright law.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// For more information, please refer to <http://unlicense.org/>
*/

#include "crypto_aead.h"
#include <stdint.h>
#include <stddef.h>

/* ------------------------------------------------------------------------- */
#if __AES__                /* Defined by gcc/clang when compiling for AES-NI */
/* ------------------------------------------------------------------------- */

#include <smmintrin.h>
#include <wmmintrin.h>

#define block __m128i

/* ------------------------------------------------------------------------- */

#define zero          _mm_setzero_si128()
#define vadd(x,y)     _mm_add_epi8(x,y)
#define vand(x,y)     _mm_and_si128(x,y)
#define vandnot(x,y)  _mm_andnot_si128(x,y)  /* (~x)&y */
#define vor(x,y)      _mm_or_si128(x,y)
#define vxor(x,y)     _mm_xor_si128(x,y)

static int is_zero(block x) { return _mm_testz_si128(x,x); }       /* 0 or 1 */

static block sll4(block x) {
    return vor(_mm_srli_epi64(x, 4), _mm_slli_epi64(_mm_srli_si128(x, 8), 60));
}

static block srl4(block x) {
    return vor(_mm_slli_epi64(x, 4), _mm_srli_epi64(_mm_slli_si128(x, 8), 60));
}

static __m128i bswap16(__m128i b) {
    const __m128i t = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
    return _mm_shuffle_epi8(b,t);
}

static __m128i double_block(__m128i bl) {
    const __m128i mask = _mm_set_epi32(135,1,1,1);
    __m128i tmp = _mm_srai_epi32(bl, 31);
    tmp = _mm_and_si128(tmp, mask);
    tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2,1,0,3));
    bl = _mm_slli_epi32(bl, 1);
    return _mm_xor_si128(bl,tmp);
}

static __m128i aes(__m128i *key, __m128i in, __m128i first_key) {
    in = vxor(in, first_key);
    in = _mm_aesenc_si128 (in,key[0]);
    in = _mm_aesenc_si128 (in,key[2]);
    in = _mm_aesenc_si128 (in,key[5]);
    in = _mm_aesenc_si128 (in,key[0]);
    in = _mm_aesenc_si128 (in,key[2]);
    in = _mm_aesenc_si128 (in,key[5]);
    in = _mm_aesenc_si128 (in,key[0]);
    in = _mm_aesenc_si128 (in,key[2]);
    in = _mm_aesenc_si128 (in,key[5]);
    return _mm_aesenc_si128 (in,key[0]);
}

static __m128i aes4(__m128i in, __m128i a, __m128i b, __m128i c, __m128i d,
                    __m128i e) {
    in = _mm_aesenc_si128(vxor(in,a),b);
    in = _mm_aesenc_si128(in,c);
    in = _mm_aesenc_si128(in,d);
    return _mm_aesenc_si128 (in,e);
}

#define aes4pre(in,a,b,c,d)  aes4(in,a,b,c,d,zero)

static __m128i loadu(const void *p) { return _mm_loadu_si128((__m128i*)p); }
static void storeu(const void *p, __m128i x) { _mm_storeu_si128((__m128i*)p,x); }

#define load  loadu     /* Intel with AES-NI has fast unaligned loads/stores */
#define store storeu

/* ------------------------------------------------------------------------- */
#elif __ARM_FEATURE_CRYPTO
/* ------------------------------------------------------------------------- */

#include <arm_neon.h>

#define block uint8x16_t

#define zero          vmovq_n_u8(0)
#define vadd(x,y)     vaddq_u8(x,y)
#define vand(x,y)     vandq_u8(x,y)
#define vandnot(x,y)  vbicq_u8(y,x)   /* (~x)&y */
#define vor(x,y)      vorrq_u8(x,y)
#define vxor(x,y)     veorq_u8(x,y)

static int is_zero(block x) {          /* 0 or 1 */
    uint8x8_t t = vorr_u8(vget_high_u8(x), vget_low_u8(x));
    return vget_lane_u64(vreinterpret_u64_u8(t),0) == 0;
}

static block srl4(block x) {
    const block mask = {15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,0};
    uint8x16_t tmp = vandq_u8(vshrq_n_u8(vextq_u8(x, x, 1),4),mask);
    return veorq_u8(tmp,vshlq_n_u8(x,4));
}

static block sll4(block x) {
    const block mask = {0,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15};
    uint8x16_t tmp = vshlq_n_u8(vandq_u8(vextq_u8(x, x, 15),mask),4);
    return veorq_u8(tmp,vshrq_n_u8(x,4));
}

static uint8x16_t bswap16(uint8x16_t b) { return b; } /* Not needed with uint8x16_t */

static block double_block(block b) {
    const block mask = {135,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
    block tmp = (block)vshrq_n_s8((int8x16_t)b,7);
    tmp = vandq_u8(tmp, mask);
    tmp = vextq_u8(tmp, tmp, 1);       /* Rotate high byte to low end */
    b = vshlq_n_u8(b,1);
    return veorq_u8(tmp,b);
}

static uint8x16_t aes(uint8x16_t *key, uint8x16_t in, uint8x16_t first_key) {
    in = vaesmcq_u8(vaeseq_u8(in, first_key));
    in = vaesmcq_u8(vaeseq_u8(in, key[0]));
    in = vaesmcq_u8(vaeseq_u8(in, key[2]));
    in = vaesmcq_u8(vaeseq_u8(in, key[5]));
    in = vaesmcq_u8(vaeseq_u8(in, key[0]));
    in = vaesmcq_u8(vaeseq_u8(in, key[2]));
    in = vaesmcq_u8(vaeseq_u8(in, key[5]));
    in = vaesmcq_u8(vaeseq_u8(in, key[0]));
    in = vaesmcq_u8(vaeseq_u8(in, key[2]));
    in = vaesmcq_u8(vaeseq_u8(in, key[5]));
    return vxor(in, key[0]);
}

static uint8x16_t aes4pre(uint8x16_t in, uint8x16_t a, uint8x16_t b,
                          uint8x16_t c, uint8x16_t d) {
    in = vaesmcq_u8(vaeseq_u8(in, a));
    in = vaesmcq_u8(vaeseq_u8(in, b));
    in = vaesmcq_u8(vaeseq_u8(in, c));
    return vaesmcq_u8(vaeseq_u8(in, d));
}

#define aes4(in,a,b,c,d,e) vxor(aes4pre(in,a,b,c,d),e)

static uint8x16_t load(const void *p) { return *(uint8x16_t *)p; }
static void store(void *p, uint8x16_t x) { *(uint8x16_t *)p = x; }

#define loadu load      /* ARMv8 allows unaligned loads/stores */
#define storeu store    /* ARMv8 allows unaligned stores       */

/* ------------------------------------------------------------------------- */
#else
#error - This implementation requires __AES__ or __ARM_FEATURE_CRYPTO
#endif
/* ------------------------------------------------------------------------- */

#define vxor3(x,y,z)    vxor(vxor(x,y),z)
#define vxor4(w,x,y,z)  vxor(vxor(w,x),vxor(y,z))

#define load_partial(p,n)  loadu(p)

/*
Might need a version like this if, for example, we want to load a 12-byte nonce
into a 16-byte block.
static block load_partial(const void *p, unsigned n) {
    if ((intptr_t)p % 16 == 0) return load(p);
    else {
        block tmp; unsigned i;
        for (i=0; i<n; i++) ((char*)&tmp)[i] = ((char*)p)[i];
        return tmp;
    }
}
*/

/* ------------------------------------------------------------------------- */

/* 16 bytes of 0xff, 16 bytes of 0x00, then a lone 0x80: windows into this
   array supply the masks used by zero_pad, one_zero_pad and the 10* padding */
static const unsigned char pad[] = {
    0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
    0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
    0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
};

static block zero_pad(block x, unsigned zero_bytes) {
    return vand(x, loadu(pad + zero_bytes));
}

static block one_zero_pad(block x, unsigned one_zero_bytes) {
    return vor(zero_pad(x, one_zero_bytes), loadu(pad + 16 + one_zero_bytes));
}

static block zero_set_byte(char val, unsigned idx) {
    block tmp = zero; ((char *)&tmp)[idx] = val; return tmp;
}

/* ------------------------------------------------------------------------- */

typedef struct {
    block I[2];             /* 1I; 2I kept in doubling byte order */
    block J[3];             /* 1J, 2J, 4J                         */
    block L[3];             /* 1L, 2L, 4L                         */
    block delta3_cache;
} aez_ctx_t;

/* ------------------------------------------------------------------------- */

static int blake2b(void *out, size_t outlen,
                   const void *key, size_t keylen,
                   const void *in, size_t inlen);

/* ------------------------------------------------------------------------- */

void aez_setup(unsigned char *key, unsigned keylen, aez_ctx_t *ctx) {
    block tmp;
    if (keylen == 48) {
        ctx->I[0] = loadu(key);
        ctx->J[0] = loadu(key+16);
        ctx->L[0] = loadu(key+32);
    } else {
        blake2b(ctx, 48, 0, 0, key, keylen);    /* Puts IJL into ctx */
        ctx->L[0] = ctx->J[0];                  /* Rearrange.        */
        ctx->J[0] = ctx->I[1];                  /* Rearrange.        */
    }
    /* Fill remaining ctx locations with doublings */
    ctx->I[1] = double_block(bswap16(ctx->I[0]));           /* No post-bswap */
    ctx->J[1] = bswap16(tmp = double_block(bswap16(ctx->J[0])));
    ctx->J[2] = bswap16(double_block(tmp));
    ctx->L[1] = bswap16(tmp = double_block(bswap16(ctx->L[0])));
    ctx->L[2] = bswap16(double_block(tmp));
    ctx->delta3_cache = zero;
}

/* ------------------------------------------------------------------------- */

/* !! Warning !! Only handles nbytes <= 16 and abytes <= 16 */
static block aez_hash(aez_ctx_t *ctx, char *n, unsigned nbytes, char *ad,
                      unsigned adbytes, unsigned abytes) {
    block o1, o2, o3, o4, o5, o6, o7, o8, sum, offset, tmp;
    block I=ctx->I[0], Ifordoubling = ctx->I[1], I2 = bswap16(Ifordoubling);
    block L=ctx->L[0], L2=ctx->L[1], L4=ctx->L[2];
    block J=ctx->J[0], J2 = ctx->J[1], J4 = ctx->J[2], J5 = vxor(J,J4);

    /* Process abytes and nonce */
    offset = vxor4(J, J2, I2, L);
    tmp = zero_set_byte((char)(8*abytes),15);
    sum = aes4pre(offset,tmp,J,I,L);

    if (nbytes==16)
        sum = aes4(vxor(loadu(n), J4), vxor(I2, L),J,I,L,sum);
    else
        sum = aes4(vxor(J4, I),
                   one_zero_pad(load_partial(n,nbytes),16-nbytes),J,I,L,sum);

    if (ad) {              /* Possible easy misuse: ad==null && adbytes==0 */
        if (adbytes==0) {
            ctx->delta3_cache = aes4pre(vxor(J5, I), loadu(pad+32),J,I,L);
        } else {
            block delta3 = zero;
            offset = vxor(J5, I2);
            while (adbytes >= 8*16) {
                o1 = vxor(offset,L);  o2 = vxor(offset,L2); o3 = vxor(o1,L2);
                o4 = vxor(offset,L4); o5 = vxor(o1,L4);     o6 = vxor(o2,L4);
                o7 = vxor(o3,L4);     o8 = offset;
                Ifordoubling = double_block(Ifordoubling);
                offset = vxor(J5, bswap16(Ifordoubling));
                delta3 = vxor(delta3, aes4pre(load(ad+  0), o1, J, I, L));
                delta3 = vxor(delta3, aes4pre(load(ad+ 16), o2, J, I, L));
                delta3 = vxor(delta3, aes4pre(load(ad+ 32), o3, J, I, L));
                delta3 = vxor(delta3, aes4pre(load(ad+ 48), o4, J, I, L));
                delta3 = vxor(delta3, aes4pre(load(ad+ 64), o5, J, I, L));
                delta3 = vxor(delta3, aes4pre(load(ad+ 80), o6, J, I, L));
                delta3 = vxor(delta3, aes4pre(load(ad+ 96), o7, J, I, L));
                delta3 = vxor(delta3, aes4pre(load(ad+112), o8, J, I, L));
                adbytes-=8*16; ad+=8*16;
            }
            if (adbytes >= 4*16) {
                o1 = vxor(offset,L); o2 = vxor(offset,L2); o3 = vxor(o1,L2);
                o4 = offset = vxor(offset,L4);
                delta3 = vxor(delta3, aes4pre(load(ad+  0), o1, J, I, L));
                delta3 = vxor(delta3, aes4pre(load(ad+ 16), o2, J, I, L));
                delta3 = vxor(delta3, aes4pre(load(ad+ 32), o3, J, I, L));
                delta3 = vxor(delta3, aes4pre(load(ad+ 48), o4, J, I, L));
                adbytes-=4*16; ad+=4*16;
            }
            if (adbytes >= 2*16) {
                o1 = vxor(offset,L); o2 = offset = vxor(offset,L2);
                delta3 = vxor(delta3, aes4pre(load(ad+  0), o1, J, I, L));
                delta3 = vxor(delta3, aes4pre(load(ad+ 16), o2, J, I, L));
                adbytes-=2*16; ad+=2*16;
            }
            if (adbytes >= 1*16) {
                o1 = vxor(offset,L);
                delta3 = vxor(delta3, aes4pre(load(ad+  0), o1, J, I, L));
                adbytes-=1*16; ad+=1*16;
            }
            if (adbytes) {
                tmp = vxor3(J5, I, one_zero_pad(load(ad),16-adbytes));
                delta3 = aes4(vxor(J5, I), one_zero_pad(load(ad),16-adbytes),
                              J, I, L, delta3);
            }
            ctx->delta3_cache = delta3;
        }
    }
    return vxor(sum,ctx->delta3_cache);
}

/* ------------------------------------------------------------------------- */

static block pass_one(aez_ctx_t *ctx, block *src, unsigned bytes, block
*dst) { block o1, o2, o3, o4, o5, o6, o7, o8, offset, tmp, sum=zero; block I=ctx->I[0], Ifordoubling = ctx->I[1]; block L=ctx->L[0], L2=ctx->L[1],L4=ctx->L[2]; block J=ctx->J[0]; offset = vxor(J, bswap16(Ifordoubling)); while (bytes >= 16*16) { o1 = vxor(offset,L); o2 = vxor(offset,L2); o3 = vxor(o1,L2); o4 = vxor(offset,L4); o5 = vxor(o1,L4); o6 = vxor(o2,L4); o7 = vxor(o3,L4); o8 = offset; Ifordoubling = double_block(Ifordoubling); offset = vxor(J,bswap16(Ifordoubling)); store(dst+ 0, aes4(load(src + 1),o1, J, I, L, load(src+ 0))); store(dst+ 2, aes4(load(src + 3),o2, J, I, L, load(src+ 2))); store(dst+ 4, aes4(load(src + 5),o3, J, I, L, load(src+ 4))); store(dst+ 6, aes4(load(src + 7),o4, J, I, L, load(src+ 6))); store(dst+ 8, aes4(load(src + 9),o5, J, I, L, load(src+ 8))); store(dst+10, aes4(load(src +11),o6, J, I, L, load(src+10))); store(dst+12, aes4(load(src +13),o7, J, I, L, load(src+12))); store(dst+14, aes4(load(src +15),o8, J, I, L, load(src+14))); tmp=aes4(I,load(dst+ 0),J,I,L,load(src+ 1));store(dst+ 1,tmp); sum=vxor(sum,tmp); tmp=aes4(I,load(dst+ 2),J,I,L,load(src+ 3)); store(dst+ 3,tmp);sum=vxor(sum,tmp); tmp=aes4(I,load(dst+ 4),J,I,L,load(src+ 5)); store(dst+ 5,tmp);sum=vxor(sum,tmp); tmp=aes4(I,load(dst+ 6),J,I,L,load(src+ 7)); store(dst+ 7,tmp);sum=vxor(sum,tmp); tmp=aes4(I,load(dst+ 8),J,I,L,load(src+ 9)); store(dst+ 9,tmp);sum=vxor(sum,tmp); tmp=aes4(I,load(dst+10),J,I,L,load(src+11)); store(dst+11,tmp);sum=vxor(sum,tmp); tmp=aes4(I,load(dst+12),J,I,L,load(src+13)); store(dst+13,tmp);sum=vxor(sum,tmp); tmp=aes4(I,load(dst+14),J,I,L,load(src+15)); store(dst+15,tmp);sum=vxor(sum,tmp); bytes -= 16*16; dst += 16; src += 16; } if (bytes >= 8*16) { o1 = vxor(offset,L); o2 = vxor(offset,L2); o3 = vxor(o1,L2); o4 = offset = vxor(offset,L4); store(dst+ 0, aes4(load(src + 1),o1, J, I, L, load(src+ 0))); store(dst+ 2, aes4(load(src + 3),o2, J, I, L, load(src+ 2))); store(dst+ 4, aes4(load(src + 5),o3, J, I, L, load(src+ 4))); store(dst+ 6, aes4(load(src + 7),o4, J, I, L, load(src+ 6))); tmp=aes4(I,load(dst+ 0),J,I,L,load(src+ 1)); store(dst+ 1,tmp);sum=vxor(sum,tmp); tmp=aes4(I,load(dst+ 2),J,I,L,load(src+ 3)); store(dst+ 3,tmp);sum=vxor(sum,tmp); tmp=aes4(I,load(dst+ 4),J,I,L,load(src+ 5)); store(dst+ 5,tmp);sum=vxor(sum,tmp); tmp=aes4(I,load(dst+ 6),J,I,L,load(src+ 7)); store(dst+ 7,tmp);sum=vxor(sum,tmp); bytes -= 8*16; dst += 8; src += 8; } if (bytes >= 4*16) { o1 = vxor(offset,L); o2 = offset = vxor(offset,L2); store(dst+ 0, aes4(load(src + 1),o1, J, I, L, load(src+ 0))); store(dst+ 2, aes4(load(src + 3),o2, J, I, L, load(src+ 2))); tmp=aes4(I,load(dst+ 0),J,I,L,load(src+ 1)); store(dst+ 1,tmp);sum=vxor(sum,tmp); tmp=aes4(I,load(dst+ 2),J,I,L,load(src+ 3)); store(dst+ 3,tmp);sum=vxor(sum,tmp); bytes -= 4*16; dst += 4; src += 4; } if (bytes) { o1 = vxor(offset,L); store(dst+ 0, aes4(load(src + 1),o1, J, I, L, load(src+ 0))); tmp=aes4(I,load(dst+ 0),J,I,L,load(src+ 1)); store(dst+ 1,tmp);sum=vxor(sum,tmp); } return sum; } /* ------------------------------------------------------------------------- */ static block pass_two(aez_ctx_t *ctx, block s, unsigned bytes, block *dst) { block o1, o2, o3, o4, o5, o6, o7, o8, sum=zero, offset, fs[8], tmp[8]; block I=ctx->I[0], Ifordoubling = ctx->I[1]; block L=ctx->L[0], L2=ctx->L[1],L4=ctx->L[2]; block J=ctx->J[0], J2=ctx->J[1], J3=vxor(J,J2); offset = vxor(J2, bswap16(Ifordoubling)); while (bytes >= 16*16) { o1 = vxor(offset,L); o2 = vxor(offset,L2); o3 = vxor(o1,L2); o4 = vxor(offset,L4); o5 = vxor(o1,L4); o6 = vxor(o2,L4); o7 = 
vxor(o3,L4); o8 = offset; Ifordoubling = double_block(Ifordoubling); offset = vxor(J2, bswap16(Ifordoubling)); fs[0] = aes4pre(s,o1,J,I,L); fs[1] = aes4pre(s,o2,J,I,L); fs[2] = aes4pre(s,o3,J,I,L); fs[3] = aes4pre(s,o4,J,I,L); fs[4] = aes4pre(s,o5,J,I,L); fs[5] = aes4pre(s,o6,J,I,L); fs[6] = aes4pre(s,o7,J,I,L); fs[7] = aes4pre(s,o8,J,I,L); o1 = vxor(J3,o1); o2 = vxor(J3,o2); o3 = vxor(J3,o3); o4 = vxor(J3,o4); o5 = vxor(J3,o5); o6 = vxor(J3,o6); o7 = vxor(J3,o7); o8 = vxor(J3,o8); tmp[0] = vxor(load(dst+ 0),fs[0]); sum = vxor(sum,tmp[0]); store(dst+ 0,vxor(load(dst+ 1),fs[0])); tmp[1] = vxor(load(dst+ 2),fs[1]); sum = vxor(sum,tmp[1]); store(dst+ 2,vxor(load(dst+ 3),fs[1])); tmp[2] = vxor(load(dst+ 4),fs[2]); sum = vxor(sum,tmp[2]); store(dst+ 4,vxor(load(dst+ 5),fs[2])); tmp[3] = vxor(load(dst+ 6),fs[3]); sum = vxor(sum,tmp[3]); store(dst+ 6,vxor(load(dst+ 7),fs[3])); tmp[4] = vxor(load(dst+ 8),fs[4]); sum = vxor(sum,tmp[4]); store(dst+ 8,vxor(load(dst+ 9),fs[4])); tmp[5] = vxor(load(dst+10),fs[5]); sum = vxor(sum,tmp[5]); store(dst+10,vxor(load(dst+11),fs[5])); tmp[6] = vxor(load(dst+12),fs[6]); sum = vxor(sum,tmp[6]); store(dst+12,vxor(load(dst+13),fs[6])); tmp[7] = vxor(load(dst+14),fs[7]); sum = vxor(sum,tmp[7]); store(dst+14,vxor(load(dst+15),fs[7])); store(dst+ 1, aes4(I,load(dst+ 0), J, I, L, tmp[0])); store(dst+ 3, aes4(I,load(dst+ 2), J, I, L, tmp[1])); store(dst+ 5, aes4(I,load(dst+ 4), J, I, L, tmp[2])); store(dst+ 7, aes4(I,load(dst+ 6), J, I, L, tmp[3])); store(dst+ 9, aes4(I,load(dst+ 8), J, I, L, tmp[4])); store(dst+11, aes4(I,load(dst+10), J, I, L, tmp[5])); store(dst+13, aes4(I,load(dst+12), J, I, L, tmp[6])); store(dst+15, aes4(I,load(dst+14), J, I, L, tmp[7])); store(dst+ 0, aes4(load(dst+ 1),o1, J, I, L, load(dst+ 0))); store(dst+ 2, aes4(load(dst+ 3),o2, J, I, L, load(dst+ 2))); store(dst+ 4, aes4(load(dst+ 5),o3, J, I, L, load(dst+ 4))); store(dst+ 6, aes4(load(dst+ 7),o4, J, I, L, load(dst+ 6))); store(dst+ 8, aes4(load(dst+ 9),o5, J, I, L, load(dst+ 8))); store(dst+10, aes4(load(dst+11),o6, J, I, L, load(dst+10))); store(dst+12, aes4(load(dst+13),o7, J, I, L, load(dst+12))); store(dst+14, aes4(load(dst+15),o8, J, I, L, load(dst+14))); bytes -= 16*16; dst += 16; } if (bytes >= 8*16) { o1 = vxor(offset,L); o2 = vxor(offset,L2); o3 = vxor(o1,L2); o4 = offset = vxor(offset,L4); fs[0] = aes4pre(s,o1,J,I,L); fs[1] = aes4pre(s,o2,J,I,L); fs[2] = aes4pre(s,o3,J,I,L); fs[3] = aes4pre(s,o4,J,I,L); o1 = vxor(J3,o1); o2 = vxor(J3,o2); o3 = vxor(J3,o3); o4 = vxor(J3,o4); tmp[0] = vxor(load(dst+ 0),fs[0]); sum = vxor(sum,tmp[0]); store(dst+ 0,vxor(load(dst+ 1),fs[0])); tmp[1] = vxor(load(dst+ 2),fs[1]); sum = vxor(sum,tmp[1]); store(dst+ 2,vxor(load(dst+ 3),fs[1])); tmp[2] = vxor(load(dst+ 4),fs[2]); sum = vxor(sum,tmp[2]); store(dst+ 4,vxor(load(dst+ 5),fs[2])); tmp[3] = vxor(load(dst+ 6),fs[3]); sum = vxor(sum,tmp[3]); store(dst+ 6,vxor(load(dst+ 7),fs[3])); store(dst+ 1, aes4(I,load(dst+ 0), J, I, L, tmp[0])); store(dst+ 3, aes4(I,load(dst+ 2), J, I, L, tmp[1])); store(dst+ 5, aes4(I,load(dst+ 4), J, I, L, tmp[2])); store(dst+ 7, aes4(I,load(dst+ 6), J, I, L, tmp[3])); store(dst+ 0, aes4(load(dst+ 1),o1, J, I, L, load(dst+ 0))); store(dst+ 2, aes4(load(dst+ 3),o2, J, I, L, load(dst+ 2))); store(dst+ 4, aes4(load(dst+ 5),o3, J, I, L, load(dst+ 4))); store(dst+ 6, aes4(load(dst+ 7),o4, J, I, L, load(dst+ 6))); bytes -= 8*16; dst += 8; } if (bytes >= 4*16) { o1 = vxor(offset,L); o2 = offset = vxor(offset,L2); fs[0] = aes4pre(s,o1,J,I,L); fs[1] = aes4pre(s,o2,J,I,L); o1 = 
vxor(J3,o1); o2 = vxor(J3,o2); tmp[0] = vxor(load(dst+ 0),fs[0]); sum = vxor(sum,tmp[0]); store(dst+ 0,vxor(load(dst+ 1),fs[0])); tmp[1] = vxor(load(dst+ 2),fs[1]); sum = vxor(sum,tmp[1]); store(dst+ 2,vxor(load(dst+ 3),fs[1])); store(dst+ 1, aes4(I,load(dst+ 0), J, I, L, tmp[0])); store(dst+ 3, aes4(I,load(dst+ 2), J, I, L, tmp[1])); store(dst+ 0, aes4(load(dst+ 1),o1, J, I, L, load(dst+ 0))); store(dst+ 2, aes4(load(dst+ 3),o2, J, I, L, load(dst+ 2))); bytes -= 4*16; dst += 4; } if (bytes) { o1 = vxor(offset,L); fs[0] = aes4pre(s,o1,J,I,L); o1 = vxor(J3,o1); tmp[0] = vxor(load(dst+ 0),fs[0]); sum = vxor(sum,tmp[0]); store(dst+ 0,vxor(load(dst+ 1),fs[0])); store(dst+ 1, aes4(I,load(dst+ 0), J, I, L, tmp[0])); store(dst+ 0, aes4(load(dst+ 1),o1, J, I, L, load(dst+ 0))); } return sum; } /* ------------------------------------------------------------------------- */ static int cipher_aez_core(aez_ctx_t *ctx, block t, int d, char *src, unsigned bytes, unsigned abytes, char *dst) { block s, x, y, frag0, frag1, final0, final1; block I=ctx->I[0], L=ctx->L[0], J=ctx->J[0]; block L4=ctx->L[2], I2 = bswap16(ctx->I[1]); unsigned i, frag_bytes, initial_bytes; if (!d) bytes += abytes; frag_bytes = bytes % 32; initial_bytes = bytes - frag_bytes - 32; /* Compute x and store intermediate results */ x = pass_one(ctx, (block*)src, initial_bytes, (block*)dst); if (frag_bytes >= 16) { frag0 = load(src + initial_bytes); frag1 = one_zero_pad(load(src + initial_bytes + 16), 32-frag_bytes); x = aes4(frag0, vxor(L4, I2), J, I, L, x); x = vxor(x, aes4pre(frag1, vxor3(I2, L4, L), J, I, L)); } else if (frag_bytes) { frag0 = one_zero_pad(load(src + initial_bytes), 16-frag_bytes); x = aes4(frag0, vxor(L4, I2), J, I, L, x); } /* Calculate s and final block values (y xor'd to final1 later) */ final0 = vxor3(loadu(src + (bytes - 32)), x, t); if (d || !abytes) final1 = loadu(src+(bytes-32)+16); else final1 = zero_pad(loadu(src+(bytes-32)+16), abytes); final0 = aes4(final1, vxor(I2, ctx->L[d]), J, I, L, final0); final1 = vxor(final1, aes((block*)ctx, final0, ctx->L[d])); s = vxor(final0, final1); final0 = vxor(final0, aes((block*)ctx, final1, ctx->L[d^1])); /* Decryption: final0 should hold abytes zero bytes. 
       If not, failure */
    if (d && !is_zero(vandnot(loadu(pad+abytes),final0))) return -1;
    final1 = aes4(final0, vxor(I2, ctx->L[d^1]), J, I, L, final1);

    /* Compute y and store final results */
    y = pass_two(ctx, s, initial_bytes, (block*)dst);
    if (frag_bytes >= 16) {
        frag0 = vxor(frag0, aes((block*)ctx, s, L4));
        frag1 = vxor(frag1, aes((block*)ctx, s, vxor(L4, L)));
        frag1 = one_zero_pad(frag1, 32-frag_bytes);
        y = aes4(frag0, vxor(I2, L4), J, I, L, y);
        y = vxor(y, aes4pre(frag1, vxor3(I2, L4, L), J, I, L));
        store(dst + initial_bytes, frag0);
        store(dst + initial_bytes + 16, frag1);
    } else if (frag_bytes) {
        frag0 = vxor(frag0, aes((block*)ctx, s, L4));
        frag0 = one_zero_pad(frag0, 16-frag_bytes);
        y = aes4(frag0, vxor(I2, L4), J, I, L, y);
        store(dst + initial_bytes, frag0);
    }

    storeu(dst + (bytes - 32), vxor3(final1, y, t));
    if (!d || !abytes)
        storeu(dst + (bytes - 32) + 16, final0);
    else {
        for (i=0; i<16-abytes; i++)
            ((char*)dst + (bytes - 16))[i] = ((char*)&final0)[i];
    }
    return 0;
}

/* ------------------------------------------------------------------------- */

static int cipher_aez_tiny(aez_ctx_t *ctx, block t, int d, char *src,
                           unsigned bytes, unsigned abytes, char *dst) {
    block l, r, tmp, one, rcon, buf[2], mask_10, mask_ff;
    block I=ctx->I[0], L=ctx->L[0], J=ctx->J[0], t_orig = t;
    block L2=ctx->L[1], L4=ctx->L[2], I2 = bswap16(ctx->I[1]);
    unsigned rnds, i;

    /* load src into buf, zero pad, update bytes for abytes */
    if (bytes >= 16) {
        buf[0] = load(src);
        buf[1] = zero_pad(load_partial(src+16,bytes-16),32-bytes);
    } else {
        buf[0] = zero_pad(load_partial(src,bytes),16-bytes);
        buf[1] = zero;
    }
    if (!d) bytes += abytes;

    /* load l/r, create 10* padding masks, shift r 4 bits if odd length */
    l = buf[0];
    r = loadu((char*)buf+bytes/2);
    mask_ff = loadu(pad+16-bytes/2);
    mask_10 = loadu(pad+32-bytes/2);
    if (bytes&1) {                       /* Odd length. Deal with nibbles. */
        mask_10 = sll4(mask_10);
        ((char*)&mask_ff)[bytes/2] = (char)0xf0;
        r = bswap16(r); r = srl4(r); r = bswap16(r);
    }
    r = vor(vand(r, mask_ff), mask_10);

    /* Add tweak offset into t, and determine the number of rounds */
    if (bytes >= 16) {
        t = vxor4(t, I2, L2, L4);             /* (0,6) offset */
        rnds = 8;
    } else {
        t = vxor(vxor4(t, I2, L2, L4), L);    /* (0,7) offset */
        if (bytes>=3) rnds = 10; else if (bytes==2) rnds = 16; else rnds = 24;
    }

    if (!d) {
        one = zero_set_byte(1,15);
        rcon = zero;
    } else {
        one = zero_set_byte(-1,15);
        rcon = zero_set_byte((char)(rnds-1),15);
    }

    if ((d) && (bytes < 16)) {
        block offset = vxor3(I2, L, L2);
        tmp = vor(l, loadu(pad+32));
        tmp = aes4pre(t_orig, vxor(tmp,offset), J, I, L);
        tmp = vand(tmp, loadu(pad+32));
        l = vxor(l, tmp);
    }

    /* Feistel */
    /* [Missing in this copy of the file: the Feistel round bodies and the
        remainder of this function; they are not reconstructed here. The
        aez_encrypt and aez_decrypt bodies below are restored only to the
        extent that the surviving fragments and the crypto_aead_* call sites
        at the end of the file determine them.]                              */
}

/* ------------------------------------------------------------------------- */

void aez_encrypt(aez_ctx_t *ctx, char *n, unsigned nbytes,
                 char *ad, unsigned adbytes, unsigned abytes,
                 char *src, unsigned bytes, char *dst) {

    block t = aez_hash(ctx, n, nbytes, ad, adbytes, abytes);
    if (bytes == 0) {
        unsigned i;
        t = aes((block*)ctx, t, vxor(ctx->L[0], ctx->L[1]));
        for (i=0; i<abytes; i++) dst[i] = ((char*)&t)[i];
    } else if (bytes + abytes < 32) {
        cipher_aez_tiny(ctx, t, 0, src, bytes, abytes, dst);
    } else {
        cipher_aez_core(ctx, t, 0, src, bytes, abytes, dst);
    }
}

/* ------------------------------------------------------------------------- */

int aez_decrypt(aez_ctx_t *ctx, char *n, unsigned nbytes,
                char *ad, unsigned adbytes, unsigned abytes,
                char *src, unsigned bytes, char *dst) {

    block t;
    if (bytes < abytes) return -1;
    t = aez_hash(ctx, n, nbytes, ad, adbytes, abytes);
    if (bytes == abytes) {
        block claimed = zero_pad(load_partial(src, abytes), 16-abytes);
        t = zero_pad(aes((block*)ctx, t, vxor(ctx->L[0], ctx->L[1])), 16-abytes);
        return is_zero(vandnot(t, claimed)) - 1;  /* is_zero returns 0 or 1 */
    } else if (bytes < 32) {
        return cipher_aez_tiny(ctx, t, 1, src, bytes, abytes, dst);
    } else {
        return cipher_aez_core(ctx, t, 1, src, bytes, abytes, dst);
    }
}
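/* ------------------------------------------------------------------------- */
/* Illustrative usage sketch (added commentary, not part of the original
   source). It assumes a 12-byte nonce and the 16-byte authenticator used by
   the CAESAR wrappers at the bottom of this file; ciphertext is always
   plaintext length + abytes. Passing ad==NULL on later calls appears to
   reuse the AD hash kept in ctx->delta3_cache, as the if (ad) branch in
   aez_hash suggests. Remove the #if 0 guard to compile it.                  */
#if 0
static int aez_example(void) {
    aez_ctx_t ctx;
    unsigned char key[48] = {0};                    /* 48-byte key is used directly */
    char nonce[12] = {0}, ad[4] = {'b','l','o','b'};
    char pt[40] = {0}, ct[40+16], back[40];

    aez_setup(key, 48, &ctx);
    aez_encrypt(&ctx, nonce, 12, ad, 4, 16, pt, 40, ct);   /* hashes and caches AD  */
    nonce[0] ^= 1;                                         /* fresh nonce, same AD  */
    aez_encrypt(&ctx, nonce, 12, 0, 0, 16, pt, 40, ct);    /* reuses delta3_cache   */
    return aez_decrypt(&ctx, nonce, 12, 0, 0, 16, ct, 40+16, back); /* 0 on success */
}
#endif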
/* ------------------------------------------------------------------------- */
/* Reference Blake2b code, here for convenience, and not for speed.          */
/* Downloaded Sep 2015 from https://github.com/mjosaarinen/blake2_mjosref    */

#include <stdint.h>

typedef struct {
    uint8_t b[128];
    uint64_t h[8];
    uint64_t t[2];
    size_t c;
    size_t outlen;
} blake2b_ctx;

#ifndef ROTR64
#define ROTR64(x, y)  (((x) >> (y)) ^ ((x) << (64 - (y))))
#endif

#define B2B_GET64(p)                            \
    (((uint64_t) ((uint8_t *) (p))[0]) ^        \
    (((uint64_t) ((uint8_t *) (p))[1]) << 8) ^  \
    (((uint64_t) ((uint8_t *) (p))[2]) << 16) ^ \
    (((uint64_t) ((uint8_t *) (p))[3]) << 24) ^ \
    (((uint64_t) ((uint8_t *) (p))[4]) << 32) ^ \
    (((uint64_t) ((uint8_t *) (p))[5]) << 40) ^ \
    (((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \
    (((uint64_t) ((uint8_t *) (p))[7]) << 56))

#define B2B_G(a, b, c, d, x, y) {   \
    v[a] = v[a] + v[b] + x;         \
    v[d] = ROTR64(v[d] ^ v[a], 32); \
    v[c] = v[c] + v[d];             \
    v[b] = ROTR64(v[b] ^ v[c], 24); \
    v[a] = v[a] + v[b] + y;         \
    v[d] = ROTR64(v[d] ^ v[a], 16); \
    v[c] = v[c] + v[d];             \
    v[b] = ROTR64(v[b] ^ v[c], 63); }

static const uint64_t blake2b_iv[8] = {
    0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
    0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
    0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
    0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};

static void blake2b_compress(blake2b_ctx *ctx, int last) {
    const uint8_t sigma[12][16] = {
        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
        { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
        {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
        {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
        {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
        { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
        { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
        {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
        { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
    };
    int i;
    uint64_t v[16], m[16];

    for (i = 0; i < 8; i++) {
        v[i] = ctx->h[i];
        v[i + 8] = blake2b_iv[i];
    }
    v[12] ^= ctx->t[0];
    v[13] ^= ctx->t[1];
    if (last) v[14] = ~v[14];

    for (i = 0; i < 16; i++) m[i] = B2B_GET64(&ctx->b[8 * i]);

    for (i = 0; i < 12; i++) {
        B2B_G( 0, 4,  8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
        B2B_G( 1, 5,  9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
        B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
        B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
        B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
        B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
        B2B_G( 2, 7,  8, 13, m[sigma[i][12]], m[sigma[i][13]]);
        B2B_G( 3, 4,  9, 14, m[sigma[i][14]], m[sigma[i][15]]);
    }

    for( i = 0; i < 8; ++i ) ctx->h[i] ^= v[i] ^ v[i + 8];
}

static void blake2b_update(blake2b_ctx *ctx, const void *in, size_t inlen) {
    size_t i;
    for (i = 0; i < inlen; i++) {
        if (ctx->c == 128) {
            ctx->t[0] += ctx->c;
            if (ctx->t[0] < ctx->c) ctx->t[1]++;
            blake2b_compress(ctx, 0);
            ctx->c = 0;
        }
        ctx->b[ctx->c++] = ((const uint8_t *) in)[i];
    }
}

static void blake2b_final(blake2b_ctx *ctx, void *out) {
    size_t i;
    ctx->t[0] += ctx->c;
    if (ctx->t[0] < ctx->c) ctx->t[1]++;
    while (ctx->c < 128) ctx->b[ctx->c++] = 0;
    blake2b_compress(ctx, 1);
    for (i = 0; i < ctx->outlen; i++) {
        ((uint8_t *) out)[i] = (ctx->h[i >> 3] >> (8 * (i & 7))) & 0xFF;
    }
}

static int blake2b_init(blake2b_ctx *ctx, size_t outlen,
                        const void *key, size_t keylen) {
    size_t i;
    if (outlen == 0 || outlen > 64 || keylen > 64) return -1;
    for (i = 0; i < 8; i++) ctx->h[i] = blake2b_iv[i];
    ctx->h[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen;
    ctx->t[0] = 0;
    ctx->t[1] = 0;
    ctx->c = 0;
    ctx->outlen = outlen;
    for (i = keylen; i < 128; i++) ctx->b[i] = 0;
    if (keylen > 0) {
        blake2b_update(ctx, key, keylen);
        ctx->c = 128;
    }
    return 0;
}

static int blake2b(void *out, size_t outlen, const void *key, size_t keylen,
                   const void *in, size_t inlen) {
    blake2b_ctx ctx;
    if (blake2b_init(&ctx, outlen, key, keylen)) return -1;
    blake2b_update(&ctx, in, inlen);
    blake2b_final(&ctx, out);
    return 0;
}

/* ------------------------------------------------------------------------- */

/* aez mapping for CAESAR competition */

int crypto_aead_encrypt(
    unsigned char *c, unsigned long long *clen,
    const unsigned char *m, unsigned long long mlen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *nsec,
    const unsigned char *npub,
    const unsigned char *k
) {
    aez_ctx_t ctx;
    (void)nsec;
    if (clen) *clen = mlen+16;
    aez_setup((unsigned char *)k, 48, &ctx);
    aez_encrypt(&ctx, (char *)npub, 12, (char *)ad, (unsigned)adlen, 16,
                (char *)m, (unsigned)mlen, (char *)c);
    return 0;
}

int crypto_aead_decrypt(
    unsigned char *m, unsigned long long *mlen,
    unsigned char *nsec,
    const unsigned char *c, unsigned long long clen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *npub,
    const unsigned char *k
) {
    aez_ctx_t ctx;
    (void)nsec;
    if (mlen) *mlen = clen-16;
    aez_setup((unsigned char *)k, 48, &ctx);
    return aez_decrypt(&ctx, (char *)npub, 12, (char *)ad, (unsigned)adlen, 16,
                       (char *)c, (unsigned)clen, (char *)m);
}
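/* ------------------------------------------------------------------------- */
/* Round-trip sketch for the CAESAR interface above (added for illustration,
   not part of the original source). It assumes a 12-byte npub and a 16-byte
   tag, matching the constants hard-coded into the wrappers. Remove the
   #if 0 guard to compile it.                                                */
#if 0
static int caesar_roundtrip(void) {
    unsigned char k[48] = {0}, npub[12] = {0}, ad[8] = {0};
    unsigned char m[32] = {0}, c[32+16], m2[32];
    unsigned long long clen = 0, mlen = 0;

    if (crypto_aead_encrypt(c, &clen, m, 32, ad, 8, 0, npub, k)) return -1;
    /* clen is now 32+16; decryption returns 0 only if the tag verifies */
    return crypto_aead_decrypt(m2, &mlen, 0, c, clen, ad, 8, npub, k);
}
#endif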