/*
// AEZ v5 AES-NI version. AEZ info: http://www.cs.ucdavis.edu/~rogaway/aez
//
// REQUIREMENTS: - Intel or ARM CPU supporting AES instructions
//               - Faster if all pointers are 16-byte aligned.
//               - Max 16 byte nonce, 16 byte authenticator
//               - Single AD (AEZ spec allows vector AD but this code doesn't)
//               - Max 2^32-1 byte buffers allowed (due to using unsigned int)
//
// Written by Ted Krovetz (ted@krovetz.net). Last modified 21 March 2017.
//
// This is free and unencumbered software released into the public domain.
//
// Anyone is free to copy, modify, publish, use, compile, sell, or
// distribute this software, either in source code form or as a compiled
// binary, for any purpose, commercial or non-commercial, and by any
// means.
//
// In jurisdictions that recognize copyright laws, the author or authors
// of this software dedicate any and all copyright interest in the
// software to the public domain. We make this dedication for the benefit
// of the public at large and to the detriment of our heirs and
// successors. We intend this dedication to be an overt act of
// relinquishment in perpetuity of all present and future rights to this
// software under copyright law.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// For more information, please refer to <http://unlicense.org/>
*/

#include "crypto_aead.h"
#include <stdint.h>
#include <stddef.h>

/* ------------------------------------------------------------------------- */
#if __AES__                /* Defined by gcc/clang when compiling for AES-NI */
/* ------------------------------------------------------------------------- */

#include <smmintrin.h>
#include <wmmintrin.h>

#define block __m128i

/* ------------------------------------------------------------------------- */

#define zero          _mm_setzero_si128()
#define vadd(x,y)     _mm_add_epi8(x,y)
#define vand(x,y)     _mm_and_si128(x,y)
#define vandnot(x,y)  _mm_andnot_si128(x,y)  /* (~x)&y */
#define vor(x,y)      _mm_or_si128(x,y)
#define vxor(x,y)     _mm_xor_si128(x,y)

static int is_zero(block x) { return _mm_testz_si128(x,x); }       /* 0 or 1 */

static block sll4(block x) {
    return vor(_mm_srli_epi64(x, 4), _mm_slli_epi64(_mm_srli_si128(x, 8), 60));
}

static block srl4(block x) {
    return vor(_mm_slli_epi64(x, 4), _mm_srli_epi64(_mm_slli_si128(x, 8), 60));
}

static __m128i bswap16(__m128i b) {
    const __m128i t = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
    return _mm_shuffle_epi8(b,t);
}

static __m128i double_block(__m128i bl) {
    const __m128i mask = _mm_set_epi32(135,1,1,1);
    __m128i tmp = _mm_srai_epi32(bl, 31);
    tmp = _mm_and_si128(tmp, mask);
    tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2,1,0,3));
    bl = _mm_slli_epi32(bl, 1);
    return _mm_xor_si128(bl,tmp);
}

static __m128i aes(__m128i *key, __m128i in, __m128i first_key) {
    in = vxor(in, first_key);
    in = _mm_aesenc_si128 (in,key[0]);
    in = _mm_aesenc_si128 (in,key[2]);
    in = _mm_aesenc_si128 (in,key[5]);
    in = _mm_aesenc_si128 (in,key[0]);
    in = _mm_aesenc_si128 (in,key[2]);
    in = _mm_aesenc_si128 (in,key[5]);
    in = _mm_aesenc_si128 (in,key[0]);
    in = _mm_aesenc_si128 (in,key[2]);
    in = _mm_aesenc_si128 (in,key[5]);
    return _mm_aesenc_si128 (in,key[0]);
}

static __m128i aes4(__m128i in, __m128i a, __m128i b, __m128i c, __m128i d,
                    __m128i e) {
    in = _mm_aesenc_si128(vxor(in,a),b);
    in = _mm_aesenc_si128(in,c);
    in = _mm_aesenc_si128(in,d);
    return _mm_aesenc_si128 (in,e);
}

#define aes4pre(in,a,b,c,d)  aes4(in,a,b,c,d,zero)

static __m128i loadu(const void *p) { return _mm_loadu_si128((__m128i*)p); }
static void storeu(const void *p, __m128i x) { _mm_storeu_si128((__m128i*)p,x); }

#define load  loadu     /* Intel with AES-NI has fast unaligned loads/stores */
#define store storeu

/* ------------------------------------------------------------------------- */
#elif __ARM_FEATURE_CRYPTO
/* ------------------------------------------------------------------------- */

#include <arm_neon.h>

#define block uint8x16_t

#define zero          vmovq_n_u8(0)
#define vadd(x,y)     vaddq_u8(x,y)
#define vand(x,y)     vandq_u8(x,y)
#define vandnot(x,y)  vbicq_u8(y,x)   /* (~x)&y */
#define vor(x,y)      vorrq_u8(x,y)
#define vxor(x,y)     veorq_u8(x,y)

static int is_zero(block x) {          /* 0 or 1 */
    uint8x8_t t = vorr_u8(vget_high_u8(x), vget_low_u8(x));
    return vget_lane_u64(vreinterpret_u64_u8(t),0) == 0;
}

static block srl4(block x) {
    const block mask = {15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,0};
    uint8x16_t tmp = vandq_u8(vshrq_n_u8(vextq_u8(x, x, 1),4),mask);
    return veorq_u8(tmp,vshlq_n_u8(x,4));
}

static block sll4(block x) {
    const block mask = {0,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15};
    uint8x16_t tmp = vshlq_n_u8(vandq_u8(vextq_u8(x, x, 15),mask),4);
    return veorq_u8(tmp,vshrq_n_u8(x,4));
}

static uint8x16_t bswap16(uint8x16_t b) { return b; } /* Not needed with uint8x16_t */

static block double_block(block b) {
    const block mask = {135,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
    block tmp = (block)vshrq_n_s8((int8x16_t)b,7);
    tmp = vandq_u8(tmp, mask);
    tmp = vextq_u8(tmp, tmp, 1);       /* Rotate high byte to low end */
    b = vshlq_n_u8(b,1);
    return veorq_u8(tmp,b);
}

static uint8x16_t aes(uint8x16_t *key, uint8x16_t in, uint8x16_t first_key) {
    in = vaesmcq_u8(vaeseq_u8(in, first_key));
    in = vaesmcq_u8(vaeseq_u8(in, key[0]));
    in = vaesmcq_u8(vaeseq_u8(in, key[2]));
    in = vaesmcq_u8(vaeseq_u8(in, key[5]));
    in = vaesmcq_u8(vaeseq_u8(in, key[0]));
    in = vaesmcq_u8(vaeseq_u8(in, key[2]));
    in = vaesmcq_u8(vaeseq_u8(in, key[5]));
    in = vaesmcq_u8(vaeseq_u8(in, key[0]));
    in = vaesmcq_u8(vaeseq_u8(in, key[2]));
    in = vaesmcq_u8(vaeseq_u8(in, key[5]));
    return vxor(in, key[0]);
}

static uint8x16_t aes4pre(uint8x16_t in, uint8x16_t a, uint8x16_t b,
                          uint8x16_t c, uint8x16_t d) {
    in = vaesmcq_u8(vaeseq_u8(in, a));
    in = vaesmcq_u8(vaeseq_u8(in, b));
    in = vaesmcq_u8(vaeseq_u8(in, c));
    return vaesmcq_u8(vaeseq_u8(in, d));
}

#define aes4(in,a,b,c,d,e) vxor(aes4pre(in,a,b,c,d),e)

static uint8x16_t load(const void *p) { return *(uint8x16_t *)p; }
static void store(void *p, uint8x16_t x) { *(uint8x16_t *)p = x; }

#define loadu load      /* ARMv8 allows unaligned loads/stores */
#define storeu store    /* ARMv8 allows unaligned stores       */

/* ------------------------------------------------------------------------- */
#else
#error - This implementation requires __AES__ or __ARM_FEATURE_CRYPTO
#endif
/* ------------------------------------------------------------------------- */

#define vxor3(x,y,z)    vxor(vxor(x,y),z)
#define vxor4(w,x,y,z)  vxor(vxor(w,x),vxor(y,z))

#define load_partial(p,n)  loadu(p)

/*
Might need a version like this if, for example, we want to load a 12-byte nonce
into a 16-byte block.
static block load_partial(const void *p, unsigned n) {
    if ((intptr_t)p % 16 == 0) return load(p);
    else {
        block tmp; unsigned i;
        for (i=0; i<n; i++) ((char*)&tmp)[i] = ((char*)p)[i];
        return tmp;
    }
}
*/

/* ------------------------------------------------------------------------- */

/* 16 bytes of 0xff, 16 bytes of 0x00, then a lone 0x80: windows into this
   array supply the masks used by zero_pad, one_zero_pad and the 10* padding */
static const unsigned char pad[] = {
    0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
    0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
    0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
};

static block zero_pad(block x, unsigned zero_bytes) {
    return vand(x, loadu(pad + zero_bytes));
}

static block one_zero_pad(block x, unsigned one_zero_bytes) {
    return vor(zero_pad(x, one_zero_bytes), loadu(pad + 16 + one_zero_bytes));
}

static block zero_set_byte(char val, unsigned idx) {
    block tmp = zero; ((char *)&tmp)[idx] = val; return tmp;
}

/* ------------------------------------------------------------------------- */

typedef struct {
    block I[2];             /* 1I; 2I kept in doubling byte order */
    block J[3];             /* 1J, 2J, 4J                         */
    block L[3];             /* 1L, 2L, 4L                         */
    block delta3_cache;
} aez_ctx_t;

/* ------------------------------------------------------------------------- */

static int blake2b(void *out, size_t outlen,
                   const void *key, size_t keylen,
                   const void *in, size_t inlen);

/* ------------------------------------------------------------------------- */

void aez_setup(unsigned char *key, unsigned keylen, aez_ctx_t *ctx) {
    block tmp;
    if (keylen == 48) {
        ctx->I[0] = loadu(key);
        ctx->J[0] = loadu(key+16);
        ctx->L[0] = loadu(key+32);
    } else {
        blake2b(ctx, 48, 0, 0, key, keylen);    /* Puts IJL into ctx */
        ctx->L[0] = ctx->J[0];                  /* Rearrange.        */
        ctx->J[0] = ctx->I[1];                  /* Rearrange.        */
    }
    /* Fill remaining ctx locations with doublings */
    ctx->I[1] = double_block(bswap16(ctx->I[0]));           /* No post-bswap */
    ctx->J[1] = bswap16(tmp = double_block(bswap16(ctx->J[0])));
    ctx->J[2] = bswap16(double_block(tmp));
    ctx->L[1] = bswap16(tmp = double_block(bswap16(ctx->L[0])));
    ctx->L[2] = bswap16(double_block(tmp));
    ctx->delta3_cache = zero;
}

/* ------------------------------------------------------------------------- */

/* !! Warning !! Only handles nbytes <= 16 and abytes <= 16 */
static block aez_hash(aez_ctx_t *ctx, char *n, unsigned nbytes, char *ad,
                      unsigned adbytes, unsigned abytes) {
    block o1, o2, o3, o4, o5, o6, o7, o8, sum, offset, tmp;
    block I=ctx->I[0], Ifordoubling = ctx->I[1], I2 = bswap16(Ifordoubling);
    block L=ctx->L[0], L2=ctx->L[1], L4=ctx->L[2];
    block J=ctx->J[0], J2 = ctx->J[1], J4 = ctx->J[2], J5 = vxor(J,J4);

    /* Process abytes and nonce */
    offset = vxor4(J, J2, I2, L);
    tmp = zero_set_byte((char)(8*abytes),15);
    sum = aes4pre(offset,tmp,J,I,L);

    if (nbytes==16)
        sum = aes4(vxor(loadu(n), J4), vxor(I2, L),J,I,L,sum);
    else
        sum = aes4(vxor(J4, I),
                   one_zero_pad(load_partial(n,nbytes),16-nbytes),J,I,L,sum);

    if (ad) {              /* Possible easy misuse: ad==null && adbytes==0 */
        if (adbytes==0) {
            ctx->delta3_cache = aes4pre(vxor(J5, I), loadu(pad+32),J,I,L);
        } else {
            block delta3 = zero;
            offset = vxor(J5, I2);
            while (adbytes >= 8*16) {
                o1 = vxor(offset,L);  o2 = vxor(offset,L2); o3 = vxor(o1,L2);
                o4 = vxor(offset,L4); o5 = vxor(o1,L4);     o6 = vxor(o2,L4);
                o7 = vxor(o3,L4);     o8 = offset;
                Ifordoubling = double_block(Ifordoubling);
                offset = vxor(J5, bswap16(Ifordoubling));
                delta3 = vxor(delta3, aes4pre(load(ad+  0), o1, J, I, L));
                delta3 = vxor(delta3, aes4pre(load(ad+ 16), o2, J, I, L));
                delta3 = vxor(delta3, aes4pre(load(ad+ 32), o3, J, I, L));
                delta3 = vxor(delta3, aes4pre(load(ad+ 48), o4, J, I, L));
                delta3 = vxor(delta3, aes4pre(load(ad+ 64), o5, J, I, L));
                delta3 = vxor(delta3, aes4pre(load(ad+ 80), o6, J, I, L));
                delta3 = vxor(delta3, aes4pre(load(ad+ 96), o7, J, I, L));
                delta3 = vxor(delta3, aes4pre(load(ad+112), o8, J, I, L));
                adbytes-=8*16; ad+=8*16;
            }
            if (adbytes >= 4*16) {
                o1 = vxor(offset,L); o2 = vxor(offset,L2); o3 = vxor(o1,L2);
                o4 = offset = vxor(offset,L4);
                delta3 = vxor(delta3, aes4pre(load(ad+  0), o1, J, I, L));
                delta3 = vxor(delta3, aes4pre(load(ad+ 16), o2, J, I, L));
                delta3 = vxor(delta3, aes4pre(load(ad+ 32), o3, J, I, L));
                delta3 = vxor(delta3, aes4pre(load(ad+ 48), o4, J, I, L));
                adbytes-=4*16; ad+=4*16;
            }
            if (adbytes >= 2*16) {
                o1 = vxor(offset,L); o2 = offset = vxor(offset,L2);
                delta3 = vxor(delta3, aes4pre(load(ad+  0), o1, J, I, L));
                delta3 = vxor(delta3, aes4pre(load(ad+ 16), o2, J, I, L));
                adbytes-=2*16; ad+=2*16;
            }
            if (adbytes >= 1*16) {
                o1 = vxor(offset,L);
                delta3 = vxor(delta3, aes4pre(load(ad+  0), o1, J, I, L));
                adbytes-=1*16; ad+=1*16;
            }
            if (adbytes) {
                tmp = vxor3(J5, I, one_zero_pad(load(ad),16-adbytes));
                delta3 = aes4(vxor(J5, I), one_zero_pad(load(ad),16-adbytes),
                              J, I, L, delta3);
            }
            ctx->delta3_cache = delta3;
        }
    }
    return vxor(sum,ctx->delta3_cache);
}

/* ------------------------------------------------------------------------- */

static block pass_one(aez_ctx_t *ctx, block *src, unsigned bytes, block
*dst) { block o1, o2, o3, o4, o5, o6, o7, o8, offset, tmp, sum=zero; block I=ctx->I[0], Ifordoubling = ctx->I[1]; block L=ctx->L[0], L2=ctx->L[1],L4=ctx->L[2]; block J=ctx->J[0]; offset = vxor(J, bswap16(Ifordoubling)); while (bytes >= 16*16) { o1 = vxor(offset,L); o2 = vxor(offset,L2); o3 = vxor(o1,L2); o4 = vxor(offset,L4); o5 = vxor(o1,L4); o6 = vxor(o2,L4); o7 = vxor(o3,L4); o8 = offset; Ifordoubling = double_block(Ifordoubling); offset = vxor(J,bswap16(Ifordoubling)); store(dst+ 0, aes4(load(src + 1),o1, J, I, L, load(src+ 0))); store(dst+ 2, aes4(load(src + 3),o2, J, I, L, load(src+ 2))); store(dst+ 4, aes4(load(src + 5),o3, J, I, L, load(src+ 4))); store(dst+ 6, aes4(load(src + 7),o4, J, I, L, load(src+ 6))); store(dst+ 8, aes4(load(src + 9),o5, J, I, L, load(src+ 8))); store(dst+10, aes4(load(src +11),o6, J, I, L, load(src+10))); store(dst+12, aes4(load(src +13),o7, J, I, L, load(src+12))); store(dst+14, aes4(load(src +15),o8, J, I, L, load(src+14))); tmp=aes4(I,load(dst+ 0),J,I,L,load(src+ 1));store(dst+ 1,tmp); sum=vxor(sum,tmp); tmp=aes4(I,load(dst+ 2),J,I,L,load(src+ 3)); store(dst+ 3,tmp);sum=vxor(sum,tmp); tmp=aes4(I,load(dst+ 4),J,I,L,load(src+ 5)); store(dst+ 5,tmp);sum=vxor(sum,tmp); tmp=aes4(I,load(dst+ 6),J,I,L,load(src+ 7)); store(dst+ 7,tmp);sum=vxor(sum,tmp); tmp=aes4(I,load(dst+ 8),J,I,L,load(src+ 9)); store(dst+ 9,tmp);sum=vxor(sum,tmp); tmp=aes4(I,load(dst+10),J,I,L,load(src+11)); store(dst+11,tmp);sum=vxor(sum,tmp); tmp=aes4(I,load(dst+12),J,I,L,load(src+13)); store(dst+13,tmp);sum=vxor(sum,tmp); tmp=aes4(I,load(dst+14),J,I,L,load(src+15)); store(dst+15,tmp);sum=vxor(sum,tmp); bytes -= 16*16; dst += 16; src += 16; } if (bytes >= 8*16) { o1 = vxor(offset,L); o2 = vxor(offset,L2); o3 = vxor(o1,L2); o4 = offset = vxor(offset,L4); store(dst+ 0, aes4(load(src + 1),o1, J, I, L, load(src+ 0))); store(dst+ 2, aes4(load(src + 3),o2, J, I, L, load(src+ 2))); store(dst+ 4, aes4(load(src + 5),o3, J, I, L, load(src+ 4))); store(dst+ 6, aes4(load(src + 7),o4, J, I, L, load(src+ 6))); tmp=aes4(I,load(dst+ 0),J,I,L,load(src+ 1)); store(dst+ 1,tmp);sum=vxor(sum,tmp); tmp=aes4(I,load(dst+ 2),J,I,L,load(src+ 3)); store(dst+ 3,tmp);sum=vxor(sum,tmp); tmp=aes4(I,load(dst+ 4),J,I,L,load(src+ 5)); store(dst+ 5,tmp);sum=vxor(sum,tmp); tmp=aes4(I,load(dst+ 6),J,I,L,load(src+ 7)); store(dst+ 7,tmp);sum=vxor(sum,tmp); bytes -= 8*16; dst += 8; src += 8; } if (bytes >= 4*16) { o1 = vxor(offset,L); o2 = offset = vxor(offset,L2); store(dst+ 0, aes4(load(src + 1),o1, J, I, L, load(src+ 0))); store(dst+ 2, aes4(load(src + 3),o2, J, I, L, load(src+ 2))); tmp=aes4(I,load(dst+ 0),J,I,L,load(src+ 1)); store(dst+ 1,tmp);sum=vxor(sum,tmp); tmp=aes4(I,load(dst+ 2),J,I,L,load(src+ 3)); store(dst+ 3,tmp);sum=vxor(sum,tmp); bytes -= 4*16; dst += 4; src += 4; } if (bytes) { o1 = vxor(offset,L); store(dst+ 0, aes4(load(src + 1),o1, J, I, L, load(src+ 0))); tmp=aes4(I,load(dst+ 0),J,I,L,load(src+ 1)); store(dst+ 1,tmp);sum=vxor(sum,tmp); } return sum; } /* ------------------------------------------------------------------------- */ static block pass_two(aez_ctx_t *ctx, block s, unsigned bytes, block *dst) { block o1, o2, o3, o4, o5, o6, o7, o8, sum=zero, offset, fs[8], tmp[8]; block I=ctx->I[0], Ifordoubling = ctx->I[1]; block L=ctx->L[0], L2=ctx->L[1],L4=ctx->L[2]; block J=ctx->J[0], J2=ctx->J[1], J3=vxor(J,J2); offset = vxor(J2, bswap16(Ifordoubling)); while (bytes >= 16*16) { o1 = vxor(offset,L); o2 = vxor(offset,L2); o3 = vxor(o1,L2); o4 = vxor(offset,L4); o5 = vxor(o1,L4); o6 = vxor(o2,L4); o7 = 
vxor(o3,L4); o8 = offset; Ifordoubling = double_block(Ifordoubling); offset = vxor(J2, bswap16(Ifordoubling)); fs[0] = aes4pre(s,o1,J,I,L); fs[1] = aes4pre(s,o2,J,I,L); fs[2] = aes4pre(s,o3,J,I,L); fs[3] = aes4pre(s,o4,J,I,L); fs[4] = aes4pre(s,o5,J,I,L); fs[5] = aes4pre(s,o6,J,I,L); fs[6] = aes4pre(s,o7,J,I,L); fs[7] = aes4pre(s,o8,J,I,L); o1 = vxor(J3,o1); o2 = vxor(J3,o2); o3 = vxor(J3,o3); o4 = vxor(J3,o4); o5 = vxor(J3,o5); o6 = vxor(J3,o6); o7 = vxor(J3,o7); o8 = vxor(J3,o8); tmp[0] = vxor(load(dst+ 0),fs[0]); sum = vxor(sum,tmp[0]); store(dst+ 0,vxor(load(dst+ 1),fs[0])); tmp[1] = vxor(load(dst+ 2),fs[1]); sum = vxor(sum,tmp[1]); store(dst+ 2,vxor(load(dst+ 3),fs[1])); tmp[2] = vxor(load(dst+ 4),fs[2]); sum = vxor(sum,tmp[2]); store(dst+ 4,vxor(load(dst+ 5),fs[2])); tmp[3] = vxor(load(dst+ 6),fs[3]); sum = vxor(sum,tmp[3]); store(dst+ 6,vxor(load(dst+ 7),fs[3])); tmp[4] = vxor(load(dst+ 8),fs[4]); sum = vxor(sum,tmp[4]); store(dst+ 8,vxor(load(dst+ 9),fs[4])); tmp[5] = vxor(load(dst+10),fs[5]); sum = vxor(sum,tmp[5]); store(dst+10,vxor(load(dst+11),fs[5])); tmp[6] = vxor(load(dst+12),fs[6]); sum = vxor(sum,tmp[6]); store(dst+12,vxor(load(dst+13),fs[6])); tmp[7] = vxor(load(dst+14),fs[7]); sum = vxor(sum,tmp[7]); store(dst+14,vxor(load(dst+15),fs[7])); store(dst+ 1, aes4(I,load(dst+ 0), J, I, L, tmp[0])); store(dst+ 3, aes4(I,load(dst+ 2), J, I, L, tmp[1])); store(dst+ 5, aes4(I,load(dst+ 4), J, I, L, tmp[2])); store(dst+ 7, aes4(I,load(dst+ 6), J, I, L, tmp[3])); store(dst+ 9, aes4(I,load(dst+ 8), J, I, L, tmp[4])); store(dst+11, aes4(I,load(dst+10), J, I, L, tmp[5])); store(dst+13, aes4(I,load(dst+12), J, I, L, tmp[6])); store(dst+15, aes4(I,load(dst+14), J, I, L, tmp[7])); store(dst+ 0, aes4(load(dst+ 1),o1, J, I, L, load(dst+ 0))); store(dst+ 2, aes4(load(dst+ 3),o2, J, I, L, load(dst+ 2))); store(dst+ 4, aes4(load(dst+ 5),o3, J, I, L, load(dst+ 4))); store(dst+ 6, aes4(load(dst+ 7),o4, J, I, L, load(dst+ 6))); store(dst+ 8, aes4(load(dst+ 9),o5, J, I, L, load(dst+ 8))); store(dst+10, aes4(load(dst+11),o6, J, I, L, load(dst+10))); store(dst+12, aes4(load(dst+13),o7, J, I, L, load(dst+12))); store(dst+14, aes4(load(dst+15),o8, J, I, L, load(dst+14))); bytes -= 16*16; dst += 16; } if (bytes >= 8*16) { o1 = vxor(offset,L); o2 = vxor(offset,L2); o3 = vxor(o1,L2); o4 = offset = vxor(offset,L4); fs[0] = aes4pre(s,o1,J,I,L); fs[1] = aes4pre(s,o2,J,I,L); fs[2] = aes4pre(s,o3,J,I,L); fs[3] = aes4pre(s,o4,J,I,L); o1 = vxor(J3,o1); o2 = vxor(J3,o2); o3 = vxor(J3,o3); o4 = vxor(J3,o4); tmp[0] = vxor(load(dst+ 0),fs[0]); sum = vxor(sum,tmp[0]); store(dst+ 0,vxor(load(dst+ 1),fs[0])); tmp[1] = vxor(load(dst+ 2),fs[1]); sum = vxor(sum,tmp[1]); store(dst+ 2,vxor(load(dst+ 3),fs[1])); tmp[2] = vxor(load(dst+ 4),fs[2]); sum = vxor(sum,tmp[2]); store(dst+ 4,vxor(load(dst+ 5),fs[2])); tmp[3] = vxor(load(dst+ 6),fs[3]); sum = vxor(sum,tmp[3]); store(dst+ 6,vxor(load(dst+ 7),fs[3])); store(dst+ 1, aes4(I,load(dst+ 0), J, I, L, tmp[0])); store(dst+ 3, aes4(I,load(dst+ 2), J, I, L, tmp[1])); store(dst+ 5, aes4(I,load(dst+ 4), J, I, L, tmp[2])); store(dst+ 7, aes4(I,load(dst+ 6), J, I, L, tmp[3])); store(dst+ 0, aes4(load(dst+ 1),o1, J, I, L, load(dst+ 0))); store(dst+ 2, aes4(load(dst+ 3),o2, J, I, L, load(dst+ 2))); store(dst+ 4, aes4(load(dst+ 5),o3, J, I, L, load(dst+ 4))); store(dst+ 6, aes4(load(dst+ 7),o4, J, I, L, load(dst+ 6))); bytes -= 8*16; dst += 8; } if (bytes >= 4*16) { o1 = vxor(offset,L); o2 = offset = vxor(offset,L2); fs[0] = aes4pre(s,o1,J,I,L); fs[1] = aes4pre(s,o2,J,I,L); o1 = 
vxor(J3,o1); o2 = vxor(J3,o2); tmp[0] = vxor(load(dst+ 0),fs[0]); sum = vxor(sum,tmp[0]); store(dst+ 0,vxor(load(dst+ 1),fs[0])); tmp[1] = vxor(load(dst+ 2),fs[1]); sum = vxor(sum,tmp[1]); store(dst+ 2,vxor(load(dst+ 3),fs[1])); store(dst+ 1, aes4(I,load(dst+ 0), J, I, L, tmp[0])); store(dst+ 3, aes4(I,load(dst+ 2), J, I, L, tmp[1])); store(dst+ 0, aes4(load(dst+ 1),o1, J, I, L, load(dst+ 0))); store(dst+ 2, aes4(load(dst+ 3),o2, J, I, L, load(dst+ 2))); bytes -= 4*16; dst += 4; } if (bytes) { o1 = vxor(offset,L); fs[0] = aes4pre(s,o1,J,I,L); o1 = vxor(J3,o1); tmp[0] = vxor(load(dst+ 0),fs[0]); sum = vxor(sum,tmp[0]); store(dst+ 0,vxor(load(dst+ 1),fs[0])); store(dst+ 1, aes4(I,load(dst+ 0), J, I, L, tmp[0])); store(dst+ 0, aes4(load(dst+ 1),o1, J, I, L, load(dst+ 0))); } return sum; } /* ------------------------------------------------------------------------- */ static int cipher_aez_core(aez_ctx_t *ctx, block t, int d, char *src, unsigned bytes, unsigned abytes, char *dst) { block s, x, y, frag0, frag1, final0, final1; block I=ctx->I[0], L=ctx->L[0], J=ctx->J[0]; block L4=ctx->L[2], I2 = bswap16(ctx->I[1]); unsigned i, frag_bytes, initial_bytes; if (!d) bytes += abytes; frag_bytes = bytes % 32; initial_bytes = bytes - frag_bytes - 32; /* Compute x and store intermediate results */ x = pass_one(ctx, (block*)src, initial_bytes, (block*)dst); if (frag_bytes >= 16) { frag0 = load(src + initial_bytes); frag1 = one_zero_pad(load(src + initial_bytes + 16), 32-frag_bytes); x = aes4(frag0, vxor(L4, I2), J, I, L, x); x = vxor(x, aes4pre(frag1, vxor3(I2, L4, L), J, I, L)); } else if (frag_bytes) { frag0 = one_zero_pad(load(src + initial_bytes), 16-frag_bytes); x = aes4(frag0, vxor(L4, I2), J, I, L, x); } /* Calculate s and final block values (y xor'd to final1 later) */ final0 = vxor3(loadu(src + (bytes - 32)), x, t); if (d || !abytes) final1 = loadu(src+(bytes-32)+16); else final1 = zero_pad(loadu(src+(bytes-32)+16), abytes); final0 = aes4(final1, vxor(I2, ctx->L[d]), J, I, L, final0); final1 = vxor(final1, aes((block*)ctx, final0, ctx->L[d])); s = vxor(final0, final1); final0 = vxor(final0, aes((block*)ctx, final1, ctx->L[d^1])); /* Decryption: final0 should hold abytes zero bytes. 
       If not, failure */
    if (d && !is_zero(vandnot(loadu(pad+abytes),final0))) return -1;
    final1 = aes4(final0, vxor(I2, ctx->L[d^1]), J, I, L, final1);

    /* Compute y and store final results */
    y = pass_two(ctx, s, initial_bytes, (block*)dst);
    if (frag_bytes >= 16) {
        frag0 = vxor(frag0, aes((block*)ctx, s, L4));
        frag1 = vxor(frag1, aes((block*)ctx, s, vxor(L4, L)));
        frag1 = one_zero_pad(frag1, 32-frag_bytes);
        y = aes4(frag0, vxor(I2, L4), J, I, L, y);
        y = vxor(y, aes4pre(frag1, vxor3(I2, L4, L), J, I, L));
        store(dst + initial_bytes, frag0);
        store(dst + initial_bytes + 16, frag1);
    } else if (frag_bytes) {
        frag0 = vxor(frag0, aes((block*)ctx, s, L4));
        frag0 = one_zero_pad(frag0, 16-frag_bytes);
        y = aes4(frag0, vxor(I2, L4), J, I, L, y);
        store(dst + initial_bytes, frag0);
    }

    storeu(dst + (bytes - 32), vxor3(final1, y, t));
    if (!d || !abytes)
        storeu(dst + (bytes - 32) + 16, final0);
    else {
        for (i=0; i<16-abytes; i++)
            ((char*)dst + (bytes - 16))[i] = ((char*)&final0)[i];
    }
    return 0;
}

/* ------------------------------------------------------------------------- */

static int cipher_aez_tiny(aez_ctx_t *ctx, block t, int d, char *src,
                           unsigned bytes, unsigned abytes, char *dst) {
    block l, r, tmp, one, rcon, buf[2], mask_10, mask_ff;
    block I=ctx->I[0], L=ctx->L[0], J=ctx->J[0], t_orig = t;
    block L2=ctx->L[1], L4=ctx->L[2], I2 = bswap16(ctx->I[1]);
    unsigned rnds, i;

    /* load src into buf, zero pad, update bytes for abytes */
    if (bytes >= 16) {
        buf[0] = load(src);
        buf[1] = zero_pad(load_partial(src+16,bytes-16),32-bytes);
    } else {
        buf[0] = zero_pad(load_partial(src,bytes),16-bytes);
        buf[1] = zero;
    }
    if (!d) bytes += abytes;

    /* load l/r, create 10* padding masks, shift r 4 bits if odd length */
    l = buf[0];
    r = loadu((char*)buf+bytes/2);
    mask_ff = loadu(pad+16-bytes/2);
    mask_10 = loadu(pad+32-bytes/2);
    if (bytes&1) {                       /* Odd length. Deal with nibbles. */
        mask_10 = sll4(mask_10);
        ((char*)&mask_ff)[bytes/2] = (char)0xf0;
        r = bswap16(r); r = srl4(r); r = bswap16(r);
    }
    r = vor(vand(r, mask_ff), mask_10);

    /* Add tweak offset into t, and determine the number of rounds */
    if (bytes >= 16) {
        t = vxor4(t, I2, L2, L4);             /* (0,6) offset */
        rnds = 8;
    } else {
        t = vxor(vxor4(t, I2, L2, L4), L);    /* (0,7) offset */
        if (bytes>=3) rnds = 10; else if (bytes==2) rnds = 16; else rnds = 24;
    }

    if (!d) {
        one = zero_set_byte(1,15);
        rcon = zero;
    } else {
        one = zero_set_byte(-1,15);
        rcon = zero_set_byte((char)(rnds-1),15);
    }

    if ((d) && (bytes < 16)) {
        block offset = vxor3(I2, L, L2);
        tmp = vor(l, loadu(pad+32));
        tmp = aes4pre(t_orig, vxor(tmp,offset), J, I, L);
        tmp = vand(tmp, loadu(pad+32));
        l = vxor(l, tmp);
    }

    /* Feistel */
    /* [Missing in this copy of the file: the Feistel round bodies and the
        remainder of this function; they are not reconstructed here. The
        aez_encrypt and aez_decrypt bodies below are restored only to the
        extent that the surviving fragments and the crypto_aead_* call sites
        at the end of the file determine them.]                              */
}

/* ------------------------------------------------------------------------- */

void aez_encrypt(aez_ctx_t *ctx, char *n, unsigned nbytes,
                 char *ad, unsigned adbytes, unsigned abytes,
                 char *src, unsigned bytes, char *dst) {

    block t = aez_hash(ctx, n, nbytes, ad, adbytes, abytes);
    if (bytes == 0) {
        unsigned i;
        t = aes((block*)ctx, t, vxor(ctx->L[0], ctx->L[1]));
        for (i=0; i<abytes; i++) dst[i] = ((char*)&t)[i];
    } else if (bytes + abytes < 32) {
        cipher_aez_tiny(ctx, t, 0, src, bytes, abytes, dst);
    } else {
        cipher_aez_core(ctx, t, 0, src, bytes, abytes, dst);
    }
}

/* ------------------------------------------------------------------------- */

int aez_decrypt(aez_ctx_t *ctx, char *n, unsigned nbytes,
                char *ad, unsigned adbytes, unsigned abytes,
                char *src, unsigned bytes, char *dst) {

    block t;
    if (bytes < abytes) return -1;
    t = aez_hash(ctx, n, nbytes, ad, adbytes, abytes);
    if (bytes == abytes) {
        block claimed = zero_pad(load_partial(src, abytes), 16-abytes);
        t = zero_pad(aes((block*)ctx, t, vxor(ctx->L[0], ctx->L[1])), 16-abytes);
        return is_zero(vandnot(t, claimed)) - 1;  /* is_zero returns 0 or 1 */
    } else if (bytes < 32) {
        return cipher_aez_tiny(ctx, t, 1, src, bytes, abytes, dst);
    } else {
        return cipher_aez_core(ctx, t, 1, src, bytes, abytes, dst);
    }
}
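/* ------------------------------------------------------------------------- */
/* Illustrative usage sketch (added commentary, not part of the original
   source). It assumes a 12-byte nonce and the 16-byte authenticator used by
   the CAESAR wrappers at the bottom of this file; ciphertext is always
   plaintext length + abytes. Passing ad==NULL on later calls appears to
   reuse the AD hash kept in ctx->delta3_cache, as the if (ad) branch in
   aez_hash suggests. Remove the #if 0 guard to compile it.                  */
#if 0
static int aez_example(void) {
    aez_ctx_t ctx;
    unsigned char key[48] = {0};                    /* 48-byte key is used directly */
    char nonce[12] = {0}, ad[4] = {'b','l','o','b'};
    char pt[40] = {0}, ct[40+16], back[40];

    aez_setup(key, 48, &ctx);
    aez_encrypt(&ctx, nonce, 12, ad, 4, 16, pt, 40, ct);   /* hashes and caches AD  */
    nonce[0] ^= 1;                                         /* fresh nonce, same AD  */
    aez_encrypt(&ctx, nonce, 12, 0, 0, 16, pt, 40, ct);    /* reuses delta3_cache   */
    return aez_decrypt(&ctx, nonce, 12, 0, 0, 16, ct, 40+16, back); /* 0 on success */
}
#endif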
/* ------------------------------------------------------------------------- */
/* Reference Blake2b code, here for convenience, and not for speed.          */
/* Downloaded Sep 2015 from https://github.com/mjosaarinen/blake2_mjosref    */

#include <stdint.h>

typedef struct {
    uint8_t b[128];
    uint64_t h[8];
    uint64_t t[2];
    size_t c;
    size_t outlen;
} blake2b_ctx;

#ifndef ROTR64
#define ROTR64(x, y)  (((x) >> (y)) ^ ((x) << (64 - (y))))
#endif

#define B2B_GET64(p)                            \
    (((uint64_t) ((uint8_t *) (p))[0]) ^        \
    (((uint64_t) ((uint8_t *) (p))[1]) << 8) ^  \
    (((uint64_t) ((uint8_t *) (p))[2]) << 16) ^ \
    (((uint64_t) ((uint8_t *) (p))[3]) << 24) ^ \
    (((uint64_t) ((uint8_t *) (p))[4]) << 32) ^ \
    (((uint64_t) ((uint8_t *) (p))[5]) << 40) ^ \
    (((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \
    (((uint64_t) ((uint8_t *) (p))[7]) << 56))

#define B2B_G(a, b, c, d, x, y) {   \
    v[a] = v[a] + v[b] + x;         \
    v[d] = ROTR64(v[d] ^ v[a], 32); \
    v[c] = v[c] + v[d];             \
    v[b] = ROTR64(v[b] ^ v[c], 24); \
    v[a] = v[a] + v[b] + y;         \
    v[d] = ROTR64(v[d] ^ v[a], 16); \
    v[c] = v[c] + v[d];             \
    v[b] = ROTR64(v[b] ^ v[c], 63); }

static const uint64_t blake2b_iv[8] = {
    0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
    0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
    0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
    0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};

static void blake2b_compress(blake2b_ctx *ctx, int last) {
    const uint8_t sigma[12][16] = {
        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
        { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
        {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
        {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
        {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
        { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
        { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
        {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
        { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
    };
    int i;
    uint64_t v[16], m[16];

    for (i = 0; i < 8; i++) {
        v[i] = ctx->h[i];
        v[i + 8] = blake2b_iv[i];
    }
    v[12] ^= ctx->t[0];
    v[13] ^= ctx->t[1];
    if (last) v[14] = ~v[14];

    for (i = 0; i < 16; i++) m[i] = B2B_GET64(&ctx->b[8 * i]);

    for (i = 0; i < 12; i++) {
        B2B_G( 0, 4,  8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
        B2B_G( 1, 5,  9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
        B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
        B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
        B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
        B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
        B2B_G( 2, 7,  8, 13, m[sigma[i][12]], m[sigma[i][13]]);
        B2B_G( 3, 4,  9, 14, m[sigma[i][14]], m[sigma[i][15]]);
    }

    for( i = 0; i < 8; ++i ) ctx->h[i] ^= v[i] ^ v[i + 8];
}

static void blake2b_update(blake2b_ctx *ctx, const void *in, size_t inlen) {
    size_t i;
    for (i = 0; i < inlen; i++) {
        if (ctx->c == 128) {
            ctx->t[0] += ctx->c;
            if (ctx->t[0] < ctx->c) ctx->t[1]++;
            blake2b_compress(ctx, 0);
            ctx->c = 0;
        }
        ctx->b[ctx->c++] = ((const uint8_t *) in)[i];
    }
}

static void blake2b_final(blake2b_ctx *ctx, void *out) {
    size_t i;
    ctx->t[0] += ctx->c;
    if (ctx->t[0] < ctx->c) ctx->t[1]++;
    while (ctx->c < 128) ctx->b[ctx->c++] = 0;
    blake2b_compress(ctx, 1);
    for (i = 0; i < ctx->outlen; i++) {
        ((uint8_t *) out)[i] = (ctx->h[i >> 3] >> (8 * (i & 7))) & 0xFF;
    }
}

static int blake2b_init(blake2b_ctx *ctx, size_t outlen,
                        const void *key, size_t keylen) {
    size_t i;
    if (outlen == 0 || outlen > 64 || keylen > 64) return -1;
    for (i = 0; i < 8; i++) ctx->h[i] = blake2b_iv[i];
    ctx->h[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen;
    ctx->t[0] = 0;
    ctx->t[1] = 0;
    ctx->c = 0;
    ctx->outlen = outlen;
    for (i = keylen; i < 128; i++) ctx->b[i] = 0;
    if (keylen > 0) {
        blake2b_update(ctx, key, keylen);
        ctx->c = 128;
    }
    return 0;
}

static int blake2b(void *out, size_t outlen, const void *key, size_t keylen,
                   const void *in, size_t inlen) {
    blake2b_ctx ctx;
    if (blake2b_init(&ctx, outlen, key, keylen)) return -1;
    blake2b_update(&ctx, in, inlen);
    blake2b_final(&ctx, out);
    return 0;
}

/* ------------------------------------------------------------------------- */

/* aez mapping for CAESAR competition */

int crypto_aead_encrypt(
    unsigned char *c, unsigned long long *clen,
    const unsigned char *m, unsigned long long mlen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *nsec,
    const unsigned char *npub,
    const unsigned char *k
) {
    aez_ctx_t ctx;
    (void)nsec;
    if (clen) *clen = mlen+16;
    aez_setup((unsigned char *)k, 48, &ctx);
    aez_encrypt(&ctx, (char *)npub, 12, (char *)ad, (unsigned)adlen, 16,
                (char *)m, (unsigned)mlen, (char *)c);
    return 0;
}

int crypto_aead_decrypt(
    unsigned char *m, unsigned long long *mlen,
    unsigned char *nsec,
    const unsigned char *c, unsigned long long clen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *npub,
    const unsigned char *k
) {
    aez_ctx_t ctx;
    (void)nsec;
    if (mlen) *mlen = clen-16;
    aez_setup((unsigned char *)k, 48, &ctx);
    return aez_decrypt(&ctx, (char *)npub, 12, (char *)ad, (unsigned)adlen, 16,
                       (char *)c, (unsigned)clen, (char *)m);
}
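/* ------------------------------------------------------------------------- */
/* Round-trip sketch for the CAESAR interface above (added for illustration,
   not part of the original source). It assumes a 12-byte npub and a 16-byte
   tag, matching the constants hard-coded into the wrappers. Remove the
   #if 0 guard to compile it.                                                */
#if 0
static int caesar_roundtrip(void) {
    unsigned char k[48] = {0}, npub[12] = {0}, ad[8] = {0};
    unsigned char m[32] = {0}, c[32+16], m2[32];
    unsigned long long clen = 0, mlen = 0;

    if (crypto_aead_encrypt(c, &clen, m, 32, ad, 8, 0, npub, k)) return -1;
    /* clen is now 32+16; decryption returns 0 only if the tag verifies */
    return crypto_aead_decrypt(m2, &mlen, 0, c, clen, ad, 8, npub, k);
}
#endif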