src/aesround.rs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162

use super::block::Block;

/// AES round implementation selected for the compilation target.
///
/// On `x86_64` this is [`x86_64::AesNi`], which checks for the `aes` CPU feature and delegates to
/// [`AesSoft`] when that check fails.
#[cfg(target_arch = "x86_64")]
pub type AesImpl = x86_64::AesNi;

/// AES round implementation selected for the compilation target.
///
/// On every target other than `x86_64` this is the software implementation [`AesSoft`].
#[cfg(not(target_arch = "x86_64"))]
pub type AesImpl = AesSoft;

/// Keyed AES round functions built from three key blocks.
///
/// The implementations in this file apply single AES encryption rounds in a fixed round-key
/// order; see the individual methods.
pub trait AesRound {
    /// Creates an instance from the three key blocks `key_i`, `key_j` and `key_l`.
    fn new(key_i: Block, key_j: Block, key_l: Block) -> Self;
    /// Applies four AES rounds to `value` and returns the result.
    ///
    /// The implementations in this file use the round keys `key_j`, `key_i`, `key_l` and
    /// `Block::null()`, in that order.
    fn aes4(&self, value: Block) -> Block;
    /// Applies ten AES rounds to `value` and returns the result.
    ///
    /// The implementations in this file cycle through the round keys `key_i`, `key_j`, `key_l`,
    /// starting and ending with `key_i`.
    fn aes10(&self, value: Block) -> Block;
}

/// Implementation of aes4 and aes10 in software.
///
/// Always available.
///
/// Uses the `aes` crate under the hood.
pub struct AesSoft {
    // The three round keys, stored as the `aes` crate's block type so that they can be passed to
    // `aes::hazmat::cipher_round` without converting them on every call.
    key_i: aes::Block,
    key_j: aes::Block,
    key_l: aes::Block,
}

impl AesRound for AesSoft {
    /// Stores the three key blocks, converted once to the `aes` crate's block type.
    fn new(key_i: Block, key_j: Block, key_l: Block) -> Self {
        let as_round_key = |key: Block| -> aes::Block { key.bytes().into() };
        Self {
            key_i: as_round_key(key_i),
            key_j: as_round_key(key_j),
            key_l: as_round_key(key_l),
        }
    }

    /// Runs four AES rounds over `value` with the round keys J, I, L and the null block.
    fn aes4(&self, value: Block) -> Block {
        let null_key: aes::Block = Block::null().bytes().into();
        // Round keys in the order in which they are applied.
        let schedule = [&self.key_j, &self.key_i, &self.key_l, &null_key];

        let mut state: aes::Block = value.bytes().into();
        for round_key in schedule {
            ::aes::hazmat::cipher_round(&mut state, round_key);
        }

        let bytes: [u8; 16] = state.into();
        Block::from(bytes)
    }

    /// Runs ten AES rounds over `value`, cycling through the round keys I, J, L.
    fn aes10(&self, value: Block) -> Block {
        let (i, j, l) = (&self.key_i, &self.key_j, &self.key_l);
        // Round keys in the order in which they are applied.
        let schedule = [i, j, l, i, j, l, i, j, l, i];

        let mut state: aes::Block = value.bytes().into();
        for round_key in schedule {
            ::aes::hazmat::cipher_round(&mut state, round_key);
        }

        let bytes: [u8; 16] = state.into();
        Block::from(bytes)
    }
}

// It feels silly to re-implement the native AES instruction here (especially since the `aes`
// crate also uses it under the hood), but there is a big benefit:
// First, we can save time by loading each key only once as a __m128i, which makes the whole
// thing a bit faster.
// More importantly though, the compiler does not inline the call to cipher_round, even when using
// target-cpu=native. I guess this is because it crosses a crate boundary (and cross-crate inlining
// only happens with LTO). In fact, compiling with lto=true does inline the call, but we don't want
// to force that on all library users. Anyway, by re-implementing the AES instruction here, we get
// nice inlining without relying on LTO and therefore a huge speedup, as AES is called a lot.
/// AES round functions for `x86_64`, using the AESENC instruction when the CPU provides it.
///
/// Availability of the `aes` CPU feature is checked through the `cpufeatures` crate; when it is
/// missing, every call is forwarded to the software implementation [`AesSoft`].
#[cfg(target_arch = "x86_64")]
pub mod x86_64 {
    use super::*;
    use core::arch::x86_64::*;

    // Generates the `cpuid_aes` module used to detect the "aes" CPU feature.
    cpufeatures::new!(cpuid_aes, "aes");

    /// AES round implementation backed by the AES-NI instruction set.
    pub struct AesNi {
        // Result of the CPU feature detection performed in `new`.
        support: cpuid_aes::InitToken,
        // Software implementation used when `support` reports that AES-NI is unavailable.
        fallback: AesSoft,
        // Round keys, loaded into SIMD registers once so they are not reloaded on every call.
        key_i: __m128i,
        key_j: __m128i,
        key_l: __m128i,
        // `Block::null()` as a SIMD value; the final round key of `aes4`.
        null: __m128i,
    }

    /// Converts a [`Block`] into a `__m128i` via the block's SIMD representation.
    #[cfg(feature = "simd")]
    fn to_simd(block: Block) -> __m128i {
        block.simd().into()
    }

    /// Converts a [`Block`] into a `__m128i` by loading its bytes.
    #[cfg(not(feature = "simd"))]
    fn to_simd(block: Block) -> __m128i {
        let bytes = block.bytes();
        // SAFETY: loadu can load from unaligned memory
        unsafe { _mm_loadu_si128(bytes.as_ptr() as *const _) }
    }

    /// Converts a `__m128i` back into a [`Block`] via the block's SIMD representation.
    #[cfg(feature = "simd")]
    fn from_simd(simd: __m128i) -> Block {
        Block::from_simd(simd.into())
    }

    /// Converts a `__m128i` back into a [`Block`] by storing it into a byte array.
    #[cfg(not(feature = "simd"))]
    fn from_simd(simd: __m128i) -> Block {
        let mut bytes = [0; 16];
        // SAFETY: storeu can store to unaligned memory
        unsafe {
            _mm_storeu_si128(bytes.as_mut_ptr() as *mut _, simd);
        }
        Block::from(bytes)
    }

    // The AESENC intrinsics carry `#[target_feature(enable = "aes")]`. They can only be inlined
    // into a caller that enables the same feature, so the round sequences live in helpers that
    // enable it. Trait methods cannot carry the attribute themselves, hence the inherent impl.
    impl AesNi {
        /// Four AESENC rounds with the round keys J, I, L and the null block.
        ///
        /// # Safety
        ///
        /// The CPU must support the `aes` target feature.
        #[target_feature(enable = "aes")]
        unsafe fn aes4_ni(&self, value: Block) -> Block {
            let mut block = to_simd(value);
            block = _mm_aesenc_si128(block, self.key_j);
            block = _mm_aesenc_si128(block, self.key_i);
            block = _mm_aesenc_si128(block, self.key_l);
            block = _mm_aesenc_si128(block, self.null);
            from_simd(block)
        }

        /// Ten AESENC rounds, cycling through the round keys I, J, L.
        ///
        /// # Safety
        ///
        /// The CPU must support the `aes` target feature.
        #[target_feature(enable = "aes")]
        unsafe fn aes10_ni(&self, value: Block) -> Block {
            let mut block = to_simd(value);
            block = _mm_aesenc_si128(block, self.key_i);
            block = _mm_aesenc_si128(block, self.key_j);
            block = _mm_aesenc_si128(block, self.key_l);
            block = _mm_aesenc_si128(block, self.key_i);
            block = _mm_aesenc_si128(block, self.key_j);
            block = _mm_aesenc_si128(block, self.key_l);
            block = _mm_aesenc_si128(block, self.key_i);
            block = _mm_aesenc_si128(block, self.key_j);
            block = _mm_aesenc_si128(block, self.key_l);
            block = _mm_aesenc_si128(block, self.key_i);
            from_simd(block)
        }
    }

    impl AesRound for AesNi {
        /// Runs the CPU feature detection, prepares the software fallback and loads the three
        /// keys (plus the null block) as SIMD values.
        fn new(key_i: Block, key_j: Block, key_l: Block) -> Self {
            Self {
                support: cpuid_aes::init(),
                fallback: AesSoft::new(key_i, key_j, key_l),
                key_i: to_simd(key_i),
                key_j: to_simd(key_j),
                key_l: to_simd(key_l),
                null: to_simd(Block::null()),
            }
        }

        /// Applies four AES rounds to `value`, using AES-NI when available and the software
        /// fallback otherwise.
        fn aes4(&self, value: Block) -> Block {
            if !self.support.get() {
                return self.fallback.aes4(value);
            }

            // SAFETY: `self.support.get()` returned true above, so the CPU supports the `aes`
            // target feature that `aes4_ni` requires.
            unsafe { self.aes4_ni(value) }
        }

        /// Applies ten AES rounds to `value`, using AES-NI when available and the software
        /// fallback otherwise.
        fn aes10(&self, value: Block) -> Block {
            if !self.support.get() {
                return self.fallback.aes10(value);
            }

            // SAFETY: `self.support.get()` returned true above, so the CPU supports the `aes`
            // target feature that `aes10_ni` requires.
            unsafe { self.aes10_ni(value) }
        }
    }
}