From 942f522a68adfb9c436a05e736bdaf07c8aa7db8 Mon Sep 17 00:00:00 2001
From: Daniel Schadt
Date: Wed, 18 Jun 2025 12:00:39 +0200
Subject: update rationale for implementing AESNI ourselves

---
 src/aesround.rs | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/aesround.rs b/src/aesround.rs
index 6f63243..4ae3f6f 100644
--- a/src/aesround.rs
+++ b/src/aesround.rs
@@ -61,9 +61,11 @@ impl AesRound for AesSoft {
 // under the hood), but there is a big benefit here:
 // First, we can save time by only loading the keys once as a __m128i, which makes the whole thing
 // a bit faster.
-// More importantly though, when using target-cpu=native, we get nicely vectorized AES instructions
-// (VAESENC), which we don't get if we go through aes::hazmat::cipher_round. This is a *huge*
-// speedup, which we don't want to miss.
+// More importantly though, the compiler does not inline the call to cipher_round, even when using
+// target-cpu=native. I guess this is because it crosses a crate boundary (and cross-crate inlining
+// only happens with LTO). In fact, compiling with lto=true does inline the call, but we don't want
+// to force that to all library users. Anyway, by re-implementing the AES instruction here, we get
+// nice inlining without relying on LTO and therefore a huge speedup, as AES is called a lot.
 #[cfg(target_arch = "x86_64")]
 pub mod x86_64 {
     use super::*;
--
cgit v1.2.3
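
For readers less familiar with the intrinsics involved, here is a minimal sketch of the technique the new comment describes: re-implementing the AES round with core::arch::x86_64 intrinsics inside the crate itself, so the hot call can inline without LTO. The type and function names below are illustrative, not the actual contents of src/aesround.rs.

    // Sketch only: illustrates the technique the patch describes, not the
    // real implementation in src/aesround.rs.
    #[cfg(target_arch = "x86_64")]
    pub mod x86_64 {
        use core::arch::x86_64::{__m128i, _mm_aesenc_si128, _mm_loadu_si128};

        // Hypothetical wrapper: the round key is loaded once as a __m128i,
        // matching the "load the keys once" point in the comment.
        pub struct AesRoundKey {
            key: __m128i,
        }

        impl AesRoundKey {
            /// Load the 16-byte round key up front. SSE2 is part of the
            /// x86_64 baseline, so the unaligned load is always available.
            pub fn new(key_bytes: &[u8; 16]) -> Self {
                // SAFETY: sse2 is enabled for every x86_64 target.
                let key = unsafe { _mm_loadu_si128(key_bytes.as_ptr() as *const __m128i) };
                Self { key }
            }

            /// One AES encryption round (ShiftRows, SubBytes, MixColumns,
            /// AddRoundKey), compiling down to a single AESENC/VAESENC.
            /// Because this lives in the same crate as its callers, the
            /// compiler can inline it without LTO.
            ///
            /// # Safety
            /// The caller must ensure the CPU supports the `aes` feature.
            #[target_feature(enable = "aes")]
            #[inline]
            pub unsafe fn round(&self, state: __m128i) -> __m128i {
                unsafe { _mm_aesenc_si128(state, self.key) }
            }
        }
    }

Since the key already sits in a __m128i, repeated calls to round avoid reloading it, and each call is a single instruction the optimizer can fold into the caller's loop.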