@@ -9,6 +9,7 @@
#define CPUINFO_ALWAYS (1u << 0) /* so cpuinfo is nonzero */
#define CPUINFO_LSE (1u << 1)
#define CPUINFO_LSE2 (1u << 2)
+#define CPUINFO_AES (1u << 3)
/* Initialized with a constructor. */
extern unsigned cpuinfo;
@@ -20,6 +20,11 @@
#include "crypto/aes.h"
+#ifdef __aarch64__
+#include "host/cpuinfo.h"
+typedef uint8_t aes_vec_t __attribute__((vector_size(16)));
+#endif
+
#if SHIFT == 0
#define Reg MMXReg
#define XMM_ONLY(...)
@@ -2165,6 +2170,20 @@ void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
Reg st = *v;
Reg rk = *s;
+#ifdef __aarch64__
+ if (cpuinfo & CPUINFO_AES) {
+ asm(" .arch_extension aes \n"
+ " aesd %0.16b, %1.16b \n"
+ " aesimc %0.16b, %0.16b \n"
+ " eor %0.16b, %0.16b, %2.16b \n"
+ : "=w"(*(aes_vec_t *)d)
+ : "w"((aes_vec_t){}),
+ "w"(*(aes_vec_t *)s),
+ "0"(*(aes_vec_t *)v));
+ return;
+ }
+#endif
+
for (i = 0 ; i < 2 << SHIFT ; i++) {
int j = i & 3;
d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4 * j + 0])] ^
@@ -2180,6 +2199,19 @@ void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
Reg st = *v;
Reg rk = *s;
+#ifdef __aarch64__
+ if (cpuinfo & CPUINFO_AES) {
+ asm(" .arch_extension aes \n"
+ " aesd %0.16b, %1.16b \n"
+ " eor %0.16b, %0.16b, %2.16b \n"
+ : "=w"(*(aes_vec_t *)d)
+ : "w"((aes_vec_t){}),
+ "w"(*(aes_vec_t *)s),
+ "0"(*(aes_vec_t *)v));
+ return;
+ }
+#endif
+
for (i = 0; i < 8 << SHIFT; i++) {
d->B(i) = rk.B(i) ^ (AES_isbox[st.B(AES_ishifts[i & 15] + (i & ~15))]);
}
@@ -2191,6 +2223,20 @@ void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
Reg st = *v;
Reg rk = *s;
+#ifdef __aarch64__
+ if (cpuinfo & CPUINFO_AES) {
+ asm(" .arch_extension aes \n"
+ " aese %0.16b, %1.16b \n"
+ " aesmc %0.16b, %0.16b \n"
+ " eor %0.16b, %0.16b, %2.16b \n"
+ : "=w"(*(aes_vec_t *)d)
+ : "w"((aes_vec_t){}),
+ "w"(*(aes_vec_t *)s),
+ "0"(*(aes_vec_t *)v));
+ return;
+ }
+#endif
+
for (i = 0 ; i < 2 << SHIFT ; i++) {
int j = i & 3;
d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4 * j + 0])] ^
@@ -2206,6 +2252,19 @@ void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
Reg st = *v;
Reg rk = *s;
+#ifdef __aarch64__
+ if (cpuinfo & CPUINFO_AES) {
+ asm(" .arch_extension aes \n"
+ " aese %0.16b, %1.16b \n"
+ " eor %0.16b, %0.16b, %2.16b \n"
+ : "=w"(*(aes_vec_t *)d)
+ : "w"((aes_vec_t){}),
+ "w"(*(aes_vec_t *)s),
+ "0"(*(aes_vec_t *)v));
+ return;
+ }
+#endif
+
for (i = 0; i < 8 << SHIFT; i++) {
d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i & 15] + (i & ~15))]);
}
@@ -2217,6 +2276,16 @@ void glue(helper_aesimc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
int i;
Reg tmp = *s;
+#ifdef __aarch64__
+ if (cpuinfo & CPUINFO_AES) {
+ asm(" .arch_extension aes \n"
+ " aesimc %0.16b, %1.16b \n"
+ : "=w"(*(aes_vec_t *)d)
+ : "w"(*(aes_vec_t *)s));
+ return;
+ }
+#endif
+
for (i = 0 ; i < 4 ; i++) {
d->L(i) = bswap32(AES_imc[tmp.B(4 * i + 0)][0] ^
AES_imc[tmp.B(4 * i + 1)][1] ^
@@ -56,6 +56,7 @@ unsigned __attribute__((constructor)) cpuinfo_init(void)
unsigned long hwcap = qemu_getauxval(AT_HWCAP);
info |= (hwcap & HWCAP_ATOMICS ? CPUINFO_LSE : 0);
info |= (hwcap & HWCAP_USCAT ? CPUINFO_LSE2 : 0);
+ info |= (hwcap & HWCAP_AES ? CPUINFO_AES : 0);
#endif
#ifdef CONFIG_DARWIN
info |= sysctl_for_bool("hw.optional.arm.FEAT_LSE") * CPUINFO_LSE;
When available, use the AArch64 AES instructions to implement the x86 ones.
These are not a 1:1 fit, but considerably more efficient, and without
data-dependent timing.

For a typical benchmark (linux tcrypt mode=500), this gives a 2-3x speedup
when running on ThunderX2.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 host/include/aarch64/host/cpuinfo.h |  1 +
 target/i386/ops_sse.h               | 69 ++++++++++++++++++++
 util/cpuinfo-aarch64.c              |  1 +
 3 files changed, 71 insertions(+)